genoray 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ # SCM syntax highlighting & preventing 3-way merges
2
+ pixi.lock merge=binary linguist-language=YAML linguist-generated=true
@@ -0,0 +1,8 @@
1
+ .pytest_cache/
2
+ __pycache__/
3
+ notebooks/
4
+ .venv/
5
+
6
+ # pixi environments
7
+ .pixi
8
+ *.egg-info
@@ -0,0 +1,19 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v5.0.0
4
+ hooks:
5
+ - id: check-merge-conflict
6
+ - id: debug-statements
7
+ - id: mixed-line-ending
8
+ - id: check-case-conflict
9
+ - id: check-yaml
10
+ - repo: https://github.com/astral-sh/ruff-pre-commit
11
+ rev: v0.9.4
12
+ hooks:
13
+ - id: ruff
14
+ - id: ruff-format
15
+ - repo: https://github.com/commitizen-tools/commitizen
16
+ rev: v4.4.1
17
+ hooks:
18
+ - id: commitizen
19
+ stages: [commit-msg]
@@ -0,0 +1 @@
1
+ 3.9
@@ -0,0 +1,16 @@
1
+ ## 0.1.0 (2025-04-12)
2
+
3
+ ### Feat
4
+
5
+ - sketching out support for PGEN dosages
6
+ - refactor readers to be type safe. pass all tests.
7
+ - **wip**: reasonable output from PGEN in notebook
8
+ - initial PGEN support
9
+ - rename package to genoray
10
+ - rename package to genoray
11
+ - **wip**: initial prototype of VCF reader
12
+ - **wip**: VCF support
13
+
14
+ ### Fix
15
+
16
+ - use future annotations for union types
@@ -0,0 +1,7 @@
1
+ Copyright (c) 2025 David Laub
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
genoray-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,26 @@
1
+ Metadata-Version: 2.4
2
+ Name: genoray
3
+ Version: 0.1.0
4
+ Summary: Readers for querying genotypes from VCF and PGEN files by genomic range
5
+ Author-email: David Laub <dlaub@ucsd.edu>
6
+ License: Copyright (c) 2025 David Laub
7
+
8
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
11
+
12
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
13
+ License-File: LICENSE.txt
14
+ Requires-Python: >=3.9
15
+ Requires-Dist: cyvcf2>=0.31.1
16
+ Requires-Dist: hirola>=0.3.0
17
+ Requires-Dist: numpy
18
+ Requires-Dist: pandas
19
+ Requires-Dist: pgenlib>=0.92.0
20
+ Requires-Dist: phantom-types>=3.0.2
21
+ Requires-Dist: polars>=1.27.1
22
+ Requires-Dist: pyarrow>=19.0.1
23
+ Requires-Dist: pyranges==0.1.3
24
+ Requires-Dist: setuptools>=78.1.0
25
+ Requires-Dist: tqdm>=4.67.1
26
+ Requires-Dist: typing-extensions>=4.13.2
File without changes
@@ -0,0 +1,5 @@
1
+ from ._pgen import PGEN
2
+ from ._types import Reader
3
+ from ._vcf import VCF
4
+
5
+ __all__ = ["Reader", "VCF", "PGEN"]
@@ -0,0 +1,414 @@
1
+ from __future__ import annotations
2
+
3
+ from functools import partial
4
+ from pathlib import Path
5
+ from typing import Generator, TypeVar, cast
6
+
7
+ import numpy as np
8
+ import pgenlib
9
+ import polars as pl
10
+ import pyranges as pr
11
+ from hirola import HashTable
12
+ from numpy.typing import ArrayLike, NDArray
13
+ from phantom import Phantom
14
+ from typing_extensions import Self, TypeGuard, assert_never
15
+
16
+ from ._types import Reader
17
+ from ._utils import (
18
+ ContigNormalizer,
19
+ format_memory,
20
+ is_dtype,
21
+ lengths_to_offsets,
22
+ parse_memory,
23
+ )
24
+
25
+
26
+ def _is_genos_dosages(obj) -> TypeGuard[tuple[Genos, Dosages]]:
27
+ """Check if the object is a tuple of genotypes and dosages.
28
+
29
+ Parameters
30
+ ----------
31
+ obj
32
+ Object to check.
33
+
34
+ Returns
35
+ -------
36
+ bool
37
+ True if the object is a tuple of genotypes and dosages, False otherwise.
38
+ """
39
+ return (
40
+ isinstance(obj, tuple)
41
+ and len(obj) == 2
42
+ and isinstance(obj[0], Genos)
43
+ and isinstance(obj[1], Dosages)
44
+ )
45
+
46
+
47
# Phantom type for genotype arrays. ``isinstance(x, Genos)`` is decided by the
# predicate ``is_dtype(x, dtype=np.int32)``; PGEN.read relies on this to validate
# a caller-supplied ``out`` buffer ("Expected a np.int32 array").
# NOTE(review): ``is_dtype`` lives in ``._utils``; presumably it checks the array's dtype — confirm.
class Genos(
    NDArray[np.int32], Phantom, predicate=partial(is_dtype, dtype=np.int32)
): ...
50
+
51
+
52
# Phantom type for dosage arrays. ``isinstance(x, Dosages)`` is decided by the
# predicate ``is_dtype(x, dtype=np.float32)``.
# NOTE(review): ``is_dtype`` lives in ``._utils``; presumably it checks the array's dtype — confirm.
class Dosages(
    NDArray[np.float32], Phantom, predicate=partial(is_dtype, dtype=np.float32)
): ...
55
+
56
+
57
# Phantom type for a ``(Genos, Dosages)`` 2-tuple, validated by ``_is_genos_dosages``.
class GenosDosages(tuple[Genos, Dosages], Phantom, predicate=_is_genos_dosages): ...
58
+
59
+
60
# Constrained type variable for what a PGEN reader returns: genotypes only,
# dosages only, or a (genotypes, dosages) pair.
T = TypeVar(
    "T",
    Genos,
    Dosages,
    GenosDosages,
)
66
+
67
+
68
class PGEN(Reader[T]):
    """Read genotypes from a PLINK 2 PGEN file by genomic range.

    Sample IDs come from the sibling ``.psam`` file and variant positions from the
    sibling ``.pvar`` file. Variant ranges are cached in a ``.gvi`` index next to the
    PGEN file, which is written on first use if it does not exist yet.

    Dosages are not yet supported: requesting ``Dosages`` or ``GenosDosages`` raises
    ``NotImplementedError``.
    """

    available_samples: list[str]
    filter: pl.Expr | None
    ploidy = 2
    contigs: list[str]
    _index: pr.PyRanges
    _geno_pgen: pgenlib.PgenReader
    _dose_pgen: pgenlib.PgenReader
    # On-disk indices of the current samples, in the order the caller requested them.
    _s_idx: NDArray[np.uint32]
    # Position of each current sample within the (sorted) subset that pgenlib returns.
    _s_sorter: NDArray[np.intp]
    _read_as: type[T]

    Genos = Genos
    Dosages = Dosages
    GenosDosages = GenosDosages

    def __init__(
        self,
        geno_path: str | Path,
        filter: pl.Expr | None = None,
        read_as: type[T] = Genos,
        dosage_path: str | Path | None = None,
    ):
        """Open a PGEN file and load (or build) its variant index.

        Parameters
        ----------
        geno_path
            Path to the PGEN file with genotypes. The ``.psam`` and ``.pvar`` files are
            looked up by swapping the suffix of this path.
        filter
            Optional polars expression applied to the variant index; only variants
            that pass are visible to the reader.
        read_as
            Type of data to return. Only ``Genos`` is currently supported.
        dosage_path
            Optional path to a second PGEN file holding dosages.

        Raises
        ------
        NotImplementedError
            If ``read_as`` is ``Dosages`` or ``GenosDosages``.
        ValueError
            If the samples of the dosage file differ from those of the genotype file.
        """
        # TODO: support dosages and allow user to either provide a second PGEN file for dosages
        # or else use the same PGEN file for both genotypes and dosages.
        # That being said, there's probably not much point for a user to use the same PGEN file
        # for genos and dosages since PLINK2 defines hardcalls as a simple threshold on the dosages
        # when dosages are available.
        if read_as is Dosages or read_as is GenosDosages:
            raise NotImplementedError("PGEN dosages are not yet supported.")

        geno_path = Path(geno_path)
        samples = _read_psam(geno_path.with_suffix(".psam"))

        self.filter = filter
        self.available_samples = samples.tolist()
        self._s2i = HashTable(
            max=len(samples) * 2,  # type: ignore
            dtype=samples.dtype,
        )
        self._s2i.add(samples)
        self._s_idx = np.arange(len(samples), dtype=np.uint32)
        # With every sample selected the subset order equals the on-disk order.
        self._s_sorter = np.arange(len(samples), dtype=np.intp)
        self._geno_pgen = pgenlib.PgenReader(bytes(geno_path))

        if dosage_path is not None:
            dosage_path = Path(dosage_path)
            dose_samples = _read_psam(dosage_path.with_suffix(".psam"))
            # array_equal also copes with arrays of different lengths, where an
            # elementwise `!=` would fail to broadcast.
            if not np.array_equal(samples, dose_samples):
                raise ValueError(
                    "Samples in dosage file do not match those in genotype file."
                )
            self._dose_pgen = pgenlib.PgenReader(bytes(dosage_path))
        else:
            self._dose_pgen = self._geno_pgen

        if not geno_path.with_suffix(".gvi").exists():
            _write_index(geno_path.with_suffix(".pvar"))
        self._index = _read_index(geno_path.with_suffix(".gvi"), self.filter)
        self.contigs = self._index.chromosomes
        self._c_norm = ContigNormalizer(self._index.chromosomes)
        self._read_as = read_as

    @property
    def current_samples(self) -> list[str]:
        """The samples this reader will return, in order along the sample axis."""
        return self._s2i.keys[self._s_idx].tolist()

    def set_samples(self, samples: list[str]) -> Self:
        """Restrict the reader to ``samples``, returned in the order given.

        Raises
        ------
        ValueError
            If any of the samples is not present in the file.
        """
        _samples = np.atleast_1d(samples)
        raw_idx = np.asarray(self._s2i.get(_samples))
        # Look for the -1 "not found" marker *before* casting to an unsigned dtype,
        # otherwise it wraps around and can never be detected.
        missing = _samples[raw_idx == -1]
        if missing.size > 0:
            raise ValueError(f"Samples {missing} not found in the file.")

        s_idx = raw_idx.astype(np.uint32)
        subset = np.sort(s_idx)
        self._s_idx = s_idx
        # pgenlib returns the subset in ascending on-disk order, so remember where
        # each requested sample ends up within that subset.
        self._s_sorter = np.searchsorted(subset, s_idx)
        self._geno_pgen.change_sample_subset(subset)
        return self

    def __del__(self):
        # __init__ may have raised before the readers were created.
        geno = getattr(self, "_geno_pgen", None)
        dose = getattr(self, "_dose_pgen", None)
        if geno is not None:
            geno.close()
        # Without a separate dosage file both attributes are the same reader.
        if dose is not None and dose is not geno:
            dose.close()

    def _query_frame(
        self, contig: str, starts: NDArray, ends: ArrayLike | None
    ) -> pl.DataFrame:
        """Build the table of query ranges on ``contig``; open-ended when ``ends`` is None."""
        if ends is None:
            ends = np.full_like(starts, np.iinfo(np.int32).max)
        return pl.DataFrame(
            {
                # A plain list keeps the contig name a string regardless of the
                # dtype of `starts`.
                "Chromosome": [contig] * len(starts),
                "Start": starts,
                "End": ends,
            }
        )

    def n_vars_in_ranges(
        self,
        contig: str,
        starts: ArrayLike = 0,
        ends: ArrayLike | None = None,
    ) -> NDArray[np.uint32]:
        """Count the variants overlapping each range.

        Parameters
        ----------
        contig
            Contig name.
        starts
            0-based start positions of the regions.
        ends
            0-based, exclusive end positions of the regions. Open-ended if None.

        Returns
        -------
        n_variants
            Shape: (regions). Number of variants in each range; all zeros when the
            contig is not in the file.
        """
        starts = np.atleast_1d(starts)

        c = self._c_norm.norm(contig)
        if c is None:
            return np.zeros_like(starts, dtype=np.uint32)

        queries = pr.PyRanges(
            self._query_frame(c, starts, ends).to_pandas(
                use_pyarrow_extension_array=True
            )
        )
        return (
            queries.count_overlaps(self._index)
            .df["NumberOverlaps"]
            .to_numpy()
            .astype(np.uint32)
        )

    def _var_idxs(
        self, contig: str, starts: ArrayLike = 0, ends: ArrayLike | None = None
    ) -> tuple[NDArray[np.uint32], NDArray[np.uint64]]:
        """Get variant indices and the offsets delimiting each region's indices.

        Parameters
        ----------
        contig
            Contig name.
        starts
            0-based start positions of the regions.
        ends
            0-based, exclusive end positions of the regions.

        Returns
        -------
        idxs
            Shape: (tot_variants). Variant indices for the given ranges.
        offsets
            Shape: (regions+1). Offsets to get variant indices for each region.
        """
        starts = np.atleast_1d(starts)
        n_regions = len(starts)

        c = self._c_norm.norm(contig)
        if c is None:
            return np.empty(0, np.uint32), np.zeros(n_regions + 1, np.uint64)

        queries = pr.PyRanges(
            self._query_frame(c, starts, ends)
            .with_row_index("query")
            .to_pandas(use_pyarrow_extension_array=True)
        )
        join = pl.from_pandas(queries.join(self._index).df)
        if join.height == 0:
            return np.empty(0, np.uint32), np.zeros(n_regions + 1, np.uint64)

        join = join.sort("query", "index")
        idxs = join["index"].to_numpy()
        # Regions without any variant do not appear in the join, so count per query id
        # with an explicit minimum length to keep one entry per region.
        lens = np.bincount(
            join["query"].to_numpy().astype(np.intp), minlength=n_regions
        ).astype(np.uint32)
        offsets = lengths_to_offsets(lens)
        return idxs, offsets

    def _read_genos(
        self, var_idxs: NDArray[np.uint32], buf: NDArray[np.int32] | None = None
    ) -> NDArray[np.int32]:
        """Read alleles for ``var_idxs``; result has shape (samples, ploidy, variants).

        ``buf`` is an optional (variants, samples * ploidy) int32 scratch buffer that
        pgenlib fills; a new one is allocated when it is None.
        """
        n_variants = len(var_idxs)
        n_samples = self.n_samples
        if buf is None:
            buf = np.empty((n_variants, n_samples * self.ploidy), dtype=np.int32)

        self._geno_pgen.read_alleles_list(var_idxs, buf)
        # Reorder from pgenlib's ascending subset order to the requested sample order.
        genos = buf.reshape(n_variants, n_samples, self.ploidy).transpose(1, 2, 0)[
            self._s_sorter
        ]
        # pgenlib marks missing alleles with -9; this package uses -1.
        genos[genos == -9] = -1
        return genos

    def read(
        self,
        contig: str,
        start: int = 0,
        end: int | None = None,
        out: T | None = None,
    ) -> T | None:
        """Read genotypes for a region.

        Parameters
        ----------
        contig
            Contig name.
        start
            0-based start position of the region.
        end
            0-based, exclusive end position of the region. Open-ended if None.
        out
            Optional np.int32 buffer of shape (variants, samples * ploidy) that the raw
            alleles are read into.

        Returns
        -------
        data
            Genotypes with shape (samples, ploidy, variants), missing values as -1.
            None if the contig is not in the file or the region has no variants.

        Raises
        ------
        ValueError
            If ``out`` is given but is not a np.int32 array.
        """
        c = self._c_norm.norm(contig)
        if c is None:
            return None

        # `end=None` is resolved to an open-ended range by `_var_idxs`.
        var_idxs, _ = self._var_idxs(c, start, end)
        if len(var_idxs) == 0:
            return None

        # TODO: support dosages

        if out is not None and not isinstance(out, Genos):
            raise ValueError(f"Expected a np.int32 array, got {type(out)}.")

        return cast(T, self._read_genos(var_idxs, out))

    def read_chunks(
        self,
        contig: str,
        start: int = 0,
        end: int | None = None,
        max_mem: int | str = "4g",
    ) -> Generator[T]:
        """Iterate over genotypes for a region in chunks limited by ``max_mem``.

        Parameters
        ----------
        contig
            Contig name.
        start
            0-based start position of the region.
        end
            0-based, exclusive end position of the region. Open-ended if None.
        max_mem
            Memory budget per chunk, as a number of bytes or a string such as "4g".

        Yields
        ------
        data
            Genotypes with shape (samples, ploidy, variants), missing values as -1.
            Nothing is yielded if the contig is not in the file or the region has no
            variants.

        Raises
        ------
        ValueError
            If ``max_mem`` is too small to hold a single variant.
        """
        # TODO: support dosages

        max_mem = parse_memory(max_mem)

        c = self._c_norm.norm(contig)
        if c is None:
            return

        var_idxs, _ = self._var_idxs(c, start, end)
        n_variants = len(var_idxs)
        if n_variants == 0:
            return

        mem_per_v = self._mem_per_variant()
        vars_per_chunk = min(max_mem // mem_per_v, n_variants)
        if vars_per_chunk == 0:
            raise ValueError(
                f"Maximum memory {format_memory(max_mem)} insufficient to read a single variant."
                f" Memory per variant: {format_memory(mem_per_v)}."
            )

        # Ceiling division: the fewest chunks that keep each one within the budget.
        n_chunks = -(-n_variants // vars_per_chunk)
        for chunk_idxs in np.array_split(var_idxs, n_chunks):
            yield cast(T, self._read_genos(chunk_idxs))

    def read_ranges(
        self,
        contig: str,
        starts: ArrayLike = 0,
        ends: ArrayLike | None = None,
    ) -> tuple[T, NDArray[np.uint32]] | None:
        """Read genotypes for multiple regions at once.

        Parameters
        ----------
        contig
            Contig name.
        starts
            0-based start positions of the regions.
        ends
            0-based, exclusive end positions of the regions. Open-ended if None.

        Returns
        -------
        data
            Genotypes with shape (samples, ploidy, variants) for all regions
            concatenated along the variant axis, missing values as -1.
        n_variants_per_region
            Shape: (regions). Number of variants belonging to each region.

        None is returned instead if the contig is not in the file or no region has
        any variant.
        """
        # TODO: support dosages

        starts = np.atleast_1d(starts)

        c = self._c_norm.norm(contig)
        if c is None:
            return None

        var_idxs, offsets = self._var_idxs(c, starts, ends)
        if len(var_idxs) == 0:
            return None

        genos = self._read_genos(var_idxs)
        return cast(T, genos), np.diff(offsets).astype(np.uint32)

    def _mem_per_variant(self) -> int:
        """Bytes needed to hold one variant for all current samples."""
        if issubclass(self._read_as, Genos):
            return self.n_samples * self.ploidy * np.int32().itemsize
        elif issubclass(self._read_as, (Dosages, GenosDosages)):
            raise NotImplementedError("Dosages are not yet supported.")
        else:
            assert_never(self._read_as)
349
+
350
+
351
def _read_psam(path: Path) -> NDArray[np.str_]:
    """Load the sample IDs (``IID`` column) from the ``.psam`` file matching ``path``.

    The first line is taken as the header; its whitespace-separated names, with any
    ``#`` stripped, become the column names. The remaining lines are parsed as
    tab-separated values, reading the standard ID-like columns as strings.
    """
    psam_path = path.with_suffix(".psam")

    with open(psam_path) as f:
        header = f.readline()
    cols = [name.strip("#") for name in header.strip().split()]

    text_columns = ("FID", "IID", "SID", "PAT", "MAT", "SEX")
    psam = pl.read_csv(
        psam_path,
        separator="\t",
        has_header=False,
        skip_rows=1,
        new_columns=cols,
        schema_overrides={name: pl.Utf8 for name in text_columns},
    )
    return psam["IID"].to_numpy().astype(str)
372
+
373
+
374
# Polars expressions over the REF/ALT columns of a .pvar table, used when building the index.
RLEN = pl.col("REF").str.len_bytes()  # length of REF in bytes
ALEN = pl.col("ALT").str.len_bytes()  # length of the ALT column in bytes
ILEN = ALEN - RLEN  # ALT length minus REF length
# Variant class: "INDEL" when the REF and ALT lengths differ, "SNP" when they are
# equal and REF is a single byte, "MNP" otherwise.
KIND = (
    pl.when(ILEN != 0)
    .then(pl.lit("INDEL"))
    .when(RLEN == 1)
    .then(pl.lit("SNP"))
    .otherwise(pl.lit("MNP"))
    .cast(pl.Categorical)
)
385
+
386
+
387
+ # TODO: index can likely be implemented using the NCLS lib underlying PyRanges and then we can
388
+ # pass np.memmap arrays directly instead of having to futz with DataFrames. This will likely make
389
+ # filtering less ergonomic/harder to make ergonomic though, but a memmap approach will be scalable
390
+ # to ultra-large datasets (100k+ individuals).
391
def _write_index(path: Path):
    """Build the ``.gvi`` variant index for the ``.pvar`` file matching ``path``.

    The index is an Arrow IPC file with one row per variant: ``Chromosome``, 0-based
    ``Start``, exclusive ``End`` (start plus REF length) and the variant ``kind``.
    """
    pvar = pl.scan_csv(
        path.with_suffix(".pvar"),
        separator="\t",
        comment_prefix="##",
        schema_overrides={"#CHROM": pl.Utf8, "POS": pl.Int32},
    )
    # POS is 1-based in the .pvar file.
    pos = pl.col("POS")
    index = pvar.select(
        Chromosome="#CHROM",
        Start=pos - 1,
        End=pos + RLEN - 1,
        kind=KIND,
    )
    index.sink_ipc(path.with_suffix(".gvi"))
407
+
408
+
409
def _read_index(path: Path, filter: pl.Expr | None) -> pr.PyRanges:
    """Load a ``.gvi`` index as PyRanges, optionally keeping only rows matching ``filter``.

    Each row receives an ``index`` column holding its position in the index file
    (assigned before filtering); the ``kind`` column is dropped from the result.
    """
    table = pl.read_ipc(path, row_index_name="index", memory_map=False)
    table = table if filter is None else table.filter(filter)
    ranges = table.drop("kind").to_pandas(use_pyarrow_extension_array=True)
    return pr.PyRanges(ranges)
@@ -0,0 +1,154 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Generator, Generic, Protocol, TypeVar
4
+
5
+ import numpy as np
6
+ from numpy.typing import ArrayLike, NDArray
7
+ from typing_extensions import Self
8
+
9
+ T = TypeVar("T")
10
+
11
+
12
class Reader(Protocol, Generic[T]):
    """Interface shared by the variant file readers.

    ``T`` is the type of data a reader returns: genotypes, dosages, or both.
    """

    available_samples: list[str]
    """All samples in the file, in the order they exist on-disk."""
    # Number of allele copies per sample (second axis of genotype arrays).
    ploidy: int
    # Optional reader-specific filter on variants.
    filter: Any | None
    # Names of the contigs known to the reader.
    contigs: list[str]

    @property
    def current_samples(self) -> list[str]:
        """The samples this reader will return, in order along the sample axis."""
        ...

    def set_samples(self, samples: list[str]) -> Self:
        """Set the samples this reader will return, in order along the sample axis."""
        ...

    @property
    def n_samples(self) -> int:
        """Number of samples this reader will return."""
        return len(self.current_samples)

    def n_vars_in_ranges(
        self, contig: str, starts: ArrayLike = 0, ends: ArrayLike | None = None
    ) -> NDArray[np.uint32]:
        """Return the number of variants in each of the given ranges.

        Parameters
        ----------
        contig
            Contig name.
        starts
            0-based start positions of the regions.
        ends
            0-based, exclusive end positions of the regions.

        Returns
        -------
        n_variants
            Shape: (regions). Number of variants in the given ranges.
        """
        ...

    def read(
        self,
        contig: str,
        start: int = 0,
        end: int | None = None,
        out: T | None = None,
    ) -> T | None:
        """Read genotypes and/or dosages for a region.

        Parameters
        ----------
        contig
            Contig name.
        start
            0-based start position of the region.
        end
            0-based, exclusive end position of the region.
        out
            Optional pre-allocated output to read into.

        Returns
        -------
        data
            Genotypes and/or dosages. Genotypes have shape (samples ploidy variants) and
            dosages have shape (samples variants). Missing genotypes have value -1 and missing dosages
            have value np.nan. If just using genotypes or dosages, will be a single array, otherwise
            will be a tuple of arrays. May be None when there is nothing to return, e.g. the
            contig is not in the file or the region contains no variants.
        """
        ...

    def read_chunks(
        self,
        contig: str,
        start: int = 0,
        end: int | None = None,
        max_mem: int | str = "4g",
    ) -> Generator[T]:
        """Iterate over genotypes and/or dosages for a region in chunks limited by max_mem.

        Parameters
        ----------
        contig
            Contig name.
        start
            0-based start position.
        end
            0-based, exclusive end position of the region.
        max_mem
            Maximum memory to use per chunk, either as a number of bytes or as a string
            such as "4g".

        Returns
        -------
        data
            Generator of genotypes and/or dosages. Genotypes have shape (samples ploidy variants) and
            dosages have shape (samples variants). Missing genotypes have value -1 and missing dosages
            have value np.nan. If just using genotypes or dosages, will be a single array, otherwise
            will be a tuple of arrays.
        """
        ...

    def read_ranges(
        self,
        contig: str,
        starts: ArrayLike = 0,
        ends: ArrayLike | None = None,
    ) -> tuple[T, NDArray[np.uint32]] | None:
        """Read genotypes and/or dosages for multiple regions.

        Parameters
        ----------
        contig
            Contig name.
        starts
            0-based start positions of the regions.
        ends
            0-based, exclusive end positions of the regions.

        Returns
        -------
        data
            Genotypes and/or dosages. Genotypes have shape (samples ploidy variants) and
            dosages have shape (samples variants). Missing genotypes have value -1 and missing dosages
            have value np.nan. If just using genotypes or dosages, will be a single array, otherwise
            will be a tuple of arrays.
        n_variants_per_region
            Shape: (regions). Number of variants in the given ranges.

        May be None instead of a tuple when there is nothing to return, e.g. the contig is
        not in the file or no region contains any variant.
        """
        ...