genoray 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genoray-0.1.0/.gitattributes +2 -0
- genoray-0.1.0/.gitignore +8 -0
- genoray-0.1.0/.pre-commit-config.yaml +19 -0
- genoray-0.1.0/.python-version +1 -0
- genoray-0.1.0/CHANGELOG.md +16 -0
- genoray-0.1.0/LICENSE.txt +7 -0
- genoray-0.1.0/PKG-INFO +26 -0
- genoray-0.1.0/README.md +0 -0
- genoray-0.1.0/genoray/__init__.py +5 -0
- genoray-0.1.0/genoray/_pgen.py +414 -0
- genoray-0.1.0/genoray/_types.py +154 -0
- genoray-0.1.0/genoray/_utils.py +114 -0
- genoray-0.1.0/genoray/_vcf.py +434 -0
- genoray-0.1.0/genoray/py.typed +0 -0
- genoray-0.1.0/pixi.lock +2092 -0
- genoray-0.1.0/pixi.toml +22 -0
- genoray-0.1.0/pyproject.toml +34 -0
- genoray-0.1.0/tests/data/gen_from_vcf.sh +12 -0
- genoray-0.1.0/tests/data/test.gvi +0 -0
- genoray-0.1.0/tests/data/test.log +26 -0
- genoray-0.1.0/tests/data/test.pgen +0 -0
- genoray-0.1.0/tests/data/test.psam +3 -0
- genoray-0.1.0/tests/data/test.pvar +4 -0
- genoray-0.1.0/tests/data/test.vcf +7 -0
- genoray-0.1.0/tests/data/test.vcf.gz +0 -0
- genoray-0.1.0/tests/data/test.vcf.gz.csi +0 -0
- genoray-0.1.0/tests/test_pgen.py +66 -0
- genoray-0.1.0/tests/test_utils.py +177 -0
- genoray-0.1.0/tests/test_vcf.py +67 -0
- genoray-0.1.0/uv.lock +668 -0
genoray-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
3
|
+
rev: v5.0.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: check-merge-conflict
|
|
6
|
+
- id: debug-statements
|
|
7
|
+
- id: mixed-line-ending
|
|
8
|
+
- id: check-case-conflict
|
|
9
|
+
- id: check-yaml
|
|
10
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
11
|
+
rev: v0.9.4
|
|
12
|
+
hooks:
|
|
13
|
+
- id: ruff
|
|
14
|
+
- id: ruff-format
|
|
15
|
+
- repo: https://github.com/commitizen-tools/commitizen
|
|
16
|
+
rev: v4.4.1
|
|
17
|
+
hooks:
|
|
18
|
+
- id: commitizen
|
|
19
|
+
stages: [commit-msg]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.9
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
## 0.1.0 (2025-04-12)
|
|
2
|
+
|
|
3
|
+
### Feat
|
|
4
|
+
|
|
5
|
+
- sketching out support for PGEN dosages
|
|
6
|
+
- refactor readers to be type safe. pass all tests.
|
|
7
|
+
- **wip**: reasonable output from PGEN in notebook
|
|
8
|
+
- initial PGEN support
|
|
9
|
+
- rename package to genoray
|
|
10
|
+
- rename package to genoray
|
|
11
|
+
- **wip**: initial prototype of VCF reader
|
|
12
|
+
- **wip**: VCF support
|
|
13
|
+
|
|
14
|
+
### Fix
|
|
15
|
+
|
|
16
|
+
- use future annotations for union types
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright (c) 2025 David Laub
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
genoray-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: genoray
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Read genotypes and dosages from VCF and PGEN files
|
|
5
|
+
Author-email: David Laub <dlaub@ucsd.edu>
|
|
6
|
+
License: Copyright (c) 2025 David Laub
|
|
7
|
+
|
|
8
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
11
|
+
|
|
12
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
13
|
+
License-File: LICENSE.txt
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Requires-Dist: cyvcf2>=0.31.1
|
|
16
|
+
Requires-Dist: hirola>=0.3.0
|
|
17
|
+
Requires-Dist: numpy
|
|
18
|
+
Requires-Dist: pandas
|
|
19
|
+
Requires-Dist: pgenlib>=0.92.0
|
|
20
|
+
Requires-Dist: phantom-types>=3.0.2
|
|
21
|
+
Requires-Dist: polars>=1.27.1
|
|
22
|
+
Requires-Dist: pyarrow>=19.0.1
|
|
23
|
+
Requires-Dist: pyranges==0.1.3
|
|
24
|
+
Requires-Dist: setuptools>=78.1.0
|
|
25
|
+
Requires-Dist: tqdm>=4.67.1
|
|
26
|
+
Requires-Dist: typing-extensions>=4.13.2
|
genoray-0.1.0/README.md
ADDED
|
File without changes
|
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from functools import partial
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Generator, TypeVar, cast
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pgenlib
|
|
9
|
+
import polars as pl
|
|
10
|
+
import pyranges as pr
|
|
11
|
+
from hirola import HashTable
|
|
12
|
+
from numpy.typing import ArrayLike, NDArray
|
|
13
|
+
from phantom import Phantom
|
|
14
|
+
from typing_extensions import Self, TypeGuard, assert_never
|
|
15
|
+
|
|
16
|
+
from ._types import Reader
|
|
17
|
+
from ._utils import (
|
|
18
|
+
ContigNormalizer,
|
|
19
|
+
format_memory,
|
|
20
|
+
is_dtype,
|
|
21
|
+
lengths_to_offsets,
|
|
22
|
+
parse_memory,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _is_genos_dosages(obj) -> TypeGuard[tuple[Genos, Dosages]]:
|
|
27
|
+
"""Check if the object is a tuple of genotypes and dosages.
|
|
28
|
+
|
|
29
|
+
Parameters
|
|
30
|
+
----------
|
|
31
|
+
obj
|
|
32
|
+
Object to check.
|
|
33
|
+
|
|
34
|
+
Returns
|
|
35
|
+
-------
|
|
36
|
+
bool
|
|
37
|
+
True if the object is a tuple of genotypes and dosages, False otherwise.
|
|
38
|
+
"""
|
|
39
|
+
return (
|
|
40
|
+
isinstance(obj, tuple)
|
|
41
|
+
and len(obj) == 2
|
|
42
|
+
and isinstance(obj[0], Genos)
|
|
43
|
+
and isinstance(obj[1], Dosages)
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Phantom type for genotype arrays: an ``NDArray[np.int32]`` whose instance
# check is delegated to ``is_dtype(obj, dtype=np.int32)``.
# NOTE(review): ``is_dtype`` lives in ``._utils``; presumably it tests that the
# object is an ndarray of the given dtype -- confirm against that helper.
class Genos(
    NDArray[np.int32], Phantom, predicate=partial(is_dtype, dtype=np.int32)
): ...
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Phantom type for dosage arrays: an ``NDArray[np.float32]`` whose instance
# check is delegated to ``is_dtype(obj, dtype=np.float32)``.
# NOTE(review): ``is_dtype`` lives in ``._utils``; presumably it tests that the
# object is an ndarray of the given dtype -- confirm against that helper.
class Dosages(
    NDArray[np.float32], Phantom, predicate=partial(is_dtype, dtype=np.float32)
): ...
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Phantom type for a ``(Genos, Dosages)`` tuple; membership is decided by
# ``_is_genos_dosages``.
class GenosDosages(tuple[Genos, Dosages], Phantom, predicate=_is_genos_dosages): ...
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Type variable constrained to the three data types a PGEN reader can be
# parameterized with: ``Genos``, ``Dosages`` or ``GenosDosages``.
T = TypeVar(
    "T",
    Genos,
    Dosages,
    GenosDosages,
)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class PGEN(Reader[T]):
    """Read genotypes from a PGEN file and its sibling ``.psam``/``.pvar`` files.

    Variant positions are looked up through a ``.gvi`` index stored next to the
    PGEN file; the index is written from the ``.pvar`` file when it does not
    exist yet. Dosages are not supported yet: constructing a reader with
    ``read_as=Dosages`` or ``read_as=GenosDosages`` raises
    ``NotImplementedError``.
    """

    available_samples: list[str]
    filter: pl.Expr | None
    ploidy = 2
    contigs: list[str]
    _index: pr.PyRanges
    _geno_pgen: pgenlib.PgenReader
    _dose_pgen: pgenlib.PgenReader
    # On-disk indices of the selected samples, in the order the caller asked for.
    _s_idx: NDArray[np.uint32]
    # For each selected sample, its position inside the (sorted) subset that
    # pgenlib actually returns; used to restore the caller's requested order.
    _s_sorter: NDArray[np.intp]
    _read_as: type[T]

    Genos = Genos
    Dosages = Dosages
    GenosDosages = GenosDosages

    def __init__(
        self,
        geno_path: str | Path,
        filter: pl.Expr | None = None,
        read_as: type[T] = Genos,
        dosage_path: str | Path | None = None,
    ):
        """Open a PGEN file for reading.

        Parameters
        ----------
        geno_path
            Path to the PGEN file holding genotypes. The ``.psam``, ``.pvar``
            and ``.gvi`` files are located by swapping this path's suffix.
        filter
            Optional polars expression applied to the variant index when it is
            loaded.
        read_as
            Type of data to return. Only ``Genos`` is currently supported.
        dosage_path
            Optional path to a second PGEN file for dosages. Its ``.psam`` must
            list the same samples as the genotype file.

        Raises
        ------
        NotImplementedError
            If ``read_as`` is ``Dosages`` or ``GenosDosages``.
        ValueError
            If the dosage file's samples differ from the genotype file's.
        """
        # TODO: support dosages and allow user to either provide a second PGEN file for dosages
        # or else use the same PGEN file for both genotypes and dosages.
        # That being said, there's probably not much point for a user to use the same PGEN file
        # for genos and dosages since PLINK2 defines hardcalls as a simple threshold on the dosages
        # when dosages are available.
        if read_as is Dosages or read_as is GenosDosages:
            raise NotImplementedError("PGEN dosages are not yet supported.")

        geno_path = Path(geno_path)
        samples = _read_psam(geno_path.with_suffix(".psam"))

        self.filter = filter
        self.available_samples = samples.tolist()
        self._s2i = HashTable(
            max=len(samples) * 2,  # type: ignore
            dtype=samples.dtype,
        )
        self._s2i.add(samples)
        self._s_idx = np.arange(len(samples), dtype=np.uint32)
        # With every sample selected the subset order equals the on-disk order.
        self._s_sorter = np.arange(len(samples), dtype=np.intp)
        self._geno_pgen = pgenlib.PgenReader(bytes(geno_path))

        if dosage_path is not None:
            dosage_path = Path(dosage_path)
            dose_samples = _read_psam(dosage_path.with_suffix(".psam"))
            # array_equal also copes with sample lists of different lengths,
            # where an elementwise ``!=`` would not yield an array.
            if not np.array_equal(samples, dose_samples):
                raise ValueError(
                    "Samples in dosage file do not match those in genotype file."
                )
            self._dose_pgen = pgenlib.PgenReader(bytes(dosage_path))
        else:
            self._dose_pgen = self._geno_pgen

        if not geno_path.with_suffix(".gvi").exists():
            _write_index(geno_path.with_suffix(".pvar"))
        self._index = _read_index(geno_path.with_suffix(".gvi"), self.filter)
        self.contigs = self._index.chromosomes
        self._c_norm = ContigNormalizer(self._index.chromosomes)
        self._read_as = read_as

    @property
    def current_samples(self) -> list[str]:
        """The samples this reader will return, in order along the sample axis."""
        return self._s2i.keys[self._s_idx].tolist()

    def set_samples(self, samples: list[str]) -> Self:
        """Restrict and reorder the samples returned by subsequent reads.

        Parameters
        ----------
        samples
            Sample names to read, in the order they should appear along the
            sample axis.

        Returns
        -------
        Self
            This reader, to allow chaining.

        Raises
        ------
        ValueError
            If any requested sample is not present in the file.
        """
        _samples = np.atleast_1d(samples)
        s_idx = np.asarray(self._s2i.get(_samples))
        # Check for the "missing" marker (-1) BEFORE casting to an unsigned
        # dtype; after the cast -1 would wrap around and never compare equal.
        missing = _samples[s_idx == -1]
        if len(missing) > 0:
            raise ValueError(f"Samples {missing} not found in the file.")

        s_idx = s_idx.astype(np.uint32)
        sorted_idx = np.sort(s_idx)
        self._s_idx = s_idx
        # The PGEN reader is given the subset in sorted order, so remember where
        # each requested sample lands inside that sorted subset.
        self._s_sorter = np.searchsorted(sorted_idx, s_idx)
        self._geno_pgen.change_sample_subset(sorted_idx)
        return self

    def __del__(self):
        # __init__ may have raised before the readers were created, so do not
        # assume the attributes exist.
        geno = getattr(self, "_geno_pgen", None)
        dose = getattr(self, "_dose_pgen", None)
        if geno is not None:
            geno.close()
        # Without a dosage file both attributes are the same reader; close once.
        if dose is not None and dose is not geno:
            dose.close()

    def _query_frame(
        self, contig: str, starts: NDArray, ends: ArrayLike | None
    ) -> pl.DataFrame:
        """Build the Chromosome/Start/End frame used to query the variant index."""
        if ends is None:
            ends = np.full_like(starts, np.iinfo(np.int32).max)
        return pl.DataFrame(
            {
                # A list of str keeps the contig name as text; writing it into
                # an array shaped like the integer ``starts`` would try to cast
                # the name to an integer.
                "Chromosome": [contig] * len(starts),
                "Start": starts,
                "End": ends,
            }
        )

    def _read_genos(
        self, var_idxs: NDArray[np.uint32], buffer: NDArray[np.int32] | None = None
    ) -> NDArray[np.int32]:
        """Read alleles for the given variant indices.

        ``buffer`` is the array handed to ``read_alleles_list``; when omitted a
        ``(variants, samples * ploidy)`` int32 array is allocated. The result
        has shape (samples, ploidy, variants), follows the order of
        ``current_samples`` and uses -1 where the reader reported -9.
        """
        n_variants = len(var_idxs)
        n_samples = self.n_samples
        if buffer is None:
            buffer = np.empty((n_variants, n_samples * self.ploidy), dtype=np.int32)

        self._geno_pgen.read_alleles_list(var_idxs, buffer)
        # Integer-array indexing copies, so the -9 -> -1 rewrite below does not
        # touch ``buffer``.
        genos = buffer.reshape(n_variants, n_samples, self.ploidy).transpose(1, 2, 0)[
            self._s_sorter
        ]
        genos[genos == -9] = -1
        return genos

    def n_vars_in_ranges(
        self,
        contig: str,
        starts: ArrayLike = 0,
        ends: ArrayLike | None = None,
    ) -> NDArray[np.uint32]:
        """Count the variants overlapping each of the given ranges.

        Parameters
        ----------
        contig
            Contig name.
        starts
            0-based start positions of the regions.
        ends
            0-based, exclusive end positions of the regions. Defaults to the
            maximum int32 value for every region.

        Returns
        -------
        n_variants
            Shape: (regions). Number of variants in the given ranges; all
            zeros if the contig is not recognized.
        """
        starts = np.atleast_1d(starts)

        c = self._c_norm.norm(contig)
        if c is None:
            return np.zeros_like(starts, dtype=np.uint32)

        queries = pr.PyRanges(
            self._query_frame(c, starts, ends).to_pandas(
                use_pyarrow_extension_array=True
            )
        )
        return (
            queries.count_overlaps(self._index)
            .df["NumberOverlaps"]
            .to_numpy()
            .astype(np.uint32)
        )

    def _var_idxs(
        self, contig: str, starts: ArrayLike = 0, ends: ArrayLike | None = None
    ) -> tuple[NDArray[np.uint32], NDArray[np.uint64]]:
        """Get variant indices and the number of indices per region.

        Parameters
        ----------
        contig
            Contig name.
        starts
            0-based start positions of the regions.
        ends
            0-based, exclusive end positions of the regions.

        Returns
        -------
        idxs
            Shape: (tot_variants). Variant indices for the given ranges.
        offsets
            Shape: (regions+1). Offsets to get variant indices for each region.
        """
        starts = np.atleast_1d(starts)
        n_regions = len(starts)

        c = self._c_norm.norm(contig)
        if c is None:
            return np.empty(0, np.uint32), np.zeros(n_regions + 1, np.uint64)

        queries = pr.PyRanges(
            self._query_frame(c, starts, ends)
            .with_row_index("query")
            .to_pandas(use_pyarrow_extension_array=True)
        )
        join = pl.from_pandas(queries.join(self._index).df)
        if join.height == 0:
            return np.empty(0, np.uint32), np.zeros(n_regions + 1, np.uint64)

        join = join.sort("query", "index")
        idxs = join["index"].to_numpy()
        # Regions without any variant do not appear in the join, so count per
        # query id with an explicit minimum length to keep one entry per region.
        lens = np.bincount(
            join["query"].to_numpy().astype(np.intp), minlength=n_regions
        ).astype(np.uint32)
        offsets = lengths_to_offsets(lens)
        return idxs, offsets

    def read(
        self,
        contig: str,
        start: int = 0,
        end: int | None = None,
        out: T | None = None,
    ) -> T | None:
        """Read genotypes for a region.

        Parameters
        ----------
        contig
            Contig name.
        start
            0-based start position of the region.
        end
            0-based, exclusive end position of the region. Defaults to the
            maximum int64 value.
        out
            Optional ``Genos`` array passed to the PGEN reader as its read
            buffer; it is reshaped as ``(variants, samples, ploidy)``. The
            returned array is a reordered copy, not ``out`` itself.

        Returns
        -------
        data
            Genotypes with shape (samples ploidy variants) and missing values
            set to -1, or None if the contig is not recognized or the region
            has no variants.

        Raises
        ------
        ValueError
            If ``out`` is given but is not a ``Genos`` instance.
        """
        c = self._c_norm.norm(contig)
        if c is None:
            return

        if end is None:
            end = np.iinfo(np.int64).max

        var_idxs, _ = self._var_idxs(c, start, end)
        if len(var_idxs) == 0:
            return

        # TODO: support dosages

        if out is not None and not isinstance(out, Genos):
            raise ValueError(f"Expected a np.int32 array, got {type(out)}.")

        return cast(T, self._read_genos(var_idxs, out))

    def read_chunks(
        self,
        contig: str,
        start: int = 0,
        end: int | None = None,
        max_mem: int | str = "4g",
    ) -> Generator[T]:
        """Iterate over genotypes for a region in chunks limited by ``max_mem``.

        Parameters
        ----------
        contig
            Contig name.
        start
            0-based start position.
        end
            0-based, exclusive end position of the region. Defaults to the
            maximum int64 value.
        max_mem
            Memory budget per chunk, parsed by ``parse_memory``.

        Yields
        ------
        data
            Genotypes with shape (samples ploidy variants) and missing values
            set to -1. Nothing is yielded if the contig is not recognized or
            the region has no variants.

        Raises
        ------
        ValueError
            If ``max_mem`` is too small to hold a single variant.
        """
        # TODO: support dosages

        max_mem = parse_memory(max_mem)

        c = self._c_norm.norm(contig)
        if c is None:
            return

        if end is None:
            end = np.iinfo(np.int64).max

        var_idxs, _ = self._var_idxs(c, start, end)
        n_variants = len(var_idxs)
        if n_variants == 0:
            return

        mem_per_v = self._mem_per_variant()
        vars_per_chunk = min(max_mem // mem_per_v, n_variants)
        if vars_per_chunk == 0:
            raise ValueError(
                f"Maximum memory {format_memory(max_mem)} insufficient to read a single variant."
                f" Memory per variant: {format_memory(mem_per_v)}."
            )

        # Ceiling division: number of chunks needed to stay within the budget.
        n_chunks = -(-n_variants // vars_per_chunk)
        for var_idx in np.array_split(var_idxs, n_chunks):
            yield cast(T, self._read_genos(var_idx))

    def read_ranges(
        self,
        contig: str,
        starts: ArrayLike = 0,
        ends: ArrayLike | None = None,
    ) -> tuple[T, NDArray[np.uint32]] | None:
        """Read genotypes for multiple regions.

        Parameters
        ----------
        contig
            Contig name.
        starts
            0-based start positions of the regions.
        ends
            0-based, exclusive end positions of the regions.

        Returns
        -------
        data
            Genotypes with shape (samples ploidy variants), with the variants
            of all regions concatenated in region order and missing values set
            to -1.
        n_variants_per_region
            Shape: (regions). Number of variants in each of the given ranges.

        None is returned instead if the contig is not recognized or no region
        contains a variant.
        """
        # TODO: support dosages

        starts = np.atleast_1d(starts)

        c = self._c_norm.norm(contig)
        if c is None:
            return

        var_idxs, offsets = self._var_idxs(c, starts, ends)
        if len(var_idxs) == 0:
            return

        out = self._read_genos(var_idxs)
        return cast(T, out), np.diff(offsets).astype(np.uint32)

    def _mem_per_variant(self) -> int:
        """Bytes needed to hold one variant for the currently selected samples."""
        if issubclass(self._read_as, Genos):
            return self.n_samples * self.ploidy * np.int32().itemsize
        elif issubclass(self._read_as, (Dosages, GenosDosages)):
            raise NotImplementedError("Dosages are not yet supported.")
        else:
            assert_never(self._read_as)
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def _read_psam(path: Path) -> NDArray[np.str_]:
    """Return the sample IDs (``IID`` column) listed in a ``.psam`` file.

    ``path`` may carry any suffix; it is replaced by ``.psam``. The first line
    is taken as the header (with ``#`` stripped from the column names) and the
    remaining lines are parsed as tab-separated rows.
    """
    psam_path = path.with_suffix(".psam")

    with open(psam_path) as f:
        header = f.readline()
    cols = [name.strip("#") for name in header.strip().split()]

    # Force these columns to be read as text.
    text_columns = ("FID", "IID", "SID", "PAT", "MAT", "SEX")
    psam = pl.read_csv(
        psam_path,
        separator="\t",
        has_header=False,
        skip_rows=1,
        new_columns=cols,
        schema_overrides={name: pl.Utf8 for name in text_columns},
    )
    return psam["IID"].to_numpy().astype(str)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
# Polars expressions shared by the index writer below.
# RLEN / ALEN: byte length of the REF and ALT strings of a variant.
# NOTE(review): ALT is measured as a single string, so a comma-separated
# multi-allelic ALT would be measured as a whole -- confirm inputs are
# bi-allelic.
RLEN = pl.col("REF").str.len_bytes()
ALEN = pl.col("ALT").str.len_bytes()
# ILEN: ALT length minus REF length; non-zero means the lengths differ.
ILEN = ALEN - RLEN
# KIND: "INDEL" when REF and ALT lengths differ, "SNP" when they are equal and
# REF is one byte long, "MNP" otherwise; stored as a categorical.
KIND = (
    pl.when(ILEN != 0)
    .then(pl.lit("INDEL"))
    .when(RLEN == 1)
    .then(pl.lit("SNP"))
    .otherwise(pl.lit("MNP"))
    .cast(pl.Categorical)
)
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
# TODO: index can likely be implemented using the NCLS lib underlying PyRanges and then we can
|
|
388
|
+
# pass np.memmap arrays directly instead of having to futz with DataFrames. This will likely make
|
|
389
|
+
# filtering less ergonomic/harder to make ergonomic though, but a memmap approach will be scalable
|
|
390
|
+
# to ultra-large datasets (100k+ individuals).
|
|
391
|
+
def _write_index(path: Path):
    """Write the ``.gvi`` variant index for a ``.pvar`` file.

    ``path`` may carry any suffix: the ``.pvar`` file is read from, and the
    ``.gvi`` file written to, ``path`` with its suffix replaced. The index
    holds one row per variant with ``Chromosome``, a 0-based ``Start``
    (``POS - 1``), an exclusive ``End`` (``POS + len(REF) - 1``) and ``kind``.
    """
    pvar_path = path.with_suffix(".pvar")
    gvi_path = path.with_suffix(".gvi")
    pos = pl.col("POS")

    # Lines starting with "##" are skipped as comments; the remaining header
    # line supplies the column names.
    variants = pl.scan_csv(
        pvar_path,
        separator="\t",
        comment_prefix="##",
        schema_overrides={"#CHROM": pl.Utf8, "POS": pl.Int32},
    )
    index = variants.select(
        Chromosome="#CHROM",
        Start=pos - 1,
        End=pos + RLEN - 1,
        kind=KIND,
    )
    index.sink_ipc(gvi_path)
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def _read_index(path: Path, filter: pl.Expr | None) -> pr.PyRanges:
    """Load a ``.gvi`` variant index as a PyRanges object.

    A row-number column named ``index`` is attached before ``filter`` is
    applied, so it keeps each variant's row position in the index file even
    when rows are filtered out. The ``kind`` column is dropped from the result.
    """
    variants = pl.read_ipc(path, row_index_name="index", memory_map=False)
    selected = variants if filter is None else variants.filter(filter)
    as_pandas = selected.drop("kind").to_pandas(use_pyarrow_extension_array=True)
    return pr.PyRanges(as_pandas)
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Generator, Generic, Protocol, TypeVar
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from numpy.typing import ArrayLike, NDArray
|
|
7
|
+
from typing_extensions import Self
|
|
8
|
+
|
|
9
|
+
# Unconstrained type variable for the data type a ``Reader`` returns from its
# read methods.
T = TypeVar("T")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Reader(Protocol, Generic[T]):
    """Structural interface for variant readers that return data of type ``T``."""

    available_samples: list[str]
    """All samples in the file, in the order they exist on-disk."""
    ploidy: int
    filter: Any | None
    contigs: list[str]

    @property
    def current_samples(self) -> list[str]:
        """The samples this reader will return, in order along the sample axis."""
        ...

    def set_samples(self, samples: list[str]) -> Self:
        """Set the samples this reader will return, in order along the sample axis."""
        ...

    @property
    def n_samples(self) -> int:
        """Number of samples this reader will return (length of ``current_samples``)."""
        return len(self.current_samples)

    def n_vars_in_ranges(
        self, contig: str, starts: ArrayLike = 0, ends: ArrayLike | None = None
    ) -> NDArray[np.uint32]:
        """Return the number of variants in each of the given ranges.

        Parameters
        ----------
        contig
            Contig name.
        starts
            0-based start positions of the regions.
        ends
            0-based, exclusive end positions of the regions.

        Returns
        -------
        n_variants
            Shape: (regions). Number of variants in the given ranges.
        """
        ...

    def read(
        self,
        contig: str,
        start: int = 0,
        end: int | None = None,
        out: T | None = None,
    ) -> T | None:
        """Read genotypes and/or dosages for a region.

        Parameters
        ----------
        contig
            Contig name.
        start
            0-based start position of the region.
        end
            0-based, exclusive end position of the region.
        out
            Optional object of type ``T``.
            NOTE(review): presumably a pre-allocated output buffer; its exact
            use is left to each implementation -- confirm there.

        Returns
        -------
        data
            Genotypes and/or dosages. Genotypes have shape (samples ploidy variants) and
            dosages have shape (samples variants). Missing genotypes have value -1 and missing dosages
            have value np.nan. If just using genotypes or dosages, will be a single array, otherwise
            will be a tuple of arrays. The signature also allows None
            (presumably when there is nothing to read -- confirm per implementation).
        """
        ...

    def read_chunks(
        self,
        contig: str,
        start: int = 0,
        end: int | None = None,
        max_mem: int | str = "4g",
    ) -> Generator[T]:
        """Iterate over genotypes and/or dosages for a region in chunks limited by max_mem.

        Parameters
        ----------
        contig
            Contig name.
        start
            0-based start position.
        end
            0-based, exclusive end position of the region.
        max_mem
            Memory limit for each chunk, as an int or a string such as "4g".
            NOTE(review): presumably an int is a number of bytes -- confirm
            against the implementation's memory parser.

        Returns
        -------
        data
            Generator of genotypes and/or dosages. Genotypes have shape (samples ploidy variants) and
            dosages have shape (samples variants). Missing genotypes have value -1 and missing dosages
            have value np.nan. If just using genotypes or dosages, will be a single array, otherwise
            will be a tuple of arrays.
        """
        ...

    def read_ranges(
        self,
        contig: str,
        starts: ArrayLike = 0,
        ends: ArrayLike | None = None,
    ) -> tuple[T, NDArray[np.uint32]] | None:
        """Read genotypes and/or dosages for multiple regions.

        Parameters
        ----------
        contig
            Contig name.
        starts
            0-based start positions of the regions.
        ends
            0-based, exclusive end positions of the regions.

        Returns
        -------
        data
            Genotypes and/or dosages. Genotypes have shape (samples ploidy variants) and
            dosages have shape (samples variants). Missing genotypes have value -1 and missing dosages
            have value np.nan. If just using genotypes or dosages, will be a single array, otherwise
            will be a tuple of arrays.
        n_variants_per_region
            Shape: (regions). Number of variants in the given ranges.

        The signature also allows None in place of the tuple (presumably when
        there is nothing to read -- confirm per implementation).
        """
        ...
|