bio2zarr 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic. Click here for more details.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +46 -12
- bio2zarr/core.py +32 -2
- bio2zarr/plink.py +19 -14
- bio2zarr/vcf2zarr/icf.py +30 -17
- bio2zarr/vcf2zarr/vcz.py +460 -138
- bio2zarr/vcf2zarr/verification.py +19 -16
- bio2zarr/vcf_utils.py +30 -14
- bio2zarr/zarr_utils.py +19 -0
- {bio2zarr-0.1.0.dist-info → bio2zarr-0.1.2.dist-info}/METADATA +15 -13
- bio2zarr-0.1.2.dist-info/RECORD +21 -0
- {bio2zarr-0.1.0.dist-info → bio2zarr-0.1.2.dist-info}/WHEEL +1 -1
- bio2zarr-0.1.0.dist-info/RECORD +0 -20
- {bio2zarr-0.1.0.dist-info → bio2zarr-0.1.2.dist-info}/LICENSE +0 -0
- {bio2zarr-0.1.0.dist-info → bio2zarr-0.1.2.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.1.0.dist-info → bio2zarr-0.1.2.dist-info}/top_level.txt +0 -0
|
@@ -4,6 +4,8 @@ import numpy.testing as nt
|
|
|
4
4
|
import tqdm
|
|
5
5
|
import zarr
|
|
6
6
|
|
|
7
|
+
from bio2zarr.zarr_utils import first_dim_iter
|
|
8
|
+
|
|
7
9
|
from .. import constants
|
|
8
10
|
|
|
9
11
|
|
|
@@ -77,7 +79,7 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
|
|
|
77
79
|
if vcf_type in ("String", "Character"):
|
|
78
80
|
split = list(vcf_val.split(","))
|
|
79
81
|
k = len(split)
|
|
80
|
-
if isinstance(zarr_val, str):
|
|
82
|
+
if isinstance(zarr_val, str) or zarr_val.ndim == 0:
|
|
81
83
|
assert k == 1
|
|
82
84
|
# Scalar
|
|
83
85
|
assert vcf_val == zarr_val
|
|
@@ -109,19 +111,17 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
|
|
|
109
111
|
assert_all_fill(zarr_val[1:], vcf_type)
|
|
110
112
|
|
|
111
113
|
|
|
112
|
-
def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
|
|
114
|
+
def assert_format_val_equal(vcf_val, zarr_val, vcf_type, vcf_number):
|
|
113
115
|
assert vcf_val is not None
|
|
114
116
|
assert isinstance(vcf_val, np.ndarray)
|
|
115
117
|
if vcf_type in ("String", "Character"):
|
|
116
118
|
assert len(vcf_val) == len(zarr_val)
|
|
117
119
|
for v, z in zip(vcf_val, zarr_val):
|
|
118
|
-
|
|
119
|
-
# Note: deliberately duplicating logic here between this and the
|
|
120
|
-
# INFO col above to make sure all combinations are covered by tests
|
|
121
|
-
k = len(split)
|
|
122
|
-
if k == 1:
|
|
120
|
+
if vcf_number == "1":
|
|
123
121
|
assert v == z
|
|
124
122
|
else:
|
|
123
|
+
split = list(v.split(","))
|
|
124
|
+
k = len(split)
|
|
125
125
|
nt.assert_equal(split, z[:k])
|
|
126
126
|
assert_all_fill(z[k:], vcf_type)
|
|
127
127
|
else:
|
|
@@ -147,16 +147,14 @@ def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
|
|
|
147
147
|
|
|
148
148
|
|
|
149
149
|
def verify(vcf_path, zarr_path, show_progress=False):
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
root = zarr.group(store=store)
|
|
150
|
+
root = zarr.open(store=zarr_path, mode="r")
|
|
153
151
|
pos = root["variant_position"][:]
|
|
154
152
|
allele = root["variant_allele"][:]
|
|
155
153
|
chrom = root["contig_id"][:][root["variant_contig"][:]]
|
|
156
154
|
vid = root["variant_id"][:]
|
|
157
155
|
call_genotype = None
|
|
158
|
-
if "call_genotype" in root:
|
|
159
|
-
call_genotype =
|
|
156
|
+
if "call_genotype" in root and root["call_genotype"].size > 0:
|
|
157
|
+
call_genotype = first_dim_iter(root["call_genotype"])
|
|
160
158
|
|
|
161
159
|
vcf = cyvcf2.VCF(vcf_path)
|
|
162
160
|
format_headers = {}
|
|
@@ -173,12 +171,17 @@ def verify(vcf_path, zarr_path, show_progress=False):
|
|
|
173
171
|
if colname.startswith("call") and not colname.startswith("call_genotype"):
|
|
174
172
|
vcf_name = colname.split("_", 1)[1]
|
|
175
173
|
vcf_type = format_headers[vcf_name]["Type"]
|
|
176
|
-
|
|
174
|
+
vcf_number = format_headers[vcf_name]["Number"]
|
|
175
|
+
format_fields[vcf_name] = (
|
|
176
|
+
vcf_type,
|
|
177
|
+
vcf_number,
|
|
178
|
+
first_dim_iter(root[colname]),
|
|
179
|
+
)
|
|
177
180
|
if colname.startswith("variant"):
|
|
178
181
|
name = colname.split("_", 1)[1]
|
|
179
182
|
if name.isupper():
|
|
180
183
|
vcf_type = info_headers[name]["Type"]
|
|
181
|
-
info_fields[name] = vcf_type,
|
|
184
|
+
info_fields[name] = vcf_type, first_dim_iter(root[colname])
|
|
182
185
|
|
|
183
186
|
first_pos = next(vcf).POS
|
|
184
187
|
start_index = np.searchsorted(pos, first_pos)
|
|
@@ -221,10 +224,10 @@ def verify(vcf_path, zarr_path, show_progress=False):
|
|
|
221
224
|
else:
|
|
222
225
|
assert_info_val_equal(vcf_val, zarr_val, vcf_type)
|
|
223
226
|
|
|
224
|
-
for name, (vcf_type, zarr_iter) in format_fields.items():
|
|
227
|
+
for name, (vcf_type, vcf_number, zarr_iter) in format_fields.items():
|
|
225
228
|
vcf_val = row.format(name)
|
|
226
229
|
zarr_val = next(zarr_iter)
|
|
227
230
|
if vcf_val is None:
|
|
228
231
|
assert_format_val_missing(zarr_val, vcf_type)
|
|
229
232
|
else:
|
|
230
|
-
assert_format_val_equal(vcf_val, zarr_val, vcf_type)
|
|
233
|
+
assert_format_val_equal(vcf_val, zarr_val, vcf_type, vcf_number)
|
bio2zarr/vcf_utils.py
CHANGED
|
@@ -6,6 +6,7 @@ import pathlib
|
|
|
6
6
|
import struct
|
|
7
7
|
from collections.abc import Sequence
|
|
8
8
|
from dataclasses import dataclass
|
|
9
|
+
from enum import Enum
|
|
9
10
|
from typing import IO, Any, Optional, Union
|
|
10
11
|
|
|
11
12
|
import cyvcf2
|
|
@@ -382,46 +383,61 @@ def read_tabix(
|
|
|
382
383
|
)
|
|
383
384
|
|
|
384
385
|
|
|
386
|
+
class VcfFileType(Enum):
|
|
387
|
+
VCF = ".vcf"
|
|
388
|
+
BCF = ".bcf"
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
class VcfIndexType(Enum):
|
|
392
|
+
CSI = ".csi"
|
|
393
|
+
TABIX = ".tbi"
|
|
394
|
+
|
|
395
|
+
|
|
385
396
|
class IndexedVcf(contextlib.AbstractContextManager):
|
|
386
397
|
def __init__(self, vcf_path, index_path=None):
|
|
387
398
|
self.vcf = None
|
|
388
399
|
vcf_path = pathlib.Path(vcf_path)
|
|
389
400
|
if not vcf_path.exists():
|
|
390
401
|
raise FileNotFoundError(vcf_path)
|
|
391
|
-
# TODO use constants here instead of strings
|
|
392
402
|
if index_path is None:
|
|
393
|
-
index_path = vcf_path.with_suffix(
|
|
403
|
+
index_path = vcf_path.with_suffix(
|
|
404
|
+
vcf_path.suffix + VcfIndexType.TABIX.value
|
|
405
|
+
)
|
|
394
406
|
if not index_path.exists():
|
|
395
|
-
index_path = vcf_path.with_suffix(
|
|
407
|
+
index_path = vcf_path.with_suffix(
|
|
408
|
+
vcf_path.suffix + VcfIndexType.CSI.value
|
|
409
|
+
)
|
|
396
410
|
if not index_path.exists():
|
|
397
411
|
raise FileNotFoundError(
|
|
398
|
-
"Cannot find .tbi or .csi file for {vcf_path}"
|
|
412
|
+
f"Cannot find .tbi or .csi file for {vcf_path}"
|
|
399
413
|
)
|
|
400
414
|
else:
|
|
401
415
|
index_path = pathlib.Path(index_path)
|
|
402
416
|
|
|
403
417
|
self.vcf_path = vcf_path
|
|
404
418
|
self.index_path = index_path
|
|
405
|
-
# TODO use Enums for these
|
|
406
419
|
self.file_type = None
|
|
407
420
|
self.index_type = None
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
self.
|
|
421
|
+
|
|
422
|
+
if index_path.suffix == VcfIndexType.CSI.value:
|
|
423
|
+
self.index_type = VcfIndexType.CSI
|
|
424
|
+
elif index_path.suffix == VcfIndexType.TABIX.value:
|
|
425
|
+
self.index_type = VcfIndexType.TABIX
|
|
426
|
+
self.file_type = VcfFileType.VCF
|
|
413
427
|
else:
|
|
414
428
|
raise ValueError("Only .tbi or .csi indexes are supported.")
|
|
429
|
+
|
|
415
430
|
self.vcf = cyvcf2.VCF(vcf_path)
|
|
416
431
|
self.vcf.set_index(str(self.index_path))
|
|
417
432
|
logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
|
|
418
433
|
self.sequence_names = None
|
|
419
|
-
|
|
434
|
+
|
|
435
|
+
if self.index_type == VcfIndexType.CSI:
|
|
420
436
|
# Determine the file-type based on the "aux" field.
|
|
421
437
|
self.index = read_csi(self.index_path)
|
|
422
|
-
self.file_type =
|
|
438
|
+
self.file_type = VcfFileType.BCF
|
|
423
439
|
if len(self.index.aux) > 0:
|
|
424
|
-
self.file_type =
|
|
440
|
+
self.file_type = VcfFileType.VCF
|
|
425
441
|
self.sequence_names = self.index.parse_vcf_aux()
|
|
426
442
|
else:
|
|
427
443
|
self.sequence_names = self.vcf.seqnames
|
|
@@ -437,7 +453,7 @@ class IndexedVcf(contextlib.AbstractContextManager):
|
|
|
437
453
|
|
|
438
454
|
def contig_record_counts(self):
|
|
439
455
|
d = dict(zip(self.sequence_names, self.index.record_counts))
|
|
440
|
-
if self.file_type ==
|
|
456
|
+
if self.file_type == VcfFileType.BCF:
|
|
441
457
|
d = {k: v for k, v in d.items() if v > 0}
|
|
442
458
|
return d
|
|
443
459
|
|
bio2zarr/zarr_utils.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import zarr
|
|
2
|
+
from packaging.version import Version
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def zarr_v3() -> bool:
|
|
6
|
+
return Version(zarr.__version__).major >= 3
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
if zarr_v3():
|
|
10
|
+
# Use zarr format v2 even when running with zarr-python v3
|
|
11
|
+
ZARR_FORMAT_KWARGS = dict(zarr_format=2)
|
|
12
|
+
else:
|
|
13
|
+
ZARR_FORMAT_KWARGS = dict()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# See discussion in https://github.com/zarr-developers/zarr-python/issues/2529
|
|
17
|
+
def first_dim_iter(z):
|
|
18
|
+
for chunk in range(z.cdata_shape[0]):
|
|
19
|
+
yield from z.blocks[chunk]
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: bio2zarr
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: Convert bioinformatics data to Zarr
|
|
5
5
|
Author-email: sgkit Developers <project@sgkit.dev>
|
|
6
|
-
License:
|
|
6
|
+
License: Apache License
|
|
7
7
|
Version 2.0, January 2004
|
|
8
8
|
http://www.apache.org/licenses/
|
|
9
9
|
|
|
@@ -219,26 +219,28 @@ Classifier: Programming Language :: Python :: 3
|
|
|
219
219
|
Classifier: Programming Language :: Python :: 3.9
|
|
220
220
|
Classifier: Programming Language :: Python :: 3.10
|
|
221
221
|
Classifier: Programming Language :: Python :: 3.11
|
|
222
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
222
223
|
Classifier: Topic :: Scientific/Engineering
|
|
223
224
|
Requires-Python: >=3.9
|
|
224
225
|
Description-Content-Type: text/markdown
|
|
225
226
|
License-File: LICENSE
|
|
226
|
-
Requires-Dist: numpy
|
|
227
|
-
Requires-Dist: zarr
|
|
227
|
+
Requires-Dist: numpy>=1.26
|
|
228
|
+
Requires-Dist: zarr<3,>=2.17
|
|
228
229
|
Requires-Dist: click
|
|
229
230
|
Requires-Dist: tabulate
|
|
230
231
|
Requires-Dist: tqdm
|
|
231
232
|
Requires-Dist: humanfriendly
|
|
232
233
|
Requires-Dist: cyvcf2
|
|
233
|
-
Requires-Dist:
|
|
234
|
+
Requires-Dist: bed_reader
|
|
234
235
|
Provides-Extra: dev
|
|
235
|
-
Requires-Dist:
|
|
236
|
-
Requires-Dist:
|
|
237
|
-
Requires-Dist:
|
|
238
|
-
Requires-Dist: pytest
|
|
239
|
-
Requires-Dist: pytest-
|
|
240
|
-
Requires-Dist:
|
|
241
|
-
Requires-Dist:
|
|
236
|
+
Requires-Dist: hypothesis-vcf; extra == "dev"
|
|
237
|
+
Requires-Dist: msprime; extra == "dev"
|
|
238
|
+
Requires-Dist: pysam; extra == "dev"
|
|
239
|
+
Requires-Dist: pytest; extra == "dev"
|
|
240
|
+
Requires-Dist: pytest-coverage; extra == "dev"
|
|
241
|
+
Requires-Dist: pytest-xdist; extra == "dev"
|
|
242
|
+
Requires-Dist: sgkit>=0.8.0; extra == "dev"
|
|
243
|
+
Requires-Dist: tqdm; extra == "dev"
|
|
242
244
|
|
|
243
245
|
[](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
|
|
244
246
|
[](https://coveralls.io/github/sgkit-dev/bio2zarr)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
|
|
2
|
+
bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
|
|
3
|
+
bio2zarr/_version.py,sha256=SFCDdrYA67D1Je-jHgVVh4LOopkPvuV6NMtqSJ7Tfhg,411
|
|
4
|
+
bio2zarr/cli.py,sha256=Iife89BfTR_AUarm-AIW0lAIYxd370OmP1KKePgFXzk,16008
|
|
5
|
+
bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
|
|
6
|
+
bio2zarr/core.py,sha256=4xqNf3Txgyhcx23bzXZHq3GW0Jh24fPQwob7lKO7s0w,11668
|
|
7
|
+
bio2zarr/plink.py,sha256=Yr1meT4AgS2qnwM64-Nmthh4HbjaPXsddYiJdtfYWBg,6999
|
|
8
|
+
bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
|
|
9
|
+
bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
|
|
10
|
+
bio2zarr/vcf_utils.py,sha256=tuPzuMiwGYgMlQA49L6EuIplw9DOVaOw1DTa03OJS7k,18268
|
|
11
|
+
bio2zarr/zarr_utils.py,sha256=Ldt65YdyR8bxa0ZyVvMYqQ8z72vx1iPLbL7J2wx8No8,466
|
|
12
|
+
bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
|
|
13
|
+
bio2zarr/vcf2zarr/icf.py,sha256=Lz5BeqP-d1G8xihZvua9911UJh-aszegCdj_7l32QV4,42074
|
|
14
|
+
bio2zarr/vcf2zarr/vcz.py,sha256=cfUCBsQW5dbhDu7NzXkd1Dalsev7UkFDXVOyChAHw8Q,49409
|
|
15
|
+
bio2zarr/vcf2zarr/verification.py,sha256=uM-mg0yvUTBs-MvWBd4jxTS0zKCUbxEQpm4ALJADdMI,8037
|
|
16
|
+
bio2zarr-0.1.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
17
|
+
bio2zarr-0.1.2.dist-info/METADATA,sha256=tG3yLpms9XBSyJKfyXOIYw59qRoYVyKpJFKTLDpbO2Q,14978
|
|
18
|
+
bio2zarr-0.1.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
19
|
+
bio2zarr-0.1.2.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
|
|
20
|
+
bio2zarr-0.1.2.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
|
|
21
|
+
bio2zarr-0.1.2.dist-info/RECORD,,
|
bio2zarr-0.1.0.dist-info/RECORD
DELETED
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
|
|
2
|
-
bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
|
|
3
|
-
bio2zarr/_version.py,sha256=IMl2Pr_Sy4LVRKy_Sm4CdwUl1Gryous6ncL96EMYsnM,411
|
|
4
|
-
bio2zarr/cli.py,sha256=-6cU26n5f8CpBSj6RGC-fpNByjuJ0KxSFz85O9tITPg,14961
|
|
5
|
-
bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
|
|
6
|
-
bio2zarr/core.py,sha256=Yd3Z6-mFI_neaxoWT6t6Tip0k1VZEcWbautHcJ0ep8Q,10486
|
|
7
|
-
bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
|
|
8
|
-
bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
|
|
9
|
-
bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
|
|
10
|
-
bio2zarr/vcf_utils.py,sha256=R3bes-xYLZ4ekaxtqDd39YVV20qHmwei3XiIg1UFhRA,17996
|
|
11
|
-
bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
|
|
12
|
-
bio2zarr/vcf2zarr/icf.py,sha256=rIC35RIfkk5gEE8cOmBg1d9Pj-HkPivmGvYp4PrVN1Q,41589
|
|
13
|
-
bio2zarr/vcf2zarr/vcz.py,sha256=2WE4RX5jZBiKDFEztNGYgXyrLRmVWeLKlFzh0GOzylk,38198
|
|
14
|
-
bio2zarr/vcf2zarr/verification.py,sha256=6xcBy-cJLaQz2Qj2crffXFMjUG-H7z637Csxe5ZCmds,7898
|
|
15
|
-
bio2zarr-0.1.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
16
|
-
bio2zarr-0.1.0.dist-info/METADATA,sha256=zezBzqrJPB4ED7IqFvVj8Lura2untJA8optBdVTBNzc,14848
|
|
17
|
-
bio2zarr-0.1.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
18
|
-
bio2zarr-0.1.0.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
|
|
19
|
-
bio2zarr-0.1.0.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
|
|
20
|
-
bio2zarr-0.1.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|