bio2zarr 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Click here for more details.

@@ -4,6 +4,8 @@ import numpy.testing as nt
4
4
  import tqdm
5
5
  import zarr
6
6
 
7
+ from bio2zarr.zarr_utils import first_dim_iter
8
+
7
9
  from .. import constants
8
10
 
9
11
 
@@ -77,7 +79,7 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
77
79
  if vcf_type in ("String", "Character"):
78
80
  split = list(vcf_val.split(","))
79
81
  k = len(split)
80
- if isinstance(zarr_val, str):
82
+ if isinstance(zarr_val, str) or zarr_val.ndim == 0:
81
83
  assert k == 1
82
84
  # Scalar
83
85
  assert vcf_val == zarr_val
@@ -109,19 +111,17 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
109
111
  assert_all_fill(zarr_val[1:], vcf_type)
110
112
 
111
113
 
112
- def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
114
+ def assert_format_val_equal(vcf_val, zarr_val, vcf_type, vcf_number):
113
115
  assert vcf_val is not None
114
116
  assert isinstance(vcf_val, np.ndarray)
115
117
  if vcf_type in ("String", "Character"):
116
118
  assert len(vcf_val) == len(zarr_val)
117
119
  for v, z in zip(vcf_val, zarr_val):
118
- split = list(v.split(","))
119
- # Note: deliberately duplicating logic here between this and the
120
- # INFO col above to make sure all combinations are covered by tests
121
- k = len(split)
122
- if k == 1:
120
+ if vcf_number == "1":
123
121
  assert v == z
124
122
  else:
123
+ split = list(v.split(","))
124
+ k = len(split)
125
125
  nt.assert_equal(split, z[:k])
126
126
  assert_all_fill(z[k:], vcf_type)
127
127
  else:
@@ -147,16 +147,14 @@ def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
147
147
 
148
148
 
149
149
  def verify(vcf_path, zarr_path, show_progress=False):
150
- store = zarr.DirectoryStore(zarr_path)
151
-
152
- root = zarr.group(store=store)
150
+ root = zarr.open(store=zarr_path, mode="r")
153
151
  pos = root["variant_position"][:]
154
152
  allele = root["variant_allele"][:]
155
153
  chrom = root["contig_id"][:][root["variant_contig"][:]]
156
154
  vid = root["variant_id"][:]
157
155
  call_genotype = None
158
- if "call_genotype" in root:
159
- call_genotype = iter(root["call_genotype"])
156
+ if "call_genotype" in root and root["call_genotype"].size > 0:
157
+ call_genotype = first_dim_iter(root["call_genotype"])
160
158
 
161
159
  vcf = cyvcf2.VCF(vcf_path)
162
160
  format_headers = {}
@@ -173,12 +171,17 @@ def verify(vcf_path, zarr_path, show_progress=False):
173
171
  if colname.startswith("call") and not colname.startswith("call_genotype"):
174
172
  vcf_name = colname.split("_", 1)[1]
175
173
  vcf_type = format_headers[vcf_name]["Type"]
176
- format_fields[vcf_name] = vcf_type, iter(root[colname])
174
+ vcf_number = format_headers[vcf_name]["Number"]
175
+ format_fields[vcf_name] = (
176
+ vcf_type,
177
+ vcf_number,
178
+ first_dim_iter(root[colname]),
179
+ )
177
180
  if colname.startswith("variant"):
178
181
  name = colname.split("_", 1)[1]
179
182
  if name.isupper():
180
183
  vcf_type = info_headers[name]["Type"]
181
- info_fields[name] = vcf_type, iter(root[colname])
184
+ info_fields[name] = vcf_type, first_dim_iter(root[colname])
182
185
 
183
186
  first_pos = next(vcf).POS
184
187
  start_index = np.searchsorted(pos, first_pos)
@@ -221,10 +224,10 @@ def verify(vcf_path, zarr_path, show_progress=False):
221
224
  else:
222
225
  assert_info_val_equal(vcf_val, zarr_val, vcf_type)
223
226
 
224
- for name, (vcf_type, zarr_iter) in format_fields.items():
227
+ for name, (vcf_type, vcf_number, zarr_iter) in format_fields.items():
225
228
  vcf_val = row.format(name)
226
229
  zarr_val = next(zarr_iter)
227
230
  if vcf_val is None:
228
231
  assert_format_val_missing(zarr_val, vcf_type)
229
232
  else:
230
- assert_format_val_equal(vcf_val, zarr_val, vcf_type)
233
+ assert_format_val_equal(vcf_val, zarr_val, vcf_type, vcf_number)
bio2zarr/vcf_utils.py CHANGED
@@ -6,6 +6,7 @@ import pathlib
6
6
  import struct
7
7
  from collections.abc import Sequence
8
8
  from dataclasses import dataclass
9
+ from enum import Enum
9
10
  from typing import IO, Any, Optional, Union
10
11
 
11
12
  import cyvcf2
@@ -382,46 +383,61 @@ def read_tabix(
382
383
  )
383
384
 
384
385
 
386
+ class VcfFileType(Enum):
387
+ VCF = ".vcf"
388
+ BCF = ".bcf"
389
+
390
+
391
+ class VcfIndexType(Enum):
392
+ CSI = ".csi"
393
+ TABIX = ".tbi"
394
+
395
+
385
396
  class IndexedVcf(contextlib.AbstractContextManager):
386
397
  def __init__(self, vcf_path, index_path=None):
387
398
  self.vcf = None
388
399
  vcf_path = pathlib.Path(vcf_path)
389
400
  if not vcf_path.exists():
390
401
  raise FileNotFoundError(vcf_path)
391
- # TODO use constants here instead of strings
392
402
  if index_path is None:
393
- index_path = vcf_path.with_suffix(vcf_path.suffix + ".tbi")
403
+ index_path = vcf_path.with_suffix(
404
+ vcf_path.suffix + VcfIndexType.TABIX.value
405
+ )
394
406
  if not index_path.exists():
395
- index_path = vcf_path.with_suffix(vcf_path.suffix + ".csi")
407
+ index_path = vcf_path.with_suffix(
408
+ vcf_path.suffix + VcfIndexType.CSI.value
409
+ )
396
410
  if not index_path.exists():
397
411
  raise FileNotFoundError(
398
- "Cannot find .tbi or .csi file for {vcf_path}"
412
+ f"Cannot find .tbi or .csi file for {vcf_path}"
399
413
  )
400
414
  else:
401
415
  index_path = pathlib.Path(index_path)
402
416
 
403
417
  self.vcf_path = vcf_path
404
418
  self.index_path = index_path
405
- # TODO use Enums for these
406
419
  self.file_type = None
407
420
  self.index_type = None
408
- if index_path.suffix == ".csi":
409
- self.index_type = "csi"
410
- elif index_path.suffix == ".tbi":
411
- self.index_type = "tabix"
412
- self.file_type = "vcf"
421
+
422
+ if index_path.suffix == VcfIndexType.CSI.value:
423
+ self.index_type = VcfIndexType.CSI
424
+ elif index_path.suffix == VcfIndexType.TABIX.value:
425
+ self.index_type = VcfIndexType.TABIX
426
+ self.file_type = VcfFileType.VCF
413
427
  else:
414
428
  raise ValueError("Only .tbi or .csi indexes are supported.")
429
+
415
430
  self.vcf = cyvcf2.VCF(vcf_path)
416
431
  self.vcf.set_index(str(self.index_path))
417
432
  logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
418
433
  self.sequence_names = None
419
- if self.index_type == "csi":
434
+
435
+ if self.index_type == VcfIndexType.CSI:
420
436
  # Determine the file-type based on the "aux" field.
421
437
  self.index = read_csi(self.index_path)
422
- self.file_type = "bcf"
438
+ self.file_type = VcfFileType.BCF
423
439
  if len(self.index.aux) > 0:
424
- self.file_type = "vcf"
440
+ self.file_type = VcfFileType.VCF
425
441
  self.sequence_names = self.index.parse_vcf_aux()
426
442
  else:
427
443
  self.sequence_names = self.vcf.seqnames
@@ -437,7 +453,7 @@ class IndexedVcf(contextlib.AbstractContextManager):
437
453
 
438
454
  def contig_record_counts(self):
439
455
  d = dict(zip(self.sequence_names, self.index.record_counts))
440
- if self.file_type == "bcf":
456
+ if self.file_type == VcfFileType.BCF:
441
457
  d = {k: v for k, v in d.items() if v > 0}
442
458
  return d
443
459
 
bio2zarr/zarr_utils.py ADDED
@@ -0,0 +1,18 @@
1
+ import zarr
2
+
3
+
4
+ def zarr_v3() -> bool:
5
+ return zarr.__version__ >= "3"
6
+
7
+
8
+ if zarr_v3():
9
+ # Use zarr format v2 even when running with zarr-python v3
10
+ ZARR_FORMAT_KWARGS = dict(zarr_format=2)
11
+ else:
12
+ ZARR_FORMAT_KWARGS = dict()
13
+
14
+
15
+ # See discussion in https://github.com/zarr-developers/zarr-python/issues/2529
16
+ def first_dim_iter(z):
17
+ for chunk in range(z.cdata_shape[0]):
18
+ yield from z.blocks[chunk]
@@ -1,9 +1,9 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: bio2zarr
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: Convert bioinformatics data to Zarr
5
5
  Author-email: sgkit Developers <project@sgkit.dev>
6
- License: Apache License
6
+ License: Apache License
7
7
  Version 2.0, January 2004
8
8
  http://www.apache.org/licenses/
9
9
 
@@ -219,26 +219,28 @@ Classifier: Programming Language :: Python :: 3
219
219
  Classifier: Programming Language :: Python :: 3.9
220
220
  Classifier: Programming Language :: Python :: 3.10
221
221
  Classifier: Programming Language :: Python :: 3.11
222
+ Classifier: Programming Language :: Python :: 3.12
222
223
  Classifier: Topic :: Scientific/Engineering
223
224
  Requires-Python: >=3.9
224
225
  Description-Content-Type: text/markdown
225
226
  License-File: LICENSE
226
- Requires-Dist: numpy <2
227
- Requires-Dist: zarr <3,>=2.17
227
+ Requires-Dist: numpy>=1.26
228
+ Requires-Dist: zarr<3,>=2.17
228
229
  Requires-Dist: click
229
230
  Requires-Dist: tabulate
230
231
  Requires-Dist: tqdm
231
232
  Requires-Dist: humanfriendly
232
233
  Requires-Dist: cyvcf2
233
- Requires-Dist: bed-reader
234
+ Requires-Dist: bed_reader
234
235
  Provides-Extra: dev
235
- Requires-Dist: msprime ; extra == 'dev'
236
- Requires-Dist: pysam ; extra == 'dev'
237
- Requires-Dist: pytest ; extra == 'dev'
238
- Requires-Dist: pytest-coverage ; extra == 'dev'
239
- Requires-Dist: pytest-xdist ; extra == 'dev'
240
- Requires-Dist: sgkit >=0.8.0 ; extra == 'dev'
241
- Requires-Dist: tqdm ; extra == 'dev'
236
+ Requires-Dist: hypothesis-vcf; extra == "dev"
237
+ Requires-Dist: msprime; extra == "dev"
238
+ Requires-Dist: pysam; extra == "dev"
239
+ Requires-Dist: pytest; extra == "dev"
240
+ Requires-Dist: pytest-coverage; extra == "dev"
241
+ Requires-Dist: pytest-xdist; extra == "dev"
242
+ Requires-Dist: sgkit>=0.8.0; extra == "dev"
243
+ Requires-Dist: tqdm; extra == "dev"
242
244
 
243
245
  [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
244
246
  [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
@@ -0,0 +1,21 @@
1
+ bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
2
+ bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
3
+ bio2zarr/_version.py,sha256=NIzzV8ZM0W-CSLuEs1weG4zPrn_-8yr1AwwI1iuS6yo,511
4
+ bio2zarr/cli.py,sha256=Iife89BfTR_AUarm-AIW0lAIYxd370OmP1KKePgFXzk,16008
5
+ bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
6
+ bio2zarr/core.py,sha256=4xqNf3Txgyhcx23bzXZHq3GW0Jh24fPQwob7lKO7s0w,11668
7
+ bio2zarr/plink.py,sha256=Yr1meT4AgS2qnwM64-Nmthh4HbjaPXsddYiJdtfYWBg,6999
8
+ bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
9
+ bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
10
+ bio2zarr/vcf_utils.py,sha256=tuPzuMiwGYgMlQA49L6EuIplw9DOVaOw1DTa03OJS7k,18268
11
+ bio2zarr/zarr_utils.py,sha256=99J7ycaG92K_AcWRF2S9A4ec2_4cXL6kjYT99GBfli4,415
12
+ bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
13
+ bio2zarr/vcf2zarr/icf.py,sha256=wGYgDMvfZBgNp2KuUiVhxbG9KIeT0W4-gQOb7SfZkis,42418
14
+ bio2zarr/vcf2zarr/vcz.py,sha256=cfUCBsQW5dbhDu7NzXkd1Dalsev7UkFDXVOyChAHw8Q,49409
15
+ bio2zarr/vcf2zarr/verification.py,sha256=uM-mg0yvUTBs-MvWBd4jxTS0zKCUbxEQpm4ALJADdMI,8037
16
+ bio2zarr-0.1.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
17
+ bio2zarr-0.1.3.dist-info/METADATA,sha256=Joji4xU0q9NoTKh1qiM1opSrNDDdzG6V05t0qx_emj4,14978
18
+ bio2zarr-0.1.3.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
19
+ bio2zarr-0.1.3.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
20
+ bio2zarr-0.1.3.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
21
+ bio2zarr-0.1.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: setuptools (75.8.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,20 +0,0 @@
1
- bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
2
- bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
3
- bio2zarr/_version.py,sha256=PKIMyjdUACH4-ONvtunQCnYE2UhlMfp9su83e3HXl5E,411
4
- bio2zarr/cli.py,sha256=-6cU26n5f8CpBSj6RGC-fpNByjuJ0KxSFz85O9tITPg,14961
5
- bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
6
- bio2zarr/core.py,sha256=Yd3Z6-mFI_neaxoWT6t6Tip0k1VZEcWbautHcJ0ep8Q,10486
7
- bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
8
- bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
9
- bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
10
- bio2zarr/vcf_utils.py,sha256=R3bes-xYLZ4ekaxtqDd39YVV20qHmwei3XiIg1UFhRA,17996
11
- bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
12
- bio2zarr/vcf2zarr/icf.py,sha256=rIC35RIfkk5gEE8cOmBg1d9Pj-HkPivmGvYp4PrVN1Q,41589
13
- bio2zarr/vcf2zarr/vcz.py,sha256=2WE4RX5jZBiKDFEztNGYgXyrLRmVWeLKlFzh0GOzylk,38198
14
- bio2zarr/vcf2zarr/verification.py,sha256=6xcBy-cJLaQz2Qj2crffXFMjUG-H7z637Csxe5ZCmds,7898
15
- bio2zarr-0.1.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
16
- bio2zarr-0.1.1.dist-info/METADATA,sha256=RR9oM_5UYB5slsheIFzkIlRZt5du8eCb1_bMT_e7QjY,14854
17
- bio2zarr-0.1.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
18
- bio2zarr-0.1.1.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
19
- bio2zarr-0.1.1.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
20
- bio2zarr-0.1.1.dist-info/RECORD,,