PyPI - bio2zarr - Versions diffs - 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl - Mend

bio2zarr 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of bio2zarr might be problematic. Click here for more details.

Files changed (16)

bio2zarr/_version.py +9 -4
bio2zarr/cli.py +46 -12
bio2zarr/core.py +32 -2
bio2zarr/plink.py +19 -14
bio2zarr/vcf2zarr/icf.py +41 -18
bio2zarr/vcf2zarr/vcz.py +460 -138
bio2zarr/vcf2zarr/verification.py +19 -16
bio2zarr/vcf_utils.py +30 -14
bio2zarr/zarr_utils.py +18 -0
{bio2zarr-0.1.1.dist-info → bio2zarr-0.1.3.dist-info}/METADATA +15 -13
bio2zarr-0.1.3.dist-info/RECORD +21 -0
{bio2zarr-0.1.1.dist-info → bio2zarr-0.1.3.dist-info}/WHEEL +1 -1
bio2zarr-0.1.1.dist-info/RECORD +0 -20
{bio2zarr-0.1.1.dist-info → bio2zarr-0.1.3.dist-info}/LICENSE +0 -0
{bio2zarr-0.1.1.dist-info → bio2zarr-0.1.3.dist-info}/entry_points.txt +0 -0
{bio2zarr-0.1.1.dist-info → bio2zarr-0.1.3.dist-info}/top_level.txt +0 -0

bio2zarr/vcf2zarr/verification.py CHANGED Viewed

@@ -4,6 +4,8 @@ import numpy.testing as nt
 import tqdm
 import zarr
+from bio2zarr.zarr_utils import first_dim_iter
 from .. import constants
@@ -77,7 +79,7 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
     if vcf_type in ("String", "Character"):
         split = list(vcf_val.split(","))
         k = len(split)
-        if isinstance(zarr_val, str):
+        if isinstance(zarr_val, str) or zarr_val.ndim == 0:
             assert k == 1
             # Scalar
             assert vcf_val == zarr_val
@@ -109,19 +111,17 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
             assert_all_fill(zarr_val[1:], vcf_type)
-def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
+def assert_format_val_equal(vcf_val, zarr_val, vcf_type, vcf_number):
     assert vcf_val is not None
     assert isinstance(vcf_val, np.ndarray)
     if vcf_type in ("String", "Character"):
         assert len(vcf_val) == len(zarr_val)
         for v, z in zip(vcf_val, zarr_val):
-            split = list(v.split(","))
-            # Note: deliberately duplicating logic here between this and the
-            # INFO col above to make sure all combinations are covered by tests
-            k = len(split)
-            if k == 1:
+            if vcf_number == "1":
                 assert v == z
             else:
+                split = list(v.split(","))
+                k = len(split)
                 nt.assert_equal(split, z[:k])
                 assert_all_fill(z[k:], vcf_type)
     else:
@@ -147,16 +147,14 @@ def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
 def verify(vcf_path, zarr_path, show_progress=False):
-    store = zarr.DirectoryStore(zarr_path)
-    root = zarr.group(store=store)
+    root = zarr.open(store=zarr_path, mode="r")
     pos = root["variant_position"][:]
     allele = root["variant_allele"][:]
     chrom = root["contig_id"][:][root["variant_contig"][:]]
     vid = root["variant_id"][:]
     call_genotype = None
-    if "call_genotype" in root:
-        call_genotype = iter(root["call_genotype"])
+    if "call_genotype" in root and root["call_genotype"].size > 0:
+        call_genotype = first_dim_iter(root["call_genotype"])
     vcf = cyvcf2.VCF(vcf_path)
     format_headers = {}
@@ -173,12 +171,17 @@ def verify(vcf_path, zarr_path, show_progress=False):
         if colname.startswith("call") and not colname.startswith("call_genotype"):
             vcf_name = colname.split("_", 1)[1]
             vcf_type = format_headers[vcf_name]["Type"]
-            format_fields[vcf_name] = vcf_type, iter(root[colname])
+            vcf_number = format_headers[vcf_name]["Number"]
+            format_fields[vcf_name] = (
+                vcf_type,
+                vcf_number,
+                first_dim_iter(root[colname]),
+            )
         if colname.startswith("variant"):
             name = colname.split("_", 1)[1]
             if name.isupper():
                 vcf_type = info_headers[name]["Type"]
-                info_fields[name] = vcf_type, iter(root[colname])
+                info_fields[name] = vcf_type, first_dim_iter(root[colname])
     first_pos = next(vcf).POS
     start_index = np.searchsorted(pos, first_pos)
@@ -221,10 +224,10 @@ def verify(vcf_path, zarr_path, show_progress=False):
             else:
                 assert_info_val_equal(vcf_val, zarr_val, vcf_type)
-        for name, (vcf_type, zarr_iter) in format_fields.items():
+        for name, (vcf_type, vcf_number, zarr_iter) in format_fields.items():
             vcf_val = row.format(name)
             zarr_val = next(zarr_iter)
             if vcf_val is None:
                 assert_format_val_missing(zarr_val, vcf_type)
             else:
-                assert_format_val_equal(vcf_val, zarr_val, vcf_type)
+                assert_format_val_equal(vcf_val, zarr_val, vcf_type, vcf_number)

bio2zarr/vcf_utils.py CHANGED Viewed

@@ -6,6 +6,7 @@ import pathlib
 import struct
 from collections.abc import Sequence
 from dataclasses import dataclass
+from enum import Enum
 from typing import IO, Any, Optional, Union
 import cyvcf2
@@ -382,46 +383,61 @@ def read_tabix(
         )
+class VcfFileType(Enum):
+    VCF = ".vcf"
+    BCF = ".bcf"
+class VcfIndexType(Enum):
+    CSI = ".csi"
+    TABIX = ".tbi"
 class IndexedVcf(contextlib.AbstractContextManager):
     def __init__(self, vcf_path, index_path=None):
         self.vcf = None
         vcf_path = pathlib.Path(vcf_path)
         if not vcf_path.exists():
             raise FileNotFoundError(vcf_path)
-        # TODO use constants here instead of strings
         if index_path is None:
-            index_path = vcf_path.with_suffix(vcf_path.suffix + ".tbi")
+            index_path = vcf_path.with_suffix(
+                vcf_path.suffix + VcfIndexType.TABIX.value
+            )
             if not index_path.exists():
-                index_path = vcf_path.with_suffix(vcf_path.suffix + ".csi")
+                index_path = vcf_path.with_suffix(
+                    vcf_path.suffix + VcfIndexType.CSI.value
+                )
                 if not index_path.exists():
                     raise FileNotFoundError(
-                        "Cannot find .tbi or .csi file for {vcf_path}"
+                        f"Cannot find .tbi or .csi file for {vcf_path}"
                     )
         else:
             index_path = pathlib.Path(index_path)
         self.vcf_path = vcf_path
         self.index_path = index_path
-        # TODO use Enums for these
         self.file_type = None
         self.index_type = None
-        if index_path.suffix == ".csi":
-            self.index_type = "csi"
-        elif index_path.suffix == ".tbi":
-            self.index_type = "tabix"
-            self.file_type = "vcf"
+        if index_path.suffix == VcfIndexType.CSI.value:
+            self.index_type = VcfIndexType.CSI
+        elif index_path.suffix == VcfIndexType.TABIX.value:
+            self.index_type = VcfIndexType.TABIX
+            self.file_type = VcfFileType.VCF
         else:
             raise ValueError("Only .tbi or .csi indexes are supported.")
         self.vcf = cyvcf2.VCF(vcf_path)
         self.vcf.set_index(str(self.index_path))
         logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
         self.sequence_names = None
-        if self.index_type == "csi":
+        if self.index_type == VcfIndexType.CSI:
             # Determine the file-type based on the "aux" field.
             self.index = read_csi(self.index_path)
-            self.file_type = "bcf"
+            self.file_type = VcfFileType.BCF
             if len(self.index.aux) > 0:
-                self.file_type = "vcf"
+                self.file_type = VcfFileType.VCF
                 self.sequence_names = self.index.parse_vcf_aux()
             else:
                 self.sequence_names = self.vcf.seqnames
@@ -437,7 +453,7 @@ class IndexedVcf(contextlib.AbstractContextManager):
     def contig_record_counts(self):
         d = dict(zip(self.sequence_names, self.index.record_counts))
-        if self.file_type == "bcf":
+        if self.file_type == VcfFileType.BCF:
             d = {k: v for k, v in d.items() if v > 0}
         return d

bio2zarr/zarr_utils.py ADDED Viewed

@@ -0,0 +1,18 @@
+import zarr
+def zarr_v3() -> bool:
+    return zarr.__version__ >= "3"
+if zarr_v3():
+    # Use zarr format v2 even when running with zarr-python v3
+    ZARR_FORMAT_KWARGS = dict(zarr_format=2)
+else:
+    ZARR_FORMAT_KWARGS = dict()
+# See discussion in https://github.com/zarr-developers/zarr-python/issues/2529
+def first_dim_iter(z):
+    for chunk in range(z.cdata_shape[0]):
+        yield from z.blocks[chunk]

{bio2zarr-0.1.1.dist-info → bio2zarr-0.1.3.dist-info}/METADATA RENAMED Viewed

@@ -1,9 +1,9 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: bio2zarr
-Version: 0.1.1
+Version: 0.1.3
 Summary: Convert bioinformatics data to Zarr
 Author-email: sgkit Developers <project@sgkit.dev>
-License: Apache License
+License:                                  Apache License
                                    Version 2.0, January 2004
                                 http://www.apache.org/licenses/
@@ -219,26 +219,28 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy <2
-Requires-Dist: zarr <3,>=2.17
+Requires-Dist: numpy>=1.26
+Requires-Dist: zarr<3,>=2.17
 Requires-Dist: click
 Requires-Dist: tabulate
 Requires-Dist: tqdm
 Requires-Dist: humanfriendly
 Requires-Dist: cyvcf2
-Requires-Dist: bed-reader
+Requires-Dist: bed_reader
 Provides-Extra: dev
-Requires-Dist: msprime ; extra == 'dev'
-Requires-Dist: pysam ; extra == 'dev'
-Requires-Dist: pytest ; extra == 'dev'
-Requires-Dist: pytest-coverage ; extra == 'dev'
-Requires-Dist: pytest-xdist ; extra == 'dev'
-Requires-Dist: sgkit >=0.8.0 ; extra == 'dev'
-Requires-Dist: tqdm ; extra == 'dev'
+Requires-Dist: hypothesis-vcf; extra == "dev"
+Requires-Dist: msprime; extra == "dev"
+Requires-Dist: pysam; extra == "dev"
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: pytest-coverage; extra == "dev"
+Requires-Dist: pytest-xdist; extra == "dev"
+Requires-Dist: sgkit>=0.8.0; extra == "dev"
+Requires-Dist: tqdm; extra == "dev"
 [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
 [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)

bio2zarr-0.1.3.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,21 @@
+bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
+bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
+bio2zarr/_version.py,sha256=NIzzV8ZM0W-CSLuEs1weG4zPrn_-8yr1AwwI1iuS6yo,511
+bio2zarr/cli.py,sha256=Iife89BfTR_AUarm-AIW0lAIYxd370OmP1KKePgFXzk,16008
+bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
+bio2zarr/core.py,sha256=4xqNf3Txgyhcx23bzXZHq3GW0Jh24fPQwob7lKO7s0w,11668
+bio2zarr/plink.py,sha256=Yr1meT4AgS2qnwM64-Nmthh4HbjaPXsddYiJdtfYWBg,6999
+bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
+bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
+bio2zarr/vcf_utils.py,sha256=tuPzuMiwGYgMlQA49L6EuIplw9DOVaOw1DTa03OJS7k,18268
+bio2zarr/zarr_utils.py,sha256=99J7ycaG92K_AcWRF2S9A4ec2_4cXL6kjYT99GBfli4,415
+bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
+bio2zarr/vcf2zarr/icf.py,sha256=wGYgDMvfZBgNp2KuUiVhxbG9KIeT0W4-gQOb7SfZkis,42418
+bio2zarr/vcf2zarr/vcz.py,sha256=cfUCBsQW5dbhDu7NzXkd1Dalsev7UkFDXVOyChAHw8Q,49409
+bio2zarr/vcf2zarr/verification.py,sha256=uM-mg0yvUTBs-MvWBd4jxTS0zKCUbxEQpm4ALJADdMI,8037
+bio2zarr-0.1.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+bio2zarr-0.1.3.dist-info/METADATA,sha256=Joji4xU0q9NoTKh1qiM1opSrNDDdzG6V05t0qx_emj4,14978
+bio2zarr-0.1.3.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+bio2zarr-0.1.3.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
+bio2zarr-0.1.3.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
+bio2zarr-0.1.3.dist-info/RECORD,,

{bio2zarr-0.1.1.dist-info → bio2zarr-0.1.3.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.43.0)
+Generator: setuptools (75.8.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

bio2zarr-0.1.1.dist-info/RECORD DELETED Viewed

@@ -1,20 +0,0 @@
-bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
-bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
-bio2zarr/_version.py,sha256=PKIMyjdUACH4-ONvtunQCnYE2UhlMfp9su83e3HXl5E,411
-bio2zarr/cli.py,sha256=-6cU26n5f8CpBSj6RGC-fpNByjuJ0KxSFz85O9tITPg,14961
-bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
-bio2zarr/core.py,sha256=Yd3Z6-mFI_neaxoWT6t6Tip0k1VZEcWbautHcJ0ep8Q,10486
-bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
-bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
-bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
-bio2zarr/vcf_utils.py,sha256=R3bes-xYLZ4ekaxtqDd39YVV20qHmwei3XiIg1UFhRA,17996
-bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
-bio2zarr/vcf2zarr/icf.py,sha256=rIC35RIfkk5gEE8cOmBg1d9Pj-HkPivmGvYp4PrVN1Q,41589
-bio2zarr/vcf2zarr/vcz.py,sha256=2WE4RX5jZBiKDFEztNGYgXyrLRmVWeLKlFzh0GOzylk,38198
-bio2zarr/vcf2zarr/verification.py,sha256=6xcBy-cJLaQz2Qj2crffXFMjUG-H7z637Csxe5ZCmds,7898
-bio2zarr-0.1.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-bio2zarr-0.1.1.dist-info/METADATA,sha256=RR9oM_5UYB5slsheIFzkIlRZt5du8eCb1_bMT_e7QjY,14854
-bio2zarr-0.1.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-bio2zarr-0.1.1.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
-bio2zarr-0.1.1.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
-bio2zarr-0.1.1.dist-info/RECORD,,

{bio2zarr-0.1.1.dist-info → bio2zarr-0.1.3.dist-info}/LICENSE RENAMED Viewed

File without changes

{bio2zarr-0.1.1.dist-info → bio2zarr-0.1.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{bio2zarr-0.1.1.dist-info → bio2zarr-0.1.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

bio2zarr 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

Potentially problematic release.

bio2zarr 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl