PyPI - bio2zarr - Versions diffs - 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl - Mend

bio2zarr 0.0.9py3-none-any.whl → 0.1.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of bio2zarr might be problematic. Click here for more details.

Files changed (19) hide show

bio2zarr/__main__.py +2 -2
bio2zarr/_version.py +2 -2
bio2zarr/cli.py +176 -113
bio2zarr/constants.py +18 -0
bio2zarr/core.py +65 -20
bio2zarr/vcf2zarr/__init__.py +38 -0
bio2zarr/vcf2zarr/icf.py +1221 -0
bio2zarr/vcf2zarr/vcz.py +1053 -0
bio2zarr/vcf2zarr/verification.py +230 -0
bio2zarr/vcf_utils.py +11 -6
{bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/METADATA +10 -123
bio2zarr-0.1.0.dist-info/RECORD +20 -0
bio2zarr-0.1.0.dist-info/entry_points.txt +3 -0
bio2zarr/vcf.py +0 -2445
bio2zarr-0.0.9.dist-info/RECORD +0 -16
bio2zarr-0.0.9.dist-info/entry_points.txt +0 -4
{bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/LICENSE +0 -0
{bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/WHEEL +0 -0
{bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/top_level.txt +0 -0

bio2zarr/vcf2zarr/verification.py ADDED Viewed

@@ -0,0 +1,230 @@
+import cyvcf2
+import numpy as np
+import numpy.testing as nt
+import tqdm
+import zarr
+from .. import constants
+def assert_all_missing_float(a):
+    v = np.array(a, dtype=np.float32).view(np.int32)
+    nt.assert_equal(v, constants.FLOAT32_MISSING_AS_INT32)
+def assert_all_fill_float(a):
+    v = np.array(a, dtype=np.float32).view(np.int32)
+    nt.assert_equal(v, constants.FLOAT32_FILL_AS_INT32)
+def assert_all_missing_int(a):
+    v = np.array(a, dtype=int)
+    nt.assert_equal(v, constants.INT_MISSING)
+def assert_all_fill_int(a):
+    v = np.array(a, dtype=int)
+    nt.assert_equal(v, constants.INT_FILL)
+def assert_all_missing_string(a):
+    nt.assert_equal(a, constants.STR_MISSING)
+def assert_all_fill_string(a):
+    nt.assert_equal(a, constants.STR_FILL)
+def assert_all_fill(zarr_val, vcf_type):
+    if vcf_type == "Integer":
+        assert_all_fill_int(zarr_val)
+    elif vcf_type in ("String", "Character"):
+        assert_all_fill_string(zarr_val)
+    elif vcf_type == "Float":
+        assert_all_fill_float(zarr_val)
+    else:  # pragma: no cover
+        assert False  # noqa PT015
+def assert_all_missing(zarr_val, vcf_type):
+    if vcf_type == "Integer":
+        assert_all_missing_int(zarr_val)
+    elif vcf_type in ("String", "Character"):
+        assert_all_missing_string(zarr_val)
+    elif vcf_type == "Flag":
+        assert zarr_val == False  # noqa 712
+    elif vcf_type == "Float":
+        assert_all_missing_float(zarr_val)
+    else:  # pragma: no cover
+        assert False  # noqa PT015
+def assert_info_val_missing(zarr_val, vcf_type):
+    assert_all_missing(zarr_val, vcf_type)
+def assert_format_val_missing(zarr_val, vcf_type):
+    assert_info_val_missing(zarr_val, vcf_type)
+# Note: checking exact equality may prove problematic here
+# but we should be deterministically storing what cyvcf2
+# provides, which should compare equal.
+def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
+    assert vcf_val is not None
+    if vcf_type in ("String", "Character"):
+        split = list(vcf_val.split(","))
+        k = len(split)
+        if isinstance(zarr_val, str):
+            assert k == 1
+            # Scalar
+            assert vcf_val == zarr_val
+        else:
+            nt.assert_equal(split, zarr_val[:k])
+            assert_all_fill(zarr_val[k:], vcf_type)
+    elif isinstance(vcf_val, tuple):
+        vcf_missing_value_map = {
+            "Integer": constants.INT_MISSING,
+            "Float": constants.FLOAT32_MISSING,
+        }
+        v = [vcf_missing_value_map[vcf_type] if x is None else x for x in vcf_val]
+        missing = np.array([j for j, x in enumerate(vcf_val) if x is None], dtype=int)
+        a = np.array(v)
+        k = len(a)
+        # We are checking for int missing twice here, but it's necessary to have
+        # a separate check for floats because different NaNs compare equal
+        nt.assert_equal(a, zarr_val[:k])
+        assert_all_missing(zarr_val[missing], vcf_type)
+        if k < len(zarr_val):
+            assert_all_fill(zarr_val[k:], vcf_type)
+    else:
+        # Scalar
+        zarr_val = np.array(zarr_val, ndmin=1)
+        assert len(zarr_val.shape) == 1
+        assert vcf_val == zarr_val[0]
+        if len(zarr_val) > 1:
+            assert_all_fill(zarr_val[1:], vcf_type)
+def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
+    assert vcf_val is not None
+    assert isinstance(vcf_val, np.ndarray)
+    if vcf_type in ("String", "Character"):
+        assert len(vcf_val) == len(zarr_val)
+        for v, z in zip(vcf_val, zarr_val):
+            split = list(v.split(","))
+            # Note: deliberately duplicating logic here between this and the
+            # INFO col above to make sure all combinations are covered by tests
+            k = len(split)
+            if k == 1:
+                assert v == z
+            else:
+                nt.assert_equal(split, z[:k])
+                assert_all_fill(z[k:], vcf_type)
+    else:
+        assert vcf_val.shape[0] == zarr_val.shape[0]
+        if len(vcf_val.shape) == len(zarr_val.shape) + 1:
+            assert vcf_val.shape[-1] == 1
+            vcf_val = vcf_val[..., 0]
+        assert len(vcf_val.shape) <= 2
+        assert len(vcf_val.shape) == len(zarr_val.shape)
+        if len(vcf_val.shape) == 2:
+            k = vcf_val.shape[1]
+            if zarr_val.shape[1] != k:
+                assert_all_fill(zarr_val[:, k:], vcf_type)
+                zarr_val = zarr_val[:, :k]
+        assert vcf_val.shape == zarr_val.shape
+        if vcf_type == "Integer":
+            vcf_val[vcf_val == constants.VCF_INT_MISSING] = constants.INT_MISSING
+            vcf_val[vcf_val == constants.VCF_INT_FILL] = constants.INT_FILL
+        elif vcf_type == "Float":
+            nt.assert_equal(vcf_val.view(np.int32), zarr_val.view(np.int32))
+        nt.assert_equal(vcf_val, zarr_val)
+def verify(vcf_path, zarr_path, show_progress=False):
+    store = zarr.DirectoryStore(zarr_path)
+    root = zarr.group(store=store)
+    pos = root["variant_position"][:]
+    allele = root["variant_allele"][:]
+    chrom = root["contig_id"][:][root["variant_contig"][:]]
+    vid = root["variant_id"][:]
+    call_genotype = None
+    if "call_genotype" in root:
+        call_genotype = iter(root["call_genotype"])
+    vcf = cyvcf2.VCF(vcf_path)
+    format_headers = {}
+    info_headers = {}
+    for h in vcf.header_iter():
+        if h["HeaderType"] == "FORMAT":
+            format_headers[h["ID"]] = h
+        if h["HeaderType"] == "INFO":
+            info_headers[h["ID"]] = h
+    format_fields = {}
+    info_fields = {}
+    for colname in root.keys():
+        if colname.startswith("call") and not colname.startswith("call_genotype"):
+            vcf_name = colname.split("_", 1)[1]
+            vcf_type = format_headers[vcf_name]["Type"]
+            format_fields[vcf_name] = vcf_type, iter(root[colname])
+        if colname.startswith("variant"):
+            name = colname.split("_", 1)[1]
+            if name.isupper():
+                vcf_type = info_headers[name]["Type"]
+                info_fields[name] = vcf_type, iter(root[colname])
+    first_pos = next(vcf).POS
+    start_index = np.searchsorted(pos, first_pos)
+    assert pos[start_index] == first_pos
+    vcf = cyvcf2.VCF(vcf_path)
+    if show_progress:
+        iterator = tqdm.tqdm(vcf, desc="  Verify", total=vcf.num_records)  # NEEDS TEST
+    else:
+        iterator = vcf
+    for j, row in enumerate(iterator, start_index):
+        assert chrom[j] == row.CHROM
+        assert pos[j] == row.POS
+        assert vid[j] == ("." if row.ID is None else row.ID)
+        assert allele[j, 0] == row.REF
+        k = len(row.ALT)
+        nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
+        assert np.all(allele[j, k + 1 :] == "")
+        # TODO FILTERS
+        if call_genotype is None:
+            val = None
+            try:
+                val = row.format("GT")
+            except KeyError:
+                pass
+            assert val is None
+        else:
+            gt = row.genotype.array()
+            gt_zarr = next(call_genotype)
+            gt_vcf = gt[:, :-1]
+            # NOTE cyvcf2 remaps genotypes automatically
+            # into the same missing/pad encoding that sgkit uses.
+            nt.assert_array_equal(gt_zarr, gt_vcf)
+        for name, (vcf_type, zarr_iter) in info_fields.items():
+            vcf_val = row.INFO.get(name, None)
+            zarr_val = next(zarr_iter)
+            if vcf_val is None:
+                assert_info_val_missing(zarr_val, vcf_type)
+            else:
+                assert_info_val_equal(vcf_val, zarr_val, vcf_type)
+        for name, (vcf_type, zarr_iter) in format_fields.items():
+            vcf_val = row.format(name)
+            zarr_val = next(zarr_iter)
+            if vcf_val is None:
+                assert_format_val_missing(zarr_val, vcf_type)
+            else:
+                assert_format_val_equal(vcf_val, zarr_val, vcf_type)

bio2zarr/vcf_utils.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import contextlib
 import gzip
+import logging
 import os
 import pathlib
 import struct
@@ -13,6 +14,8 @@ import numpy as np
 from bio2zarr.typing import PathType
+logger = logging.getLogger(__name__)
 CSI_EXTENSION = ".csi"
 TABIX_EXTENSION = ".tbi"
 TABIX_LINEAR_INDEX_INTERVAL_SIZE = 1 << 14  # 16kb interval size
@@ -411,6 +414,7 @@ class IndexedVcf(contextlib.AbstractContextManager):
             raise ValueError("Only .tbi or .csi indexes are supported.")
         self.vcf = cyvcf2.VCF(vcf_path)
         self.vcf.set_index(str(self.index_path))
+        logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
         self.sequence_names = None
         if self.index_type == "csi":
             # Determine the file-type based on the "aux" field.
@@ -441,24 +445,25 @@ class IndexedVcf(contextlib.AbstractContextManager):
         return sum(1 for _ in self.variants(region))
     def variants(self, region):
-        # Need to filter because of indels overlapping the region
         start = 1 if region.start is None else region.start
         for var in self.vcf(str(region)):
+            # Need to filter because of indels overlapping the region
             if var.POS >= start:
                 yield var
     def _filter_empty_and_refine(self, regions):
         """
         Return all regions in the specified list that have one or more records,
-        and refine the start coordinate of the region to be the actual first coord
+        and refine the start coordinate of the region to be the actual first coord.
+        Because this is a relatively expensive operation requiring seeking around
+        the file, we return the results as an iterator.
         """
-        ret = []
         for region in regions:
             var = next(self.variants(region), None)
             if var is not None:
                 region.start = var.POS
-                ret.append(region)
-        return ret
+                yield region
     def partition_into_regions(
         self,
@@ -490,7 +495,7 @@ class IndexedVcf(contextlib.AbstractContextManager):
             target_part_size_bytes = file_length // num_parts
         elif target_part_size_bytes is not None:
             num_parts = ceildiv(file_length, target_part_size_bytes)
-        part_lengths = np.array([i * target_part_size_bytes for i in range(num_parts)])
+        part_lengths = target_part_size_bytes * np.arange(num_parts, dtype=int)
         file_offsets, region_contig_indexes, region_positions = self.index.offsets()
         # Search the file offsets to find which indexes the part lengths fall at

{bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bio2zarr
-Version: 0.0.9
+Version: 0.1.0
 Summary: Convert bioinformatics data to Zarr
 Author-email: sgkit Developers <project@sgkit.dev>
 License: Apache License
@@ -206,10 +206,13 @@ License: Apache License
            limitations under the License.
 Project-URL: repository, https://github.com/sgkit-dev/bio2zarr
-Project-URL: documentation, https://sgkit-dev.github.io/bio2zarr/intro.html
-Classifier: Development Status :: 3 - Alpha
+Project-URL: documentation, https://sgkit-dev.github.io/bio2zarr/
+Classifier: Development Status :: 4 - Beta
 Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Operating System :: OS Independent
+Classifier: Operating System :: POSIX
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Operating System :: MacOS
+Classifier: Operating System :: MacOS :: MacOS X
 Classifier: Intended Audience :: Science/Research
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
@@ -238,126 +241,10 @@ Requires-Dist: sgkit >=0.8.0 ; extra == 'dev'
 Requires-Dist: tqdm ; extra == 'dev'
 [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
+[![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
 # bio2zarr
 Convert bioinformatics file formats to Zarr
-Initially supports converting VCF to the
-[sgkit vcf-zarr specification](https://github.com/pystatgen/vcf-zarr-spec/)
-**This is early alpha-status code: everything is subject to change,
-and it has not been thoroughly tested**
-## Install
-```
-$ python3 -m pip install bio2zarr
-```
-This will install the programs ``vcf2zarr``, ``plink2zarr`` and ``vcf_partition``
-into your local Python path. You may need to update your $PATH to call the
-executables directly.
-Alternatively, calling
-```
-$ python3 -m bio2zarr vcf2zarr <args>
-```
-is equivalent to
-```
-$ vcf2zarr <args>
-```
-and will always work.
-## vcf2zarr
-Convert a VCF to zarr format:
-```
-$ vcf2zarr convert <VCF1> <VCF2> <zarr>
-```
-Converts the VCF to zarr format.
-**Do not use this for anything but the smallest files**
-The recommended approach is to use a multi-stage conversion
-First, convert the VCF into the intermediate format:
-```
-vcf2zarr explode tests/data/vcf/sample.vcf.gz tmp/sample.exploded
-```
-Then, (optionally) inspect this representation to get a feel for your dataset
-```
-vcf2zarr inspect tmp/sample.exploded
-```
-Then, (optionally) generate a conversion schema to describe the corresponding
-Zarr arrays:
-```
-vcf2zarr mkschema tmp/sample.exploded > sample.schema.json
-```
-View and edit the schema, deleting any columns you don't want, or tweaking
-dtypes and compression settings to your taste.
-Finally, encode to Zarr:
-```
-vcf2zarr encode tmp/sample.exploded tmp/sample.zarr -s sample.schema.json
-```
-Use the ``-p, --worker-processes`` argument to control the number of workers used
-in the ``explode`` and ``encode`` phases.
-### Shell completion
-To enable shell completion for a particular session in Bash do:
-```
-eval "$(_VCF2ZARR_COMPLETE=bash_source vcf2zarr)"
-```
-If you add this to your ``.bashrc`` vcf2zarr shell completion should available
-in all new shell sessions.
-See the [Click documentation](https://click.palletsprojects.com/en/8.1.x/shell-completion/#enabling-completion)
-for instructions on how to enable completion in other shells.
-a
-## plink2zarr
-Convert a plink ``.bed`` file to zarr format. **This is incomplete**
-## vcf_partition
-Partition a given VCF file into (approximately) a give number of regions:
-```
-vcf_partition 20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.recalibrated_variants.vcf.gz -n 10
-```
-gives
-```
-chr20:1-6799360
-chr20:6799361-14319616
-chr20:14319617-21790720
-chr20:21790721-28770304
-chr20:28770305-31096832
-chr20:31096833-38043648
-chr20:38043649-45580288
-chr20:45580289-52117504
-chr20:52117505-58834944
-chr20:58834945-
-```
-These reqion strings can then be used to split computation of the VCF
-into chunks for parallelisation.
-**TODO give a nice example here using xargs**
-**WARNING that this does not take into account that indels may overlap
-partitions and you may count variants twice or more if they do**
+See the [documentation](https://sgkit-dev.github.io/bio2zarr/) for details.

bio2zarr-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,20 @@
+bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
+bio2zarr/__main__.py,sha256=wUKNNps8MAAEpMvLgVaI449eKyfr7Jpk2mMtYbNl4Ek,531
+bio2zarr/_version.py,sha256=IMl2Pr_Sy4LVRKy_Sm4CdwUl1Gryous6ncL96EMYsnM,411
+bio2zarr/cli.py,sha256=-6cU26n5f8CpBSj6RGC-fpNByjuJ0KxSFz85O9tITPg,14961
+bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
+bio2zarr/core.py,sha256=Yd3Z6-mFI_neaxoWT6t6Tip0k1VZEcWbautHcJ0ep8Q,10486
+bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
+bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
+bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
+bio2zarr/vcf_utils.py,sha256=R3bes-xYLZ4ekaxtqDd39YVV20qHmwei3XiIg1UFhRA,17996
+bio2zarr/vcf2zarr/__init__.py,sha256=0_of1iGzIDhvti49Gbcgd47oP63mKvouk9uLgKgiwoQ,791
+bio2zarr/vcf2zarr/icf.py,sha256=rIC35RIfkk5gEE8cOmBg1d9Pj-HkPivmGvYp4PrVN1Q,41589
+bio2zarr/vcf2zarr/vcz.py,sha256=2WE4RX5jZBiKDFEztNGYgXyrLRmVWeLKlFzh0GOzylk,38198
+bio2zarr/vcf2zarr/verification.py,sha256=6xcBy-cJLaQz2Qj2crffXFMjUG-H7z637Csxe5ZCmds,7898
+bio2zarr-0.1.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+bio2zarr-0.1.0.dist-info/METADATA,sha256=zezBzqrJPB4ED7IqFvVj8Lura2untJA8optBdVTBNzc,14848
+bio2zarr-0.1.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+bio2zarr-0.1.0.dist-info/entry_points.txt,sha256=3adtRrClMpjatEbiYqK5bm9WHA2PaJN5hK-Cs_zkpaI,97
+bio2zarr-0.1.0.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
+bio2zarr-0.1.0.dist-info/RECORD,,

bio2zarr-0.1.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,3 @@
+[console_scripts]
+vcf2zarr = bio2zarr.cli:vcf2zarr_main
+vcfpartition = bio2zarr.cli:vcfpartition

bio2zarr 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl

Potentially problematic release.

bio2zarr 0.0.9py3-none-any.whl → 0.1.0py3-none-any.whl