PyPI - bio2zarr - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl - Mend

bio2zarr 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of bio2zarr might be problematic. Click here for more details.

Files changed (19)

bio2zarr/__main__.py +2 -1
bio2zarr/_version.py +2 -2
bio2zarr/cli.py +91 -24
bio2zarr/core.py +43 -22
bio2zarr/plink.py +314 -189
bio2zarr/tskit.py +301 -0
bio2zarr/typing.py +1 -2
bio2zarr/{vcf2zarr/icf.py → vcf.py} +614 -118
bio2zarr/vcf_utils.py +66 -33
bio2zarr/{vcf2zarr/vcz.py → vcz.py} +544 -708
bio2zarr/{vcf2zarr/verification.py → vcz_verification.py} +5 -2
{bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info}/METADATA +19 -7
bio2zarr-0.1.6.dist-info/RECORD +21 -0
{bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info}/WHEEL +1 -1
{bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info}/entry_points.txt +2 -0
bio2zarr/vcf2zarr/__init__.py +0 -38
bio2zarr-0.1.4.dist-info/RECORD +0 -21
{bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info/licenses}/LICENSE +0 -0
{bio2zarr-0.1.4.dist-info → bio2zarr-0.1.6.dist-info}/top_level.txt +0 -0

bio2zarr/vcf_utils.py (changed)

@@ -7,12 +7,12 @@ import struct
 from collections.abc import Sequence
 from dataclasses import dataclass
 from enum import Enum
-from typing import IO, Any, Optional, Union
+from typing import IO, Any
-import cyvcf2
 import humanfriendly
 import numpy as np
+from bio2zarr import core
 from bio2zarr.typing import PathType
 logger = logging.getLogger(__name__)
@@ -33,7 +33,7 @@ def get_file_offset(vfp: int) -> int:
     return vfp >> 16 & address_mask
-def read_bytes_as_value(f: IO[Any], fmt: str, nodata: Optional[Any] = None) -> Any:
+def read_bytes_as_value(f: IO[Any], fmt: str, nodata: Any | None = None) -> Any:
     """Read bytes using a `struct` format string and return the unpacked data value.
     Parameters
@@ -85,11 +85,14 @@ class Region:
     """
     contig: str
-    start: Optional[int] = None
-    end: Optional[int] = None
+    start: int | None = None
+    end: int | None = None
     def __post_init__(self):
-        if self.start is not None:
+        assert self.contig is not None
+        if self.start is None:
+            self.start = 1
+        else:
             self.start = int(self.start)
             assert self.start > 0
         if self.end is not None:
@@ -194,9 +197,7 @@ def get_first_locus_in_bin(csi: CSIIndex, bin: int) -> int:
     return (bin - first_bin_on_level) * (max_span // level_size) + 1
-def read_csi(
-    file: PathType, storage_options: Optional[dict[str, str]] = None
-) -> CSIIndex:
+def read_csi(file: PathType, storage_options: dict[str, str] | None = None) -> CSIIndex:
     """Parse a CSI file into a `CSIIndex` object.
     Parameters
@@ -311,7 +312,7 @@ class TabixIndex:
 def read_tabix(
-    file: PathType, storage_options: Optional[dict[str, str]] = None
+    file: PathType, storage_options: dict[str, str] | None = None
 ) -> TabixIndex:
     """Parse a tabix file into a `TabixIndex` object.
@@ -393,9 +394,15 @@ class VcfIndexType(Enum):
     TABIX = ".tbi"
-class IndexedVcf(contextlib.AbstractContextManager):
+class VcfFile(contextlib.AbstractContextManager):
+    @core.requires_optional_dependency("cyvcf2", "vcf")
     def __init__(self, vcf_path, index_path=None):
+        import cyvcf2
         self.vcf = None
+        self.file_type = None
+        self.index_type = None
         vcf_path = pathlib.Path(vcf_path)
         if not vcf_path.exists():
             raise FileNotFoundError(vcf_path)
@@ -408,30 +415,34 @@ class IndexedVcf(contextlib.AbstractContextManager):
                     vcf_path.suffix + VcfIndexType.CSI.value
                 )
                 if not index_path.exists():
-                    raise FileNotFoundError(
-                        f"Cannot find .tbi or .csi file for {vcf_path}"
-                    )
+                    # No supported index found
+                    index_path = None
         else:
             index_path = pathlib.Path(index_path)
+            if not index_path.exists():
+                raise FileNotFoundError(
+                    f"Specified index path {index_path} does not exist"
+                )
         self.vcf_path = vcf_path
         self.index_path = index_path
-        self.file_type = None
-        self.index_type = None
-        if index_path.suffix == VcfIndexType.CSI.value:
-            self.index_type = VcfIndexType.CSI
-        elif index_path.suffix == VcfIndexType.TABIX.value:
-            self.index_type = VcfIndexType.TABIX
-            self.file_type = VcfFileType.VCF
-        else:
-            raise ValueError("Only .tbi or .csi indexes are supported.")
+        if index_path is not None:
+            if index_path.suffix == VcfIndexType.CSI.value:
+                self.index_type = VcfIndexType.CSI
+            elif index_path.suffix == VcfIndexType.TABIX.value:
+                self.index_type = VcfIndexType.TABIX
+                self.file_type = VcfFileType.VCF
+            else:
+                raise ValueError("Only .tbi or .csi indexes are supported.")
         self.vcf = cyvcf2.VCF(vcf_path)
-        self.vcf.set_index(str(self.index_path))
+        if self.index_path is not None:
+            self.vcf.set_index(str(self.index_path))
         logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
         self.sequence_names = None
+        self.index = None
         if self.index_type == VcfIndexType.CSI:
             # Determine the file-type based on the "aux" field.
             self.index = read_csi(self.index_path)
@@ -441,9 +452,17 @@ class IndexedVcf(contextlib.AbstractContextManager):
                 self.sequence_names = self.index.parse_vcf_aux()
             else:
                 self.sequence_names = self.vcf.seqnames
-        else:
+        elif self.index_type == VcfIndexType.TABIX:
             self.index = read_tabix(self.index_path)
+            self.file_type = VcfFileType.VCF
             self.sequence_names = self.index.sequence_names
+        else:
+            assert self.index is None
+            var = next(self.vcf)
+            self.sequence_names = [var.CHROM]
+            self.vcf.close()
+            # There doesn't seem to be a way to reset the iterator
+            self.vcf = cyvcf2.VCF(vcf_path)
     def __exit__(self, exc_type, exc_val, exc_tb):
         if self.vcf is not None:
@@ -452,6 +471,8 @@ class IndexedVcf(contextlib.AbstractContextManager):
         return False
     def contig_record_counts(self):
+        if self.index is None:
+            return {self.sequence_names[0]: RECORD_COUNT_UNKNOWN}
         d = dict(zip(self.sequence_names, self.index.record_counts))
         if self.file_type == VcfFileType.BCF:
             d = {k: v for k, v in d.items() if v > 0}
@@ -460,12 +481,21 @@ class IndexedVcf(contextlib.AbstractContextManager):
     def count_variants(self, region):
         return sum(1 for _ in self.variants(region))
-    def variants(self, region):
-        start = 1 if region.start is None else region.start
-        for var in self.vcf(str(region)):
-            # Need to filter because of indels overlapping the region
-            if var.POS >= start:
+    def variants(self, region=None):
+        if self.index is None:
+            contig = self.sequence_names[0]
+            if region is not None:
+                assert region.contig == contig
+            for var in self.vcf:
+                if var.CHROM != contig:
+                    raise ValueError("Multi-contig VCFs must be indexed")
                 yield var
+        else:
+            start = 1 if region.start is None else region.start
+            for var in self.vcf(str(region)):
+                # Need to filter because of indels overlapping the region
+                if var.POS >= start:
+                    yield var
     def _filter_empty_and_refine(self, regions):
         """
@@ -483,8 +513,8 @@ class IndexedVcf(contextlib.AbstractContextManager):
     def partition_into_regions(
         self,
-        num_parts: Optional[int] = None,
-        target_part_size: Union[None, int, str] = None,
+        num_parts: int | None = None,
+        target_part_size: None | int | str = None,
     ):
         if num_parts is None and target_part_size is None:
             raise ValueError("One of num_parts or target_part_size must be specified")
@@ -505,6 +535,9 @@ class IndexedVcf(contextlib.AbstractContextManager):
             if target_part_size_bytes < 1:
                 raise ValueError("target_part_size must be positive")
+        if self.index is None:
+            return [Region(self.sequence_names[0])]
         # Calculate the desired part file boundaries
         file_length = os.stat(self.vcf_path).st_size
         if num_parts is not None:

bio2zarr 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

Potentially problematic release.

bio2zarr 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl