PyPI - bio2zarr - Versions diffs - 0.1.4__tar.gz → 0.1.5__tar.gz - Mend

bio2zarr 0.1.4 (tar.gz) → 0.1.5 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of bio2zarr might be problematic. Click here for more details.

Files changed (55) hide show

{bio2zarr-0.1.4 → bio2zarr-0.1.5}/CHANGELOG.md RENAMED Viewed

@@ -1,3 +1,9 @@
+# 0.1.5 2025-03-31
+- Add support for merging contig IDs across multiple VCFs (#335)
+- Add support for unindexed (and uncompressed) VCFs (#337)
 # 0.1.4 2025-03-10
 - Fix bug in handling all-missing genotypes (#328)

{bio2zarr-0.1.4 → bio2zarr-0.1.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: bio2zarr
-Version: 0.1.4
+Version: 0.1.5
 Summary: Convert bioinformatics data to Zarr
 Author-email: sgkit Developers <project@sgkit.dev>
 License:                                  Apache License
@@ -241,6 +241,7 @@ Requires-Dist: pytest-coverage; extra == "dev"
 Requires-Dist: pytest-xdist; extra == "dev"
 Requires-Dist: sgkit>=0.8.0; extra == "dev"
 Requires-Dist: tqdm; extra == "dev"
+Dynamic: license-file
 [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
 [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)

{bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/_version.py RENAMED Viewed

@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.1.4'
-__version_tuple__ = version_tuple = (0, 1, 4)
+__version__ = version = '0.1.5'
+__version_tuple__ = version_tuple = (0, 1, 5)

{bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/cli.py RENAMED Viewed

@@ -624,8 +624,8 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
         num_parts_per_path = max(1, num_partitions // len(vcfs))
     for vcf_path in vcfs:
-        indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
-        regions = indexed_vcf.partition_into_regions(
+        vcf_file = vcf_utils.VcfFile(vcf_path)
+        regions = vcf_file.partition_into_regions(
             num_parts=num_parts_per_path, target_part_size=partition_size
         )
         for region in regions:

{bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/vcf2zarr/icf.py RENAMED Viewed

@@ -228,8 +228,8 @@ def fixed_vcf_field_definitions():
 def scan_vcf(path, target_num_partitions):
-    with vcf_utils.IndexedVcf(path) as indexed_vcf:
-        vcf = indexed_vcf.vcf
+    with vcf_utils.VcfFile(path) as vcf_file:
+        vcf = vcf_file.vcf
         filters = []
         pass_index = -1
         for h in vcf.header_iter():
@@ -270,10 +270,10 @@ def scan_vcf(path, target_num_partitions):
             filters=filters,
             fields=fields,
             partitions=[],
-            num_records=sum(indexed_vcf.contig_record_counts().values()),
+            num_records=sum(vcf_file.contig_record_counts().values()),
         )
-        regions = indexed_vcf.partition_into_regions(num_parts=target_num_partitions)
+        regions = vcf_file.partition_into_regions(num_parts=target_num_partitions)
         for region in regions:
             metadata.partitions.append(
                 VcfPartition(
@@ -324,14 +324,28 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     # are compatible.
     all_partitions = []
     total_records = 0
+    contigs = {}
     for metadata, _ in results:
         for partition in metadata.partitions:
             logger.debug(f"Scanned partition {partition}")
             all_partitions.append(partition)
+        for contig in metadata.contigs:
+            if contig.id in contigs:
+                if contig != contigs[contig.id]:
+                    raise ValueError(
+                        "Incompatible contig definitions: "
+                        f"{contig} != {contigs[contig.id]}"
+                    )
+            else:
+                contigs[contig.id] = contig
         total_records += metadata.num_records
         metadata.num_records = 0
         metadata.partitions = []
+    contig_union = list(contigs.values())
+    for metadata, _ in results:
+        metadata.contigs = contig_union
     icf_metadata, header = results[0]
     for metadata, _ in results[1:]:
         if metadata != icf_metadata:
@@ -1079,9 +1093,9 @@ class IntermediateColumnarFormatWriter:
             self.path,
             partition_index,
         ) as tcw:
-            with vcf_utils.IndexedVcf(partition.vcf_path) as ivcf:
+            with vcf_utils.VcfFile(partition.vcf_path) as vcf:
                 num_records = 0
-                for variant in ivcf.variants(partition.region):
+                for variant in vcf.variants(partition.region):
                     num_records += 1
                     last_position = variant.POS
                     tcw.append("CHROM", variant.CHROM)

{bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr/vcf_utils.py RENAMED Viewed

@@ -89,7 +89,10 @@ class Region:
     end: Optional[int] = None
     def __post_init__(self):
-        if self.start is not None:
+        assert self.contig is not None
+        if self.start is None:
+            self.start = 1
+        else:
             self.start = int(self.start)
             assert self.start > 0
         if self.end is not None:
@@ -393,9 +396,12 @@ class VcfIndexType(Enum):
     TABIX = ".tbi"
-class IndexedVcf(contextlib.AbstractContextManager):
+class VcfFile(contextlib.AbstractContextManager):
     def __init__(self, vcf_path, index_path=None):
         self.vcf = None
+        self.file_type = None
+        self.index_type = None
         vcf_path = pathlib.Path(vcf_path)
         if not vcf_path.exists():
             raise FileNotFoundError(vcf_path)
@@ -408,30 +414,34 @@ class IndexedVcf(contextlib.AbstractContextManager):
                     vcf_path.suffix + VcfIndexType.CSI.value
                 )
                 if not index_path.exists():
-                    raise FileNotFoundError(
-                        f"Cannot find .tbi or .csi file for {vcf_path}"
-                    )
+                    # No supported index found
+                    index_path = None
         else:
             index_path = pathlib.Path(index_path)
+            if not index_path.exists():
+                raise FileNotFoundError(
+                    f"Specified index path {index_path} does not exist"
+                )
         self.vcf_path = vcf_path
         self.index_path = index_path
-        self.file_type = None
-        self.index_type = None
-        if index_path.suffix == VcfIndexType.CSI.value:
-            self.index_type = VcfIndexType.CSI
-        elif index_path.suffix == VcfIndexType.TABIX.value:
-            self.index_type = VcfIndexType.TABIX
-            self.file_type = VcfFileType.VCF
-        else:
-            raise ValueError("Only .tbi or .csi indexes are supported.")
+        if index_path is not None:
+            if index_path.suffix == VcfIndexType.CSI.value:
+                self.index_type = VcfIndexType.CSI
+            elif index_path.suffix == VcfIndexType.TABIX.value:
+                self.index_type = VcfIndexType.TABIX
+                self.file_type = VcfFileType.VCF
+            else:
+                raise ValueError("Only .tbi or .csi indexes are supported.")
         self.vcf = cyvcf2.VCF(vcf_path)
-        self.vcf.set_index(str(self.index_path))
+        if self.index_path is not None:
+            self.vcf.set_index(str(self.index_path))
         logger.debug(f"Loaded {vcf_path} with index {self.index_path}")
         self.sequence_names = None
+        self.index = None
         if self.index_type == VcfIndexType.CSI:
             # Determine the file-type based on the "aux" field.
             self.index = read_csi(self.index_path)
@@ -441,9 +451,17 @@ class IndexedVcf(contextlib.AbstractContextManager):
                 self.sequence_names = self.index.parse_vcf_aux()
             else:
                 self.sequence_names = self.vcf.seqnames
-        else:
+        elif self.index_type == VcfIndexType.TABIX:
             self.index = read_tabix(self.index_path)
+            self.file_type = VcfFileType.VCF
             self.sequence_names = self.index.sequence_names
+        else:
+            assert self.index is None
+            var = next(self.vcf)
+            self.sequence_names = [var.CHROM]
+            self.vcf.close()
+            # There doesn't seem to be a way to reset the iterator
+            self.vcf = cyvcf2.VCF(vcf_path)
     def __exit__(self, exc_type, exc_val, exc_tb):
         if self.vcf is not None:
@@ -452,6 +470,8 @@ class IndexedVcf(contextlib.AbstractContextManager):
         return False
     def contig_record_counts(self):
+        if self.index is None:
+            return {self.sequence_names[0]: RECORD_COUNT_UNKNOWN}
         d = dict(zip(self.sequence_names, self.index.record_counts))
         if self.file_type == VcfFileType.BCF:
             d = {k: v for k, v in d.items() if v > 0}
@@ -460,12 +480,21 @@ class IndexedVcf(contextlib.AbstractContextManager):
     def count_variants(self, region):
         return sum(1 for _ in self.variants(region))
-    def variants(self, region):
-        start = 1 if region.start is None else region.start
-        for var in self.vcf(str(region)):
-            # Need to filter because of indels overlapping the region
-            if var.POS >= start:
+    def variants(self, region=None):
+        if self.index is None:
+            contig = self.sequence_names[0]
+            if region is not None:
+                assert region.contig == contig
+            for var in self.vcf:
+                if var.CHROM != contig:
+                    raise ValueError("Multi-contig VCFs must be indexed")
                 yield var
+        else:
+            start = 1 if region.start is None else region.start
+            for var in self.vcf(str(region)):
+                # Need to filter because of indels overlapping the region
+                if var.POS >= start:
+                    yield var
     def _filter_empty_and_refine(self, regions):
         """
@@ -505,6 +534,9 @@ class IndexedVcf(contextlib.AbstractContextManager):
             if target_part_size_bytes < 1:
                 raise ValueError("target_part_size must be positive")
+        if self.index is None:
+            return [Region(self.sequence_names[0])]
         # Calculate the desired part file boundaries
         file_length = os.stat(self.vcf_path).st_size
         if num_parts is not None:

{bio2zarr-0.1.4 → bio2zarr-0.1.5}/bio2zarr.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: bio2zarr
-Version: 0.1.4
+Version: 0.1.5
 Summary: Convert bioinformatics data to Zarr
 Author-email: sgkit Developers <project@sgkit.dev>
 License:                                  Apache License
@@ -241,6 +241,7 @@ Requires-Dist: pytest-coverage; extra == "dev"
 Requires-Dist: pytest-xdist; extra == "dev"
 Requires-Dist: sgkit>=0.8.0; extra == "dev"
 Requires-Dist: tqdm; extra == "dev"
+Dynamic: license-file
 [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
 [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)