bio2zarr 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

bio2zarr/__init__.py CHANGED
@@ -1 +1 @@
-from . provenance import __version__
+from .provenance import __version__  # noqa F401
bio2zarr/__main__.py CHANGED
@@ -2,11 +2,13 @@ import click
 
 from . import cli
 
+
 @cli.version
 @click.group()
 def bio2zarr():
     pass
 
+
 # Provide a single top-level interface to all of the functionality.
 # This probably isn't the recommended way of interacting, as we
 # install individual commands as console scripts. However, this
bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
 
-__version__ = version = '0.0.3'
-__version_tuple__ = version_tuple = (0, 0, 3)
+__version__ = version = '0.0.5'
+__version_tuple__ = version_tuple = (0, 0, 5)
bio2zarr/cli.py CHANGED
@@ -4,14 +4,11 @@ import pathlib
 import shutil
 
 import click
-import tabulate
 import coloredlogs
+import numcodecs
+import tabulate
 
-from . import vcf
-from . import vcf_utils
-from . import plink
-from . import provenance
-
+from . import plink, provenance, vcf, vcf_utils
 
 logger = logging.getLogger(__name__)
 
@@ -66,6 +63,17 @@ column_chunk_size = click.option(
     help="Approximate uncompressed size of exploded column chunks in MiB",
 )
 
+# We could provide the full flexiblity of numcodecs/Blosc here, but there
+# doesn't seem much point. Can always add more arguments here to control
+# compression level, etc.
+compressor = click.option(
+    "-C",
+    "--compressor",
+    type=click.Choice(["lz4", "zstd"]),
+    default=None,
+    help="Codec to use for compressing column chunks (Default=zstd).",
+)
+
 # Note: -l and -w were chosen when these were called "width" and "length".
 # possibly there are better letters now.
 variants_chunk_size = click.option(
@@ -113,24 +121,36 @@ def check_overwrite_dir(path, force):
         shutil.rmtree(tmp_delete_path)
 
 
+def get_compressor(cname):
+    if cname is None:
+        return None
+    config = vcf.ICF_DEFAULT_COMPRESSOR.get_config()
+    config["cname"] = cname
+    return numcodecs.get_codec(config)
+
+
 @click.command
 @vcfs
 @new_icf_path
 @force
 @verbose
-@worker_processes
 @column_chunk_size
-def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size):
+@compressor
+@worker_processes
+def explode(
+    vcfs, icf_path, force, verbose, column_chunk_size, compressor, worker_processes
+):
     """
     Convert VCF(s) to intermediate columnar format
     """
     setup_logging(verbose)
     check_overwrite_dir(icf_path, force)
     vcf.explode(
-        vcfs,
         icf_path,
+        vcfs,
         worker_processes=worker_processes,
         column_chunk_size=column_chunk_size,
+        compressor=get_compressor(compressor),
         show_progress=True,
     )
 
@@ -141,10 +161,18 @@ def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size)
 @click.argument("num_partitions", type=click.IntRange(min=1))
 @force
 @column_chunk_size
+@compressor
 @verbose
 @worker_processes
 def dexplode_init(
-    vcfs, icf_path, num_partitions, force, column_chunk_size, verbose, worker_processes
+    vcfs,
+    icf_path,
+    num_partitions,
+    force,
+    column_chunk_size,
+    compressor,
+    verbose,
+    worker_processes,
 ):
     """
     Initial step for distributed conversion of VCF(s) to intermediate columnar format
@@ -158,6 +186,7 @@ def dexplode_init(
         target_num_partitions=num_partitions,
         column_chunk_size=column_chunk_size,
         worker_processes=worker_processes,
+        compressor=get_compressor(compressor),
         show_progress=True,
     )
     click.echo(num_partitions)
@@ -174,7 +203,7 @@ def dexplode_partition(icf_path, partition, verbose):
     from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
     """
     setup_logging(verbose)
-    vcf.explode_partition(icf_path, partition, show_progress=True)
+    vcf.explode_partition(icf_path, partition, show_progress=False)
 
 
 @click.command
@@ -232,9 +261,8 @@ def mkschema(icf_path):
 @click.option(
     "-M",
     "--max-memory",
-    type=int,
     default=None,
-    help="An approximate bound on overall memory usage in megabytes",
+    help="An approximate bound on overall memory usage (e.g. 10G),",
 )
 @worker_processes
 def encode(
@@ -250,7 +278,7 @@ def encode(
     worker_processes,
 ):
     """
-    Encode intermediate columnar format (see explode) to vcfzarr.
+    Convert intermediate columnar format to vcfzarr.
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
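
The new -C/--compressor option only selects a Blosc codec name; the remaining settings are taken from the ICF default configuration. A minimal sketch of the equivalent numcodecs calls, not part of the package itself (the "lz4" value is just an illustrative choice):

import numcodecs

# Mirror of get_compressor: start from the default Blosc config and swap the codec name.
default = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE)
config = default.get_config()
config["cname"] = "lz4"  # e.g. the value passed via -C/--compressor
codec = numcodecs.get_codec(config)
print(codec)  # Blosc(cname='lz4', clevel=7, shuffle=NOSHUFFLE, blocksize=0)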
bio2zarr/core.py CHANGED
@@ -1,16 +1,15 @@
-import dataclasses
-import contextlib
 import concurrent.futures as cf
+import contextlib
+import dataclasses
+import logging
 import multiprocessing
 import threading
-import logging
 import time
 
-import zarr
+import numcodecs
 import numpy as np
 import tqdm
-import numcodecs
-
+import zarr
 
 logger = logging.getLogger(__name__)
 
@@ -50,7 +49,8 @@ def wait_on_futures(futures):
         cancel_futures(futures)
         if isinstance(exception, cf.process.BrokenProcessPool):
             raise RuntimeError(
-                "Worker process died: you may have run out of memory") from exception
+                "Worker process died: you may have run out of memory"
+            ) from exception
     else:
         raise exception
 
bio2zarr/plink.py CHANGED
@@ -1,14 +1,13 @@
 import logging
 
+import bed_reader
 import humanfriendly
+import numcodecs
 import numpy as np
 import zarr
-import bed_reader
-import numcodecs
 
 from . import core
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -24,7 +23,6 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
     gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
     gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
     variants_chunk_size = gt.array.chunks[0]
-    n = gt.array.shape[1]
     assert start % variants_chunk_size == 0
 
     logger.debug(f"Reading slice {start}:{stop}")
@@ -96,7 +94,7 @@ def convert(
         chunks=(samples_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
-    logger.debug(f"Encoded samples")
+    logger.debug("Encoded samples")
 
     # TODO encode these in slices - but read them in one go to avoid
     # fetching repeatedly from bim file
@@ -108,7 +106,7 @@ def convert(
         chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
-    logger.debug(f"encoded variant_position")
+    logger.debug("encoded variant_position")
 
     alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
     a = root.array(
@@ -119,7 +117,7 @@ def convert(
         chunks=(variants_chunk_size,),
    )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
-    logger.debug(f"encoded variant_allele")
+    logger.debug("encoded variant_allele")
 
     # TODO remove this?
     a = root.empty(
@@ -201,4 +199,4 @@ def validate(bed_path, zarr_path):
             elif bed_call == 2:
                 assert list(zarr_call) == [1, 1]
             else:  # pragma no cover
-                assert False
+                raise AssertionError(f"Unexpected bed call {bed_call}")
bio2zarr/typing.py CHANGED
@@ -1,4 +1,4 @@
 from pathlib import Path
 from typing import Union
 
-PathType = Union[str, Path]
+PathType = Union[str, Path]
bio2zarr/vcf.py CHANGED
@@ -1,29 +1,27 @@
 import collections
+import contextlib
 import dataclasses
 import functools
+import json
 import logging
+import math
 import os
 import pathlib
 import pickle
-import sys
 import shutil
-import json
-import math
+import sys
 import tempfile
-import contextlib
 from typing import Any, List
 
-import humanfriendly
 import cyvcf2
+import humanfriendly
 import numcodecs
 import numpy as np
 import numpy.testing as nt
 import tqdm
 import zarr
 
-from . import core
-from . import provenance
-from . import vcf_utils
+from . import core, provenance, vcf_utils
 
 logger = logging.getLogger(__name__)
 
@@ -151,8 +149,8 @@ class VcfPartition:
 
 ICF_METADATA_FORMAT_VERSION = "0.2"
 ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
-    cname="lz4", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
-).get_config()
+    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
+)
 
 
 @dataclasses.dataclass
@@ -284,11 +282,25 @@ def scan_vcf(path, target_num_partitions):
     return metadata, vcf.raw_header
 
 
-def scan_vcfs(
-    paths, show_progress, target_num_partitions, column_chunk_size, worker_processes=1
-):
+def check_overlap(partitions):
+    for i in range(1, len(partitions)):
+        prev_partition = partitions[i - 1]
+        current_partition = partitions[i]
+        if (
+            prev_partition.region.contig == current_partition.region.contig
+            and prev_partition.region.end > current_partition.region.start
+        ):
+            raise ValueError(
+                f"Multiple VCFs have the region "
+                f"{prev_partition.region.contig}:{prev_partition.region.start}-"
+                f"{current_partition.region.end}"
+            )
+
+
+def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     logger.info(
-        f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
+        f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
+        f" partitions."
     )
     # An easy mistake to make is to pass the same file twice. Check this early on.
     for path, count in collections.Counter(paths).items():
@@ -333,13 +345,8 @@ def scan_vcfs(
     all_partitions.sort(
         key=lambda x: (contig_index_map[x.region.contig], x.region.start)
     )
+    check_overlap(all_partitions)
     icf_metadata.partitions = all_partitions
-    icf_metadata.format_version = ICF_METADATA_FORMAT_VERSION
-    icf_metadata.compressor = ICF_DEFAULT_COMPRESSOR
-    icf_metadata.column_chunk_size = column_chunk_size
-    # Bare minimum here for provenance - would be nice to include versions of key
-    # dependencies as well.
-    icf_metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
     logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
     return icf_metadata, header
 
@@ -799,6 +806,8 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         for vcf_field in icf_metadata.fields:
             field_path = get_vcf_field_path(out_path, vcf_field)
             field_partition_path = field_path / f"p{partition_index}"
+            # Should be robust to running explode_partition twice.
+            field_partition_path.mkdir(exist_ok=True)
             transformer = VcfValueTransformer.factory(vcf_field, num_samples)
             self.field_writers[vcf_field.full_name] = IcfFieldWriter(
                 vcf_field,
@@ -824,13 +833,7 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         return False
 
 
-# TODO rename to IntermediateColumnarFormat and move to icf.py
-
-
 class IntermediateColumnarFormat(collections.abc.Mapping):
-    # TODO Check if other compressors would give reasonable compression
-    # with significantly faster times
-
     def __init__(self, path):
         self.path = pathlib.Path(path)
         # TODO raise a more informative error here telling people this
@@ -846,7 +849,7 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
            partition.num_records for partition in self.metadata.partitions
         ]
         # Allow us to find which partition a given record is in
-        self.partition_record_index = np.cumsum([0] + partition_num_records)
+        self.partition_record_index = np.cumsum([0, *partition_num_records])
         for field in self.metadata.fields:
             self.columns[field.full_name] = IntermediateColumnarFormatField(self, field)
         logger.info(
@@ -856,7 +859,8 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
 
     def __repr__(self):
         return (
-            f"IntermediateColumnarFormat(fields={len(self)}, partitions={self.num_partitions}, "
+            f"IntermediateColumnarFormat(fields={len(self)}, "
+            f"partitions={self.num_partitions}, "
            f"records={self.num_records}, path={self.path})"
         )
 
@@ -922,9 +926,12 @@ class IntermediateColumnarFormatWriter:
         worker_processes=1,
         target_num_partitions=None,
         show_progress=False,
+        compressor=None,
     ):
         if self.path.exists():
-            shutil.rmtree(self.path)
+            raise ValueError("ICF path already exists")
+        if compressor is None:
+            compressor = ICF_DEFAULT_COMPRESSOR
         vcfs = [pathlib.Path(vcf) for vcf in vcfs]
         target_num_partitions = max(target_num_partitions, len(vcfs))
 
@@ -934,39 +941,38 @@
             worker_processes=worker_processes,
             show_progress=show_progress,
             target_num_partitions=target_num_partitions,
-            column_chunk_size=column_chunk_size,
         )
         self.metadata = icf_metadata
+        self.metadata.format_version = ICF_METADATA_FORMAT_VERSION
+        self.metadata.compressor = compressor.get_config()
+        self.metadata.column_chunk_size = column_chunk_size
+        # Bare minimum here for provenance - would be nice to include versions of key
+        # dependencies as well.
+        self.metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
 
         self.mkdirs()
 
         # Note: this is needed for the current version of the vcfzarr spec, but it's
-        # probably goint to be dropped.
+        # probably going to be dropped.
         # https://github.com/pystatgen/vcf-zarr-spec/issues/15
         # May be useful to keep lying around still though?
-        logger.info(f"Writing VCF header")
+        logger.info("Writing VCF header")
         with open(self.path / "header.txt", "w") as f:
             f.write(header)
 
-        logger.info(f"Writing WIP metadata")
+        logger.info("Writing WIP metadata")
        with open(self.wip_path / "metadata.json", "w") as f:
             json.dump(self.metadata.asdict(), f, indent=4)
         return self.num_partitions
 
     def mkdirs(self):
-        # TODO add worker_processes here and do this with the ParallelWorkManager
-        logger.info(
-            f"Creating {len(self.metadata.fields) * self.num_partitions} directories"
-        )
+        num_dirs = len(self.metadata.fields)
+        logger.info(f"Creating {num_dirs} field directories")
         self.path.mkdir()
         self.wip_path.mkdir()
         for field in self.metadata.fields:
             col_path = get_vcf_field_path(self.path, field)
-            logger.debug(f"Make directories for {field.full_name} at {col_path}")
             col_path.mkdir(parents=True)
-            for j in range(self.num_partitions):
-                part_path = col_path / f"p{j}"
-                part_path.mkdir()
 
     def load_partition_summaries(self):
         summaries = []
@@ -982,13 +988,14 @@ class IntermediateColumnarFormatWriter:
                 not_found.append(j)
         if len(not_found) > 0:
             raise FileNotFoundError(
-                f"Partition metadata not found for {len(not_found)} partitions: {not_found}"
+                f"Partition metadata not found for {len(not_found)}"
+                f" partitions: {not_found}"
             )
         return summaries
 
     def load_metadata(self):
         if self.metadata is None:
-            with open(self.wip_path / f"metadata.json") as f:
+            with open(self.wip_path / "metadata.json") as f:
                 self.metadata = IcfMetadata.fromdict(json.load(f))
 
     def process_partition(self, partition_index):
@@ -1037,12 +1044,14 @@
                 for field in format_fields:
                     val = variant.format(field.name)
                     tcw.append(field.full_name, val)
-                # Note: an issue with updating the progress per variant here like this
-                # is that we get a significant pause at the end of the counter while
-                # all the "small" fields get flushed. Possibly not much to be done about it.
+                # Note: an issue with updating the progress per variant here like
+                # this is that we get a significant pause at the end of the counter
+                # while all the "small" fields get flushed. Possibly not much to be
+                # done about it.
                 core.update_progress(1)
             logger.info(
-                f"Finished reading VCF for partition {partition_index}, flushing buffers"
+                f"Finished reading VCF for partition {partition_index}, "
+                f"flushing buffers"
             )
 
         partition_metadata = {
@@ -1124,30 +1133,32 @@ class IntermediateColumnarFormatWriter:
         for summary in partition_summaries:
             field.summary.update(summary["field_summaries"][field.full_name])
 
-        logger.info(f"Finalising metadata")
+        logger.info("Finalising metadata")
         with open(self.path / "metadata.json", "w") as f:
             json.dump(self.metadata.asdict(), f, indent=4)
 
-        logger.debug(f"Removing WIP directory")
+        logger.debug("Removing WIP directory")
         shutil.rmtree(self.wip_path)
 
 
 def explode(
-    vcfs,
     icf_path,
+    vcfs,
     *,
     column_chunk_size=16,
     worker_processes=1,
     show_progress=False,
+    compressor=None,
 ):
     writer = IntermediateColumnarFormatWriter(icf_path)
-    num_partitions = writer.init(
+    writer.init(
         vcfs,
         # Heuristic to get reasonable worker utilisation with lumpy partition sizing
         target_num_partitions=max(1, worker_processes * 4),
         worker_processes=worker_processes,
         show_progress=show_progress,
         column_chunk_size=column_chunk_size,
+        compressor=compressor,
     )
     writer.explode(worker_processes=worker_processes, show_progress=show_progress)
     writer.finalise()
@@ -1162,6 +1173,7 @@ def explode_init(
     target_num_partitions=1,
     worker_processes=1,
     show_progress=False,
+    compressor=None,
 ):
     writer = IntermediateColumnarFormatWriter(icf_path)
     return writer.init(
@@ -1170,6 +1182,7 @@ def explode_init(
         worker_processes=worker_processes,
         show_progress=show_progress,
         column_chunk_size=column_chunk_size,
+        compressor=compressor,
     )
 
 
@@ -1209,20 +1222,25 @@ class ZarrColumnSpec:
     dtype: str
     shape: tuple
     chunks: tuple
-    dimensions: list
+    dimensions: tuple
     description: str
     vcf_field: str
-    compressor: dict = None
-    filters: list = None
-    # TODO add filters
+    compressor: dict
+    filters: list
 
     def __post_init__(self):
+        # Ensure these are tuples for ease of comparison and consistency
         self.shape = tuple(self.shape)
         self.chunks = tuple(self.chunks)
         self.dimensions = tuple(self.dimensions)
-        self.compressor = DEFAULT_ZARR_COMPRESSOR.get_config()
-        self.filters = []
-        self._choose_compressor_settings()
+
+    @staticmethod
+    def new(**kwargs):
+        spec = ZarrColumnSpec(
+            **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
+        )
+        spec._choose_compressor_settings()
+        return spec
 
     def _choose_compressor_settings(self):
         """
@@ -1298,7 +1316,7 @@ class VcfZarrSchema:
         def fixed_field_spec(
             name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
         ):
-            return ZarrColumnSpec(
+            return ZarrColumnSpec.new(
                 vcf_field=vcf_field,
                 name=name,
                 dtype=dtype,
@@ -1366,14 +1384,23 @@ class VcfZarrSchema:
            if field.category == "FORMAT":
                prefix = "call_"
                shape.append(n)
-                chunks.append(samples_chunk_size),
+                chunks.append(samples_chunk_size)
                dimensions.append("samples")
            # TODO make an option to add in the empty extra dimension
            if field.summary.max_number > 1:
                shape.append(field.summary.max_number)
-                dimensions.append(field.name)
+                # TODO we should really be checking this to see if the named dimensions
+                # are actually correct.
+                if field.vcf_number == "R":
+                    dimensions.append("alleles")
+                elif field.vcf_number == "A":
+                    dimensions.append("alt_alleles")
+                elif field.vcf_number == "G":
+                    dimensions.append("genotypes")
+                else:
+                    dimensions.append(f"{field.category}_{field.name}_dim")
            variable_name = prefix + field.name
-            colspec = ZarrColumnSpec(
+            colspec = ZarrColumnSpec.new(
                vcf_field=field.full_name,
                name=variable_name,
                dtype=field.smallest_dtype(),
@@ -1391,7 +1418,7 @@ class VcfZarrSchema:
            dimensions = ["variants", "samples"]
 
            colspecs.append(
-                ZarrColumnSpec(
+                ZarrColumnSpec.new(
                    vcf_field=None,
                    name="call_genotype_phased",
                    dtype="bool",
@@ -1404,7 +1431,7 @@ class VcfZarrSchema:
            shape += [ploidy]
            dimensions += ["ploidy"]
            colspecs.append(
-                ZarrColumnSpec(
+                ZarrColumnSpec.new(
                    vcf_field=None,
                    name="call_genotype",
                    dtype=gt_field.smallest_dtype(),
@@ -1415,7 +1442,7 @@ class VcfZarrSchema:
                )
            )
            colspecs.append(
-                ZarrColumnSpec(
+                ZarrColumnSpec.new(
                    vcf_field=None,
                    name="call_genotype_mask",
                    dtype="bool",
@@ -1480,16 +1507,30 @@ class EncodingWork:
     memory: int = 0
 
 
+def parse_max_memory(max_memory):
+    if max_memory is None:
+        # Effectively unbounded
+        return 2**63
+    if isinstance(max_memory, str):
+        max_memory = humanfriendly.parse_size(max_memory)
+    logger.info(f"Set memory budget to {display_size(max_memory)}")
+    return max_memory
+
+
 class VcfZarrWriter:
-    def __init__(self, path, icf, schema):
+    def __init__(self, path, icf, schema, dimension_separator=None):
         self.path = pathlib.Path(path)
         self.icf = icf
         self.schema = schema
+        # Default to using nested directories following the Zarr v3 default.
+        # This seems to require version 2.17+ to work properly
+        self.dimension_separator = (
+            "/" if dimension_separator is None else dimension_separator
+        )
         store = zarr.DirectoryStore(self.path)
         self.root = zarr.group(store=store)
 
     def init_array(self, variable):
-        # print("CREATE", variable)
         object_codec = None
         if variable.dtype == "O":
             object_codec = numcodecs.VLenUTF8()
@@ -1501,7 +1542,9 @@ class VcfZarrWriter:
             compressor=numcodecs.get_codec(variable.compressor),
             filters=[numcodecs.get_codec(filt) for filt in variable.filters],
             object_codec=object_codec,
+            dimension_separator=self.dimension_separator,
         )
+        # Dimension names are part of the spec in Zarr v3
         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
 
     def get_array(self, name):
@@ -1593,7 +1636,9 @@ class VcfZarrWriter:
                 try:
                     var_filter.buff[j, lookup[f]] = True
                 except KeyError:
-                    raise ValueError(f"Filter '{f}' was not defined in the header.")
+                    raise ValueError(
+                        f"Filter '{f}' was not defined " f"in the header."
+                    ) from None
            var_filter.flush()
        logger.debug(f"Encoded FILTERS slice {start}:{stop}")
 
@@ -1639,6 +1684,7 @@ class VcfZarrWriter:
            "contig_length",
            self.schema.contig_length,
            dtype=np.int64,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
        )
        array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
        return {v: j for j, v in enumerate(self.schema.contig_id)}
@@ -1661,8 +1707,6 @@ class VcfZarrWriter:
            self.init_array(column)
 
    def finalise(self):
-        # for column in self.schema.columns.values():
-        #     self.finalise_array(column)
        zarr.consolidate_metadata(self.path)
 
    def encode(
@@ -1672,12 +1716,7 @@ class VcfZarrWriter:
        show_progress=False,
        max_memory=None,
    ):
-        if max_memory is None:
-            # Unbounded
-            max_memory = 2**63
-        else:
-            # Value is specified in Mibibytes
-            max_memory *= 2**20  # NEEDS TEST
+        max_memory = parse_max_memory(max_memory)
 
        # TODO this will move into the setup logic later when we're making it possible
        # to split the work by slice
@@ -1702,7 +1741,8 @@ class VcfZarrWriter:
            variant_chunk_size = array.blocks[0].nbytes
            encoding_memory_requirements[col.name] = variant_chunk_size
            logger.debug(
-                f"{col.name} requires at least {display_size(variant_chunk_size)} per worker"
+                f"{col.name} requires at least {display_size(variant_chunk_size)} "
+                f"per worker"
            )
            total_bytes += array.nbytes
 
@@ -1764,8 +1804,8 @@
 
        # Fail early if we can't fit a particular column into memory
        for wp in work:
-            if wp.memory >= max_memory:
-                raise ValueError(  # NEEDS TEST
+            if wp.memory > max_memory:
+                raise ValueError(
                    f"Insufficient memory for {wp.columns}: "
                    f"{display_size(wp.memory)} > {display_size(max_memory)}"
                )
@@ -1778,6 +1818,8 @@
        )
 
        used_memory = 0
+        # We need to keep some bounds on the queue size or the memory bounds algorithm
+        # below doesn't really work.
        max_queued = 4 * max(1, worker_processes)
        encoded_slices = collections.Counter()
 
@@ -1804,10 +1846,15 @@
                    self.finalise_array(column)
 
        for wp in work:
-            if (
+            while (
                used_memory + wp.memory > max_memory
                or len(future_to_work) > max_queued
            ):
+                logger.debug(
+                    f"Wait: mem_required={used_memory + wp.memory} "
+                    f"max_mem={max_memory} queued={len(future_to_work)} "
+                    f"max_queued={max_queued}"
+                )
                service_completed_futures()
            future = pwm.submit(wp.func, wp.start, wp.stop)
            used_memory += wp.memory
@@ -1832,6 +1879,7 @@ def encode(
    variants_chunk_size=None,
    samples_chunk_size=None,
    max_v_chunks=None,
+    dimension_separator=None,
    max_memory=None,
    worker_processes=1,
    show_progress=False,
@@ -1849,13 +1897,13 @@ def encode(
            raise ValueError(
                "Cannot specify schema along with chunk sizes"
            )  # NEEDS TEST
-        with open(schema_path, "r") as f:
+        with open(schema_path) as f:
            schema = VcfZarrSchema.fromjson(f.read())
    zarr_path = pathlib.Path(zarr_path)
    if zarr_path.exists():
        logger.warning(f"Deleting existing {zarr_path}")
        shutil.rmtree(zarr_path)
-    vzw = VcfZarrWriter(zarr_path, icf, schema)
+    vzw = VcfZarrWriter(zarr_path, icf, schema, dimension_separator=dimension_separator)
    vzw.init()
    vzw.encode(
        max_v_chunks=max_v_chunks,
@@ -1876,10 +1924,11 @@ def convert(
    show_progress=False,
    # TODO add arguments to control location of tmpdir
 ):
-    with tempfile.TemporaryDirectory(prefix="vcf2zarr_if_") as if_dir:
+    with tempfile.TemporaryDirectory(prefix="vcf2zarr") as tmp:
+        if_dir = pathlib.Path(tmp) / "if"
        explode(
-            vcfs,
            if_dir,
+            vcfs,
            worker_processes=worker_processes,
            show_progress=show_progress,
        )
@@ -1929,7 +1978,7 @@ def assert_all_fill(zarr_val, vcf_type):
    elif vcf_type == "Float":
        assert_all_fill_float(zarr_val)
    else:  # pragma: no cover
-        assert False
+        assert False  # noqa PT015
 
 
 def assert_all_missing(zarr_val, vcf_type):
@@ -1942,7 +1991,7 @@ def assert_all_missing(zarr_val, vcf_type):
    elif vcf_type == "Float":
        assert_all_missing_float(zarr_val)
    else:  # pragma: no cover
-        assert False
+        assert False  # noqa PT015
 
 
 def assert_info_val_missing(zarr_val, vcf_type):
@@ -2081,7 +2130,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
        assert vid[j] == ("." if row.ID is None else row.ID)
        assert allele[j, 0] == row.REF
        k = len(row.ALT)
-        nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT),
+        nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
        assert np.all(allele[j, k + 1 :] == "")
        # TODO FILTERS
 
bio2zarr/vcf_utils.py CHANGED
@@ -1,14 +1,14 @@
-from typing import IO, Any, Dict, Optional, Sequence, Union
 import contextlib
-import struct
-import pathlib
 import gzip
-from dataclasses import dataclass
 import os
+import pathlib
+import struct
+from dataclasses import dataclass
+from typing import IO, Any, Dict, Optional, Sequence, Union
 
-import numpy as np
 import cyvcf2
 import humanfriendly
+import numpy as np
 
 from bio2zarr.typing import PathType
 
@@ -38,7 +38,8 @@ def read_bytes_as_value(f: IO[Any], fmt: str, nodata: Optional[Any] = None) -> A
     fmt : str
         A Python `struct` format string.
     nodata : Optional[Any], optional
-        The value to return in case there is no further data in the stream, by default None
+        The value to return in case there is no further data in the stream,
+        by default None
 
     Returns
     -------
@@ -277,7 +278,8 @@ class TabixIndex:
         # Create file offsets for each element in the linear index
         file_offsets = np.array([get_file_offset(vfp) for vfp in linear_index])
 
-        # Calculate corresponding contigs and positions or each element in the linear index
+        # Calculate corresponding contigs and positions or each element in
+        # the linear index
         contig_indexes = np.hstack(
             [np.full(len(li), i) for (i, li) in enumerate(linear_indexes)]
         )
@@ -433,6 +435,22 @@ class IndexedVcf(contextlib.AbstractContextManager):
                 if var.POS >= start:
                     yield var
 
+    def _filter_empty(self, regions):
+        """
+        Return all regions in the specified list that have one or more records.
+
+        Sometimes with Tabix indexes these seem to crop up:
+
+        - https://github.com/sgkit-dev/bio2zarr/issues/45
+        - https://github.com/sgkit-dev/bio2zarr/issues/120
+        """
+        ret = []
+        for region in regions:
+            variants = self.variants(region)
+            if next(variants, None) is not None:
+                ret.append(region)
+        return ret
+
     def partition_into_regions(
         self,
         num_parts: Optional[int] = None,
@@ -509,4 +527,4 @@ class IndexedVcf(contextlib.AbstractContextManager):
             if self.index.record_counts[ri] > 0:
                 regions.append(Region(self.sequence_names[ri]))
 
-        return regions
+        return self._filter_empty(regions)
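
partition_into_regions now drops regions that contain no records, which can apparently crop up with some Tabix indexes (see the linked issues). The test is simply whether the region's variant iterator yields anything. A small standalone sketch of that idiom, with a hypothetical iterator standing in for self.variants(region):

def has_records(variants_iter):
    # next(..., None) pulls at most one item and returns None if the iterator is empty.
    return next(variants_iter, None) is not None

print(has_records(iter([])))          # False - an empty region would be dropped
print(has_records(iter(["record"])))  # True - the region is kept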
bio2zarr-0.0.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bio2zarr
-Version: 0.0.3
+Version: 0.0.5
 Summary: Convert bioinformatics data to Zarr
 Home-page: https://github.com/pystatgen/bio2zarr
 Author: sgkit Developers
@@ -20,7 +20,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/x-rst
 License-File: LICENSE
 Requires-Dist: numpy
-Requires-Dist: zarr !=2.11.0,!=2.11.1,!=2.11.2,>=2.10.0
+Requires-Dist: zarr >=2.17
 Requires-Dist: click
 Requires-Dist: tabulate
 Requires-Dist: tqdm
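
The zarr requirement is bumped to >=2.17 because VcfZarrWriter now passes dimension_separator="/" so that chunk files are written as nested directories; the diff's own comment notes this seems to need 2.17+ to work properly. A minimal sketch of creating such an array outside the package (the "example.zarr" path and array shape are purely illustrative):

import numcodecs
import zarr

# With dimension_separator="/", chunk keys are nested (e.g. 0/0/0 rather than 0.0.0).
store = zarr.DirectoryStore("example.zarr")
root = zarr.group(store=store)
a = root.empty(
    "call_genotype",
    shape=(100, 10, 2),
    chunks=(50, 10, 2),
    dtype="i1",
    compressor=numcodecs.Blosc(cname="zstd"),
    dimension_separator="/",
)
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "samples", "ploidy"]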
bio2zarr-0.0.5.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
+bio2zarr/__main__.py,sha256=hO4vV-kPFgsYq0NQwG2r-WkserPL27oqae_tUvNB7yE,527
+bio2zarr/_version.py,sha256=EJB7__SNK9kQS_SWZB_U4DHJ3P8ftF6etZEihTYnuXE,411
+bio2zarr/cli.py,sha256=k63xex-tQkogAlJ3N68Ikx8LqZrksXbZB2s6Z7h-zXc,11446
+bio2zarr/core.py,sha256=reF9elN1dwmCoXXLgci-y5pXmAm3fTntmomHTRcG54g,8127
+bio2zarr/plink.py,sha256=huXMlxQ5C3gPmOYCavA-QW7PzaV48I2lo80cQqHT1wY,6768
+bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
+bio2zarr/typing.py,sha256=BYxhL16sKRoNxa6amf6AYxvt5Ke9qzv2np_kOT_zPJo,79
+bio2zarr/vcf.py,sha256=GFnwR2YP-cHU4tfHloRjyiBK9-xXDgXcAM_tz-w2qck,74324
+bio2zarr/vcf_utils.py,sha256=r3NQXxWK1SYU7CcwDzSWXdX5Q8Ixk7gdCTEiFPzfUAk,17307
+bio2zarr-0.0.5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+bio2zarr-0.0.5.dist-info/METADATA,sha256=SasGYcKSRb7NqnYR98ODFvPEMdBNdpxWx5gqOt038QU,1077
+bio2zarr-0.0.5.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+bio2zarr-0.0.5.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
+bio2zarr-0.0.5.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
+bio2zarr-0.0.5.dist-info/RECORD,,
bio2zarr-0.0.3.dist-info/RECORD DELETED
@@ -1,16 +0,0 @@
-bio2zarr/__init__.py,sha256=yIJYx4GyKtOLOtODOX0kGCeGPYgQ-TBbsRdT1NwBpQQ,37
-bio2zarr/__main__.py,sha256=3cgaQ4x8YKXt-9xC2GLrHnS6UA38y1GXqttwZiBZJg4,525
-bio2zarr/_version.py,sha256=hB095avW4HuDZxn8qPHRG1UMzSSonb8ZDAsLxt9hmk8,411
-bio2zarr/cli.py,sha256=N_vEFj730p_TL7Dk9m9T3ceAhVV58BMYRDmBmoeKH7A,10766
-bio2zarr/core.py,sha256=sBlWmHjcb7tAn_7WQRBdrbGcEd_lT_3HTQ_JbzomVMg,8111
-bio2zarr/plink.py,sha256=llhfP-v44BVPvgCcwXktk0YrKaJSII63U_PTtpHlGtM,6755
-bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
-bio2zarr/typing.py,sha256=wZ99Zzp5BD9Nqpd-S5bn38fSdPzfj6Z9IHPBfZqt9Gs,78
-bio2zarr/vcf.py,sha256=g2TqH9Lbp4Ds8kjOnjvHvoMAgnG6Kx8pKPN1bqBKKIQ,72201
-bio2zarr/vcf_utils.py,sha256=_kMZdpye15HGpniv8wwISw0L6NEEi54ZFaTcM83wLGs,16751
-bio2zarr-0.0.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-bio2zarr-0.0.3.dist-info/METADATA,sha256=dc2y5xrnkcvD1qmKGFL5GrsbM1_tiIlAYB2GrAlLunM,1106
-bio2zarr-0.0.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-bio2zarr-0.0.3.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
-bio2zarr-0.0.3.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
-bio2zarr-0.0.3.dist-info/RECORD,,