PyPI - bio2zarr - Versions diffs - 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl - Mend

bio2zarr 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

bio2zarr/_version.py +16 -3
bio2zarr/cli.py +16 -3
bio2zarr/plink.py +7 -5
bio2zarr/tskit.py +14 -19
bio2zarr/vcf.py +23 -13
bio2zarr/vcz.py +34 -41
bio2zarr/zarr_utils.py +169 -2
{bio2zarr-0.1.6.dist-info → bio2zarr-0.1.7.dist-info}/METADATA +10 -6
bio2zarr-0.1.7.dist-info/RECORD +21 -0
{bio2zarr-0.1.6.dist-info → bio2zarr-0.1.7.dist-info}/WHEEL +1 -1
bio2zarr-0.1.6.dist-info/RECORD +0 -21
{bio2zarr-0.1.6.dist-info → bio2zarr-0.1.7.dist-info}/entry_points.txt +0 -0
{bio2zarr-0.1.6.dist-info → bio2zarr-0.1.7.dist-info}/licenses/LICENSE +0 -0
{bio2zarr-0.1.6.dist-info → bio2zarr-0.1.7.dist-info}/top_level.txt +0 -0

bio2zarr/_version.py CHANGED Viewed

@@ -1,7 +1,14 @@
 # file generated by setuptools-scm
 # don't change, don't track in version control
-__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
 TYPE_CHECKING = False
 if TYPE_CHECKING:
@@ -9,13 +16,19 @@ if TYPE_CHECKING:
     from typing import Union
     VERSION_TUPLE = Tuple[Union[int, str], ...]
+    COMMIT_ID = Union[str, None]
 else:
     VERSION_TUPLE = object
+    COMMIT_ID = object
 version: str
 __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
+commit_id: COMMIT_ID
+__commit_id__: COMMIT_ID
-__version__ = version = '0.1.6'
-__version_tuple__ = version_tuple = (0, 1, 6)
+__version__ = version = '0.1.7'
+__version_tuple__ = version_tuple = (0, 1, 7)
+__commit_id__ = commit_id = None

bio2zarr/cli.py CHANGED Viewed

@@ -652,7 +652,12 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
 @click.argument("zarr_path", type=click.Path())
 @click.option("--contig-id", type=str, help="Contig/chromosome ID (default: '1')")
 @click.option(
-    "--isolated-as-missing", is_flag=True, help="Treat isolated nodes as missing"
+    "--isolated-as-missing/--isolated-as-ancestral",
+    default=None,
+    help=(
+        "Treat isolated samples without mutations as missing or ancestral "
+        "(default: tskit default)"
+    ),
 )
 @variants_chunk_size
 @samples_chunk_size
@@ -660,6 +665,7 @@ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
 @progress
 @worker_processes
 @force
+@core.requires_optional_dependency("tskit", "tskit")
 def convert_tskit(
     ts_path,
     zarr_path,
@@ -675,11 +681,18 @@ def convert_tskit(
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
+    import tskit
+    ts = tskit.load(ts_path)
+    model_mapping = ts.map_to_vcf_model(
+        contig_id=contig_id,
+        isolated_as_missing=isolated_as_missing,
+    )
     tskit_mod.convert(
         ts_path,
         zarr_path,
-        contig_id=contig_id,
-        isolated_as_missing=isolated_as_missing,
+        model_mapping=model_mapping,
         variants_chunk_size=variants_chunk_size,
         samples_chunk_size=samples_chunk_size,
         worker_processes=worker_processes,

bio2zarr/plink.py CHANGED Viewed

@@ -6,6 +6,7 @@ import numpy as np
 import pandas as pd
 from bio2zarr import constants, core, vcz
+from bio2zarr.zarr_utils import STRING_DTYPE_NAME
 logger = logging.getLogger(__name__)
@@ -198,7 +199,7 @@ class PlinkFormat(vcz.Source):
         ref_iter = self.bim.allele_2.values[start:stop]
         gt_iter = self.bed_reader.iter_decode(start, stop)
         for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
-            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
             alleles[0] = ref
             alleles[1 : 1 + len(alt)] = alt
             phased = np.zeros(gt.shape[0], dtype=bool)
@@ -234,8 +235,9 @@ class PlinkFormat(vcz.Source):
         )
         # If we don't have SVLEN or END annotations, the rlen field is defined
         # as the length of the REF
-        max_len = self.bim.allele_2.values.itemsize
+        # Explicitly cast to fixed size array to support pandas 2.x and 3.x
+        allele_2_array = self.bim.allele_2.values.astype("S")
+        max_len = allele_2_array.itemsize
         array_specs = [
             vcz.ZarrArraySpec(
                 source="position",
@@ -246,13 +248,13 @@ class PlinkFormat(vcz.Source):
             ),
             vcz.ZarrArraySpec(
                 name="variant_allele",
-                dtype="O",
+                dtype=STRING_DTYPE_NAME,
                 dimensions=["variants", "alleles"],
                 description=None,
             ),
             vcz.ZarrArraySpec(
                 name="variant_id",
-                dtype="O",
+                dtype=STRING_DTYPE_NAME,
                 dimensions=["variants"],
                 description=None,
             ),

bio2zarr/tskit.py CHANGED Viewed

@@ -4,6 +4,7 @@ import pathlib
 import numpy as np
 from bio2zarr import constants, core, vcz
+from bio2zarr.zarr_utils import STRING_DTYPE_NAME
 logger = logging.getLogger(__name__)
@@ -15,8 +16,6 @@ class TskitFormat(vcz.Source):
         ts,
         *,
         model_mapping=None,
-        contig_id=None,
-        isolated_as_missing=False,
     ):
         import tskit
@@ -35,14 +34,14 @@ class TskitFormat(vcz.Source):
             f"{self.ts.num_sites} sites"
         )
-        self.contig_id = contig_id if contig_id is not None else "1"
-        self.isolated_as_missing = isolated_as_missing
-        self.positions = self.ts.sites_position
         if model_mapping is None:
             model_mapping = self.ts.map_to_vcf_model()
+        self.contig_id = model_mapping.contig_id
+        self.contig_length = model_mapping.contig_length
+        self.isolated_as_missing = model_mapping.isolated_as_missing
+        self.raw_positions = self.ts.sites_position
+        self.vcf_positions = model_mapping.transformed_positions
         individuals_nodes = model_mapping.individuals_nodes
         sample_ids = model_mapping.individuals_name
@@ -91,14 +90,14 @@ class TskitFormat(vcz.Source):
     @property
     def contigs(self):
-        return [vcz.Contig(id=self.contig_id)]
+        return [vcz.Contig(id=self.contig_id, length=self.contig_length)]
     def iter_contig(self, start, stop):
         yield from (0 for _ in range(start, stop))
     def iter_field(self, field_name, shape, start, stop):
         if field_name == "position":
-            for pos in self.ts.sites_position[start:stop]:
+            for pos in self.vcf_positions[start:stop]:
                 yield int(pos)
         else:
             raise ValueError(f"Unknown field {field_name}")
@@ -110,13 +109,13 @@ class TskitFormat(vcz.Source):
         for variant in self.ts.variants(
             isolated_as_missing=self.isolated_as_missing,
-            left=self.positions[start],
-            right=self.positions[stop] if stop < self.num_records else None,
+            left=self.raw_positions[start],
+            right=self.raw_positions[stop] if stop < self.num_records else None,
             samples=self.tskit_samples,
             copy=False,
         ):
             gt = np.full(shape, constants.INT_FILL, dtype=np.int8)
-            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
             # length is the length of the REF allele unless other fields
             # are included.
             variant_length = len(variant.alleles[0])
@@ -176,8 +175,8 @@ class TskitFormat(vcz.Source):
         min_position = 0
         max_position = 0
         if self.ts.num_sites > 0:
-            min_position = np.min(self.ts.sites_position)
-            max_position = np.max(self.ts.sites_position)
+            min_position = np.min(self.vcf_positions)
+            max_position = np.max(self.vcf_positions)
         tables = self.ts.tables
         ancestral_state_offsets = tables.sites.ancestral_state_offset
@@ -200,7 +199,7 @@ class TskitFormat(vcz.Source):
             vcz.ZarrArraySpec(
                 source=None,
                 name="variant_allele",
-                dtype="O",
+                dtype=STRING_DTYPE_NAME,
                 dimensions=["variants", "alleles"],
                 description="Alleles for each variant",
             ),
@@ -252,8 +251,6 @@ def convert(
     vcz_path,
     *,
     model_mapping=None,
-    contig_id=None,
-    isolated_as_missing=False,
     variants_chunk_size=None,
     samples_chunk_size=None,
     worker_processes=core.DEFAULT_WORKER_PROCESSES,
@@ -277,8 +274,6 @@ def convert(
     tskit_format = TskitFormat(
         ts_or_path,
         model_mapping=model_mapping,
-        contig_id=contig_id,
-        isolated_as_missing=isolated_as_missing,
     )
     schema_instance = tskit_format.generate_schema(
         variants_chunk_size=variants_chunk_size,

bio2zarr/vcf.py CHANGED Viewed

@@ -16,6 +16,8 @@ from typing import Any
 import numcodecs
 import numpy as np
+from bio2zarr.zarr_utils import STRING_DTYPE_NAME, zarr_exists
 from . import constants, core, provenance, vcf_utils, vcz
 logger = logging.getLogger(__name__)
@@ -110,7 +112,7 @@ class VcfField:
             ret = "U1"
         else:
             assert self.vcf_type == "String"
-            ret = "O"
+            ret = STRING_DTYPE_NAME
         return ret
@@ -397,7 +399,7 @@ def sanitise_value_string_scalar(shape, value):
 def sanitise_value_string_1d(shape, value):
     if value is None:
-        return np.full(shape, ".", dtype="O")
+        return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
     else:
         value = drop_empty_second_dim(value)
         result = np.full(shape, "", dtype=value.dtype)
@@ -407,9 +409,9 @@ def sanitise_value_string_1d(shape, value):
 def sanitise_value_string_2d(shape, value):
     if value is None:
-        return np.full(shape, ".", dtype="O")
+        return np.full(shape, ".", dtype=STRING_DTYPE_NAME)
     else:
-        result = np.full(shape, "", dtype="O")
+        result = np.full(shape, "", dtype=STRING_DTYPE_NAME)
         if value.ndim == 2:
             result[: value.shape[0], : value.shape[1]] = value
         else:
@@ -569,7 +571,12 @@ class StringValueTransformer(VcfValueTransformer):
             value = np.array(list(vcf_value.split(",")))
         else:
             # TODO can we make this faster??
-            value = np.array([v.split(",") for v in vcf_value], dtype="O")
+            var_len_values = [v.split(",") for v in vcf_value]
+            number = max(len(v) for v in var_len_values)
+            value = np.array(
+                [v + [""] * (number - len(v)) for v in var_len_values],
+                dtype=STRING_DTYPE_NAME,
+            )
             # print("HERE", vcf_value, value)
             # for v in vcf_value:
             #     print("\t", type(v), len(v), v.split(","))
@@ -1044,7 +1051,7 @@ class IntermediateColumnarFormat(vcz.Source):
             ref_field.iter_values(start, stop),
             alt_field.iter_values(start, stop),
         ):
-            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype=STRING_DTYPE_NAME)
             alleles[0] = ref[0]
             alleles[1 : 1 + len(alt)] = alt
             yield alleles
@@ -1068,14 +1075,16 @@ class IntermediateColumnarFormat(vcz.Source):
             for variant_length, alleles in zip(
                 variant_lengths, self.iter_alleles(start, stop, num_alleles)
             ):
-                yield vcz.VariantData(variant_length, alleles, None, None)
+                # Stored ICF values are always at least 1D arrays; "rlen" is Number=1
+                # so we must extract the scalar to avoid NumPy scalar-conversion issues.
+                yield vcz.VariantData(variant_length[0], alleles, None, None)
         else:
             for variant_length, alleles, (gt, phased) in zip(
                 variant_lengths,
                 self.iter_alleles(start, stop, num_alleles),
                 self.iter_genotypes(shape, start, stop),
             ):
-                yield vcz.VariantData(variant_length, alleles, gt, phased)
+                yield vcz.VariantData(variant_length[0], alleles, gt, phased)
     def generate_schema(
         self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
@@ -1087,8 +1096,10 @@ class IntermediateColumnarFormat(vcz.Source):
         # Add ploidy and genotypes dimensions only when needed
         max_genotypes = 0
+        has_g_field = False
         for field in self.metadata.format_fields:
             if field.vcf_number == "G":
+                has_g_field = True
                 max_genotypes = max(max_genotypes, field.summary.max_number)
         ploidy = None
@@ -1100,7 +1111,7 @@ class IntermediateColumnarFormat(vcz.Source):
             genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
             # assert max_genotypes == genotypes_size
         else:
-            if max_genotypes > 0:
+            if max_genotypes > 0 or has_g_field:
                 # there is no GT field, but there is at least one Number=G field,
                 # so need to define genotypes dimension
                 genotypes_size = max_genotypes
@@ -1163,7 +1174,7 @@ class IntermediateColumnarFormat(vcz.Source):
             ),
             fixed_field_spec(
                 name="variant_allele",
-                dtype="O",
+                dtype=STRING_DTYPE_NAME,
                 dimensions=["variants", "alleles"],
             ),
             fixed_field_spec(
@@ -1173,7 +1184,7 @@ class IntermediateColumnarFormat(vcz.Source):
             ),
             fixed_field_spec(
                 name="variant_id",
-                dtype="O",
+                dtype=STRING_DTYPE_NAME,
             ),
             fixed_field_spec(
                 name="variant_id_mask",
@@ -1581,8 +1592,7 @@ def inspect(path):
         raise ValueError(f"Path not found: {path}")
     if (path / "metadata.json").exists():
         obj = IntermediateColumnarFormat(path)
-    # NOTE: this is too strict, we should support more general Zarrs, see #276
-    elif (path / ".zmetadata").exists():
+    elif zarr_exists(path):
         obj = vcz.VcfZarr(path)
     else:
         raise ValueError(f"{path} not in ICF or VCF Zarr format")

bio2zarr/vcz.py CHANGED Viewed

@@ -284,7 +284,7 @@ class ZarrArraySpec:
         for size in self.get_shape(schema)[1:]:
             chunk_items *= size
         dt = np.dtype(self.dtype)
-        if dt.kind == "O" and "samples" in self.dimensions:
+        if dt.kind == zarr_utils.STRING_DTYPE_NAME and "samples" in self.dimensions:
             logger.warning(
                 f"Field {self.name} is a string; max memory usage may "
                 "be a significant underestimate"
@@ -643,55 +643,60 @@ class VcfZarrWriter:
     def encode_samples(self, root):
         samples = self.source.samples
-        array = root.array(
+        zarr_utils.create_group_array(
+            root,
             "sample_id",
             data=[sample.id for sample in samples],
             shape=len(samples),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
             chunks=(self.schema.get_chunks(["samples"])[0],),
+            dimension_names=["samples"],
         )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
         logger.debug("Samples done")
     def encode_contigs(self, root):
         contigs = self.source.contigs
-        array = root.array(
+        zarr_utils.create_group_array(
+            root,
             "contig_id",
             data=[contig.id for contig in contigs],
             shape=len(contigs),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
+            dimension_names=["contigs"],
         )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
         if all(contig.length is not None for contig in contigs):
-            array = root.array(
+            zarr_utils.create_group_array(
+                root,
                 "contig_length",
                 data=[contig.length for contig in contigs],
                 shape=len(contigs),
                 dtype=np.int64,
                 compressor=DEFAULT_ZARR_COMPRESSOR,
+                dimension_names=["contigs"],
             )
-            array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
     def encode_filters(self, root):
         filters = self.source.filters
-        array = root.array(
+        zarr_utils.create_group_array(
+            root,
             "filter_id",
             data=[filt.id for filt in filters],
             shape=len(filters),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
+            dimension_names=["filters"],
         )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
-        array = root.array(
+        zarr_utils.create_group_array(
+            root,
             "filter_description",
             data=[filt.description for filt in filters],
             shape=len(filters),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
+            dimension_names=["filters"],
         )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
     def init_array(self, root, schema, array_spec, variants_dim_size):
         kwargs = dict(zarr_utils.ZARR_FORMAT_KWARGS)
@@ -707,34 +712,33 @@ class VcfZarrWriter:
             else schema.defaults["compressor"]
         )
         compressor = numcodecs.get_codec(compressor)
-        if array_spec.dtype == "O":
+        if array_spec.dtype == zarr_utils.STRING_DTYPE_NAME:
             if zarr_utils.zarr_v3():
                 filters = [*list(filters), numcodecs.VLenUTF8()]
             else:
                 kwargs["object_codec"] = numcodecs.VLenUTF8()
-        if not zarr_utils.zarr_v3():
+        if zarr_utils.zarr_v3():
+            # see https://github.com/zarr-developers/zarr-python/issues/3197
+            kwargs["fill_value"] = None
+        else:
             kwargs["dimension_separator"] = self.metadata.dimension_separator
         shape = schema.get_shape(array_spec.dimensions)
         # Truncate the variants dimension if max_variant_chunks was specified
         shape[0] = variants_dim_size
-        a = root.empty(
+        a = zarr_utils.create_empty_group_array(
+            root,
             name=array_spec.name,
             shape=shape,
             chunks=schema.get_chunks(array_spec.dimensions),
             dtype=array_spec.dtype,
             compressor=compressor,
             filters=filters,
+            dimension_names=array_spec.dimensions,
             **kwargs,
         )
-        a.attrs.update(
-            {
-                "description": array_spec.description,
-                # Dimension names are part of the spec in Zarr v3
-                "_ARRAY_DIMENSIONS": array_spec.dimensions,
-            }
-        )
+        a.attrs.update({"description": array_spec.description})
         logger.debug(f"Initialised {a}")
         return a
@@ -977,19 +981,7 @@ class VcfZarrWriter:
             if not src.exists():
                 # Needs test
                 raise ValueError(f"Partition {partition} of {name} does not exist")
-            dest = self.arrays_path / name
-            # This is Zarr v2 specific. Chunks in v3 with start with "c" prefix.
-            chunk_files = [
-                path for path in src.iterdir() if not path.name.startswith(".")
-            ]
-            # TODO check for a count of then number of files. If we require a
-            # dimension_separator of "/" then we could make stronger assertions
-            # here, as we'd always have num_variant_chunks
-            logger.debug(
-                f"Moving {len(chunk_files)} chunks for {name} partition {partition}"
-            )
-            for chunk_file in chunk_files:
-                os.rename(chunk_file, dest / chunk_file.name)
+            zarr_utils.move_chunks(src, self.arrays_path, partition, name)
         # Finally, once all the chunks have moved into the arrays dir,
         # we move it out of wip
         os.rename(self.arrays_path / name, self.path / name)
@@ -1108,7 +1100,7 @@ class VcfZarrWriter:
 class VcfZarr:
     def __init__(self, path):
-        if not (path / ".zmetadata").exists():
+        if not zarr_utils.zarr_exists(path):
             raise ValueError("Not in VcfZarr format")  # NEEDS TEST
         self.path = path
         self.root = zarr.open(path, mode="r")
@@ -1129,7 +1121,7 @@ class VcfZarr:
                 "avg_chunk_stored": core.display_size(int(stored / array.nchunks)),
                 "shape": str(array.shape),
                 "chunk_shape": str(array.chunks),
-                "compressor": str(array.compressor),
+                "compressor": str(zarr_utils.get_compressor(array)),
                 "filters": str(array.filters),
             }
             data.append(d)
@@ -1192,7 +1184,8 @@ class VcfZarrIndexer:
         kwargs = {}
         if not zarr_utils.zarr_v3():
             kwargs["dimension_separator"] = "/"
-        array = root.array(
+        zarr_utils.create_group_array(
+            root,
             "region_index",
             data=index,
             shape=index.shape,
@@ -1200,12 +1193,12 @@ class VcfZarrIndexer:
             dtype=index.dtype,
             compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
             fill_value=None,
+            dimension_names=[
+                "region_index_values",
+                "region_index_fields",
+            ],
             **kwargs,
         )
-        array.attrs["_ARRAY_DIMENSIONS"] = [
-            "region_index_values",
-            "region_index_fields",
-        ]
         logger.info("Consolidating Zarr metadata")
         zarr.consolidate_metadata(self.path)

bio2zarr/zarr_utils.py CHANGED Viewed

@@ -1,18 +1,185 @@
+import logging
+import os
 import zarr
+logger = logging.getLogger(__name__)
+# Use zarr format v2 by default even when running with zarr-python v3
+# NOTE: this interface was introduced for experimentation with zarr
+# format 3 and is not envisaged as a long-term interface.
+try:
+    ZARR_FORMAT = int(os.environ.get("BIO2ZARR_ZARR_FORMAT", "2"))
+except Exception:
+    ZARR_FORMAT = 2
 def zarr_v3() -> bool:
     return zarr.__version__ >= "3"
 if zarr_v3():
-    # Use zarr format v2 even when running with zarr-python v3
-    ZARR_FORMAT_KWARGS = dict(zarr_format=2)
+    ZARR_FORMAT_KWARGS = dict(zarr_format=ZARR_FORMAT)
+    # In zarr-python v3 strings are stored as string arrays (T) with itemsize 16
+    STRING_DTYPE_NAME = "T"
+    STRING_ITEMSIZE = 16
 else:
     ZARR_FORMAT_KWARGS = dict()
+    # In zarr-python v2 strings are stored as object arrays (O) with itemsize 8
+    STRING_DTYPE_NAME = "O"
+    STRING_ITEMSIZE = 8
 # See discussion in https://github.com/zarr-developers/zarr-python/issues/2529
 def first_dim_iter(z):
     for chunk in range(z.cdata_shape[0]):
         yield from z.blocks[chunk]
+def zarr_exists(path):
+    # NOTE: this is too strict, we should support more general Zarrs, see #276
+    return (path / ".zmetadata").exists() or (path / "zarr.json").exists()
+def create_group_array(
+    group,
+    name,
+    *,
+    data,
+    shape,
+    dtype,
+    compressor=None,
+    dimension_names=None,
+    **kwargs,
+):
+    """Create an array within a group."""
+    if ZARR_FORMAT == 2:
+        array = group.array(
+            name,
+            data=data,
+            shape=shape,
+            dtype=dtype,
+            compressor=compressor,
+            **kwargs,
+        )
+        if dimension_names is not None:
+            array.attrs["_ARRAY_DIMENSIONS"] = dimension_names
+        return array
+    else:
+        new_kwargs = {**kwargs}
+        if compressor is not None:
+            compressors = [_convert_v2_compressor_to_v3_codec(compressor, dtype)]
+            # TODO: seems odd that we need to set this
+            new_kwargs["compressor"] = "auto"
+            new_kwargs["compressors"] = compressors
+        return group.array(
+            name,
+            data=data,
+            shape=shape,
+            dtype=dtype,
+            dimension_names=dimension_names,
+            **new_kwargs,
+        )
+def create_empty_group_array(
+    group,
+    name,
+    *,
+    shape,
+    dtype,
+    chunks,
+    compressor=None,
+    filters=None,
+    dimension_names=None,
+    **kwargs,
+):
+    """Create an empty array within a group."""
+    if ZARR_FORMAT == 2:
+        array = group.empty(
+            name=name,
+            shape=shape,
+            dtype=dtype,
+            chunks=chunks,
+            compressor=compressor,
+            filters=filters,
+            **kwargs,
+        )
+        if dimension_names is not None:
+            array.attrs["_ARRAY_DIMENSIONS"] = dimension_names
+        return array
+    else:
+        new_kwargs = {**kwargs}
+        new_kwargs.pop("zarr_format")
+        if compressor is not None:
+            compressors = [_convert_v2_compressor_to_v3_codec(compressor, dtype)]
+            # TODO: seems odd that we need to set this
+            new_kwargs["compressor"] = "auto"
+            new_kwargs["compressors"] = compressors
+        return group.array(
+            name=name,
+            shape=shape,
+            dtype=dtype,
+            chunks=chunks,
+            dimension_names=dimension_names,
+            **new_kwargs,
+        )
+def get_compressor(array):
+    try:
+        # zarr format v2: compressor (singular)
+        return array.compressor
+    except TypeError as e:
+        # zarr format v3: compressors (plural)
+        compressors = array.compressors
+        if len(compressors) > 1:
+            raise ValueError(
+                f"Only one compressor is supported but found {compressors}"
+            ) from e
+        return compressors[0] if len(compressors) == 1 else None
+def get_compressor_config(array):
+    compressor = get_compressor(array)
+    if hasattr(compressor, "get_config"):
+        return compressor.get_config()
+    else:
+        from zarr.codecs.blosc import BloscCodec
+        if isinstance(compressor, BloscCodec):
+            return compressor._blosc_codec.get_config()
+        else:
+            return compressor.as_dict()["configuration"]
+def _convert_v2_compressor_to_v3_codec(compressor, dtype):
+    # import here since this is zarr-python v3 only
+    from zarr.core.dtype import parse_dtype
+    from zarr.metadata.migrate_v3 import _convert_compressor
+    return _convert_compressor(compressor, parse_dtype(dtype, zarr_format=3))
+def move_chunks(src_path, dest_path, partition, name):
+    if ZARR_FORMAT == 2:
+        dest = dest_path / name
+        chunk_files = [
+            path for path in src_path.iterdir() if not path.name.startswith(".")
+        ]
+    else:
+        dest = dest_path / name / "c"
+        dest.mkdir(exist_ok=True)
+        src_chunks = src_path / "c"
+        if not src_chunks.exists():
+            chunk_files = []
+        else:
+            chunk_files = [
+                path for path in src_chunks.iterdir() if not path.name.startswith(".")
+            ]
+    # TODO check for a count of then number of files. If we require a
+    # dimension_separator of "/" then we could make stronger assertions
+    # here, as we'd always have num_variant_chunks
+    logger.debug(f"Moving {len(chunk_files)} chunks for {name} partition {partition}")
+    for chunk_file in chunk_files:
+        os.rename(chunk_file, dest / chunk_file.name)

{bio2zarr-0.1.6.dist-info → bio2zarr-0.1.7.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bio2zarr
-Version: 0.1.6
+Version: 0.1.7
 Summary: Convert bioinformatics data to Zarr
 Author-email: sgkit Developers <project@sgkit.dev>
 License:                                  Apache License
@@ -219,11 +219,12 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy>=1.26
+Requires-Dist: numpy>=2
 Requires-Dist: zarr<3,>=2.17
 Requires-Dist: numcodecs[msgpack]!=0.14.0,!=0.14.1,<0.16
 Requires-Dist: tabulate
@@ -240,22 +241,25 @@ Requires-Dist: pysam; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-coverage; extra == "dev"
 Requires-Dist: pytest-xdist; extra == "dev"
-Requires-Dist: sgkit>=0.8.0; extra == "dev"
 Requires-Dist: tqdm; extra == "dev"
-Requires-Dist: tskit>=0.6.4; extra == "dev"
+Requires-Dist: tskit>=1; extra == "dev"
 Requires-Dist: bed_reader; extra == "dev"
 Requires-Dist: cyvcf2; extra == "dev"
+Requires-Dist: xarray<2025.03.1; extra == "dev"
+Requires-Dist: dask[array]<=2024.8.0,>=2022.01.0; extra == "dev"
 Provides-Extra: tskit
-Requires-Dist: tskit>=0.6.4; extra == "tskit"
+Requires-Dist: tskit>=1; extra == "tskit"
 Provides-Extra: vcf
 Requires-Dist: cyvcf2; extra == "vcf"
 Provides-Extra: all
-Requires-Dist: tskit>=0.6.4; extra == "all"
+Requires-Dist: tskit>=1; extra == "all"
 Requires-Dist: cyvcf2; extra == "all"
 Dynamic: license-file
 [![CI](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sgkit-dev/bio2zarr/actions/workflows/ci.yml)
 [![Coverage Status](https://coveralls.io/repos/github/sgkit-dev/bio2zarr/badge.svg)](https://coveralls.io/github/sgkit-dev/bio2zarr)
+[![PyPI Downloads](https://static.pepy.tech/badge/bio2zarr)](https://pepy.tech/projects/bio2zarr)
+[![Anaconda-Server Badge](https://anaconda.org/bioconda/bio2zarr/badges/downloads.svg)](https://anaconda.org/bioconda/bio2zarr)
 # bio2zarr

bio2zarr-0.1.7.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,21 @@
+bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
+bio2zarr/__main__.py,sha256=4pF1IBO4CcswA_Fe7NmK_pqGOUHCwsd_8YU7dP92n9c,578
+bio2zarr/_version.py,sha256=szvPIs2C82UunpzuvVg3MbF4QhzbBYTsVJ8DmPfq6_E,704
+bio2zarr/cli.py,sha256=iHfmc-qU2roQXm9Bt3TyR2bmgH-2p3DqYosQERePMZ8,17873
+bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
+bio2zarr/core.py,sha256=mYi2Vmh_YdNEd3weE0zZIPr7ToEUynq8nNCVvONVaqM,12140
+bio2zarr/plink.py,sha256=ELGhsSdH1Xmxx6agCfTx1kYyntrU0XQ384wxTEn87BM,11717
+bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
+bio2zarr/tskit.py,sha256=iLheNWtX7Pad1oNfijf6THMphzXwEtuQ6Zmi94pRZHg,10847
+bio2zarr/typing.py,sha256=HdXNwIBEqYtGNwKyeUDQv6-H-pKSwNZO0qD2_VxTXEY,48
+bio2zarr/vcf.py,sha256=3aXCdTAIuGoUmpbPIPVKhNj4oevkF0s_l7gRB0QmaPU,60738
+bio2zarr/vcf_utils.py,sha256=xrsmxpu1xyXtl6FaYuU562WZP-iVUIaqzxD-11MHfAM,19541
+bio2zarr/vcz.py,sha256=3IkcrAsQkWCiHiMBh0bbxzHtvX8qaUV3W84y1ojUWSs,42204
+bio2zarr/vcz_verification.py,sha256=4YZZnAuMH-z9uPqAeBONdsZADz2MtY57D7RAbMa90yY,8119
+bio2zarr/zarr_utils.py,sha256=4vE6CqnOLqZExc_7Z0jGGbA-kjqz9NPSqSBue10bzHk,5443
+bio2zarr-0.1.7.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+bio2zarr-0.1.7.dist-info/METADATA,sha256=wXANeYEuZh41wH_nay96e4xobWhpBhL-BzkBcdGAR04,15736
+bio2zarr-0.1.7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+bio2zarr-0.1.7.dist-info/entry_points.txt,sha256=bbIbR8fWMGruyLaoCxO1O22nKidWKUzMgYbTYdsN6YQ,181
+bio2zarr-0.1.7.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
+bio2zarr-0.1.7.dist-info/RECORD,,

{bio2zarr-0.1.6.dist-info → bio2zarr-0.1.7.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.8.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

bio2zarr-0.1.6.dist-info/RECORD DELETED Viewed

@@ -1,21 +0,0 @@
-bio2zarr/__init__.py,sha256=KiUGyya-9RHNcBldB8Lc1g3rP3CRjaL-5Olben0_6qA,49
-bio2zarr/__main__.py,sha256=4pF1IBO4CcswA_Fe7NmK_pqGOUHCwsd_8YU7dP92n9c,578
-bio2zarr/_version.py,sha256=ESbJO0YD7TYfOUv_WDIJJgWELGepEWsoyhqVifEcXPA,511
-bio2zarr/cli.py,sha256=WrLfUyV6VggqtDAcI3c1S5YN62ZVOent5f9JzSkX_vA,17570
-bio2zarr/constants.py,sha256=QjbtFeBUZ-XqG35ZFIFj8EYrta_EwUkC2B5VGRP7oQs,425
-bio2zarr/core.py,sha256=mYi2Vmh_YdNEd3weE0zZIPr7ToEUynq8nNCVvONVaqM,12140
-bio2zarr/plink.py,sha256=hkrgXKkxfExgOpgNkj0SszEh9qA8R3T6kXCd-4jsXO8,11498
-bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
-bio2zarr/tskit.py,sha256=6YWbh8M3VJQtVpy2pD8x7Zf0jmc4HOIZwIlWcVaqjvU,10816
-bio2zarr/typing.py,sha256=HdXNwIBEqYtGNwKyeUDQv6-H-pKSwNZO0qD2_VxTXEY,48
-bio2zarr/vcf.py,sha256=_eQJm74YcKBfKDGM283ibhE40nUrkxO6Ee1giDfKjLg,60207
-bio2zarr/vcf_utils.py,sha256=xrsmxpu1xyXtl6FaYuU562WZP-iVUIaqzxD-11MHfAM,19541
-bio2zarr/vcz.py,sha256=yD2mvDZuzlAH73qPRVsUwqHSK-9HMdV4Vcif2JxfcCM,42610
-bio2zarr/vcz_verification.py,sha256=4YZZnAuMH-z9uPqAeBONdsZADz2MtY57D7RAbMa90yY,8119
-bio2zarr/zarr_utils.py,sha256=99J7ycaG92K_AcWRF2S9A4ec2_4cXL6kjYT99GBfli4,415
-bio2zarr-0.1.6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-bio2zarr-0.1.6.dist-info/METADATA,sha256=Me_jLTDVz76lOtidDs1gVrXnwU_rm4ARBpEz_Ozmt6U,15405
-bio2zarr-0.1.6.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
-bio2zarr-0.1.6.dist-info/entry_points.txt,sha256=bbIbR8fWMGruyLaoCxO1O22nKidWKUzMgYbTYdsN6YQ,181
-bio2zarr-0.1.6.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
-bio2zarr-0.1.6.dist-info/RECORD,,

{bio2zarr-0.1.6.dist-info → bio2zarr-0.1.7.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{bio2zarr-0.1.6.dist-info → bio2zarr-0.1.7.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{bio2zarr-0.1.6.dist-info → bio2zarr-0.1.7.dist-info}/top_level.txt RENAMED Viewed

File without changes

bio2zarr 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

bio2zarr 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl