bio2zarr 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of bio2zarr has been flagged as potentially problematic.

@@ -6,14 +6,17 @@ import logging
 import math
 import pathlib
 import pickle
+import re
 import shutil
 import sys
+import tempfile
+from functools import partial
 from typing import Any
 
 import numcodecs
 import numpy as np
 
-from .. import constants, core, provenance, vcf_utils
+from . import constants, core, provenance, vcf_utils, vcz
 
 logger = logging.getLogger(__name__)
 
@@ -77,6 +80,14 @@ class VcfField:
             return self.name
         return f"{self.category}/{self.name}"
 
+    @property
+    def max_number(self):
+        if self.vcf_number in ("R", "A", "G", "."):
+            return self.summary.max_number
+        else:
+            # use declared number if larger than max found
+            return max(self.summary.max_number, int(self.vcf_number))
+
     def smallest_dtype(self):
         """
         Returns the smallest dtype suitable for this field based
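The new max_number property reconciles the Number declared in the VCF header with the largest count actually observed during scanning. A minimal sketch of the rule (a hypothetical standalone function mirroring the property above):

    def max_number(vcf_number, observed_max):
        # Variable-count fields (R, A, G, .) have no usable declared size,
        # so only the observed maximum matters.
        if vcf_number in ("R", "A", "G", "."):
            return observed_max
        # Fixed-count fields: trust the declared number if it is larger.
        return max(observed_max, int(vcf_number))

    assert max_number("A", 3) == 3  # Number=A field, at most 3 ALT values seen
    assert max_number("2", 1) == 2  # declared Number=2 wins over observed 1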
@@ -116,23 +127,6 @@ ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
 )
 
 
-@dataclasses.dataclass
-class Contig:
-    id: str
-    length: int = None
-
-
-@dataclasses.dataclass
-class Sample:
-    id: str
-
-
-@dataclasses.dataclass
-class Filter:
-    id: str
-    description: str = ""
-
-
 @dataclasses.dataclass
 class IcfMetadata(core.JsonDataclass):
     samples: list
@@ -187,9 +181,9 @@ class IcfMetadata(core.JsonDataclass):
         d = d.copy()
         d["partitions"] = partitions
         d["fields"] = [VcfField.fromdict(fd) for fd in d["fields"]]
-        d["samples"] = [Sample(**sd) for sd in d["samples"]]
-        d["filters"] = [Filter(**fd) for fd in d["filters"]]
-        d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
+        d["samples"] = [vcz.Sample(**sd) for sd in d["samples"]]
+        d["filters"] = [vcz.Filter(**fd) for fd in d["filters"]]
+        d["contigs"] = [vcz.Contig(**cd) for cd in d["contigs"]]
         return IcfMetadata(**d)
 
     def __eq__(self, other):
@@ -228,8 +222,8 @@ def fixed_vcf_field_definitions():
 
 
 def scan_vcf(path, target_num_partitions):
-    with vcf_utils.IndexedVcf(path) as indexed_vcf:
-        vcf = indexed_vcf.vcf
+    with vcf_utils.VcfFile(path) as vcf_file:
+        vcf = vcf_file.vcf
         filters = []
         pass_index = -1
         for h in vcf.header_iter():
@@ -240,7 +234,7 @@ def scan_vcf(path, target_num_partitions):
                     description = ""
                 if h["ID"] == "PASS":
                     pass_index = len(filters)
-                filters.append(Filter(h["ID"], description))
+                filters.append(vcz.Filter(h["ID"], description))
 
         # Ensure PASS is the first filter if present
         if pass_index > 0:
@@ -262,18 +256,18 @@ def scan_vcf(path, target_num_partitions):
         contig_lengths = [None for _ in vcf.seqnames]
 
         metadata = IcfMetadata(
-            samples=[Sample(sample_id) for sample_id in vcf.samples],
+            samples=[vcz.Sample(sample_id) for sample_id in vcf.samples],
             contigs=[
-                Contig(contig_id, length)
+                vcz.Contig(contig_id, length)
                 for contig_id, length in zip(vcf.seqnames, contig_lengths)
             ],
             filters=filters,
             fields=fields,
             partitions=[],
-            num_records=sum(indexed_vcf.contig_record_counts().values()),
+            num_records=sum(vcf_file.contig_record_counts().values()),
         )
 
-        regions = indexed_vcf.partition_into_regions(num_parts=target_num_partitions)
+        regions = vcf_file.partition_into_regions(num_parts=target_num_partitions)
         for region in regions:
             metadata.partitions.append(
                 VcfPartition(
@@ -291,7 +285,12 @@ def scan_vcf(path, target_num_partitions):
         return metadata, vcf.raw_header
 
 
-def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
+def scan_vcfs(
+    paths,
+    show_progress,
+    target_num_partitions,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+):
     logger.info(
         f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
         f" partitions."
@@ -324,14 +323,28 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     # are compatible.
     all_partitions = []
     total_records = 0
+    contigs = {}
     for metadata, _ in results:
         for partition in metadata.partitions:
             logger.debug(f"Scanned partition {partition}")
             all_partitions.append(partition)
+        for contig in metadata.contigs:
+            if contig.id in contigs:
+                if contig != contigs[contig.id]:
+                    raise ValueError(
+                        "Incompatible contig definitions: "
+                        f"{contig} != {contigs[contig.id]}"
+                    )
+            else:
+                contigs[contig.id] = contig
         total_records += metadata.num_records
         metadata.num_records = 0
         metadata.partitions = []
 
+    contig_union = list(contigs.values())
+    for metadata, _ in results:
+        metadata.contigs = contig_union
+
     icf_metadata, header = results[0]
     for metadata, _ in results[1:]:
         if metadata != icf_metadata:
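scan_vcfs now merges contig definitions across all input VCFs into a single union, rejecting inputs that declare the same contig ID differently (for example, with two different lengths). A hypothetical standalone sketch of the same merge rule:

    import dataclasses

    @dataclasses.dataclass
    class Contig:  # stand-in for vcz.Contig
        id: str
        length: int = None

    def union_contigs(per_file_contigs):
        contigs = {}
        for file_contigs in per_file_contigs:
            for contig in file_contigs:
                if contig.id in contigs and contig != contigs[contig.id]:
                    raise ValueError(f"Incompatible contig definitions: {contig}")
                contigs[contig.id] = contig
        return list(contigs.values())

    # Files sharing chr1 with identical lengths merge cleanly:
    union_contigs([[Contig("chr1", 1000)], [Contig("chr1", 1000), Contig("chr2", 2000)]])
    # The same ID with a different length raises ValueError.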
@@ -352,64 +365,58 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     return icf_metadata, header
 
 
-def sanitise_value_bool(buff, j, value):
+def sanitise_value_bool(shape, value):
     x = True
     if value is None:
         x = False
-    buff[j] = x
+    return x
 
 
-def sanitise_value_float_scalar(buff, j, value):
+def sanitise_value_float_scalar(shape, value):
     x = value
     if value is None:
         x = [constants.FLOAT32_MISSING]
-    buff[j] = x[0]
+    return x[0]
 
 
-def sanitise_value_int_scalar(buff, j, value):
+def sanitise_value_int_scalar(shape, value):
     x = value
     if value is None:
-        # print("MISSING", INT_MISSING, INT_FILL)
         x = [constants.INT_MISSING]
     else:
         x = sanitise_int_array(value, ndmin=1, dtype=np.int32)
-    buff[j] = x[0]
+    return x[0]
 
 
-def sanitise_value_string_scalar(buff, j, value):
+def sanitise_value_string_scalar(shape, value):
     if value is None:
-        buff[j] = "."
+        return "."
     else:
-        buff[j] = value[0]
+        return value[0]
 
 
-def sanitise_value_string_1d(buff, j, value):
+def sanitise_value_string_1d(shape, value):
     if value is None:
-        buff[j] = "."
+        return np.full(shape, ".", dtype="O")
     else:
-        # value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
-        # FIXME failure isn't coming from here, it seems to be from an
-        # incorrectly detected dimension in the zarr array
-        # The dimesions look all wrong, and the dtype should be Object
-        # not str
         value = drop_empty_second_dim(value)
-        buff[j] = ""
-        buff[j, : value.shape[0]] = value
+        result = np.full(shape, "", dtype=value.dtype)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_string_2d(buff, j, value):
+def sanitise_value_string_2d(shape, value):
     if value is None:
-        buff[j] = "."
+        return np.full(shape, ".", dtype="O")
     else:
-        # print(buff.shape, value.dtype, value)
-        # assert value.ndim == 2
-        buff[j] = ""
+        result = np.full(shape, "", dtype="O")
         if value.ndim == 2:
-            buff[j, :, : value.shape[1]] = value
+            result[: value.shape[0], : value.shape[1]] = value
         else:
-            # TODO check if this is still necessary
+            # Convert 1D array into 2D with appropriate shape
             for k, val in enumerate(value):
-                buff[j, k, : len(val)] = val
+                result[k, : len(val)] = val
+        return result
 
 
 def drop_empty_second_dim(value):
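Throughout this block the sanitisers change calling convention: instead of writing into row j of a caller-supplied buffer, each now takes the per-variant target shape and returns a freshly filled array. A self-contained sketch of the new convention (a simplified stand-in, not the module code):

    import numpy as np

    def sanitise_int_1d(shape, value):
        if value is None:
            return np.full(shape, -1)  # whole entry missing
        result = np.full(shape, -2, dtype=np.int32)  # pad with the fill value
        result[: len(value)] = value
        return result

    row = sanitise_int_1d((4,), [1, 2])
    # array([ 1,  2, -2, -2], dtype=int32); the caller then assigns buff[j] = row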
@@ -419,27 +426,28 @@ def drop_empty_second_dim(value):
         return value
 
 
-def sanitise_value_float_1d(buff, j, value):
+def sanitise_value_float_1d(shape, value):
     if value is None:
-        buff[j] = constants.FLOAT32_MISSING
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-        value = np.array(value, ndmin=1, dtype=buff.dtype, copy=True)
+        value = np.array(value, ndmin=1, dtype=np.float32, copy=True)
         # numpy will map None values to Nan, but we need a
         # specific NaN
         value[np.isnan(value)] = constants.FLOAT32_MISSING
         value = drop_empty_second_dim(value)
-        buff[j] = constants.FLOAT32_FILL
-        buff[j, : value.shape[0]] = value
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_float_2d(buff, j, value):
+def sanitise_value_float_2d(shape, value):
     if value is None:
-        buff[j] = constants.FLOAT32_MISSING
+        return np.full(shape, constants.FLOAT32_MISSING)
     else:
-        # print("value = ", value)
-        value = np.array(value, ndmin=2, dtype=buff.dtype, copy=True)
-        buff[j] = constants.FLOAT32_FILL
-        buff[j, :, : value.shape[1]] = value
+        value = np.array(value, ndmin=2, dtype=np.float32, copy=True)
+        result = np.full(shape, constants.FLOAT32_FILL, dtype=np.float32)
+        result[:, : value.shape[1]] = value
+        return result
 
 
 def sanitise_int_array(value, ndmin, dtype):
@@ -454,23 +462,25 @@ def sanitise_int_array(value, ndmin, dtype):
     return value.astype(dtype)
 
 
-def sanitise_value_int_1d(buff, j, value):
+def sanitise_value_int_1d(shape, value):
     if value is None:
-        buff[j] = -1
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 1, buff.dtype)
+        value = sanitise_int_array(value, 1, np.int32)
         value = drop_empty_second_dim(value)
-        buff[j] = -2
-        buff[j, : value.shape[0]] = value
+        result = np.full(shape, -2, dtype=np.int32)
+        result[: value.shape[0]] = value
+        return result
 
 
-def sanitise_value_int_2d(buff, j, value):
+def sanitise_value_int_2d(shape, value):
     if value is None:
-        buff[j] = -1
+        return np.full(shape, -1)
     else:
-        value = sanitise_int_array(value, 2, buff.dtype)
-        buff[j] = -2
-        buff[j, :, : value.shape[1]] = value
+        value = sanitise_int_array(value, 2, np.int32)
+        result = np.full(shape, -2, dtype=np.int32)
+        result[:, : value.shape[1]] = value
+        return result
 
 
 missing_value_map = {
@@ -634,7 +644,8 @@ class IntermediateColumnarFormatField:
         chunk_cumulative_records = self.chunk_record_index(partition_id)
         chunk_num_records = np.diff(chunk_cumulative_records)
         for count, cumulative in zip(
-            chunk_num_records[start_chunk:], chunk_cumulative_records[start_chunk + 1 :]
+            chunk_num_records[start_chunk:],
+            chunk_cumulative_records[start_chunk + 1 :],
         ):
             path = partition_path / f"{cumulative}"
             chunk = self.read_chunk(path)
@@ -693,36 +704,32 @@ class IntermediateColumnarFormatField:
         return ret
 
     def sanitiser_factory(self, shape):
-        """
-        Return a function that sanitised values from this column
-        and writes into a buffer of the specified shape.
-        """
-        assert len(shape) <= 3
+        assert len(shape) <= 2
         if self.vcf_field.vcf_type == "Flag":
-            assert len(shape) == 1
-            return sanitise_value_bool
+            assert len(shape) == 0
+            return partial(sanitise_value_bool, shape)
         elif self.vcf_field.vcf_type == "Float":
-            if len(shape) == 1:
-                return sanitise_value_float_scalar
-            elif len(shape) == 2:
-                return sanitise_value_float_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_float_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_float_1d, shape)
             else:
-                return sanitise_value_float_2d
+                return partial(sanitise_value_float_2d, shape)
         elif self.vcf_field.vcf_type == "Integer":
-            if len(shape) == 1:
-                return sanitise_value_int_scalar
-            elif len(shape) == 2:
-                return sanitise_value_int_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_int_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_int_1d, shape)
             else:
-                return sanitise_value_int_2d
+                return partial(sanitise_value_int_2d, shape)
         else:
             assert self.vcf_field.vcf_type in ("String", "Character")
-            if len(shape) == 1:
-                return sanitise_value_string_scalar
-            elif len(shape) == 2:
-                return sanitise_value_string_1d
+            if len(shape) == 0:
+                return partial(sanitise_value_string_scalar, shape)
+            elif len(shape) == 1:
+                return partial(sanitise_value_string_1d, shape)
             else:
-                return sanitise_value_string_2d
+                return partial(sanitise_value_string_2d, shape)
 
 
 @dataclasses.dataclass
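Because the sanitisers now take the shape as their first argument, sanitiser_factory can bind it once with functools.partial and return a one-argument callable; note also that the shape no longer includes the leading variants dimension, so every len(shape) test shifts down by one. An illustrative use of the same currying pattern:

    from functools import partial
    import numpy as np

    def sanitise_value_int_1d(shape, value):  # simplified stand-in
        result = np.full(shape, -2, dtype=np.int32)
        if value is not None:
            result[: len(value)] = value
        return result

    sanitiser = partial(sanitise_value_int_1d, (3,))  # shape bound once
    rows = [sanitiser(v) for v in ([7], None, [1, 2])]  # one call per variant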
@@ -829,9 +836,66 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         return False
 
 
-class IntermediateColumnarFormat(collections.abc.Mapping):
+def convert_local_allele_field_types(fields, schema_instance):
+    """
+    Update the specified list of fields to include the LAA field, and to convert
+    any supported localisable fields to the L* counterpart.
+
+    Note that we currently support only two ALT alleles per sample, and so the
+    dimensions of these fields are fixed by that requirement. Later versions may
+    use summary data stored in the ICF to make different choices, if information
+    about subsequent alleles (not in the actual genotype calls) should also be
+    stored.
+    """
+    fields_by_name = {field.name: field for field in fields}
+    gt = fields_by_name["call_genotype"]
+
+    if schema_instance.get_shape(["ploidy"])[0] != 2:
+        raise ValueError("Local alleles only supported on diploid data")
+
+    dimensions = gt.dimensions[:-1]
+
+    la = vcz.ZarrArraySpec(
+        name="call_LA",
+        dtype="i1",
+        dimensions=(*dimensions, "local_alleles"),
+        description=(
+            "0-based indices into REF+ALT, indicating which alleles"
+            " are relevant (local) for the current sample"
+        ),
+    )
+    schema_instance.dimensions["local_alleles"] = vcz.VcfZarrDimension.unchunked(
+        schema_instance.dimensions["ploidy"].size
+    )
+
+    ad = fields_by_name.get("call_AD", None)
+    if ad is not None:
+        # TODO check if call_LAD is in the list already
+        ad.name = "call_LAD"
+        ad.source = None
+        ad.dimensions = (*dimensions, "local_alleles_AD")
+        ad.description += " (local-alleles)"
+        schema_instance.dimensions["local_alleles_AD"] = vcz.VcfZarrDimension.unchunked(
+            2
+        )
+
+    pl = fields_by_name.get("call_PL", None)
+    if pl is not None:
+        # TODO check if call_LPL is in the list already
+        pl.name = "call_LPL"
+        pl.source = None
+        pl.description += " (local-alleles)"
+        pl.dimensions = (*dimensions, "local_" + pl.dimensions[-1].split("_")[-1])
+        schema_instance.dimensions["local_" + pl.dimensions[-1].split("_")[-1]] = (
+            vcz.VcfZarrDimension.unchunked(3)
+        )
+
+    return [*fields, la]
+
+
+class IntermediateColumnarFormat(vcz.Source):
     def __init__(self, path):
-        self.path = pathlib.Path(path)
+        self._path = pathlib.Path(path)
         # TODO raise a more informative error here telling people this
         # directory is either a WIP or the wrong format.
         with open(self.path / "metadata.json") as f:
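For context: the local-alleles encoding stores, per call, which of the site's REF/ALT alleles are actually used, so that wide Number=R/G fields can be stored at a fixed small width. Under the diploid, two-local-alleles restriction enforced above, the dimension sizes in this function work out as follows (illustrative arithmetic only):

    import math

    ploidy = 2         # diploid only, per the ValueError above
    local_alleles = 2  # call_LA width equals the ploidy
    lad_width = local_alleles  # call_LAD: one depth per local allele -> 2
    # call_LPL: unordered genotypes over the local alleles
    lpl_width = math.comb(local_alleles + ploidy - 1, ploidy)  # -> 3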
@@ -845,8 +909,12 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
         ]
         # Allow us to find which partition a given record is in
         self.partition_record_index = np.cumsum([0, *partition_num_records])
+        self.gt_field = None
         for field in self.metadata.fields:
             self.fields[field.full_name] = IntermediateColumnarFormatField(self, field)
+            if field.name == "GT":
+                self.gt_field = field
+
         logger.info(
             f"Loaded IntermediateColumnarFormat(partitions={self.num_partitions}, "
             f"records={self.num_records}, fields={self.num_fields})"
@@ -854,20 +922,11 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
 
     def __repr__(self):
         return (
-            f"IntermediateColumnarFormat(fields={len(self)}, "
+            f"IntermediateColumnarFormat(fields={len(self.fields)}, "
             f"partitions={self.num_partitions}, "
             f"records={self.num_records}, path={self.path})"
         )
 
-    def __getitem__(self, key):
-        return self.fields[key]
-
-    def __iter__(self):
-        return iter(self.fields)
-
-    def __len__(self):
-        return len(self.fields)
-
     def summary_table(self):
         data = []
         for name, icf_field in self.fields.items():
@@ -886,6 +945,10 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
             data.append(d)
         return data
 
+    @property
+    def path(self):
+        return self._path
+
     @property
     def num_records(self):
         return self.metadata.num_records
@@ -894,6 +957,18 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_partitions(self):
         return len(self.metadata.partitions)
 
+    @property
+    def samples(self):
+        return self.metadata.samples
+
+    @property
+    def contigs(self):
+        return self.metadata.contigs
+
+    @property
+    def filters(self):
+        return self.metadata.filters
+
     @property
     def num_samples(self):
         return len(self.metadata.samples)
@@ -902,6 +977,261 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
     def num_fields(self):
         return len(self.fields)
 
+    @property
+    def root_attrs(self):
+        meta_information_pattern = re.compile("##([^=]+)=(.*)")
+        vcf_meta_information = []
+        for line in self.vcf_header.split("\n"):
+            match = re.fullmatch(meta_information_pattern, line)
+            if match:
+                key = match.group(1)
+                if key in ("contig", "FILTER", "INFO", "FORMAT"):
+                    # these fields are stored in Zarr arrays
+                    continue
+                value = match.group(2)
+                vcf_meta_information.append((key, value))
+        return {
+            "vcf_meta_information": vcf_meta_information,
+        }
+
+    def iter_id(self, start, stop):
+        for value in self.fields["ID"].iter_values(start, stop):
+            if value is not None:
+                yield value[0]
+            else:
+                yield None
+
+    def iter_filters(self, start, stop):
+        source_field = self.fields["FILTERS"]
+        lookup = {filt.id: index for index, filt in enumerate(self.metadata.filters)}
+
+        for filter_values in source_field.iter_values(start, stop):
+            filters = np.zeros(len(self.metadata.filters), dtype=bool)
+            if filter_values is not None:
+                for filter_id in filter_values:
+                    try:
+                        filters[lookup[filter_id]] = True
+                    except KeyError:
+                        raise ValueError(
+                            f"Filter '{filter_id}' was not defined in the header."
+                        ) from None
+            yield filters
+
+    def iter_contig(self, start, stop):
+        source_field = self.fields["CHROM"]
+        lookup = {
+            contig.id: index for index, contig in enumerate(self.metadata.contigs)
+        }
+
+        for value in source_field.iter_values(start, stop):
+            # Note: because we are using the indexes to define the lookups
+            # and we always have an index, it seems that the contig lookup
+            # will always succeed. However, if anyone ever does hit a KeyError
+            # here, please do open an issue with a reproducible example!
+            yield lookup[value[0]]
+
+    def iter_field(self, field_name, shape, start, stop):
+        source_field = self.fields[field_name]
+        sanitiser = source_field.sanitiser_factory(shape)
+        for value in source_field.iter_values(start, stop):
+            yield sanitiser(value)
+
+    def iter_alleles(self, start, stop, num_alleles):
+        ref_field = self.fields["REF"]
+        alt_field = self.fields["ALT"]
+
+        for ref, alt in zip(
+            ref_field.iter_values(start, stop),
+            alt_field.iter_values(start, stop),
+        ):
+            alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
+            alleles[0] = ref[0]
+            alleles[1 : 1 + len(alt)] = alt
+            yield alleles
+
+    def iter_genotypes(self, shape, start, stop):
+        source_field = self.fields["FORMAT/GT"]
+        for value in source_field.iter_values(start, stop):
+            genotypes = value[:, :-1] if value is not None else None
+            phased = value[:, -1] if value is not None else None
+            sanitised_genotypes = sanitise_value_int_2d(shape, genotypes)
+            sanitised_phased = sanitise_value_int_1d(shape[:-1], phased)
+            # Force haploids to always be phased
+            # https://github.com/sgkit-dev/bio2zarr/issues/399
+            if sanitised_genotypes.shape[1] == 1:
+                sanitised_phased[:] = True
+            yield sanitised_genotypes, sanitised_phased
+
+    def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
+        variant_lengths = self.fields["rlen"].iter_values(start, stop)
+        if self.gt_field is None or shape is None:
+            for variant_length, alleles in zip(
+                variant_lengths, self.iter_alleles(start, stop, num_alleles)
+            ):
+                yield vcz.VariantData(variant_length, alleles, None, None)
+        else:
+            for variant_length, alleles, (gt, phased) in zip(
+                variant_lengths,
+                self.iter_alleles(start, stop, num_alleles),
+                self.iter_genotypes(shape, start, stop),
+            ):
+                yield vcz.VariantData(variant_length, alleles, gt, phased)
+
+    def generate_schema(
+        self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
+    ):
+        if local_alleles is None:
+            local_alleles = False
+
+        max_alleles = max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2)
+
+        # Add ploidy and genotypes dimensions only when needed
+        max_genotypes = 0
+        for field in self.metadata.format_fields:
+            if field.vcf_number == "G":
+                max_genotypes = max(max_genotypes, field.summary.max_number)
+
+        ploidy = None
+        genotypes_size = None
+        if self.gt_field is not None:
+            ploidy = max(self.gt_field.summary.max_number - 1, 1)
+            # NOTE: it's not clear why we're computing this, when we must have had
+            # at least one number=G field to require it anyway?
+            genotypes_size = math.comb(max_alleles + ploidy - 1, ploidy)
+            # assert max_genotypes == genotypes_size
+        else:
+            if max_genotypes > 0:
+                # there is no GT field, but there is at least one Number=G field,
+                # so we need to define the genotypes dimension
+                genotypes_size = max_genotypes
+
+        dimensions = vcz.standard_dimensions(
+            variants_size=self.num_records,
+            variants_chunk_size=variants_chunk_size,
+            samples_size=self.num_samples,
+            samples_chunk_size=samples_chunk_size,
+            alleles_size=max_alleles,
+            filters_size=self.metadata.num_filters,
+            ploidy_size=ploidy,
+            genotypes_size=genotypes_size,
+        )
+
+        schema_instance = vcz.VcfZarrSchema(
+            format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
+            dimensions=dimensions,
+            fields=[],
+        )
+
+        logger.info(
+            "Generating schema with chunks="
+            f"variants={dimensions['variants'].chunk_size}, "
+            f"samples={dimensions['samples'].chunk_size}"
+        )
+
+        def spec_from_field(field, array_name=None):
+            return vcz.ZarrArraySpec.from_field(
+                field,
+                schema_instance,
+                array_name=array_name,
+            )
+
+        def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
+            compressor = (
+                vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config()
+                if dtype == "bool"
+                else None
+            )
+            return vcz.ZarrArraySpec(
+                source=source,
+                name=name,
+                dtype=dtype,
+                description="",
+                dimensions=dimensions,
+                compressor=compressor,
+            )
+
+        name_map = {field.full_name: field for field in self.metadata.fields}
+        array_specs = [
+            fixed_field_spec(
+                name="variant_contig",
+                dtype=core.min_int_dtype(0, self.metadata.num_contigs),
+            ),
+            fixed_field_spec(
+                name="variant_filter",
+                dtype="bool",
+                dimensions=["variants", "filters"],
+            ),
+            fixed_field_spec(
+                name="variant_allele",
+                dtype="O",
+                dimensions=["variants", "alleles"],
+            ),
+            fixed_field_spec(
+                name="variant_length",
+                dtype=name_map["rlen"].smallest_dtype(),
+                dimensions=["variants"],
+            ),
+            fixed_field_spec(
+                name="variant_id",
+                dtype="O",
+            ),
+            fixed_field_spec(
+                name="variant_id_mask",
+                dtype="bool",
+            ),
+        ]
+
+        # Only two of the fixed fields have a direct one-to-one mapping.
+        array_specs.extend(
+            [
+                spec_from_field(name_map["QUAL"], array_name="variant_quality"),
+                spec_from_field(name_map["POS"], array_name="variant_position"),
+            ]
+        )
+        array_specs.extend(
+            [spec_from_field(field) for field in self.metadata.info_fields]
+        )
+
+        for field in self.metadata.format_fields:
+            if field.name == "GT":
+                continue
+            array_specs.append(spec_from_field(field))
+
+        if self.gt_field is not None and self.num_samples > 0:
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_phased",
+                    dtype="bool",
+                    dimensions=["variants", "samples"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype",
+                    dtype=self.gt_field.smallest_dtype(),
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
+                )
+            )
+            array_specs.append(
+                vcz.ZarrArraySpec(
+                    name="call_genotype_mask",
+                    dtype="bool",
+                    dimensions=["variants", "samples", "ploidy"],
+                    description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
+                )
+            )
+
+        if local_alleles:
+            array_specs = convert_local_allele_field_types(array_specs, schema_instance)
+
+        schema_instance.fields = array_specs
+        return schema_instance
+
 
 @dataclasses.dataclass
 class IcfPartitionMetadata(core.JsonDataclass):
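The genotypes dimension in generate_schema uses the standard multiset count: for n alleles at ploidy p there are C(n + p - 1, p) unordered genotypes, which is what the math.comb call above computes. A quick check of the formula:

    import math

    def genotypes_size(max_alleles, ploidy):
        return math.comb(max_alleles + ploidy - 1, ploidy)

    assert genotypes_size(2, 2) == 3  # biallelic diploid: 0/0, 0/1, 1/1
    assert genotypes_size(3, 2) == 6  # REF plus two ALTs, diploid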
@@ -973,7 +1303,7 @@ class IntermediateColumnarFormatWriter:
         vcfs,
         *,
         column_chunk_size=16,
-        worker_processes=1,
+        worker_processes=core.DEFAULT_WORKER_PROCESSES,
         target_num_partitions=None,
         show_progress=False,
         compressor=None,
@@ -1079,9 +1409,9 @@ class IntermediateColumnarFormatWriter:
             self.path,
             partition_index,
         ) as tcw:
-            with vcf_utils.IndexedVcf(partition.vcf_path) as ivcf:
+            with vcf_utils.VcfFile(partition.vcf_path) as vcf:
                 num_records = 0
-                for variant in ivcf.variants(partition.region):
+                for variant in vcf.variants(partition.region):
                     num_records += 1
                     last_position = variant.POS
                     tcw.append("CHROM", variant.CHROM)
@@ -1125,7 +1455,9 @@ class IntermediateColumnarFormatWriter:
             f"{num_records} records last_pos={last_position}"
         )
 
-    def explode(self, *, worker_processes=1, show_progress=False):
+    def explode(
+        self, *, worker_processes=core.DEFAULT_WORKER_PROCESSES, show_progress=False
+    ):
         self.load_metadata()
         num_records = self.metadata.num_records
         if np.isinf(num_records):
@@ -1193,7 +1525,7 @@ def explode(
     vcfs,
     *,
     column_chunk_size=16,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1218,7 +1550,7 @@ def explode_init(
     *,
     column_chunk_size=16,
     target_num_partitions=1,
-    worker_processes=1,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
     show_progress=False,
     compressor=None,
 ):
@@ -1241,3 +1573,167 @@ def explode_partition(icf_path, partition):
 def explode_finalise(icf_path):
     writer = IntermediateColumnarFormatWriter(icf_path)
     writer.finalise()
+
+
+def inspect(path):
+    path = pathlib.Path(path)
+    if not path.exists():
+        raise ValueError(f"Path not found: {path}")
+    if (path / "metadata.json").exists():
+        obj = IntermediateColumnarFormat(path)
+    # NOTE: this is too strict, we should support more general Zarrs, see #276
+    elif (path / ".zmetadata").exists():
+        obj = vcz.VcfZarr(path)
+    else:
+        raise ValueError(f"{path} not in ICF or VCF Zarr format")
+    return obj.summary_table()
+
+
+def mkschema(
+    if_path,
+    out,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+):
+    store = IntermediateColumnarFormat(if_path)
+    spec = store.generate_schema(
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+    )
+    out.write(spec.asjson())
+
+
+def convert(
+    vcfs,
+    vcz_path,
+    *,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    local_alleles=None,
+    show_progress=False,
+    icf_path=None,
+):
+    """
+    Convert the VCF data at the specified list of paths
+    to VCF Zarr format stored at the specified path.
+
+    .. todo:: Document parameters
+    """
+    if icf_path is None:
+        cm = temp_icf_path(prefix="vcf2zarr")
+    else:
+        cm = contextlib.nullcontext(icf_path)
+
+    with cm as icf_path:
+        explode(
+            icf_path,
+            vcfs,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+        )
+        encode(
+            icf_path,
+            vcz_path,
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            worker_processes=worker_processes,
+            show_progress=show_progress,
+            local_alleles=local_alleles,
+        )
+
+
+@contextlib.contextmanager
+def temp_icf_path(prefix=None):
+    with tempfile.TemporaryDirectory(prefix=prefix) as tmp:
+        yield pathlib.Path(tmp) / "icf"
+
+
+def encode(
+    icf_path,
+    zarr_path,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    local_alleles=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    # Rough heuristic to split work up enough to keep utilisation high
+    target_num_partitions = max(1, worker_processes * 4)
+    encode_init(
+        icf_path,
+        zarr_path,
+        target_num_partitions,
+        schema_path=schema_path,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        local_alleles=local_alleles,
+        max_variant_chunks=max_variant_chunks,
+        dimension_separator=dimension_separator,
+    )
+    vzw = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    vzw.encode_all_partitions(
+        worker_processes=worker_processes,
+        show_progress=show_progress,
+        max_memory=max_memory,
+    )
+    vzw.finalise(show_progress)
+    vzw.create_index()
+
+
+def encode_init(
+    icf_path,
+    zarr_path,
+    target_num_partitions,
+    *,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    local_alleles=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    worker_processes=core.DEFAULT_WORKER_PROCESSES,
+    show_progress=False,
+):
+    icf_store = IntermediateColumnarFormat(icf_path)
+    if schema_path is None:
+        schema_instance = icf_store.generate_schema(
+            variants_chunk_size=variants_chunk_size,
+            samples_chunk_size=samples_chunk_size,
+            local_alleles=local_alleles,
+        )
+    else:
+        logger.info(f"Reading schema from {schema_path}")
+        if variants_chunk_size is not None or samples_chunk_size is not None:
+            raise ValueError(
+                "Cannot specify schema along with chunk sizes"
+            )  # NEEDS TEST
+        with open(schema_path) as f:
+            schema_instance = vcz.VcfZarrSchema.fromjson(f.read())
+    zarr_path = pathlib.Path(zarr_path)
+    vzw = vcz.VcfZarrWriter("icf", zarr_path)
+    return vzw.init(
+        icf_store,
+        target_num_partitions=target_num_partitions,
+        schema=schema_instance,
+        dimension_separator=dimension_separator,
+        max_variant_chunks=max_variant_chunks,
+    )
+
+
+def encode_partition(zarr_path, partition):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.encode_partition(partition)
+
+
+def encode_finalise(zarr_path, show_progress=False):
+    writer_instance = vcz.VcfZarrWriter(IntermediateColumnarFormat, zarr_path)
+    writer_instance.finalise(show_progress=show_progress)
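Taken together, the new module-level functions form a one-shot pipeline: explode the VCFs to the intermediate columnar format (in a temporary directory unless icf_path is given), then encode that to a VCF Zarr store. A hedged usage sketch; the import path is an assumption based on the relative-import change at the top of this diff, and the file names are hypothetical:

    from bio2zarr import icf  # assumed module path

    icf.convert(
        ["chr22.vcf.gz"],  # indexed input VCF(s)
        "chr22.vcz",       # output VCF Zarr store
        worker_processes=4,
        show_progress=True,
    )
    print(icf.inspect("chr22.vcz"))  # summary table of the encoded arrays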