bio2zarr 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

bio2zarr/vcf.py CHANGED
@@ -1,29 +1,27 @@
1
1
  import collections
2
+ import contextlib
2
3
  import dataclasses
3
4
  import functools
5
+ import json
4
6
  import logging
7
+ import math
5
8
  import os
6
9
  import pathlib
7
10
  import pickle
8
- import sys
9
11
  import shutil
10
- import json
11
- import math
12
+ import sys
12
13
  import tempfile
13
- import contextlib
14
14
  from typing import Any, List
15
15
 
16
- import humanfriendly
17
16
  import cyvcf2
17
+ import humanfriendly
18
18
  import numcodecs
19
19
  import numpy as np
20
20
  import numpy.testing as nt
21
21
  import tqdm
22
22
  import zarr
23
23
 
24
- from . import core
25
- from . import provenance
26
- from . import vcf_utils
24
+ from . import core, provenance, vcf_utils
27
25
 
28
26
  logger = logging.getLogger(__name__)
29
27
 
@@ -113,9 +111,6 @@ class VcfField:
113
111
  return self.name
114
112
  return f"{self.category}/{self.name}"
115
113
 
116
- # TODO add method here to choose a good set compressor and
117
- # filters default here for this field.
118
-
119
114
  def smallest_dtype(self):
120
115
  """
121
116
  Returns the smallest dtype suitable for this field based
@@ -125,13 +120,13 @@ class VcfField:
125
120
  if self.vcf_type == "Float":
126
121
  ret = "f4"
127
122
  elif self.vcf_type == "Integer":
128
- dtype = "i4"
129
- for a_dtype in ["i1", "i2"]:
130
- info = np.iinfo(a_dtype)
131
- if info.min <= s.min_value and s.max_value <= info.max:
132
- dtype = a_dtype
133
- break
134
- ret = dtype
123
+ if not math.isfinite(s.max_value):
124
+ # All missing values; use i1. Note we should have some API to
125
+ # check more explicitly for missingness:
126
+ # https://github.com/sgkit-dev/bio2zarr/issues/131
127
+ ret = "i1"
128
+ else:
129
+ ret = core.min_int_dtype(s.min_value, s.max_value)
135
130
  elif self.vcf_type == "Flag":
136
131
  ret = "bool"
137
132
  elif self.vcf_type == "Character":
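The Integer branch above now delegates dtype selection to core.min_int_dtype, whose implementation is not shown in this diff. A minimal sketch of such a helper, assuming it only needs to cover the signed dtypes used here:

import numpy as np

def min_int_dtype(min_value, max_value):
    # Return the smallest signed integer dtype that can hold both bounds.
    for dtype in ["i1", "i2", "i4", "i8"]:
        info = np.iinfo(dtype)
        if info.min <= min_value and max_value <= info.max:
            return dtype
    raise OverflowError("Value range too large for int64")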
@@ -154,6 +149,10 @@ ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
154
149
  cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
155
150
  )
156
151
 
152
+ # TODO refactor this to have embedded Contig dataclass, Filters
153
+ # and Samples dataclasses to allow for more information to be
154
+ # retained and to provide forward compatibility.
155
+
157
156
 
158
157
  @dataclasses.dataclass
159
158
  class IcfMetadata:
@@ -185,6 +184,14 @@ class IcfMetadata:
185
184
  fields.append(field)
186
185
  return fields
187
186
 
187
+ @property
188
+ def num_contigs(self):
189
+ return len(self.contig_names)
190
+
191
+ @property
192
+ def num_filters(self):
193
+ return len(self.filters)
194
+
188
195
  @property
189
196
  def num_records(self):
190
197
  return sum(self.contig_record_counts.values())
@@ -284,9 +291,25 @@ def scan_vcf(path, target_num_partitions):
284
291
  return metadata, vcf.raw_header
285
292
 
286
293
 
294
+ def check_overlap(partitions):
295
+ for i in range(1, len(partitions)):
296
+ prev_partition = partitions[i - 1]
297
+ current_partition = partitions[i]
298
+ if (
299
+ prev_partition.region.contig == current_partition.region.contig
300
+ and prev_partition.region.end > current_partition.region.start
301
+ ):
302
+ raise ValueError(
303
+ f"Multiple VCFs have the region "
304
+ f"{prev_partition.region.contig}:{prev_partition.region.start}-"
305
+ f"{current_partition.region.end}"
306
+ )
307
+
308
+
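A small illustration of the new overlap check, using hypothetical stand-ins for the partition and region objects (the real classes live elsewhere in bio2zarr and are not shown here):

import dataclasses

@dataclasses.dataclass
class Region:
    contig: str
    start: int
    end: int

@dataclasses.dataclass
class Partition:
    region: Region

partitions = [
    Partition(Region("chr1", 1, 1000)),
    Partition(Region("chr1", 900, 2000)),  # starts before the previous partition ends
]
# check_overlap(partitions) raises:
# ValueError: Multiple VCFs have the region chr1:1-2000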
287
309
  def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
288
310
  logger.info(
289
- f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
311
+ f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions}"
312
+ f" partitions."
290
313
  )
291
314
  # An easy mistake to make is to pass the same file twice. Check this early on.
292
315
  for path, count in collections.Counter(paths).items():
@@ -331,6 +354,7 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
331
354
  all_partitions.sort(
332
355
  key=lambda x: (contig_index_map[x.region.contig], x.region.start)
333
356
  )
357
+ check_overlap(all_partitions)
334
358
  icf_metadata.partitions = all_partitions
335
359
  logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
336
360
  return icf_metadata, header
@@ -791,6 +815,8 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
791
815
  for vcf_field in icf_metadata.fields:
792
816
  field_path = get_vcf_field_path(out_path, vcf_field)
793
817
  field_partition_path = field_path / f"p{partition_index}"
818
+ # Should be robust to running explode_partition twice.
819
+ field_partition_path.mkdir(exist_ok=True)
794
820
  transformer = VcfValueTransformer.factory(vcf_field, num_samples)
795
821
  self.field_writers[vcf_field.full_name] = IcfFieldWriter(
796
822
  vcf_field,
@@ -832,7 +858,7 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
832
858
  partition.num_records for partition in self.metadata.partitions
833
859
  ]
834
860
  # Allow us to find which partition a given record is in
835
- self.partition_record_index = np.cumsum([0] + partition_num_records)
861
+ self.partition_record_index = np.cumsum([0, *partition_num_records])
836
862
  for field in self.metadata.fields:
837
863
  self.columns[field.full_name] = IntermediateColumnarFormatField(self, field)
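The cumulative index built here is what makes record-to-partition lookups cheap; a sketch of one way such an index is typically used (hypothetical sizes, and the lookup itself is done elsewhere in the class):

import numpy as np

partition_num_records = [100, 250, 50]  # hypothetical partition sizes
partition_record_index = np.cumsum([0, *partition_num_records])
# -> array([  0, 100, 350, 400])

record_id = 120
partition = np.searchsorted(partition_record_index, record_id, side="right") - 1
# partition == 1: record 120 falls in the second partition (records 100-349)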
838
864
  logger.info(
@@ -842,7 +868,8 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
842
868
 
843
869
  def __repr__(self):
844
870
  return (
845
- f"IntermediateColumnarFormat(fields={len(self)}, partitions={self.num_partitions}, "
871
+ f"IntermediateColumnarFormat(fields={len(self)}, "
872
+ f"partitions={self.num_partitions}, "
846
873
  f"records={self.num_records}, path={self.path})"
847
874
  )
848
875
 
@@ -890,15 +917,6 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
890
917
  return len(self.columns)
891
918
 
892
919
 
893
-
894
- def mkdir_with_progress(path):
895
- logger.debug(f"mkdir f{path}")
896
- # NOTE we may have race-conditions here, I'm not sure. Hopefully allowing
897
- # parents=True will take care of it.
898
- path.mkdir(parents=True)
899
- core.update_progress(1)
900
-
901
-
902
920
  class IntermediateColumnarFormatWriter:
903
921
  def __init__(self, path):
904
922
  self.path = pathlib.Path(path)
@@ -941,45 +959,29 @@ class IntermediateColumnarFormatWriter:
941
959
  # dependencies as well.
942
960
  self.metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
943
961
 
944
- self.mkdirs(worker_processes, show_progress=show_progress)
962
+ self.mkdirs()
945
963
 
946
964
  # Note: this is needed for the current version of the vcfzarr spec, but it's
947
965
  # probably going to be dropped.
948
966
  # https://github.com/pystatgen/vcf-zarr-spec/issues/15
949
967
  # May be useful to keep lying around still though?
950
- logger.info(f"Writing VCF header")
968
+ logger.info("Writing VCF header")
951
969
  with open(self.path / "header.txt", "w") as f:
952
970
  f.write(header)
953
971
 
954
- logger.info(f"Writing WIP metadata")
972
+ logger.info("Writing WIP metadata")
955
973
  with open(self.wip_path / "metadata.json", "w") as f:
956
974
  json.dump(self.metadata.asdict(), f, indent=4)
957
975
  return self.num_partitions
958
976
 
959
- def mkdirs(self, worker_processes=1, show_progress=False):
960
- num_dirs = len(self.metadata.fields) * self.num_partitions
961
- logger.info(f"Creating {num_dirs} directories")
977
+ def mkdirs(self):
978
+ num_dirs = len(self.metadata.fields)
979
+ logger.info(f"Creating {num_dirs} field directories")
962
980
  self.path.mkdir()
963
981
  self.wip_path.mkdir()
964
- # Due to high latency batch system filesystems, we create all the directories in
965
- # parallel
966
- progress_config = core.ProgressConfig(
967
- total=num_dirs,
968
- units="dirs",
969
- title="Mkdirs",
970
- show=show_progress,
971
- )
972
- with core.ParallelWorkManager(
973
- worker_processes=worker_processes, progress_config=progress_config
974
- ) as manager:
975
- for field in self.metadata.fields:
976
- col_path = get_vcf_field_path(self.path, field)
977
- # Don't bother trying to count the intermediate directories towards
978
- # progress
979
- manager.submit(col_path.mkdir, parents=True)
980
- for j in range(self.num_partitions):
981
- part_path = col_path / f"p{j}"
982
- manager.submit(mkdir_with_progress, part_path)
982
+ for field in self.metadata.fields:
983
+ col_path = get_vcf_field_path(self.path, field)
984
+ col_path.mkdir(parents=True)
983
985
 
984
986
  def load_partition_summaries(self):
985
987
  summaries = []
@@ -995,13 +997,14 @@ class IntermediateColumnarFormatWriter:
995
997
  not_found.append(j)
996
998
  if len(not_found) > 0:
997
999
  raise FileNotFoundError(
998
- f"Partition metadata not found for {len(not_found)} partitions: {not_found}"
1000
+ f"Partition metadata not found for {len(not_found)}"
1001
+ f" partitions: {not_found}"
999
1002
  )
1000
1003
  return summaries
1001
1004
 
1002
1005
  def load_metadata(self):
1003
1006
  if self.metadata is None:
1004
- with open(self.wip_path / f"metadata.json") as f:
1007
+ with open(self.wip_path / "metadata.json") as f:
1005
1008
  self.metadata = IcfMetadata.fromdict(json.load(f))
1006
1009
 
1007
1010
  def process_partition(self, partition_index):
@@ -1050,12 +1053,14 @@ class IntermediateColumnarFormatWriter:
1050
1053
  for field in format_fields:
1051
1054
  val = variant.format(field.name)
1052
1055
  tcw.append(field.full_name, val)
1053
- # Note: an issue with updating the progress per variant here like this
1054
- # is that we get a significant pause at the end of the counter while
1055
- # all the "small" fields get flushed. Possibly not much to be done about it.
1056
+ # Note: an issue with updating the progress per variant here like
1057
+ # this is that we get a significant pause at the end of the counter
1058
+ # while all the "small" fields get flushed. Possibly not much to be
1059
+ # done about it.
1056
1060
  core.update_progress(1)
1057
1061
  logger.info(
1058
- f"Finished reading VCF for partition {partition_index}, flushing buffers"
1062
+ f"Finished reading VCF for partition {partition_index}, "
1063
+ f"flushing buffers"
1059
1064
  )
1060
1065
 
1061
1066
  partition_metadata = {
@@ -1137,11 +1142,11 @@ class IntermediateColumnarFormatWriter:
1137
1142
  for summary in partition_summaries:
1138
1143
  field.summary.update(summary["field_summaries"][field.full_name])
1139
1144
 
1140
- logger.info(f"Finalising metadata")
1145
+ logger.info("Finalising metadata")
1141
1146
  with open(self.path / "metadata.json", "w") as f:
1142
1147
  json.dump(self.metadata.asdict(), f, indent=4)
1143
1148
 
1144
- logger.debug(f"Removing WIP directory")
1149
+ logger.debug("Removing WIP directory")
1145
1150
  shutil.rmtree(self.wip_path)
1146
1151
 
1147
1152
 
@@ -1155,7 +1160,7 @@ def explode(
1155
1160
  compressor=None,
1156
1161
  ):
1157
1162
  writer = IntermediateColumnarFormatWriter(icf_path)
1158
- num_partitions = writer.init(
1163
+ writer.init(
1159
1164
  vcfs,
1160
1165
  # Heuristic to get reasonable worker utilisation with lumpy partition sizing
1161
1166
  target_num_partitions=max(1, worker_processes * 4),
@@ -1226,20 +1231,69 @@ class ZarrColumnSpec:
1226
1231
  dtype: str
1227
1232
  shape: tuple
1228
1233
  chunks: tuple
1229
- dimensions: list
1234
+ dimensions: tuple
1230
1235
  description: str
1231
1236
  vcf_field: str
1232
- compressor: dict = None
1233
- filters: list = None
1234
- # TODO add filters
1237
+ compressor: dict
1238
+ filters: list
1235
1239
 
1236
1240
  def __post_init__(self):
1241
+ # Ensure these are tuples for ease of comparison and consistency
1237
1242
  self.shape = tuple(self.shape)
1238
1243
  self.chunks = tuple(self.chunks)
1239
1244
  self.dimensions = tuple(self.dimensions)
1240
- self.compressor = DEFAULT_ZARR_COMPRESSOR.get_config()
1241
- self.filters = []
1242
- self._choose_compressor_settings()
1245
+
1246
+ @staticmethod
1247
+ def new(**kwargs):
1248
+ spec = ZarrColumnSpec(
1249
+ **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
1250
+ )
1251
+ spec._choose_compressor_settings()
1252
+ return spec
1253
+
1254
+ @staticmethod
1255
+ def from_field(
1256
+ vcf_field,
1257
+ *,
1258
+ num_variants,
1259
+ num_samples,
1260
+ variants_chunk_size,
1261
+ samples_chunk_size,
1262
+ variable_name=None,
1263
+ ):
1264
+ shape = [num_variants]
1265
+ prefix = "variant_"
1266
+ dimensions = ["variants"]
1267
+ chunks = [variants_chunk_size]
1268
+ if vcf_field.category == "FORMAT":
1269
+ prefix = "call_"
1270
+ shape.append(num_samples)
1271
+ chunks.append(samples_chunk_size)
1272
+ dimensions.append("samples")
1273
+ if variable_name is None:
1274
+ variable_name = prefix + vcf_field.name
1275
+ # TODO make an option to add in the empty extra dimension
1276
+ if vcf_field.summary.max_number > 1:
1277
+ shape.append(vcf_field.summary.max_number)
1278
+ # TODO we should really be checking this to see if the named dimensions
1279
+ # are actually correct.
1280
+ if vcf_field.vcf_number == "R":
1281
+ dimensions.append("alleles")
1282
+ elif vcf_field.vcf_number == "A":
1283
+ dimensions.append("alt_alleles")
1284
+ elif vcf_field.vcf_number == "G":
1285
+ dimensions.append("genotypes")
1286
+ else:
1287
+ dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
1288
+ return ZarrColumnSpec.new(
1289
+ vcf_field=vcf_field.full_name,
1290
+ name=variable_name,
1291
+ dtype=vcf_field.smallest_dtype(),
1292
+ shape=shape,
1293
+ chunks=chunks,
1294
+ dimensions=dimensions,
1295
+ description=vcf_field.description,
1296
+ )
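The vcf_number-to-dimension mapping used in from_field above is easy to state on its own; a minimal sketch with a couple of hypothetical fields:

def extra_dimension_name(category, name, vcf_number):
    # Mirrors the naming rule in from_field above.
    if vcf_number == "R":
        return "alleles"
    if vcf_number == "A":
        return "alt_alleles"
    if vcf_number == "G":
        return "genotypes"
    return f"{category}_{name}_dim"

assert extra_dimension_name("INFO", "AC", "A") == "alt_alleles"
assert extra_dimension_name("FORMAT", "AD", "R") == "alleles"
assert extra_dimension_name("INFO", "CSQ", ".") == "INFO_CSQ_dim"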
1243
1297
 
1244
1298
  def _choose_compressor_settings(self):
1245
1299
  """
@@ -1249,17 +1303,32 @@ class ZarrColumnSpec:
1249
1303
 
1250
1304
  See https://github.com/pystatgen/bio2zarr/discussions/74
1251
1305
  """
1252
- dt = np.dtype(self.dtype)
1253
1306
  # Default is to not shuffle, because autoshuffle isn't recognised
1254
1307
  # by many Zarr implementations, and shuffling can lead to worse
1255
1308
  # performance in some cases anyway. Turning on shuffle should be a
1256
1309
  # deliberate choice.
1257
1310
  shuffle = numcodecs.Blosc.NOSHUFFLE
1258
- if dt.itemsize == 1:
1259
- # Any 1 byte field gets BITSHUFFLE by default
1311
+ if self.name == "call_genotype" and self.dtype == "i1":
1312
+ # call_genotype gets BITSHUFFLE by default as it gets
1313
+ # significantly better compression (at a cost of slower
1314
+ # decoding)
1260
1315
  shuffle = numcodecs.Blosc.BITSHUFFLE
1316
+ elif self.dtype == "bool":
1317
+ shuffle = numcodecs.Blosc.BITSHUFFLE
1318
+
1261
1319
  self.compressor["shuffle"] = shuffle
1262
1320
 
1321
+ @property
1322
+ def variant_chunk_nbytes(self):
1323
+ """
1324
+ Returns the nbytes for a single variant chunk of this array.
1325
+ """
1326
+ chunk_items = self.chunks[0]
1327
+ for size in self.shape[1:]:
1328
+ chunk_items *= size
1329
+ dt = np.dtype(self.dtype)
1330
+ return chunk_items * dt.itemsize
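A worked example of variant_chunk_nbytes with hypothetical sizes, to make the units concrete: a variant chunk spans the full extent of every non-variants dimension.

import numpy as np

shape = (1_000_000, 50_000, 2)   # (variants, samples, ploidy), e.g. call_genotype
chunks = (10_000, 1_000)         # (variants_chunk_size, samples_chunk_size)

chunk_items = chunks[0]
for size in shape[1:]:
    chunk_items *= size
nbytes = chunk_items * np.dtype("i1").itemsize
# 10_000 * 50_000 * 2 * 1 = 1_000_000_000 bytes, i.e. ~1 GB per variant chunk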
1331
+
1263
1332
 
1264
1333
  ZARR_SCHEMA_FORMAT_VERSION = "0.2"
1265
1334
 
@@ -1312,10 +1381,20 @@ class VcfZarrSchema:
1312
1381
  f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
1313
1382
  )
1314
1383
 
1384
+ def spec_from_field(field, variable_name=None):
1385
+ return ZarrColumnSpec.from_field(
1386
+ field,
1387
+ num_samples=n,
1388
+ num_variants=m,
1389
+ samples_chunk_size=samples_chunk_size,
1390
+ variants_chunk_size=variants_chunk_size,
1391
+ variable_name=variable_name,
1392
+ )
1393
+
1315
1394
  def fixed_field_spec(
1316
1395
  name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
1317
1396
  ):
1318
- return ZarrColumnSpec(
1397
+ return ZarrColumnSpec.new(
1319
1398
  vcf_field=vcf_field,
1320
1399
  name=name,
1321
1400
  dtype=dtype,
@@ -1327,88 +1406,58 @@ class VcfZarrSchema:
1327
1406
 
1328
1407
  alt_col = icf.columns["ALT"]
1329
1408
  max_alleles = alt_col.vcf_field.summary.max_number + 1
1330
- num_filters = len(icf.metadata.filters)
1331
1409
 
1332
- # # FIXME get dtype from lookup table
1333
1410
  colspecs = [
1334
1411
  fixed_field_spec(
1335
1412
  name="variant_contig",
1336
- dtype="i2", # FIXME
1413
+ dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
1337
1414
  ),
1338
1415
  fixed_field_spec(
1339
1416
  name="variant_filter",
1340
1417
  dtype="bool",
1341
- shape=(m, num_filters),
1418
+ shape=(m, icf.metadata.num_filters),
1342
1419
  dimensions=["variants", "filters"],
1343
1420
  ),
1344
1421
  fixed_field_spec(
1345
1422
  name="variant_allele",
1346
1423
  dtype="str",
1347
- shape=[m, max_alleles],
1424
+ shape=(m, max_alleles),
1348
1425
  dimensions=["variants", "alleles"],
1349
1426
  ),
1350
1427
  fixed_field_spec(
1351
- vcf_field="POS",
1352
- name="variant_position",
1353
- dtype="i4",
1354
- ),
1355
- fixed_field_spec(
1356
- vcf_field=None,
1357
1428
  name="variant_id",
1358
1429
  dtype="str",
1359
1430
  ),
1360
1431
  fixed_field_spec(
1361
- vcf_field=None,
1362
1432
  name="variant_id_mask",
1363
1433
  dtype="bool",
1364
1434
  ),
1365
- fixed_field_spec(
1366
- vcf_field="QUAL",
1367
- name="variant_quality",
1368
- dtype="f4",
1369
- ),
1370
1435
  ]
1436
+ name_map = {field.full_name: field for field in icf.metadata.fields}
1437
+
1438
+ # Only two of the fixed fields have a direct one-to-one mapping.
1439
+ colspecs.extend(
1440
+ [
1441
+ spec_from_field(name_map["QUAL"], variable_name="variant_quality"),
1442
+ spec_from_field(name_map["POS"], variable_name="variant_position"),
1443
+ ]
1444
+ )
1445
+ colspecs.extend([spec_from_field(field) for field in icf.metadata.info_fields])
1371
1446
 
1372
1447
  gt_field = None
1373
- for field in icf.metadata.fields:
1374
- if field.category == "fixed":
1375
- continue
1448
+ for field in icf.metadata.format_fields:
1376
1449
  if field.name == "GT":
1377
1450
  gt_field = field
1378
1451
  continue
1379
- shape = [m]
1380
- prefix = "variant_"
1381
- dimensions = ["variants"]
1382
- chunks = [variants_chunk_size]
1383
- if field.category == "FORMAT":
1384
- prefix = "call_"
1385
- shape.append(n)
1386
- chunks.append(samples_chunk_size),
1387
- dimensions.append("samples")
1388
- # TODO make an option to add in the empty extra dimension
1389
- if field.summary.max_number > 1:
1390
- shape.append(field.summary.max_number)
1391
- dimensions.append(field.name)
1392
- variable_name = prefix + field.name
1393
- colspec = ZarrColumnSpec(
1394
- vcf_field=field.full_name,
1395
- name=variable_name,
1396
- dtype=field.smallest_dtype(),
1397
- shape=shape,
1398
- chunks=chunks,
1399
- dimensions=dimensions,
1400
- description=field.description,
1401
- )
1402
- colspecs.append(colspec)
1452
+ colspecs.append(spec_from_field(field))
1403
1453
 
1404
1454
  if gt_field is not None:
1405
1455
  ploidy = gt_field.summary.max_number - 1
1406
1456
  shape = [m, n]
1407
1457
  chunks = [variants_chunk_size, samples_chunk_size]
1408
1458
  dimensions = ["variants", "samples"]
1409
-
1410
1459
  colspecs.append(
1411
- ZarrColumnSpec(
1460
+ ZarrColumnSpec.new(
1412
1461
  vcf_field=None,
1413
1462
  name="call_genotype_phased",
1414
1463
  dtype="bool",
@@ -1421,7 +1470,7 @@ class VcfZarrSchema:
1421
1470
  shape += [ploidy]
1422
1471
  dimensions += ["ploidy"]
1423
1472
  colspecs.append(
1424
- ZarrColumnSpec(
1473
+ ZarrColumnSpec.new(
1425
1474
  vcf_field=None,
1426
1475
  name="call_genotype",
1427
1476
  dtype=gt_field.smallest_dtype(),
@@ -1432,7 +1481,7 @@ class VcfZarrSchema:
1432
1481
  )
1433
1482
  )
1434
1483
  colspecs.append(
1435
- ZarrColumnSpec(
1484
+ ZarrColumnSpec.new(
1436
1485
  vcf_field=None,
1437
1486
  name="call_genotype_mask",
1438
1487
  dtype="bool",
@@ -1488,15 +1537,6 @@ class VcfZarr:
1488
1537
  return data
1489
1538
 
1490
1539
 
1491
- @dataclasses.dataclass
1492
- class EncodingWork:
1493
- func: callable = dataclasses.field(repr=False)
1494
- start: int
1495
- stop: int
1496
- columns: list[str]
1497
- memory: int = 0
1498
-
1499
-
1500
1540
  def parse_max_memory(max_memory):
1501
1541
  if max_memory is None:
1502
1542
  # Effectively unbounded
@@ -1507,65 +1547,299 @@ def parse_max_memory(max_memory):
1507
1547
  return max_memory
1508
1548
 
1509
1549
 
1550
+ @dataclasses.dataclass
1551
+ class VcfZarrPartition:
1552
+ start_index: int
1553
+ stop_index: int
1554
+ start_chunk: int
1555
+ stop_chunk: int
1556
+
1557
+ @staticmethod
1558
+ def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
1559
+ num_chunks = int(np.ceil(num_records / chunk_size))
1560
+ if max_chunks is not None:
1561
+ num_chunks = min(num_chunks, max_chunks)
1562
+ partitions = []
1563
+ splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
1564
+ for chunk_slice in splits:
1565
+ start_chunk = int(chunk_slice[0])
1566
+ stop_chunk = int(chunk_slice[-1]) + 1
1567
+ start_index = start_chunk * chunk_size
1568
+ stop_index = min(stop_chunk * chunk_size, num_records)
1569
+ partitions.append(
1570
+ VcfZarrPartition(start_index, stop_index, start_chunk, stop_chunk)
1571
+ )
1572
+ return partitions
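A worked example of the chunk-aligned partitioning above, with hypothetical numbers:

# 25 records in chunks of 10 -> num_chunks = ceil(25 / 10) = 3
# np.array_split(np.arange(3), 2) -> [0, 1] and [2], giving:
#   partition 0: start_chunk=0, stop_chunk=2, start_index=0,  stop_index=20
#   partition 1: start_chunk=2, stop_chunk=3, start_index=20, stop_index=25
partitions = VcfZarrPartition.generate_partitions(
    num_records=25, chunk_size=10, num_partitions=2
)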
1573
+
1574
+
1575
+ VZW_METADATA_FORMAT_VERSION = "0.1"
1576
+
1577
+
1578
+ @dataclasses.dataclass
1579
+ class VcfZarrWriterMetadata:
1580
+ format_version: str
1581
+ icf_path: str
1582
+ schema: VcfZarrSchema
1583
+ dimension_separator: str
1584
+ partitions: list
1585
+ provenance: dict
1586
+
1587
+ def asdict(self):
1588
+ return dataclasses.asdict(self)
1589
+
1590
+ @staticmethod
1591
+ def fromdict(d):
1592
+ if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
1593
+ raise ValueError(
1594
+ "VcfZarrWriter format version mismatch: "
1595
+ f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
1596
+ )
1597
+ ret = VcfZarrWriterMetadata(**d)
1598
+ ret.schema = VcfZarrSchema.fromdict(ret.schema)
1599
+ ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
1600
+ return ret
1601
+
1602
+
1510
1603
  class VcfZarrWriter:
1511
- def __init__(self, path, icf, schema, dimension_separator=None):
1604
+ def __init__(self, path):
1512
1605
  self.path = pathlib.Path(path)
1606
+ self.wip_path = self.path / "wip"
1607
+ self.arrays_path = self.wip_path / "arrays"
1608
+ self.partitions_path = self.wip_path / "partitions"
1609
+ self.metadata = None
1610
+ self.icf = None
1611
+
1612
+ @property
1613
+ def schema(self):
1614
+ return self.metadata.schema
1615
+
1616
+ @property
1617
+ def num_partitions(self):
1618
+ return len(self.metadata.partitions)
1619
+
1620
+ #######################
1621
+ # init
1622
+ #######################
1623
+
1624
+ def init(
1625
+ self,
1626
+ icf,
1627
+ *,
1628
+ target_num_partitions,
1629
+ schema,
1630
+ dimension_separator=None,
1631
+ max_variant_chunks=None,
1632
+ ):
1513
1633
  self.icf = icf
1514
- self.schema = schema
1634
+ if self.path.exists():
1635
+ raise ValueError("Zarr path already exists") # NEEDS TEST
1636
+ partitions = VcfZarrPartition.generate_partitions(
1637
+ self.icf.num_records,
1638
+ schema.variants_chunk_size,
1639
+ target_num_partitions,
1640
+ max_chunks=max_variant_chunks,
1641
+ )
1515
1642
  # Default to using nested directories following the Zarr v3 default.
1516
1643
  # This seems to require version 2.17+ to work properly
1517
- self.dimension_separator = "/" if dimension_separator is None else dimension_separator
1644
+ dimension_separator = (
1645
+ "/" if dimension_separator is None else dimension_separator
1646
+ )
1647
+ self.metadata = VcfZarrWriterMetadata(
1648
+ format_version=VZW_METADATA_FORMAT_VERSION,
1649
+ icf_path=str(self.icf.path),
1650
+ schema=schema,
1651
+ dimension_separator=dimension_separator,
1652
+ partitions=partitions,
1653
+ # Bare minimum here for provenance - see comments above
1654
+ provenance={"source": f"bio2zarr-{provenance.__version__}"},
1655
+ )
1656
+
1657
+ self.path.mkdir()
1518
1658
  store = zarr.DirectoryStore(self.path)
1519
- self.root = zarr.group(store=store)
1659
+ root = zarr.group(store=store)
1660
+ root.attrs.update(
1661
+ {
1662
+ "vcf_zarr_version": "0.2",
1663
+ "vcf_header": self.icf.vcf_header,
1664
+ "source": f"bio2zarr-{provenance.__version__}",
1665
+ }
1666
+ )
1667
+ # Doing this synchronously - this is fine surely
1668
+ self.encode_samples(root)
1669
+ self.encode_filter_id(root)
1670
+ self.encode_contig_id(root)
1671
+
1672
+ self.wip_path.mkdir()
1673
+ self.arrays_path.mkdir()
1674
+ self.partitions_path.mkdir()
1675
+ store = zarr.DirectoryStore(self.arrays_path)
1676
+ root = zarr.group(store=store)
1677
+
1678
+ for column in self.schema.columns.values():
1679
+ self.init_array(root, column, partitions[-1].stop_index)
1680
+
1681
+ logger.info("Writing WIP metadata")
1682
+ with open(self.wip_path / "metadata.json", "w") as f:
1683
+ json.dump(self.metadata.asdict(), f, indent=4)
1684
+ return len(partitions)
1685
+
1686
+ def encode_samples(self, root):
1687
+ if not np.array_equal(self.schema.sample_id, self.icf.metadata.samples):
1688
+ raise ValueError(
1689
+ "Subsetting or reordering samples not supported currently"
1690
+ ) # NEEDS TEST
1691
+ array = root.array(
1692
+ "sample_id",
1693
+ self.schema.sample_id,
1694
+ dtype="str",
1695
+ compressor=DEFAULT_ZARR_COMPRESSOR,
1696
+ chunks=(self.schema.samples_chunk_size,),
1697
+ )
1698
+ array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
1699
+ logger.debug("Samples done")
1520
1700
 
1521
- def init_array(self, variable):
1701
+ def encode_contig_id(self, root):
1702
+ array = root.array(
1703
+ "contig_id",
1704
+ self.schema.contig_id,
1705
+ dtype="str",
1706
+ compressor=DEFAULT_ZARR_COMPRESSOR,
1707
+ )
1708
+ array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
1709
+ if self.schema.contig_length is not None:
1710
+ array = root.array(
1711
+ "contig_length",
1712
+ self.schema.contig_length,
1713
+ dtype=np.int64,
1714
+ compressor=DEFAULT_ZARR_COMPRESSOR,
1715
+ )
1716
+ array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
1717
+
1718
+ def encode_filter_id(self, root):
1719
+ array = root.array(
1720
+ "filter_id",
1721
+ self.schema.filter_id,
1722
+ dtype="str",
1723
+ compressor=DEFAULT_ZARR_COMPRESSOR,
1724
+ )
1725
+ array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
1726
+
1727
+ def init_array(self, root, variable, variants_dim_size):
1522
1728
  object_codec = None
1523
1729
  if variable.dtype == "O":
1524
1730
  object_codec = numcodecs.VLenUTF8()
1525
- a = self.root.empty(
1526
- "wip_" + variable.name,
1527
- shape=variable.shape,
1731
+ shape = list(variable.shape)
1732
+ # Truncate the variants dimension if max_variant_chunks was specified
1733
+ shape[0] = variants_dim_size
1734
+ a = root.empty(
1735
+ variable.name,
1736
+ shape=shape,
1528
1737
  chunks=variable.chunks,
1529
1738
  dtype=variable.dtype,
1530
1739
  compressor=numcodecs.get_codec(variable.compressor),
1531
1740
  filters=[numcodecs.get_codec(filt) for filt in variable.filters],
1532
1741
  object_codec=object_codec,
1533
- dimension_separator=self.dimension_separator,
1742
+ dimension_separator=self.metadata.dimension_separator,
1743
+ )
1744
+ a.attrs.update(
1745
+ {
1746
+ "description": variable.description,
1747
+ # Dimension names are part of the spec in Zarr v3
1748
+ "_ARRAY_DIMENSIONS": variable.dimensions,
1749
+ }
1534
1750
  )
1535
- # Dimension names are part of the spec in Zarr v3
1536
- a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
1751
+ logger.debug(f"Initialised {a}")
1752
+
1753
+ #######################
1754
+ # encode_partition
1755
+ #######################
1756
+
1757
+ def load_metadata(self):
1758
+ if self.metadata is None:
1759
+ with open(self.wip_path / "metadata.json") as f:
1760
+ self.metadata = VcfZarrWriterMetadata.fromdict(json.load(f))
1761
+ self.icf = IntermediateColumnarFormat(self.metadata.icf_path)
1537
1762
 
1538
- def get_array(self, name):
1539
- return self.root["wip_" + name]
1763
+ def partition_path(self, partition_index):
1764
+ return self.partitions_path / f"p{partition_index}"
1540
1765
 
1541
- def finalise_array(self, variable_name):
1542
- source = self.path / ("wip_" + variable_name)
1543
- dest = self.path / variable_name
1766
+ def wip_partition_array_path(self, partition_index, name):
1767
+ return self.partition_path(partition_index) / f"wip_{name}"
1768
+
1769
+ def partition_array_path(self, partition_index, name):
1770
+ return self.partition_path(partition_index) / name
1771
+
1772
+ def encode_partition(self, partition_index):
1773
+ self.load_metadata()
1774
+ partition_path = self.partition_path(partition_index)
1775
+ partition_path.mkdir(exist_ok=True)
1776
+ logger.info(f"Encoding partition {partition_index} to {partition_path}")
1777
+
1778
+ self.encode_alleles_partition(partition_index)
1779
+ self.encode_id_partition(partition_index)
1780
+ self.encode_filters_partition(partition_index)
1781
+ self.encode_contig_partition(partition_index)
1782
+ for col in self.schema.columns.values():
1783
+ if col.vcf_field is not None:
1784
+ self.encode_array_partition(col, partition_index)
1785
+ if "call_genotype" in self.schema.columns:
1786
+ self.encode_genotypes_partition(partition_index)
1787
+
1788
+ def init_partition_array(self, partition_index, name):
1789
+ wip_path = self.wip_partition_array_path(partition_index, name)
1790
+ # Create an empty array like the definition
1791
+ src = self.arrays_path / name
1792
+ # Overwrite any existing WIP files
1793
+ shutil.copytree(src, wip_path, dirs_exist_ok=True)
1794
+ array = zarr.open(wip_path)
1795
+ logger.debug(f"Opened empty array {array} @ {wip_path}")
1796
+ return array
1797
+
1798
+ def finalise_partition_array(self, partition_index, name):
1799
+ wip_path = self.wip_partition_array_path(partition_index, name)
1800
+ final_path = self.partition_array_path(partition_index, name)
1801
+ if final_path.exists():
1802
+ # NEEDS TEST
1803
+ logger.warning(f"Removing existing {final_path}")
1804
+ shutil.rmtree(final_path)
1544
1805
  # Atomic swap
1545
- os.rename(source, dest)
1546
- logger.info(f"Finalised {variable_name}")
1806
+ os.rename(wip_path, final_path)
1807
+ logger.debug(f"Encoded {name} partition {partition_index}")
1808
+
1809
+ def encode_array_partition(self, column, partition_index):
1810
+ array = self.init_partition_array(partition_index, column.name)
1547
1811
 
1548
- def encode_array_slice(self, column, start, stop):
1812
+ partition = self.metadata.partitions[partition_index]
1813
+ ba = core.BufferedArray(array, partition.start_index)
1549
1814
  source_col = self.icf.columns[column.vcf_field]
1550
- array = self.get_array(column.name)
1551
- ba = core.BufferedArray(array, start)
1552
1815
  sanitiser = source_col.sanitiser_factory(ba.buff.shape)
1553
1816
 
1554
- for value in source_col.iter_values(start, stop):
1817
+ for value in source_col.iter_values(
1818
+ partition.start_index, partition.stop_index
1819
+ ):
1555
1820
  # We write directly into the buffer in the sanitiser function
1556
1821
  # to make it easier to reason about dimension padding
1557
1822
  j = ba.next_buffer_row()
1558
1823
  sanitiser(ba.buff, j, value)
1559
1824
  ba.flush()
1560
- logger.debug(f"Encoded {column.name} slice {start}:{stop}")
1825
+ self.finalise_partition_array(partition_index, column.name)
1561
1826
 
1562
- def encode_genotypes_slice(self, start, stop):
1563
- source_col = self.icf.columns["FORMAT/GT"]
1564
- gt = core.BufferedArray(self.get_array("call_genotype"), start)
1565
- gt_mask = core.BufferedArray(self.get_array("call_genotype_mask"), start)
1566
- gt_phased = core.BufferedArray(self.get_array("call_genotype_phased"), start)
1827
+ def encode_genotypes_partition(self, partition_index):
1828
+ gt_array = self.init_partition_array(partition_index, "call_genotype")
1829
+ gt_mask_array = self.init_partition_array(partition_index, "call_genotype_mask")
1830
+ gt_phased_array = self.init_partition_array(
1831
+ partition_index, "call_genotype_phased"
1832
+ )
1567
1833
 
1568
- for value in source_col.iter_values(start, stop):
1834
+ partition = self.metadata.partitions[partition_index]
1835
+ gt = core.BufferedArray(gt_array, partition.start_index)
1836
+ gt_mask = core.BufferedArray(gt_mask_array, partition.start_index)
1837
+ gt_phased = core.BufferedArray(gt_phased_array, partition.start_index)
1838
+
1839
+ source_col = self.icf.columns["FORMAT/GT"]
1840
+ for value in source_col.iter_values(
1841
+ partition.start_index, partition.stop_index
1842
+ ):
1569
1843
  j = gt.next_buffer_row()
1570
1844
  sanitise_value_int_2d(gt.buff, j, value[:, :-1])
1571
1845
  j = gt_phased.next_buffer_row()
@@ -1577,29 +1851,40 @@ class VcfZarrWriter:
1577
1851
  gt.flush()
1578
1852
  gt_phased.flush()
1579
1853
  gt_mask.flush()
1580
- logger.debug(f"Encoded GT slice {start}:{stop}")
1581
1854
 
1582
- def encode_alleles_slice(self, start, stop):
1855
+ self.finalise_partition_array(partition_index, "call_genotype")
1856
+ self.finalise_partition_array(partition_index, "call_genotype_mask")
1857
+ self.finalise_partition_array(partition_index, "call_genotype_phased")
1858
+
1859
+ def encode_alleles_partition(self, partition_index):
1860
+ array_name = "variant_allele"
1861
+ alleles_array = self.init_partition_array(partition_index, array_name)
1862
+ partition = self.metadata.partitions[partition_index]
1863
+ alleles = core.BufferedArray(alleles_array, partition.start_index)
1583
1864
  ref_col = self.icf.columns["REF"]
1584
1865
  alt_col = self.icf.columns["ALT"]
1585
- alleles = core.BufferedArray(self.get_array("variant_allele"), start)
1586
1866
 
1587
1867
  for ref, alt in zip(
1588
- ref_col.iter_values(start, stop), alt_col.iter_values(start, stop)
1868
+ ref_col.iter_values(partition.start_index, partition.stop_index),
1869
+ alt_col.iter_values(partition.start_index, partition.stop_index),
1589
1870
  ):
1590
1871
  j = alleles.next_buffer_row()
1591
1872
  alleles.buff[j, :] = STR_FILL
1592
1873
  alleles.buff[j, 0] = ref[0]
1593
1874
  alleles.buff[j, 1 : 1 + len(alt)] = alt
1594
1875
  alleles.flush()
1595
- logger.debug(f"Encoded alleles slice {start}:{stop}")
1596
1876
 
1597
- def encode_id_slice(self, start, stop):
1877
+ self.finalise_partition_array(partition_index, array_name)
1878
+
1879
+ def encode_id_partition(self, partition_index):
1880
+ vid_array = self.init_partition_array(partition_index, "variant_id")
1881
+ vid_mask_array = self.init_partition_array(partition_index, "variant_id_mask")
1882
+ partition = self.metadata.partitions[partition_index]
1883
+ vid = core.BufferedArray(vid_array, partition.start_index)
1884
+ vid_mask = core.BufferedArray(vid_mask_array, partition.start_index)
1598
1885
  col = self.icf.columns["ID"]
1599
- vid = core.BufferedArray(self.get_array("variant_id"), start)
1600
- vid_mask = core.BufferedArray(self.get_array("variant_id_mask"), start)
1601
1886
 
1602
- for value in col.iter_values(start, stop):
1887
+ for value in col.iter_values(partition.start_index, partition.stop_index):
1603
1888
  j = vid.next_buffer_row()
1604
1889
  k = vid_mask.next_buffer_row()
1605
1890
  assert j == k
@@ -1611,28 +1896,41 @@ class VcfZarrWriter:
1611
1896
  vid_mask.buff[j] = True
1612
1897
  vid.flush()
1613
1898
  vid_mask.flush()
1614
- logger.debug(f"Encoded ID slice {start}:{stop}")
1615
1899
 
1616
- def encode_filters_slice(self, lookup, start, stop):
1617
- col = self.icf.columns["FILTERS"]
1618
- var_filter = core.BufferedArray(self.get_array("variant_filter"), start)
1900
+ self.finalise_partition_array(partition_index, "variant_id")
1901
+ self.finalise_partition_array(partition_index, "variant_id_mask")
1902
+
1903
+ def encode_filters_partition(self, partition_index):
1904
+ lookup = {filt: index for index, filt in enumerate(self.schema.filter_id)}
1905
+ array_name = "variant_filter"
1906
+ array = self.init_partition_array(partition_index, array_name)
1907
+ partition = self.metadata.partitions[partition_index]
1908
+ var_filter = core.BufferedArray(array, partition.start_index)
1619
1909
 
1620
- for value in col.iter_values(start, stop):
1910
+ col = self.icf.columns["FILTERS"]
1911
+ for value in col.iter_values(partition.start_index, partition.stop_index):
1621
1912
  j = var_filter.next_buffer_row()
1622
1913
  var_filter.buff[j] = False
1623
1914
  for f in value:
1624
1915
  try:
1625
1916
  var_filter.buff[j, lookup[f]] = True
1626
1917
  except KeyError:
1627
- raise ValueError(f"Filter '{f}' was not defined in the header.")
1918
+ raise ValueError(
1919
+ f"Filter '{f}' was not defined in the header."
1920
+ ) from None
1628
1921
  var_filter.flush()
1629
- logger.debug(f"Encoded FILTERS slice {start}:{stop}")
1630
1922
 
1631
- def encode_contig_slice(self, lookup, start, stop):
1923
+ self.finalise_partition_array(partition_index, array_name)
1924
+
1925
+ def encode_contig_partition(self, partition_index):
1926
+ lookup = {contig: index for index, contig in enumerate(self.schema.contig_id)}
1927
+ array_name = "variant_contig"
1928
+ array = self.init_partition_array(partition_index, array_name)
1929
+ partition = self.metadata.partitions[partition_index]
1930
+ contig = core.BufferedArray(array, partition.start_index)
1632
1931
  col = self.icf.columns["CHROM"]
1633
- contig = core.BufferedArray(self.get_array("variant_contig"), start)
1634
1932
 
1635
- for value in col.iter_values(start, stop):
1933
+ for value in col.iter_values(partition.start_index, partition.stop_index):
1636
1934
  j = contig.next_buffer_row()
1637
1935
  # Note: because we are using the indexes to define the lookups
1638
1936
  # and we always have an index, it seems that we the contig lookup
@@ -1640,160 +1938,120 @@ class VcfZarrWriter:
1640
1938
  # here, please do open an issue with a reproducible example!
1641
1939
  contig.buff[j] = lookup[value[0]]
1642
1940
  contig.flush()
1643
- logger.debug(f"Encoded CHROM slice {start}:{stop}")
1644
-
1645
- def encode_samples(self):
1646
- if not np.array_equal(self.schema.sample_id, self.icf.metadata.samples):
1647
- raise ValueError(
1648
- "Subsetting or reordering samples not supported currently"
1649
- ) # NEEDS TEST
1650
- array = self.root.array(
1651
- "sample_id",
1652
- self.schema.sample_id,
1653
- dtype="str",
1654
- compressor=DEFAULT_ZARR_COMPRESSOR,
1655
- chunks=(self.schema.samples_chunk_size,),
1656
- )
1657
- array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
1658
- logger.debug("Samples done")
1659
1941
 
1660
- def encode_contig_id(self):
1661
- array = self.root.array(
1662
- "contig_id",
1663
- self.schema.contig_id,
1664
- dtype="str",
1665
- compressor=DEFAULT_ZARR_COMPRESSOR,
1666
- )
1667
- array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
1668
- if self.schema.contig_length is not None:
1669
- array = self.root.array(
1670
- "contig_length",
1671
- self.schema.contig_length,
1672
- dtype=np.int64,
1673
- compressor=DEFAULT_ZARR_COMPRESSOR,
1942
+ self.finalise_partition_array(partition_index, array_name)
1943
+
1944
+ #######################
1945
+ # finalise
1946
+ #######################
1947
+
1948
+ def finalise_array(self, name):
1949
+ logger.info(f"Finalising {name}")
1950
+ final_path = self.path / name
1951
+ if final_path.exists():
1952
+ # NEEDS TEST
1953
+ raise ValueError(f"Array {name} already exists")
1954
+ for partition in range(len(self.metadata.partitions)):
1955
+ # Move all the files in partition dir to dest dir
1956
+ src = self.partition_array_path(partition, name)
1957
+ if not src.exists():
1958
+ # Needs test
1959
+ raise ValueError(f"Partition {partition} of {name} does not exist")
1960
+ dest = self.arrays_path / name
1961
+ # This is Zarr v2 specific. Chunks in v3 will start with a "c" prefix.
1962
+ chunk_files = [
1963
+ path for path in src.iterdir() if not path.name.startswith(".")
1964
+ ]
1965
+ # TODO check for a count of the number of files. If we require a
1966
+ # dimension_separator of "/" then we could make stronger assertions
1967
+ # here, as we'd always have num_variant_chunks
1968
+ logger.debug(
1969
+ f"Moving {len(chunk_files)} chunks for {name} partition {partition}"
1674
1970
  )
1675
- array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
1676
- return {v: j for j, v in enumerate(self.schema.contig_id)}
1971
+ for chunk_file in chunk_files:
1972
+ os.rename(chunk_file, dest / chunk_file.name)
1973
+ # Finally, once all the chunks have moved into the arrays dir,
1974
+ # we move it out of wip
1975
+ os.rename(self.arrays_path / name, self.path / name)
1976
+ core.update_progress(1)
1677
1977
 
1678
- def encode_filter_id(self):
1679
- array = self.root.array(
1680
- "filter_id",
1681
- self.schema.filter_id,
1682
- dtype="str",
1683
- compressor=DEFAULT_ZARR_COMPRESSOR,
1978
+ def finalise(self, show_progress=False):
1979
+ self.load_metadata()
1980
+
1981
+ progress_config = core.ProgressConfig(
1982
+ total=len(self.schema.columns),
1983
+ title="Finalise",
1984
+ units="array",
1985
+ show=show_progress,
1684
1986
  )
1685
- array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
1686
- return {v: j for j, v in enumerate(self.schema.filter_id)}
1987
+ # NOTE: it's not clear that adding more workers will make this quicker,
1988
+ # as it's just going to be causing contention on the file system.
1989
+ # Something to check empirically in some deployments.
1990
+ # FIXME we're just using worker_processes=0 here to hook into the
1991
+ # SynchronousExecutor which is intended for testing purposes so
1992
+ # that we get test coverage. Should fix this either by allowing
1993
+ # for multiple workers, or making a standard wrapper for tqdm
1994
+ # that allows us to have a consistent look and feel.
1995
+ with core.ParallelWorkManager(0, progress_config) as pwm:
1996
+ for name in self.schema.columns:
1997
+ pwm.submit(self.finalise_array, name)
1998
+ zarr.consolidate_metadata(self.path)
1687
1999
 
1688
- def init(self):
1689
- self.root.attrs["vcf_zarr_version"] = "0.2"
1690
- self.root.attrs["vcf_header"] = self.icf.vcf_header
1691
- self.root.attrs["source"] = f"bio2zarr-{provenance.__version__}"
1692
- for column in self.schema.columns.values():
1693
- self.init_array(column)
2000
+ ######################
2001
+ # encode_all_partitions
2002
+ ######################
1694
2003
 
1695
- def finalise(self):
1696
- zarr.consolidate_metadata(self.path)
2004
+ def get_max_encoding_memory(self):
2005
+ """
2006
+ Return the approximate maximum memory used to encode a variant chunk.
2007
+ """
2008
+ max_encoding_mem = max(
2009
+ col.variant_chunk_nbytes for col in self.schema.columns.values()
2010
+ )
2011
+ gt_mem = 0
2012
+ if "call_genotype" in self.schema.columns:
2013
+ encoded_together = [
2014
+ "call_genotype",
2015
+ "call_genotype_phased",
2016
+ "call_genotype_mask",
2017
+ ]
2018
+ gt_mem = sum(
2019
+ self.schema.columns[col].variant_chunk_nbytes
2020
+ for col in encoded_together
2021
+ )
2022
+ return max(max_encoding_mem, gt_mem)
1697
2023
 
1698
- def encode(
1699
- self,
1700
- worker_processes=1,
1701
- max_v_chunks=None,
1702
- show_progress=False,
1703
- max_memory=None,
2024
+ def encode_all_partitions(
2025
+ self, *, worker_processes=1, show_progress=False, max_memory=None
1704
2026
  ):
1705
2027
  max_memory = parse_max_memory(max_memory)
1706
-
1707
- # TODO this will move into the setup logic later when we're making it possible
1708
- # to split the work by slice
1709
- num_slices = max(1, worker_processes * 4)
1710
- # Using POS arbitrarily to get the array slices
1711
- slices = core.chunk_aligned_slices(
1712
- self.get_array("variant_position"), num_slices, max_chunks=max_v_chunks
2028
+ self.load_metadata()
2029
+ num_partitions = self.num_partitions
2030
+ per_worker_memory = self.get_max_encoding_memory()
2031
+ logger.info(
2032
+ f"Encoding Zarr over {num_partitions} partitions with "
2033
+ f"{worker_processes} workers and {display_size(per_worker_memory)} "
2034
+ "per worker"
1713
2035
  )
1714
- truncated = slices[-1][-1]
1715
- for array in self.root.values():
1716
- if array.attrs["_ARRAY_DIMENSIONS"][0] == "variants":
1717
- shape = list(array.shape)
1718
- shape[0] = truncated
1719
- array.resize(shape)
1720
-
1721
- total_bytes = 0
1722
- encoding_memory_requirements = {}
1723
- for col in self.schema.columns.values():
1724
- array = self.get_array(col.name)
1725
- # NOTE!! this is bad, we're potentially creating quite a large
1726
- # numpy array for basically nothing. We can compute this.
1727
- variant_chunk_size = array.blocks[0].nbytes
1728
- encoding_memory_requirements[col.name] = variant_chunk_size
1729
- logger.debug(
1730
- f"{col.name} requires at least {display_size(variant_chunk_size)} per worker"
2036
+ # Each partition requires per_worker_memory bytes, so to prevent more than
2037
+ # max_memory being used, we clamp the number of workers
2038
+ max_num_workers = max_memory // per_worker_memory
2039
+ if max_num_workers < worker_processes:
2040
+ logger.warning(
2041
+ f"Limiting number of workers to {max_num_workers} to "
2042
+ f"keep within specified memory budget of {display_size(max_memory)}"
1731
2043
  )
1732
- total_bytes += array.nbytes
1733
-
1734
- filter_id_map = self.encode_filter_id()
1735
- contig_id_map = self.encode_contig_id()
1736
-
1737
- work = []
1738
- for start, stop in slices:
1739
- for col in self.schema.columns.values():
1740
- if col.vcf_field is not None:
1741
- f = functools.partial(self.encode_array_slice, col)
1742
- work.append(
1743
- EncodingWork(
1744
- f,
1745
- start,
1746
- stop,
1747
- [col.name],
1748
- encoding_memory_requirements[col.name],
1749
- )
1750
- )
1751
- work.append(
1752
- EncodingWork(self.encode_alleles_slice, start, stop, ["variant_allele"])
1753
- )
1754
- work.append(
1755
- EncodingWork(
1756
- self.encode_id_slice, start, stop, ["variant_id", "variant_id_mask"]
1757
- )
1758
- )
1759
- work.append(
1760
- EncodingWork(
1761
- functools.partial(self.encode_filters_slice, filter_id_map),
1762
- start,
1763
- stop,
1764
- ["variant_filter"],
1765
- )
1766
- )
1767
- work.append(
1768
- EncodingWork(
1769
- functools.partial(self.encode_contig_slice, contig_id_map),
1770
- start,
1771
- stop,
1772
- ["variant_contig"],
1773
- )
2044
+ if max_num_workers <= 0:
2045
+ raise ValueError(
2046
+ f"Insufficient memory to encode a partition:"
2047
+ f"{display_size(per_worker_memory)} > {display_size(max_memory)}"
1774
2048
  )
1775
- if "call_genotype" in self.schema.columns:
1776
- variables = [
1777
- "call_genotype",
1778
- "call_genotype_phased",
1779
- "call_genotype_mask",
1780
- ]
1781
- gt_memory = sum(
1782
- encoding_memory_requirements[name] for name in variables
1783
- )
1784
- work.append(
1785
- EncodingWork(
1786
- self.encode_genotypes_slice, start, stop, variables, gt_memory
1787
- )
1788
- )
2049
+ num_workers = min(max_num_workers, worker_processes)
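The clamp above is simple integer arithmetic; a hypothetical example with an 8 GiB budget and roughly 1.6 GB needed per variant chunk:

max_memory = 8 * 1024**3           # 8 GiB budget
per_worker_memory = 1_600_000_000  # from get_max_encoding_memory(), say
max_num_workers = max_memory // per_worker_memory  # -> 5
num_workers = min(max_num_workers, 8)               # worker_processes=8 requested, 5 used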
1789
2050
 
1790
- # Fail early if we can't fit a particular column into memory
1791
- for wp in work:
1792
- if wp.memory > max_memory:
1793
- raise ValueError(
1794
- f"Insufficient memory for {wp.columns}: "
1795
- f"{display_size(wp.memory)} > {display_size(max_memory)}"
1796
- )
2051
+ total_bytes = 0
2052
+ for col in self.schema.columns.values():
2053
+ # Open the array definition to get the total size
2054
+ total_bytes += zarr.open(self.arrays_path / col.name).nbytes
1797
2055
 
1798
2056
  progress_config = core.ProgressConfig(
1799
2057
  total=total_bytes,
@@ -1801,53 +2059,9 @@ class VcfZarrWriter:
1801
2059
  units="B",
1802
2060
  show=show_progress,
1803
2061
  )
1804
-
1805
- used_memory = 0
1806
- # We need to keep some bounds on the queue size or the memory bounds algorithm
1807
- # below doesn't really work.
1808
- max_queued = 4 * max(1, worker_processes)
1809
- encoded_slices = collections.Counter()
1810
-
1811
- with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
1812
- future = pwm.submit(self.encode_samples)
1813
- future_to_work = {future: EncodingWork(None, 0, 0, [])}
1814
-
1815
- def service_completed_futures():
1816
- nonlocal used_memory
1817
-
1818
- completed = pwm.wait_for_completed()
1819
- for future in completed:
1820
- wp_done = future_to_work.pop(future)
1821
- used_memory -= wp_done.memory
1822
- logger.debug(
1823
- f"Complete {wp_done}: used mem={display_size(used_memory)}"
1824
- )
1825
- for column in wp_done.columns:
1826
- encoded_slices[column] += 1
1827
- if encoded_slices[column] == len(slices):
1828
- # Do this syncronously for simplicity. Should be
1829
- # fine as the workers will probably be busy with
1830
- # large encode tasks most of the time.
1831
- self.finalise_array(column)
1832
-
1833
- for wp in work:
1834
- while (
1835
- used_memory + wp.memory > max_memory
1836
- or len(future_to_work) > max_queued
1837
- ):
1838
- logger.debug(
1839
- f"Wait: mem_required={used_memory + wp.memory} max_mem={max_memory} "
1840
- f"queued={len(future_to_work)} max_queued={max_queued}"
1841
- )
1842
- service_completed_futures()
1843
- future = pwm.submit(wp.func, wp.start, wp.stop)
1844
- used_memory += wp.memory
1845
- logger.debug(f"Submit {wp}: used mem={display_size(used_memory)}")
1846
- future_to_work[future] = wp
1847
-
1848
- logger.debug("All work submitted")
1849
- while len(future_to_work) > 0:
1850
- service_completed_futures()
2062
+ with core.ParallelWorkManager(num_workers, progress_config) as pwm:
2063
+ for partition_index in range(num_partitions):
2064
+ pwm.submit(self.encode_partition, partition_index)
1851
2065
 
1852
2066
 
1853
2067
  def mkschema(if_path, out):
@@ -1862,13 +2076,48 @@ def encode(
1862
2076
  schema_path=None,
1863
2077
  variants_chunk_size=None,
1864
2078
  samples_chunk_size=None,
1865
- max_v_chunks=None,
2079
+ max_variant_chunks=None,
1866
2080
  dimension_separator=None,
1867
2081
  max_memory=None,
1868
2082
  worker_processes=1,
1869
2083
  show_progress=False,
1870
2084
  ):
1871
- icf = IntermediateColumnarFormat(if_path)
2085
+ # Rough heuristic to split work up enough to keep utilisation high
2086
+ target_num_partitions = max(1, worker_processes * 4)
2087
+ encode_init(
2088
+ if_path,
2089
+ zarr_path,
2090
+ target_num_partitions,
2091
+ schema_path=schema_path,
2092
+ variants_chunk_size=variants_chunk_size,
2093
+ samples_chunk_size=samples_chunk_size,
2094
+ max_variant_chunks=max_variant_chunks,
2095
+ dimension_separator=dimension_separator,
2096
+ )
2097
+ vzw = VcfZarrWriter(zarr_path)
2098
+ vzw.encode_all_partitions(
2099
+ worker_processes=worker_processes,
2100
+ show_progress=show_progress,
2101
+ max_memory=max_memory,
2102
+ )
2103
+ vzw.finalise(show_progress)
2104
+
2105
+
2106
+ def encode_init(
2107
+ icf_path,
2108
+ zarr_path,
2109
+ target_num_partitions,
2110
+ *,
2111
+ schema_path=None,
2112
+ variants_chunk_size=None,
2113
+ samples_chunk_size=None,
2114
+ max_variant_chunks=None,
2115
+ dimension_separator=None,
2116
+ max_memory=None,
2117
+ worker_processes=1,
2118
+ show_progress=False,
2119
+ ):
2120
+ icf = IntermediateColumnarFormat(icf_path)
1872
2121
  if schema_path is None:
1873
2122
  schema = VcfZarrSchema.generate(
1874
2123
  icf,
@@ -1881,21 +2130,28 @@ def encode(
1881
2130
  raise ValueError(
1882
2131
  "Cannot specify schema along with chunk sizes"
1883
2132
  ) # NEEDS TEST
1884
- with open(schema_path, "r") as f:
2133
+ with open(schema_path) as f:
1885
2134
  schema = VcfZarrSchema.fromjson(f.read())
1886
2135
  zarr_path = pathlib.Path(zarr_path)
1887
- if zarr_path.exists():
1888
- logger.warning(f"Deleting existing {zarr_path}")
1889
- shutil.rmtree(zarr_path)
1890
- vzw = VcfZarrWriter(zarr_path, icf, schema, dimension_separator=dimension_separator)
1891
- vzw.init()
1892
- vzw.encode(
1893
- max_v_chunks=max_v_chunks,
1894
- worker_processes=worker_processes,
1895
- max_memory=max_memory,
1896
- show_progress=show_progress,
2136
+ vzw = VcfZarrWriter(zarr_path)
2137
+ vzw.init(
2138
+ icf,
2139
+ target_num_partitions=target_num_partitions,
2140
+ schema=schema,
2141
+ dimension_separator=dimension_separator,
2142
+ max_variant_chunks=max_variant_chunks,
1897
2143
  )
1898
- vzw.finalise()
2144
+ return vzw.num_partitions, vzw.get_max_encoding_memory()
2145
+
2146
+
2147
+ def encode_partition(zarr_path, partition):
2148
+ writer = VcfZarrWriter(zarr_path)
2149
+ writer.encode_partition(partition)
2150
+
2151
+
2152
+ def encode_finalise(zarr_path, show_progress=False):
2153
+ writer = VcfZarrWriter(zarr_path)
2154
+ writer.finalise(show_progress=show_progress)
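Together, encode_init, encode_partition and encode_finalise form a three-step API that an external scheduler can drive; a hedged sketch (the paths and partition count are hypothetical):

num_partitions, per_worker_memory = encode_init(
    "sample.icf", "sample.vcz", target_num_partitions=16
)
for j in range(num_partitions):
    # Each call is independent, so these can run as separate jobs.
    encode_partition("sample.vcz", j)
encode_finalise("sample.vcz", show_progress=True)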
1899
2155
 
1900
2156
 
1901
2157
  def convert(
@@ -1962,7 +2218,7 @@ def assert_all_fill(zarr_val, vcf_type):
1962
2218
  elif vcf_type == "Float":
1963
2219
  assert_all_fill_float(zarr_val)
1964
2220
  else: # pragma: no cover
1965
- assert False
2221
+ assert False # noqa PT015
1966
2222
 
1967
2223
 
1968
2224
  def assert_all_missing(zarr_val, vcf_type):
@@ -1975,7 +2231,7 @@ def assert_all_missing(zarr_val, vcf_type):
1975
2231
  elif vcf_type == "Float":
1976
2232
  assert_all_missing_float(zarr_val)
1977
2233
  else: # pragma: no cover
1978
- assert False
2234
+ assert False # noqa PT015
1979
2235
 
1980
2236
 
1981
2237
  def assert_info_val_missing(zarr_val, vcf_type):
@@ -2105,7 +2361,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
2105
2361
  assert pos[start_index] == first_pos
2106
2362
  vcf = cyvcf2.VCF(vcf_path)
2107
2363
  if show_progress:
2108
- iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records) # NEEDS TEST
2364
+ iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records) # NEEDS TEST
2109
2365
  else:
2110
2366
  iterator = vcf
2111
2367
  for j, row in enumerate(iterator, start_index):
@@ -2114,7 +2370,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
2114
2370
  assert vid[j] == ("." if row.ID is None else row.ID)
2115
2371
  assert allele[j, 0] == row.REF
2116
2372
  k = len(row.ALT)
2117
- nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT),
2373
+ nt.assert_array_equal(allele[j, 1 : k + 1], row.ALT)
2118
2374
  assert np.all(allele[j, k + 1 :] == "")
2119
2375
  # TODO FILTERS
2120
2376