bio2zarr 0.0.5__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +125 -24
- bio2zarr/core.py +13 -3
- bio2zarr/vcf.py +568 -328
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.6.dist-info}/METADATA +1 -1
- bio2zarr-0.0.6.dist-info/RECORD +16 -0
- bio2zarr-0.0.5.dist-info/RECORD +0 -16
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.6.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.6.dist-info}/WHEEL +0 -0
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.6.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.5.dist-info → bio2zarr-0.0.6.dist-info}/top_level.txt +0 -0
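The vcf.py changes below replace the single-pass Zarr encoder with a partitioned, resumable writer driven by three module-level entry points (encode_init, encode_partition, encode_finalise). A minimal sketch of how these could be called from Python, inferred from the signatures in the diff; the file names are hypothetical:

# Minimal sketch of the staged encode API added in vcf.py (paths are hypothetical).
from bio2zarr import vcf

# Plan the conversion: writes the WIP metadata and empty arrays, and returns
# the number of partitions and the per-partition memory estimate.
num_partitions, per_partition_memory = vcf.encode_init(
    "sample.icf", "sample.vcz", target_num_partitions=4
)

# Each partition can then be encoded independently (e.g. one per cluster job).
for partition in range(num_partitions):
    vcf.encode_partition("sample.vcz", partition)

# Once every partition has been written, assemble the final Zarr store.
vcf.encode_finalise("sample.vcz", show_progress=True)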
bio2zarr/vcf.py
CHANGED
@@ -111,9 +111,6 @@ class VcfField:
             return self.name
         return f"{self.category}/{self.name}"

-    # TODO add method here to choose a good set compressor and
-    # filters default here for this field.
-
     def smallest_dtype(self):
         """
         Returns the smallest dtype suitable for this field based
@@ -123,13 +120,13 @@ class VcfField:
         if self.vcf_type == "Float":
            ret = "f4"
         elif self.vcf_type == "Integer":
-
-
-
-
-
-
-
+            if not math.isfinite(s.max_value):
+                # All missing values; use i1. Note we should have some API to
+                # check more explicitly for missingness:
+                # https://github.com/sgkit-dev/bio2zarr/issues/131
+                ret = "i1"
+            else:
+                ret = core.min_int_dtype(s.min_value, s.max_value)
         elif self.vcf_type == "Flag":
             ret = "bool"
         elif self.vcf_type == "Character":
@@ -152,6 +149,10 @@ ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
     cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
 )

+# TODO refactor this to have embedded Contig dataclass, Filters
+# and Samples dataclasses to allow for more information to be
+# retained and forward compatibility.
+

 @dataclasses.dataclass
 class IcfMetadata:
@@ -183,6 +184,14 @@ class IcfMetadata:
                 fields.append(field)
         return fields

+    @property
+    def num_contigs(self):
+        return len(self.contig_names)
+
+    @property
+    def num_filters(self):
+        return len(self.filters)
+
     @property
     def num_records(self):
         return sum(self.contig_record_counts.values())
@@ -1242,6 +1251,50 @@ class ZarrColumnSpec:
         spec._choose_compressor_settings()
         return spec

+    @staticmethod
+    def from_field(
+        vcf_field,
+        *,
+        num_variants,
+        num_samples,
+        variants_chunk_size,
+        samples_chunk_size,
+        variable_name=None,
+    ):
+        shape = [num_variants]
+        prefix = "variant_"
+        dimensions = ["variants"]
+        chunks = [variants_chunk_size]
+        if vcf_field.category == "FORMAT":
+            prefix = "call_"
+            shape.append(num_samples)
+            chunks.append(samples_chunk_size)
+            dimensions.append("samples")
+        if variable_name is None:
+            variable_name = prefix + vcf_field.name
+        # TODO make an option to add in the empty extra dimension
+        if vcf_field.summary.max_number > 1:
+            shape.append(vcf_field.summary.max_number)
+            # TODO we should really be checking this to see if the named dimensions
+            # are actually correct.
+            if vcf_field.vcf_number == "R":
+                dimensions.append("alleles")
+            elif vcf_field.vcf_number == "A":
+                dimensions.append("alt_alleles")
+            elif vcf_field.vcf_number == "G":
+                dimensions.append("genotypes")
+            else:
+                dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
+        return ZarrColumnSpec.new(
+            vcf_field=vcf_field.full_name,
+            name=variable_name,
+            dtype=vcf_field.smallest_dtype(),
+            shape=shape,
+            chunks=chunks,
+            dimensions=dimensions,
+            description=vcf_field.description,
+        )
+
     def _choose_compressor_settings(self):
         """
         Choose compressor and filter settings based on the size and
@@ -1250,17 +1303,32 @@ class ZarrColumnSpec:

         See https://github.com/pystatgen/bio2zarr/discussions/74
         """
-        dt = np.dtype(self.dtype)
         # Default is to not shuffle, because autoshuffle isn't recognised
         # by many Zarr implementations, and shuffling can lead to worse
         # performance in some cases anyway. Turning on shuffle should be a
         # deliberate choice.
         shuffle = numcodecs.Blosc.NOSHUFFLE
-        if
-        #
+        if self.name == "call_genotype" and self.dtype == "i1":
+            # call_genotype gets BITSHUFFLE by default as it gets
+            # significantly better compression (at a cost of slower
+            # decoding)
             shuffle = numcodecs.Blosc.BITSHUFFLE
+        elif self.dtype == "bool":
+            shuffle = numcodecs.Blosc.BITSHUFFLE
+
         self.compressor["shuffle"] = shuffle

+    @property
+    def variant_chunk_nbytes(self):
+        """
+        Returns the nbytes for a single variant chunk of this array.
+        """
+        chunk_items = self.chunks[0]
+        for size in self.shape[1:]:
+            chunk_items *= size
+        dt = np.dtype(self.dtype)
+        return chunk_items * dt.itemsize
+

 ZARR_SCHEMA_FORMAT_VERSION = "0.2"

@@ -1313,6 +1381,16 @@ class VcfZarrSchema:
             f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
         )

+        def spec_from_field(field, variable_name=None):
+            return ZarrColumnSpec.from_field(
+                field,
+                num_samples=n,
+                num_variants=m,
+                samples_chunk_size=samples_chunk_size,
+                variants_chunk_size=variants_chunk_size,
+                variable_name=variable_name,
+            )
+
         def fixed_field_spec(
             name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
         ):
@@ -1328,95 +1406,56 @@ class VcfZarrSchema:

         alt_col = icf.columns["ALT"]
         max_alleles = alt_col.vcf_field.summary.max_number + 1
-        num_filters = len(icf.metadata.filters)

-        # # FIXME get dtype from lookup table
         colspecs = [
             fixed_field_spec(
                 name="variant_contig",
-                dtype=
+                dtype=core.min_int_dtype(0, icf.metadata.num_contigs),
             ),
             fixed_field_spec(
                 name="variant_filter",
                 dtype="bool",
-                shape=(m, num_filters),
+                shape=(m, icf.metadata.num_filters),
                 dimensions=["variants", "filters"],
             ),
             fixed_field_spec(
                 name="variant_allele",
                 dtype="str",
-                shape=
+                shape=(m, max_alleles),
                 dimensions=["variants", "alleles"],
             ),
             fixed_field_spec(
-                vcf_field="POS",
-                name="variant_position",
-                dtype="i4",
-            ),
-            fixed_field_spec(
-                vcf_field=None,
                 name="variant_id",
                 dtype="str",
             ),
             fixed_field_spec(
-                vcf_field=None,
                 name="variant_id_mask",
                 dtype="bool",
             ),
-            fixed_field_spec(
-                vcf_field="QUAL",
-                name="variant_quality",
-                dtype="f4",
-            ),
         ]
+        name_map = {field.full_name: field for field in icf.metadata.fields}
+
+        # Only two of the fixed fields have a direct one-to-one mapping.
+        colspecs.extend(
+            [
+                spec_from_field(name_map["QUAL"], variable_name="variant_quality"),
+                spec_from_field(name_map["POS"], variable_name="variant_position"),
+            ]
+        )
+        colspecs.extend([spec_from_field(field) for field in icf.metadata.info_fields])

         gt_field = None
-        for field in icf.metadata.
-            if field.category == "fixed":
-                continue
+        for field in icf.metadata.format_fields:
             if field.name == "GT":
                 gt_field = field
                 continue
-
-            prefix = "variant_"
-            dimensions = ["variants"]
-            chunks = [variants_chunk_size]
-            if field.category == "FORMAT":
-                prefix = "call_"
-                shape.append(n)
-                chunks.append(samples_chunk_size)
-                dimensions.append("samples")
-            # TODO make an option to add in the empty extra dimension
-            if field.summary.max_number > 1:
-                shape.append(field.summary.max_number)
-            # TODO we should really be checking this to see if the named dimensions
-            # are actually correct.
-            if field.vcf_number == "R":
-                dimensions.append("alleles")
-            elif field.vcf_number == "A":
-                dimensions.append("alt_alleles")
-            elif field.vcf_number == "G":
-                dimensions.append("genotypes")
-            else:
-                dimensions.append(f"{field.category}_{field.name}_dim")
-            variable_name = prefix + field.name
-            colspec = ZarrColumnSpec.new(
-                vcf_field=field.full_name,
-                name=variable_name,
-                dtype=field.smallest_dtype(),
-                shape=shape,
-                chunks=chunks,
-                dimensions=dimensions,
-                description=field.description,
-            )
-            colspecs.append(colspec)
+            colspecs.append(spec_from_field(field))

         if gt_field is not None:
             ploidy = gt_field.summary.max_number - 1
             shape = [m, n]
             chunks = [variants_chunk_size, samples_chunk_size]
             dimensions = ["variants", "samples"]
-
             colspecs.append(
                 ZarrColumnSpec.new(
                     vcf_field=None,
@@ -1498,15 +1537,6 @@ class VcfZarr:
         return data


-@dataclasses.dataclass
-class EncodingWork:
-    func: callable = dataclasses.field(repr=False)
-    start: int
-    stop: int
-    columns: list[str]
-    memory: int = 0
-
-
 def parse_max_memory(max_memory):
     if max_memory is None:
         # Effectively unbounded
@@ -1517,67 +1547,299 @@ def parse_max_memory(max_memory):
     return max_memory


+@dataclasses.dataclass
+class VcfZarrPartition:
+    start_index: int
+    stop_index: int
+    start_chunk: int
+    stop_chunk: int
+
+    @staticmethod
+    def generate_partitions(num_records, chunk_size, num_partitions, max_chunks=None):
+        num_chunks = int(np.ceil(num_records / chunk_size))
+        if max_chunks is not None:
+            num_chunks = min(num_chunks, max_chunks)
+        partitions = []
+        splits = np.array_split(np.arange(num_chunks), min(num_partitions, num_chunks))
+        for chunk_slice in splits:
+            start_chunk = int(chunk_slice[0])
+            stop_chunk = int(chunk_slice[-1]) + 1
+            start_index = start_chunk * chunk_size
+            stop_index = min(stop_chunk * chunk_size, num_records)
+            partitions.append(
+                VcfZarrPartition(start_index, stop_index, start_chunk, stop_chunk)
+            )
+        return partitions
+
+
+VZW_METADATA_FORMAT_VERSION = "0.1"
+
+
+@dataclasses.dataclass
+class VcfZarrWriterMetadata:
+    format_version: str
+    icf_path: str
+    schema: VcfZarrSchema
+    dimension_separator: str
+    partitions: list
+    provenance: dict
+
+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    @staticmethod
+    def fromdict(d):
+        if d["format_version"] != VZW_METADATA_FORMAT_VERSION:
+            raise ValueError(
+                "VcfZarrWriter format version mismatch: "
+                f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
+            )
+        ret = VcfZarrWriterMetadata(**d)
+        ret.schema = VcfZarrSchema.fromdict(ret.schema)
+        ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
+        return ret
+
+
 class VcfZarrWriter:
-    def __init__(self, path
+    def __init__(self, path):
         self.path = pathlib.Path(path)
+        self.wip_path = self.path / "wip"
+        self.arrays_path = self.wip_path / "arrays"
+        self.partitions_path = self.wip_path / "partitions"
+        self.metadata = None
+        self.icf = None
+
+    @property
+    def schema(self):
+        return self.metadata.schema
+
+    @property
+    def num_partitions(self):
+        return len(self.metadata.partitions)
+
+    #######################
+    # init
+    #######################
+
+    def init(
+        self,
+        icf,
+        *,
+        target_num_partitions,
+        schema,
+        dimension_separator=None,
+        max_variant_chunks=None,
+    ):
         self.icf = icf
-        self.
+        if self.path.exists():
+            raise ValueError("Zarr path already exists")  # NEEDS TEST
+        partitions = VcfZarrPartition.generate_partitions(
+            self.icf.num_records,
+            schema.variants_chunk_size,
+            target_num_partitions,
+            max_chunks=max_variant_chunks,
+        )
         # Default to using nested directories following the Zarr v3 default.
         # This seems to require version 2.17+ to work properly
-
+        dimension_separator = (
             "/" if dimension_separator is None else dimension_separator
         )
+        self.metadata = VcfZarrWriterMetadata(
+            format_version=VZW_METADATA_FORMAT_VERSION,
+            icf_path=str(self.icf.path),
+            schema=schema,
+            dimension_separator=dimension_separator,
+            partitions=partitions,
+            # Bare minimum here for provenance - see comments above
+            provenance={"source": f"bio2zarr-{provenance.__version__}"},
+        )
+
+        self.path.mkdir()
         store = zarr.DirectoryStore(self.path)
-
+        root = zarr.group(store=store)
+        root.attrs.update(
+            {
+                "vcf_zarr_version": "0.2",
+                "vcf_header": self.icf.vcf_header,
+                "source": f"bio2zarr-{provenance.__version__}",
+            }
+        )
+        # Doing this syncronously - this is fine surely
+        self.encode_samples(root)
+        self.encode_filter_id(root)
+        self.encode_contig_id(root)
+
+        self.wip_path.mkdir()
+        self.arrays_path.mkdir()
+        self.partitions_path.mkdir()
+        store = zarr.DirectoryStore(self.arrays_path)
+        root = zarr.group(store=store)
+
+        for column in self.schema.columns.values():
+            self.init_array(root, column, partitions[-1].stop_index)
+
+        logger.info("Writing WIP metadata")
+        with open(self.wip_path / "metadata.json", "w") as f:
+            json.dump(self.metadata.asdict(), f, indent=4)
+        return len(partitions)
+
+    def encode_samples(self, root):
+        if not np.array_equal(self.schema.sample_id, self.icf.metadata.samples):
+            raise ValueError(
+                "Subsetting or reordering samples not supported currently"
+            )  # NEEDS TEST
+        array = root.array(
+            "sample_id",
+            self.schema.sample_id,
+            dtype="str",
+            compressor=DEFAULT_ZARR_COMPRESSOR,
+            chunks=(self.schema.samples_chunk_size,),
+        )
+        array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
+        logger.debug("Samples done")

-    def
+    def encode_contig_id(self, root):
+        array = root.array(
+            "contig_id",
+            self.schema.contig_id,
+            dtype="str",
+            compressor=DEFAULT_ZARR_COMPRESSOR,
+        )
+        array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
+        if self.schema.contig_length is not None:
+            array = root.array(
+                "contig_length",
+                self.schema.contig_length,
+                dtype=np.int64,
+                compressor=DEFAULT_ZARR_COMPRESSOR,
+            )
+            array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
+
+    def encode_filter_id(self, root):
+        array = root.array(
+            "filter_id",
+            self.schema.filter_id,
+            dtype="str",
+            compressor=DEFAULT_ZARR_COMPRESSOR,
+        )
+        array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
+
+    def init_array(self, root, variable, variants_dim_size):
         object_codec = None
         if variable.dtype == "O":
             object_codec = numcodecs.VLenUTF8()
-
-
-
+        shape = list(variable.shape)
+        # Truncate the variants dimension is max_variant_chunks was specified
+        shape[0] = variants_dim_size
+        a = root.empty(
+            variable.name,
+            shape=shape,
             chunks=variable.chunks,
             dtype=variable.dtype,
             compressor=numcodecs.get_codec(variable.compressor),
             filters=[numcodecs.get_codec(filt) for filt in variable.filters],
             object_codec=object_codec,
-            dimension_separator=self.dimension_separator,
+            dimension_separator=self.metadata.dimension_separator,
+        )
+        a.attrs.update(
+            {
+                "description": variable.description,
+                # Dimension names are part of the spec in Zarr v3
+                "_ARRAY_DIMENSIONS": variable.dimensions,
+            }
         )
-
-        a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
+        logger.debug(f"Initialised {a}")

-
-
+    #######################
+    # encode_partition
+    #######################

-    def
-
-
+    def load_metadata(self):
+        if self.metadata is None:
+            with open(self.wip_path / "metadata.json") as f:
+                self.metadata = VcfZarrWriterMetadata.fromdict(json.load(f))
+            self.icf = IntermediateColumnarFormat(self.metadata.icf_path)
+
+    def partition_path(self, partition_index):
+        return self.partitions_path / f"p{partition_index}"
+
+    def wip_partition_array_path(self, partition_index, name):
+        return self.partition_path(partition_index) / f"wip_{name}"
+
+    def partition_array_path(self, partition_index, name):
+        return self.partition_path(partition_index) / name
+
+    def encode_partition(self, partition_index):
+        self.load_metadata()
+        partition_path = self.partition_path(partition_index)
+        partition_path.mkdir(exist_ok=True)
+        logger.info(f"Encoding partition {partition_index} to {partition_path}")
+
+        self.encode_alleles_partition(partition_index)
+        self.encode_id_partition(partition_index)
+        self.encode_filters_partition(partition_index)
+        self.encode_contig_partition(partition_index)
+        for col in self.schema.columns.values():
+            if col.vcf_field is not None:
+                self.encode_array_partition(col, partition_index)
+        if "call_genotype" in self.schema.columns:
+            self.encode_genotypes_partition(partition_index)
+
+    def init_partition_array(self, partition_index, name):
+        wip_path = self.wip_partition_array_path(partition_index, name)
+        # Create an empty array like the definition
+        src = self.arrays_path / name
+        # Overwrite any existing WIP files
+        shutil.copytree(src, wip_path, dirs_exist_ok=True)
+        array = zarr.open(wip_path)
+        logger.debug(f"Opened empty array {array} @ {wip_path}")
+        return array
+
+    def finalise_partition_array(self, partition_index, name):
+        wip_path = self.wip_partition_array_path(partition_index, name)
+        final_path = self.partition_array_path(partition_index, name)
+        if final_path.exists():
+            # NEEDS TEST
+            logger.warning(f"Removing existing {final_path}")
+            shutil.rmtree(final_path)
         # Atomic swap
-        os.rename(
-        logger.
+        os.rename(wip_path, final_path)
+        logger.debug(f"Encoded {name} partition {partition_index}")
+
+    def encode_array_partition(self, column, partition_index):
+        array = self.init_partition_array(partition_index, column.name)

-
+        partition = self.metadata.partitions[partition_index]
+        ba = core.BufferedArray(array, partition.start_index)
         source_col = self.icf.columns[column.vcf_field]
-        array = self.get_array(column.name)
-        ba = core.BufferedArray(array, start)
         sanitiser = source_col.sanitiser_factory(ba.buff.shape)

-        for value in source_col.iter_values(
+        for value in source_col.iter_values(
+            partition.start_index, partition.stop_index
+        ):
             # We write directly into the buffer in the sanitiser function
             # to make it easier to reason about dimension padding
             j = ba.next_buffer_row()
             sanitiser(ba.buff, j, value)
         ba.flush()
-
+        self.finalise_partition_array(partition_index, column.name)

-    def
-
-
-
-
+    def encode_genotypes_partition(self, partition_index):
+        gt_array = self.init_partition_array(partition_index, "call_genotype")
+        gt_mask_array = self.init_partition_array(partition_index, "call_genotype_mask")
+        gt_phased_array = self.init_partition_array(
+            partition_index, "call_genotype_phased"
+        )
+
+        partition = self.metadata.partitions[partition_index]
+        gt = core.BufferedArray(gt_array, partition.start_index)
+        gt_mask = core.BufferedArray(gt_mask_array, partition.start_index)
+        gt_phased = core.BufferedArray(gt_phased_array, partition.start_index)

-
+        source_col = self.icf.columns["FORMAT/GT"]
+        for value in source_col.iter_values(
+            partition.start_index, partition.stop_index
+        ):
             j = gt.next_buffer_row()
             sanitise_value_int_2d(gt.buff, j, value[:, :-1])
             j = gt_phased.next_buffer_row()
@@ -1589,29 +1851,40 @@ class VcfZarrWriter:
         gt.flush()
         gt_phased.flush()
         gt_mask.flush()
-        logger.debug(f"Encoded GT slice {start}:{stop}")

+        self.finalise_partition_array(partition_index, "call_genotype")
+        self.finalise_partition_array(partition_index, "call_genotype_mask")
+        self.finalise_partition_array(partition_index, "call_genotype_phased")
+
+    def encode_alleles_partition(self, partition_index):
+        array_name = "variant_allele"
+        alleles_array = self.init_partition_array(partition_index, array_name)
+        partition = self.metadata.partitions[partition_index]
+        alleles = core.BufferedArray(alleles_array, partition.start_index)
         ref_col = self.icf.columns["REF"]
         alt_col = self.icf.columns["ALT"]
-        alleles = core.BufferedArray(self.get_array("variant_allele"), start)

         for ref, alt in zip(
-            ref_col.iter_values(
+            ref_col.iter_values(partition.start_index, partition.stop_index),
+            alt_col.iter_values(partition.start_index, partition.stop_index),
         ):
             j = alleles.next_buffer_row()
             alleles.buff[j, :] = STR_FILL
             alleles.buff[j, 0] = ref[0]
             alleles.buff[j, 1 : 1 + len(alt)] = alt
         alleles.flush()
-        logger.debug(f"Encoded alleles slice {start}:{stop}")

+        self.finalise_partition_array(partition_index, array_name)
+
+    def encode_id_partition(self, partition_index):
+        vid_array = self.init_partition_array(partition_index, "variant_id")
+        vid_mask_array = self.init_partition_array(partition_index, "variant_id_mask")
+        partition = self.metadata.partitions[partition_index]
+        vid = core.BufferedArray(vid_array, partition.start_index)
+        vid_mask = core.BufferedArray(vid_mask_array, partition.start_index)
         col = self.icf.columns["ID"]
-        vid = core.BufferedArray(self.get_array("variant_id"), start)
-        vid_mask = core.BufferedArray(self.get_array("variant_id_mask"), start)

-        for value in col.iter_values(
+        for value in col.iter_values(partition.start_index, partition.stop_index):
             j = vid.next_buffer_row()
             k = vid_mask.next_buffer_row()
             assert j == k
@@ -1623,13 +1896,19 @@ class VcfZarrWriter:
                 vid_mask.buff[j] = True
         vid.flush()
         vid_mask.flush()
-        logger.debug(f"Encoded ID slice {start}:{stop}")

-
-
-        var_filter = core.BufferedArray(self.get_array("variant_filter"), start)
+        self.finalise_partition_array(partition_index, "variant_id")
+        self.finalise_partition_array(partition_index, "variant_id_mask")

-
+    def encode_filters_partition(self, partition_index):
+        lookup = {filt: index for index, filt in enumerate(self.schema.filter_id)}
+        array_name = "variant_filter"
+        array = self.init_partition_array(partition_index, array_name)
+        partition = self.metadata.partitions[partition_index]
+        var_filter = core.BufferedArray(array, partition.start_index)
+
+        col = self.icf.columns["FILTERS"]
+        for value in col.iter_values(partition.start_index, partition.stop_index):
             j = var_filter.next_buffer_row()
             var_filter.buff[j] = False
             for f in value:
@@ -1637,16 +1916,21 @@ class VcfZarrWriter:
                     var_filter.buff[j, lookup[f]] = True
                 except KeyError:
                     raise ValueError(
-                        f"Filter '{f}' was not defined
+                        f"Filter '{f}' was not defined in the header."
                     ) from None
         var_filter.flush()
-        logger.debug(f"Encoded FILTERS slice {start}:{stop}")

-
+        self.finalise_partition_array(partition_index, array_name)
+
+    def encode_contig_partition(self, partition_index):
+        lookup = {contig: index for index, contig in enumerate(self.schema.contig_id)}
+        array_name = "variant_contig"
+        array = self.init_partition_array(partition_index, array_name)
+        partition = self.metadata.partitions[partition_index]
+        contig = core.BufferedArray(array, partition.start_index)
         col = self.icf.columns["CHROM"]
-        contig = core.BufferedArray(self.get_array("variant_contig"), start)

-        for value in col.iter_values(
+        for value in col.iter_values(partition.start_index, partition.stop_index):
             j = contig.next_buffer_row()
             # Note: because we are using the indexes to define the lookups
             # and we always have an index, it seems that we the contig lookup
@@ -1654,161 +1938,120 @@ class VcfZarrWriter:
             # here, please do open an issue with a reproducible example!
             contig.buff[j] = lookup[value[0]]
         contig.flush()
-        logger.debug(f"Encoded CHROM slice {start}:{stop}")
-
-    def encode_samples(self):
-        if not np.array_equal(self.schema.sample_id, self.icf.metadata.samples):
-            raise ValueError(
-                "Subsetting or reordering samples not supported currently"
-            )  # NEEDS TEST
-        array = self.root.array(
-            "sample_id",
-            self.schema.sample_id,
-            dtype="str",
-            compressor=DEFAULT_ZARR_COMPRESSOR,
-            chunks=(self.schema.samples_chunk_size,),
-        )
-        array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
-        logger.debug("Samples done")

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self.finalise_partition_array(partition_index, array_name)
+
+    #######################
+    # finalise
+    #######################
+
+    def finalise_array(self, name):
+        logger.info(f"Finalising {name}")
+        final_path = self.path / name
+        if final_path.exists():
+            # NEEDS TEST
+            raise ValueError(f"Array {name} already exists")
+        for partition in range(len(self.metadata.partitions)):
+            # Move all the files in partition dir to dest dir
+            src = self.partition_array_path(partition, name)
+            if not src.exists():
+                # Needs test
+                raise ValueError(f"Partition {partition} of {name} does not exist")
+            dest = self.arrays_path / name
+            # This is Zarr v2 specific. Chunks in v3 with start with "c" prefix.
+            chunk_files = [
+                path for path in src.iterdir() if not path.name.startswith(".")
+            ]
+            # TODO check for a count of then number of files. If we require a
+            # dimension_separator of "/" then we could make stronger assertions
+            # here, as we'd always have num_variant_chunks
+            logger.debug(
+                f"Moving {len(chunk_files)} chunks for {name} partition {partition}"
             )
-
-
+            for chunk_file in chunk_files:
+                os.rename(chunk_file, dest / chunk_file.name)
+        # Finally, once all the chunks have moved into the arrays dir,
+        # we move it out of wip
+        os.rename(self.arrays_path / name, self.path / name)
+        core.update_progress(1)

-    def
-
-
-
-
-
+    def finalise(self, show_progress=False):
+        self.load_metadata()
+
+        progress_config = core.ProgressConfig(
+            total=len(self.schema.columns),
+            title="Finalise",
+            units="array",
+            show=show_progress,
         )
-
-
+        # NOTE: it's not clear that adding more workers will make this quicker,
+        # as it's just going to be causing contention on the file system.
+        # Something to check empirically in some deployments.
+        # FIXME we're just using worker_processes=0 here to hook into the
+        # SynchronousExecutor which is intended for testing purposes so
+        # that we get test coverage. Should fix this either by allowing
+        # for multiple workers, or making a standard wrapper for tqdm
+        # that allows us to have a consistent look and feel.
+        with core.ParallelWorkManager(0, progress_config) as pwm:
+            for name in self.schema.columns:
+                pwm.submit(self.finalise_array, name)
+        zarr.consolidate_metadata(self.path)

-
-
-
-        self.root.attrs["source"] = f"bio2zarr-{provenance.__version__}"
-        for column in self.schema.columns.values():
-            self.init_array(column)
+    ######################
+    # encode_all_partitions
+    ######################

-    def
-
+    def get_max_encoding_memory(self):
+        """
+        Return the approximate maximum memory used to encode a variant chunk.
+        """
+        max_encoding_mem = max(
+            col.variant_chunk_nbytes for col in self.schema.columns.values()
+        )
+        gt_mem = 0
+        if "call_genotype" in self.schema.columns:
+            encoded_together = [
+                "call_genotype",
+                "call_genotype_phased",
+                "call_genotype_mask",
+            ]
+            gt_mem = sum(
+                self.schema.columns[col].variant_chunk_nbytes
+                for col in encoded_together
+            )
+        return max(max_encoding_mem, gt_mem)

-    def
-        self,
-        worker_processes=1,
-        max_v_chunks=None,
-        show_progress=False,
-        max_memory=None,
+    def encode_all_partitions(
+        self, *, worker_processes=1, show_progress=False, max_memory=None
     ):
         max_memory = parse_max_memory(max_memory)
-
-
-
-
-
-
-
+        self.load_metadata()
+        num_partitions = self.num_partitions
+        per_worker_memory = self.get_max_encoding_memory()
+        logger.info(
+            f"Encoding Zarr over {num_partitions} partitions with "
+            f"{worker_processes} workers and {display_size(per_worker_memory)} "
+            "per worker"
        )
-
-
-
-
-
-
-
-        total_bytes = 0
-        encoding_memory_requirements = {}
-        for col in self.schema.columns.values():
-            array = self.get_array(col.name)
-            # NOTE!! this is bad, we're potentially creating quite a large
-            # numpy array for basically nothing. We can compute this.
-            variant_chunk_size = array.blocks[0].nbytes
-            encoding_memory_requirements[col.name] = variant_chunk_size
-            logger.debug(
-                f"{col.name} requires at least {display_size(variant_chunk_size)} "
-                f"per worker"
+        # Each partition requires per_worker_memory bytes, so to prevent more that
+        # max_memory being used, we clamp the number of workers
+        max_num_workers = max_memory // per_worker_memory
+        if max_num_workers < worker_processes:
+            logger.warning(
+                f"Limiting number of workers to {max_num_workers} to "
+                f"keep within specified memory budget of {display_size(max_memory)}"
            )
-
-
-
-
-
-        work = []
-        for start, stop in slices:
-            for col in self.schema.columns.values():
-                if col.vcf_field is not None:
-                    f = functools.partial(self.encode_array_slice, col)
-                    work.append(
-                        EncodingWork(
-                            f,
-                            start,
-                            stop,
-                            [col.name],
-                            encoding_memory_requirements[col.name],
-                        )
-                    )
-            work.append(
-                EncodingWork(self.encode_alleles_slice, start, stop, ["variant_allele"])
-            )
-            work.append(
-                EncodingWork(
-                    self.encode_id_slice, start, stop, ["variant_id", "variant_id_mask"]
-                )
-            )
-            work.append(
-                EncodingWork(
-                    functools.partial(self.encode_filters_slice, filter_id_map),
-                    start,
-                    stop,
-                    ["variant_filter"],
-                )
-            )
-            work.append(
-                EncodingWork(
-                    functools.partial(self.encode_contig_slice, contig_id_map),
-                    start,
-                    stop,
-                    ["variant_contig"],
-                )
+        if max_num_workers <= 0:
+            raise ValueError(
+                f"Insufficient memory to encode a partition:"
+                f"{display_size(per_worker_memory)} > {display_size(max_memory)}"
             )
-
-            variables = [
-                "call_genotype",
-                "call_genotype_phased",
-                "call_genotype_mask",
-            ]
-            gt_memory = sum(
-                encoding_memory_requirements[name] for name in variables
-            )
-            work.append(
-                EncodingWork(
-                    self.encode_genotypes_slice, start, stop, variables, gt_memory
-                )
-            )
+        num_workers = min(max_num_workers, worker_processes)

-
-        for
-
-
-            f"Insufficient memory for {wp.columns}: "
-            f"{display_size(wp.memory)} > {display_size(max_memory)}"
-        )
+        total_bytes = 0
+        for col in self.schema.columns.values():
+            # Open the array definition to get the total size
+            total_bytes += zarr.open(self.arrays_path / col.name).nbytes

         progress_config = core.ProgressConfig(
             total=total_bytes,
@@ -1816,54 +2059,9 @@ class VcfZarrWriter:
             units="B",
             show=show_progress,
         )
-
-
-
-        # below doesn't really work.
-        max_queued = 4 * max(1, worker_processes)
-        encoded_slices = collections.Counter()
-
-        with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
-            future = pwm.submit(self.encode_samples)
-            future_to_work = {future: EncodingWork(None, 0, 0, [])}
-
-            def service_completed_futures():
-                nonlocal used_memory
-
-                completed = pwm.wait_for_completed()
-                for future in completed:
-                    wp_done = future_to_work.pop(future)
-                    used_memory -= wp_done.memory
-                    logger.debug(
-                        f"Complete {wp_done}: used mem={display_size(used_memory)}"
-                    )
-                    for column in wp_done.columns:
-                        encoded_slices[column] += 1
-                        if encoded_slices[column] == len(slices):
-                            # Do this syncronously for simplicity. Should be
-                            # fine as the workers will probably be busy with
-                            # large encode tasks most of the time.
-                            self.finalise_array(column)
-
-            for wp in work:
-                while (
-                    used_memory + wp.memory > max_memory
-                    or len(future_to_work) > max_queued
-                ):
-                    logger.debug(
-                        f"Wait: mem_required={used_memory + wp.memory} "
-                        f"max_mem={max_memory} queued={len(future_to_work)} "
-                        f"max_queued={max_queued}"
-                    )
-                    service_completed_futures()
-                future = pwm.submit(wp.func, wp.start, wp.stop)
-                used_memory += wp.memory
-                logger.debug(f"Submit {wp}: used mem={display_size(used_memory)}")
-                future_to_work[future] = wp
-
-            logger.debug("All work submitted")
-            while len(future_to_work) > 0:
-                service_completed_futures()
+        with core.ParallelWorkManager(num_workers, progress_config) as pwm:
+            for partition_index in range(num_partitions):
+                pwm.submit(self.encode_partition, partition_index)


 def mkschema(if_path, out):
@@ -1878,13 +2076,48 @@ def encode(
     schema_path=None,
     variants_chunk_size=None,
     samples_chunk_size=None,
-
+    max_variant_chunks=None,
     dimension_separator=None,
     max_memory=None,
     worker_processes=1,
     show_progress=False,
 ):
-
+    # Rough heuristic to split work up enough to keep utilisation high
+    target_num_partitions = max(1, worker_processes * 4)
+    encode_init(
+        if_path,
+        zarr_path,
+        target_num_partitions,
+        schema_path=schema_path,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        max_variant_chunks=max_variant_chunks,
+        dimension_separator=dimension_separator,
+    )
+    vzw = VcfZarrWriter(zarr_path)
+    vzw.encode_all_partitions(
+        worker_processes=worker_processes,
+        show_progress=show_progress,
+        max_memory=max_memory,
+    )
+    vzw.finalise(show_progress)
+
+
+def encode_init(
+    icf_path,
+    zarr_path,
+    target_num_partitions,
+    *,
+    schema_path=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
+    max_variant_chunks=None,
+    dimension_separator=None,
+    max_memory=None,
+    worker_processes=1,
+    show_progress=False,
+):
+    icf = IntermediateColumnarFormat(icf_path)
     if schema_path is None:
         schema = VcfZarrSchema.generate(
             icf,
@@ -1900,18 +2133,25 @@ def encode(
         with open(schema_path) as f:
             schema = VcfZarrSchema.fromjson(f.read())
     zarr_path = pathlib.Path(zarr_path)
-
-
-
-
-
-
-
-        worker_processes=worker_processes,
-        max_memory=max_memory,
-        show_progress=show_progress,
+    vzw = VcfZarrWriter(zarr_path)
+    vzw.init(
+        icf,
+        target_num_partitions=target_num_partitions,
+        schema=schema,
+        dimension_separator=dimension_separator,
+        max_variant_chunks=max_variant_chunks,
     )
-    vzw.
+    return vzw.num_partitions, vzw.get_max_encoding_memory()
+
+
+def encode_partition(zarr_path, partition):
+    writer = VcfZarrWriter(zarr_path)
+    writer.encode_partition(partition)
+
+
+def encode_finalise(zarr_path, show_progress=False):
+    writer = VcfZarrWriter(zarr_path)
+    writer.finalise(show_progress=show_progress)


 def convert(
@@ -2121,7 +2361,7 @@ def validate(vcf_path, zarr_path, show_progress=False):
     assert pos[start_index] == first_pos
     vcf = cyvcf2.VCF(vcf_path)
     if show_progress:
-        iterator = tqdm.tqdm(vcf, desc="
+        iterator = tqdm.tqdm(vcf, desc=" Verify", total=vcf.num_records)  # NEEDS TEST
     else:
         iterator = vcf
     for j, row in enumerate(iterator, start_index):