bio2zarr 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.


bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE
 
- __version__ = version = '0.1.0'
- __version_tuple__ = version_tuple = (0, 1, 0)
+ __version__ = version = '0.1.2'
+ __version_tuple__ = version_tuple = (0, 1, 2)
bio2zarr/cli.py CHANGED
@@ -149,6 +149,13 @@ max_memory = click.option(
      help="An approximate bound on overall memory usage (e.g. 10G),",
  )
 
+ local_alleles = click.option(
+     "--local-alleles/--no-local-alleles",
+     show_default=True,
+     default=False,
+     help="Use local allele fields to reduce the storage requirements of the output.",
+ )
+
 
  def setup_logging(verbosity):
      level = "WARNING"
@@ -312,7 +319,7 @@ def dexplode_finalise(icf_path, verbose):
 
 
  @click.command
- @click.argument("path", type=click.Path())
+ @click.argument("path", type=click.Path(exists=True))
  @verbose
  def inspect(path, verbose):
      """
@@ -325,12 +332,26 @@ def inspect(path, verbose):
 
  @click.command
  @icf_path
- def mkschema(icf_path):
+ @variants_chunk_size
+ @samples_chunk_size
+ @local_alleles
+ def mkschema(icf_path, variants_chunk_size, samples_chunk_size, local_alleles):
      """
      Generate a schema for zarr encoding
      """
+     if local_alleles:
+         click.echo(
+             "WARNING: Local alleles support is preliminary; please use with caution.",
+             err=True,
+         )
      stream = click.get_text_stream("stdout")
-     vcf2zarr.mkschema(icf_path, stream)
+     vcf2zarr.mkschema(
+         icf_path,
+         stream,
+         variants_chunk_size=variants_chunk_size,
+         samples_chunk_size=samples_chunk_size,
+         local_alleles=local_alleles,
+     )
 
 
  @click.command
@@ -469,6 +490,7 @@ def dencode_finalise(zarr_path, verbose, progress):
  @verbose
  @progress
  @worker_processes
+ @local_alleles
  def convert_vcf(
      vcfs,
      zarr_path,
@@ -478,6 +500,7 @@ def convert_vcf(
      verbose,
      progress,
      worker_processes,
+     local_alleles,
  ):
      """
      Convert input VCF(s) directly to vcfzarr (not recommended for large files).
@@ -491,6 +514,7 @@ def convert_vcf(
          samples_chunk_size=samples_chunk_size,
          show_progress=progress,
          worker_processes=worker_processes,
+         local_alleles=local_alleles,
      )
 
 
@@ -560,7 +584,7 @@ plink2zarr.add_command(convert_plink)
 
  @click.command
  @version
- @click.argument("vcf_path", type=click.Path(exists=True, dir_okay=False))
+ @vcfs
  @verbose
  @num_partitions
  @click.option(
@@ -570,12 +594,16 @@ plink2zarr.add_command(convert_plink)
      default=None,
      help="Target (compressed) size of VCF partitions, e.g. 100KB, 10MiB, 1G.",
  )
- def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
+ def vcfpartition(vcfs, verbose, num_partitions, partition_size):
      """
-     Output bcftools region strings that partition an indexed VCF/BCF file
+     Output bcftools region strings that partition the indexed VCF/BCF files
      into either an approximate number of parts (-n), or parts of approximately
      a given size (-s). One of -n or -s must be supplied.
 
+     If multiple VCF/BCF files are provided, the number of parts (-n) is
+     interpreted as the total number of partitions across all the files,
+     and the partitions are distributed evenly among the files.
+
      Note that both the number of partitions and sizes are a target, and the
      returned number of partitions may not exactly correspond. In particular,
      there is a maximum level of granularity determined by the associated index
@@ -590,9 +618,15 @@ def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
              "Either --num-partitions or --partition-size must be specified"
          )
 
-     indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
-     regions = indexed_vcf.partition_into_regions(
-         num_parts=num_partitions, target_part_size=partition_size
-     )
-     for region in regions:
-         click.echo(f"{region}\t{vcf_path}")
+     if num_partitions is None:
+         num_parts_per_path = None
+     else:
+         num_parts_per_path = max(1, num_partitions // len(vcfs))
+
+     for vcf_path in vcfs:
+         indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
+         regions = indexed_vcf.partition_into_regions(
+             num_parts=num_parts_per_path, target_part_size=partition_size
+         )
+         for region in regions:
+             click.echo(f"{region}\t{vcf_path}")
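
Editor's note: the multi-file budget split in vcfpartition above is plain integer division, which is one reason the docstring stresses that the partition count is a target rather than an exact result. A minimal sketch of the arithmetic (the file names and -n value are made-up examples):

    # Sketch of the -n budget split in vcfpartition; file names are hypothetical.
    vcfs = ["a.vcf.gz", "b.vcf.gz", "c.vcf.gz"]
    num_partitions = 10

    # Each file gets an equal integer share, but never less than one partition.
    num_parts_per_path = max(1, num_partitions // len(vcfs))
    print(num_parts_per_path)  # 3 -> roughly 9 partitions across all files
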
bio2zarr/core.py CHANGED
@@ -63,6 +63,27 @@ def chunk_aligned_slices(z, n, max_chunks=None):
      return slices
 
 
+ def first_dim_slice_iter(z, start, stop):
+     """
+     Efficiently iterate over the specified slice of the first dimension of the zarr
+     array z.
+     """
+     chunk_size = z.chunks[0]
+     first_chunk = start // chunk_size
+     last_chunk = (stop // chunk_size) + (stop % chunk_size != 0)
+     for chunk in range(first_chunk, last_chunk):
+         Z = z.blocks[chunk]
+         chunk_start = chunk * chunk_size
+         chunk_stop = chunk_start + chunk_size
+         slice_start = None
+         if start > chunk_start:
+             slice_start = start - chunk_start
+         slice_stop = None
+         if stop < chunk_stop:
+             slice_stop = stop - chunk_start
+         yield from Z[slice_start:slice_stop]
+
+
  def du(path):
      """
      Return the total bytes stored at this path.
@@ -113,13 +134,16 @@ def cancel_futures(futures):
  class BufferedArray:
      array: zarr.Array
      array_offset: int
+     name: str
      buff: np.ndarray
      buffer_row: int
+     max_buff_size: int = 0
 
-     def __init__(self, array, offset):
+     def __init__(self, array, offset, name="Unknown"):
          self.array = array
          self.array_offset = offset
          assert offset % array.chunks[0] == 0
+         self.name = name
          dims = list(array.shape)
          dims[0] = min(array.chunks[0], array.shape[0])
          self.buff = np.empty(dims, dtype=array.dtype)
@@ -150,11 +174,17 @@ class BufferedArray:
              self.buff[: self.buffer_row], self.array, self.array_offset
          )
          logger.debug(
-             f"Flushed <{self.array.name} {self.array.shape} "
+             f"Flushed <{self.name} {self.array.shape} "
              f"{self.array.dtype}> "
              f"{self.array_offset}:{self.array_offset + self.buffer_row}"
              f"{self.buff.nbytes / 2**20: .2f}Mb"
          )
+         # Note this is inaccurate for string data as we're just reporting the
+         # size of the container. When we switch to the numpy 2 StringDtype this
+         # should improve and we can get more visibility on how memory
+         # is being used.
+         # https://github.com/sgkit-dev/bio2zarr/issues/30
+         self.max_buff_size = max(self.max_buff_size, self.buff.nbytes)
          self.array_offset += self.variants_chunk_size
          self.buffer_row = 0
 
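
Editor's note: a minimal usage sketch of the new first_dim_slice_iter helper, assuming a zarr-python version that provides the blocks accessor the implementation relies on; the array contents are a made-up example. Only the chunks overlapping the requested range are fetched, one block at a time:

    import numpy as np
    import zarr

    from bio2zarr.core import first_dim_slice_iter

    # 10 values stored in chunks of 2, so the range [3, 8) touches
    # chunks 1, 2 and 3 only; chunks 0 and 4 are never read.
    z = zarr.array(np.arange(10), chunks=(2,))
    assert list(first_dim_slice_iter(z, 3, 8)) == [3, 4, 5, 6, 7]
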
bio2zarr/plink.py CHANGED
@@ -6,6 +6,8 @@ import numcodecs
  import numpy as np
  import zarr
 
+ from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
+
  from . import core
 
  logger = logging.getLogger(__name__)
@@ -17,8 +19,7 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
      # the correct approach is, but it is important to note that the
      # 0th allele is *not* necessarily the REF for these datasets.
      bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
-     store = zarr.DirectoryStore(zarr_path)
-     root = zarr.group(store=store)
+     root = zarr.open(store=zarr_path, mode="a", **ZARR_FORMAT_KWARGS)
      gt = core.BufferedArray(root["call_genotype"], start)
      gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
      gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
@@ -73,8 +74,7 @@ def convert(
      if variants_chunk_size is None:
          variants_chunk_size = 10_000
 
-     store = zarr.DirectoryStore(zarr_path)
-     root = zarr.group(store=store, overwrite=True)
+     root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS)
 
      ploidy = 2
      shape = [m, n]
@@ -88,7 +88,8 @@ def convert(
 
      a = root.array(
          "sample_id",
-         bed.iid,
+         data=bed.iid,
+         shape=bed.iid.shape,
          dtype="str",
          compressor=default_compressor,
          chunks=(samples_chunk_size,),
@@ -100,7 +101,8 @@ def convert(
      # fetching repeatedly from bim file
      a = root.array(
          "variant_position",
-         bed.bp_position,
+         data=bed.bp_position,
+         shape=bed.bp_position.shape,
          dtype=np.int32,
          compressor=default_compressor,
          chunks=(variants_chunk_size,),
@@ -111,41 +113,45 @@ def convert(
      alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
      a = root.array(
          "variant_allele",
-         alleles,
+         data=alleles,
+         shape=alleles.shape,
          dtype="str",
          compressor=default_compressor,
-         chunks=(variants_chunk_size,),
+         chunks=(variants_chunk_size, alleles.shape[1]),
      )
      a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
      logger.debug("encoded variant_allele")
 
      # TODO remove this?
      a = root.empty(
-         "call_genotype_phased",
+         name="call_genotype_phased",
          dtype="bool",
          shape=list(shape),
          chunks=list(chunks),
          compressor=default_compressor,
+         **ZARR_FORMAT_KWARGS,
      )
      a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
      shape += [ploidy]
      dimensions += ["ploidy"]
      a = root.empty(
-         "call_genotype",
+         name="call_genotype",
          dtype="i1",
          shape=list(shape),
          chunks=list(chunks),
          compressor=default_compressor,
+         **ZARR_FORMAT_KWARGS,
      )
      a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
      a = root.empty(
-         "call_genotype_mask",
+         name="call_genotype_mask",
          dtype="bool",
          shape=list(shape),
          chunks=list(chunks),
          compressor=default_compressor,
+         **ZARR_FORMAT_KWARGS,
      )
      a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
@@ -154,7 +160,7 @@ def convert(
      num_slices = max(1, worker_processes * 4)
      slices = core.chunk_aligned_slices(a, num_slices)
 
-     total_chunks = sum(a.nchunks for a in root.values())
+     total_chunks = sum(a.nchunks for _, a in root.arrays())
 
      progress_config = core.ProgressConfig(
          total=total_chunks, title="Convert", units="chunks", show=show_progress
@@ -171,8 +177,7 @@ def convert(
  # FIXME do this more efficiently - currently reading the whole thing
  # in for convenience, and also comparing call-by-call
  def validate(bed_path, zarr_path):
-     store = zarr.DirectoryStore(zarr_path)
-     root = zarr.group(store=store)
+     root = zarr.open(store=zarr_path, mode="r")
      call_genotype = root["call_genotype"][:]
 
      bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)
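
Editor's note: the ZARR_FORMAT_KWARGS constant threaded through these changes is not shown in the diff; presumably it pins the on-disk format when running under zarr-python 3, whose open/create calls accept a zarr_format argument. A rough sketch of the idea, where the dict contents are an assumption rather than the actual bio2zarr.zarr_utils code:

    import zarr

    # Assumed shape of bio2zarr.zarr_utils.ZARR_FORMAT_KWARGS: pin the v2
    # on-disk format under zarr-python 3; zarr-python 2 only writes v2 and
    # accepts no such argument, so the dict is empty there.
    ZARR_FORMAT_KWARGS = (
        {"zarr_format": 2} if zarr.__version__.startswith("3") else {}
    )

    root = zarr.open_group(store="example.zarr", mode="w", **ZARR_FORMAT_KWARGS)
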
bio2zarr/vcf2zarr/icf.py CHANGED
@@ -110,7 +110,7 @@ class VcfPartition:
      num_records: int = -1
 
 
- ICF_METADATA_FORMAT_VERSION = "0.3"
+ ICF_METADATA_FORMAT_VERSION = "0.4"
  ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
      cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
  )
@@ -212,6 +212,7 @@ def fixed_vcf_field_definitions():
          make_field_def("FILTERS", "String", "."),
          make_field_def("REF", "String", "1"),
          make_field_def("ALT", "String", "."),
+         make_field_def("rlen", "Integer", "1"),  # computed field
      ]
      return fields
 
@@ -240,7 +241,7 @@ def scan_vcf(path, target_num_partitions):
      for h in vcf.header_iter():
          if h["HeaderType"] in ["INFO", "FORMAT"]:
              field = VcfField.from_header(h)
-             if field.name == "GT":
+             if h["HeaderType"] == "FORMAT" and field.name == "GT":
                  field.vcf_type = "Integer"
                  field.vcf_number = "."
              fields.append(field)
@@ -300,7 +301,11 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
      )
      with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
          for path in paths:
-             pwm.submit(scan_vcf, path, max(1, target_num_partitions // len(paths)))
+             pwm.submit(
+                 scan_vcf,
+                 path,
+                 max(1, target_num_partitions // len(paths)),
+             )
      results = list(pwm.results_as_completed())
 
      # Sort to make the ordering deterministic
@@ -408,7 +413,7 @@ def sanitise_value_float_1d(buff, j, value):
      if value is None:
          buff[j] = constants.FLOAT32_MISSING
      else:
-         value = np.array(value, ndmin=1, dtype=buff.dtype, copy=False)
+         value = np.array(value, ndmin=1, dtype=buff.dtype, copy=True)
          # numpy will map None values to Nan, but we need a
          # specific NaN
          value[np.isnan(value)] = constants.FLOAT32_MISSING
@@ -422,7 +427,7 @@ def sanitise_value_float_2d(buff, j, value):
          buff[j] = constants.FLOAT32_MISSING
      else:
          # print("value = ", value)
-         value = np.array(value, ndmin=2, dtype=buff.dtype, copy=False)
+         value = np.array(value, ndmin=2, dtype=buff.dtype, copy=True)
          buff[j] = constants.FLOAT32_FILL
          buff[j, :, : value.shape[1]] = value
 
@@ -432,7 +437,7 @@ def sanitise_int_array(value, ndmin, dtype):
      value = [
          constants.VCF_INT_MISSING if x is None else x for x in value
      ]  # NEEDS TEST
-     value = np.array(value, ndmin=ndmin, copy=False)
+     value = np.array(value, ndmin=ndmin, copy=True)
      value[value == constants.VCF_INT_MISSING] = -1
      value[value == constants.VCF_INT_FILL] = -2
      # TODO watch out for clipping here!
@@ -494,15 +499,15 @@ class VcfValueTransformer:
      def transform(self, vcf_value):
          if isinstance(vcf_value, tuple):
              vcf_value = [self.missing if v is None else v for v in vcf_value]
-         value = np.array(vcf_value, ndmin=self.dimension, copy=False)
+         value = np.array(vcf_value, ndmin=self.dimension, copy=True)
          return value
 
      def transform_and_update_bounds(self, vcf_value):
          if vcf_value is None:
              return None
+         # print(self, self.field.full_name, "T", vcf_value)
          value = self.transform(vcf_value)
          self.update_bounds(value)
-         # print(self.field.full_name, "T", vcf_value, "->", value)
          return value
 
 
@@ -531,13 +536,15 @@ class FloatValueTransformer(VcfValueTransformer):
  class StringValueTransformer(VcfValueTransformer):
      def update_bounds(self, value):
          summary = self.field.summary
-         number = value.shape[-1]
+         if self.field.category == "FORMAT":
+             number = max(len(v) for v in value)
+         else:
+             number = value.shape[-1]
          # TODO would be nice to report string lengths, but not
          # really necessary.
          summary.max_number = max(summary.max_number, number)
 
      def transform(self, vcf_value):
-         # print("transform", vcf_value)
          if self.dimension == 1:
              value = np.array(list(vcf_value.split(",")))
          else:
@@ -853,11 +860,11 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
 
      def summary_table(self):
          data = []
-         for name, col in self.fields.items():
-             summary = col.vcf_field.summary
+         for name, icf_field in self.fields.items():
+             summary = icf_field.vcf_field.summary
              d = {
                  "name": name,
-                 "type": col.vcf_field.vcf_type,
+                 "type": icf_field.vcf_field.vcf_type,
                  "chunks": summary.num_chunks,
                  "size": core.display_size(summary.uncompressed_size),
                  "compressed": core.display_size(summary.compressed_size),
@@ -962,7 +969,7 @@ class IntermediateColumnarFormatWriter:
          compressor=None,
      ):
          if self.path.exists():
-             raise ValueError("ICF path already exists")
+             raise ValueError(f"ICF path already exists: {self.path}")
          if compressor is None:
              compressor = ICF_DEFAULT_COMPRESSOR
          vcfs = [pathlib.Path(vcf) for vcf in vcfs]
@@ -1009,8 +1016,8 @@ class IntermediateColumnarFormatWriter:
          self.path.mkdir()
          self.wip_path.mkdir()
          for field in self.metadata.fields:
-             col_path = get_vcf_field_path(self.path, field)
-             col_path.mkdir(parents=True)
+             field_path = get_vcf_field_path(self.path, field)
+             field_path.mkdir(parents=True)
 
      def load_partition_summaries(self):
          summaries = []
@@ -1074,13 +1081,19 @@ class IntermediateColumnarFormatWriter:
              tcw.append("FILTERS", variant.FILTERS)
              tcw.append("REF", variant.REF)
              tcw.append("ALT", variant.ALT)
+             tcw.append("rlen", variant.end - variant.start)
              for field in info_fields:
                  tcw.append(field.full_name, variant.INFO.get(field.name, None))
              if has_gt:
-                 tcw.append("FORMAT/GT", variant.genotype.array())
+                 if variant.genotype is None:
+                     val = None
+                 else:
+                     val = variant.genotype.array()
+                 tcw.append("FORMAT/GT", val)
              for field in format_fields:
                  val = variant.format(field.name)
                  tcw.append(field.full_name, val)
+
              # Note: an issue with updating the progress per variant here like
              # this is that we get a significant pause at the end of the counter
              # while all the "small" fields get flushed. Possibly not much to be
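
Editor's note: the new rlen entry above is a computed field rather than one read from a VCF tag. Assuming cyvcf2's 0-based, half-open variant.start/variant.end coordinates, end - start is the record's length on the reference. A worked example with made-up coordinates:

    # Hypothetical 4 bp deletion at POS=100: REF="ACGT", ALT="A".
    # cyvcf2 reports 0-based, half-open coordinates for the REF allele.
    start, end = 99, 103
    rlen = end - start  # 4 == len("ACGT")
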