PyPI - bio2zarr - Versions diffs - 0.0.5__py3-none-any.whl → 0.0.9__py3-none-any.whl - Mend

bio2zarr 0.0.5__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

bio2zarr/_version.py +2 -2
bio2zarr/cli.py +126 -25
bio2zarr/core.py +31 -3
bio2zarr/vcf.py +754 -475
bio2zarr/vcf_utils.py +25 -16
bio2zarr-0.0.9.dist-info/METADATA +363 -0
bio2zarr-0.0.9.dist-info/RECORD +16 -0
bio2zarr-0.0.5.dist-info/METADATA +0 -33
bio2zarr-0.0.5.dist-info/RECORD +0 -16
{bio2zarr-0.0.5.dist-info → bio2zarr-0.0.9.dist-info}/LICENSE +0 -0
{bio2zarr-0.0.5.dist-info → bio2zarr-0.0.9.dist-info}/WHEEL +0 -0
{bio2zarr-0.0.5.dist-info → bio2zarr-0.0.9.dist-info}/entry_points.txt +0 -0
{bio2zarr-0.0.5.dist-info → bio2zarr-0.0.9.dist-info}/top_level.txt +0 -0

bio2zarr/_version.py CHANGED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.0.5'
-__version_tuple__ = version_tuple = (0, 0, 5)
+__version__ = version = '0.0.9'
+__version_tuple__ = version_tuple = (0, 0, 9)

bio2zarr/cli.py CHANGED Viewed

@@ -5,6 +5,7 @@ import shutil
 import click
 import coloredlogs
+import humanfriendly
 import numcodecs
 import tabulate
@@ -39,6 +40,14 @@ new_zarr_path = click.argument(
     "zarr_path", type=click.Path(file_okay=False, dir_okay=True)
 )
+zarr_path = click.argument(
+    "zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
+)
+num_partitions = click.argument("num_partitions", type=click.IntRange(min=1))
+partition = click.argument("partition", type=click.IntRange(min=0))
 verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
 force = click.option(
@@ -92,6 +101,27 @@ samples_chunk_size = click.option(
     help="Chunk size in the samples dimension",
 )
+schema = click.option("-s", "--schema", default=None, type=click.Path(exists=True))
+max_variant_chunks = click.option(
+    "-V",
+    "--max-variant-chunks",
+    type=int,
+    default=None,
+    help=(
+        "Truncate the output in the variants dimension to have "
+        "this number of chunks. Mainly intended to help with "
+        "schema tuning."
+    ),
+)
+max_memory = click.option(
+    "-M",
+    "--max-memory",
+    default=None,
+    help="An approximate bound on overall memory usage (e.g. 10G),",
+)
 def setup_logging(verbosity):
     level = "WARNING"
@@ -158,7 +188,7 @@ def explode(
 @click.command
 @vcfs
 @new_icf_path
-@click.argument("num_partitions", type=click.IntRange(min=1))
+@num_partitions
 @force
 @column_chunk_size
 @compressor
@@ -194,7 +224,7 @@ def dexplode_init(
 @click.command
 @icf_path
-@click.argument("partition", type=click.IntRange(min=0))
+@partition
 @verbose
 def dexplode_partition(icf_path, partition, verbose):
     """
@@ -203,18 +233,18 @@ def dexplode_partition(icf_path, partition, verbose):
     from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
     """
     setup_logging(verbose)
-    vcf.explode_partition(icf_path, partition, show_progress=False)
+    vcf.explode_partition(icf_path, partition)
 @click.command
-@click.argument("path", type=click.Path(), required=True)
+@icf_path
 @verbose
-def dexplode_finalise(path, verbose):
+def dexplode_finalise(icf_path, verbose):
     """
     Final step for distributed conversion of VCF(s) to intermediate columnar format.
     """
     setup_logging(verbose)
-    vcf.explode_finalise(path)
+    vcf.explode_finalise(icf_path)
 @click.command
@@ -244,26 +274,11 @@ def mkschema(icf_path):
 @new_zarr_path
 @force
 @verbose
-@click.option("-s", "--schema", default=None, type=click.Path(exists=True))
+@schema
 @variants_chunk_size
 @samples_chunk_size
-@click.option(
-    "-V",
-    "--max-variant-chunks",
-    type=int,
-    default=None,
-    help=(
-        "Truncate the output in the variants dimension to have "
-        "this number of chunks. Mainly intended to help with "
-        "schema tuning."
-    ),
-)
-@click.option(
-    "-M",
-    "--max-memory",
-    default=None,
-    help="An approximate bound on overall memory usage (e.g. 10G),",
-)
+@max_variant_chunks
+@max_memory
 @worker_processes
 def encode(
     icf_path,
@@ -288,13 +303,96 @@ def encode(
         schema_path=schema,
         variants_chunk_size=variants_chunk_size,
         samples_chunk_size=samples_chunk_size,
-        max_v_chunks=max_variant_chunks,
+        max_variant_chunks=max_variant_chunks,
         worker_processes=worker_processes,
         max_memory=max_memory,
         show_progress=True,
     )
+@click.command
+@icf_path
+@new_zarr_path
+@num_partitions
+@force
+@schema
+@variants_chunk_size
+@samples_chunk_size
+@max_variant_chunks
+@verbose
+def dencode_init(
+    icf_path,
+    zarr_path,
+    num_partitions,
+    force,
+    schema,
+    variants_chunk_size,
+    samples_chunk_size,
+    max_variant_chunks,
+    verbose,
+):
+    """
+    Initialise conversion of intermediate format to VCF Zarr. This will
+    set up the specified ZARR_PATH to perform this conversion over
+    NUM_PARTITIONS.
+    The output of this commmand is the actual number of partitions generated
+    (which may be less then the requested number, if there is not sufficient
+    chunks in the variants dimension) and a rough lower-bound on the amount
+    of memory required to encode a partition.
+    NOTE: the format of this output will likely change in subsequent releases;
+    it should not be considered machine-readable for now.
+    """
+    setup_logging(verbose)
+    check_overwrite_dir(zarr_path, force)
+    num_partitions, max_memory = vcf.encode_init(
+        icf_path,
+        zarr_path,
+        target_num_partitions=num_partitions,
+        schema_path=schema,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        max_variant_chunks=max_variant_chunks,
+        show_progress=True,
+    )
+    formatted_size = humanfriendly.format_size(max_memory, binary=True)
+    # NOTE adding the size to the stdout here so that users can parse it
+    # and use in their submission scripts. This is a first pass, and
+    # will most likely change as we see what works and doesn't.
+    # NOTE we probably want to format this as a table, which lists
+    # some other properties, line by line
+    # NOTE This size number is also not quite enough, you need a bit of
+    # headroom with it (probably 10% or so). We should include this.
+    click.echo(f"{num_partitions}\t{formatted_size}")
+@click.command
+@zarr_path
+@partition
+@verbose
+def dencode_partition(zarr_path, partition, verbose):
+    """
+    Convert a partition from intermediate columnar format to VCF Zarr.
+    Must be called *after* the Zarr path has been initialised with dencode_init.
+    Partition indexes must be from 0 (inclusive) to the number of paritions
+    returned by dencode_init (exclusive).
+    """
+    setup_logging(verbose)
+    vcf.encode_partition(zarr_path, partition)
+@click.command
+@zarr_path
+@verbose
+def dencode_finalise(zarr_path, verbose):
+    """
+    Final step for distributed conversion of ICF to VCF Zarr.
+    """
+    setup_logging(verbose)
+    vcf.encode_finalise(zarr_path, show_progress=True)
 @click.command(name="convert")
 @vcfs
 @new_zarr_path
@@ -382,6 +480,9 @@ vcf2zarr.add_command(encode)
 vcf2zarr.add_command(dexplode_init)
 vcf2zarr.add_command(dexplode_partition)
 vcf2zarr.add_command(dexplode_finalise)
+vcf2zarr.add_command(dencode_init)
+vcf2zarr.add_command(dencode_partition)
+vcf2zarr.add_command(dencode_finalise)
 @click.command(name="convert")

bio2zarr/core.py CHANGED Viewed

@@ -3,6 +3,8 @@ import contextlib
 import dataclasses
 import logging
 import multiprocessing
+import os
+import os.path
 import threading
 import time
@@ -16,6 +18,16 @@ logger = logging.getLogger(__name__)
 numcodecs.blosc.use_threads = False
+def min_int_dtype(min_value, max_value):
+    if min_value > max_value:
+        raise ValueError("min_value must be <= max_value")
+    for a_dtype in ["i1", "i2", "i4", "i8"]:
+        info = np.iinfo(a_dtype)
+        if info.min <= min_value and max_value <= info.max:
+            return a_dtype
+    raise OverflowError("Integer cannot be represented")
 def chunk_aligned_slices(z, n, max_chunks=None):
     """
     Returns at n slices in the specified zarr array, aligned
@@ -35,6 +47,22 @@ def chunk_aligned_slices(z, n, max_chunks=None):
     return slices
+def du(path):
+    """
+    Return the total bytes stored at this path.
+    """
+    total = os.path.getsize(path)
+    # pathlib walk method doesn't exist until 3.12 :(
+    for root, dirs, files in os.walk(path):
+        for lst in [dirs, files]:
+            for name in lst:
+                fullname = os.path.join(root, name)
+                size = os.path.getsize(fullname)
+                total += size
+    logger.debug(f"du({path}) = {total}")
+    return total
 class SynchronousExecutor(cf.Executor):
     def submit(self, fn, /, *args, **kwargs):
         future = cf.Future()
@@ -100,6 +128,7 @@ class BufferedArray:
                 sync_flush_2d_array(
                     self.buff[: self.buffer_row], self.array, self.array_offset
                 )
+            # FIXME the array.name doesn't seem to be working here for some reason
             logger.debug(
                 f"Flushed <{self.array.name} {self.array.shape} "
                 f"{self.array.dtype}> "
@@ -121,8 +150,7 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
     # encoder implementations.
     s = slice(offset, offset + np_buffer.shape[0])
     samples_chunk_size = zarr_array.chunks[1]
-    # TODO use zarr chunks here to support non-uniform chunking later
-    # and for simplicity
+    # TODO use zarr chunks here for simplicity
     zarr_array_width = zarr_array.shape[1]
     start = 0
     while start < zarr_array_width:
@@ -182,7 +210,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         self.progress_config = progress_config
         self.progress_bar = tqdm.tqdm(
             total=progress_config.total,
-            desc=f"{progress_config.title:>7}",
+            desc=f"{progress_config.title:>8}",
             unit_scale=True,
             unit=progress_config.units,
             smoothing=0.1,

bio2zarr 0.0.5__py3-none-any.whl → 0.0.9__py3-none-any.whl

bio2zarr 0.0.5__py3-none-any.whl → 0.0.9__py3-none-any.whl