PyPI - bio2zarr - Versions diffs - 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl - Mend

bio2zarr 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of bio2zarr might be problematic. Click here for more details.

Files changed (16)

bio2zarr/__init__.py +1 -1
bio2zarr/__main__.py +2 -0
bio2zarr/_version.py +2 -2
bio2zarr/cli.py +129 -32
bio2zarr/core.py +18 -9
bio2zarr/plink.py +6 -8
bio2zarr/typing.py +1 -1
bio2zarr/vcf.py +642 -386
bio2zarr/vcf_utils.py +26 -8
{bio2zarr-0.0.4.dist-info → bio2zarr-0.0.6.dist-info}/METADATA +1 -1
bio2zarr-0.0.6.dist-info/RECORD +16 -0
bio2zarr-0.0.4.dist-info/RECORD +0 -16
{bio2zarr-0.0.4.dist-info → bio2zarr-0.0.6.dist-info}/LICENSE +0 -0
{bio2zarr-0.0.4.dist-info → bio2zarr-0.0.6.dist-info}/WHEEL +0 -0
{bio2zarr-0.0.4.dist-info → bio2zarr-0.0.6.dist-info}/entry_points.txt +0 -0
{bio2zarr-0.0.4.dist-info → bio2zarr-0.0.6.dist-info}/top_level.txt +0 -0

bio2zarr/__init__.py CHANGED Viewed

	@@ -1 +1 @@
1	- from . provenance import __version__
1	+ from .provenance import __version__ # noqa F401

bio2zarr/__main__.py CHANGED Viewed

@@ -2,11 +2,13 @@ import click
 from . import cli
 @cli.version
 @click.group()
 def bio2zarr():
     pass
 # Provide a single top-level interface to all of the functionality.
 # This probably isn't the recommended way of interacting, as we
 # install individual commands as console scripts. However, this

bio2zarr/_version.py CHANGED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.0.4'
-__version_tuple__ = version_tuple = (0, 0, 4)
+__version__ = version = '0.0.6'
+__version_tuple__ = version_tuple = (0, 0, 6)

bio2zarr/cli.py CHANGED Viewed

@@ -4,15 +4,12 @@ import pathlib
 import shutil
 import click
-import tabulate
 import coloredlogs
+import humanfriendly
 import numcodecs
+import tabulate
-from . import vcf
-from . import vcf_utils
-from . import plink
-from . import provenance
+from . import plink, provenance, vcf, vcf_utils
 logger = logging.getLogger(__name__)
@@ -43,6 +40,14 @@ new_zarr_path = click.argument(
     "zarr_path", type=click.Path(file_okay=False, dir_okay=True)
 )
+zarr_path = click.argument(
+    "zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
+)
+num_partitions = click.argument("num_partitions", type=click.IntRange(min=1))
+partition = click.argument("partition", type=click.IntRange(min=0))
 verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
 force = click.option(
@@ -75,7 +80,7 @@ compressor = click.option(
     "--compressor",
     type=click.Choice(["lz4", "zstd"]),
     default=None,
-    help="Codec to use for compressing column chunks (Default=zstd)."
+    help="Codec to use for compressing column chunks (Default=zstd).",
 )
 # Note: -l and -w were chosen when these were called "width" and "length".
@@ -96,6 +101,27 @@ samples_chunk_size = click.option(
     help="Chunk size in the samples dimension",
 )
+schema = click.option("-s", "--schema", default=None, type=click.Path(exists=True))
+max_variant_chunks = click.option(
+    "-V",
+    "--max-variant-chunks",
+    type=int,
+    default=None,
+    help=(
+        "Truncate the output in the variants dimension to have "
+        "this number of chunks. Mainly intended to help with "
+        "schema tuning."
+    ),
+)
+max_memory = click.option(
+    "-M",
+    "--max-memory",
+    default=None,
+    help="An approximate bound on overall memory usage (e.g. 10G),",
+)
 def setup_logging(verbosity):
     level = "WARNING"
@@ -162,7 +188,7 @@ def explode(
 @click.command
 @vcfs
 @new_icf_path
-@click.argument("num_partitions", type=click.IntRange(min=1))
+@num_partitions
 @force
 @column_chunk_size
 @compressor
@@ -198,7 +224,7 @@ def dexplode_init(
 @click.command
 @icf_path
-@click.argument("partition", type=click.IntRange(min=0))
+@partition
 @verbose
 def dexplode_partition(icf_path, partition, verbose):
     """
@@ -207,18 +233,18 @@ def dexplode_partition(icf_path, partition, verbose):
     from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
     """
     setup_logging(verbose)
-    vcf.explode_partition(icf_path, partition, show_progress=True)
+    vcf.explode_partition(icf_path, partition, show_progress=False)
 @click.command
-@click.argument("path", type=click.Path(), required=True)
+@icf_path
 @verbose
-def dexplode_finalise(path, verbose):
+def dexplode_finalise(icf_path, verbose):
     """
     Final step for distributed conversion of VCF(s) to intermediate columnar format.
     """
     setup_logging(verbose)
-    vcf.explode_finalise(path)
+    vcf.explode_finalise(icf_path)
 @click.command
@@ -248,26 +274,11 @@ def mkschema(icf_path):
 @new_zarr_path
 @force
 @verbose
-@click.option("-s", "--schema", default=None, type=click.Path(exists=True))
+@schema
 @variants_chunk_size
 @samples_chunk_size
-@click.option(
-    "-V",
-    "--max-variant-chunks",
-    type=int,
-    default=None,
-    help=(
-        "Truncate the output in the variants dimension to have "
-        "this number of chunks. Mainly intended to help with "
-        "schema tuning."
-    ),
-)
-@click.option(
-    "-M",
-    "--max-memory",
-    default=None,
-    help="An approximate bound on overall memory usage (e.g. 10G),",
-)
+@max_variant_chunks
+@max_memory
 @worker_processes
 def encode(
     icf_path,
@@ -292,13 +303,96 @@ def encode(
         schema_path=schema,
         variants_chunk_size=variants_chunk_size,
         samples_chunk_size=samples_chunk_size,
-        max_v_chunks=max_variant_chunks,
+        max_variant_chunks=max_variant_chunks,
         worker_processes=worker_processes,
         max_memory=max_memory,
         show_progress=True,
     )
+@click.command
+@icf_path
+@new_zarr_path
+@num_partitions
+@force
+@schema
+@variants_chunk_size
+@samples_chunk_size
+@max_variant_chunks
+@verbose
+def dencode_init(
+    icf_path,
+    zarr_path,
+    num_partitions,
+    force,
+    schema,
+    variants_chunk_size,
+    samples_chunk_size,
+    max_variant_chunks,
+    verbose,
+):
+    """
+    Initialise conversion of intermediate format to VCF Zarr. This will
+    set up the specified ZARR_PATH to perform this conversion over
+    NUM_PARTITIONS.
+    The output of this commmand is the actual number of partitions generated
+    (which may be less then the requested number, if there is not sufficient
+    chunks in the variants dimension) and a rough lower-bound on the amount
+    of memory required to encode a partition.
+    NOTE: the format of this output will likely change in subsequent releases;
+    it should not be considered machine-readable for now.
+    """
+    setup_logging(verbose)
+    check_overwrite_dir(zarr_path, force)
+    num_partitions, max_memory = vcf.encode_init(
+        icf_path,
+        zarr_path,
+        target_num_partitions=num_partitions,
+        schema_path=schema,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
+        max_variant_chunks=max_variant_chunks,
+        show_progress=True,
+    )
+    formatted_size = humanfriendly.format_size(max_memory, binary=True)
+    # NOTE adding the size to the stdout here so that users can parse it
+    # and use in their submission scripts. This is a first pass, and
+    # will most likely change as we see what works and doesn't.
+    # NOTE we probably want to format this as a table, which lists
+    # some other properties, line by line
+    # NOTE This size number is also not quite enough, you need a bit of
+    # headroom with it (probably 10% or so). We should include this.
+    click.echo(f"{num_partitions}\t{formatted_size}")
+@click.command
+@zarr_path
+@partition
+@verbose
+def dencode_partition(zarr_path, partition, verbose):
+    """
+    Convert a partition from intermediate columnar format to VCF Zarr.
+    Must be called *after* the Zarr path has been initialised with dencode_init.
+    Partition indexes must be from 0 (inclusive) to the number of paritions
+    returned by dencode_init (exclusive).
+    """
+    setup_logging(verbose)
+    vcf.encode_partition(zarr_path, partition)
+@click.command
+@zarr_path
+@verbose
+def dencode_finalise(zarr_path, verbose):
+    """
+    Final step for distributed conversion of ICF to VCF Zarr.
+    """
+    setup_logging(verbose)
+    vcf.encode_finalise(zarr_path, show_progress=True)
 @click.command(name="convert")
 @vcfs
 @new_zarr_path
@@ -386,6 +480,9 @@ vcf2zarr.add_command(encode)
 vcf2zarr.add_command(dexplode_init)
 vcf2zarr.add_command(dexplode_partition)
 vcf2zarr.add_command(dexplode_finalise)
+vcf2zarr.add_command(dencode_init)
+vcf2zarr.add_command(dencode_partition)
+vcf2zarr.add_command(dencode_finalise)
 @click.command(name="convert")

bio2zarr/core.py CHANGED Viewed

@@ -1,22 +1,31 @@
-import dataclasses
-import contextlib
 import concurrent.futures as cf
+import contextlib
+import dataclasses
+import logging
 import multiprocessing
 import threading
-import logging
 import time
-import zarr
+import numcodecs
 import numpy as np
 import tqdm
-import numcodecs
+import zarr
 logger = logging.getLogger(__name__)
 numcodecs.blosc.use_threads = False
+def min_int_dtype(min_value, max_value):
+    if min_value > max_value:
+        raise ValueError("min_value must be <= max_value")
+    for a_dtype in ["i1", "i2", "i4", "i8"]:
+        info = np.iinfo(a_dtype)
+        if info.min <= min_value and max_value <= info.max:
+            return a_dtype
+    raise OverflowError("Integer cannot be represented")
 def chunk_aligned_slices(z, n, max_chunks=None):
     """
     Returns at n slices in the specified zarr array, aligned
@@ -101,6 +110,7 @@ class BufferedArray:
                 sync_flush_2d_array(
                     self.buff[: self.buffer_row], self.array, self.array_offset
                 )
+            # FIXME the array.name doesn't seem to be working here for some reason
             logger.debug(
                 f"Flushed <{self.array.name} {self.array.shape} "
                 f"{self.array.dtype}> "
@@ -122,8 +132,7 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
     # encoder implementations.
     s = slice(offset, offset + np_buffer.shape[0])
     samples_chunk_size = zarr_array.chunks[1]
-    # TODO use zarr chunks here to support non-uniform chunking later
-    # and for simplicity
+    # TODO use zarr chunks here for simplicity
     zarr_array_width = zarr_array.shape[1]
     start = 0
     while start < zarr_array_width:
@@ -183,7 +192,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         self.progress_config = progress_config
         self.progress_bar = tqdm.tqdm(
             total=progress_config.total,
-            desc=f"{progress_config.title:>7}",
+            desc=f"{progress_config.title:>8}",
             unit_scale=True,
             unit=progress_config.units,
             smoothing=0.1,

bio2zarr/plink.py CHANGED Viewed

@@ -1,14 +1,13 @@
 import logging
+import bed_reader
 import humanfriendly
+import numcodecs
 import numpy as np
 import zarr
-import bed_reader
-import numcodecs
 from . import core
 logger = logging.getLogger(__name__)
@@ -24,7 +23,6 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
     gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
     gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
     variants_chunk_size = gt.array.chunks[0]
-    n = gt.array.shape[1]
     assert start % variants_chunk_size == 0
     logger.debug(f"Reading slice {start}:{stop}")
@@ -96,7 +94,7 @@ def convert(
         chunks=(samples_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
-    logger.debug(f"Encoded samples")
+    logger.debug("Encoded samples")
     # TODO encode these in slices - but read them in one go to avoid
     # fetching repeatedly from bim file
@@ -108,7 +106,7 @@ def convert(
         chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
-    logger.debug(f"encoded variant_position")
+    logger.debug("encoded variant_position")
     alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
     a = root.array(
@@ -119,7 +117,7 @@ def convert(
         chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
-    logger.debug(f"encoded variant_allele")
+    logger.debug("encoded variant_allele")
     # TODO remove this?
     a = root.empty(
@@ -201,4 +199,4 @@ def validate(bed_path, zarr_path):
             elif bed_call == 2:
                 assert list(zarr_call) == [1, 1]
             else:  # pragma no cover
-                assert False
+                raise AssertionError(f"Unexpected bed call {bed_call}")

bio2zarr/typing.py CHANGED Viewed

@@ -1,4 +1,4 @@
 from pathlib import Path
 from typing import Union
-PathType = Union[str, Path]
+PathType = Union[str, Path]

bio2zarr 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

Potentially problematic release.

bio2zarr 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl