bio2zarr 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic; consult the registry's advisory page for more details.

bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.0.1'
16
- __version_tuple__ = version_tuple = (0, 0, 1)
15
+ __version__ = version = '0.0.3'
16
+ __version_tuple__ = version_tuple = (0, 0, 3)
bio2zarr/cli.py CHANGED
@@ -1,3 +1,8 @@
1
+ import logging
2
+ import os
3
+ import pathlib
4
+ import shutil
5
+
1
6
  import click
2
7
  import tabulate
3
8
  import coloredlogs
@@ -7,35 +12,79 @@ from . import vcf_utils
7
12
  from . import plink
8
13
  from . import provenance
9
14
 
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class NaturalOrderGroup(click.Group):
20
+ """
21
+ List commands in the order they are provided in the help text.
22
+ """
23
+
24
+ def list_commands(self, ctx):
25
+ return self.commands.keys()
26
+
27
+
10
28
  # Common arguments/options
29
+ vcfs = click.argument(
30
+ "vcfs", nargs=-1, required=True, type=click.Path(exists=True, dir_okay=False)
31
+ )
32
+
33
+ new_icf_path = click.argument(
34
+ "icf_path", type=click.Path(file_okay=False, dir_okay=True)
35
+ )
36
+
37
+ icf_path = click.argument(
38
+ "icf_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
39
+ )
40
+
41
+ new_zarr_path = click.argument(
42
+ "zarr_path", type=click.Path(file_okay=False, dir_okay=True)
43
+ )
44
+
11
45
  verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
12
46
 
47
+ force = click.option(
48
+ "-f",
49
+ "--force",
50
+ is_flag=True,
51
+ flag_value=True,
52
+ help="Force overwriting of existing directories",
53
+ )
54
+
55
+ version = click.version_option(version=f"{provenance.__version__}")
56
+
13
57
  worker_processes = click.option(
14
58
  "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
15
59
  )
16
60
 
17
- # TODO help text
18
- chunk_length = click.option(
61
+ column_chunk_size = click.option(
62
+ "-c",
63
+ "--column-chunk-size",
64
+ type=int,
65
+ default=64,
66
+ help="Approximate uncompressed size of exploded column chunks in MiB",
67
+ )
68
+
69
+ # Note: -l and -w were chosen when these were called "width" and "length".
70
+ # possibly there are better letters now.
71
+ variants_chunk_size = click.option(
19
72
  "-l",
20
- "--chunk-length",
73
+ "--variants-chunk-size",
21
74
  type=int,
22
75
  default=None,
23
76
  help="Chunk size in the variants dimension",
24
77
  )
25
78
 
26
- chunk_width = click.option(
79
+ samples_chunk_size = click.option(
27
80
  "-w",
28
- "--chunk-width",
81
+ "--samples-chunk-size",
29
82
  type=int,
30
83
  default=None,
31
84
  help="Chunk size in the samples dimension",
32
85
  )
33
86
 
34
- version = click.version_option(version=f"bio2zarr {provenance.__version__}")
35
87
 
36
-
37
- # Note: logging hasn't been implemented in the code at all, this is just
38
- # a first pass to try out some ways of doing things to see what works.
39
88
  def setup_logging(verbosity):
40
89
  level = "WARNING"
41
90
  if verbosity == 1:
@@ -43,26 +92,43 @@ def setup_logging(verbosity):
43
92
  elif verbosity >= 2:
44
93
  level = "DEBUG"
45
94
  # NOTE: I'm not that excited about coloredlogs, just trying it out
46
- # as it is installed by cyvcf2 anyway. We will have some complicated
47
- # stuff doing on with threads and processes, to logs might not work
48
- # so well anyway.
95
+ # as it is installed by cyvcf2 anyway.
49
96
  coloredlogs.install(level=level)
50
97
 
51
98
 
99
+ def check_overwrite_dir(path, force):
100
+ path = pathlib.Path(path)
101
+ if path.exists():
102
+ if not force:
103
+ click.confirm(
104
+ f"Do you want to overwrite {path}? (use --force to skip this check)",
105
+ abort=True,
106
+ )
107
+ # These trees can be mondo-big and on slow file systems, so it's entirely
108
+ # feasible that the delete would fail or be killed. This makes it less likely
109
+ # that partially deleted paths are mistaken for good paths.
110
+ tmp_delete_path = path.with_suffix(f"{path.suffix}.{os.getpid()}.DELETING")
111
+ logger.info(f"Deleting {path} (renamed to {tmp_delete_path} while in progress)")
112
+ os.rename(path, tmp_delete_path)
113
+ shutil.rmtree(tmp_delete_path)
114
+
115
+
52
116
  @click.command
53
- @click.argument("vcfs", nargs=-1, required=True)
54
- @click.argument("out_path", type=click.Path())
117
+ @vcfs
118
+ @new_icf_path
119
+ @force
55
120
  @verbose
56
121
  @worker_processes
57
- @click.option("-c", "--column-chunk-size", type=int, default=64)
58
- def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
122
+ @column_chunk_size
123
+ def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size):
59
124
  """
60
- Convert VCF(s) to columnar intermediate format
125
+ Convert VCF(s) to intermediate columnar format
61
126
  """
62
127
  setup_logging(verbose)
128
+ check_overwrite_dir(icf_path, force)
63
129
  vcf.explode(
64
130
  vcfs,
65
- out_path,
131
+ icf_path,
66
132
  worker_processes=worker_processes,
67
133
  column_chunk_size=column_chunk_size,
68
134
  show_progress=True,
@@ -70,34 +136,88 @@ def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
70
136
 
71
137
 
72
138
  @click.command
73
- @click.argument("if_path", type=click.Path())
139
+ @vcfs
140
+ @new_icf_path
141
+ @click.argument("num_partitions", type=click.IntRange(min=1))
142
+ @force
143
+ @column_chunk_size
74
144
  @verbose
75
- def inspect(if_path, verbose):
145
+ @worker_processes
146
+ def dexplode_init(
147
+ vcfs, icf_path, num_partitions, force, column_chunk_size, verbose, worker_processes
148
+ ):
76
149
  """
77
- Inspect an intermediate format file
150
+ Initial step for distributed conversion of VCF(s) to intermediate columnar format
151
+ over the requested number of partitions.
78
152
  """
79
153
  setup_logging(verbose)
80
- data = vcf.inspect(if_path)
154
+ check_overwrite_dir(icf_path, force)
155
+ num_partitions = vcf.explode_init(
156
+ icf_path,
157
+ vcfs,
158
+ target_num_partitions=num_partitions,
159
+ column_chunk_size=column_chunk_size,
160
+ worker_processes=worker_processes,
161
+ show_progress=True,
162
+ )
163
+ click.echo(num_partitions)
164
+
165
+
166
+ @click.command
167
+ @icf_path
168
+ @click.argument("partition", type=click.IntRange(min=0))
169
+ @verbose
170
+ def dexplode_partition(icf_path, partition, verbose):
171
+ """
172
+ Convert a VCF partition to intermediate columnar format. Must be called *after*
173
+ the ICF path has been initialised with dexplode_init. Partition indexes must be
174
+ from 0 (inclusive) to the number of partitions returned by dexplode_init (exclusive).
175
+ """
176
+ setup_logging(verbose)
177
+ vcf.explode_partition(icf_path, partition, show_progress=True)
178
+
179
+
180
+ @click.command
181
+ @click.argument("path", type=click.Path(), required=True)
182
+ @verbose
183
+ def dexplode_finalise(path, verbose):
184
+ """
185
+ Final step for distributed conversion of VCF(s) to intermediate columnar format.
186
+ """
187
+ setup_logging(verbose)
188
+ vcf.explode_finalise(path)
189
+
190
+
191
+ @click.command
192
+ @click.argument("path", type=click.Path())
193
+ @verbose
194
+ def inspect(path, verbose):
195
+ """
196
+ Inspect an intermediate columnar format or Zarr path.
197
+ """
198
+ setup_logging(verbose)
199
+ data = vcf.inspect(path)
81
200
  click.echo(tabulate.tabulate(data, headers="keys"))
82
201
 
83
202
 
84
203
  @click.command
85
- @click.argument("if_path", type=click.Path())
86
- def mkschema(if_path):
204
+ @icf_path
205
+ def mkschema(icf_path):
87
206
  """
88
207
  Generate a schema for zarr encoding
89
208
  """
90
209
  stream = click.get_text_stream("stdout")
91
- vcf.mkschema(if_path, stream)
210
+ vcf.mkschema(icf_path, stream)
92
211
 
93
212
 
94
213
  @click.command
95
- @click.argument("if_path", type=click.Path())
96
- @click.argument("zarr_path", type=click.Path())
214
+ @icf_path
215
+ @new_zarr_path
216
+ @force
97
217
  @verbose
98
218
  @click.option("-s", "--schema", default=None, type=click.Path(exists=True))
99
- @chunk_length
100
- @chunk_width
219
+ @variants_chunk_size
220
+ @samples_chunk_size
101
221
  @click.option(
102
222
  "-V",
103
223
  "--max-variant-chunks",
@@ -109,90 +229,147 @@ def mkschema(if_path):
109
229
  "schema tuning."
110
230
  ),
111
231
  )
232
+ @click.option(
233
+ "-M",
234
+ "--max-memory",
235
+ type=int,
236
+ default=None,
237
+ help="An approximate bound on overall memory usage in megabytes",
238
+ )
112
239
  @worker_processes
113
240
  def encode(
114
- if_path,
241
+ icf_path,
115
242
  zarr_path,
243
+ force,
116
244
  verbose,
117
245
  schema,
118
- chunk_length,
119
- chunk_width,
246
+ variants_chunk_size,
247
+ samples_chunk_size,
120
248
  max_variant_chunks,
249
+ max_memory,
121
250
  worker_processes,
122
251
  ):
123
252
  """
124
- Encode intermediate format (see explode) to vcfzarr
253
+ Encode intermediate columnar format (see explode) to vcfzarr.
125
254
  """
126
255
  setup_logging(verbose)
256
+ check_overwrite_dir(zarr_path, force)
127
257
  vcf.encode(
128
- if_path,
258
+ icf_path,
129
259
  zarr_path,
130
- schema,
131
- chunk_length=chunk_length,
132
- chunk_width=chunk_width,
260
+ schema_path=schema,
261
+ variants_chunk_size=variants_chunk_size,
262
+ samples_chunk_size=samples_chunk_size,
133
263
  max_v_chunks=max_variant_chunks,
134
264
  worker_processes=worker_processes,
265
+ max_memory=max_memory,
135
266
  show_progress=True,
136
267
  )
137
268
 
138
269
 
139
270
  @click.command(name="convert")
140
- @click.argument("vcfs", nargs=-1, required=True)
141
- @click.argument("out_path", type=click.Path())
142
- @chunk_length
143
- @chunk_width
271
+ @vcfs
272
+ @new_zarr_path
273
+ @variants_chunk_size
274
+ @samples_chunk_size
144
275
  @verbose
145
276
  @worker_processes
146
- def convert_vcf(vcfs, out_path, chunk_length, chunk_width, verbose, worker_processes):
277
+ def convert_vcf(
278
+ vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
279
+ ):
147
280
  """
148
- Convert input VCF(s) directly to vcfzarr (not recommended for large files)
281
+ Convert input VCF(s) directly to vcfzarr (not recommended for large files).
149
282
  """
150
283
  setup_logging(verbose)
151
284
  vcf.convert(
152
285
  vcfs,
153
- out_path,
154
- chunk_length=chunk_length,
155
- chunk_width=chunk_width,
286
+ zarr_path,
287
+ variants_chunk_size=variants_chunk_size,
288
+ samples_chunk_size=samples_chunk_size,
156
289
  show_progress=True,
157
290
  worker_processes=worker_processes,
158
291
  )
159
292
 
160
293
 
161
- @click.command
162
- @click.argument("vcfs", nargs=-1, required=True)
163
- @click.argument("out_path", type=click.Path())
164
- def validate(vcfs, out_path):
165
- """
166
- Development only, do not use. Will be removed before release.
294
+ @version
295
+ @click.group(cls=NaturalOrderGroup)
296
+ def vcf2zarr():
167
297
  """
168
- # FIXME! Will silently not look at remaining VCFs
169
- vcf.validate(vcfs[0], out_path, show_progress=True)
298
+ Convert VCF file(s) to the vcfzarr format.
170
299
 
300
+ The simplest usage is:
171
301
 
172
- @version
173
- @click.group()
174
- def vcf2zarr():
175
- pass
302
+ $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
303
+
304
+ This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
305
+ step. As this writes the intermediate columnar format to a temporary directory,
306
+ we only recommend this approach for small files (< 1GB, say).
307
+
308
+ The recommended approach is to run the conversion in two passes, and
309
+ to keep the intermediate columnar format ("exploded") around to facilitate
310
+ experimentation with chunk sizes and compression settings:
311
+
312
+ \b
313
+ $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
314
+ $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
315
+
316
+ The inspect command provides a way to view contents of an exploded ICF
317
+ or Zarr:
318
+
319
+ $ vcf2zarr inspect [PATH]
320
+
321
+ This is useful when tweaking chunk sizes and compression settings to suit
322
+ your dataset, using the mkschema command and --schema option to encode:
323
+
324
+ \b
325
+ $ vcf2zarr mkschema [ICF_PATH] > schema.json
326
+ $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
327
+
328
+ By editing the schema.json file you can drop columns that are not of interest
329
+ and edit column specific compression settings. The --max-variant-chunks option
330
+ to encode allows you to try out these options on small subsets, hopefully
331
+ arriving at settings with the desired balance of compression and query
332
+ performance.
333
+
334
+ ADVANCED USAGE
335
+
336
+ For very large datasets (terabyte scale) it may be necessary to distribute the
337
+ explode and encode steps across a cluster:
338
+
339
+ \b
340
+ $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
341
+ $ vcf2zarr dexplode-partition [ICF_PATH] [PARTITION_INDEX]
342
+ $ vcf2zarr dexplode-finalise [ICF_PATH]
343
+
344
+ See the online documentation at [FIXME] for more details on distributed explode.
345
+ """
176
346
 
177
347
 
178
348
  # TODO figure out how to get click to list these in the given order.
179
- vcf2zarr.add_command(explode)
349
+ vcf2zarr.add_command(convert_vcf)
180
350
  vcf2zarr.add_command(inspect)
351
+ vcf2zarr.add_command(explode)
181
352
  vcf2zarr.add_command(mkschema)
182
353
  vcf2zarr.add_command(encode)
183
- vcf2zarr.add_command(convert_vcf)
184
- vcf2zarr.add_command(validate)
354
+ vcf2zarr.add_command(dexplode_init)
355
+ vcf2zarr.add_command(dexplode_partition)
356
+ vcf2zarr.add_command(dexplode_finalise)
185
357
 
186
358
 
187
359
  @click.command(name="convert")
188
360
  @click.argument("in_path", type=click.Path())
189
- @click.argument("out_path", type=click.Path())
361
+ @click.argument("zarr_path", type=click.Path())
190
362
  @worker_processes
191
363
  @verbose
192
- @chunk_length
193
- @chunk_width
364
+ @variants_chunk_size
365
+ @samples_chunk_size
194
366
  def convert_plink(
195
- in_path, out_path, verbose, worker_processes, chunk_length, chunk_width
367
+ in_path,
368
+ zarr_path,
369
+ verbose,
370
+ worker_processes,
371
+ variants_chunk_size,
372
+ samples_chunk_size,
196
373
  ):
197
374
  """
198
375
  In development; DO NOT USE!
@@ -200,11 +377,11 @@ def convert_plink(
200
377
  setup_logging(verbose)
201
378
  plink.convert(
202
379
  in_path,
203
- out_path,
380
+ zarr_path,
204
381
  show_progress=True,
205
382
  worker_processes=worker_processes,
206
- chunk_width=chunk_width,
207
- chunk_length=chunk_length,
383
+ samples_chunk_size=samples_chunk_size,
384
+ variants_chunk_size=variants_chunk_size,
208
385
  )
209
386
 
210
387
 
bio2zarr/core.py CHANGED
@@ -16,12 +16,6 @@ logger = logging.getLogger(__name__)
16
16
 
17
17
  numcodecs.blosc.use_threads = False
18
18
 
19
- # TODO this should probably go in another module where we abstract
20
- # out the zarr defaults
21
- default_compressor = numcodecs.Blosc(
22
- cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
23
- )
24
-
25
19
 
26
20
  def chunk_aligned_slices(z, n, max_chunks=None):
27
21
  """
@@ -53,7 +47,12 @@ def wait_on_futures(futures):
53
47
  for future in cf.as_completed(futures):
54
48
  exception = future.exception()
55
49
  if exception is not None:
56
- raise exception
50
+ cancel_futures(futures)
51
+ if isinstance(exception, cf.process.BrokenProcessPool):
52
+ raise RuntimeError(
53
+ "Worker process died: you may have run out of memory") from exception
54
+ else:
55
+ raise exception
57
56
 
58
57
 
59
58
  def cancel_futures(futures):
@@ -74,15 +73,18 @@ class BufferedArray:
74
73
  assert offset % array.chunks[0] == 0
75
74
  dims = list(array.shape)
76
75
  dims[0] = min(array.chunks[0], array.shape[0])
77
- self.buff = np.zeros(dims, dtype=array.dtype)
76
+ self.buff = np.empty(dims, dtype=array.dtype)
77
+ # Explicitly fill with zeros here to make any out-of-memory errors happen
78
+ # quickly.
79
+ self.buff[:] = 0
78
80
  self.buffer_row = 0
79
81
 
80
82
  @property
81
- def chunk_length(self):
83
+ def variants_chunk_size(self):
82
84
  return self.buff.shape[0]
83
85
 
84
86
  def next_buffer_row(self):
85
- if self.buffer_row == self.chunk_length:
87
+ if self.buffer_row == self.variants_chunk_size:
86
88
  self.flush()
87
89
  row = self.buffer_row
88
90
  self.buffer_row += 1
@@ -104,13 +106,13 @@ class BufferedArray:
104
106
  f"{self.array_offset}:{self.array_offset + self.buffer_row}"
105
107
  f"{self.buff.nbytes / 2**20: .2f}Mb"
106
108
  )
107
- self.array_offset += self.chunk_length
109
+ self.array_offset += self.variants_chunk_size
108
110
  self.buffer_row = 0
109
111
 
110
112
 
111
113
  def sync_flush_1d_array(np_buffer, zarr_array, offset):
112
114
  zarr_array[offset : offset + np_buffer.shape[0]] = np_buffer
113
- update_progress(1)
115
+ update_progress(np_buffer.nbytes)
114
116
 
115
117
 
116
118
  def sync_flush_2d_array(np_buffer, zarr_array, offset):
@@ -118,13 +120,16 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
118
120
  # incremental, and to avoid large memcopies in the underlying
119
121
  # encoder implementations.
120
122
  s = slice(offset, offset + np_buffer.shape[0])
121
- chunk_width = zarr_array.chunks[1]
123
+ samples_chunk_size = zarr_array.chunks[1]
124
+ # TODO use zarr chunks here to support non-uniform chunking later
125
+ # and for simplicity
122
126
  zarr_array_width = zarr_array.shape[1]
123
127
  start = 0
124
128
  while start < zarr_array_width:
125
- stop = min(start + chunk_width, zarr_array_width)
126
- zarr_array[s, start:stop] = np_buffer[:, start:stop]
127
- update_progress(1)
129
+ stop = min(start + samples_chunk_size, zarr_array_width)
130
+ chunk_buffer = np_buffer[:, start:stop]
131
+ zarr_array[s, start:stop] = chunk_buffer
132
+ update_progress(chunk_buffer.nbytes)
128
133
  start = stop
129
134
 
130
135
 
@@ -169,7 +174,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
169
174
  self.executor = cf.ProcessPoolExecutor(
170
175
  max_workers=worker_processes,
171
176
  )
172
- self.futures = []
177
+ self.futures = set()
173
178
 
174
179
  set_progress(0)
175
180
  if progress_config is None:
@@ -177,7 +182,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
177
182
  self.progress_config = progress_config
178
183
  self.progress_bar = tqdm.tqdm(
179
184
  total=progress_config.total,
180
- desc=f"{progress_config.title:>9}",
185
+ desc=f"{progress_config.title:>7}",
181
186
  unit_scale=True,
182
187
  unit=progress_config.units,
183
188
  smoothing=0.1,
@@ -208,7 +213,19 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
208
213
  logger.debug("Exit progress thread")
209
214
 
210
215
  def submit(self, *args, **kwargs):
211
- self.futures.append(self.executor.submit(*args, **kwargs))
216
+ future = self.executor.submit(*args, **kwargs)
217
+ self.futures.add(future)
218
+ return future
219
+
220
+ def wait_for_completed(self, timeout=None):
221
+ done, not_done = cf.wait(self.futures, timeout, cf.FIRST_COMPLETED)
222
+ for future in done:
223
+ exception = future.exception()
224
+ # TODO do the check for BrokenProcessPool here
225
+ if exception is not None:
226
+ raise exception
227
+ self.futures = not_done
228
+ return done
212
229
 
213
230
  def results_as_completed(self):
214
231
  for future in cf.as_completed(self.futures):