bio2zarr 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of bio2zarr might be problematic.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +189 -56
- bio2zarr/core.py +36 -19
- bio2zarr/plink.py +25 -19
- bio2zarr/vcf.py +704 -389
- bio2zarr/vcf_utils.py +0 -1
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.2.dist-info}/METADATA +1 -1
- bio2zarr-0.0.2.dist-info/RECORD +16 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.2.dist-info}/WHEEL +1 -1
- bio2zarr-0.0.1.dist-info/RECORD +0 -16
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.2.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.2.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.2.dist-info}/top_level.txt +0 -0
bio2zarr/_version.py
CHANGED
bio2zarr/cli.py
CHANGED
@@ -7,35 +7,52 @@ from . import vcf_utils
 from . import plink
 from . import provenance
 
+
+class NaturalOrderGroup(click.Group):
+    """
+    List commands in the order they are provided in the help text.
+    """
+
+    def list_commands(self, ctx):
+        return self.commands.keys()
+
+
 # Common arguments/options
 verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
 
+version = click.version_option(version=f"{provenance.__version__}")
+
 worker_processes = click.option(
     "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
 )
 
-
-chunk_length = click.option(
+column_chunk_size = click.option(
+    "-c",
+    "--column-chunk-size",
+    type=int,
+    default=64,
+    help="Approximate uncompressed size of exploded column chunks in MiB",
+)
+
+# Note: -l and -w were chosen when these were called "width" and "length".
+# possibly there are better letters now.
+variants_chunk_size = click.option(
     "-l",
-    "--chunk-length",
+    "--variants-chunk-size",
     type=int,
     default=None,
     help="Chunk size in the variants dimension",
 )
 
-chunk_width = click.option(
+samples_chunk_size = click.option(
     "-w",
-    "--chunk-width",
+    "--samples-chunk-size",
     type=int,
     default=None,
     help="Chunk size in the samples dimension",
 )
 
-version = click.version_option(version=f"bio2zarr {provenance.__version__}")
 
-
-# Note: logging hasn't been implemented in the code at all, this is just
-# a first pass to try out some ways of doing things to see what works.
 def setup_logging(verbosity):
     level = "WARNING"
     if verbosity == 1:
@@ -43,26 +60,24 @@ def setup_logging(verbosity):
     elif verbosity >= 2:
         level = "DEBUG"
     # NOTE: I'm not that excited about coloredlogs, just trying it out
-    # as it is installed by cyvcf2 anyway.
-    # stuff doing on with threads and processes, to logs might not work
-    # so well anyway.
+    # as it is installed by cyvcf2 anyway.
     coloredlogs.install(level=level)
 
 
 @click.command
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
+@click.argument("zarr_path", type=click.Path())
 @verbose
 @worker_processes
-@
-def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
+@column_chunk_size
+def explode(vcfs, zarr_path, verbose, worker_processes, column_chunk_size):
     """
-    Convert VCF(s) to columnar
+    Convert VCF(s) to intermediate columnar format
     """
     setup_logging(verbose)
     vcf.explode(
         vcfs,
-        out_path,
+        zarr_path,
         worker_processes=worker_processes,
         column_chunk_size=column_chunk_size,
         show_progress=True,
@@ -70,34 +85,85 @@ def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
 
 
 @click.command
-@click.argument("if_path", type=click.Path())
+@click.argument("vcfs", nargs=-1, required=True)
+@click.argument("icf_path", type=click.Path())
+@click.argument("num_partitions", type=int)
+@column_chunk_size
 @verbose
-def inspect(if_path, verbose):
+@worker_processes
+def dexplode_init(
+    vcfs, icf_path, num_partitions, column_chunk_size, verbose, worker_processes
+):
     """
-
+    Initial step for parallel conversion of VCF(s) to intermediate columnar format
+    over the requested number of paritions.
     """
     setup_logging(verbose)
-    data = vcf.inspect(if_path)
+    num_partitions = vcf.explode_init(
+        icf_path,
+        vcfs,
+        target_num_partitions=num_partitions,
+        column_chunk_size=column_chunk_size,
+        worker_processes=worker_processes,
+        show_progress=True,
+    )
+    click.echo(num_partitions)
+
+
+@click.command
+@click.argument("icf_path", type=click.Path())
+@click.argument("partition", type=int)
+@verbose
+def dexplode_partition(icf_path, partition, verbose):
+    """
+    Convert a VCF partition into intermediate columnar format. Must be called *after*
+    the ICF path has been initialised with dexplode_init. Partition indexes must be
+    from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
+    """
+    setup_logging(verbose)
+    vcf.explode_partition(icf_path, partition, show_progress=True)
+
+
+@click.command
+@click.argument("path", type=click.Path(), required=True)
+@verbose
+def dexplode_finalise(path, verbose):
+    """
+    Final step for parallel conversion of VCF(s) to intermediate columnar format
+    """
+    setup_logging(verbose)
+    vcf.explode_finalise(path)
+
+
+@click.command
+@click.argument("icf_path", type=click.Path())
+@verbose
+def inspect(icf_path, verbose):
+    """
+    Inspect an intermediate format or Zarr path.
+    """
+    setup_logging(verbose)
+    data = vcf.inspect(icf_path)
     click.echo(tabulate.tabulate(data, headers="keys"))
 
 
 @click.command
-@click.argument("if_path", type=click.Path())
-def mkschema(if_path):
+@click.argument("icf_path", type=click.Path())
+def mkschema(icf_path):
     """
     Generate a schema for zarr encoding
     """
     stream = click.get_text_stream("stdout")
-    vcf.mkschema(if_path, stream)
+    vcf.mkschema(icf_path, stream)
 
 
 @click.command
-@click.argument("if_path", type=click.Path())
+@click.argument("icf_path", type=click.Path())
 @click.argument("zarr_path", type=click.Path())
 @verbose
 @click.option("-s", "--schema", default=None, type=click.Path(exists=True))
-@chunk_length
-@chunk_width
+@variants_chunk_size
+@samples_chunk_size
 @click.option(
     "-V",
     "--max-variant-chunks",
@@ -109,50 +175,61 @@ def mkschema(if_path):
         "schema tuning."
     ),
 )
+@click.option(
+    "-M",
+    "--max-memory",
+    type=int,
+    default=None,
+    help="An approximate bound on overall memory usage in megabytes",
+)
 @worker_processes
 def encode(
-    if_path,
+    icf_path,
     zarr_path,
     verbose,
     schema,
-    chunk_length,
-    chunk_width,
+    variants_chunk_size,
+    samples_chunk_size,
     max_variant_chunks,
+    max_memory,
     worker_processes,
 ):
     """
-    Encode intermediate format (see explode) to vcfzarr
+    Encode intermediate columnar format (see explode) to vcfzarr.
     """
     setup_logging(verbose)
     vcf.encode(
-        if_path,
+        icf_path,
         zarr_path,
         schema,
-        chunk_length=chunk_length,
-        chunk_width=chunk_width,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
        max_v_chunks=max_variant_chunks,
         worker_processes=worker_processes,
+        max_memory=max_memory,
         show_progress=True,
     )
 
 
 @click.command(name="convert")
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
-@chunk_length
-@chunk_width
+@click.argument("zarr_path", type=click.Path())
+@variants_chunk_size
+@samples_chunk_size
 @verbose
 @worker_processes
-def convert_vcf(vcfs, out_path, chunk_length, chunk_width, verbose, worker_processes):
+def convert_vcf(
+    vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
+):
     """
-    Convert input VCF(s) directly to vcfzarr (not recommended for large files)
+    Convert input VCF(s) directly to vcfzarr (not recommended for large files).
     """
     setup_logging(verbose)
     vcf.convert(
         vcfs,
-        out_path,
-        chunk_length=chunk_length,
-        chunk_width=chunk_width,
+        zarr_path,
+        variants_chunk_size=variants_chunk_size,
+        samples_chunk_size=samples_chunk_size,
         show_progress=True,
         worker_processes=worker_processes,
     )
@@ -160,39 +237,95 @@ def convert_vcf(vcfs, out_path, chunk_length, chunk_width, verbose, worker_proce
 
 @click.command
 @click.argument("vcfs", nargs=-1, required=True)
-@click.argument("out_path", type=click.Path())
-def validate(vcfs, out_path):
+@click.argument("zarr_path", type=click.Path())
+def validate(vcfs, zarr_path):
     """
     Development only, do not use. Will be removed before release.
     """
     # FIXME! Will silently not look at remaining VCFs
-    vcf.validate(vcfs[0], out_path, show_progress=True)
+    vcf.validate(vcfs[0], zarr_path, show_progress=True)
 
 
 @version
-@click.group()
+@click.group(cls=NaturalOrderGroup)
 def vcf2zarr():
-    pass
+    """
+    Convert VCF file(s) to the vcfzarr format.
+
+    The simplest usage is:
+
+    $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
+
+    This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
+    step. As this writes the intermediate columnar format to a temporary directory,
+    we only recommend this approach for small files (< 1GB, say).
+
+    The recommended approach is to run the conversion in two passes, and
+    to keep the intermediate columnar format ("exploded") around to facilitate
+    experimentation with chunk sizes and compression settings:
+
+    \b
+    $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
+    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
+
+    The inspect command provides a way to view contents of an exploded ICF
+    or Zarr:
+
+    $ vcf2zarr inspect [PATH]
+
+    This is useful when tweaking chunk sizes and compression settings to suit
+    your dataset, using the mkschema command and --schema option to encode:
+
+    \b
+    $ vcf2zarr mkschema [ICF_PATH] > schema.json
+    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
+
+    By editing the schema.json file you can drop columns that are not of interest
+    and edit column specific compression settings. The --max-variant-chunks option
+    to encode allows you to try out these options on small subsets, hopefully
+    arriving at settings with the desired balance of compression and query
+    performance.
+
+    ADVANCED USAGE
+
+    For very large datasets (terabyte scale) it may be necessary to distribute the
+    explode and encode steps across a cluster:
+
+    \b
+    $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
+    $ vcf2zarr dexplode-partition [ICF_PATH] [PARTITION_INDEX]
+    $ vcf2zarr dexplode-finalise [ICF_PATH]
+
+    See the online documentation at [FIXME] for more details on distributed explode.
+    """
 
 
 # TODO figure out how to get click to list these in the given order.
-vcf2zarr.add_command(explode)
+vcf2zarr.add_command(convert_vcf)
 vcf2zarr.add_command(inspect)
+vcf2zarr.add_command(explode)
 vcf2zarr.add_command(mkschema)
 vcf2zarr.add_command(encode)
-vcf2zarr.add_command(convert_vcf)
+vcf2zarr.add_command(dexplode_init)
+vcf2zarr.add_command(dexplode_partition)
+vcf2zarr.add_command(dexplode_finalise)
 vcf2zarr.add_command(validate)
 
 
 @click.command(name="convert")
 @click.argument("in_path", type=click.Path())
-@click.argument("out_path", type=click.Path())
+@click.argument("zarr_path", type=click.Path())
 @worker_processes
 @verbose
-@chunk_length
-@chunk_width
+@variants_chunk_size
+@samples_chunk_size
 def convert_plink(
-    in_path, out_path, verbose, worker_processes, chunk_length, chunk_width
+    in_path,
+    zarr_path,
+    verbose,
+    worker_processes,
+    variants_chunk_size,
+    samples_chunk_size,
 ):
     """
     In development; DO NOT USE!
@@ -200,11 +333,11 @@ def convert_plink(
     setup_logging(verbose)
     plink.convert(
         in_path,
-        out_path,
+        zarr_path,
         show_progress=True,
         worker_processes=worker_processes,
-        chunk_width=chunk_width,
-        chunk_length=chunk_length,
+        samples_chunk_size=samples_chunk_size,
+        variants_chunk_size=variants_chunk_size,
     )
 
 
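The NaturalOrderGroup added above uses click's extension point for help-text ordering: click.Group.list_commands returns command names sorted alphabetically by default, so overriding it to return self.commands.keys() (a dict, hence insertion-ordered) makes --help list commands in the order they were registered with add_command. A minimal standalone sketch of the same pattern, with hypothetical command names, not taken from the package:

import click


class NaturalOrderGroup(click.Group):
    """List commands in registration order rather than alphabetically."""

    def list_commands(self, ctx):
        # self.commands is a dict, which preserves insertion order.
        return self.commands.keys()


@click.group(cls=NaturalOrderGroup)
def cli():
    pass


@cli.command()
def explode():
    """Listed first in --help output."""


@cli.command()
def encode():
    """Listed second in --help output."""


if __name__ == "__main__":
    cli()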
bio2zarr/core.py
CHANGED
@@ -16,12 +16,6 @@ logger = logging.getLogger(__name__)
 
 numcodecs.blosc.use_threads = False
 
-# TODO this should probably go in another module where we abstract
-# out the zarr defaults
-default_compressor = numcodecs.Blosc(
-    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
-)
-
 
 def chunk_aligned_slices(z, n, max_chunks=None):
     """
@@ -53,7 +47,12 @@ def wait_on_futures(futures):
     for future in cf.as_completed(futures):
         exception = future.exception()
         if exception is not None:
-            raise exception
+            cancel_futures(futures)
+            if isinstance(exception, cf.process.BrokenProcessPool):
+                raise RuntimeError(
+                    "Worker process died: you may have run out of memory") from exception
+            else:
+                raise exception
 
 
 def cancel_futures(futures):
@@ -74,15 +73,18 @@ class BufferedArray:
         assert offset % array.chunks[0] == 0
         dims = list(array.shape)
         dims[0] = min(array.chunks[0], array.shape[0])
-        self.buff = np.zeros(dims, dtype=array.dtype)
+        self.buff = np.empty(dims, dtype=array.dtype)
+        # Explicitly Fill with zeros here to make any out-of-memory errors happen
+        # quickly.
+        self.buff[:] = 0
         self.buffer_row = 0
 
     @property
-    def chunk_length(self):
+    def variants_chunk_size(self):
         return self.buff.shape[0]
 
     def next_buffer_row(self):
-        if self.buffer_row == self.chunk_length:
+        if self.buffer_row == self.variants_chunk_size:
             self.flush()
         row = self.buffer_row
         self.buffer_row += 1
@@ -104,13 +106,13 @@ class BufferedArray:
             f"{self.array_offset}:{self.array_offset + self.buffer_row}"
             f"{self.buff.nbytes / 2**20: .2f}Mb"
         )
-        self.array_offset += self.chunk_length
+        self.array_offset += self.variants_chunk_size
         self.buffer_row = 0
 
 
 def sync_flush_1d_array(np_buffer, zarr_array, offset):
     zarr_array[offset : offset + np_buffer.shape[0]] = np_buffer
-    update_progress(
+    update_progress(np_buffer.nbytes)
 
 
 def sync_flush_2d_array(np_buffer, zarr_array, offset):
@@ -118,13 +120,16 @@ def sync_flush_2d_array(np_buffer, zarr_array, offset):
     # incremental, and to avoid large memcopies in the underlying
     # encoder implementations.
     s = slice(offset, offset + np_buffer.shape[0])
-    chunk_width = zarr_array.chunks[1]
+    samples_chunk_size = zarr_array.chunks[1]
+    # TODO use zarr chunks here to support non-uniform chunking later
+    # and for simplicity
     zarr_array_width = zarr_array.shape[1]
     start = 0
     while start < zarr_array_width:
-        stop = min(start + chunk_width, zarr_array_width)
-        zarr_array[s, start:stop] = np_buffer[:, start:stop]
-        update_progress(
+        stop = min(start + samples_chunk_size, zarr_array_width)
+        chunk_buffer = np_buffer[:, start:stop]
+        zarr_array[s, start:stop] = chunk_buffer
+        update_progress(chunk_buffer.nbytes)
         start = stop
 
 
@@ -169,7 +174,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         self.executor = cf.ProcessPoolExecutor(
             max_workers=worker_processes,
         )
-        self.futures = []
+        self.futures = set()
 
         set_progress(0)
         if progress_config is None:
@@ -177,7 +182,7 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         self.progress_config = progress_config
         self.progress_bar = tqdm.tqdm(
             total=progress_config.total,
-            desc=f"{progress_config.title:>
+            desc=f"{progress_config.title:>7}",
             unit_scale=True,
             unit=progress_config.units,
             smoothing=0.1,
@@ -208,7 +213,19 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         logger.debug("Exit progress thread")
 
     def submit(self, *args, **kwargs):
-        self.futures.append(self.executor.submit(*args, **kwargs))
+        future = self.executor.submit(*args, **kwargs)
+        self.futures.add(future)
+        return future
+
+    def wait_for_completed(self, timeout=None):
+        done, not_done = cf.wait(self.futures, timeout, cf.FIRST_COMPLETED)
+        for future in done:
+            exception = future.exception()
+            # TODO do the check for BrokenProcessPool here
+            if exception is not None:
+                raise exception
+        self.futures = not_done
+        return done
 
     def results_as_completed(self):
         for future in cf.as_completed(self.futures):
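The wait_on_futures change above translates a BrokenProcessPool into a RuntimeError with an actionable message: when a worker is killed (for example by the Linux OOM killer), concurrent.futures only reports that the pool is broken, so the hint about memory is attached at the point of failure and the remaining futures are cancelled. A runnable standalone sketch of the same pattern (work() is a hypothetical task, not from the package):

import concurrent.futures as cf
from concurrent.futures.process import BrokenProcessPool


def wait_on_futures(futures):
    for future in cf.as_completed(futures):
        exception = future.exception()
        if exception is not None:
            # One failure fails the batch: cancel work that hasn't started.
            for f in futures:
                f.cancel()
            if isinstance(exception, BrokenProcessPool):
                # A dead worker usually means the OS killed the process.
                raise RuntimeError(
                    "Worker process died: you may have run out of memory"
                ) from exception
            raise exception


def work(n):
    return n * n


if __name__ == "__main__":
    with cf.ProcessPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(work, i) for i in range(4)]
        wait_on_futures(futures)
        print(sorted(f.result() for f in futures))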
bio2zarr/plink.py
CHANGED
@@ -4,6 +4,7 @@ import humanfriendly
 import numpy as np
 import zarr
 import bed_reader
+import numcodecs
 
 from . import core
 
@@ -22,14 +23,14 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
     gt = core.BufferedArray(root["call_genotype"], start)
     gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
     gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
-    chunk_length = gt.array.chunks[0]
+    variants_chunk_size = gt.array.chunks[0]
     n = gt.array.shape[1]
-    assert start % chunk_length == 0
+    assert start % variants_chunk_size == 0
 
     logger.debug(f"Reading slice {start}:{stop}")
     chunk_start = start
     while chunk_start < stop:
-        chunk_stop = min(chunk_start + chunk_length, stop)
+        chunk_stop = min(chunk_start + variants_chunk_size, stop)
         logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
         bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
         logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
@@ -60,8 +61,8 @@ def convert(
     *,
     show_progress=False,
     worker_processes=1,
-    chunk_length=None,
-    chunk_width=None,
+    variants_chunk_size=None,
+    samples_chunk_size=None,
 ):
     bed = bed_reader.open_bed(bed_path, num_threads=1)
     n = bed.iid_count
@@ -69,25 +70,30 @@ def convert(
     logging.info(f"Scanned plink with {n} samples and {m} variants")
 
     # FIXME
-    if chunk_width is None:
-        chunk_width = 1000
-    if chunk_length is None:
-        chunk_length = 10_000
+    if samples_chunk_size is None:
+        samples_chunk_size = 1000
+    if variants_chunk_size is None:
+        variants_chunk_size = 10_000
 
     store = zarr.DirectoryStore(zarr_path)
     root = zarr.group(store=store, overwrite=True)
 
     ploidy = 2
     shape = [m, n]
-    chunks = [chunk_length, chunk_width]
+    chunks = [variants_chunk_size, samples_chunk_size]
     dimensions = ["variants", "samples"]
 
+    # TODO we should be reusing some logic from vcfzarr here on laying
+    # out the basic dataset, and using the schema generator. Currently
+    # we're not using the best Blosc settings for genotypes here.
+    default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
+
     a = root.array(
         "sample_id",
         bed.iid,
         dtype="str",
-        compressor=core.default_compressor,
-        chunks=(chunk_width,),
+        compressor=default_compressor,
+        chunks=(samples_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
     logger.debug(f"Encoded samples")
@@ -98,8 +104,8 @@ def convert(
         "variant_position",
         bed.bp_position,
         dtype=np.int32,
-        compressor=core.default_compressor,
-        chunks=(chunk_length,),
+        compressor=default_compressor,
+        chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
     logger.debug(f"encoded variant_position")
@@ -109,8 +115,8 @@ def convert(
         "variant_allele",
         alleles,
         dtype="str",
-        compressor=core.default_compressor,
-        chunks=(chunk_length,),
+        compressor=default_compressor,
+        chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
     logger.debug(f"encoded variant_allele")
@@ -121,7 +127,7 @@ def convert(
         dtype="bool",
         shape=list(shape),
         chunks=list(chunks),
-        compressor=core.default_compressor,
+        compressor=default_compressor,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
@@ -132,7 +138,7 @@ def convert(
         dtype="i1",
         shape=list(shape),
         chunks=list(chunks),
-        compressor=core.default_compressor,
+        compressor=default_compressor,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
@@ -141,7 +147,7 @@ def convert(
         dtype="bool",
         shape=list(shape),
         chunks=list(chunks),
-        compressor=core.default_compressor,
+        compressor=default_compressor,
    )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
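The plink.py changes replace the shared core-level compressor with a locally defined Blosc zstd compressor and thread the renamed variants_chunk_size/samples_chunk_size through to every array. A small sketch of how these settings fit together, assuming zarr 2.x and numcodecs as used in this diff; the array name, shape, and chunk values below are illustrative, not taken from the package:

import numcodecs
import zarr

variants_chunk_size = 10_000
samples_chunk_size = 1_000
m, n = 25_000, 5_000  # hypothetical variant and sample counts

# Same settings as the default_compressor defined in plink.convert above.
default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)

store = zarr.DirectoryStore("example.zarr")
root = zarr.group(store=store, overwrite=True)

# A 2-D array chunked independently in the variants and samples dimensions.
a = root.empty(
    "call_genotype_phased",
    dtype="bool",
    shape=(m, n),
    chunks=(variants_chunk_size, samples_chunk_size),
    compressor=default_compressor,
)
# Named dimensions, as convert() writes them, for xarray compatibility.
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "samples"]
print(a.info)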