bio2zarr-0.0.6-py3-none-any.whl → bio2zarr-0.0.10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

bio2zarr/__main__.py CHANGED
@@ -14,9 +14,9 @@ def bio2zarr():
 # install individual commands as console scripts. However, this
 # is handy for development and for those whose PATHs aren't set
 # up in the right way.
-bio2zarr.add_command(cli.vcf2zarr)
+bio2zarr.add_command(cli.vcf2zarr_main)
 bio2zarr.add_command(cli.plink2zarr)
-bio2zarr.add_command(cli.vcf_partition)
+bio2zarr.add_command(cli.vcfpartition)

 if __name__ == "__main__":
     bio2zarr()
bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE

-__version__ = version = '0.0.6'
-__version_tuple__ = version_tuple = (0, 0, 6)
+__version__ = version = '0.0.10'
+__version_tuple__ = version_tuple = (0, 0, 10)
bio2zarr/cli.py CHANGED
@@ -5,11 +5,11 @@ import shutil

 import click
 import coloredlogs
-import humanfriendly
 import numcodecs
 import tabulate

-from . import plink, provenance, vcf, vcf_utils
+from . import plink, provenance, vcf2zarr, vcf_utils
+from .vcf2zarr import icf as icf_mod

 logger = logging.getLogger(__name__)

@@ -58,6 +58,20 @@ force = click.option(
     help="Force overwriting of existing directories",
 )

+one_based = click.option(
+    "--one-based",
+    is_flag=True,
+    flag_value=True,
+    help="Partition indexes are interpreted as one-based",
+)
+
+json = click.option(
+    "--json",
+    is_flag=True,
+    flag_value=True,
+    help="Output summary data in JSON format",
+)
+
 version = click.version_option(version=f"{provenance.__version__}")

 worker_processes = click.option(
@@ -154,11 +168,21 @@ def check_overwrite_dir(path, force):
 def get_compressor(cname):
     if cname is None:
         return None
-    config = vcf.ICF_DEFAULT_COMPRESSOR.get_config()
+    config = icf_mod.ICF_DEFAULT_COMPRESSOR.get_config()
     config["cname"] = cname
     return numcodecs.get_codec(config)


+def show_work_summary(work_summary, json):
+    if json:
+        output = work_summary.asjson()
+    else:
+        data = work_summary.asdict()
+        output = tabulate.tabulate(list(data.items()), tablefmt="plain")
+        # output = "\n".join(f"{k}\t{v}" for k, v in data.items())
+    click.echo(output)
+
+
 @click.command
 @vcfs
 @new_icf_path
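
For reference, a sketch (not package code; the summary fields are invented) of the two output modes show_work_summary provides:

import dataclasses
import json

import tabulate

@dataclasses.dataclass
class FakeWorkSummary:
    num_partitions: int = 16
    max_memory: int = 2**31

    def asdict(self):
        return dataclasses.asdict(self)

    def asjson(self):
        return json.dumps(self.asdict(), indent=4)

summary = FakeWorkSummary()
# Default: a plain two-column table, one row per field.
print(tabulate.tabulate(list(summary.asdict().items()), tablefmt="plain"))
# With --json: indented JSON, convenient for cluster submission scripts.
print(summary.asjson())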
@@ -175,7 +199,7 @@ def explode(
     """
     setup_logging(verbose)
     check_overwrite_dir(icf_path, force)
-    vcf.explode(
+    vcf2zarr.explode(
         icf_path,
         vcfs,
         worker_processes=worker_processes,
@@ -192,6 +216,7 @@ def explode(
 @force
 @column_chunk_size
 @compressor
+@json
 @verbose
 @worker_processes
 def dexplode_init(
@@ -201,6 +226,7 @@ def dexplode_init(
     force,
     column_chunk_size,
     compressor,
+    json,
     verbose,
     worker_processes,
 ):
@@ -210,7 +236,7 @@ def dexplode_init(
     """
     setup_logging(verbose)
     check_overwrite_dir(icf_path, force)
-    num_partitions = vcf.explode_init(
+    work_summary = vcf2zarr.explode_init(
         icf_path,
         vcfs,
         target_num_partitions=num_partitions,
@@ -219,21 +245,26 @@ def dexplode_init(
         compressor=get_compressor(compressor),
         show_progress=True,
     )
-    click.echo(num_partitions)
+    show_work_summary(work_summary, json)


 @click.command
 @icf_path
 @partition
 @verbose
-def dexplode_partition(icf_path, partition, verbose):
+@one_based
+def dexplode_partition(icf_path, partition, verbose, one_based):
     """
-    Convert a VCF partition to intermediate columnar format. Must be called *after*
-    the ICF path has been initialised with dexplode_init. Partition indexes must be
-    from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
+    Convert a VCF partition to intermediate columnar format. Must be called
+    after the ICF path has been initialised with dexplode_init. By default,
+    partition indexes are from 0 to the number of partitions N (returned by
+    dexplode_init), exclusive. If the --one-based option is specifed,
+    partition indexes are in the range 1 to N, inclusive.
     """
     setup_logging(verbose)
-    vcf.explode_partition(icf_path, partition, show_progress=False)
+    if one_based:
+        partition -= 1
+    vcf2zarr.explode_partition(icf_path, partition)


 @click.command
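
The new --one-based flag (here and in dencode-partition below) is a convenience for schedulers whose array-job indices start at 1; the command simply shifts the index before dispatching. A hypothetical helper showing the same arithmetic plus a range check:

# Hypothetical helper (not part of bio2zarr) showing what --one-based does
# before explode_partition/encode_partition is called.
def to_zero_based(index, num_partitions, one_based=False):
    if one_based:
        index -= 1
    if not 0 <= index < num_partitions:
        raise ValueError(f"partition index out of range: {index}")
    return index

assert to_zero_based(1, 16, one_based=True) == 0    # first partition
assert to_zero_based(16, 16, one_based=True) == 15  # last partition
assert to_zero_based(0, 16) == 0                    # default 0-based indexing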
@@ -244,7 +275,7 @@ def dexplode_finalise(icf_path, verbose):
     Final step for distributed conversion of VCF(s) to intermediate columnar format.
     """
     setup_logging(verbose)
-    vcf.explode_finalise(icf_path)
+    vcf2zarr.explode_finalise(icf_path)


 @click.command
@@ -255,7 +286,7 @@ def inspect(path, verbose):
     Inspect an intermediate columnar format or Zarr path.
     """
     setup_logging(verbose)
-    data = vcf.inspect(path)
+    data = vcf2zarr.inspect(path)
     click.echo(tabulate.tabulate(data, headers="keys"))


@@ -266,7 +297,7 @@ def mkschema(icf_path):
     Generate a schema for zarr encoding
     """
     stream = click.get_text_stream("stdout")
-    vcf.mkschema(icf_path, stream)
+    vcf2zarr.mkschema(icf_path, stream)


 @click.command
@@ -297,7 +328,7 @@ def encode(
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
-    vcf.encode(
+    vcf2zarr.encode(
         icf_path,
         zarr_path,
         schema_path=schema,
@@ -319,6 +350,7 @@ def encode(
 @variants_chunk_size
 @samples_chunk_size
 @max_variant_chunks
+@json
 @verbose
 def dencode_init(
     icf_path,
@@ -329,6 +361,7 @@ def dencode_init(
     variants_chunk_size,
     samples_chunk_size,
     max_variant_chunks,
+    json,
     verbose,
 ):
     """
@@ -346,7 +379,7 @@ def dencode_init(
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
-    num_partitions, max_memory = vcf.encode_init(
+    work_summary = vcf2zarr.encode_init(
         icf_path,
         zarr_path,
         target_num_partitions=num_partitions,
@@ -356,30 +389,25 @@ def dencode_init(
         max_variant_chunks=max_variant_chunks,
         show_progress=True,
     )
-    formatted_size = humanfriendly.format_size(max_memory, binary=True)
-    # NOTE adding the size to the stdout here so that users can parse it
-    # and use in their submission scripts. This is a first pass, and
-    # will most likely change as we see what works and doesn't.
-    # NOTE we probably want to format this as a table, which lists
-    # some other properties, line by line
-    # NOTE This size number is also not quite enough, you need a bit of
-    # headroom with it (probably 10% or so). We should include this.
-    click.echo(f"{num_partitions}\t{formatted_size}")
+    show_work_summary(work_summary, json)


 @click.command
 @zarr_path
 @partition
 @verbose
-def dencode_partition(zarr_path, partition, verbose):
-    """
-    Convert a partition from intermediate columnar format to VCF Zarr.
-    Must be called *after* the Zarr path has been initialised with dencode_init.
-    Partition indexes must be from 0 (inclusive) to the number of paritions
-    returned by dencode_init (exclusive).
+@one_based
+def dencode_partition(zarr_path, partition, verbose, one_based):
     """
+    Convert a partition from intermediate columnar format to VCF Zarr. Must be
+    called after the Zarr path has been initialised with dencode_init. By
+    default, partition indexes are from 0 to the number of partitions N
+    (returned by dencode_init), exclusive. If the --one-based option is
+    specifed, partition indexes are in the range 1 to N, inclusive."""
     setup_logging(verbose)
-    vcf.encode_partition(zarr_path, partition)
+    if one_based:
+        partition -= 1
+    vcf2zarr.encode_partition(zarr_path, partition)


 @click.command
390
418
  Final step for distributed conversion of ICF to VCF Zarr.
391
419
  """
392
420
  setup_logging(verbose)
393
- vcf.encode_finalise(zarr_path, show_progress=True)
421
+ vcf2zarr.encode_finalise(zarr_path, show_progress=True)
394
422
 
395
423
 
396
424
  @click.command(name="convert")
397
425
  @vcfs
398
426
  @new_zarr_path
427
+ @force
399
428
  @variants_chunk_size
400
429
  @samples_chunk_size
401
430
  @verbose
402
431
  @worker_processes
403
432
  def convert_vcf(
404
- vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
433
+ vcfs,
434
+ zarr_path,
435
+ force,
436
+ variants_chunk_size,
437
+ samples_chunk_size,
438
+ verbose,
439
+ worker_processes,
405
440
  ):
406
441
  """
407
442
  Convert input VCF(s) directly to vcfzarr (not recommended for large files).
408
443
  """
409
444
  setup_logging(verbose)
410
- vcf.convert(
445
+ check_overwrite_dir(zarr_path, force)
446
+ vcf2zarr.convert(
411
447
  vcfs,
412
448
  zarr_path,
413
449
  variants_chunk_size=variants_chunk_size,
@@ -418,71 +454,27 @@ def convert_vcf(


 @version
-@click.group(cls=NaturalOrderGroup)
-def vcf2zarr():
+@click.group(cls=NaturalOrderGroup, name="vcf2zarr")
+def vcf2zarr_main():
     """
     Convert VCF file(s) to the vcfzarr format.

-    The simplest usage is:
-
-    $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
-
-    This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
-    step. As this writes the intermediate columnar format to a temporary directory,
-    we only recommend this approach for small files (< 1GB, say).
-
-    The recommended approach is to run the conversion in two passes, and
-    to keep the intermediate columnar format ("exploded") around to facilitate
-    experimentation with chunk sizes and compression settings:
-
-    \b
-    $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
-    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
-
-    The inspect command provides a way to view contents of an exploded ICF
-    or Zarr:
-
-    $ vcf2zarr inspect [PATH]
-
-    This is useful when tweaking chunk sizes and compression settings to suit
-    your dataset, using the mkschema command and --schema option to encode:
-
-    \b
-    $ vcf2zarr mkschema [ICF_PATH] > schema.json
-    $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
-
-    By editing the schema.json file you can drop columns that are not of interest
-    and edit column specific compression settings. The --max-variant-chunks option
-    to encode allows you to try out these options on small subsets, hopefully
-    arriving at settings with the desired balance of compression and query
-    performance.
-
-    ADVANCED USAGE
-
-    For very large datasets (terabyte scale) it may be necessary to distribute the
-    explode and encode steps across a cluster:
-
-    \b
-    $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
-    $ vcf2zarr dexplode-partition [ICF_PATH] [PARTITION_INDEX]
-    $ vcf2zarr dexplode-finalise [ICF_PATH]
-
-    See the online documentation at [FIXME] for more details on distributed explode.
+    See the online documentation at https://sgkit-dev.github.io/bio2zarr/
+    for more information.
     """


-# TODO figure out how to get click to list these in the given order.
-vcf2zarr.add_command(convert_vcf)
-vcf2zarr.add_command(inspect)
-vcf2zarr.add_command(explode)
-vcf2zarr.add_command(mkschema)
-vcf2zarr.add_command(encode)
-vcf2zarr.add_command(dexplode_init)
-vcf2zarr.add_command(dexplode_partition)
-vcf2zarr.add_command(dexplode_finalise)
-vcf2zarr.add_command(dencode_init)
-vcf2zarr.add_command(dencode_partition)
-vcf2zarr.add_command(dencode_finalise)
+vcf2zarr_main.add_command(convert_vcf)
+vcf2zarr_main.add_command(inspect)
+vcf2zarr_main.add_command(explode)
+vcf2zarr_main.add_command(mkschema)
+vcf2zarr_main.add_command(encode)
+vcf2zarr_main.add_command(dexplode_init)
+vcf2zarr_main.add_command(dexplode_partition)
+vcf2zarr_main.add_command(dexplode_finalise)
+vcf2zarr_main.add_command(dencode_init)
+vcf2zarr_main.add_command(dencode_partition)
+vcf2zarr_main.add_command(dencode_finalise)


 @click.command(name="convert")
@@ -529,7 +521,7 @@ plink2zarr.add_command(convert_plink)
 @click.option("-i", "--index", type=click.Path(), default=None)
 @click.option("-n", "--num-parts", type=int, default=None)
 # @click.option("-s", "--part-size", type=int, default=None)
-def vcf_partition(vcf_path, index, num_parts):
+def vcfpartition(vcf_path, index, num_parts):
     indexed_vcf = vcf_utils.IndexedVcf(vcf_path, index)
     regions = indexed_vcf.partition_into_regions(num_parts=num_parts)
     click.echo("\n".join(map(str, regions)))
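
For reference, a sketch of driving the same partitioning API from Python, using only the IndexedVcf and partition_into_regions calls visible above (file names illustrative):

from bio2zarr import vcf_utils

# Any indexed VCF or BCF works; the index path is optional if it sits
# alongside the data file.
indexed_vcf = vcf_utils.IndexedVcf("sample.vcf.gz", "sample.vcf.gz.tbi")
regions = indexed_vcf.partition_into_regions(num_parts=4)
for region in regions:
    print(region)  # one genomic region per partition, as the CLI prints them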
bio2zarr/constants.py ADDED
@@ -0,0 +1,18 @@
+import numpy as np
+
+INT_MISSING = -1
+INT_FILL = -2
+STR_MISSING = "."
+STR_FILL = ""
+
+FLOAT32_MISSING, FLOAT32_FILL = np.array([0x7F800001, 0x7F800002], dtype=np.int32).view(
+    np.float32
+)
+FLOAT32_MISSING_AS_INT32, FLOAT32_FILL_AS_INT32 = np.array(
+    [0x7F800001, 0x7F800002], dtype=np.int32
+)
+
+
+MIN_INT_VALUE = np.iinfo(np.int32).min + 2
+VCF_INT_MISSING = np.iinfo(np.int32).min
+VCF_INT_FILL = np.iinfo(np.int32).min + 1
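
The two float sentinels above are distinct NaN bit patterns, so "missing" and "fill" are indistinguishable under ordinary float comparison and can only be told apart by reinterpreting their bits, which is why the *_AS_INT32 forms exist. A quick standalone check (not package code):

import numpy as np

# 0x7F800000 is +inf in IEEE 754 binary32; a non-zero mantissa makes a NaN,
# so these two sentinels are NaNs with distinct payloads.
sentinels = np.array([0x7F800001, 0x7F800002], dtype=np.int32).view(np.float32)
assert np.isnan(sentinels).all()  # both read as ordinary NaNs...
assert (sentinels.view(np.int32) == [0x7F800001, 0x7F800002]).all()  # ...yet differ bitwise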
bio2zarr/core.py CHANGED
@@ -1,11 +1,16 @@
 import concurrent.futures as cf
 import contextlib
 import dataclasses
+import json
 import logging
+import math
 import multiprocessing
+import os
+import os.path
 import threading
 import time

+import humanfriendly
 import numcodecs
 import numpy as np
 import tqdm
@@ -16,6 +21,17 @@ logger = logging.getLogger(__name__)
 numcodecs.blosc.use_threads = False


+def display_number(x):
+    ret = "n/a"
+    if math.isfinite(x):
+        ret = f"{x: 0.2g}"
+    return ret
+
+
+def display_size(n):
+    return humanfriendly.format_size(n, binary=True)
+
+
 def min_int_dtype(min_value, max_value):
     if min_value > max_value:
         raise ValueError("min_value must be <= max_value")
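
A quick illustration (not package code) of what these helpers produce:

import math

import humanfriendly

x = float("inf")
print("n/a" if not math.isfinite(x) else f"{x: 0.2g}")  # n/a
print(f"{0.12345: 0.2g}")  #  0.12 (two significant figures, space for sign)
print(humanfriendly.format_size(2**31, binary=True))  # 2 GiB (IEC units)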
@@ -45,6 +61,22 @@ def chunk_aligned_slices(z, n, max_chunks=None):
     return slices


+def du(path):
+    """
+    Return the total bytes stored at this path.
+    """
+    total = os.path.getsize(path)
+    # pathlib walk method doesn't exist until 3.12 :(
+    for root, dirs, files in os.walk(path):
+        for lst in [dirs, files]:
+            for name in lst:
+                fullname = os.path.join(root, name)
+                size = os.path.getsize(fullname)
+                total += size
+    logger.debug(f"du({path}) = {total}")
+    return total
+
+
 class SynchronousExecutor(cf.Executor):
     def submit(self, fn, /, *args, **kwargs):
         future = cf.Future()
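
As the comment in du notes, pathlib only gains a walk method in Python 3.12; on 3.12+ an equivalent could look like this (an alternative sketch, not what the package ships):

import pathlib

def du_pathlib(path):
    # Python >= 3.12 only: pathlib.Path.walk mirrors os.walk.
    root = pathlib.Path(path)
    total = root.stat().st_size
    for dirpath, dirnames, filenames in root.walk():
        for name in dirnames + filenames:
            total += (dirpath / name).stat().st_size
    return total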
@@ -110,7 +142,6 @@ class BufferedArray:
         sync_flush_2d_array(
             self.buff[: self.buffer_row], self.array, self.array_offset
         )
-        # FIXME the array.name doesn't seem to be working here for some reason
         logger.debug(
             f"Flushed <{self.array.name} {self.array.shape} "
             f"{self.array.dtype}> "
@@ -156,7 +187,7 @@ class ProgressConfig:
 # progressable thing happening per source process. This is
 # probably fine in practise, but there could be corner cases
 # where it's not. Something to watch out for.
-_progress_counter = multiprocessing.Value("Q", 0)
+_progress_counter = None


 def update_progress(inc):
@@ -170,23 +201,30 @@ def get_progress():
     return val


-def set_progress(value):
-    with _progress_counter.get_lock():
-        _progress_counter.value = value
+def setup_progress_counter(counter):
+    global _progress_counter
+    _progress_counter = counter


 class ParallelWorkManager(contextlib.AbstractContextManager):
     def __init__(self, worker_processes=1, progress_config=None):
+        # Need to specify this explicitly to suppport Macs and
+        # for future proofing.
+        ctx = multiprocessing.get_context("spawn")
+        global _progress_counter
+        _progress_counter = ctx.Value("Q", 0)
         if worker_processes <= 0:
             # NOTE: this is only for testing, not for production use!
             self.executor = SynchronousExecutor()
         else:
             self.executor = cf.ProcessPoolExecutor(
                 max_workers=worker_processes,
+                mp_context=ctx,
+                initializer=setup_progress_counter,
+                initargs=(_progress_counter,),
             )
         self.futures = set()

-        set_progress(0)
         if progress_config is None:
             progress_config = ProgressConfig()
         self.progress_config = progress_config
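
A note on the change above: with the "spawn" start method (the default on macOS), worker processes re-import the module rather than inheriting the parent's memory, so a module-level multiprocessing.Value is never shared automatically. Passing the counter through the pool's initializer, as this diff does, is the standard pattern. A minimal standalone sketch:

import concurrent.futures as cf
import multiprocessing

_counter = None

def init_worker(counter):
    # Runs once in every spawned worker; stores the shared counter globally.
    global _counter
    _counter = counter

def work(n):
    with _counter.get_lock():
        _counter.value += n

if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    counter = ctx.Value("Q", 0)
    with cf.ProcessPoolExecutor(
        max_workers=2,
        mp_context=ctx,
        initializer=init_worker,
        initargs=(counter,),
    ) as executor:
        list(executor.map(work, [1, 2, 3]))
    print(counter.value)  # 6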
@@ -227,16 +265,6 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         self.futures.add(future)
         return future

-    def wait_for_completed(self, timeout=None):
-        done, not_done = cf.wait(self.futures, timeout, cf.FIRST_COMPLETED)
-        for future in done:
-            exception = future.exception()
-            # TODO do the check for BrokenProcessPool here
-            if exception is not None:
-                raise exception
-        self.futures = not_done
-        return done
-
     def results_as_completed(self):
         for future in cf.as_completed(self.futures):
             yield future.result()
@@ -260,3 +288,11 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
             self._update_progress()
         self.progress_bar.close()
         return False
+
+
+class JsonDataclass:
+    def asdict(self):
+        return dataclasses.asdict(self)
+
+    def asjson(self):
+        return json.dumps(self.asdict(), indent=4)
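
Hypothetical usage of the JsonDataclass mixin (the class name below is invented): work-summary objects mixing it in gain the asdict/asjson pair that cli.show_work_summary consumes.

import dataclasses

from bio2zarr.core import JsonDataclass

@dataclasses.dataclass
class ExampleSummary(JsonDataclass):  # invented name, for illustration only
    num_partitions: int
    max_memory: int

print(ExampleSummary(8, 1024).asjson())
# {
#     "num_partitions": 8,
#     "max_memory": 1024
# }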
bio2zarr/vcf2zarr/__init__.py ADDED
@@ -0,0 +1,38 @@
+from .icf import (
+    IntermediateColumnarFormat,
+    explode,
+    explode_finalise,
+    explode_init,
+    explode_partition,
+)
+from .vcz import (
+    VcfZarrSchema,
+    convert,
+    encode,
+    encode_finalise,
+    encode_init,
+    encode_partition,
+    inspect,
+    mkschema,
+)
+from .verification import verify
+
+# NOTE some of these aren't intended to be part of the external
+# interface (like IntermediateColumnarFormat), but putting
+# them into the list to keep the lint nagging under control
+__all__ = [
+    "IntermediateColumnarFormat",
+    "explode",
+    "explode_finalise",
+    "explode_init",
+    "explode_partition",
+    "VcfZarrSchema",
+    "convert",
+    "encode",
+    "encode_finalise",
+    "encode_init",
+    "encode_partition",
+    "inspect",
+    "mkschema",
+    "verify",
+]
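
For orientation, the functions re-exported here are the same ones the CLI dispatches to. A sketch of the two-step conversion through the Python API, with illustrative paths and the argument order used in cli.py above:

from bio2zarr import vcf2zarr

# Two-step conversion, mirroring "vcf2zarr explode" then "vcf2zarr encode".
vcf2zarr.explode("sample.icf", ["sample.vcf.gz"], worker_processes=4)
vcf2zarr.encode("sample.icf", "sample.vcz")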