bio2zarr 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bio2zarr might be problematic. Click here for more details.

bio2zarr/__main__.py CHANGED
@@ -14,9 +14,9 @@ def bio2zarr():
14
14
  # install individual commands as console scripts. However, this
15
15
  # is handy for development and for those whose PATHs aren't set
16
16
  # up in the right way.
17
- bio2zarr.add_command(cli.vcf2zarr)
17
+ bio2zarr.add_command(cli.vcf2zarr_main)
18
18
  bio2zarr.add_command(cli.plink2zarr)
19
- bio2zarr.add_command(cli.vcf_partition)
19
+ bio2zarr.add_command(cli.vcfpartition)
20
20
 
21
21
  if __name__ == "__main__":
22
22
  bio2zarr()
bio2zarr/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.0.9'
16
- __version_tuple__ = version_tuple = (0, 0, 9)
15
+ __version__ = version = '0.1.0'
16
+ __version_tuple__ = version_tuple = (0, 1, 0)
bio2zarr/cli.py CHANGED
@@ -5,11 +5,11 @@ import shutil
5
5
 
6
6
  import click
7
7
  import coloredlogs
8
- import humanfriendly
9
8
  import numcodecs
10
9
  import tabulate
11
10
 
12
- from . import plink, provenance, vcf, vcf_utils
11
+ from . import plink, provenance, vcf2zarr, vcf_utils
12
+ from .vcf2zarr import icf as icf_mod
13
13
 
14
14
  logger = logging.getLogger(__name__)
15
15
 
@@ -44,7 +44,13 @@ zarr_path = click.argument(
44
44
  "zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
45
45
  )
46
46
 
47
- num_partitions = click.argument("num_partitions", type=click.IntRange(min=1))
47
+ num_partitions = click.option(
48
+ "-n",
49
+ "--num-partitions",
50
+ type=click.IntRange(min=1),
51
+ default=None,
52
+ help="Target number of partitions to split into",
53
+ )
48
54
 
49
55
  partition = click.argument("partition", type=click.IntRange(min=0))
50
56
 
@@ -58,6 +64,27 @@ force = click.option(
58
64
  help="Force overwriting of existing directories",
59
65
  )
60
66
 
67
+ progress = click.option(
68
+ "-P /-Q",
69
+ "--progress/--no-progress",
70
+ default=True,
71
+ help="Show progress bars (default: show)",
72
+ )
73
+
74
+ one_based = click.option(
75
+ "--one-based",
76
+ is_flag=True,
77
+ flag_value=True,
78
+ help="Partition indexes are interpreted as one-based",
79
+ )
80
+
81
+ json = click.option(
82
+ "--json",
83
+ is_flag=True,
84
+ flag_value=True,
85
+ help="Output summary data in JSON format",
86
+ )
87
+
61
88
  version = click.version_option(version=f"{provenance.__version__}")
62
89
 
63
90
  worker_processes = click.option(
@@ -151,14 +178,33 @@ def check_overwrite_dir(path, force):
151
178
  shutil.rmtree(tmp_delete_path)
152
179
 
153
180
 
181
+ def check_partitions(num_partitions):
182
+ if num_partitions is None:
183
+ raise click.UsageError(
184
+ "-n/--num-partitions must currently be specified. Future versions "
185
+ "will provide reasonable defaults or other means of specifying "
186
+ "partitions."
187
+ )
188
+
189
+
154
190
  def get_compressor(cname):
155
191
  if cname is None:
156
192
  return None
157
- config = vcf.ICF_DEFAULT_COMPRESSOR.get_config()
193
+ config = icf_mod.ICF_DEFAULT_COMPRESSOR.get_config()
158
194
  config["cname"] = cname
159
195
  return numcodecs.get_codec(config)
160
196
 
161
197
 
198
+ def show_work_summary(work_summary, json):
199
+ if json:
200
+ output = work_summary.asjson()
201
+ else:
202
+ data = work_summary.asdict()
203
+ output = tabulate.tabulate(list(data.items()), tablefmt="plain")
204
+ # output = "\n".join(f"{k}\t{v}" for k, v in data.items())
205
+ click.echo(output)
206
+
207
+
162
208
  @click.command
163
209
  @vcfs
164
210
  @new_icf_path
@@ -166,22 +212,30 @@ def get_compressor(cname):
166
212
  @verbose
167
213
  @column_chunk_size
168
214
  @compressor
215
+ @progress
169
216
  @worker_processes
170
217
  def explode(
171
- vcfs, icf_path, force, verbose, column_chunk_size, compressor, worker_processes
218
+ vcfs,
219
+ icf_path,
220
+ force,
221
+ verbose,
222
+ column_chunk_size,
223
+ compressor,
224
+ progress,
225
+ worker_processes,
172
226
  ):
173
227
  """
174
228
  Convert VCF(s) to intermediate columnar format
175
229
  """
176
230
  setup_logging(verbose)
177
231
  check_overwrite_dir(icf_path, force)
178
- vcf.explode(
232
+ vcf2zarr.explode(
179
233
  icf_path,
180
234
  vcfs,
181
235
  worker_processes=worker_processes,
182
236
  column_chunk_size=column_chunk_size,
183
237
  compressor=get_compressor(compressor),
184
- show_progress=True,
238
+ show_progress=progress,
185
239
  )
186
240
 
187
241
 
@@ -192,7 +246,9 @@ def explode(
192
246
  @force
193
247
  @column_chunk_size
194
248
  @compressor
249
+ @json
195
250
  @verbose
251
+ @progress
196
252
  @worker_processes
197
253
  def dexplode_init(
198
254
  vcfs,
@@ -201,39 +257,47 @@ def dexplode_init(
201
257
  force,
202
258
  column_chunk_size,
203
259
  compressor,
260
+ json,
204
261
  verbose,
262
+ progress,
205
263
  worker_processes,
206
264
  ):
207
265
  """
208
266
  Initial step for distributed conversion of VCF(s) to intermediate columnar format
209
- over the requested number of paritions.
267
+ over some number of partitions.
210
268
  """
211
269
  setup_logging(verbose)
212
270
  check_overwrite_dir(icf_path, force)
213
- num_partitions = vcf.explode_init(
271
+ check_partitions(num_partitions)
272
+ work_summary = vcf2zarr.explode_init(
214
273
  icf_path,
215
274
  vcfs,
216
275
  target_num_partitions=num_partitions,
217
276
  column_chunk_size=column_chunk_size,
218
277
  worker_processes=worker_processes,
219
278
  compressor=get_compressor(compressor),
220
- show_progress=True,
279
+ show_progress=progress,
221
280
  )
222
- click.echo(num_partitions)
281
+ show_work_summary(work_summary, json)
223
282
 
224
283
 
225
284
  @click.command
226
285
  @icf_path
227
286
  @partition
228
287
  @verbose
229
- def dexplode_partition(icf_path, partition, verbose):
288
+ @one_based
289
+ def dexplode_partition(icf_path, partition, verbose, one_based):
230
290
  """
231
- Convert a VCF partition to intermediate columnar format. Must be called *after*
232
- the ICF path has been initialised with dexplode_init. Partition indexes must be
233
- from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
291
+ Convert a VCF partition to intermediate columnar format. Must be called
292
+ after the ICF path has been initialised with dexplode_init. By default,
293
+ partition indexes are from 0 to the number of partitions N (returned by
294
+ dexplode_init), exclusive. If the --one-based option is specified,
295
+ partition indexes are in the range 1 to N, inclusive.
234
296
  """
235
297
  setup_logging(verbose)
236
- vcf.explode_partition(icf_path, partition)
298
+ if one_based:
299
+ partition -= 1
300
+ vcf2zarr.explode_partition(icf_path, partition)
237
301
 
238
302
 
239
303
  @click.command
@@ -244,7 +308,7 @@ def dexplode_finalise(icf_path, verbose):
244
308
  Final step for distributed conversion of VCF(s) to intermediate columnar format.
245
309
  """
246
310
  setup_logging(verbose)
247
- vcf.explode_finalise(icf_path)
311
+ vcf2zarr.explode_finalise(icf_path)
248
312
 
249
313
 
250
314
  @click.command
@@ -255,7 +319,7 @@ def inspect(path, verbose):
255
319
  Inspect an intermediate columnar format or Zarr path.
256
320
  """
257
321
  setup_logging(verbose)
258
- data = vcf.inspect(path)
322
+ data = vcf2zarr.inspect(path)
259
323
  click.echo(tabulate.tabulate(data, headers="keys"))
260
324
 
261
325
 
@@ -266,7 +330,7 @@ def mkschema(icf_path):
266
330
  Generate a schema for zarr encoding
267
331
  """
268
332
  stream = click.get_text_stream("stdout")
269
- vcf.mkschema(icf_path, stream)
333
+ vcf2zarr.mkschema(icf_path, stream)
270
334
 
271
335
 
272
336
  @click.command
@@ -279,6 +343,7 @@ def mkschema(icf_path):
279
343
  @samples_chunk_size
280
344
  @max_variant_chunks
281
345
  @max_memory
346
+ @progress
282
347
  @worker_processes
283
348
  def encode(
284
349
  icf_path,
@@ -290,6 +355,7 @@ def encode(
290
355
  samples_chunk_size,
291
356
  max_variant_chunks,
292
357
  max_memory,
358
+ progress,
293
359
  worker_processes,
294
360
  ):
295
361
  """
@@ -297,7 +363,7 @@ def encode(
297
363
  """
298
364
  setup_logging(verbose)
299
365
  check_overwrite_dir(zarr_path, force)
300
- vcf.encode(
366
+ vcf2zarr.encode(
301
367
  icf_path,
302
368
  zarr_path,
303
369
  schema_path=schema,
@@ -306,7 +372,7 @@ def encode(
306
372
  max_variant_chunks=max_variant_chunks,
307
373
  worker_processes=worker_processes,
308
374
  max_memory=max_memory,
309
- show_progress=True,
375
+ show_progress=progress,
310
376
  )
311
377
 
312
378
 
@@ -319,6 +385,8 @@ def encode(
319
385
  @variants_chunk_size
320
386
  @samples_chunk_size
321
387
  @max_variant_chunks
388
+ @json
389
+ @progress
322
390
  @verbose
323
391
  def dencode_init(
324
392
  icf_path,
@@ -329,12 +397,14 @@ def dencode_init(
329
397
  variants_chunk_size,
330
398
  samples_chunk_size,
331
399
  max_variant_chunks,
400
+ json,
401
+ progress,
332
402
  verbose,
333
403
  ):
334
404
  """
335
405
  Initialise conversion of intermediate format to VCF Zarr. This will
336
406
  set up the specified ZARR_PATH to perform this conversion over
337
- NUM_PARTITIONS.
407
+ some number of partitions.
338
408
 
339
409
  The output of this command is the actual number of partitions generated
340
410
  (which may be less than the requested number, if there is not sufficient
@@ -346,7 +416,8 @@ def dencode_init(
346
416
  """
347
417
  setup_logging(verbose)
348
418
  check_overwrite_dir(zarr_path, force)
349
- num_partitions, max_memory = vcf.encode_init(
419
+ check_partitions(num_partitions)
420
+ work_summary = vcf2zarr.encode_init(
350
421
  icf_path,
351
422
  zarr_path,
352
423
  target_num_partitions=num_partitions,
@@ -354,141 +425,104 @@ def dencode_init(
354
425
  variants_chunk_size=variants_chunk_size,
355
426
  samples_chunk_size=samples_chunk_size,
356
427
  max_variant_chunks=max_variant_chunks,
357
- show_progress=True,
428
+ show_progress=progress,
358
429
  )
359
- formatted_size = humanfriendly.format_size(max_memory, binary=True)
360
- # NOTE adding the size to the stdout here so that users can parse it
361
- # and use in their submission scripts. This is a first pass, and
362
- # will most likely change as we see what works and doesn't.
363
- # NOTE we probably want to format this as a table, which lists
364
- # some other properties, line by line
365
- # NOTE This size number is also not quite enough, you need a bit of
366
- # headroom with it (probably 10% or so). We should include this.
367
- click.echo(f"{num_partitions}\t{formatted_size}")
430
+ show_work_summary(work_summary, json)
368
431
 
369
432
 
370
433
  @click.command
371
434
  @zarr_path
372
435
  @partition
373
436
  @verbose
374
- def dencode_partition(zarr_path, partition, verbose):
375
- """
376
- Convert a partition from intermediate columnar format to VCF Zarr.
377
- Must be called *after* the Zarr path has been initialised with dencode_init.
378
- Partition indexes must be from 0 (inclusive) to the number of paritions
379
- returned by dencode_init (exclusive).
437
+ @one_based
438
+ def dencode_partition(zarr_path, partition, verbose, one_based):
380
439
  """
440
+ Convert a partition from intermediate columnar format to VCF Zarr. Must be
441
+ called after the Zarr path has been initialised with dencode_init. By
442
+ default, partition indexes are from 0 to the number of partitions N
443
+ (returned by dencode_init), exclusive. If the --one-based option is
444
+ specifed, partition indexes are in the range 1 to N, inclusive."""
381
445
  setup_logging(verbose)
382
- vcf.encode_partition(zarr_path, partition)
446
+ if one_based:
447
+ partition -= 1
448
+ vcf2zarr.encode_partition(zarr_path, partition)
383
449
 
384
450
 
385
451
  @click.command
386
452
  @zarr_path
387
453
  @verbose
388
- def dencode_finalise(zarr_path, verbose):
454
+ @progress
455
+ def dencode_finalise(zarr_path, verbose, progress):
389
456
  """
390
457
  Final step for distributed conversion of ICF to VCF Zarr.
391
458
  """
392
459
  setup_logging(verbose)
393
- vcf.encode_finalise(zarr_path, show_progress=True)
460
+ vcf2zarr.encode_finalise(zarr_path, show_progress=progress)
394
461
 
395
462
 
396
463
  @click.command(name="convert")
397
464
  @vcfs
398
465
  @new_zarr_path
466
+ @force
399
467
  @variants_chunk_size
400
468
  @samples_chunk_size
401
469
  @verbose
470
+ @progress
402
471
  @worker_processes
403
472
  def convert_vcf(
404
- vcfs, zarr_path, variants_chunk_size, samples_chunk_size, verbose, worker_processes
473
+ vcfs,
474
+ zarr_path,
475
+ force,
476
+ variants_chunk_size,
477
+ samples_chunk_size,
478
+ verbose,
479
+ progress,
480
+ worker_processes,
405
481
  ):
406
482
  """
407
483
  Convert input VCF(s) directly to vcfzarr (not recommended for large files).
408
484
  """
409
485
  setup_logging(verbose)
410
- vcf.convert(
486
+ check_overwrite_dir(zarr_path, force)
487
+ vcf2zarr.convert(
411
488
  vcfs,
412
489
  zarr_path,
413
490
  variants_chunk_size=variants_chunk_size,
414
491
  samples_chunk_size=samples_chunk_size,
415
- show_progress=True,
492
+ show_progress=progress,
416
493
  worker_processes=worker_processes,
417
494
  )
418
495
 
419
496
 
420
497
  @version
421
- @click.group(cls=NaturalOrderGroup)
422
- def vcf2zarr():
498
+ @click.group(cls=NaturalOrderGroup, name="vcf2zarr")
499
+ def vcf2zarr_main():
423
500
  """
424
501
  Convert VCF file(s) to the vcfzarr format.
425
502
 
426
- The simplest usage is:
427
-
428
- $ vcf2zarr convert [VCF_FILE] [ZARR_PATH]
429
-
430
- This will convert the indexed VCF (or BCF) into the vcfzarr format in a single
431
- step. As this writes the intermediate columnar format to a temporary directory,
432
- we only recommend this approach for small files (< 1GB, say).
433
-
434
- The recommended approach is to run the conversion in two passes, and
435
- to keep the intermediate columnar format ("exploded") around to facilitate
436
- experimentation with chunk sizes and compression settings:
437
-
438
- \b
439
- $ vcf2zarr explode [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH]
440
- $ vcf2zarr encode [ICF_PATH] [ZARR_PATH]
441
-
442
- The inspect command provides a way to view contents of an exploded ICF
443
- or Zarr:
444
-
445
- $ vcf2zarr inspect [PATH]
446
-
447
- This is useful when tweaking chunk sizes and compression settings to suit
448
- your dataset, using the mkschema command and --schema option to encode:
449
-
450
- \b
451
- $ vcf2zarr mkschema [ICF_PATH] > schema.json
452
- $ vcf2zarr encode [ICF_PATH] [ZARR_PATH] --schema schema.json
453
-
454
- By editing the schema.json file you can drop columns that are not of interest
455
- and edit column specific compression settings. The --max-variant-chunks option
456
- to encode allows you to try out these options on small subsets, hopefully
457
- arriving at settings with the desired balance of compression and query
458
- performance.
459
-
460
- ADVANCED USAGE
461
-
462
- For very large datasets (terabyte scale) it may be necessary to distribute the
463
- explode and encode steps across a cluster:
464
-
465
- \b
466
- $ vcf2zarr dexplode-init [VCF_FILE_1] ... [VCF_FILE_N] [ICF_PATH] [NUM_PARTITIONS]
467
- $ vcf2zarr dexplode-partition [ICF_PATH] [PARTITION_INDEX]
468
- $ vcf2zarr dexplode-finalise [ICF_PATH]
469
-
470
- See the online documentation at [FIXME] for more details on distributed explode.
503
+ See the online documentation at https://sgkit-dev.github.io/bio2zarr/
504
+ for more information.
471
505
  """
472
506
 
473
507
 
474
- # TODO figure out how to get click to list these in the given order.
475
- vcf2zarr.add_command(convert_vcf)
476
- vcf2zarr.add_command(inspect)
477
- vcf2zarr.add_command(explode)
478
- vcf2zarr.add_command(mkschema)
479
- vcf2zarr.add_command(encode)
480
- vcf2zarr.add_command(dexplode_init)
481
- vcf2zarr.add_command(dexplode_partition)
482
- vcf2zarr.add_command(dexplode_finalise)
483
- vcf2zarr.add_command(dencode_init)
484
- vcf2zarr.add_command(dencode_partition)
485
- vcf2zarr.add_command(dencode_finalise)
508
+ vcf2zarr_main.add_command(convert_vcf)
509
+ vcf2zarr_main.add_command(inspect)
510
+ vcf2zarr_main.add_command(explode)
511
+ vcf2zarr_main.add_command(mkschema)
512
+ vcf2zarr_main.add_command(encode)
513
+ vcf2zarr_main.add_command(dexplode_init)
514
+ vcf2zarr_main.add_command(dexplode_partition)
515
+ vcf2zarr_main.add_command(dexplode_finalise)
516
+ vcf2zarr_main.add_command(dencode_init)
517
+ vcf2zarr_main.add_command(dencode_partition)
518
+ vcf2zarr_main.add_command(dencode_finalise)
486
519
 
487
520
 
488
521
  @click.command(name="convert")
489
522
  @click.argument("in_path", type=click.Path())
490
523
  @click.argument("zarr_path", type=click.Path())
491
524
  @worker_processes
525
+ @progress
492
526
  @verbose
493
527
  @variants_chunk_size
494
528
  @samples_chunk_size
@@ -497,6 +531,7 @@ def convert_plink(
497
531
  zarr_path,
498
532
  verbose,
499
533
  worker_processes,
534
+ progress,
500
535
  variants_chunk_size,
501
536
  samples_chunk_size,
502
537
  ):
@@ -507,7 +542,7 @@ def convert_plink(
507
542
  plink.convert(
508
543
  in_path,
509
544
  zarr_path,
510
- show_progress=True,
545
+ show_progress=progress,
511
546
  worker_processes=worker_processes,
512
547
  samples_chunk_size=samples_chunk_size,
513
548
  variants_chunk_size=variants_chunk_size,
@@ -525,11 +560,39 @@ plink2zarr.add_command(convert_plink)
525
560
 
526
561
  @click.command
527
562
  @version
528
- @click.argument("vcf_path", type=click.Path())
529
- @click.option("-i", "--index", type=click.Path(), default=None)
530
- @click.option("-n", "--num-parts", type=int, default=None)
531
- # @click.option("-s", "--part-size", type=int, default=None)
532
- def vcf_partition(vcf_path, index, num_parts):
533
- indexed_vcf = vcf_utils.IndexedVcf(vcf_path, index)
534
- regions = indexed_vcf.partition_into_regions(num_parts=num_parts)
535
- click.echo("\n".join(map(str, regions)))
563
+ @click.argument("vcf_path", type=click.Path(exists=True, dir_okay=False))
564
+ @verbose
565
+ @num_partitions
566
+ @click.option(
567
+ "-s",
568
+ "--partition-size",
569
+ type=str,
570
+ default=None,
571
+ help="Target (compressed) size of VCF partitions, e.g. 100KB, 10MiB, 1G.",
572
+ )
573
+ def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
574
+ """
575
+ Output bcftools region strings that partition an indexed VCF/BCF file
576
+ into either an approximate number of parts (-n), or parts of approximately
577
+ a given size (-s). One of -n or -s must be supplied.
578
+
579
+ Note that both the number of partitions and sizes are a target, and the
580
+ returned number of partitions may not exactly correspond. In particular,
581
+ there is a maximum level of granularity determined by the associated index
582
+ which cannot be exceeded.
583
+
584
+ Note also that the partitions returned may vary considerably in the number
585
+ of records that they contain.
586
+ """
587
+ setup_logging(verbose)
588
+ if num_partitions is None and partition_size is None:
589
+ raise click.UsageError(
590
+ "Either --num-partitions or --partition-size must be specified"
591
+ )
592
+
593
+ indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
594
+ regions = indexed_vcf.partition_into_regions(
595
+ num_parts=num_partitions, target_part_size=partition_size
596
+ )
597
+ for region in regions:
598
+ click.echo(f"{region}\t{vcf_path}")
bio2zarr/constants.py ADDED
@@ -0,0 +1,18 @@
1
+ import numpy as np
2
+
3
+ INT_MISSING = -1
4
+ INT_FILL = -2
5
+ STR_MISSING = "."
6
+ STR_FILL = ""
7
+
8
+ FLOAT32_MISSING, FLOAT32_FILL = np.array([0x7F800001, 0x7F800002], dtype=np.int32).view(
9
+ np.float32
10
+ )
11
+ FLOAT32_MISSING_AS_INT32, FLOAT32_FILL_AS_INT32 = np.array(
12
+ [0x7F800001, 0x7F800002], dtype=np.int32
13
+ )
14
+
15
+
16
+ MIN_INT_VALUE = np.iinfo(np.int32).min + 2
17
+ VCF_INT_MISSING = np.iinfo(np.int32).min
18
+ VCF_INT_FILL = np.iinfo(np.int32).min + 1