bio2zarr 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bio2zarr/__init__.py +1 -0
- bio2zarr/__main__.py +20 -0
- bio2zarr/_version.py +16 -0
- bio2zarr/cli.py +229 -0
- bio2zarr/core.py +235 -0
- bio2zarr/plink.py +198 -0
- bio2zarr/provenance.py +7 -0
- bio2zarr/typing.py +4 -0
- bio2zarr/vcf.py +1802 -0
- bio2zarr/vcf_utils.py +513 -0
- bio2zarr-0.0.1.dist-info/LICENSE +201 -0
- bio2zarr-0.0.1.dist-info/METADATA +33 -0
- bio2zarr-0.0.1.dist-info/RECORD +16 -0
- bio2zarr-0.0.1.dist-info/WHEEL +5 -0
- bio2zarr-0.0.1.dist-info/entry_points.txt +4 -0
- bio2zarr-0.0.1.dist-info/top_level.txt +1 -0
bio2zarr/__init__.py
ADDED
@@ -0,0 +1 @@
+from .provenance import __version__
bio2zarr/__main__.py
ADDED
@@ -0,0 +1,20 @@
+import click
+
+from . import cli
+
+@cli.version
+@click.group()
+def bio2zarr():
+    pass
+
+# Provide a single top-level interface to all of the functionality.
+# This probably isn't the recommended way of interacting, as we
+# install individual commands as console scripts. However, this
+# is handy for development and for those whose PATHs aren't set
+# up in the right way.
+bio2zarr.add_command(cli.vcf2zarr)
+bio2zarr.add_command(cli.plink2zarr)
+bio2zarr.add_command(cli.vcf_partition)
+
+if __name__ == "__main__":
+    bio2zarr()
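The group above simply aggregates the sub-commands defined in cli.py so that everything is reachable from a single `python -m bio2zarr` entry point. Purely as an illustrative sketch (not part of the package), the combined group can be exercised in-process with click's standard testing runner:

# Hypothetical smoke test for the combined CLI group; not shipped in the wheel.
from click.testing import CliRunner

from bio2zarr.__main__ import bio2zarr

runner = CliRunner()
# The --version flag is attached by the @cli.version decorator above.
result = runner.invoke(bio2zarr, ["--version"])
print(result.output)
# Sub-groups registered with add_command are reachable as nested commands.
result = runner.invoke(bio2zarr, ["vcf2zarr", "--help"])
print(result.output)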
bio2zarr/_version.py
ADDED
@@ -0,0 +1,16 @@
+# file generated by setuptools_scm
+# don't change, don't track in version control
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple, Union
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+else:
+    VERSION_TUPLE = object
+
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+
+__version__ = version = '0.0.1'
+__version_tuple__ = version_tuple = (0, 0, 1)
bio2zarr/cli.py
ADDED
@@ -0,0 +1,229 @@
+import click
+import tabulate
+import coloredlogs
+
+from . import vcf
+from . import vcf_utils
+from . import plink
+from . import provenance
+
+# Common arguments/options
+verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
+
+worker_processes = click.option(
+    "-p", "--worker-processes", type=int, default=1, help="Number of worker processes"
+)
+
+# TODO help text
+chunk_length = click.option(
+    "-l",
+    "--chunk-length",
+    type=int,
+    default=None,
+    help="Chunk size in the variants dimension",
+)
+
+chunk_width = click.option(
+    "-w",
+    "--chunk-width",
+    type=int,
+    default=None,
+    help="Chunk size in the samples dimension",
+)
+
+version = click.version_option(version=f"bio2zarr {provenance.__version__}")
+
+
+# Note: logging hasn't been implemented in the code at all, this is just
+# a first pass to try out some ways of doing things to see what works.
+def setup_logging(verbosity):
+    level = "WARNING"
+    if verbosity == 1:
+        level = "INFO"
+    elif verbosity >= 2:
+        level = "DEBUG"
+    # NOTE: I'm not that excited about coloredlogs, just trying it out
+    # as it is installed by cyvcf2 anyway. We will have some complicated
+    # stuff going on with threads and processes, so logs might not work
+    # so well anyway.
+    coloredlogs.install(level=level)
+
+
+@click.command
+@click.argument("vcfs", nargs=-1, required=True)
+@click.argument("out_path", type=click.Path())
+@verbose
+@worker_processes
+@click.option("-c", "--column-chunk-size", type=int, default=64)
+def explode(vcfs, out_path, verbose, worker_processes, column_chunk_size):
+    """
+    Convert VCF(s) to columnar intermediate format
+    """
+    setup_logging(verbose)
+    vcf.explode(
+        vcfs,
+        out_path,
+        worker_processes=worker_processes,
+        column_chunk_size=column_chunk_size,
+        show_progress=True,
+    )
+
+
+@click.command
+@click.argument("if_path", type=click.Path())
+@verbose
+def inspect(if_path, verbose):
+    """
+    Inspect an intermediate format file
+    """
+    setup_logging(verbose)
+    data = vcf.inspect(if_path)
+    click.echo(tabulate.tabulate(data, headers="keys"))
+
+
+@click.command
+@click.argument("if_path", type=click.Path())
+def mkschema(if_path):
+    """
+    Generate a schema for zarr encoding
+    """
+    stream = click.get_text_stream("stdout")
+    vcf.mkschema(if_path, stream)
+
+
+@click.command
+@click.argument("if_path", type=click.Path())
+@click.argument("zarr_path", type=click.Path())
+@verbose
+@click.option("-s", "--schema", default=None, type=click.Path(exists=True))
+@chunk_length
+@chunk_width
+@click.option(
+    "-V",
+    "--max-variant-chunks",
+    type=int,
+    default=None,
+    help=(
+        "Truncate the output in the variants dimension to have "
+        "this number of chunks. Mainly intended to help with "
+        "schema tuning."
+    ),
+)
+@worker_processes
+def encode(
+    if_path,
+    zarr_path,
+    verbose,
+    schema,
+    chunk_length,
+    chunk_width,
+    max_variant_chunks,
+    worker_processes,
+):
+    """
+    Encode intermediate format (see explode) to vcfzarr
+    """
+    setup_logging(verbose)
+    vcf.encode(
+        if_path,
+        zarr_path,
+        schema,
+        chunk_length=chunk_length,
+        chunk_width=chunk_width,
+        max_v_chunks=max_variant_chunks,
+        worker_processes=worker_processes,
+        show_progress=True,
+    )
+
+
+@click.command(name="convert")
+@click.argument("vcfs", nargs=-1, required=True)
+@click.argument("out_path", type=click.Path())
+@chunk_length
+@chunk_width
+@verbose
+@worker_processes
+def convert_vcf(vcfs, out_path, chunk_length, chunk_width, verbose, worker_processes):
+    """
+    Convert input VCF(s) directly to vcfzarr (not recommended for large files)
+    """
+    setup_logging(verbose)
+    vcf.convert(
+        vcfs,
+        out_path,
+        chunk_length=chunk_length,
+        chunk_width=chunk_width,
+        show_progress=True,
+        worker_processes=worker_processes,
+    )
+
+
+@click.command
+@click.argument("vcfs", nargs=-1, required=True)
+@click.argument("out_path", type=click.Path())
+def validate(vcfs, out_path):
+    """
+    Development only, do not use. Will be removed before release.
+    """
+    # FIXME! Will silently not look at remaining VCFs
+    vcf.validate(vcfs[0], out_path, show_progress=True)
+
+
+@version
+@click.group()
+def vcf2zarr():
+    pass
+
+
+# TODO figure out how to get click to list these in the given order.
+vcf2zarr.add_command(explode)
+vcf2zarr.add_command(inspect)
+vcf2zarr.add_command(mkschema)
+vcf2zarr.add_command(encode)
+vcf2zarr.add_command(convert_vcf)
+vcf2zarr.add_command(validate)
+
+
+@click.command(name="convert")
+@click.argument("in_path", type=click.Path())
+@click.argument("out_path", type=click.Path())
+@worker_processes
+@verbose
+@chunk_length
+@chunk_width
+def convert_plink(
+    in_path, out_path, verbose, worker_processes, chunk_length, chunk_width
+):
+    """
+    In development; DO NOT USE!
+    """
+    setup_logging(verbose)
+    plink.convert(
+        in_path,
+        out_path,
+        show_progress=True,
+        worker_processes=worker_processes,
+        chunk_width=chunk_width,
+        chunk_length=chunk_length,
+    )
+
+
+@version
+@click.group()
+def plink2zarr():
+    pass
+
+
+plink2zarr.add_command(convert_plink)
+
+
+@click.command
+@version
+@click.argument("vcf_path", type=click.Path())
+@click.option("-i", "--index", type=click.Path(), default=None)
+@click.option("-n", "--num-parts", type=int, default=None)
+# @click.option("-s", "--part-size", type=int, default=None)
+def vcf_partition(vcf_path, index, num_parts):
+    indexed_vcf = vcf_utils.IndexedVcf(vcf_path, index)
+    regions = indexed_vcf.partition_into_regions(num_parts=num_parts)
+    click.echo("\n".join(map(str, regions)))
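The vcf2zarr commands above wrap a two-step workflow: explode converts the VCF(s) into the columnar intermediate format, and encode turns that intermediate format into a vcfzarr store, optionally guided by a schema produced with mkschema. A minimal sketch of the same workflow through the Python functions the commands call (the file paths are hypothetical; the keyword arguments simply mirror the calls made in cli.py above):

from bio2zarr import vcf

# Hypothetical input and output paths.
vcfs = ["chr20.vcf.gz"]
icf_path = "chr20.icf"    # columnar intermediate format directory
zarr_path = "chr20.vcz"   # final vcfzarr output

# Step 1: the same call the `explode` command makes.
vcf.explode(
    vcfs,
    icf_path,
    worker_processes=4,
    column_chunk_size=64,
    show_progress=True,
)

# Step 2: the same call the `encode` command makes, with no explicit schema.
vcf.encode(
    icf_path,
    zarr_path,
    None,
    chunk_length=None,
    chunk_width=None,
    max_v_chunks=None,
    worker_processes=4,
    show_progress=True,
)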
bio2zarr/core.py
ADDED
@@ -0,0 +1,235 @@
+import dataclasses
+import contextlib
+import concurrent.futures as cf
+import multiprocessing
+import threading
+import logging
+import time
+
+import zarr
+import numpy as np
+import tqdm
+import numcodecs
+
+
+logger = logging.getLogger(__name__)
+
+numcodecs.blosc.use_threads = False
+
+# TODO this should probably go in another module where we abstract
+# out the zarr defaults
+default_compressor = numcodecs.Blosc(
+    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
+)
+
+
+def chunk_aligned_slices(z, n, max_chunks=None):
+    """
+    Returns at most n slices in the specified zarr array, aligned
+    with its chunks
+    """
+    chunk_size = z.chunks[0]
+    num_chunks = int(np.ceil(z.shape[0] / chunk_size))
+    if max_chunks is not None:
+        num_chunks = min(num_chunks, max_chunks)
+    slices = []
+    splits = np.array_split(np.arange(num_chunks), min(n, num_chunks))
+    for split in splits:
+        start = split[0] * chunk_size
+        stop = (split[-1] + 1) * chunk_size
+        stop = min(stop, z.shape[0])
+        slices.append((start, stop))
+    return slices
+
+
+class SynchronousExecutor(cf.Executor):
+    def submit(self, fn, /, *args, **kwargs):
+        future = cf.Future()
+        future.set_result(fn(*args, **kwargs))
+        return future
+
+
+def wait_on_futures(futures):
+    for future in cf.as_completed(futures):
+        exception = future.exception()
+        if exception is not None:
+            raise exception
+
+
+def cancel_futures(futures):
+    for future in futures:
+        future.cancel()
+
+
+@dataclasses.dataclass
+class BufferedArray:
+    array: zarr.Array
+    array_offset: int
+    buff: np.ndarray
+    buffer_row: int
+
+    def __init__(self, array, offset):
+        self.array = array
+        self.array_offset = offset
+        assert offset % array.chunks[0] == 0
+        dims = list(array.shape)
+        dims[0] = min(array.chunks[0], array.shape[0])
+        self.buff = np.zeros(dims, dtype=array.dtype)
+        self.buffer_row = 0
+
+    @property
+    def chunk_length(self):
+        return self.buff.shape[0]
+
+    def next_buffer_row(self):
+        if self.buffer_row == self.chunk_length:
+            self.flush()
+        row = self.buffer_row
+        self.buffer_row += 1
+        return row
+
+    def flush(self):
+        if self.buffer_row != 0:
+            if len(self.array.chunks) <= 1:
+                sync_flush_1d_array(
+                    self.buff[: self.buffer_row], self.array, self.array_offset
+                )
+            else:
+                sync_flush_2d_array(
+                    self.buff[: self.buffer_row], self.array, self.array_offset
+                )
+            logger.debug(
+                f"Flushed <{self.array.name} {self.array.shape} "
+                f"{self.array.dtype}> "
+                f"{self.array_offset}:{self.array_offset + self.buffer_row}"
+                f"{self.buff.nbytes / 2**20: .2f}Mb"
+            )
+            self.array_offset += self.chunk_length
+            self.buffer_row = 0
+
+
+def sync_flush_1d_array(np_buffer, zarr_array, offset):
+    zarr_array[offset : offset + np_buffer.shape[0]] = np_buffer
+    update_progress(1)
+
+
+def sync_flush_2d_array(np_buffer, zarr_array, offset):
+    # Write chunks in the second dimension 1-by-1 to make progress more
+    # incremental, and to avoid large memcopies in the underlying
+    # encoder implementations.
+    s = slice(offset, offset + np_buffer.shape[0])
+    chunk_width = zarr_array.chunks[1]
+    zarr_array_width = zarr_array.shape[1]
+    start = 0
+    while start < zarr_array_width:
+        stop = min(start + chunk_width, zarr_array_width)
+        zarr_array[s, start:stop] = np_buffer[:, start:stop]
+        update_progress(1)
+        start = stop
+
+
+@dataclasses.dataclass
+class ProgressConfig:
+    total: int = 0
+    units: str = ""
+    title: str = ""
+    show: bool = False
+    poll_interval: float = 0.01
+
+
+# NOTE: this approach means that we cannot have more than one
+# progressable thing happening per source process. This is
+# probably fine in practice, but there could be corner cases
+# where it's not. Something to watch out for.
+_progress_counter = multiprocessing.Value("Q", 0)
+
+
+def update_progress(inc):
+    with _progress_counter.get_lock():
+        _progress_counter.value += inc
+
+
+def get_progress():
+    with _progress_counter.get_lock():
+        val = _progress_counter.value
+    return val
+
+
+def set_progress(value):
+    with _progress_counter.get_lock():
+        _progress_counter.value = value
+
+
+class ParallelWorkManager(contextlib.AbstractContextManager):
+    def __init__(self, worker_processes=1, progress_config=None):
+        if worker_processes <= 0:
+            # NOTE: this is only for testing, not for production use!
+            self.executor = SynchronousExecutor()
+        else:
+            self.executor = cf.ProcessPoolExecutor(
+                max_workers=worker_processes,
+            )
+        self.futures = []
+
+        set_progress(0)
+        if progress_config is None:
+            progress_config = ProgressConfig()
+        self.progress_config = progress_config
+        self.progress_bar = tqdm.tqdm(
+            total=progress_config.total,
+            desc=f"{progress_config.title:>9}",
+            unit_scale=True,
+            unit=progress_config.units,
+            smoothing=0.1,
+            disable=not progress_config.show,
+        )
+        self.completed = False
+        self.completed_lock = threading.Lock()
+        self.progress_thread = threading.Thread(
+            target=self._update_progress_worker,
+            name="progress-update",
+            daemon=True,  # Avoids deadlock on exit in awkward error conditions
+        )
+        self.progress_thread.start()
+
+    def _update_progress(self):
+        current = get_progress()
+        inc = current - self.progress_bar.n
+        # print("UPDATE PROGRESS: current = ", current, self.progress_config.total, inc)
+        self.progress_bar.update(inc)
+
+    def _update_progress_worker(self):
+        completed = False
+        while not completed:
+            self._update_progress()
+            time.sleep(self.progress_config.poll_interval)
+            with self.completed_lock:
+                completed = self.completed
+        logger.debug("Exit progress thread")
+
+    def submit(self, *args, **kwargs):
+        self.futures.append(self.executor.submit(*args, **kwargs))
+
+    def results_as_completed(self):
+        for future in cf.as_completed(self.futures):
+            yield future.result()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is None:
+            wait_on_futures(self.futures)
+        else:
+            cancel_futures(self.futures)
+        # There's probably a much cleaner way of doing this with a Condition
+        # or something, but this seems to work OK for now. This setup might
+        # make small conversions a bit laggy as we wait on the sleep interval
+        # though.
+        with self.completed_lock:
+            self.completed = True
+        self.executor.shutdown(wait=False)
+        # FIXME there's currently something weird happening at the end of
+        # Encode 1D for 1kg-p3. The progress bar disappears, like we're
+        # setting a total of zero or something.
+        self.progress_thread.join()
+        self._update_progress()
+        self.progress_bar.close()
+        return False
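core.py provides the shared machinery used by both converters: chunk_aligned_slices partitions an output array into chunk-aligned (start, stop) ranges, BufferedArray accumulates one chunk's worth of rows before flushing them to zarr (bumping the shared progress counter), and ParallelWorkManager runs the per-slice work while a background thread polls that counter into a tqdm bar. A toy sketch of driving this machinery directly (the path and fill function are hypothetical; worker_processes=0 selects the synchronous executor, which is intended for testing):

import numpy as np
import zarr

from bio2zarr import core

def fill_slice(zarr_path, start, stop):
    # Toy worker: writes the row index into each row of a chunk-aligned slice.
    z = zarr.open(zarr_path, mode="r+")
    ba = core.BufferedArray(z, start)
    for i in range(start, stop):
        j = ba.next_buffer_row()
        ba.buff[j] = i
    ba.flush()

zarr_path = "/tmp/demo.zarr"  # hypothetical output path
z = zarr.open(zarr_path, mode="w", shape=(100,), chunks=(10,), dtype="i4")
slices = core.chunk_aligned_slices(z, n=4)  # chunk-aligned (start, stop) pairs
config = core.ProgressConfig(total=z.nchunks, title="Demo", units="chunks", show=True)
with core.ParallelWorkManager(worker_processes=0, progress_config=config) as pwm:
    for start, stop in slices:
        pwm.submit(fill_slice, zarr_path, start, stop)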
bio2zarr/plink.py
ADDED
@@ -0,0 +1,198 @@
+import logging
+
+import humanfriendly
+import numpy as np
+import zarr
+import bed_reader
+
+from . import core
+
+
+logger = logging.getLogger(__name__)
+
+
+def encode_genotypes_slice(bed_path, zarr_path, start, stop):
+    # We need to count the A2 alleles here if we want to keep the
+    # alleles reported as allele_1, allele_2. It's obvious here what
+    # the correct approach is, but it is important to note that the
+    # 0th allele is *not* necessarily the REF for these datasets.
+    bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
+    store = zarr.DirectoryStore(zarr_path)
+    root = zarr.group(store=store)
+    gt = core.BufferedArray(root["call_genotype"], start)
+    gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
+    gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
+    chunk_length = gt.array.chunks[0]
+    n = gt.array.shape[1]
+    assert start % chunk_length == 0
+
+    logger.debug(f"Reading slice {start}:{stop}")
+    chunk_start = start
+    while chunk_start < stop:
+        chunk_stop = min(chunk_start + chunk_length, stop)
+        logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
+        bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
+        logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
+        # Probably should do this without iterating over rows, but it's a bit
+        # simpler and lines up better with the array buffering API. The bottleneck
+        # is in the encoding anyway.
+        for values in bed_chunk:
+            j = gt.next_buffer_row()
+            g = np.zeros_like(gt.buff[j])
+            g[values == -127] = -1
+            g[values == 2] = 1
+            g[values == 1, 0] = 1
+            gt.buff[j] = g
+            j = gt_phased.next_buffer_row()
+            gt_phased.buff[j] = False
+            j = gt_mask.next_buffer_row()
+            gt_mask.buff[j] = gt.buff[j] == -1
+        chunk_start = chunk_stop
+    gt.flush()
+    gt_phased.flush()
+    gt_mask.flush()
+    logger.debug(f"GT slice {start}:{stop} done")
+
+
+def convert(
+    bed_path,
+    zarr_path,
+    *,
+    show_progress=False,
+    worker_processes=1,
+    chunk_length=None,
+    chunk_width=None,
+):
+    bed = bed_reader.open_bed(bed_path, num_threads=1)
+    n = bed.iid_count
+    m = bed.sid_count
+    logging.info(f"Scanned plink with {n} samples and {m} variants")
+
+    # FIXME
+    if chunk_width is None:
+        chunk_width = 1000
+    if chunk_length is None:
+        chunk_length = 10_000
+
+    store = zarr.DirectoryStore(zarr_path)
+    root = zarr.group(store=store, overwrite=True)
+
+    ploidy = 2
+    shape = [m, n]
+    chunks = [chunk_length, chunk_width]
+    dimensions = ["variants", "samples"]
+
+    a = root.array(
+        "sample_id",
+        bed.iid,
+        dtype="str",
+        compressor=core.default_compressor,
+        chunks=(chunk_width,),
+    )
+    a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
+    logger.debug(f"Encoded samples")
+
+    # TODO encode these in slices - but read them in one go to avoid
+    # fetching repeatedly from bim file
+    a = root.array(
+        "variant_position",
+        bed.bp_position,
+        dtype=np.int32,
+        compressor=core.default_compressor,
+        chunks=(chunk_length,),
+    )
+    a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
+    logger.debug(f"encoded variant_position")
+
+    alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
+    a = root.array(
+        "variant_allele",
+        alleles,
+        dtype="str",
+        compressor=core.default_compressor,
+        chunks=(chunk_length,),
+    )
+    a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
+    logger.debug(f"encoded variant_allele")
+
+    # TODO remove this?
+    a = root.empty(
+        "call_genotype_phased",
+        dtype="bool",
+        shape=list(shape),
+        chunks=list(chunks),
+        compressor=core.default_compressor,
+    )
+    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
+
+    shape += [ploidy]
+    dimensions += ["ploidy"]
+    a = root.empty(
+        "call_genotype",
+        dtype="i1",
+        shape=list(shape),
+        chunks=list(chunks),
+        compressor=core.default_compressor,
+    )
+    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
+
+    a = root.empty(
+        "call_genotype_mask",
+        dtype="bool",
+        shape=list(shape),
+        chunks=list(chunks),
+        compressor=core.default_compressor,
+    )
+    a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
+
+    del bed
+
+    num_slices = max(1, worker_processes * 4)
+    slices = core.chunk_aligned_slices(a, num_slices)
+
+    total_chunks = sum(a.nchunks for a in root.values())
+
+    progress_config = core.ProgressConfig(
+        total=total_chunks, title="Convert", units="chunks", show=show_progress
+    )
+    with core.ParallelWorkManager(worker_processes, progress_config) as pwm:
+        for start, stop in slices:
+            pwm.submit(encode_genotypes_slice, bed_path, zarr_path, start, stop)
+
+    # TODO also add atomic swap like VCF. Should be abstracted to
+    # share basic code for setting up the variation dataset zarr
+    zarr.consolidate_metadata(zarr_path)
+
+
+# FIXME do this more efficiently - currently reading the whole thing
+# in for convenience, and also comparing call-by-call
+def validate(bed_path, zarr_path):
+    store = zarr.DirectoryStore(zarr_path)
+    root = zarr.group(store=store)
+    call_genotype = root["call_genotype"][:]
+
+    bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)
+
+    assert call_genotype.shape[0] == bed.sid_count
+    assert call_genotype.shape[1] == bed.iid_count
+    bed_genotypes = bed.read(dtype="int8").T
+    assert call_genotype.shape[0] == bed_genotypes.shape[0]
+    assert call_genotype.shape[1] == bed_genotypes.shape[1]
+    assert call_genotype.shape[2] == 2
+
+    row_id = 0
+    for bed_row, zarr_row in zip(bed_genotypes, call_genotype):
+        # print("ROW", row_id)
+        # print(bed_row, zarr_row)
+        row_id += 1
+        for bed_call, zarr_call in zip(bed_row, zarr_row):
+            if bed_call == -127:
+                assert list(zarr_call) == [-1, -1]
+            elif bed_call == 0:
+                assert list(zarr_call) == [0, 0]
+            elif bed_call == 1:
+                assert list(zarr_call) == [1, 0]
+            elif bed_call == 2:
+                assert list(zarr_call) == [1, 1]
+            else:  # pragma no cover
+                assert False
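convert sets up the output zarr store (sample and variant metadata plus the empty genotype arrays) and then fills the genotype matrix in chunk-aligned slices via encode_genotypes_slice, consolidating the metadata at the end. A sketch of calling it directly (the .bed path is hypothetical; the matching .bim/.fam files are located by bed_reader from the same prefix):

from bio2zarr import plink

# Hypothetical paths: a standard plink fileset (example.bed/.bim/.fam).
plink.convert(
    "example.bed",
    "example.plink.zarr",
    worker_processes=4,
    chunk_length=10_000,  # variants per chunk (the default used above)
    chunk_width=1_000,    # samples per chunk (the default used above)
    show_progress=True,
)

# Optional sanity check that reads everything back call-by-call.
plink.validate("example.bed", "example.plink.zarr")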