bio2zarr 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bio2zarr might be problematic.
- bio2zarr/_version.py +2 -2
- bio2zarr/cli.py +245 -68
- bio2zarr/core.py +36 -19
- bio2zarr/plink.py +25 -19
- bio2zarr/vcf.py +704 -389
- bio2zarr/vcf_utils.py +0 -1
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/METADATA +1 -1
- bio2zarr-0.0.3.dist-info/RECORD +16 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/WHEEL +1 -1
- bio2zarr-0.0.1.dist-info/RECORD +0 -16
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/LICENSE +0 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/entry_points.txt +0 -0
- {bio2zarr-0.0.1.dist-info → bio2zarr-0.0.3.dist-info}/top_level.txt +0 -0
bio2zarr/plink.py
CHANGED
@@ -4,6 +4,7 @@ import humanfriendly
 import numpy as np
 import zarr
 import bed_reader
+import numcodecs
 
 from . import core
 
@@ -22,14 +23,14 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
     gt = core.BufferedArray(root["call_genotype"], start)
     gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
     gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
-
+    variants_chunk_size = gt.array.chunks[0]
     n = gt.array.shape[1]
-    assert start %
+    assert start % variants_chunk_size == 0
 
     logger.debug(f"Reading slice {start}:{stop}")
     chunk_start = start
     while chunk_start < stop:
-        chunk_stop = min(chunk_start +
+        chunk_stop = min(chunk_start + variants_chunk_size, stop)
         logger.debug(f"Reading bed slice {chunk_start}:{chunk_stop}")
         bed_chunk = bed.read(slice(chunk_start, chunk_stop), dtype=np.int8).T
         logger.debug(f"Got bed slice {humanfriendly.format_size(bed_chunk.nbytes)}")
@@ -60,8 +61,8 @@ def convert(
     *,
     show_progress=False,
     worker_processes=1,
-
-
+    variants_chunk_size=None,
+    samples_chunk_size=None,
 ):
     bed = bed_reader.open_bed(bed_path, num_threads=1)
     n = bed.iid_count
@@ -69,25 +70,30 @@ def convert(
     logging.info(f"Scanned plink with {n} samples and {m} variants")
 
     # FIXME
-    if
-
-    if
-
+    if samples_chunk_size is None:
+        samples_chunk_size = 1000
+    if variants_chunk_size is None:
+        variants_chunk_size = 10_000
 
     store = zarr.DirectoryStore(zarr_path)
     root = zarr.group(store=store, overwrite=True)
 
     ploidy = 2
     shape = [m, n]
-    chunks = [
+    chunks = [variants_chunk_size, samples_chunk_size]
     dimensions = ["variants", "samples"]
 
+    # TODO we should be reusing some logic from vcfzarr here on laying
+    # out the basic dataset, and using the schema generator. Currently
+    # we're not using the best Blosc settings for genotypes here.
+    default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
+
     a = root.array(
         "sample_id",
         bed.iid,
         dtype="str",
-        compressor=
-        chunks=(
+        compressor=default_compressor,
+        chunks=(samples_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
     logger.debug(f"Encoded samples")
@@ -98,8 +104,8 @@ def convert(
         "variant_position",
         bed.bp_position,
         dtype=np.int32,
-        compressor=
-        chunks=(
+        compressor=default_compressor,
+        chunks=(variants_chunk_size,),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
     logger.debug(f"encoded variant_position")
@@ -109,8 +115,8 @@ def convert(
         "variant_allele",
         alleles,
         dtype="str",
-        compressor=
-        chunks=(
+        compressor=default_compressor,
+        chunks=(variants_chunk_size,),
    )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
     logger.debug(f"encoded variant_allele")
@@ -121,7 +127,7 @@ def convert(
         dtype="bool",
         shape=list(shape),
         chunks=list(chunks),
-        compressor=
+        compressor=default_compressor,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
@@ -132,7 +138,7 @@ def convert(
         dtype="i1",
         shape=list(shape),
         chunks=list(chunks),
-        compressor=
+        compressor=default_compressor,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
@@ -141,7 +147,7 @@ def convert(
         dtype="bool",
         shape=list(shape),
         chunks=list(chunks),
-        compressor=
+        compressor=default_compressor,
     )
     a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
 
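For reference, a minimal usage sketch of the updated convert() entry point in bio2zarr/plink.py, based only on the signature visible in this diff. The input and output paths are hypothetical, and the chunk sizes shown simply restate the defaults that 0.0.3 applies when the new keyword arguments are left as None.

from bio2zarr import plink

# Hypothetical paths: convert() reads a PLINK .bed fileset and writes a Zarr store.
plink.convert(
    "example.bed",
    "example.zarr",
    show_progress=True,
    worker_processes=4,
    # New in 0.0.3: chunk sizes are named keyword arguments. Leaving them as
    # None falls back to 10_000 variants x 1000 samples, as set in convert().
    variants_chunk_size=10_000,
    samples_chunk_size=1000,
)

Every array created by convert() in this release is written with the shared default_compressor, numcodecs.Blosc(cname="zstd", clevel=7), introduced in the hunks above.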