PyPI - bio2zarr - Versions diffs - 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl - Mend

bio2zarr 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of bio2zarr might be problematic. Click here for more details.

Files changed (11) hide show

bio2zarr/_version.py +2 -2
bio2zarr/cli.py +39 -7
bio2zarr/core.py +2 -1
bio2zarr/vcf.py +83 -50
{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/METADATA +2 -2
bio2zarr-0.0.4.dist-info/RECORD +16 -0
bio2zarr-0.0.3.dist-info/RECORD +0 -16
{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/LICENSE +0 -0
{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/WHEEL +0 -0
{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/entry_points.txt +0 -0
{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/top_level.txt +0 -0

bio2zarr/_version.py CHANGED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.0.3'
-__version_tuple__ = version_tuple = (0, 0, 3)
+__version__ = version = '0.0.4'
+__version_tuple__ = version_tuple = (0, 0, 4)

bio2zarr/cli.py CHANGED Viewed

@@ -6,6 +6,7 @@ import shutil
 import click
 import tabulate
 import coloredlogs
+import numcodecs
 from . import vcf
 from . import vcf_utils
@@ -66,6 +67,17 @@ column_chunk_size = click.option(
     help="Approximate uncompressed size of exploded column chunks in MiB",
 )
+# We could provide the full flexiblity of numcodecs/Blosc here, but there
+# doesn't seem much point. Can always add more arguments here to control
+# compression level, etc.
+compressor = click.option(
+    "-C",
+    "--compressor",
+    type=click.Choice(["lz4", "zstd"]),
+    default=None,
+    help="Codec to use for compressing column chunks (Default=zstd)."
+)
 # Note: -l and -w were chosen when these were called "width" and "length".
 # possibly there are better letters now.
 variants_chunk_size = click.option(
@@ -113,24 +125,36 @@ def check_overwrite_dir(path, force):
         shutil.rmtree(tmp_delete_path)
+def get_compressor(cname):
+    if cname is None:
+        return None
+    config = vcf.ICF_DEFAULT_COMPRESSOR.get_config()
+    config["cname"] = cname
+    return numcodecs.get_codec(config)
 @click.command
 @vcfs
 @new_icf_path
 @force
 @verbose
-@worker_processes
 @column_chunk_size
-def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size):
+@compressor
+@worker_processes
+def explode(
+    vcfs, icf_path, force, verbose, column_chunk_size, compressor, worker_processes
+):
     """
     Convert VCF(s) to intermediate columnar format
     """
     setup_logging(verbose)
     check_overwrite_dir(icf_path, force)
     vcf.explode(
-        vcfs,
         icf_path,
+        vcfs,
         worker_processes=worker_processes,
         column_chunk_size=column_chunk_size,
+        compressor=get_compressor(compressor),
         show_progress=True,
     )
@@ -141,10 +165,18 @@ def explode(vcfs, icf_path, force, verbose, worker_processes, column_chunk_size)
 @click.argument("num_partitions", type=click.IntRange(min=1))
 @force
 @column_chunk_size
+@compressor
 @verbose
 @worker_processes
 def dexplode_init(
-    vcfs, icf_path, num_partitions, force, column_chunk_size, verbose, worker_processes
+    vcfs,
+    icf_path,
+    num_partitions,
+    force,
+    column_chunk_size,
+    compressor,
+    verbose,
+    worker_processes,
 ):
     """
     Initial step for distributed conversion of VCF(s) to intermediate columnar format
@@ -158,6 +190,7 @@ def dexplode_init(
         target_num_partitions=num_partitions,
         column_chunk_size=column_chunk_size,
         worker_processes=worker_processes,
+        compressor=get_compressor(compressor),
         show_progress=True,
     )
     click.echo(num_partitions)
@@ -232,9 +265,8 @@ def mkschema(icf_path):
 @click.option(
     "-M",
     "--max-memory",
-    type=int,
     default=None,
-    help="An approximate bound on overall memory usage in megabytes",
+    help="An approximate bound on overall memory usage (e.g. 10G),",
 )
 @worker_processes
 def encode(
@@ -250,7 +282,7 @@ def encode(
     worker_processes,
 ):
     """
-    Encode intermediate columnar format (see explode) to vcfzarr.
+    Convert intermediate columnar format to vcfzarr.
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)

bio2zarr/core.py CHANGED Viewed

@@ -50,7 +50,8 @@ def wait_on_futures(futures):
             cancel_futures(futures)
             if isinstance(exception, cf.process.BrokenProcessPool):
                 raise RuntimeError(
-                    "Worker process died: you may have run out of memory") from exception
+                    "Worker process died: you may have run out of memory"
+                ) from exception
             else:
                 raise exception

bio2zarr/vcf.py CHANGED Viewed

@@ -151,8 +151,8 @@ class VcfPartition:
 ICF_METADATA_FORMAT_VERSION = "0.2"
 ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
-    cname="lz4", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
-).get_config()
+    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
+)
 @dataclasses.dataclass
@@ -284,9 +284,7 @@ def scan_vcf(path, target_num_partitions):
         return metadata, vcf.raw_header
-def scan_vcfs(
-    paths, show_progress, target_num_partitions, column_chunk_size, worker_processes=1
-):
+def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     logger.info(
         f"Scanning {len(paths)} VCFs attempting to split into {target_num_partitions} partitions."
     )
@@ -334,12 +332,6 @@ def scan_vcfs(
         key=lambda x: (contig_index_map[x.region.contig], x.region.start)
     )
     icf_metadata.partitions = all_partitions
-    icf_metadata.format_version = ICF_METADATA_FORMAT_VERSION
-    icf_metadata.compressor = ICF_DEFAULT_COMPRESSOR
-    icf_metadata.column_chunk_size = column_chunk_size
-    # Bare minimum here for provenance - would be nice to include versions of key
-    # dependencies as well.
-    icf_metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
     logger.info(f"Scan complete, resulting in {len(all_partitions)} partitions.")
     return icf_metadata, header
@@ -824,13 +816,7 @@ class IcfPartitionWriter(contextlib.AbstractContextManager):
         return False
-# TODO rename to IntermediateColumnarFormat and move to icf.py
 class IntermediateColumnarFormat(collections.abc.Mapping):
-    # TODO Check if other compressors would give reasonable compression
-    # with significantly faster times
     def __init__(self, path):
         self.path = pathlib.Path(path)
         # TODO raise a more informative error here telling people this
@@ -904,6 +890,15 @@ class IntermediateColumnarFormat(collections.abc.Mapping):
         return len(self.columns)
+def mkdir_with_progress(path):
+    logger.debug(f"mkdir f{path}")
+    # NOTE we may have race-conditions here, I'm not sure. Hopefully allowing
+    # parents=True will take care of it.
+    path.mkdir(parents=True)
+    core.update_progress(1)
 class IntermediateColumnarFormatWriter:
     def __init__(self, path):
         self.path = pathlib.Path(path)
@@ -922,9 +917,12 @@ class IntermediateColumnarFormatWriter:
         worker_processes=1,
         target_num_partitions=None,
         show_progress=False,
+        compressor=None,
     ):
         if self.path.exists():
-            shutil.rmtree(self.path)
+            raise ValueError("ICF path already exists")
+        if compressor is None:
+            compressor = ICF_DEFAULT_COMPRESSOR
         vcfs = [pathlib.Path(vcf) for vcf in vcfs]
         target_num_partitions = max(target_num_partitions, len(vcfs))
@@ -934,14 +932,19 @@ class IntermediateColumnarFormatWriter:
             worker_processes=worker_processes,
             show_progress=show_progress,
             target_num_partitions=target_num_partitions,
-            column_chunk_size=column_chunk_size,
         )
         self.metadata = icf_metadata
+        self.metadata.format_version = ICF_METADATA_FORMAT_VERSION
+        self.metadata.compressor = compressor.get_config()
+        self.metadata.column_chunk_size = column_chunk_size
+        # Bare minimum here for provenance - would be nice to include versions of key
+        # dependencies as well.
+        self.metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
-        self.mkdirs()
+        self.mkdirs(worker_processes, show_progress=show_progress)
         # Note: this is needed for the current version of the vcfzarr spec, but it's
-        # probably goint to be dropped.
+        # probably going to be dropped.
         # https://github.com/pystatgen/vcf-zarr-spec/issues/15
         # May be useful to keep lying around still though?
         logger.info(f"Writing VCF header")
@@ -953,20 +956,30 @@ class IntermediateColumnarFormatWriter:
             json.dump(self.metadata.asdict(), f, indent=4)
         return self.num_partitions
-    def mkdirs(self):
-        # TODO add worker_processes here and do this with the ParallelWorkManager
-        logger.info(
-            f"Creating {len(self.metadata.fields) * self.num_partitions} directories"
-        )
+    def mkdirs(self, worker_processes=1, show_progress=False):
+        num_dirs = len(self.metadata.fields) * self.num_partitions
+        logger.info(f"Creating {num_dirs} directories")
         self.path.mkdir()
         self.wip_path.mkdir()
-        for field in self.metadata.fields:
-            col_path = get_vcf_field_path(self.path, field)
-            logger.debug(f"Make directories for {field.full_name} at {col_path}")
-            col_path.mkdir(parents=True)
-            for j in range(self.num_partitions):
-                part_path = col_path / f"p{j}"
-                part_path.mkdir()
+        # Due to high latency batch system filesystems, we create all the directories in
+        # parallel
+        progress_config = core.ProgressConfig(
+            total=num_dirs,
+            units="dirs",
+            title="Mkdirs",
+            show=show_progress,
+        )
+        with core.ParallelWorkManager(
+            worker_processes=worker_processes, progress_config=progress_config
+        ) as manager:
+            for field in self.metadata.fields:
+                col_path = get_vcf_field_path(self.path, field)
+                # Don't bother trying to count the intermediate directories towards
+                # progress
+                manager.submit(col_path.mkdir, parents=True)
+                for j in range(self.num_partitions):
+                    part_path = col_path / f"p{j}"
+                    manager.submit(mkdir_with_progress, part_path)
     def load_partition_summaries(self):
         summaries = []
@@ -1133,12 +1146,13 @@ class IntermediateColumnarFormatWriter:
 def explode(
-    vcfs,
     icf_path,
+    vcfs,
     *,
     column_chunk_size=16,
     worker_processes=1,
     show_progress=False,
+    compressor=None,
 ):
     writer = IntermediateColumnarFormatWriter(icf_path)
     num_partitions = writer.init(
@@ -1148,6 +1162,7 @@ def explode(
         worker_processes=worker_processes,
         show_progress=show_progress,
         column_chunk_size=column_chunk_size,
+        compressor=compressor,
     )
     writer.explode(worker_processes=worker_processes, show_progress=show_progress)
     writer.finalise()
@@ -1162,6 +1177,7 @@ def explode_init(
     target_num_partitions=1,
     worker_processes=1,
     show_progress=False,
+    compressor=None,
 ):
     writer = IntermediateColumnarFormatWriter(icf_path)
     return writer.init(
@@ -1170,6 +1186,7 @@ def explode_init(
         worker_processes=worker_processes,
         show_progress=show_progress,
         column_chunk_size=column_chunk_size,
+        compressor=compressor,
     )
@@ -1480,16 +1497,28 @@ class EncodingWork:
     memory: int = 0
+def parse_max_memory(max_memory):
+    if max_memory is None:
+        # Effectively unbounded
+        return 2**63
+    if isinstance(max_memory, str):
+        max_memory = humanfriendly.parse_size(max_memory)
+    logger.info(f"Set memory budget to {display_size(max_memory)}")
+    return max_memory
 class VcfZarrWriter:
-    def __init__(self, path, icf, schema):
+    def __init__(self, path, icf, schema, dimension_separator=None):
         self.path = pathlib.Path(path)
         self.icf = icf
         self.schema = schema
+        # Default to using nested directories following the Zarr v3 default.
+        # This seems to require version 2.17+ to work properly
+        self.dimension_separator = "/" if dimension_separator is None else dimension_separator
         store = zarr.DirectoryStore(self.path)
         self.root = zarr.group(store=store)
     def init_array(self, variable):
-        # print("CREATE", variable)
         object_codec = None
         if variable.dtype == "O":
             object_codec = numcodecs.VLenUTF8()
@@ -1501,7 +1530,9 @@ class VcfZarrWriter:
             compressor=numcodecs.get_codec(variable.compressor),
             filters=[numcodecs.get_codec(filt) for filt in variable.filters],
             object_codec=object_codec,
+            dimension_separator=self.dimension_separator,
         )
+        # Dimension names are part of the spec in Zarr v3
         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
     def get_array(self, name):
@@ -1639,6 +1670,7 @@ class VcfZarrWriter:
                 "contig_length",
                 self.schema.contig_length,
                 dtype=np.int64,
+                compressor=DEFAULT_ZARR_COMPRESSOR,
             )
             array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
         return {v: j for j, v in enumerate(self.schema.contig_id)}
@@ -1661,8 +1693,6 @@ class VcfZarrWriter:
             self.init_array(column)
     def finalise(self):
-        # for column in self.schema.columns.values():
-        #     self.finalise_array(column)
         zarr.consolidate_metadata(self.path)
     def encode(
@@ -1672,12 +1702,7 @@ class VcfZarrWriter:
         show_progress=False,
         max_memory=None,
     ):
-        if max_memory is None:
-            # Unbounded
-            max_memory = 2**63
-        else:
-            # Value is specified in Mibibytes
-            max_memory *= 2**20  # NEEDS TEST
+        max_memory = parse_max_memory(max_memory)
         # TODO this will move into the setup logic later when we're making it possible
         # to split the work by slice
@@ -1764,8 +1789,8 @@ class VcfZarrWriter:
         # Fail early if we can't fit a particular column into memory
         for wp in work:
-            if wp.memory >= max_memory:
-                raise ValueError(  # NEEDS TEST
+            if wp.memory > max_memory:
+                raise ValueError(
                     f"Insufficient memory for {wp.columns}: "
                     f"{display_size(wp.memory)} > {display_size(max_memory)}"
                 )
@@ -1778,6 +1803,8 @@ class VcfZarrWriter:
         )
         used_memory = 0
+        # We need to keep some bounds on the queue size or the memory bounds algorithm
+        # below doesn't really work.
         max_queued = 4 * max(1, worker_processes)
         encoded_slices = collections.Counter()
@@ -1804,10 +1831,14 @@ class VcfZarrWriter:
                             self.finalise_array(column)
             for wp in work:
-                if (
+                while (
                     used_memory + wp.memory > max_memory
                     or len(future_to_work) > max_queued
                 ):
+                    logger.debug(
+                        f"Wait: mem_required={used_memory + wp.memory} max_mem={max_memory} "
+                        f"queued={len(future_to_work)} max_queued={max_queued}"
+                    )
                     service_completed_futures()
                 future = pwm.submit(wp.func, wp.start, wp.stop)
                 used_memory += wp.memory
@@ -1832,6 +1863,7 @@ def encode(
     variants_chunk_size=None,
     samples_chunk_size=None,
     max_v_chunks=None,
+    dimension_separator=None,
     max_memory=None,
     worker_processes=1,
     show_progress=False,
@@ -1855,7 +1887,7 @@ def encode(
     if zarr_path.exists():
         logger.warning(f"Deleting existing {zarr_path}")
         shutil.rmtree(zarr_path)
-    vzw = VcfZarrWriter(zarr_path, icf, schema)
+    vzw = VcfZarrWriter(zarr_path, icf, schema, dimension_separator=dimension_separator)
     vzw.init()
     vzw.encode(
         max_v_chunks=max_v_chunks,
@@ -1876,10 +1908,11 @@ def convert(
     show_progress=False,
     # TODO add arguments to control location of tmpdir
 ):
-    with tempfile.TemporaryDirectory(prefix="vcf2zarr_if_") as if_dir:
+    with tempfile.TemporaryDirectory(prefix="vcf2zarr") as tmp:
+        if_dir = pathlib.Path(tmp) / "if"
         explode(
-            vcfs,
             if_dir,
+            vcfs,
             worker_processes=worker_processes,
             show_progress=show_progress,
         )

{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bio2zarr
-Version: 0.0.3
+Version: 0.0.4
 Summary: Convert bioinformatics data to Zarr
 Home-page: https://github.com/pystatgen/bio2zarr
 Author: sgkit Developers
@@ -20,7 +20,7 @@ Requires-Python: >=3.9
 Description-Content-Type: text/x-rst
 License-File: LICENSE
 Requires-Dist: numpy
-Requires-Dist: zarr !=2.11.0,!=2.11.1,!=2.11.2,>=2.10.0
+Requires-Dist: zarr >=2.17
 Requires-Dist: click
 Requires-Dist: tabulate
 Requires-Dist: tqdm

bio2zarr-0.0.4.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,16 @@
+bio2zarr/__init__.py,sha256=yIJYx4GyKtOLOtODOX0kGCeGPYgQ-TBbsRdT1NwBpQQ,37
+bio2zarr/__main__.py,sha256=3cgaQ4x8YKXt-9xC2GLrHnS6UA38y1GXqttwZiBZJg4,525
+bio2zarr/_version.py,sha256=yBVOKdXLEcTVc7YV7ZPqRXhRDRt-pKrfXxcgHkgPY5g,411
+bio2zarr/cli.py,sha256=QE0DfoZHbBbxq9K_im9y4tJ49_Wss0zzavSjjz-85Xw,11484
+bio2zarr/core.py,sha256=tZb9exfFmuzbA8tUpPY8avSm9YvfH31-vUCTM4fpj78,8128
+bio2zarr/plink.py,sha256=llhfP-v44BVPvgCcwXktk0YrKaJSII63U_PTtpHlGtM,6755
+bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
+bio2zarr/typing.py,sha256=wZ99Zzp5BD9Nqpd-S5bn38fSdPzfj6Z9IHPBfZqt9Gs,78
+bio2zarr/vcf.py,sha256=MEskVTDq4QntzoawPz0sfmInV0aPkIPLXXNv7GmVcmY,73870
+bio2zarr/vcf_utils.py,sha256=_kMZdpye15HGpniv8wwISw0L6NEEi54ZFaTcM83wLGs,16751
+bio2zarr-0.0.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+bio2zarr-0.0.4.dist-info/METADATA,sha256=DISckjzZ0b6FpBTfBvpmJmEe00SIdTHyB3UTsTR8rws,1077
+bio2zarr-0.0.4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+bio2zarr-0.0.4.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
+bio2zarr-0.0.4.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
+bio2zarr-0.0.4.dist-info/RECORD,,

bio2zarr-0.0.3.dist-info/RECORD DELETED Viewed

@@ -1,16 +0,0 @@
-bio2zarr/__init__.py,sha256=yIJYx4GyKtOLOtODOX0kGCeGPYgQ-TBbsRdT1NwBpQQ,37
-bio2zarr/__main__.py,sha256=3cgaQ4x8YKXt-9xC2GLrHnS6UA38y1GXqttwZiBZJg4,525
-bio2zarr/_version.py,sha256=hB095avW4HuDZxn8qPHRG1UMzSSonb8ZDAsLxt9hmk8,411
-bio2zarr/cli.py,sha256=N_vEFj730p_TL7Dk9m9T3ceAhVV58BMYRDmBmoeKH7A,10766
-bio2zarr/core.py,sha256=sBlWmHjcb7tAn_7WQRBdrbGcEd_lT_3HTQ_JbzomVMg,8111
-bio2zarr/plink.py,sha256=llhfP-v44BVPvgCcwXktk0YrKaJSII63U_PTtpHlGtM,6755
-bio2zarr/provenance.py,sha256=c_Z__QbWkLS0Rfa8D7LgEhtStng_zRMJX8comaDXIkw,142
-bio2zarr/typing.py,sha256=wZ99Zzp5BD9Nqpd-S5bn38fSdPzfj6Z9IHPBfZqt9Gs,78
-bio2zarr/vcf.py,sha256=g2TqH9Lbp4Ds8kjOnjvHvoMAgnG6Kx8pKPN1bqBKKIQ,72201
-bio2zarr/vcf_utils.py,sha256=_kMZdpye15HGpniv8wwISw0L6NEEi54ZFaTcM83wLGs,16751
-bio2zarr-0.0.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-bio2zarr-0.0.3.dist-info/METADATA,sha256=dc2y5xrnkcvD1qmKGFL5GrsbM1_tiIlAYB2GrAlLunM,1106
-bio2zarr-0.0.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-bio2zarr-0.0.3.dist-info/entry_points.txt,sha256=pklStOdATE5hHJm4qiIvmhHkcn21Si_XAu6MC7ieNrk,131
-bio2zarr-0.0.3.dist-info/top_level.txt,sha256=ouAvp3u9N25eKrQbN8BCDLPcWWQLhtlgdHKu8AtEj5Q,9
-bio2zarr-0.0.3.dist-info/RECORD,,

{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/LICENSE RENAMED Viewed

File without changes

{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/WHEEL RENAMED Viewed

File without changes

{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{bio2zarr-0.0.3.dist-info → bio2zarr-0.0.4.dist-info}/top_level.txt RENAMED Viewed

File without changes

bio2zarr 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

Potentially problematic release.

bio2zarr 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl