PyPI - bio2zarr - Versions diffs - 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl - Mend

bio2zarr 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of bio2zarr might be problematic. Click here for more details.

Files changed (19)

bio2zarr/__main__.py +2 -2
bio2zarr/_version.py +2 -2
bio2zarr/cli.py +176 -113
bio2zarr/constants.py +18 -0
bio2zarr/core.py +65 -20
bio2zarr/vcf2zarr/__init__.py +38 -0
bio2zarr/vcf2zarr/icf.py +1221 -0
bio2zarr/vcf2zarr/vcz.py +1053 -0
bio2zarr/vcf2zarr/verification.py +230 -0
bio2zarr/vcf_utils.py +11 -6
{bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/METADATA +10 -123
bio2zarr-0.1.0.dist-info/RECORD +20 -0
bio2zarr-0.1.0.dist-info/entry_points.txt +3 -0
bio2zarr/vcf.py +0 -2445
bio2zarr-0.0.9.dist-info/RECORD +0 -16
bio2zarr-0.0.9.dist-info/entry_points.txt +0 -4
{bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/LICENSE +0 -0
{bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/WHEEL +0 -0
{bio2zarr-0.0.9.dist-info → bio2zarr-0.1.0.dist-info}/top_level.txt +0 -0

bio2zarr/core.py CHANGED Viewed

@@ -1,13 +1,18 @@
 import concurrent.futures as cf
 import contextlib
 import dataclasses
+import json
 import logging
+import math
 import multiprocessing
 import os
 import os.path
+import sys
 import threading
 import time
+import warnings
+import humanfriendly
 import numcodecs
 import numpy as np
 import tqdm
@@ -18,6 +23,17 @@ logger = logging.getLogger(__name__)
 numcodecs.blosc.use_threads = False
+def display_number(x):
+    ret = "n/a"
+    if math.isfinite(x):
+        ret = f"{x: 0.2g}"
+    return ret
+def display_size(n):
+    return humanfriendly.format_size(n, binary=True)
 def min_int_dtype(min_value, max_value):
     if min_value > max_value:
         raise ValueError("min_value must be <= max_value")
@@ -64,6 +80,11 @@ def du(path):
 class SynchronousExecutor(cf.Executor):
+    # Arguably we should use workers=0 as the default and use this
+    # executor implementation. However, the docs are fairly explicit
+    # about saying we shouldn't instantiate Future objects directly,
+    # so it's best to keep this as a semi-secret debugging interface
+    # for now.
     def submit(self, fn, /, *args, **kwargs):
         future = cf.Future()
         future.set_result(fn(*args, **kwargs))
@@ -128,7 +149,6 @@ class BufferedArray:
                 sync_flush_2d_array(
                     self.buff[: self.buffer_row], self.array, self.array_offset
                 )
-            # FIXME the array.name doesn't seem to be working here for some reason
             logger.debug(
                 f"Flushed <{self.array.name} {self.array.shape} "
                 f"{self.array.dtype}> "
@@ -174,12 +194,15 @@ class ProgressConfig:
 # progressable thing happening per source process. This is
 # probably fine in practise, but there could be corner cases
 # where it's not. Something to watch out for.
-_progress_counter = multiprocessing.Value("Q", 0)
+_progress_counter = None
 def update_progress(inc):
-    with _progress_counter.get_lock():
-        _progress_counter.value += inc
+    # If the _progress_counter has not been set we are working in a
+    # synchronous non-progress tracking context
+    if _progress_counter is not None:
+        with _progress_counter.get_lock():
+            _progress_counter.value += inc
 def get_progress():
@@ -188,23 +211,48 @@ def get_progress():
     return val
-def set_progress(value):
-    with _progress_counter.get_lock():
-        _progress_counter.value = value
+def setup_progress_counter(counter):
+    global _progress_counter
+    _progress_counter = counter
+def warn_py39_mac():
+    if sys.platform == "darwin" and sys.version_info[:2] == (3, 9):
+        warnings.warn(
+            "There is a known issue with bio2zarr on MacOS Python 3.9 "
+            "in which OS-level named semaphores are leaked. "
+            "You will also probably see warnings like 'There appear to be N "
+            "leaked semaphore objects at shutdown'. "
+            "While this is likely harmless for a few runs, it could lead to "
+            "issues if you do a lot of conversion. To get prevent this issue "
+            "either: (1) use --worker-processes=0 or (2) upgrade to a newer "
+            "Python version. See https://github.com/sgkit-dev/bio2zarr/issues/209 "
+            "for more details.",
+            stacklevel=2,
+        )
 class ParallelWorkManager(contextlib.AbstractContextManager):
     def __init__(self, worker_processes=1, progress_config=None):
+        # Need to specify this explicitly to suppport Macs and
+        # for future proofing.
+        ctx = multiprocessing.get_context("spawn")
+        global _progress_counter
+        _progress_counter = ctx.Value("Q", 0)
         if worker_processes <= 0:
-            # NOTE: this is only for testing, not for production use!
+            # NOTE: this is only for testing and debugging, not for
+            # production. See note on the SynchronousExecutor class.
             self.executor = SynchronousExecutor()
         else:
+            warn_py39_mac()
             self.executor = cf.ProcessPoolExecutor(
                 max_workers=worker_processes,
+                mp_context=ctx,
+                initializer=setup_progress_counter,
+                initargs=(_progress_counter,),
             )
         self.futures = set()
-        set_progress(0)
         if progress_config is None:
             progress_config = ProgressConfig()
         self.progress_config = progress_config
@@ -228,7 +276,6 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
     def _update_progress(self):
         current = get_progress()
         inc = current - self.progress_bar.n
-        # print("UPDATE PROGRESS: current = ", current, self.progress_config.total, inc)
         self.progress_bar.update(inc)
     def _update_progress_worker(self):
@@ -245,16 +292,6 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         self.futures.add(future)
         return future
-    def wait_for_completed(self, timeout=None):
-        done, not_done = cf.wait(self.futures, timeout, cf.FIRST_COMPLETED)
-        for future in done:
-            exception = future.exception()
-            # TODO do the check for BrokenProcessPool here
-            if exception is not None:
-                raise exception
-        self.futures = not_done
-        return done
     def results_as_completed(self):
         for future in cf.as_completed(self.futures):
             yield future.result()
@@ -278,3 +315,11 @@ class ParallelWorkManager(contextlib.AbstractContextManager):
         self._update_progress()
         self.progress_bar.close()
         return False
+class JsonDataclass:
+    def asdict(self):
+        return dataclasses.asdict(self)
+    def asjson(self):
+        return json.dumps(self.asdict(), indent=4)

bio2zarr/vcf2zarr/__init__.py ADDED Viewed

@@ -0,0 +1,38 @@
+from .icf import (
+    IntermediateColumnarFormat,
+    explode,
+    explode_finalise,
+    explode_init,
+    explode_partition,
+)
+from .vcz import (
+    VcfZarrSchema,
+    convert,
+    encode,
+    encode_finalise,
+    encode_init,
+    encode_partition,
+    inspect,
+    mkschema,
+)
+from .verification import verify
+# NOTE some of these aren't intended to be part of the external
+# interface (like IntermediateColumnarFormat), but putting
+# them into the list to keep the lint nagging under control
+__all__ = [
+    "IntermediateColumnarFormat",
+    "explode",
+    "explode_finalise",
+    "explode_init",
+    "explode_partition",
+    "VcfZarrSchema",
+    "convert",
+    "encode",
+    "encode_finalise",
+    "encode_init",
+    "encode_partition",
+    "inspect",
+    "mkschema",
+    "verify",
+]

bio2zarr 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl

Potentially problematic release.

bio2zarr 0.0.9__py3-none-any.whl → 0.1.0__py3-none-any.whl