PyPI - dcnum - Versions diffs - 0.16.2__tar.gz → 0.16.3__tar.gz - Mend

dcnum-0.16.2.tar.gz → dcnum-0.16.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dcnum might be problematic. Click here for more details.

Files changed (91) hide show

{dcnum-0.16.2 → dcnum-0.16.3}/CHANGELOG RENAMED Viewed

@@ -1,3 +1,11 @@
+0.16.3
+ - enh: define valid DCNumJobRunner state
+ - enh: more robust computation of progress
+ - enh: use HDF5Data when loading input data for background computation
+ - enh: automatically split segmenters and extractors equally
+ - ref: reduce default image cache size from 5 to 2
+ - ref: move dataset generation default kwargs to writer submodule
+ - ref: warn above 0.5% of discarded events in EventExtractorManagerThread
 0.16.2
  - fix: ignore empty HDF5 datasets when copying metadata
  - fix: logging from subprocesses did not work as expected

{dcnum-0.16.2/src/dcnum.egg-info → dcnum-0.16.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dcnum
-Version: 0.16.2
+Version: 0.16.3
 Summary: numerics toolbox for imaging deformability cytometry
 Author: Paul Müller
 Maintainer-email: Paul Müller <dev@craban.de>

{dcnum-0.16.2 → dcnum-0.16.3}/src/dcnum/_version.py RENAMED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.16.2'
-__version_tuple__ = version_tuple = (0, 16, 2)
+__version__ = version = '0.16.3'
+__version_tuple__ = version_tuple = (0, 16, 3)

{dcnum-0.16.2 → dcnum-0.16.3}/src/dcnum/feat/event_extractor_manager_thread.py RENAMED Viewed

@@ -81,9 +81,9 @@ class EventExtractorManagerThread(threading.Thread):
                    for _ in range(self.num_workers)]
         [w.start() for w in workers]
+        num_slots = len(self.slot_states)
         chunks_processed = 0
         while True:
-            num_slots = len(self.slot_states)
             cur_slot = 0
             unavailable_slots = 0
             # Check all slots for segmented labels
@@ -93,8 +93,10 @@ class EventExtractorManagerThread(threading.Thread):
                 # - "s" the extractor processed the data and is waiting
                 #   for the segmenter
                 if self.slot_states[cur_slot] == "e":
+                    # The segmenter has something for us in this slot.
                     break
                 else:
+                    # Try another slot.
                     unavailable_slots += 1
                     cur_slot = (cur_slot + 1) % num_slots
                 if unavailable_slots >= num_slots:
@@ -152,7 +154,7 @@ class EventExtractorManagerThread(threading.Thread):
         if inv_masks:
             self.logger.info(f"Encountered {inv_masks} invalid masks.")
             inv_frac = inv_masks / len(self.data)
-            if inv_frac > 0.0:
+            if inv_frac > 0.005:  # warn above one half percent
                 self.logger.warning(f"Discarded {inv_frac:.1%} of the masks. "
                                     f"Please check segmenter applicability.")

{dcnum-0.16.2 → dcnum-0.16.3}/src/dcnum/feat/feat_background/base.py RENAMED Viewed

@@ -7,12 +7,11 @@ import uuid
 import warnings
 import h5py
-import hdf5plugin
 import numpy as np
 from ...meta import ppid
 from ...read import HDF5Data
-from ...write import create_with_basins
+from ...write import create_with_basins, set_default_filter_kwargs
 # All subprocesses should use 'spawn' to avoid issues with threads
@@ -112,10 +111,7 @@ class Background(abc.ABC):
             self.h5out = h5py.File(output_path, "a", libver="latest")
         # Initialize background data
-        if compress:
-            compression_kwargs = hdf5plugin.Zstd(clevel=5)
-        else:
-            compression_kwargs = {}
+        ds_kwargs = set_default_filter_kwargs(compression=compress)
         h5bg = self.h5out.require_dataset(
             "events/image_bg",
             shape=self.input_data.shape,
@@ -123,8 +119,7 @@ class Background(abc.ABC):
             chunks=(min(100, self.image_count),
                     self.image_shape[0],
                     self.image_shape[1]),
-            fletcher32=True,
-            **compression_kwargs,
+            **ds_kwargs,
         )
         h5bg.attrs.create('CLASS', np.string_('IMAGE'))
         h5bg.attrs.create('IMAGE_VERSION', np.string_('1.2'))

{dcnum-0.16.2 → dcnum-0.16.3}/src/dcnum/feat/feat_background/bg_sparse_median.py RENAMED Viewed

@@ -5,6 +5,8 @@ import time
 import numpy as np
 from scipy import ndimage
+from ...read import HDF5Data
 from .base import mp_spawn, Background
 logger = logging.getLogger(__name__)
@@ -90,15 +92,16 @@ class BackgroundSparseMed(Background):
         # time axis
         self.time = None
         if self.h5in is not None:
-            if "time" in self.h5in["events"]:
+            hd = HDF5Data(self.h5in)
+            if "time" in hd:
                 # use actual time from dataset
-                self.time = self.h5in["/events/time"][:]
+                self.time = hd["time"][:]
                 self.time -= self.time[0]
-            elif "imaging:frame rate" in self.h5in.attrs:
-                fr = self.h5in.attrs["imaging:frame rate"]
-                if "frame" in self.h5in["/events"]:
+            elif "imaging:frame rate" in hd.meta:
+                fr = hd.meta["imaging:frame rate"]
+                if "frame" in hd:
                     # compute time from frame rate and frame numbers
-                    self.time = self.h5in["/events/frame"] / fr
+                    self.time = hd["frame"] / fr
                     self.time -= self.time[0]
                 else:
                     # compute time using frame rate (approximate)

{dcnum-0.16.2 → dcnum-0.16.3}/src/dcnum/feat/queue_event_extractor.py RENAMED Viewed

@@ -315,7 +315,7 @@ class QueueEventExtractor:
         queue_handler = QueueHandler(self.log_queue)
         queue_handler.setLevel(self.log_level)
         self.logger.addHandler(queue_handler)
-        self.logger.info("Running")
+        self.logger.info("Ready")
         mp_array = np.ctypeslib.as_array(
             self.label_array).reshape(self.data.image.chunk_shape)

{dcnum-0.16.2 → dcnum-0.16.3}/src/dcnum/logic/ctrl.py RENAMED Viewed

@@ -13,7 +13,6 @@ import time
 import traceback
 import uuid
-import hdf5plugin
 import h5py
 from ..feat.feat_background.base import get_available_background_methods
@@ -26,7 +25,7 @@ from ..read import HDF5Data
 from .._version import version_tuple
 from ..write import (
     DequeWriterThread, HDF5Writer, QueueCollectorThread,
-    copy_metadata, create_with_basins,
+    copy_metadata, create_with_basins, set_default_filter_kwargs
 )
 from .job import DCNumPipelineJob
@@ -36,6 +35,19 @@ from .json_encoder import ExtendedJSONEncoder
 # queues and threads and would end up with race conditions otherwise.
 mp_spawn = mp.get_context("spawn")
+#: valid states for a job runner. The states must be in logical order,
+#: not in alphabetical order.
+valid_states = [
+    "created",
+    "init",
+    "setup",
+    "background",
+    "segmentation",
+    "cleanup",
+    "done",
+    "error",
+]
 class DCNumJobRunner(threading.Thread):
     def __init__(self,
@@ -178,6 +190,16 @@ class DCNumJobRunner(threading.Thread):
         po = pathlib.Path(self.job["path_out"])
         return po.with_name(po.stem + f"_output_{self.tmp_suffix}.rtdc~")
+    @property
+    def state(self):
+        return self._state
+    @state.setter
+    def state(self, state):
+        if state not in valid_states:
+            raise ValueError(f"Invalid state '{state}' specified!")
+        self._state = state
     def close(self, delete_temporary_files=True):
         if self._data_raw is not None:
             self._data_raw.close()
@@ -209,17 +231,32 @@ class DCNumJobRunner(threading.Thread):
         self.close(delete_temporary_files=delete_temporary_files)
     def get_status(self):
-        bgpart = .1  # fraction of background
-        expart = 0.85  # fraction of segmentation and feature extraction
-        clpart = 0.05  # fraction of cleanup
+        # Compute the total progress. The following weights indicate
+        # how much fractional time each processing step takes.
+        bgw = 4  # fraction of background
+        exw = 27  # fraction of segmentation and feature extraction
+        clw = 1  # fraction of cleanup operations
+        tot = bgw + exw + clw
         progress = 0
-        if self._progress_bg is not None:
+        st = self.state
+        # background
+        if valid_states.index(st) > valid_states.index("background"):
+            # background already computed
+            progress += bgw / tot
+        elif self._progress_bg is not None:
             # This is the image count of the input dataset
-            progress += bgpart * (self._progress_bg.value / len(self.draw))
-        if self._progress_ex is not None:
-            progress += expart * self._progress_ex
-        if self._state == "done":
-            progress += clpart
+            progress += bgw / tot * (self._progress_bg.value / len(self.draw))
+        # segmentation
+        if valid_states.index(st) > valid_states.index("segmentation"):
+            # segmentation already done
+            progress += exw / tot
+        elif self._progress_ex is not None:
+            progress += exw / tot * self._progress_ex
+        if self.state == "done":
+            progress = 1
         return {
             "progress": progress,
@@ -231,7 +268,7 @@ class DCNumJobRunner(threading.Thread):
         try:
             self.run_pipeline()
         except BaseException:
-            self._state = "error"
+            self.state = "error"
             self.error_tb = traceback.format_exc()
             if not self.is_alive():
                 # Thread has not been started. This means we are not running
@@ -248,7 +285,7 @@ class DCNumJobRunner(threading.Thread):
                 f"Output file {self.job['path_out']} already exists!")
         # Make sure the output directory exists.
         self.job["path_out"].parent.mkdir(parents=True, exist_ok=True)
-        self._state = "setup"
+        self.state = "setup"
         # First get a list of all pipeline IDs. If the input file has
         # already been processed by dcnum, then we do not have to redo
         # everything.
@@ -290,7 +327,7 @@ class DCNumJobRunner(threading.Thread):
             or (datdict["feat_id"] != self.ppdict["feat_id"])
             or (datdict["gate_id"] != self.ppdict["gate_id"]))
-        self._state = "background"
+        self.state = "background"
         if redo_bg:
             # The 'image_bg' feature is written to `self.path_temp_in`.
@@ -299,7 +336,7 @@ class DCNumJobRunner(threading.Thread):
             # (note that `self.path_temp_in` is basin-based).
             self.task_background()
-        self._state = "segmentation"
+        self.state = "segmentation"
         # We have the input data covered, and we have to run the
         # long-lasting segmentation and feature extraction step.
@@ -323,7 +360,7 @@ class DCNumJobRunner(threading.Thread):
             # reflected in `self.path_temp_out`.
             self.path_temp_in.rename(self.path_temp_out)
-        self._state = "cleanup"
+        self.state = "cleanup"
         # The user would normally expect the output file to be something
         # that is self-contained (copying the file wildly across file
@@ -410,7 +447,7 @@ class DCNumJobRunner(threading.Thread):
         # Rename the output file
         self.path_temp_out.rename(self.job["path_out"])
-        self._state = "done"
+        self.state = "done"
     def task_background(self):
         """Perform background computation task
@@ -442,8 +479,7 @@ class DCNumJobRunner(threading.Thread):
         self.logger.info("Starting segmentation and feature extraction")
         # Start writer thread
         writer_dq = collections.deque()
-        ds_kwds = dict(hdf5plugin.Zstd(clevel=5))
-        ds_kwds["fletcher32"] = True
+        ds_kwds = set_default_filter_kwargs()
         thr_write = DequeWriterThread(
             path_out=self.path_temp_out,
             dq=writer_dq,
@@ -462,13 +498,24 @@ class DCNumJobRunner(threading.Thread):
         if self.job["debug"]:
             num_slots = 1
             num_extractors = 1
+            num_segmenters = 1
         elif seg_cls.hardware_processor == "cpu":  # CPU segmenter
+            # We could in principle set the number of slots to one and
+            # have both number of extractors and number of segmenters set
+            # to the total number of CPUs. However, we would need more RAM
+            # (for caching the image data) and we also have more overhead.
+            # Having two slots shared between all workers is more efficient.
             num_slots = 2
+            # Split segmentation and feature extraction workers evenly.
             num_extractors = self.job["num_procs"] // 2
+            num_segmenters = self.job["num_procs"] - num_extractors
         else:  # GPU segmenter
             num_slots = 3
             num_extractors = self.job["num_procs"]
+            num_segmenters = 1
         num_extractors = max(1, num_extractors)
+        num_segmenters = max(1, num_segmenters)
+        self.job["segmenter_kwargs"]["num_workers"] = num_segmenters
         slot_chunks = mp_spawn.Array("i", num_slots)
         slot_states = mp_spawn.Array("u", num_slots)

{dcnum-0.16.2 → dcnum-0.16.3}/src/dcnum/read/cache.py RENAMED Viewed

@@ -11,7 +11,7 @@ class HDF5ImageCache:
     def __init__(self,
                  h5ds: h5py.Dataset,
                  chunk_size: int = 1000,
-                 cache_size: int = 5,
+                 cache_size: int = 2,
                  boolean: bool = False):
         """An HDF5 image cache

{dcnum-0.16.2 → dcnum-0.16.3}/src/dcnum/read/hdf5_data.py RENAMED Viewed

@@ -25,7 +25,7 @@ class HDF5Data:
                  basins: List[Dict[List[str] | str]] = None,
                  logs: Dict[List[str]] = None,
                  tables: Dict[np.ndarray] = None,
-                 image_cache_size: int = 5,
+                 image_cache_size: int = 2,
                  ):
         # Init is in __setstate__ so we can pickle this class
         # and use it for multiprocessing.

{dcnum-0.16.2 → dcnum-0.16.3}/src/dcnum/segm/segmenter_gpu.py RENAMED Viewed

@@ -15,6 +15,7 @@ class GPUSegmenter(Segmenter, abc.ABC):
     def __init__(self,
                  *,
+                 num_workers: int = None,
                  kwargs_mask: Dict = None,
                  debug: bool = False,
                  **kwargs
@@ -31,6 +32,9 @@ class GPUSegmenter(Segmenter, abc.ABC):
             Additional, optional keyword arguments for `segment_approach`
             defined in the subclass.
         """
+        if num_workers not in [None, 1]:
+            raise ValueError(f"Number of workers must not be larger than 1 "
+                             f"for GPU segmenter, got '{num_workers}'!")
         super(GPUSegmenter, self).__init__(kwargs_mask=kwargs_mask,
                                            debug=debug,
                                            **kwargs)

{dcnum-0.16.2 → dcnum-0.16.3}/src/dcnum/segm/segmenter_manager_thread.py RENAMED Viewed

@@ -77,9 +77,9 @@ class SegmenterManagerThread(threading.Thread):
         self.debug = debug
     def run(self):
+        num_slots = len(self.slot_states)
         # We iterate over all the chunks of the image data.
         for chunk in self.image_data.iter_chunks():
-            num_slots = len(self.slot_states)
             cur_slot = 0
             empty_slots = 0
             # Wait for a free slot to perform segmentation (compute labels)
@@ -89,8 +89,11 @@ class SegmenterManagerThread(threading.Thread):
                 # - "s" the extractor processed the data and is waiting
                 #   for the segmenter
                 if self.slot_states[cur_slot] != "e":
+                    # It's the segmenter's turn. Note that we use '!= "e"',
+                    # because the initial value is "\x00".
                     break
                 else:
+                    # Try another slot.
                     empty_slots += 1
                     cur_slot = (cur_slot + 1) % num_slots
                 if empty_slots >= num_slots:

{dcnum-0.16.2 → dcnum-0.16.3}/src/dcnum/write/__init__.py RENAMED Viewed

@@ -1,4 +1,5 @@
 # flake8: noqa: F401
 from .deque_writer_thread import DequeWriterThread
 from .queue_collector_thread import EventStash, QueueCollectorThread
-from .writer import HDF5Writer, copy_metadata, create_with_basins
+from .writer import (
+    HDF5Writer, copy_metadata, create_with_basins, set_default_filter_kwargs)

{dcnum-0.16.2 → dcnum-0.16.3}/src/dcnum/write/writer.py RENAMED Viewed

@@ -21,11 +21,7 @@ class HDF5Writer:
         """Write deformability cytometry HDF5 data"""
         self.h5 = h5py.File(path, mode=mode, libver="latest")
         self.events = self.h5.require_group("events")
-        if ds_kwds is None:
-            ds_kwds = {}
-        for key, val in dict(hdf5plugin.Zstd(clevel=5)).items():
-            ds_kwds.setdefault(key, val)
-        ds_kwds.setdefault("fletcher32", True)
+        ds_kwds = set_default_filter_kwargs(ds_kwds)
         self.ds_kwds = ds_kwds
     def __enter__(self):
@@ -249,10 +245,7 @@ def copy_metadata(h5_src: h5py.File,
     are not defined already are added.
     """
     # compress data
-    ds_kwds = {}
-    for key, val in dict(hdf5plugin.Zstd(clevel=5)).items():
-        ds_kwds.setdefault(key, val)
-    ds_kwds.setdefault("fletcher32", True)
+    ds_kwds = set_default_filter_kwargs()
     # set attributes
     src_attrs = dict(h5_src.attrs)
     for kk in src_attrs:
@@ -283,3 +276,15 @@ def copy_metadata(h5_src: h5py.File,
                                       f"dcnum {version}"]
                         soft_strgs = [s for s in soft_strgs if s is not None]
                         ds.attrs["software"] = " | ".join(soft_strgs)
+def set_default_filter_kwargs(ds_kwds=None, compression=True):
+    if ds_kwds is None:
+        ds_kwds = {}
+    if compression:
+        # compression
+        for key, val in dict(hdf5plugin.Zstd(clevel=5)).items():
+            ds_kwds.setdefault(key, val)
+    # checksums
+    ds_kwds.setdefault("fletcher32", True)
+    return ds_kwds

{dcnum-0.16.2 → dcnum-0.16.3/src/dcnum.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dcnum
-Version: 0.16.2
+Version: 0.16.3
 Summary: numerics toolbox for imaging deformability cytometry
 Author: Paul Müller
 Maintainer-email: Paul Müller <dev@craban.de>