dcnum 0.16.1__tar.gz → 0.16.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dcnum might be problematic. Click here for more details.

Files changed (91) hide show
  1. {dcnum-0.16.1 → dcnum-0.16.3}/CHANGELOG +20 -0
  2. {dcnum-0.16.1/src/dcnum.egg-info → dcnum-0.16.3}/PKG-INFO +1 -1
  3. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/_version.py +2 -2
  4. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/event_extractor_manager_thread.py +21 -5
  5. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_background/base.py +22 -13
  6. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_background/bg_roll_median.py +8 -15
  7. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_background/bg_sparse_median.py +27 -27
  8. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/queue_event_extractor.py +41 -31
  9. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/logic/ctrl.py +171 -49
  10. dcnum-0.16.3/src/dcnum/logic/json_encoder.py +17 -0
  11. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/read/cache.py +1 -1
  12. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/read/hdf5_data.py +1 -1
  13. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/segm/segmenter_gpu.py +4 -0
  14. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/segm/segmenter_manager_thread.py +4 -1
  15. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/write/__init__.py +2 -1
  16. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/write/queue_collector_thread.py +4 -1
  17. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/write/writer.py +46 -40
  18. {dcnum-0.16.1 → dcnum-0.16.3/src/dcnum.egg-info}/PKG-INFO +1 -1
  19. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum.egg-info/SOURCES.txt +1 -0
  20. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_feat_background_base.py +2 -0
  21. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_feat_background_bg_roll_median.py +2 -0
  22. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_feat_background_bg_sparsemed.py +2 -0
  23. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_logic_pipeline.py +35 -14
  24. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_write_writer.py +16 -0
  25. {dcnum-0.16.1 → dcnum-0.16.3}/.github/workflows/check.yml +0 -0
  26. {dcnum-0.16.1 → dcnum-0.16.3}/.github/workflows/deploy_pypi.yml +0 -0
  27. {dcnum-0.16.1 → dcnum-0.16.3}/.gitignore +0 -0
  28. {dcnum-0.16.1 → dcnum-0.16.3}/.readthedocs.yml +0 -0
  29. {dcnum-0.16.1 → dcnum-0.16.3}/LICENSE +0 -0
  30. {dcnum-0.16.1 → dcnum-0.16.3}/README.rst +0 -0
  31. {dcnum-0.16.1 → dcnum-0.16.3}/docs/conf.py +0 -0
  32. {dcnum-0.16.1 → dcnum-0.16.3}/docs/extensions/github_changelog.py +0 -0
  33. {dcnum-0.16.1 → dcnum-0.16.3}/docs/index.rst +0 -0
  34. {dcnum-0.16.1 → dcnum-0.16.3}/docs/requirements.txt +0 -0
  35. {dcnum-0.16.1 → dcnum-0.16.3}/pyproject.toml +0 -0
  36. {dcnum-0.16.1 → dcnum-0.16.3}/setup.cfg +0 -0
  37. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/__init__.py +0 -0
  38. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/__init__.py +0 -0
  39. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_background/__init__.py +0 -0
  40. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_background/bg_copy.py +0 -0
  41. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_brightness/__init__.py +0 -0
  42. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_brightness/bright_all.py +0 -0
  43. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_brightness/common.py +0 -0
  44. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_moments/__init__.py +0 -0
  45. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_moments/ct_opencv.py +0 -0
  46. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_moments/mt_legacy.py +0 -0
  47. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_texture/__init__.py +0 -0
  48. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_texture/common.py +0 -0
  49. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/feat_texture/tex_all.py +0 -0
  50. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/feat/gate.py +0 -0
  51. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/logic/__init__.py +0 -0
  52. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/logic/job.py +0 -0
  53. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/meta/__init__.py +0 -0
  54. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/meta/ppid.py +0 -0
  55. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/read/__init__.py +0 -0
  56. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/read/const.py +0 -0
  57. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/segm/__init__.py +0 -0
  58. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/segm/segm_thresh.py +0 -0
  59. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/segm/segmenter.py +0 -0
  60. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/segm/segmenter_cpu.py +0 -0
  61. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum/write/deque_writer_thread.py +0 -0
  62. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum.egg-info/dependency_links.txt +0 -0
  63. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum.egg-info/requires.txt +0 -0
  64. {dcnum-0.16.1 → dcnum-0.16.3}/src/dcnum.egg-info/top_level.txt +0 -0
  65. {dcnum-0.16.1 → dcnum-0.16.3}/tests/conftest.py +0 -0
  66. {dcnum-0.16.1 → dcnum-0.16.3}/tests/data/fmt-hdf5_cytoshot_extended-moments-features.zip +0 -0
  67. {dcnum-0.16.1 → dcnum-0.16.3}/tests/data/fmt-hdf5_cytoshot_full-features_2023.zip +0 -0
  68. {dcnum-0.16.1 → dcnum-0.16.3}/tests/data/fmt-hdf5_cytoshot_full-features_legacy_allev_2023.zip +0 -0
  69. {dcnum-0.16.1 → dcnum-0.16.3}/tests/data/fmt-hdf5_shapein_raw-with-variable-length-logs.zip +0 -0
  70. {dcnum-0.16.1 → dcnum-0.16.3}/tests/helper_methods.py +0 -0
  71. {dcnum-0.16.1 → dcnum-0.16.3}/tests/requirements.txt +0 -0
  72. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_feat_brightness.py +0 -0
  73. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_feat_haralick.py +0 -0
  74. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_feat_moments_based.py +0 -0
  75. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_feat_moments_based_extended.py +0 -0
  76. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_init.py +0 -0
  77. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_logic_job.py +0 -0
  78. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_logic_join.py +0 -0
  79. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_ppid.py +0 -0
  80. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_ppid_bg.py +0 -0
  81. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_ppid_data.py +0 -0
  82. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_ppid_feat.py +0 -0
  83. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_ppid_gate.py +0 -0
  84. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_ppid_segm.py +0 -0
  85. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_read_basin.py +0 -0
  86. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_read_concat_hdf5.py +0 -0
  87. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_read_hdf5.py +0 -0
  88. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_segm_thresh.py +0 -0
  89. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_segmenter.py +0 -0
  90. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_write_deque_writer_thread.py +0 -0
  91. {dcnum-0.16.1 → dcnum-0.16.3}/tests/test_write_queue_collector_thread.py +0 -0
@@ -1,3 +1,23 @@
1
+ 0.16.3
2
+ - enh: define valid DCNumJobRunner state
3
+ - enh: more robust computation of progress
4
+ - enh: use HDF5Data when loading input data for background computation
5
+ - enh: automatically split segmenters and extractors equally
6
+ - ref: reduce default image cache size from 5 to 2
7
+ - ref: move dataset generation default kwargs to writer submodule
8
+ - ref: warn above 0.5% of discarded events in EventExtractorManagerThread
9
+ 0.16.2
10
+ - fix: ignore empty HDF5 datasets when copying metadata
11
+ - fix: logging from subprocesses did not work as expected
12
+ - enh: warn user about total number of invalid masks
13
+ - enh: introduce DCNumJobRunner.error_tb for errors happening in threads
14
+ - enh: improve logging verbosity
15
+ - enh: append job information as log entry in DCNumJobRunner output file
16
+ - enh: set chunk size for all feature data to 1MiB in HDF5Writer
17
+ - ref: removed close_queues argument from EventExtractor init
18
+ - ref: rename event_count with image_count in background computation
19
+ - ref: do not print anything to stdout when computing background data
20
+ - ref: use data from background computer in DCNumJobRunner.get_status
1
21
  0.16.1
2
22
  - fix: when checking for ppid kwargs, allow kwargs defined in `__init__`
3
23
  - ref: use kwonly arguments for segmenter `__init__` method
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dcnum
3
- Version: 0.16.1
3
+ Version: 0.16.3
4
4
  Summary: numerics toolbox for imaging deformability cytometry
5
5
  Author: Paul Müller
6
6
  Maintainer-email: Paul Müller <dev@craban.de>
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.16.1'
16
- __version_tuple__ = version_tuple = (0, 16, 1)
15
+ __version__ = version = '0.16.3'
16
+ __version_tuple__ = version_tuple = (0, 16, 3)
@@ -46,8 +46,6 @@ class EventExtractorManagerThread(threading.Thread):
46
46
  """
47
47
  super(EventExtractorManagerThread, self).__init__(
48
48
  name="EventExtractorManager", *args, **kwargs)
49
- if debug:
50
- fe_kwargs["close_queues"] = False
51
49
  self.logger = logging.getLogger(
52
50
  "dcnum.feat.EventExtractorManagerThread")
53
51
  #: Keyword arguments for class:`.EventExtractor`
@@ -83,9 +81,9 @@ class EventExtractorManagerThread(threading.Thread):
83
81
  for _ in range(self.num_workers)]
84
82
  [w.start() for w in workers]
85
83
 
84
+ num_slots = len(self.slot_states)
86
85
  chunks_processed = 0
87
86
  while True:
88
- num_slots = len(self.slot_states)
89
87
  cur_slot = 0
90
88
  unavailable_slots = 0
91
89
  # Check all slots for segmented labels
@@ -95,8 +93,10 @@ class EventExtractorManagerThread(threading.Thread):
95
93
  # - "s" the extractor processed the data and is waiting
96
94
  # for the segmenter
97
95
  if self.slot_states[cur_slot] == "e":
96
+ # The segmenter has something for us in this slot.
98
97
  break
99
98
  else:
99
+ # Try another slot.
100
100
  unavailable_slots += 1
101
101
  cur_slot = (cur_slot + 1) % num_slots
102
102
  if unavailable_slots >= num_slots:
@@ -136,15 +136,31 @@ class EventExtractorManagerThread(threading.Thread):
136
136
  if chunks_processed == self.data.image.num_chunks:
137
137
  break
138
138
 
139
- self.logger.debug("Waiting for event_queue to empty.")
140
139
  # Wait until the event queue is empty.
140
+ self.logger.debug("Waiting for event_queue to empty.")
141
141
  event_queue = self.fe_kwargs["event_queue"]
142
142
  while not event_queue.empty():
143
143
  # The collector thread is still sorting things out. Wait
144
144
  # before joining the threads.
145
- time.sleep(.1)
145
+ time.sleep(.05)
146
+
147
+ # Wait until log queue is empty
148
+ self.logger.debug("Waiting for log_queue to empty.")
149
+ log_queue = self.fe_kwargs["log_queue"]
150
+ while not log_queue.empty():
151
+ time.sleep(.05)
152
+
153
+ inv_masks = self.fe_kwargs["invalid_mask_counter"].value
154
+ if inv_masks:
155
+ self.logger.info(f"Encountered {inv_masks} invalid masks.")
156
+ inv_frac = inv_masks / len(self.data)
157
+ if inv_frac > 0.005: # warn above one half percent
158
+ self.logger.warning(f"Discarded {inv_frac:.1%} of the masks. "
159
+ f"Please check segmenter applicability.")
160
+
146
161
  self.logger.debug("Requesting extraction workers to join.")
147
162
  self.fe_kwargs["finalize_extraction"].value = True
148
163
  [w.join() for w in workers]
164
+
149
165
  self.logger.debug("Finished extraction.")
150
166
  self.logger.info(f"Extraction time: {self.t_count:.1f}s")
@@ -7,12 +7,16 @@ import uuid
7
7
  import warnings
8
8
 
9
9
  import h5py
10
- import hdf5plugin
11
10
  import numpy as np
12
11
 
13
12
  from ...meta import ppid
14
13
  from ...read import HDF5Data
15
- from ...write import create_with_basins
14
+ from ...write import create_with_basins, set_default_filter_kwargs
15
+
16
+
17
+ # All subprocesses should use 'spawn' to avoid issues with threads
18
+ # and 'fork' on POSIX systems.
19
+ mp_spawn = mp.get_context('spawn')
16
20
 
17
21
 
18
22
  class Background(abc.ABC):
@@ -55,12 +59,14 @@ class Background(abc.ABC):
55
59
  self.kwargs.update(kwargs)
56
60
 
57
61
  if num_cpus is None:
58
- num_cpus = mp.cpu_count()
62
+ num_cpus = mp_spawn.cpu_count()
59
63
  #: number of CPUs used
60
64
  self.num_cpus = num_cpus
61
65
 
62
- #: number of frames
63
- self.event_count = None
66
+ #: number of images in the input data
67
+ self.image_count = None
68
+ #: number of images that have been processed
69
+ self.image_proc = mp_spawn.Value("L", 0)
64
70
 
65
71
  #: HDF5Data instance for input data
66
72
  self.hdin = None
@@ -93,7 +99,7 @@ class Background(abc.ABC):
93
99
  #: shape of event images
94
100
  self.image_shape = self.input_data[0].shape
95
101
  #: total number of events
96
- self.event_count = len(self.input_data)
102
+ self.image_count = len(self.input_data)
97
103
 
98
104
  if self.h5out is None:
99
105
  if not output_path.exists():
@@ -105,19 +111,15 @@ class Background(abc.ABC):
105
111
  self.h5out = h5py.File(output_path, "a", libver="latest")
106
112
 
107
113
  # Initialize background data
108
- if compress:
109
- compression_kwargs = hdf5plugin.Zstd(clevel=5)
110
- else:
111
- compression_kwargs = {}
114
+ ds_kwargs = set_default_filter_kwargs(compression=compress)
112
115
  h5bg = self.h5out.require_dataset(
113
116
  "events/image_bg",
114
117
  shape=self.input_data.shape,
115
118
  dtype=np.uint8,
116
- chunks=(min(100, self.event_count),
119
+ chunks=(min(100, self.image_count),
117
120
  self.image_shape[0],
118
121
  self.image_shape[1]),
119
- fletcher32=True,
120
- **compression_kwargs,
122
+ **ds_kwargs,
121
123
  )
122
124
  h5bg.attrs.create('CLASS', np.string_('IMAGE'))
123
125
  h5bg.attrs.create('IMAGE_VERSION', np.string_('1.2'))
@@ -191,6 +193,13 @@ class Background(abc.ABC):
191
193
  ppid=pp_check_user_kwargs)
192
194
  return kwargs
193
195
 
196
+ def get_progress(self):
197
+ """Return progress of background computation, float in [0,1]"""
198
+ if self.image_count == 0:
199
+ return 0.
200
+ else:
201
+ return self.image_proc.value / self.image_count
202
+
194
203
  def process(self):
195
204
  self.process_approach()
196
205
  bg_ppid = self.get_ppid()
@@ -1,16 +1,10 @@
1
- import multiprocessing as mp
2
1
  import queue
3
2
  import time
4
3
 
5
4
  import numpy as np
6
5
  from scipy import ndimage
7
6
 
8
- from .base import Background
9
-
10
-
11
- # All subprocesses should use 'spawn' to avoid issues with threads
12
- # and 'fork' on POSIX systems.
13
- mp_spawn = mp.get_context('spawn')
7
+ from .base import mp_spawn, Background
14
8
 
15
9
 
16
10
  class BackgroundRollMed(Background):
@@ -152,9 +146,9 @@ class BackgroundRollMed(Background):
152
146
  stop_in = (batch_index + 1) * self.batch_size + self.kernel_size
153
147
  stop_out = (batch_index + 1) * self.batch_size
154
148
 
155
- if stop_in > self.event_count:
156
- stop_in = self.event_count
157
- stop_out = self.event_count - self.kernel_size
149
+ if stop_in > self.image_count:
150
+ stop_in = self.image_count
151
+ stop_out = self.image_count - self.kernel_size
158
152
 
159
153
  slice_in = slice(start, stop_in)
160
154
  slice_out = slice(start, stop_out)
@@ -175,16 +169,14 @@ class BackgroundRollMed(Background):
175
169
 
176
170
  def process_approach(self):
177
171
  """Perform median computation on entire input data"""
178
- num_steps = int(np.ceil(self.event_count / self.batch_size))
172
+ num_steps = int(np.ceil(self.image_count / self.batch_size))
179
173
  for ii in range(num_steps):
180
- print(f"Computing background {ii/num_steps*100:.0f}%",
181
- end="\r", flush=True)
182
174
  self.process_next_batch()
183
175
  # Set the remaining kernel_size median values to the last one
184
176
  last_image = self.h5out["events/image_bg"][-self.kernel_size-1]
185
177
  for ii in range(self.kernel_size):
186
- self.h5out["events/image_bg"][self.event_count-ii-1] = last_image
187
- print("Computing background 100% ", flush=True)
178
+ self.h5out["events/image_bg"][self.image_count-ii-1] = last_image
179
+ self.image_proc.value = self.image_count
188
180
 
189
181
  def process_next_batch(self):
190
182
  """Process one batch of input data"""
@@ -221,6 +213,7 @@ class BackgroundRollMed(Background):
221
213
  *self.image_shape)
222
214
 
223
215
  self.current_batch += 1
216
+ self.image_proc.value += self.batch_size
224
217
 
225
218
 
226
219
  class MedianWorker(mp_spawn.Process):
@@ -1,19 +1,15 @@
1
1
  import logging
2
- import multiprocessing as mp
3
2
  import queue
4
3
  import time
5
4
 
6
5
  import numpy as np
7
6
  from scipy import ndimage
8
7
 
9
- from .base import Background
10
-
11
- logger = logging.getLogger(__name__)
8
+ from ...read import HDF5Data
12
9
 
10
+ from .base import mp_spawn, Background
13
11
 
14
- # All subprocesses should use 'spawn' to avoid issues with threads
15
- # and 'fork' on POSIX systems.
16
- mp_spawn = mp.get_context('spawn')
12
+ logger = logging.getLogger(__name__)
17
13
 
18
14
 
19
15
  class BackgroundSparseMed(Background):
@@ -96,27 +92,28 @@ class BackgroundSparseMed(Background):
96
92
  # time axis
97
93
  self.time = None
98
94
  if self.h5in is not None:
99
- if "time" in self.h5in["events"]:
95
+ hd = HDF5Data(self.h5in)
96
+ if "time" in hd:
100
97
  # use actual time from dataset
101
- self.time = self.h5in["/events/time"][:]
98
+ self.time = hd["time"][:]
102
99
  self.time -= self.time[0]
103
- elif "imaging:frame rate" in self.h5in.attrs:
104
- fr = self.h5in.attrs["imaging:frame rate"]
105
- if "frame" in self.h5in["/events"]:
100
+ elif "imaging:frame rate" in hd.meta:
101
+ fr = hd.meta["imaging:frame rate"]
102
+ if "frame" in hd:
106
103
  # compute time from frame rate and frame numbers
107
- self.time = self.h5in["/events/frame"] / fr
104
+ self.time = hd["frame"] / fr
108
105
  self.time -= self.time[0]
109
106
  else:
110
107
  # compute time using frame rate (approximate)
111
- dur = self.event_count / fr * 1.5
108
+ dur = self.image_count / fr * 1.5
112
109
  logger.info(f"Approximating duration: {dur/60:.1f}min")
113
- self.time = np.linspace(0, dur, self.event_count,
110
+ self.time = np.linspace(0, dur, self.image_count,
114
111
  endpoint=True)
115
112
  if self.time is None:
116
113
  # No HDF5 file or no information therein; Make an educated guess.
117
- dur = self.event_count / 3600 * 1.5
114
+ dur = self.image_count / 3600 * 1.5
118
115
  logger.info(f"Guessing duration: {dur/60:.1f}min")
119
- self.time = np.linspace(0, dur, self.event_count,
116
+ self.time = np.linspace(0, dur, self.image_count,
120
117
  endpoint=True)
121
118
 
122
119
  #: duration of the measurement
@@ -212,10 +209,7 @@ class BackgroundSparseMed(Background):
212
209
 
213
210
  # Compute initial background images (populates self.bg_images)
214
211
  for ii, ti in enumerate(self.step_times):
215
- print(f"Computing background {ii / self.step_times.size:.0%}",
216
- end="\r", flush=True)
217
212
  self.process_second(ii, ti)
218
- print("Computing background 100% ", flush=True)
219
213
 
220
214
  if self.frac_cleansing != 1:
221
215
  # The following algorithm finds background images that contain
@@ -277,7 +271,7 @@ class BackgroundSparseMed(Background):
277
271
  f"`thresh_cleansing` or `frac_cleansing`. The new "
278
272
  f"threshold is {thresh_fact / thresh}.")
279
273
 
280
- logger.info(f"Removed {frac_remove:.2%} of the background series")
274
+ logger.info(f"Cleansed {frac_remove:.2%}")
281
275
  step_times = self.step_times[used]
282
276
  bg_images = self.bg_images[used]
283
277
  else:
@@ -286,7 +280,7 @@ class BackgroundSparseMed(Background):
286
280
  bg_images = self.bg_images
287
281
 
288
282
  # Assign each frame to a certain background index
289
- bg_idx = np.zeros(self.event_count, dtype=int)
283
+ bg_idx = np.zeros(self.image_count, dtype=int)
290
284
  idx0 = 0
291
285
  idx1 = None
292
286
  for ii in range(len(step_times)):
@@ -298,21 +292,25 @@ class BackgroundSparseMed(Background):
298
292
  # Fill up remainder of index array with last entry
299
293
  bg_idx[idx1:] = ii
300
294
 
295
+ self.image_proc.value = self.image_count
296
+
301
297
  # Write background data
302
298
  pos = 0
303
299
  step = 1000
304
- while pos < self.event_count:
305
- stop = min(pos + step, self.event_count)
300
+ while pos < self.image_count:
301
+ stop = min(pos + step, self.image_count)
306
302
  cur_slice = slice(pos, stop)
307
303
  self.h5out["events/image_bg"][cur_slice] = \
308
304
  bg_images[bg_idx[cur_slice]]
309
305
  pos += step
310
306
 
311
- def process_second(self, ii, second):
307
+ def process_second(self,
308
+ ii: int,
309
+ second: float | int):
312
310
  idx_start = np.argmin(np.abs(second - self.time))
313
311
  idx_stop = idx_start + self.kernel_size
314
- if idx_stop >= self.event_count:
315
- idx_stop = self.event_count
312
+ if idx_stop >= self.image_count:
313
+ idx_stop = self.image_count
316
314
  idx_start = max(0, idx_stop - self.kernel_size)
317
315
  assert idx_stop - idx_start == self.kernel_size
318
316
 
@@ -347,6 +345,8 @@ class BackgroundSparseMed(Background):
347
345
 
348
346
  self.bg_images[ii] = self.shared_output.reshape(self.image_shape)
349
347
 
348
+ self.image_proc.value = idx_stop
349
+
350
350
 
351
351
  class MedianWorkerSingle(mp_spawn.Process):
352
352
  def __init__(self, job_queue, counter, shared_input, shared_output,
@@ -35,7 +35,8 @@ class QueueEventExtractor:
35
35
  feat_nevents: mp.Array,
36
36
  label_array: mp.Array,
37
37
  finalize_extraction: mp.Value,
38
- close_queues: bool = True,
38
+ invalid_mask_counter: mp.Value,
39
+ log_level: int = logging.INFO,
39
40
  extract_kwargs: dict = None,
40
41
  *args, **kwargs):
41
42
  """Base class for event extraction from label images
@@ -66,9 +67,10 @@ class QueueEventExtractor:
66
67
  finalize_extraction:
67
68
  Shared value indicating whether this worker should stop as
68
69
  soon as the `raw_queue` is empty.
69
- close_queues: bool
70
- Whether to close event and logging queues
71
- (set to False in debug mode)
70
+ invalid_mask_counter:
71
+ Counts masks labeled as invalid by the feature extractor
72
+ log_level:
73
+ Logging level to use
72
74
  extract_kwargs:
73
75
  Keyword arguments for the extraction process. See the
74
76
  keyword-only arguments in
@@ -85,7 +87,13 @@ class QueueEventExtractor:
85
87
  self.event_queue = event_queue
86
88
  #: queue for logging
87
89
  self.log_queue = log_queue
88
- self.close_queues = close_queues
90
+ #: invalid mask counter
91
+ self.invalid_mask_counter = invalid_mask_counter
92
+ # Logging needs to be set up after `start` is called, otherwise
93
+ # it looks like we have the same PID as the parent process. We
94
+ # are setting up logging in `run`.
95
+ self.logger = None
96
+ self.log_level = log_level
89
97
  #: Shared array of length `len(data)` into which the number of
90
98
  #: events per frame is written.
91
99
  self.feat_nevents = feat_nevents
@@ -100,15 +108,12 @@ class QueueEventExtractor:
100
108
  extract_kwargs.setdefault("haralick", True)
101
109
  #: Feature extraction keyword arguments.
102
110
  self.extract_kwargs = extract_kwargs
103
- # Logging needs to be set up after `start` is called, otherwise
104
- # it looks like we have the same PID as the parent process. We
105
- # are setting up logging in `run`.
106
- self.logger = None
107
111
 
108
112
  @staticmethod
109
113
  def get_init_kwargs(data: HDF5Data,
110
114
  gate: Gate,
111
115
  log_queue: mp.Queue,
116
+ log_level: int = logging.INFO,
112
117
  preselect: None = None,
113
118
  ptp_median: None = None):
114
119
  """Get initialization arguments for :cass:`.QueueEventExtractor`
@@ -125,7 +130,9 @@ class QueueEventExtractor:
125
130
  gate: HDF5Data
126
131
  Gating class to use
127
132
  log_queue: mp.Queue
128
- Queue for sending log messages
133
+ Queue the worker uses for sending log messages
134
+ log_level: int
135
+ Logging level to use in the worker process
129
136
  preselect, ptp_median:
130
137
  Deprecated
131
138
 
@@ -146,6 +153,7 @@ class QueueEventExtractor:
146
153
  warnings.warn("The `ptp_median` argument is deprecated!",
147
154
  DeprecationWarning)
148
155
 
156
+ # Note that the order must be identical to __init__
149
157
  args = collections.OrderedDict()
150
158
  args["data"] = data
151
159
  args["gate"] = gate
@@ -159,7 +167,8 @@ class QueueEventExtractor:
159
167
  np.ctypeslib.ctypes.c_int16,
160
168
  int(np.prod(data.image.chunk_shape)))
161
169
  args["finalize_extraction"] = mp_spawn.Value("b", False)
162
- args["close_queues"] = True
170
+ args["invalid_mask_counter"] = mp_spawn.Value("L", 0)
171
+ args["log_level"] = log_level
163
172
  return args
164
173
 
165
174
  def get_events_from_masks(self, masks, data_index, *,
@@ -207,8 +216,7 @@ class QueueEventExtractor:
207
216
  # over from gated_events to valid_events. According to our experience
208
217
  # invalid events happen rarely though.
209
218
  if np.any(invalid):
210
- self.logger.info(f"Discarded {np.sum(invalid)} events due to "
211
- "invalid segmentation.")
219
+ self.invalid_mask_counter.value += np.sum(invalid)
212
220
  for key in gated_events:
213
221
  valid_events[key] = gated_events[key][valid]
214
222
  else:
@@ -294,17 +302,27 @@ class QueueEventExtractor:
294
302
  """Main loop of worker process"""
295
303
  # Don't wait for these two queues when joining workers
296
304
  self.raw_queue.cancel_join_thread()
297
- self.log_queue.cancel_join_thread()
298
305
  #: logger sends all logs to `self.log_queue`
299
306
  self.logger = logging.getLogger(
300
307
  f"dcnum.feat.EventExtractor.{os.getpid()}")
308
+ self.logger.setLevel(self.log_level)
309
+ # Clear any handlers that might be set for this logger. This is
310
+ # important for the case when we are an instance of
311
+ # EventExtractorThread, because then all handlers from the main
312
+ # thread are inherited (as opposed to no handlers in the case
313
+ # of EventExtractorProcess).
314
+ self.logger.handlers.clear()
301
315
  queue_handler = QueueHandler(self.log_queue)
316
+ queue_handler.setLevel(self.log_level)
302
317
  self.logger.addHandler(queue_handler)
303
- self.logger.addFilter(DeduplicatingLoggingFilter())
304
- self.logger.debug(f"Running {self} in PID {os.getpid()}")
318
+ self.logger.info("Ready")
305
319
 
306
320
  mp_array = np.ctypeslib.as_array(
307
321
  self.label_array).reshape(self.data.image.chunk_shape)
322
+
323
+ # only close queues when we have created them ourselves.
324
+ close_queues = isinstance(self, EventExtractorProcess)
325
+
308
326
  while True:
309
327
  try:
310
328
  chunk_index, label_index = self.raw_queue.get(timeout=.03)
@@ -332,15 +350,21 @@ class QueueEventExtractor:
332
350
  self.event_queue.put((index, events))
333
351
 
334
352
  self.logger.debug(f"Finalizing `run` for PID {os.getpid()}, {self}")
335
- if self.close_queues:
353
+ if close_queues:
336
354
  # Explicitly close the event queue and join it
337
355
  self.event_queue.close()
338
356
  self.event_queue.join_thread()
339
357
  self.logger.debug(f"End of `run` for PID {os.getpid()}, {self}")
358
+
359
+ # Make sure everything gets written to the queue.
360
+ queue_handler.flush()
361
+
362
+ if close_queues:
340
363
  # Also close the logging queue. Note that not all messages might
341
364
  # arrive in the logging queue, since we called `cancel_join_thread`
342
365
  # earlier.
343
366
  self.log_queue.close()
367
+ self.log_queue.join_thread()
344
368
 
345
369
  @classmethod
346
370
  def get_ppid_from_kwargs(cls, kwargs):
@@ -362,17 +386,3 @@ class EventExtractorThread(QueueEventExtractor, threading.Thread):
362
386
  def __init__(self, *args, **kwargs):
363
387
  super(EventExtractorThread, self).__init__(
364
388
  name="EventExtractorThread", *args, **kwargs)
365
-
366
-
367
- class DeduplicatingLoggingFilter(logging.Filter):
368
- def __init__(self, *args, **kwargs):
369
- super(DeduplicatingLoggingFilter, self).__init__(*args, **kwargs)
370
- self._records = []
371
-
372
- def filter(self, record):
373
- """Return True if the record should be logged"""
374
- msg = record.getMessage()
375
- logged = msg in self._records
376
- if not logged:
377
- self._records.append(msg)
378
- return not logged