dcnum 0.23.3.tar.gz → 0.24.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dcnum has been flagged as potentially problematic.

Files changed (118)
  1. {dcnum-0.23.3 → dcnum-0.24.0}/.github/workflows/check.yml +2 -2
  2. {dcnum-0.23.3 → dcnum-0.24.0}/CHANGELOG +14 -0
  3. {dcnum-0.23.3 → dcnum-0.24.0}/PKG-INFO +1 -1
  4. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/_version.py +2 -2
  5. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_background/base.py +24 -9
  6. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_background/bg_sparse_median.py +54 -28
  7. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/logic/ctrl.py +83 -38
  8. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/meta/ppid.py +1 -1
  9. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/read/hdf5_data.py +138 -72
  10. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/read/mapped.py +15 -2
  11. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/segm/segm_torch/segm_torch_mpo.py +4 -1
  12. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/write/__init__.py +1 -1
  13. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/write/writer.py +122 -21
  14. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum.egg-info/PKG-INFO +1 -1
  15. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_feat_background_base.py +28 -10
  16. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_feat_background_bg_roll_median.py +31 -0
  17. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_feat_background_bg_sparsemed.py +179 -7
  18. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_logic_pipeline.py +101 -12
  19. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_read_basin.py +24 -17
  20. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_read_hdf5_basins.py +16 -14
  21. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_read_hdf5_index_mapping.py +10 -4
  22. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_write_writer.py +93 -9
  23. {dcnum-0.23.3 → dcnum-0.24.0}/.github/workflows/deploy_pypi.yml +0 -0
  24. {dcnum-0.23.3 → dcnum-0.24.0}/.gitignore +0 -0
  25. {dcnum-0.23.3 → dcnum-0.24.0}/.readthedocs.yml +0 -0
  26. {dcnum-0.23.3 → dcnum-0.24.0}/LICENSE +0 -0
  27. {dcnum-0.23.3 → dcnum-0.24.0}/README.rst +0 -0
  28. {dcnum-0.23.3 → dcnum-0.24.0}/docs/conf.py +0 -0
  29. {dcnum-0.23.3 → dcnum-0.24.0}/docs/extensions/github_changelog.py +0 -0
  30. {dcnum-0.23.3 → dcnum-0.24.0}/docs/index.rst +0 -0
  31. {dcnum-0.23.3 → dcnum-0.24.0}/docs/requirements.txt +0 -0
  32. {dcnum-0.23.3 → dcnum-0.24.0}/pyproject.toml +0 -0
  33. {dcnum-0.23.3 → dcnum-0.24.0}/setup.cfg +0 -0
  34. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/__init__.py +0 -0
  35. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/__init__.py +0 -0
  36. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/event_extractor_manager_thread.py +0 -0
  37. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_background/__init__.py +0 -0
  38. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_background/bg_copy.py +0 -0
  39. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_background/bg_roll_median.py +0 -0
  40. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_brightness/__init__.py +0 -0
  41. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_brightness/bright_all.py +0 -0
  42. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_brightness/common.py +0 -0
  43. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_contour/__init__.py +0 -0
  44. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_contour/contour.py +0 -0
  45. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_contour/moments.py +0 -0
  46. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_contour/volume.py +0 -0
  47. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_texture/__init__.py +0 -0
  48. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_texture/common.py +0 -0
  49. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/feat_texture/tex_all.py +0 -0
  50. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/gate.py +0 -0
  51. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/feat/queue_event_extractor.py +0 -0
  52. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/logic/__init__.py +0 -0
  53. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/logic/job.py +0 -0
  54. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/logic/json_encoder.py +0 -0
  55. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/meta/__init__.py +0 -0
  56. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/meta/paths.py +0 -0
  57. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/read/__init__.py +0 -0
  58. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/read/cache.py +0 -0
  59. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/read/const.py +0 -0
  60. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/segm/__init__.py +0 -0
  61. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/segm/segm_thresh.py +0 -0
  62. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/segm/segm_torch/__init__.py +0 -0
  63. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/segm/segm_torch/segm_torch_base.py +0 -0
  64. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/segm/segm_torch/segm_torch_sto.py +0 -0
  65. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/segm/segm_torch/torch_model.py +0 -0
  66. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/segm/segm_torch/torch_postproc.py +0 -0
  67. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/segm/segm_torch/torch_preproc.py +0 -0
  68. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/segm/segmenter.py +0 -0
  69. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/segm/segmenter_manager_thread.py +0 -0
  70. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/segm/segmenter_mpo.py +0 -0
  71. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/segm/segmenter_sto.py +0 -0
  72. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/write/deque_writer_thread.py +0 -0
  73. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum/write/queue_collector_thread.py +0 -0
  74. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum.egg-info/SOURCES.txt +0 -0
  75. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum.egg-info/dependency_links.txt +0 -0
  76. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum.egg-info/requires.txt +0 -0
  77. {dcnum-0.23.3 → dcnum-0.24.0}/src/dcnum.egg-info/top_level.txt +0 -0
  78. {dcnum-0.23.3 → dcnum-0.24.0}/tests/conftest.py +0 -0
  79. {dcnum-0.23.3 → dcnum-0.24.0}/tests/data/fmt-hdf5_cytoshot_extended-moments-features.zip +0 -0
  80. {dcnum-0.23.3 → dcnum-0.24.0}/tests/data/fmt-hdf5_cytoshot_full-features_2023.zip +0 -0
  81. {dcnum-0.23.3 → dcnum-0.24.0}/tests/data/fmt-hdf5_cytoshot_full-features_2024.zip +0 -0
  82. {dcnum-0.23.3 → dcnum-0.24.0}/tests/data/fmt-hdf5_cytoshot_full-features_legacy_allev_2023.zip +0 -0
  83. {dcnum-0.23.3 → dcnum-0.24.0}/tests/data/fmt-hdf5_shapein_empty.zip +0 -0
  84. {dcnum-0.23.3 → dcnum-0.24.0}/tests/data/fmt-hdf5_shapein_raw-with-variable-length-logs.zip +0 -0
  85. {dcnum-0.23.3 → dcnum-0.24.0}/tests/data/segm-torch-model_unet-dcnum-test_g1_910c2.zip +0 -0
  86. {dcnum-0.23.3 → dcnum-0.24.0}/tests/data/segm-torch-test-data_unet-dcnum-test_g1_910c2.zip +0 -0
  87. {dcnum-0.23.3 → dcnum-0.24.0}/tests/helper_methods.py +0 -0
  88. {dcnum-0.23.3 → dcnum-0.24.0}/tests/requirements.txt +0 -0
  89. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_feat_background_bg_copy.py +0 -0
  90. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_feat_brightness.py +0 -0
  91. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_feat_event_extractor_manager.py +0 -0
  92. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_feat_gate.py +0 -0
  93. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_feat_haralick.py +0 -0
  94. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_feat_moments_based.py +0 -0
  95. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_feat_moments_based_extended.py +0 -0
  96. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_feat_volume.py +0 -0
  97. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_init.py +0 -0
  98. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_logic_job.py +0 -0
  99. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_logic_join.py +0 -0
  100. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_logic_json.py +0 -0
  101. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_meta_paths.py +0 -0
  102. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_meta_ppid_base.py +0 -0
  103. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_meta_ppid_bg.py +0 -0
  104. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_meta_ppid_data.py +0 -0
  105. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_meta_ppid_feat.py +0 -0
  106. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_meta_ppid_gate.py +0 -0
  107. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_meta_ppid_segm.py +0 -0
  108. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_read_concat_hdf5.py +0 -0
  109. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_read_hdf5.py +0 -0
  110. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_segm_base.py +0 -0
  111. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_segm_mpo.py +0 -0
  112. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_segm_no_mask_proc.py +0 -0
  113. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_segm_sto.py +0 -0
  114. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_segm_thresh.py +0 -0
  115. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_segm_torch.py +0 -0
  116. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_segm_torch_preproc.py +0 -0
  117. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_write_deque_writer_thread.py +0 -0
  118. {dcnum-0.23.3 → dcnum-0.24.0}/tests/test_write_queue_collector_thread.py +0 -0
@@ -29,8 +29,8 @@ jobs:
  python -m pip install coverage flake8 pytest
  - name: Install dcnum
  run: |
- # https://github.com/luispedro/mahotas/issues/144
- pip install mahotas==1.4.13
+ # mahotas 1.4.15 does not yet support numpy 2.0
+ pip install "numpy<2"
  pip install .[torch]
  - name: List installed packages
  run: |
@@ -1,3 +1,17 @@
+ 0.24.0
+ - feat: add support for internal basins
+ - feat: "image_bg" as internal basin for "sparsemed" background computer
+ - fix: "sparsemed" background computer attributed background images with
+   an offset of `split_time` (the first event obtained the background image
+   of the first event of the first second and so on)
+ - enh: support numpy indexing for mapped basins
+ - enh: add new `write.copy_basins` method
+ - ref: return `h5py.Group` in `HDF5Data.get_basin_data` instead of
+   a basin `HDF5Data` instance
+ - ref: perform "plumbing" before "cleanup" in pipeline
+ - ref: increment DCNUM_PPID_GENERATION to 11
+ 0.23.4
+ - enh: run set_num_interop_threads(1) for torchmpo segmenter
  0.23.3
  - fix: ignore non-file-type-like basins
  - fix: workaround for slow reading from HDF5 (don't use index arrays)
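
To illustrate the "internal basins" and "numpy indexing for mapped basins" entries above, here is a minimal numpy sketch of the underlying idea (array shapes and values are hypothetical): only the unique background images are stored, together with a per-event index array, and the per-event "image_bg" feature is resolved by fancy indexing:

    import numpy as np

    rng = np.random.default_rng(42)
    # Only the unique background images are stored...
    bg_images = rng.integers(0, 255, size=(3, 80, 250), dtype=np.uint8)
    # ...plus one index per event ("mapping" of the internal basin).
    bg_idx = np.array([0, 0, 1, 1, 1, 2])

    # Resolving the basin is plain numpy fancy indexing; slices work too.
    image_bg_events = bg_images[bg_idx]       # shape (6, 80, 250)
    image_bg_subset = bg_images[bg_idx[2:5]]  # numpy indexing for mapped basins
    assert image_bg_events.shape == (6, 80, 250)
    assert image_bg_subset.shape == (3, 80, 250)
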
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dcnum
- Version: 0.23.3
+ Version: 0.24.0
  Summary: numerics toolbox for imaging deformability cytometry
  Author: Maximilian Schlögel, Paul Müller, Raghava Alajangi
  Maintainer-email: Paul Müller <dev@craban.de>
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE
 
- __version__ = version = '0.23.3'
- __version_tuple__ = version_tuple = (0, 23, 3)
+ __version__ = version = '0.24.0'
+ __version_tuple__ = version_tuple = (0, 24, 0)
@@ -1,8 +1,10 @@
  import abc
  import functools
  import inspect
+ import logging
  import multiprocessing as mp
  import pathlib
+ import time
 
  import h5py
 
@@ -41,8 +43,11 @@ class Background(abc.ABC):
  kwargs:
  Additional keyword arguments passed to the subclass.
  """
+ self.logger = logging.getLogger(
+ f"dcnum.feat.feat_background.{self.__class__.__name__}")
  # proper conversion to Path objects
  output_path = pathlib.Path(output_path)
+ self.output_path = output_path
  if isinstance(input_data, str):
  input_data = pathlib.Path(input_data)
  # kwargs checks
@@ -188,20 +193,30 @@ class Background(abc.ABC):
  return self.image_proc.value
 
  def process(self):
+ """Perform the background computation
+
+ This irreversibly removes/overrides any "image_bg" and
+ "bg_off" features defined in the output file `self.h5out`.
+ """
+ t0 = time.perf_counter()
  # Delete any old background data
- for key in ["image_bg", "bg_off"]:
- if key in self.h5out["events"]:
- del self.h5out["events"][key]
+ for ds_key in ["image_bg", "bg_off"]:
+ for grp_key in ["events", "basin_events"]:
+ if grp_key in self.h5out and ds_key in self.h5out[grp_key]:
+ del self.h5out[grp_key][ds_key]
  # Perform the actual background computation
  self.process_approach()
  bg_ppid = self.get_ppid()
  # Store pipeline information in the image_bg/bg_off feature
- for key in ["image_bg", "bg_off"]:
- if key in self.h5out["events"]:
- self.h5out[f"events/{key}"].attrs["dcnum ppid background"] = \
- bg_ppid
- self.h5out[F"events/{key}"].attrs["dcnum ppid generation"] = \
- ppid.DCNUM_PPID_GENERATION
+ for ds_key in ["image_bg", "bg_off"]:
+ for grp_key in ["events", "basin_events"]:
+ if grp_key in self.h5out and ds_key in self.h5out[grp_key]:
+ self.h5out[f"{grp_key}/{ds_key}"].attrs[
+ "dcnum ppid background"] = bg_ppid
+ self.h5out[F"{grp_key}/{ds_key}"].attrs[
+ "dcnum ppid generation"] = ppid.DCNUM_PPID_GENERATION
+ self.logger.info(
+ f"Background computation time: {time.perf_counter()-t0:.1f}s")
 
  @abc.abstractmethod
  def process_approach(self):
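
The hunk above writes the pipeline-identifier attributes to both the "events" and "basin_events" groups. A hedged sketch of how they could be read back with h5py (the file name is hypothetical):

    import h5py

    # Inspect the ppid attributes that Background.process() attaches to
    # "image_bg"/"bg_off" in either group, skipping whatever is absent.
    with h5py.File("output.rtdc", "r") as h5:
        for grp_key in ["events", "basin_events"]:
            for ds_key in ["image_bg", "bg_off"]:
                if grp_key in h5 and ds_key in h5[grp_key]:
                    attrs = h5[f"{grp_key}/{ds_key}"].attrs
                    print(grp_key, ds_key,
                          attrs.get("dcnum ppid background"),
                          attrs.get("dcnum ppid generation"))
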
@@ -1,4 +1,3 @@
- import logging
  import queue
  import time
 
@@ -9,14 +8,13 @@ from ...read import HDF5Data
 
  from .base import mp_spawn, Background
 
- logger = logging.getLogger(__name__)
-
 
  class BackgroundSparseMed(Background):
  def __init__(self, input_data, output_path, kernel_size=200,
  split_time=1., thresh_cleansing=0, frac_cleansing=.8,
  offset_correction=True,
- compress=True, num_cpus=None):
+ compress=True,
+ num_cpus=None):
  """Sparse median background correction with cleansing
 
  In contrast to the rolling median background correction,
@@ -79,6 +77,11 @@ class BackgroundSparseMed(Background):
  num_cpus: int
  Number of CPUs to use for median computation. Defaults to
  `multiprocessing.cpu_count()`.
+
+ .. versionchanged:: 0.23.5
+
+ The background image data are stored as an internal
+ mapped basin to reduce the output file size.
  """
  super(BackgroundSparseMed, self).__init__(
  input_data=input_data,
@@ -93,7 +96,7 @@
  )
 
  if kernel_size > len(self.input_data):
- logger.warning(
+ self.logger.warning(
  f"The kernel size {kernel_size} is too large for input data"
  f"size {len(self.input_data)}. Setting it to input data size!")
  kernel_size = len(self.input_data)
@@ -126,13 +129,14 @@
  else:
  # compute time using frame rate (approximate)
  dur = self.image_count / fr * 1.5
- logger.info(f"Approximating duration: {dur/60:.1f}min")
+ self.logger.info(
+ f"Approximating duration: {dur/60:.1f}min")
  self.time = np.linspace(0, dur, self.image_count,
  endpoint=True)
  if self.time is None:
  # No HDF5 file or no information therein; Make an educated guess.
  dur = self.image_count / 3600 * 1.5
- logger.info(f"Guessing duration: {dur/60:.1f}min")
+ self.logger.info(f"Guessing duration: {dur/60:.1f}min")
  self.time = np.linspace(0, dur, self.image_count,
  endpoint=True)
 
@@ -301,18 +305,18 @@
  thresh = np.quantile(ref, self.frac_cleansing)
  used = ref <= thresh
  frac_remove = np.sum(~used) / used.size
- logger.warning(
+ self.logger.warning(
  f"{frac_remove_user:.1%} of the background images would "
  f"be removed with the current settings, so we enforce "
  f"`frac_cleansing`. To avoid this warning, try decreasing "
  f"`thresh_cleansing` or `frac_cleansing`. The new "
  f"threshold is {thresh_fact / thresh}.")
 
- logger.info(f"Cleansed {frac_remove:.2%}")
+ self.logger.info(f"Cleansed {frac_remove:.2%}")
  step_times = self.step_times[used]
  bg_images = self.bg_images[used]
  else:
- logger.info("Background series cleansing disabled")
+ self.logger.info("Background series cleansing disabled")
  step_times = self.step_times
  bg_images = self.bg_images
 
@@ -322,35 +326,55 @@
  idx1 = None
  for ii in range(len(step_times)):
  t1 = step_times[ii]
- idx1 = np.argmin(np.abs(self.time - t1 + self.split_time/2))
+ idx1 = np.argmin(np.abs(self.time - t1 - self.split_time/2))
  bg_idx[idx0:idx1] = ii
  idx0 = idx1
  if idx1 is not None:
  # Fill up remainder of index array with last entry
  bg_idx[idx1:] = ii
 
- self.image_proc.value = 1
-
- # Write background data
- pos = 0
- step = 1000
- while pos < self.image_count:
- stop = min(pos + step, self.image_count)
- cur_slice = slice(pos, stop)
- cur_bg_data = bg_images[bg_idx[cur_slice]]
- self.writer.store_feature_chunk("image_bg", cur_bg_data)
- if self.offset_correction:
+ # Store the background images as an internal mapped basin
+ self.writer.store_basin(
+ name="background images",
+ description=f"Pipeline identifier: {self.get_ppid()}",
+ mapping=bg_idx,
+ internal_data={"image_bg": bg_images}
+ )
+
+ # store the offset correction, if applicable
+ if self.offset_correction:
+ self.logger.info("Computing offset correction")
+ # compute the mean at the top of all background images
+ sh, sw = self.input_data.shape[1:]
+ roi_full = (slice(None), slice(0, 20), slice(0, sw))
+ bg_data_mean = np.mean(bg_images[roi_full], axis=(1, 2))
+ pos = 0
+ step = self.writer.get_best_nd_chunks(item_shape=(sh, sw),
+ feat_dtype=np.uint8)[0]
+ bg_off = np.zeros(self.image_count, dtype=float)
+ # For every chunk in the input image data, compute that
+ # value as well and store the resulting offset value.
+ # TODO: Could this be parallelized, or are we limited in reading?
+ while pos < self.image_count:
+ stop = min(pos + step, self.image_count)
  # Record background offset correction "bg_off". We take a
  # slice of 20px from the top of the image (there are normally
  # no events here, only the channel walls are visible).
- sh, sw = self.input_data.shape[1:]
- roi_full = (slice(None), slice(0, 20), slice(0, sw))
+ cur_slice = slice(pos, stop)
+ # mean background brightness
+ val_bg = bg_data_mean[bg_idx[cur_slice]]
+ # mean image brightness
  roi_cur = (cur_slice, slice(0, 20), slice(0, sw))
- val_bg = np.mean(cur_bg_data[roi_full], axis=(1, 2))
  val_dat = np.mean(self.input_data[roi_cur], axis=(1, 2))
  # background image = image_bg + bg_off
- self.writer.store_feature_chunk("bg_off", val_dat - val_bg)
- pos += step
+ bg_off[cur_slice] = val_dat - val_bg
+ # set progress
+ self.image_proc.value = 0.5 * (1 + pos / self.image_count)
+ pos = stop
+ # finally, store the background offset feature
+ self.writer.store_feature_chunk("bg_off", bg_off)
+
+ self.image_proc.value = 1
 
  def process_second(self,
  ii: int,
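
The sign flip in the `np.argmin` line above is the fix for the `split_time` offset noted in the changelog. A hedged numpy sketch (event times and window centers are hypothetical) contrasting the old and new index assignment:

    import numpy as np

    split_time = 1.0
    time = np.arange(0, 3, 0.5)             # event times
    step_times = np.array([0.5, 1.5, 2.5])  # centers of 1 s windows

    def assign(sign):
        # sign=+1 reproduces the old code, sign=-1 the fixed code
        bg_idx = np.zeros(len(time), dtype=int)
        idx0 = 0
        for ii, t1 in enumerate(step_times):
            idx1 = np.argmin(np.abs(time - t1 + sign * split_time / 2))
            bg_idx[idx0:idx1] = ii
            idx0 = idx1
        bg_idx[idx1:] = ii  # fill remainder with last entry
        return bg_idx

    print(assign(+1))  # [1 1 2 2 2 2] -> backgrounds shifted by one window
    print(assign(-1))  # [0 0 1 1 2 2] -> each event gets its own window
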
@@ -393,7 +417,9 @@
 
  self.bg_images[ii] = self.shared_output.reshape(self.image_shape)
 
- self.image_proc.value = idx_stop / self.image_count
+ self.image_proc.value = idx_stop / (
+ # with offset correction, everything is slower
+ self.image_count * (1 + self.offset_correction))
 
 
  class WorkerSparseMed(mp_spawn.Process):
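
The new denominator relies on a Python bool acting as 0 or 1 in arithmetic, so with offset correction enabled `process_second` only drives the progress to 50 % and the offset-correction loop covers the rest:

    # True counts as 1 in arithmetic, so the denominator doubles:
    image_count = 100
    offset_correction = True
    print(image_count * (1 + offset_correction))  # 200
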
@@ -403,6 +403,12 @@ class DCNumJobRunner(threading.Thread):
  features=orig_feats,
  mapping=None)
 
+ # Handle basin data according to the user's request
+ self.state = "plumbing"
+ self.task_enforce_basin_strategy()
+
+ self.state = "cleanup"
+
  with HDF5Writer(self.path_temp_out) as hw:
  # pipeline metadata
  hw.h5.attrs["pipeline:dcnum generation"] = self.ppdict["gen_id"]
@@ -462,11 +468,7 @@
 
  # copy metadata/logs/tables from original file
  with h5py.File(self.job["path_in"]) as h5_src:
- copy_metadata(h5_src=h5_src,
- h5_dst=hw.h5,
- # Don't copy basins, we would have to index-map
- # them first.
- copy_basins=False)
+ copy_metadata(h5_src=h5_src, h5_dst=hw.h5)
  if redo_seg:
  # Store the correct measurement identifier. This is used to
  # identify this file as a correct basin in subsequent pipeline
@@ -490,12 +492,6 @@
  mid_new = f"{mid_cur}_{mid_ap}" if mid_cur else mid_ap
  hw.h5.attrs["experiment:run identifier"] = mid_new
 
- # Handle basin data according to the user's request
- self.state = "plumbing"
- self.task_enforce_basin_strategy()
-
- self.state = "cleanup"
-
  trun = datetime.timedelta(seconds=round(time.monotonic() - time_start))
  self.logger.info(f"Run duration: {str(trun)}")
  self.logger.info(time.strftime("Run stop: %Y-%m-%d-%H.%M.%S",
@@ -547,22 +543,17 @@
  """
  self._progress_bn = 0
  t0 = time.perf_counter()
- # We need to make sure that the features are correctly attributed
- # from the input files. E.g. if the input file already has
- # background images, but we recompute the background images, then
- # we have to use the data from the recomputed background file.
- # We achieve this by keeping a specific order and only copying those
- # features that we don't already have in the output file.
- feats_raw = [
- # 1. background data from the temporary input image
- # (this must come before draw [sic!])
- [self.dtin.h5, ["image_bg", "bg_off"], "critical"],
- # 2. frame-based scalar features from the raw input file
- # (e.g. "temp" or "frame")
- [self.draw.h5, self.draw.features_scalar_frame, "optional"],
- # 3. image features from the input file
- [self.draw.h5, ["image", "image_bg", "bg_off"], "optional"],
- ]
+ # We have these points to consider:
+ # - We must use the `basinmap` feature to map from the original
+ # file to the output file.
+ # - We must copy "bg_off" and "image_bg" to the output file.
+ # - For the "drain" basin strategy, we also have to copy all the
+ # other features.
+ # - If "image_bg" is defined as an internal basin in the input
+ # file, we have to convert the mapping and store a corresponding
+ # internal basin in the output file.
+
+ # Determine the basinmap feature
  with HDF5Writer(self.path_temp_out) as hw:
  hout = hw.h5
  # First, we have to determine the basin mapping from input to
@@ -584,14 +575,15 @@
  # to the original input HDF5 file.
  raw_im = self.draw.index_mapping
  if raw_im is None:
- self.logger.info("Input file mapped with basinmap0")
  # Create a hard link to save time and space
  hout["events/basinmap0"] = hout["events/index_unmapped"]
- basinmap = idx_um
+ basinmap0 = idx_um
  else:
- basinmap = get_mapping_indices(raw_im)[idx_um]
+ self.logger.info("Converting input mapping")
+ basinmap0 = get_mapping_indices(raw_im)[idx_um]
  # Store the mapped basin data in the output file.
- hw.store_feature_chunk("basinmap0", basinmap)
+ hw.store_feature_chunk("basinmap0", basinmap0)
+ self.logger.info("Input mapped to output with basinmap0")
  # We don't need them anymore.
  del hout["events/index_unmapped"]
 
@@ -599,19 +591,72 @@
  # is the size of the raw dataset and the latter is its mapped
  # size!
  size_raw = self.draw.h5.attrs["experiment:event count"]
- if (len(basinmap) == size_raw
- and np.all(basinmap == np.arange(size_raw))):
+ if (len(basinmap0) == size_raw
+ and np.all(basinmap0 == np.arange(size_raw))):
  # This means that the images in the input overlap perfectly
  # with the images in the output, i.e. a "copy" segmenter
  # was used or something is very reproducible.
  # We set basinmap to None to be more efficient.
- basinmap = None
+ basinmap0 = None
 
  else:
  # The input is identical to the output, because we are using
  # the same pipeline identifier.
- basinmap = None
-
+ basinmap0 = None
+
+ # List of features we have to copy from input to output.
+ # We need to make sure that the features are correctly attributed
+ # from the input files. E.g. if the input file already has
+ # background images, but we recompute the background images, then
+ # we have to use the data from the recomputed background file.
+ # We achieve this by keeping a specific order and only copying
+ # those features that we don't already have in the output file.
+ feats_raw = [
+ # background data from the temporary input image
+ [self.dtin.h5, ["bg_off"], "critical"],
+ [self.draw.h5, self.draw.features_scalar_frame, "optional"],
+ [self.draw.h5, ["image", "bg_off"], "optional"],
+ ]
+
+ # Store image_bg as an internal basin, if defined in input
+ for idx in range(len(self.dtin.basins)):
+ bn_dict = self.dtin.basins[idx]
+ if (bn_dict["type"] == "internal"
+ and "image_bg" in bn_dict["features"]):
+ self.logger.info(
+ "Copying internal basin background images")
+ bn_grp, bn_feats, bn_map = self.dtin.get_basin_data(idx)
+ assert "image_bg" in bn_feats
+ # Load all images into memory (should only be ~600)
+ bg_images1 = self.dtin.h5["basin_events"]["image_bg"][:]
+ # Get the original internal mapping for these images
+ # Note that `basinmap0` always refers to indices in the
+ # original raw input file, and not to indices in an
+ # optional mapped input file (using `index_mapping`).
+ # Therefore, we do `self.dtin.h5["events"]["basinmap0"]`
+ # instead of `self.dtin["basinmap0"]`
+ basinmap_in = self.dtin.h5["events"][bn_dict["mapping"]][:]
+ # Now we have to convert the indices in `basinmap_in`
+ # to indices in the output file.
+ basinmap1 = basinmap_in[basinmap0]
+ # Store the internal mapping in the output file
+ hw.store_basin(name=bn_dict["name"],
+ description=bn_dict["description"],
+ mapping=basinmap1,
+ internal_data={"image_bg": bg_images1}
+ )
+ break
+ else:
+ self.logger.info("Background images must be copied")
+ # There is no internal image_bg feature, probably because
+ # the user did not use the sparsemed background correction.
+ # In this case, we simply add "image_bg" to the `feats_raw`.
+ feats_raw += [
+ [self.dtin.h5, ["image_bg"], "critical"],
+ [self.draw.h5, ["image_bg"], "optional"],
+ ]
+
+ # Copy the features required in the output file.
  for hin, feats, importance in feats_raw:
  # Only consider features that are available in the input
  # and that are not already in the output.
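
The central step in the hunk above is `basinmap1 = basinmap_in[basinmap0]`. A small worked example of this index composition (all values hypothetical):

    import numpy as np

    basinmap_in = np.array([0, 0, 0, 1, 1, 2, 2, 2])  # raw input event -> bg image
    basinmap0 = np.array([1, 2, 5, 7])                # output event -> raw input event

    # Composing the two mappings with fancy indexing yields
    # output event -> background image directly:
    basinmap1 = basinmap_in[basinmap0]
    print(basinmap1)  # [0 0 2 2]
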
@@ -626,7 +671,7 @@
  copy_features(h5_src=hin,
  h5_dst=hout,
  features=feats,
- mapping=basinmap)
+ mapping=basinmap0)
  else:
  # TAP: Create basins for the "optional" features in the
  # output file. Note that the "critical" features never
@@ -638,7 +683,7 @@
  paths = [pin, os.path.relpath(pin, pout)]
  hw.store_basin(name="dcnum basin",
  features=feats,
- mapping=basinmap0,
+ mapping=basinmap0,
  paths=paths,
  description=f"Created with dcnum {version}",
  )
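
The `paths` pair above stores the absolute basin path plus its location relative to the output, so the basin stays resolvable if the directory tree is moved. A hedged sketch with hypothetical POSIX paths:

    import os

    pin = "/data/measurements/run1_input.rtdc"  # basin file (input)
    pout = "/data/results"                      # location of the output file
    print([pin, os.path.relpath(pin, pout)])
    # ['/data/measurements/run1_input.rtdc', '../measurements/run1_input.rtdc']
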
@@ -10,7 +10,7 @@ import warnings
 
  #: Increment this string if there are breaking changes that make
  #: previous pipelines unreproducible.
- DCNUM_PPID_GENERATION = "10"
+ DCNUM_PPID_GENERATION = "11"
 
 
  class ClassWithPPIDCapabilities(Protocol):