dcnum 0.19.1-py3-none-any.whl → 0.20.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dcnum might be problematic.

dcnum/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '0.19.1'
- __version_tuple__ = version_tuple = (0, 19, 1)
+ __version__ = version = '0.20.1'
+ __version_tuple__ = version_tuple = (0, 20, 1)
dcnum/feat/event_extractor_manager_thread.py CHANGED
@@ -96,8 +96,8 @@ class EventExtractorManagerThread(threading.Thread):
  # If the writer_dq starts filling up, then this could lead to
  # an oom-kill signal. Stall for the writer to prevent this.
  ldq = len(self.writer_dq)
- if ldq > 100:
- stallsec = ldq / 100
+ if ldq > 1000:
+ stallsec = ldq / 1000
  self.logger.warning(
  f"Stalling {stallsec:.1f}s for slow writer")
  time.sleep(stallsec)
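
The threshold change above makes the manager stall only once more than 1000 chunks are queued, sleeping roughly one second per 1000 queued chunks. A standalone sketch of that backpressure rule (illustrative only; `writer_dq` and `logger` stand in for the thread's deque and logger):

    import time

    def stall_for_writer(writer_dq, logger):
        # Backpressure heuristic mirroring the hunk above: once the writer
        # deque holds more than 1000 queued chunks, sleep about one second
        # per 1000 entries so the writer can catch up.
        ldq = len(writer_dq)
        if ldq > 1000:
            stallsec = ldq / 1000
            logger.warning(f"Stalling {stallsec:.1f}s for slow writer")
            time.sleep(stallsec)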
dcnum/feat/feat_background/base.py CHANGED
@@ -62,8 +62,8 @@ class Background(abc.ABC):

  #: number of images in the input data
  self.image_count = None
- #: number of images that have been processed
- self.image_proc = mp_spawn.Value("L", 0)
+ #: fraction images that have been processed
+ self.image_proc = mp_spawn.Value("d", 0)

  #: HDF5Data instance for input data
  self.hdin = None
@@ -185,7 +185,7 @@ class Background(abc.ABC):
  if self.image_count == 0:
  return 0.
  else:
- return self.image_proc.value / self.image_count
+ return self.image_proc.value

  def process(self):
  # Delete any old background data
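
With this change, `image_proc` holds a fraction in [0, 1] in a shared double (typecode "d") instead of an absolute image count (typecode "L"), so the progress property can return the value directly. A minimal sketch of the shared-value mechanics, independent of dcnum (the batch numbers are made up):

    import multiprocessing as mp

    mp_spawn = mp.get_context("spawn")
    image_proc = mp_spawn.Value("d", 0)  # shared double holding a fraction

    # a background worker would bump the fraction as batches complete:
    batch_size, image_count = 500, 10000
    image_proc.value += batch_size / image_count
    print(f"{image_proc.value:.0%} done")  # -> 5% done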
dcnum/feat/feat_background/bg_copy.py CHANGED
@@ -9,17 +9,19 @@ class BackgroundCopy(Background):
  pass

  def process(self):
- """Perform median computation on entire input data"""
+ """Copy input data to output dataset"""
  if self.h5in != self.h5out:
- hin = self.hdin.image_bg.h5ds
- h5py.h5o.copy(src_loc=hin.parent.id,
- src_name=b"image_bg",
- dst_loc=self.h5out["events"].id,
- dst_name=b"image_bg",
- )
+ hin = self.hdin.h5
+ for feat in ["image_bg", "bg_off"]:
+ if feat in hin["events"]:
+ h5py.h5o.copy(src_loc=hin["events"].id,
+ src_name=feat.encode("utf-8"),
+ dst_loc=self.h5out["events"].id,
+ dst_name=feat.encode("utf-8"),
+ )

  # set progress to 100%
- self.image_proc.value = self.image_count
+ self.image_proc.value = 1

  def process_approach(self):
  # We do the copying in `process`, because we do not want to modify
dcnum/feat/feat_background/bg_roll_median.py CHANGED
@@ -184,7 +184,7 @@ class BackgroundRollMed(Background):
  num_remaining,
  axis=0)
  self.writer.store_feature_chunk("image_bg", last_chunk)
- self.image_proc.value += num_remaining
+ self.image_proc.value = 1

  def process_next_batch(self):
  """Process one batch of input data"""
@@ -223,7 +223,7 @@ class BackgroundRollMed(Background):
  )

  self.current_batch += 1
- self.image_proc.value += self.batch_size
+ self.image_proc.value += self.batch_size / self.image_count


  class WorkerRollMed(mp_spawn.Process):
dcnum/feat/feat_background/bg_sparse_median.py CHANGED
@@ -329,7 +329,7 @@ class BackgroundSparseMed(Background):
  # Fill up remainder of index array with last entry
  bg_idx[idx1:] = ii

- self.image_proc.value = self.image_count
+ self.image_proc.value = 1

  # Write background data
  pos = 0
@@ -393,7 +393,7 @@ class BackgroundSparseMed(Background):

  self.bg_images[ii] = self.shared_output.reshape(self.image_shape)

- self.image_proc.value = idx_stop
+ self.image_proc.value = idx_stop / self.image_count


  class WorkerSparseMed(mp_spawn.Process):
dcnum/logic/ctrl.py CHANGED
@@ -14,6 +14,7 @@ import traceback
  import uuid

  import h5py
+ import numpy as np

  from ..feat.feat_background.base import get_available_background_methods
  from ..feat.queue_event_extractor import QueueEventExtractor
@@ -21,10 +22,10 @@ from ..feat import gate
  from ..feat import EventExtractorManagerThread
  from ..segm import SegmenterManagerThread, get_available_segmenters
  from ..meta import ppid
- from ..read import HDF5Data
- from .._version import version_tuple
+ from ..read import HDF5Data, get_mapping_indices
+ from .._version import version, version_tuple
  from ..write import (
- DequeWriterThread, HDF5Writer, QueueCollectorThread,
+ DequeWriterThread, HDF5Writer, QueueCollectorThread, copy_features,
  copy_metadata, create_with_basins, set_default_filter_kwargs
  )

@@ -43,6 +44,7 @@ valid_states = [
  "setup",
  "background",
  "segmentation",
+ "plumbing",
  "cleanup",
  "done",
  "error",
@@ -79,8 +81,9 @@ class DCNumJobRunner(threading.Thread):
  # current job state
  self._state = "init"
  # overall progress [0, 1]
- self._progress_bg = None
- self._progress_ex = None
+ self._progress_bg = None # background
+ self._progress_ex = None # segmentation
+ self._progress_bn = None # creating basins
  # segmentation frame rate
  self._segm_rate = 0

@@ -237,8 +240,12 @@ class DCNumJobRunner(threading.Thread):
  # how much fractional time each processing step takes.
  bgw = 4 # fraction of background
  exw = 27 # fraction of segmentation and feature extraction
+ if self.job["basin_strategy"] == "drain":
+ drw = 15 # because data need to be copied
+ else:
+ drw = 1 # just creating the basins in output file
  clw = 1 # fraction of cleanup operations
- tot = bgw + exw + clw
+ tot = bgw + exw + drw + clw
  progress = 0
  st = self.state

@@ -247,15 +254,22 @@ class DCNumJobRunner(threading.Thread):
  # background already computed
  progress += bgw / tot
  elif self._progress_bg is not None:
- # This is the image count of the input dataset
- progress += bgw / tot * (self._progress_bg.value / len(self.draw))
+ # This is the image count of the input dataset.
+ progress += self._progress_bg.value * bgw / tot

  # segmentation
  if valid_states.index(st) > valid_states.index("segmentation"):
  # segmentation already done
  progress += exw / tot
  elif self._progress_ex is not None:
- progress += exw / tot * self._progress_ex
+ progress += self._progress_ex * exw / tot
+
+ # draining basins
+ if valid_states.index(st) > valid_states.index("plumbing"):
+ # plumbing already done
+ progress += drw / tot
+ if self._progress_bn is not None:
+ progress += self._progress_bn * drw / tot

  if self.state == "done":
  progress = 1
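
For orientation, the weights above translate into the following split of the overall progress bar (a worked sketch of the arithmetic, not code from the package):

    bgw, exw, clw = 4, 27, 1
    for strategy, drw in [("drain", 15), ("tap", 1)]:
        tot = bgw + exw + drw + clw
        print(strategy,
              {"background": round(bgw / tot, 2),
               "segmentation": round(exw / tot, 2),
               "plumbing": round(drw / tot, 2),
               "cleanup": round(clw / tot, 2)})
    # drain: tot = 47, segmentation ~0.57, plumbing ~0.32
    # tap:   tot = 33, segmentation ~0.82, plumbing ~0.03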
@@ -371,16 +385,20 @@ class DCNumJobRunner(threading.Thread):
  # Note any new actions that work on `self.path_temp_in` are not
  # reflected in `self.path_temp_out`.
  self.path_temp_in.rename(self.path_temp_out)
-
- self.state = "cleanup"
-
- # The user would normally expect the output file to be something
- # that is self-contained (copying the file wildly across file
- # systems and network shares should not impair feature availability).
- # Therefore, we copy any remaining basin-based features to the
- # temporary output file.
- if self.job["no_basins_in_output"]:
- self.task_transfer_basin_data()
+ # Since no segmentation was done, the output file now does not
+ # contain any events. This is not really what we wanted, but we
+ # can still store all features in the output file if required.
+ if self.job["basin_strategy"] == "drain":
+ orig_feats = []
+ for feat in self.draw.h5["events"].keys():
+ if isinstance(self.draw.h5["events"][feat], h5py.Dataset):
+ # copy_features does not support Groups
+ orig_feats.append(feat)
+ with h5py.File(self.path_temp_out, "a") as h5_dst:
+ copy_features(h5_src=self.draw.h5,
+ h5_dst=h5_dst,
+ features=orig_feats,
+ mapping=None)

  with HDF5Writer(self.path_temp_out) as hw:
  # pipeline metadata
@@ -433,7 +451,8 @@ class DCNumJobRunner(threading.Thread):
  with h5py.File(self.job["path_in"]) as h5_src:
  copy_metadata(h5_src=h5_src,
  h5_dst=hw.h5,
- # don't copy basins
+ # Don't copy basins, we would have to index-map
+ # them first.
  copy_basins=False)
  if redo_seg:
  # Store the correct measurement identifier. This is used to
@@ -450,6 +469,12 @@ class DCNumJobRunner(threading.Thread):
  mid_new = f"{mid_cur}_{mid_ap}" if mid_cur else mid_ap
  hw.h5.attrs["experiment:run identifier"] = mid_new

+ # Handle basin data according to the user's request
+ self.state = "plumbing"
+ self.task_enforce_basin_strategy()
+
+ self.state = "cleanup"
+
  trun = datetime.timedelta(seconds=round(time.monotonic() - time_start))
  self.logger.info(f"Run duration: {str(trun)}")
  self.logger.info(time.strftime("Run stop: %Y-%m-%d-%H.%M.%S",
@@ -491,6 +516,115 @@ class DCNumJobRunner(threading.Thread):
  bic.process()
  self.logger.info("Finished background computation")

+ def task_enforce_basin_strategy(self):
+ """Transfer basin data from input files to output if requested
+
+ The user specified the "basin_strategy" keyword argument in
+ `self.job`. If this is set to "drain", then copy all basin
+ information from the input file to the output file. If it
+ is set to "tap", then only create basins in the output file.
+ """
+ self._progress_bn = 0
+ t0 = time.perf_counter()
+ # We need to make sure that the features are correctly attributed
+ # from the input files. E.g. if the input file already has
+ # background images, but we recompute the background images, then
+ # we have to use the data from the recomputed background file.
+ # We achieve this by keeping a specific order and only copying those
+ # features that we don't already have in the output file.
+ feats_raw = [
+ # 1. background data from the temporary input image
+ # (this must come before draw [sic!])
+ [self.dtin.h5, ["image_bg", "bg_off"], "critical"],
+ # 2. frame-based scalar features from the raw input file
+ # (e.g. "temp" or "frame")
+ [self.draw.h5, self.draw.features_scalar_frame, "optional"],
+ # 3. image features from the input file
+ [self.draw.h5, ["image", "image_bg", "bg_off"], "optional"],
+ ]
+ with h5py.File(self.path_temp_out, "a") as hout:
+ hw = HDF5Writer(hout)
+ # First, we have to determine the basin mapping from input to
+ # output. This information is stored by the QueueCollectorThread
+ # in the "basinmap0" feature, ready to be used by us.
+ if "index_unmapped" in hout["events"]:
+ # The unmapped indices enumerate the events in the output file
+ # with indices from the mapped input file. E.g. if for the
+ # first image in the input file, two events are found and for
+ # the second image in the input file, three events are found,
+ # then this would contain [0, 0, 1, 1, 1, ...]. If the index
+ # mapping of the input file was set to slice(1, 100), then the
+ # first image would not be there, and we would have
+ # [1, 1, 1, ...].
+ idx_um = hout["events/index_unmapped"]
+
+ # If we want to convert this to an actual basinmap feature,
+ # then we have to convert those indices to indices that map
+ # to the original input HDF5 file.
+ raw_im = self.draw.index_mapping
+ if raw_im is None:
+ self.logger.info("Input file mapped with basinmap0")
+ # Create a hard link to save time and space
+ hout["events/basinmap0"] = hout["events/index_unmapped"]
+ basinmap = idx_um
+ else:
+ basinmap = get_mapping_indices(raw_im)[idx_um]
+ # Store the mapped basin data in the output file.
+ hw.store_feature_chunk("basinmap0", basinmap)
+ # We don't need them anymore.
+ del hout["events/index_unmapped"]
+
+ # Note that `size_raw != (len(self.draw))` [sic!]. The former
+ # is the size of the raw dataset and the latter is its mapped
+ # size!
+ size_raw = self.draw.h5.attrs["experiment:event count"]
+ if (len(basinmap) == size_raw
+ and np.all(basinmap == np.arange(size_raw))):
+ # This means that the images in the input overlap perfectly
+ # with the images in the output, i.e. a "copy" segmenter
+ # was used or something is very reproducible.
+ # We set basinmap to None to be more efficient.
+ basinmap = None
+
+ else:
+ # The input is identical to the output, because we are using
+ # the same pipeline identifier.
+ basinmap = None
+
+ for hin, feats, importance in feats_raw:
+ # Only consider features that are available in the input
+ # and that are not already in the output.
+ feats = [f for f in feats
+ if (f in hin["events"] and f not in hout["events"])]
+ if not feats:
+ continue
+ elif (self.job["basin_strategy"] == "drain"
+ or importance == "critical"):
+ # DRAIN: Copy all features over to the output file.
+ self.logger.debug(f"Transferring {feats} to output file")
+ copy_features(h5_src=hin,
+ h5_dst=hout,
+ features=feats,
+ mapping=basinmap)
+ else:
+ # TAP: Create basins for the "optional" features in the
+ # output file. Note that the "critical" features never
+ # reach this case.
+ self.logger.debug(f"Creating basin for {feats}")
+ # Relative and absolute paths.
+ pin = pathlib.Path(hin.filename).resolve()
+ pout = pathlib.Path(hout.filename).resolve().parent
+ paths = [pin, os.path.relpath(pin, pout)]
+ hw.store_basin(name="dcnum basin",
+ features=feats,
+ mapping=basinmap,
+ paths=paths,
+ description=f"Created with dcnum {version}",
+ )
+ self._progress_bn += 1 / len(feats_raw)
+ t_tot = time.perf_counter() - t0
+ self.logger.info(f"Enforcing basin strategy time: {t_tot:.1f}s")
+
  def task_segment_extract(self):
  self.logger.info("Starting segmentation and feature extraction")
  # Start writer thread
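
The conversion from `index_unmapped` to `basinmap0` above can be illustrated with a small, self-contained sketch (hypothetical numbers; `np.arange` stands in for `get_mapping_indices(raw_im)`):

    import numpy as np

    # Input HDF5Data opened with index_mapping=slice(1, 100): raw frame 0 is
    # skipped, so get_mapping_indices would yield [1, 2, ..., 99].
    mapping_indices = np.arange(1, 100)
    # Two events found for the first mapped frame, three for the second:
    idx_um = np.array([0, 0, 1, 1, 1], dtype=np.uint32)
    basinmap = mapping_indices[idx_um]  # indices into the *raw* input file
    print(basinmap)  # -> [1 1 2 2 2]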
@@ -629,21 +763,6 @@ class DCNumJobRunner(threading.Thread):

  self.logger.info("Finished segmentation and feature extraction")

- def task_transfer_basin_data(self):
- with h5py.File(self.path_temp_out, "a") as hout:
- hd = HDF5Data(hout)
- for ii, _ in enumerate(hd.basins):
- hindat, features = hd.get_basin_data(ii)
- for feat in features:
- if feat not in hout["events"]:
- self.logger.debug(
- f"Transferring {feat} to output file")
- h5py.h5o.copy(src_loc=hindat.h5["events"].id,
- src_name=feat.encode(),
- dst_loc=hout["events"].id,
- dst_name=feat.encode(),
- )
-

  def join_thread_helper(thr, timeout, retries, logger, name):
  for _ in range(retries):
dcnum/logic/job.py CHANGED
@@ -3,7 +3,8 @@ import copy
  import inspect
  import multiprocessing as mp
  import pathlib
- from typing import Dict
+ from typing import Dict, Literal
+ import warnings

  from ..feat import QueueEventExtractor
  from ..feat.feat_background.base import get_available_background_methods
@@ -27,10 +28,62 @@ class DCNumPipelineJob:
  feature_kwargs: Dict = None,
  gate_code: str = "norm",
  gate_kwargs: Dict = None,
- no_basins_in_output: bool = True,
+ basin_strategy: Literal["drain", "tap"] = "drain",
+ no_basins_in_output: bool = None,
  num_procs: int = None,
  debug: bool = False,
  ):
+ """Pipeline job recipe
+
+ Parameters
+ ----------
+ path_in: pathlib.Path | str
+ input data path
+ path_out: pathlib.Path | str
+ output data path
+ data_code: str
+ code of input data reader to use
+ data_kwargs: dict
+ keyword arguments for data reader
+ background_code: str
+ code of background data computer to use
+ background_kwargs: dict
+ keyword arguments for background data computer
+ segmenter_code: str
+ code of segmenter to use
+ segmenter_kwargs: dict
+ keyword arguments for segmenter
+ feature_code: str
+ code of feature extractor
+ feature_kwargs: dict
+ keyword arguments for feature extractor
+ gate_code: str
+ code for gating/event filtering class
+ gate_kwargs: dict
+ keyword arguments for gating/event filtering class
+ basin_strategy: str
+ strategy on how to handle event data; In principle, not all
+ events have to be stored in the output file if basins are
+ defined, linking back to the original file.
+ - You can "drain" all basins which means that the output file
+ will contain all features, but will also be very big.
+ - You can "tap" the basins, including the input file, which means
+ that the output file will be comparatively small.
+ no_basins_in_output: bool
+ Deprecated
+ num_procs: int
+ Number of processes to use
+ debug: bool
+ Whether to be verbose and use threads instead of processes
+ """
+ if no_basins_in_output is not None:
+ warnings.warn("The `no_basins_in_output` keyword argument is "
+ "deprecated. Please use `basin_strategy` instead.")
+ if no_basins_in_output:
+ basin_strategy = "drain"
+ else:
+ basin_strategy = "tap"
+
  #: initialize keyword arguments for this job
  self.kwargs = {}
  spec = inspect.getfullargspec(DCNumPipelineJob.__init__)
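
As a rough orientation for the new keyword argument, a job recipe could be set up as below. This is a minimal sketch only; it assumes `DCNumPipelineJob` is importable from `dcnum.logic` and that the job is then handed to `DCNumJobRunner` as usual (check the dcnum documentation for the exact runner API):

    import pathlib
    from dcnum.logic import DCNumPipelineJob

    # "tap" keeps the output small by creating basins that reference the
    # input file; "drain" (the default) copies all feature data over.
    job = DCNumPipelineJob(path_in=pathlib.Path("measurement.rtdc"),
                           path_out=pathlib.Path("measurement_dcn.rtdc"),
                           basin_strategy="tap")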
dcnum/read/__init__.py CHANGED
@@ -2,4 +2,4 @@
  from .cache import md5sum
  from .const import PROTECTED_FEATURES
  from .hdf5_data import HDF5Data, HDF5ImageCache, concatenated_hdf5_data
- from .mapped import get_mapping_indices, get_mapped_object
+ from .mapped import get_mapping_indices, get_mapped_object
dcnum/read/cache.py CHANGED
@@ -22,6 +22,7 @@ class BaseImageChunkCache(abc.ABC):
  cache_size: int = 2,
  ):
  self.shape = shape
+ self._dtype = None
  chunk_size = min(shape[0], chunk_size)
  self._len = self.shape[0]
  #: This is a FILO cache for the chunks
@@ -33,12 +34,32 @@ class BaseImageChunkCache(abc.ABC):
  self.num_chunks = int(np.ceil(self._len / (self.chunk_size or 1)))

  def __getitem__(self, index):
- chunk_index, sub_index = self._get_chunk_index_for_index(index)
- return self.get_chunk(chunk_index)[sub_index]
+ if isinstance(index, (slice, list, np.ndarray)):
+ if isinstance(index, slice):
+ indices = np.arange(index.start or 0,
+ index.stop or len(self),
+ index.step)
+ else:
+ indices = index
+ array_out = np.empty((len(indices),) + self.image_shape,
+ dtype=self.dtype)
+ for ii, idx in enumerate(indices):
+ array_out[ii] = self[idx]
+ return array_out
+ else:
+ chunk_index, sub_index = self._get_chunk_index_for_index(index)
+ return self.get_chunk(chunk_index)[sub_index]

  def __len__(self):
  return self._len

+ @property
+ def dtype(self):
+ """data type of the image data"""
+ if self._dtype is None:
+ self._dtype = self[0].dtype
+ return self._dtype
+
  @abc.abstractmethod
  def _get_chunk_data(self, chunk_slice):
  """Implemented in subclass to obtain actual data"""
@@ -50,6 +71,7 @@ class BaseImageChunkCache(abc.ABC):
  raise IndexError(
  f"Index {index} out of bounds for HDF5ImageCache "
  f"of size {self._len}")
+ index = int(index) # convert np.uint64 to int, so we get ints below
  chunk_index = index // self.chunk_size
  sub_index = index % self.chunk_size
  return chunk_index, sub_index
dcnum/read/const.py CHANGED
@@ -8,7 +8,7 @@ PROTECTED_FEATURES = [
  "pressure",
  "temp",
  "temp_amb",
- "time"
+ "time",
  ]

  # User-defined features may be anything, but if the user needs something
dcnum/read/hdf5_data.py CHANGED
@@ -186,25 +186,27 @@ class HDF5Data:
  if isinstance(self.meta[key], bytes):
  self.meta[key] = self.meta[key].decode("utf-8")
  # logs
- for key in h5.get("logs", []):
+ for key in sorted(h5.get("logs", {}).keys()):
  alog = list(h5["logs"][key])
  if alog:
  if isinstance(alog[0], bytes):
  alog = [ll.decode("utf") for ll in alog]
  self.logs[key] = alog
  # tables
- for tab in h5.get("tables", []):
+ for tab in sorted(h5.get("tables", {}).keys()):
  tabdict = {}
  for tkey in h5["tables"][tab].dtype.fields.keys():
  tabdict[tkey] = \
  np.array(h5["tables"][tab][tkey]).reshape(-1)
  self.tables[tab] = tabdict
  # basins
- for bnkey in h5.get("basins", []):
+ basins = []
+ for bnkey in h5.get("basins", {}).keys():
  bn_data = "\n".join(
  [s.decode() for s in h5["basins"][bnkey][:].tolist()])
  bn_dict = json.loads(bn_data)
- self.basins.append(bn_dict)
+ basins.append(bn_dict)
+ self.basins = sorted(basins, key=lambda x: x["name"])

  if state["pixel_size"] is not None:
  self.pixel_size = state["pixel_size"]
@@ -395,7 +397,22 @@ class HDF5Data:
  if path is None:
  self._basin_data[index] = (None, None)
  else:
- h5dat = HDF5Data(path, index_mapping=self.index_mapping)
+ feat_basinmap = bn_dict.get("mapping", None)
+ if feat_basinmap is None:
+ # This is NOT a mapped basin.
+ index_mapping = self.index_mapping
+ else:
+ # This is a mapped basin. Create an indexing list.
+ if self.index_mapping is None:
+ # The current dataset is not mapped.
+ basinmap_idx = slice(None)
+ else:
+ # The current dataset is also mapped.
+ basinmap_idx = get_mapping_indices(self.index_mapping)
+ basinmap = self.h5[f"events/{feat_basinmap}"]
+ index_mapping = basinmap[basinmap_idx]
+
+ h5dat = HDF5Data(path, index_mapping=index_mapping)
  features = bn_dict.get("features")
  if features is None:
  # Only get the features from the actual HDF5 file.
@@ -420,21 +437,27 @@ class HDF5Data:
  if feat not in self._image_cache:
  if f"events/{feat}" in self.h5:
  ds = self.h5[f"events/{feat}"]
+ idx_map = self.index_mapping
  else:
+ idx_map = None
  # search all basins
  for idx in range(len(self.basins)):
- bndat, features = self.get_basin_data(idx)
+ bn_dat, features = self.get_basin_data(idx)
  if features is not None:
  if feat in features:
- ds = bndat.h5[f"events/{feat}"]
+ # HDF5 dataset
+ ds = bn_dat.h5[f"events/{feat}"]
+ # Index mapping (taken from the basins which
+ # already includes the mapping from the current
+ # instance).
+ idx_map = bn_dat.index_mapping
  break
  else:
  ds = None

  if ds is not None:
  image = HDF5ImageCache(
- h5ds=get_mapped_object(obj=ds,
- index_mapping=self.index_mapping),
+ h5ds=get_mapped_object(obj=ds, index_mapping=idx_map),
  cache_size=self.image_cache_size,
  boolean=feat == "mask")
  else:
dcnum/read/mapped.py CHANGED
@@ -34,6 +34,16 @@ class MappedHDF5Dataset:
  def get_mapping_indices(
  index_mapping: numbers.Integral | slice | list | np.ndarray
  ):
+ """Return integer numpy array with mapping indices for a range
+
+ Parameters
+ ----------
+ index_mapping: numbers.Integral | slice | list | np.ndarray
+ Several options you have here:
+ - integer: results in np.arrange(integer)
+ - slice: results in np.arrange(slice.start, slice.stop, slice.step)
+ - list or np.ndarray: returns the input as unit32 array
+ """
  if isinstance(index_mapping, numbers.Integral):
  return _get_mapping_indices_cached(index_mapping)
  elif isinstance(index_mapping, slice):
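
Going by the docstring above, the three accepted input types behave roughly as follows (a sketch based on the docstring; the exact dtype of the returned array is an implementation detail):

    from dcnum.read import get_mapping_indices

    print(get_mapping_indices(4))                # like np.arange(4) -> [0 1 2 3]
    print(get_mapping_indices(slice(1, 10, 2)))  # like np.arange(1, 10, 2) -> [1 3 5 7 9]
    print(get_mapping_indices([5, 2, 2]))        # the list as an integer array -> [5 2 2]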
dcnum/write/__init__.py CHANGED
@@ -2,4 +2,5 @@
  from .deque_writer_thread import DequeWriterThread
  from .queue_collector_thread import EventStash, QueueCollectorThread
  from .writer import (
- HDF5Writer, copy_metadata, create_with_basins, set_default_filter_kwargs)
+ HDF5Writer, copy_features, copy_metadata, create_with_basins,
+ set_default_filter_kwargs)
dcnum/write/deque_writer_thread.py CHANGED
@@ -1,14 +1,17 @@
  import collections
+ import logging
  import pathlib
  import threading
  import time

+ import h5py
+
  from .writer import HDF5Writer


  class DequeWriterThread(threading.Thread):
  def __init__(self,
- path_out: pathlib.Path,
+ path_out: pathlib.Path | h5py.File,
  dq: collections.deque,
  ds_kwds: dict = None,
  mode: str = "a",
@@ -24,6 +27,7 @@ class DequeWriterThread(threading.Thread):
  using `popleft()`.
  """
  super(DequeWriterThread, self).__init__(*args, **kwargs)
+ self.logger = logging.getLogger("dcnum.write.DequeWriterThread")
  if mode == "w":
  path_out.unlink(missing_ok=True)
  self.writer = HDF5Writer(path_out, mode=mode, ds_kwds=ds_kwds)
@@ -40,17 +44,21 @@ class DequeWriterThread(threading.Thread):
  self.may_stop_loop = True

  def run(self):
+ time_tot = 0
  while True:
  ldq = len(self.dq)
  if self.must_stop_loop:
  break
  elif ldq:
+ t0 = time.perf_counter()
  for _ in range(ldq):
  feat, data = self.dq.popleft()
  self.writer.store_feature_chunk(feat=feat, data=data)
+ time_tot += time.perf_counter() - t0
  elif self.may_stop_loop:
  break
  else:
  # wait for the next item to arrive
  time.sleep(.1)
+ self.logger.info(f"Disk time: {time_tot:.1f}s")
  self.writer.close()
dcnum/write/queue_collector_thread.py CHANGED
@@ -245,20 +245,14 @@
  # the events that we just saved.
  indices = stash.indices_for_data

- # Write all the scalar features.
- for feat in self.data.features_scalar_frame:
- self.writer_dq.append((feat, self.data[feat][indices]))
-
- # Write the image and background data.
- imdat = np.zeros((stash.size,) + self.data.image.image_shape,
- dtype=np.uint8)
- bgdat = np.zeros((stash.size,) + self.data.image.image_shape,
- dtype=np.uint8)
- for ii, idx in enumerate(indices):
- imdat[ii] = self.data.image[idx]
- bgdat[ii] = self.data.image_bg[idx]
- self.writer_dq.append(("image", imdat))
- self.writer_dq.append(("image_bg", bgdat))
+ # This is the unmapped index from the input HDF5Data instance.
+ # Unmapped means that this only enumerates HDF5Data, but since
+ # HDF5Data can be mapped, the index does not necessarily enumerate
+ # the underlying HDF5 file. Later on, we will have to convert this
+ # to the correct "basinmap0" feature
+ # (see `DCNumJobRunner.task_enforce_basin_strategy`)
+ self.writer_dq.append(("index_unmapped",
+ np.array(indices, dtype=np.uint32)))

  # Write the number of events.
  self.writer_dq.append(("nevents",
dcnum/write/writer.py CHANGED
@@ -115,6 +115,7 @@ class HDF5Writer:
  paths: List[str | pathlib.Path],
  features: List[str] = None,
  description: str | None = None,
+ mapping: np.ndarray = None
  ):
  """Write an HDF5-based file basin

@@ -128,6 +129,9 @@
  list of features provided by `paths`
  description: str
  optional string describing the basin
+ mapping: 1D array
+ integer array with indices that map the basin dataset
+ to this dataset
  """
  bdat = {
  "description": description,
@@ -136,8 +140,38 @@
  "paths": [str(pp) for pp in paths],
  "type": "file",
  }
+ # Explicit features stored in basin file
  if features is not None and len(features):
  bdat["features"] = features
+ # Mapped basin information
+ if mapping is not None:
+ events = self.h5.require_group("events")
+ # Reserve a mapping feature for this dataset
+ for ii in range(10): # basinmap0 to basinmap9
+ bm_cand = f"basinmap{ii}"
+ if bm_cand in events:
+ # There is a basin mapping defined in the file. Check
+ # whether it is identical to ours.
+ if np.all(events[bm_cand] == mapping):
+ # Great, we are done here.
+ feat_basinmap = bm_cand
+ break
+ else:
+ # This mapping belongs to a different basin,
+ # try the next mapping.
+ continue
+ else:
+ # The mapping is not defined in the dataset, and we may
+ # write it to a new feature.
+ feat_basinmap = bm_cand
+ self.store_feature_chunk(feat=feat_basinmap, data=mapping)
+ break
+ else:
+ raise ValueError(
+ "You have exhausted the usage of mapped basins for "
+ "the current dataset. Please revise your analysis "
+ "pipeline.")
+ bdat["mapping"] = feat_basinmap
  bstring = json.dumps(bdat, indent=2)
  # basin key is its hash
  key = hashlib.md5(bstring.encode("utf-8",
@@ -266,6 +300,63 @@ def create_with_basins(
  )


+ def copy_features(h5_src: h5py.File,
+ h5_dst: h5py.File,
+ features: List[str],
+ mapping: np.ndarray = None,
+ ):
+ """Copy feature data from one HDF5 file to another
+
+ The feature must not exist in the destination file.
+
+ Parameters
+ ----------
+ h5_src: h5py.File
+ Input HDF5File containing `features` in the "events" group
+ h5_dst: h5py.File
+ Output HDF5File opened in write mode not containing `features`
+ features: List[str]
+ List of features to copy from source to destination
+ mapping: 1D array
+ If given, contains indices in the input file that should be
+ written to the output file. If set to None, all features are written.
+ """
+ ei = h5_src["events"]
+ eo = h5_dst.require_group("events")
+ # This is the size of the output dataset
+ size = h5_dst.attrs["experiment:event count"]
+ hw = HDF5Writer(h5_dst)
+ for feat in features:
+ if feat in eo:
+ raise ValueError(f"Output file {h5_dst.filename} already contains "
+ f"the feature {feat}.")
+ if not isinstance(ei[feat], h5py.Dataset):
+ raise NotImplementedError(
+ f"Only dataset-based features are supported here, not {feat}")
+ if mapping is None:
+ # Just copy the data as-is.
+ h5py.h5o.copy(src_loc=ei.id,
+ src_name=feat.encode(),
+ dst_loc=eo.id,
+ dst_name=feat.encode(),
+ )
+ else:
+ # Perform mapping and store the features in chunks to keep
+ # memory usage down.
+ dsi = ei[feat]
+ chunk_size = hw.get_best_nd_chunks(dsi[0].shape, dsi.dtype)[0]
+ start = 0
+ while start < size:
+ chunk_idx = mapping[start:start + chunk_size]
+ # h5py only supports indexing in increasing order
+ chunk_unique, order = np.unique(chunk_idx, return_inverse=True)
+ data_unique = dsi[chunk_unique]
+ data = data_unique[order]
+ hw.store_feature_chunk(feat, data)
+ # increment start
+ start += chunk_size
+
+
  def copy_metadata(h5_src: h5py.File,
  h5_dst: h5py.File,
  copy_basins=True):
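
The mapped branch of `copy_features` leans on `np.unique(..., return_inverse=True)` because h5py fancy indexing requires strictly increasing, duplicate-free indices. A small self-contained sketch with made-up numbers (plain numpy arrays standing in for the HDF5 datasets):

    import numpy as np

    source = np.array([10, 11, 12, 13, 14])   # stands in for the source dataset
    chunk_idx = np.array([3, 1, 1, 4])        # mapped indices for one output chunk
    # Read each needed source index once, in increasing order, then expand
    # back to the requested (possibly repeated, unsorted) order:
    chunk_unique, order = np.unique(chunk_idx, return_inverse=True)
    data_unique = source[chunk_unique]        # [11 13 14]
    data = data_unique[order]                 # [13 11 11 14] == source[chunk_idx]
    print(data)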
{dcnum-0.19.1.dist-info → dcnum-0.20.1.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dcnum
- Version: 0.19.1
+ Version: 0.20.1
  Summary: numerics toolbox for imaging deformability cytometry
  Author: Maximilian Schlögel, Paul Müller
  Maintainer-email: Paul Müller <dev@craban.de>
{dcnum-0.19.1.dist-info → dcnum-0.20.1.dist-info}/RECORD RENAMED
@@ -1,14 +1,14 @@
  dcnum/__init__.py,sha256=hcawIKS7utYiOyVhOAX9t7K3xYzP1b9862VV0b6qSrQ,74
- dcnum/_version.py,sha256=wOLHPF5OO0ubEMjzvaXg4CVhVL1uy5Ci-sH1WTZH0Dg,413
+ dcnum/_version.py,sha256=cyxBp0FYMpyeeOYYUcvD5Pt3djNYQojwuNgSo8-1Bp4,413
  dcnum/feat/__init__.py,sha256=jUJYWTD3VIoDNKrmryXbjHb1rGwYtK4b7VPWihYgUoo,325
- dcnum/feat/event_extractor_manager_thread.py,sha256=Ocid_t1awH6pOmurCmKYkC51XsXB0-DoN3fzjFDgE4c,7129
+ dcnum/feat/event_extractor_manager_thread.py,sha256=mAjPnS7K-ZmKbWolTNCnjXe3e-y5canNhf1l_GRYil0,7131
  dcnum/feat/gate.py,sha256=svbObmqpYdqPawpfrsEjTiUPJXf24GrNi8PXTKT-z44,7225
  dcnum/feat/queue_event_extractor.py,sha256=XhA930QVQ1Z4saisbcGTrEut-fSgwTyfDn6b9GzD4iY,15644
  dcnum/feat/feat_background/__init__.py,sha256=OTmMuazHNaSrZb2XW4cnJ6PlgJLbKrPbaidpEixYa0A,341
- dcnum/feat/feat_background/base.py,sha256=IYBFfsGXBfmFnZfD9QrmfrXbJtFSfVOS-v-u-uxSThs,7985
- dcnum/feat/feat_background/bg_copy.py,sha256=muU-6eTUf3HTA2K2asrLWcR_hbRMjdygZROmjNXCm7Q,923
- dcnum/feat/feat_background/bg_roll_median.py,sha256=j3seExcWkk6IeFIOO4zkj-LIA7ryig9bmGYYj_dDgoM,13173
- dcnum/feat/feat_background/bg_sparse_median.py,sha256=-CShs4UAuZd00rACaXTZj3dccKevhcSGRsILFgMPLWo,20705
+ dcnum/feat/feat_background/base.py,sha256=phZdyOrHQPjvYlw1JQ8DkdXw5H2-eE1LfLGqCAo1rlo,7965
+ dcnum/feat/feat_background/bg_copy.py,sha256=PK8x4_Uph-_A6uszZC5uhe1gD1dSRdHnDMEsN0HSGHA,1034
+ dcnum/feat/feat_background/bg_roll_median.py,sha256=EyjstMDXFBYuJB1lN6g4Uw7tPm434X3hXQxKSqvcoJ4,13175
+ dcnum/feat/feat_background/bg_sparse_median.py,sha256=ab7Boj7cmr6PBdTbyWTj_yNNJSfuowr7u-iSGW989WI,20709
  dcnum/feat/feat_brightness/__init__.py,sha256=o6AebVlmydwNgVF5kW6ITqJyFreoKrU3Ki_3EC8If-s,155
  dcnum/feat/feat_brightness/bright_all.py,sha256=vf8xaYBdKD24hHUXdkI0_S7nbr7m49KW6gvuWvbHDVg,4545
  dcnum/feat/feat_brightness/common.py,sha256=JX49EszYDmnvoOKXFVV1CalEIWRmOuY5EryNbqGbdac,156
@@ -20,29 +20,29 @@ dcnum/feat/feat_texture/__init__.py,sha256=6StM9S540UVtdFFR3bHa7nfCTomeVdoo7Uy9C
  dcnum/feat/feat_texture/common.py,sha256=COXHpXS-7DMouGu3WF83I76L02Sr7P9re4lxajh6g0E,439
  dcnum/feat/feat_texture/tex_all.py,sha256=eGjjNfPpfZw7FA_VNFCIMiU38KD0qcGbxLciYy-tCiA,4097
  dcnum/logic/__init__.py,sha256=7J3GrwJInNQbrLk61HRIV7X7p69TAIbMYpR34hh6u14,177
- dcnum/logic/ctrl.py,sha256=FvVXbrP7WqgYeDznep0KyfMck3cbCO8Yoli8P6clRPc,27956
- dcnum/logic/job.py,sha256=M0Q-Rfcm-zkTXTQc79W6YSNUjUlgmRPG0Ikbdn1aOpY,4608
+ dcnum/logic/ctrl.py,sha256=eaA_eO8X9c8wXFo35GwcMZEKQwzsVual7JTNL9f12y4,34412
+ dcnum/logic/job.py,sha256=cF4bPiEy5UkDmQN91Ku2yxBW0nXBEmKTNkIHlL2LT-U,6724
  dcnum/logic/json_encoder.py,sha256=cxMnqisbKEVf-rVcw6rK2BBAb6iz_hKFaGl81kK36lQ,571
  dcnum/meta/__init__.py,sha256=AVqRgyKXO1orKnE305h88IBvoZ1oz6X11HN1WP5nGvg,60
  dcnum/meta/paths.py,sha256=J_ikeHzd7gEeRgAKjuayz3x6q4h1fOiDadM-ZxhAGm4,1053
  dcnum/meta/ppid.py,sha256=Q3jg8lZt5tlGIby_-7rBqTANesMjJrmxASXZhsvBD_Y,7706
- dcnum/read/__init__.py,sha256=8uGj4YN7pDP4FO9TkZWXrpScwTLVWSEZexFq-TS9vsA,215
- dcnum/read/cache.py,sha256=kC2Y9hXA92ARQ2Vgm1kBFCU-s6TPE1tPYvpzWI0aPow,5619
- dcnum/read/const.py,sha256=8ih8rlWM7ntp8phrr9dh22hXXb210igSCatOSI9Ou30,463
- dcnum/read/hdf5_data.py,sha256=psMN2CGorU4uFO1nlGcpUxKFLZ9HaKCReTi7tVx50tg,22291
- dcnum/read/mapped.py,sha256=Oh1jH2yVqWBPomEf8vlGvsGOMc02ldapAAjgNo-bS7g,2676
+ dcnum/read/__init__.py,sha256=ksLdV8EkOU3EPje8teCOSehcUeGAZfg9TQ5ltuEUgls,216
+ dcnum/read/cache.py,sha256=lisrGG7AyvVitf0h92wh5FvYCsxa0pWyGcAyYwGP-LQ,6471
+ dcnum/read/const.py,sha256=GG9iyXDtEldvJYOBnhZjlimzIeBMAt4bSr2-xn2gzzc,464
+ dcnum/read/hdf5_data.py,sha256=Yyq02UTILc5ZgIQXpR9Y0wuX2WT8s0g23PraI7KxmJY,23489
+ dcnum/read/mapped.py,sha256=UryArlrIsHxjOyimBL2Nooi3r73zuGtnGdqdxa6PK_g,3076
  dcnum/segm/__init__.py,sha256=iiq_1A9DU5wMUcKnsZ53E7NyzCkbZCJeUDimzunE-OM,247
  dcnum/segm/segm_thresh.py,sha256=lMf-lso_O_5Q5lJiiIQdYkM3zlj4uwNz9cNvLxVMeXc,1396
  dcnum/segm/segmenter.py,sha256=gVzmP6CuwI9Qfk8GN_xWGu_xbtVTOhxIOWn-2yr_H1Y,12220
  dcnum/segm/segmenter_cpu.py,sha256=IzhPNQaO4TBh3EzZqLGaBAeRryfBKnld7Joe8qY4AB4,10690
  dcnum/segm/segmenter_gpu.py,sha256=Au1MQdAalVsmJ-cmb3OcCmEMBfXSDuJjdXJTGqEIcG8,1962
  dcnum/segm/segmenter_manager_thread.py,sha256=xQEioOkASlm8DTdG0RBtjCJP1cOuiyJAm4q2n1l_tfM,5710
- dcnum/write/__init__.py,sha256=Cpn3LqL18hh8OScUnGp_AnNfpWPpKW-oAJZH6ot7aRA,241
- dcnum/write/deque_writer_thread.py,sha256=KpJ6po8JPlM696MITN-bhNnWQcy9E-qlhg9g-uzoPZg,1710
- dcnum/write/queue_collector_thread.py,sha256=YQ6pvKNmCDf1C6HVx6gOA-q-FBoI6nkhOo-tAVYnyag,11906
- dcnum/write/writer.py,sha256=nlJfQCPoW2Wze72y_256G4qmgYMdh5mL0vpvqg7lSaU,11728
- dcnum-0.19.1.dist-info/LICENSE,sha256=YRChA1C8A2E-amJbudwMcbTCZy_HzmeY0hMIvduh1MM,1089
- dcnum-0.19.1.dist-info/METADATA,sha256=tWMwpFt4Nn8vs1H0aRTBWDmH3pej-O3gWHJ6ESkbvSw,2194
- dcnum-0.19.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- dcnum-0.19.1.dist-info/top_level.txt,sha256=Hmh38rgG_MFTVDpUDGuO2HWTSq80P585Het4COQzFTg,6
- dcnum-0.19.1.dist-info/RECORD,,
+ dcnum/write/__init__.py,sha256=QvWHeZmjHI18i-YlGYuzN3i7dVWY9UCReKchrJ-gif0,260
+ dcnum/write/deque_writer_thread.py,sha256=ao7F1yrVKyufgC4rC0Y2_Vt7snuT6KpI7W2qVxcjdhk,1994
+ dcnum/write/queue_collector_thread.py,sha256=d_WfdsZdFnFsiAY0zVMwUlA4juIMeiWYmE_-rezBQCE,11734
+ dcnum/write/writer.py,sha256=e6J8YVqhS7kzkpPIMoDMokJpqSy1WWNdOrwaJof1oVc,15601
+ dcnum-0.20.1.dist-info/LICENSE,sha256=YRChA1C8A2E-amJbudwMcbTCZy_HzmeY0hMIvduh1MM,1089
+ dcnum-0.20.1.dist-info/METADATA,sha256=8hfnqtJ-lrkKlXnbWBGqRK4bSDDb0C4zmQDB6Os8f-U,2194
+ dcnum-0.20.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ dcnum-0.20.1.dist-info/top_level.txt,sha256=Hmh38rgG_MFTVDpUDGuO2HWTSq80P585Het4COQzFTg,6
+ dcnum-0.20.1.dist-info/RECORD,,
File without changes