dcnum 0.23.2-py3-none-any.whl → 0.25.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dcnum might be problematic.

dcnum/read/hdf5_data.py CHANGED
@@ -102,10 +102,12 @@ class HDF5Data:
         elif (feat in self.h5["events"]
                 and len(self.h5["events"][feat].shape) == 1):  # cache scalar
             if self.index_mapping is None:
-                idx_map = slice(None)  # no mapping indices, just slice
+                # no mapping indices, just slice
+                dat_sc = self.h5["events"][feat][:]
             else:
-                idx_map = get_mapping_indices(self.index_mapping)
-            self._cache_scalar[feat] = self.h5["events"][feat][idx_map]
+                dat_sc = get_mapped_object(self.h5["events"][feat],
+                                           index_mapping=self.index_mapping)[:]
+            self._cache_scalar[feat] = dat_sc
             return self._cache_scalar[feat]
         else:
             if feat in self.h5["events"]:
@@ -117,9 +119,11 @@ class HDF5Data:
         else:
             # Check the basins
             for idx in range(len(self.basins)):
-                bn, bn_features = self.get_basin_data(idx)
-                if bn_features and feat in bn_features:
-                    return bn[feat]  # already index-mapped
+                bn_grp, bn_feats, bn_map = self.get_basin_data(idx)
+                if bn_feats and feat in bn_feats:
+                    mapped_ds = get_mapped_object(obj=bn_grp[feat],
+                                                  index_mapping=bn_map)
+                    return mapped_ds
             # If we got here, then the feature data does not exist.
             raise KeyError(f"Feature '{feat}' not found in {self}!")
 
@@ -200,12 +204,7 @@ class HDF5Data:
                     np.array(h5["tables"][tab][tkey]).reshape(-1)
                 self.tables[tab] = tabdict
         # basins
-        basins = []
-        for bnkey in h5.get("basins", {}).keys():
-            bn_data = "\n".join(
-                [s.decode() for s in h5["basins"][bnkey][:].tolist()])
-            bn_dict = json.loads(bn_data)
-            basins.append(bn_dict)
+        basins = self.extract_basin_dicts(h5)
         self.basins = sorted(basins, key=lambda x: x["name"])
 
         if state["pixel_size"] is not None:
@@ -271,6 +270,30 @@ class HDF5Data:
         pixel_size = float(f"{pixel_size:.8f}")
         self.meta["imaging:pixel size"] = pixel_size
 
+    @staticmethod
+    def extract_basin_dicts(h5, check=True):
+        """Return list of basin dictionaries"""
+        # TODO:
+        #  - support iterative mapped basins and catch
+        #    circular basin definitions.
+        basins = []
+        for bnkey in h5.get("basins", {}).keys():
+            bn_data = "\n".join(
+                [s.decode() for s in h5["basins"][bnkey][:].tolist()])
+            bn_dict = json.loads(bn_data)
+            if check:
+                if bn_dict["type"] not in ["internal", "file"]:
+                    # we only support file-based and internal basins
+                    continue
+                basinmap = bn_dict.get("mapping")
+                if basinmap is not None and basinmap not in h5["events"]:
+                    # basinmap feature is missing
+                    continue
+            # Add the basin
+            basins.append(bn_dict)
+
+        return basins
+
     @property
     def features_scalar_frame(self):
         """Scalar features that apply to all events in a frame
@@ -289,9 +312,10 @@ class HDF5Data:
 
     def close(self):
         """Close the underlying HDF5 file"""
-        for bn, _ in self._basin_data.values():
-            if bn is not None:
-                bn.close()
+        for bn_group, _, _ in self._basin_data.values():
+            if bn_group is not None:
+                if bn_group.id.valid:
+                    bn_group.file.close()
         self._image_cache.clear()
         self._basin_data.clear()
         self.h5.close()
@@ -369,66 +393,110 @@ class HDF5Data:
             raise ValueError(f"Invalid parameter '{var}'!")
         return kwargs
 
-    def get_basin_data(self, index):
+    def get_basin_data(self, index: int) -> (
+            h5py.Group,
+            List,
+            int | slice | List | np.ndarray,
+    ):
         """Return HDF5Data info for a basin index in `self.basins`
 
+        Parameters
+        ----------
+        index: int
+            index of the basin from which to get data
+
         Returns
         -------
-        data: HDF5Data
-            Data instance
+        group: h5py.Group
+            HDF5 group containing HDF5 Datasets with the names
+            listed in `features`
         features: list of str
-            List of features made available by this data instance
+            list of features made available by this basin
+        index_mapping:
+            a mapping (see `__init__`) that defines mapping from
+            the basin dataset to the referring dataset
         """
         if index not in self._basin_data:
             bn_dict = self.basins[index]
-            for ff in bn_dict["paths"]:
-                pp = pathlib.Path(ff)
-                if pp.is_absolute() and pp.exists():
-                    path = pp
-                    break
-                else:
-                    # try relative path
-                    prel = pathlib.Path(self.path).parent / pp
-                    if prel.exists():
-                        path = prel
-                        break
+
+            # HDF5 group containing the feature data
+            if bn_dict["type"] == "file":
+                h5group, features = self._get_basin_data_file(bn_dict)
+            elif bn_dict["type"] == "internal":
+                h5group, features = self._get_basin_data_internal(bn_dict)
             else:
-                path = None
-            if path is None:
-                self._basin_data[index] = (None, None)
+                raise ValueError(f"Invalid basin type '{bn_dict['type']}'")
+
+            # index mapping
+            feat_basinmap = bn_dict.get("mapping", None)
+            if feat_basinmap is None:
+                # This is NOT a mapped basin.
+                index_mapping = self.index_mapping
             else:
-                feat_basinmap = bn_dict.get("mapping", None)
-                if feat_basinmap is None:
-                    # This is NOT a mapped basin.
-                    index_mapping = self.index_mapping
+                # This is a mapped basin. Create an indexing list.
+                if self.index_mapping is None:
+                    # The current dataset is not mapped.
+                    basinmap_idx = slice(None)
                 else:
-                    # This is a mapped basin. Create an indexing list.
-                    if self.index_mapping is None:
-                        # The current dataset is not mapped.
-                        basinmap_idx = slice(None)
-                    else:
-                        # The current dataset is also mapped.
-                        basinmap_idx = get_mapping_indices(self.index_mapping)
-                    basinmap = self.h5[f"events/{feat_basinmap}"]
-                    index_mapping = basinmap[basinmap_idx]
-
-                h5dat = HDF5Data(path, index_mapping=index_mapping)
-                features = bn_dict.get("features")
-                if features is None:
-                    # Only get the features from the actual HDF5 file.
-                    # If this file has basins as well, the basin metadata
-                    # should have been copied over to the parent file. This
-                    # makes things a little cleaner, because basins are not
-                    # nested, but all basins are available in the top file.
-                    # See :func:`write.store_metadata` for copying metadata
-                    # between files.
-                    # The writer can still specify "features" in the basin
-                    # metadata, then these basins are indeed nested, and
-                    # we consider that ok as well.
-                    features = sorted(h5dat.h5["events"].keys())
-                self._basin_data[index] = (h5dat, features)
+                    # The current dataset is also mapped.
+                    basinmap_idx = get_mapping_indices(self.index_mapping)
+                basinmap = self.h5[f"events/{feat_basinmap}"]
+                index_mapping = basinmap[basinmap_idx]
+
+            self._basin_data[index] = (h5group, features, index_mapping)
         return self._basin_data[index]
 
+    def _get_basin_data_file(self, bn_dict):
+        for ff in bn_dict["paths"]:
+            pp = pathlib.Path(ff)
+            if pp.is_absolute() and pp.exists():
+                path = pp
+                break
+            else:
+                # try relative path
+                prel = pathlib.Path(self.path).parent / pp
+                if prel.exists():
+                    path = prel
+                    break
+        else:
+            path = None
+        if path is None:
+            # Cannot get data from this basin / cannot find file
+            h5group = None
+            features = []
+        else:
+            h5 = h5py.File(path, "r")
+            h5group = h5["events"]
+            # features defined in the basin
+            features = bn_dict.get("features")
+            if features is None:
+                # Only get the features from the actual HDF5 file.
+                # If this file has basins as well, the basin metadata
+                # should have been copied over to the parent file. This
+                # makes things a little cleaner, because basins are not
+                # nested, but all basins are available in the top file.
+                # See :func:`write.store_metadata` for copying metadata
+                # between files.
+                # The writer can still specify "features" in the basin
+                # metadata, then these basins are indeed nested, and
+                # we consider that ok as well.
+                features = sorted(h5group.keys())
+        return h5group, features
+
+    def _get_basin_data_internal(self, bn_dict):
+        # The group name is normally "basin_events"
+        group_name = bn_dict["paths"][0]
+        if group_name != "basin_events":
+            warnings.warn(
+                f"Uncommon group name for basin features: {group_name}")
+        h5group = self.h5[group_name]
+        features = bn_dict.get("features")
+        if features is None:
+            raise ValueError(
+                f"Encountered invalid internal basin '{bn_dict}': "
+                f"'features' must be defined")
+        return h5group, features
+
     def get_image_cache(self, feat):
         """Create an HDF5ImageCache object for the current dataset
 
@@ -442,15 +510,15 @@ class HDF5Data:
         idx_map = None
         # search all basins
         for idx in range(len(self.basins)):
-            bn_dat, features = self.get_basin_data(idx)
-            if features is not None:
-                if feat in features:
+            bn_grp, bn_feats, bn_map = self.get_basin_data(idx)
+            if bn_feats is not None:
+                if feat in bn_feats:
                     # HDF5 dataset
-                    ds = bn_dat.h5[f"events/{feat}"]
+                    ds = bn_grp[feat]
                     # Index mapping (taken from the basins which
                     # already includes the mapping from the current
                     # instance).
-                    idx_map = bn_dat.index_mapping
+                    idx_map = bn_map
                     break
         else:
             ds = None
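With this refactoring, `get_basin_data` no longer returns a nested `HDF5Data` instance but a plain `h5py.Group`, the list of basin features, and the index mapping to apply. A rough usage sketch, assuming `HDF5Data` is importable from `dcnum.read`; the file path and feature name are placeholders:

from dcnum.read import HDF5Data  # assumed import path

data = HDF5Data("measurement.rtdc")  # placeholder path
for idx in range(len(data.basins)):
    bn_grp, bn_feats, bn_map = data.get_basin_data(idx)
    if bn_feats and "deform" in bn_feats:
        # bn_grp is an h5py.Group; bn_map already folds in the index
        # mapping of the referring dataset (or is a plain slice/None).
        print(bn_grp["deform"].shape)
        break
data.close()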
@@ -471,9 +539,9 @@ class HDF5Data:
         features = sorted(self.h5["/events"].keys())
         # add basin features
         for ii in range(len(self.basins)):
-            _, bfeats = self.get_basin_data(ii)
-            if bfeats:
-                features += bfeats
+            _, bn_feats, _ = self.get_basin_data(ii)
+            if bn_feats:
+                features += bn_feats
         self._keys = sorted(set(features))
         return self._keys
 
dcnum/read/mapped.py CHANGED
@@ -27,8 +27,21 @@ class MappedHDF5Dataset:
         if isinstance(idx, numbers.Integral):
             return self.h5ds[self.mapping_indices[idx]]
         else:
-            idx_mapped = self.mapping_indices[idx]
-            return self.h5ds[idx_mapped]
+            midx = self.mapping_indices[idx]
+            start = np.min(midx)
+            # Add one, because the final index must be included
+            stop = np.max(midx) + 1
+            # We have to perform mapping.
+            # Since h5py is very slow at indexing with arrays,
+            # we instead read the data in chunks from the input file,
+            # and perform the mapping afterward using the numpy arrays.
+            data_in = self.h5ds[start:stop]
+            # Determine the indices that we need from that chunk.
+            data = data_in[midx - start]
+            return data
+
+    def __len__(self):
+        return self.shape[0]
 
 
 def get_mapping_indices(
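The rewritten `__getitem__` avoids h5py's slow fancy indexing: it reads one contiguous slab covering all requested mapping indices and then selects from that slab in memory with numpy. A minimal, self-contained illustration of the same trick (plain numpy stands in for the h5py dataset):

import numpy as np

h5ds = np.arange(1000) * 2          # stand-in for self.h5ds
midx = np.array([10, 12, 17, 40])   # mapping indices for the request

start = np.min(midx)
stop = np.max(midx) + 1             # add one to include the final index
data_in = h5ds[start:stop]          # one contiguous read from the file
data = data_in[midx - start]        # fancy indexing happens in memory

assert np.array_equal(data, h5ds[midx])

The trade-off is that the slab spans min(midx) to max(midx), so widely scattered indices read more data than strictly needed.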
@@ -36,7 +36,10 @@ class SegmentTorchMPO(TorchSegmenterBase, MPOSegmenter):
         # Set number of pytorch threads to 1, because dcnum is doing
         # all the multiprocessing.
         # https://pytorch.org/docs/stable/generated/torch.set_num_threads.html#torch.set_num_threads
-        torch.set_num_threads(1)
+        if torch.get_num_threads() != 1:
+            torch.set_num_threads(1)
+        if torch.get_num_interop_threads() != 1:
+            torch.set_num_interop_threads(1)
         device = torch.device("cpu")
 
         # Load model and metadata
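Guarding the calls with `get_num_threads()`/`get_num_interop_threads()` avoids re-setting values that are already 1; this matters for `set_num_interop_threads`, which PyTorch only allows before inter-op parallel work has started, so an unconditional repeated call from a re-used worker could raise. A small sketch of the same guard in isolation (assuming torch is installed):

import torch

# Limit intra-op and inter-op parallelism to one thread each, e.g. when
# the surrounding application already spreads work across processes.
if torch.get_num_threads() != 1:
    torch.set_num_threads(1)
if torch.get_num_interop_threads() != 1:
    torch.set_num_interop_threads(1)

print(torch.get_num_threads(), torch.get_num_interop_threads())  # 1 1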
dcnum/write/__init__.py CHANGED
@@ -2,5 +2,5 @@
 from .deque_writer_thread import DequeWriterThread
 from .queue_collector_thread import EventStash, QueueCollectorThread
 from .writer import (
-    HDF5Writer, copy_features, copy_metadata, create_with_basins,
+    HDF5Writer, copy_basins, copy_features, copy_metadata, create_with_basins,
     set_default_filter_kwargs)
@@ -8,8 +8,6 @@ from typing import List
 
 import numpy as np
 
-from ..read import HDF5Data
-
 
 class EventStash:
     def __init__(self,
@@ -61,11 +59,10 @@ class EventStash:
             Event dictionary
         """
         idx_loc = index - self.index_offset
-        idx_stop = self.nev_idx[idx_loc]
-        self._tracker[idx_loc] = True
 
         if events:
             slice_loc = None
+            idx_stop = self.nev_idx[idx_loc]
             for feat in events:
                 dev = events[feat]
                 if dev.size:
@@ -76,6 +73,8 @@
             if slice_loc:
                 self.indices_for_data[slice_loc] = index
 
+        self._tracker[idx_loc] = True
+
     def require_feature(self, feat, sample_data):
         """Create a new empty feature array in `self.events` and return it
 
@@ -87,10 +86,10 @@ class EventStash:
             Sample data for one event of the feature (used to determine
             shape and dtype of the feature array)
         """
-        sample_data = np.array(sample_data)
-        event_shape = sample_data.shape
-        dtype = sample_data.dtype
         if feat not in self.events:
+            sample_data = np.array(sample_data)
+            event_shape = sample_data.shape
+            dtype = sample_data.dtype
             darr = np.zeros((self.size,) + tuple(event_shape),
                             dtype=dtype)
             self.events[feat] = darr
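Moving the `np.array(sample_data)` conversion under `if feat not in self.events` means shape and dtype are only inspected when the feature array actually has to be created. A stripped-down sketch of that lazy allocation; `events` and `size` are local stand-ins for the EventStash attributes:

import numpy as np

events = {}   # stand-in for EventStash.events
size = 100    # stand-in for EventStash.size

def require_feature(feat, sample_data):
    # Allocate the per-stash array only on first use of the feature.
    if feat not in events:
        sample_data = np.array(sample_data)
        events[feat] = np.zeros((size,) + sample_data.shape,
                                dtype=sample_data.dtype)
    return events[feat]

arr = require_feature("deform", 0.05)
print(arr.shape, arr.dtype)  # (100,) float64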
@@ -99,7 +98,6 @@
 
 class QueueCollectorThread(threading.Thread):
     def __init__(self,
-                 data: HDF5Data,
                  event_queue: mp.Queue,
                  writer_dq: deque,
                  feat_nevents: mp.Array,
@@ -115,9 +113,6 @@ class QueueCollectorThread(threading.Thread):
 
         Parameters
         ----------
-        data:
-            Data source object. This is used for appending additional
-            information
         event_queue:
             A queue object to which other processes or threads write
             events as tuples `(frame_index, events_dict)`.
@@ -146,8 +141,6 @@ class QueueCollectorThread(threading.Thread):
         super(QueueCollectorThread, self).__init__(
             name="QueueCollector", *args, **kwargs)
         self.logger = logging.getLogger("dcnum.write.QueueCollector")
-        #: HDF5 data instance
-        self.data = data
         #: Event queue from which to collect event data
         self.event_queue = event_queue
         #: Writer deque to which event arrays are appended
@@ -169,7 +162,7 @@ class QueueCollectorThread(threading.Thread):
         # We are not writing to `event_queue` so we can safely cancel
         # our queue thread if we are told to stop.
         self.event_queue.cancel_join_thread()
-        # Indexes the current frame in `self.data`.
+        # Indexes the current frame in the input HDF5Data instance.
         last_idx = 0
         self.logger.debug("Started collector thread")
         while True: