dcnum 0.13.2-py3-none-any.whl → 0.23.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (55)
  1. dcnum/_version.py +2 -2
  2. dcnum/feat/__init__.py +2 -1
  3. dcnum/feat/event_extractor_manager_thread.py +67 -33
  4. dcnum/feat/feat_background/__init__.py +3 -12
  5. dcnum/feat/feat_background/base.py +80 -65
  6. dcnum/feat/feat_background/bg_copy.py +31 -0
  7. dcnum/feat/feat_background/bg_roll_median.py +38 -30
  8. dcnum/feat/feat_background/bg_sparse_median.py +96 -45
  9. dcnum/feat/feat_brightness/__init__.py +1 -0
  10. dcnum/feat/feat_brightness/bright_all.py +41 -6
  11. dcnum/feat/feat_contour/__init__.py +4 -0
  12. dcnum/feat/{feat_moments/mt_legacy.py → feat_contour/moments.py} +32 -8
  13. dcnum/feat/feat_contour/volume.py +174 -0
  14. dcnum/feat/feat_texture/__init__.py +1 -0
  15. dcnum/feat/feat_texture/tex_all.py +28 -1
  16. dcnum/feat/gate.py +92 -70
  17. dcnum/feat/queue_event_extractor.py +139 -70
  18. dcnum/logic/__init__.py +5 -0
  19. dcnum/logic/ctrl.py +794 -0
  20. dcnum/logic/job.py +184 -0
  21. dcnum/logic/json_encoder.py +19 -0
  22. dcnum/meta/__init__.py +1 -0
  23. dcnum/meta/paths.py +30 -0
  24. dcnum/meta/ppid.py +66 -9
  25. dcnum/read/__init__.py +1 -0
  26. dcnum/read/cache.py +109 -77
  27. dcnum/read/const.py +6 -4
  28. dcnum/read/hdf5_data.py +190 -31
  29. dcnum/read/mapped.py +87 -0
  30. dcnum/segm/__init__.py +6 -15
  31. dcnum/segm/segm_thresh.py +7 -14
  32. dcnum/segm/segm_torch/__init__.py +19 -0
  33. dcnum/segm/segm_torch/segm_torch_base.py +125 -0
  34. dcnum/segm/segm_torch/segm_torch_mpo.py +71 -0
  35. dcnum/segm/segm_torch/segm_torch_sto.py +88 -0
  36. dcnum/segm/segm_torch/torch_model.py +95 -0
  37. dcnum/segm/segm_torch/torch_postproc.py +93 -0
  38. dcnum/segm/segm_torch/torch_preproc.py +114 -0
  39. dcnum/segm/segmenter.py +245 -96
  40. dcnum/segm/segmenter_manager_thread.py +39 -28
  41. dcnum/segm/{segmenter_cpu.py → segmenter_mpo.py} +137 -43
  42. dcnum/segm/segmenter_sto.py +110 -0
  43. dcnum/write/__init__.py +3 -1
  44. dcnum/write/deque_writer_thread.py +15 -5
  45. dcnum/write/queue_collector_thread.py +14 -17
  46. dcnum/write/writer.py +225 -55
  47. {dcnum-0.13.2.dist-info → dcnum-0.23.1.dist-info}/METADATA +4 -2
  48. dcnum-0.23.1.dist-info/RECORD +55 -0
  49. {dcnum-0.13.2.dist-info → dcnum-0.23.1.dist-info}/WHEEL +1 -1
  50. dcnum/feat/feat_moments/__init__.py +0 -3
  51. dcnum/segm/segmenter_gpu.py +0 -45
  52. dcnum-0.13.2.dist-info/RECORD +0 -40
  53. /dcnum/feat/{feat_moments/ct_opencv.py → feat_contour/contour.py} +0 -0
  54. {dcnum-0.13.2.dist-info → dcnum-0.23.1.dist-info}/LICENSE +0 -0
  55. {dcnum-0.13.2.dist-info → dcnum-0.23.1.dist-info}/top_level.txt +0 -0
dcnum/logic/job.py ADDED
@@ -0,0 +1,184 @@
+ import collections
+ import copy
+ import inspect
+ import logging
+ import multiprocessing as mp
+ import pathlib
+ from typing import Dict, Literal
+ import warnings
+
+ from ..feat import QueueEventExtractor
+ from ..feat.feat_background.base import get_available_background_methods
+ from ..feat.gate import Gate
+ from ..meta.ppid import compute_pipeline_hash, DCNUM_PPID_GENERATION
+ from ..read import HDF5Data
+ from ..segm import get_available_segmenters
+
+
+ class DCNumPipelineJob:
+     def __init__(self,
+                  path_in: pathlib.Path | str,
+                  path_out: pathlib.Path | str = None,
+                  data_code: str = "hdf",
+                  data_kwargs: Dict = None,
+                  background_code: str = "sparsemed",
+                  background_kwargs: Dict = None,
+                  segmenter_code: str = "thresh",
+                  segmenter_kwargs: Dict = None,
+                  feature_code: str = "legacy",
+                  feature_kwargs: Dict = None,
+                  gate_code: str = "norm",
+                  gate_kwargs: Dict = None,
+                  basin_strategy: Literal["drain", "tap"] = "drain",
+                  no_basins_in_output: bool = None,
+                  num_procs: int = None,
+                  log_level: int = logging.INFO,
+                  debug: bool = False,
+                  ):
+         """Pipeline job recipe
+
+         Parameters
+         ----------
+         path_in: pathlib.Path | str
+             input data path
+         path_out: pathlib.Path | str
+             output data path
+         data_code: str
+             code of input data reader to use
+         data_kwargs: dict
+             keyword arguments for data reader
+         background_code: str
+             code of background data computer to use
+         background_kwargs: dict
+             keyword arguments for background data computer
+         segmenter_code: str
+             code of segmenter to use
+         segmenter_kwargs: dict
+             keyword arguments for segmenter
+         feature_code: str
+             code of feature extractor
+         feature_kwargs: dict
+             keyword arguments for feature extractor
+         gate_code: str
+             code for gating/event filtering class
+         gate_kwargs: dict
+             keyword arguments for gating/event filtering class
+         basin_strategy: str
+             strategy for handling event data; in principle, not all
+             events have to be stored in the output file if basins are
+             defined, linking back to the original file.
+             - You can "drain" all basins, which means that the output
+               file will contain all features, but will also be very big.
+             - You can "tap" the basins, including the input file, which
+               means that the output file will be comparatively small.
+         no_basins_in_output: bool
+             deprecated
+         num_procs: int
+             number of processes to use
+         log_level: int
+             logging level to use
+         debug: bool
+             whether to set the logging level to "DEBUG" and
+             use threads instead of processes
+         """
+         if no_basins_in_output is not None:
+             warnings.warn("The `no_basins_in_output` keyword argument is "
+                           "deprecated. Please use `basin_strategy` instead.")
+             if no_basins_in_output:
+                 basin_strategy = "drain"
+             else:
+                 basin_strategy = "tap"
+
+         #: initialize keyword arguments for this job
+         self.kwargs = {}
+         spec = inspect.getfullargspec(DCNumPipelineJob.__init__)
+         locs = locals()
+         for arg in spec.args:
+             if arg == "self":
+                 continue
+             value = locs[arg]
+             if value is None and spec.annotations[arg] is Dict:
+                 value = {}
+             self.kwargs[arg] = value
+         # Set default pixel size for this job
+         if "pixel_size" not in self.kwargs["data_kwargs"]:
+             # Extract from input file
+             with HDF5Data(path_in) as hd:
+                 self.kwargs["data_kwargs"]["pixel_size"] = hd.pixel_size
+         # Set default output path
+         if path_out is None:
+             pin = pathlib.Path(path_in)
+             path_out = pin.with_name(pin.stem + "_dcn.rtdc")
+         # Set logging level to DEBUG in debugging mode
+         if self.kwargs["debug"]:
+             self.kwargs["log_level"] = logging.DEBUG
+         self.kwargs["path_out"] = pathlib.Path(path_out)
+         # Set default mask kwargs for segmenter
+         self.kwargs["segmenter_kwargs"].setdefault("kwargs_mask", {})
+         # Set default number of processes
+         if num_procs is None:
+             self.kwargs["num_procs"] = mp.cpu_count()
+
+     def __getitem__(self, item):
+         return copy.deepcopy(self.kwargs[item])
+
+     def __getstate__(self):
+         state = copy.deepcopy(self.kwargs)
+         return state
+
+     def __setstate__(self, state):
+         self.kwargs.clear()
+         self.kwargs.update(copy.deepcopy(state))
+
+     def assert_pp_codes(self):
+         """Sanity check of `self.kwargs`"""
+         # PPID classes with only one option
+         for cls, key in [
+             (HDF5Data, "data_code"),
+             (Gate, "gate_code"),
+             (QueueEventExtractor, "feature_code"),
+         ]:
+             code_act = self.kwargs[key]
+             code_exp = cls.get_ppid_code()
+             if code_act != code_exp:
+                 raise ValueError(f"Invalid code '{code_act}' for '{key}', "
+                                  f"expected '{code_exp}'!")
+         # PPID classes with multiple options
+         for options, key in [
+             (get_available_background_methods(), "background_code"),
+             (get_available_segmenters(), "segmenter_code"),
+         ]:
+             code_act = self.kwargs[key]
+             if code_act not in options:
+                 raise ValueError(f"Invalid code '{code_act}' for '{key}', "
+                                  f"expected one of '{options}'!")
+
+     def get_ppid(self, ret_hash=False, ret_dict=False):
+         self.assert_pp_codes()
+         pp_hash_kw = collections.OrderedDict()
+         pp_hash_kw["gen_id"] = DCNUM_PPID_GENERATION
+         for pp_kw, cls, cls_kw in [
+             ("dat_id", HDF5Data, "data_kwargs"),
+             ("bg_id",
+              get_available_background_methods()[
+                  self.kwargs["background_code"]],
+              "background_kwargs"),
+             ("seg_id",
+              get_available_segmenters()[self.kwargs["segmenter_code"]],
+              "segmenter_kwargs"),
+             ("feat_id", QueueEventExtractor, "feature_kwargs"),
+             ("gate_id", Gate, "gate_kwargs"),
+         ]:
+             pp_hash_kw[pp_kw] = cls.get_ppid_from_ppkw(self.kwargs[cls_kw])
+
+         ppid = "|".join(pp_hash_kw.values())
+
+         ret = [ppid]
+         if ret_hash:
+             pp_hash = compute_pipeline_hash(**pp_hash_kw)
+             ret.append(pp_hash)
+         if ret_dict:
+             ret.append(pp_hash_kw)
+         if len(ret) == 1:
+             ret = ret[0]
+         return ret
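
The DCNumPipelineJob class above is the new picklable "recipe" object that collects all pipeline choices. A minimal usage sketch (not part of the diff; it assumes an existing .rtdc input file and that the new dcnum/logic/__init__.py re-exports the class):

    import pathlib
    from dcnum.logic import DCNumPipelineJob  # re-export assumed

    # Constructing the job opens the input file once to read the pixel size;
    # path_out defaults to "<stem>_dcn.rtdc" next to the input file.
    job = DCNumPipelineJob(path_in=pathlib.Path("measurement.rtdc"),
                           basin_strategy="tap",
                           num_procs=4)
    job.assert_pp_codes()                  # sanity-check the chosen codes
    ppid, pp_hash = job.get_ppid(ret_hash=True)
    print(ppid)                            # pipe-separated pipeline identifier
    print(job["path_out"])                 # deep copy of the resolved output path
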
dcnum/logic/json_encoder.py ADDED
@@ -0,0 +1,19 @@
+ import json
+ import numbers
+ import pathlib
+
+ import numpy as np
+
+
+ class ExtendedJSONEncoder(json.JSONEncoder):
+     def default(self, obj):
+         if isinstance(obj, pathlib.Path):
+             return str(obj)
+         elif isinstance(obj, numbers.Integral):
+             return int(obj)
+         elif isinstance(obj, np.bool_):
+             return bool(obj)
+         elif isinstance(obj, slice):
+             return "PYTHON-SLICE", (obj.start, obj.stop, obj.step)
+         # Let the base class default method raise the TypeError
+         return json.JSONEncoder.default(self, obj)
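
ExtendedJSONEncoder lets job metadata containing paths, NumPy scalars and slices be dumped to JSON, presumably for recording the pipeline configuration. A brief usage sketch with made-up values:

    import json
    import pathlib

    import numpy as np

    from dcnum.logic.json_encoder import ExtendedJSONEncoder

    data = {"path": pathlib.Path("/data/run1.rtdc"),
            "count": np.uint64(42),        # numbers.Integral -> int
            "roi": slice(100, 200, None)}
    print(json.dumps(data, cls=ExtendedJSONEncoder, indent=2))
    # The slice is encoded as ["PYTHON-SLICE", [100, 200, null]].
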
dcnum/meta/__init__.py CHANGED
@@ -1,2 +1,3 @@
  # flake8: noqa: F401
+ from . import paths
  from . import ppid
dcnum/meta/paths.py ADDED
@@ -0,0 +1,30 @@
+ import pathlib
+
+ search_path_registry = {}
+
+
+ def register_search_path(topic: str,
+                          search_path: str | pathlib.Path):
+     """Register a search path for a given topic
+
+     Search paths are a global solution for organizing the locations
+     of resources that are part of an analysis pipeline. For instance,
+     if the location of such a file depends on where your pipeline is
+     running, you can register multiple search paths and the file will
+     be found using :func:`find_file`.
+     """
+     topic_list = search_path_registry.setdefault(topic, [])
+     topic_list.append(pathlib.Path(search_path))
+
+
+ def find_file(topic: str,
+               file_name: str):
+     """Find a file in the search path for the given topic"""
+     search_paths = search_path_registry.get(topic, [])
+     for pp in search_paths:
+         pf = pp / file_name
+         if pf.is_file():
+             return pf
+     else:
+         raise KeyError(f"Could not find {file_name} for {topic} in the "
+                        f"registered search paths {search_paths}")
dcnum/meta/ppid.py CHANGED
@@ -4,17 +4,38 @@ import collections
  import hashlib
  import inspect
  import pathlib
+ from typing import Dict, List, Protocol
+ import warnings
 
 
  #: Increment this string if there are breaking changes that make
  #: previous pipelines unreproducible.
- DCNUM_PPID_GENERATION = "5"
+ DCNUM_PPID_GENERATION = "10"
 
 
- def compute_pipeline_hash(bg_id, seg_id, feat_id, gate_id,
-                           gen_id=DCNUM_PPID_GENERATION):
+ class ClassWithPPIDCapabilities(Protocol):
+     def get_ppid(self) -> str:
+         """full pipeline identifier for the class (instance method)"""
+         pass
+
+     def get_ppid_code(self) -> str:
+         """string representing the class in the pipeline (classmethod)"""
+         pass
+
+     def get_ppid_from_ppkw(self) -> str:
+         """pipeline identifier from specific pipeline keywords (classmethod)"""
+         pass
+
+     def get_ppkw_from_ppid(self) -> Dict:
+         """class keywords from full pipeline identifier (staticmethod)"""
+         pass
+
+
+ def compute_pipeline_hash(*, bg_id, seg_id, feat_id, gate_id,
+                           dat_id="unknown", gen_id=DCNUM_PPID_GENERATION):
      hasher = hashlib.md5()
-     hasher.update("|".join([gen_id, bg_id, seg_id, feat_id, gate_id]).encode())
+     hasher.update("|".join([
+         gen_id, dat_id, bg_id, seg_id, feat_id, gate_id]).encode())
      pph = hasher.hexdigest()
      return pph
 
@@ -37,7 +58,10 @@ def convert_to_dtype(value, dtype):
      return value
 
 
- def get_class_method_info(class_obj, static_kw_methods=None):
+ def get_class_method_info(class_obj: ClassWithPPIDCapabilities,
+                           static_kw_methods: List = None,
+                           static_kw_defaults: Dict = None,
+                           ):
      """Return dictionary of class info with static keyword methods docs
 
      Parameters
@@ -47,10 +71,19 @@ def get_class_method_info(class_obj, static_kw_methods=None):
      static_kw_methods: list of callable
          The methods to inspect; all kwargs-only keyword arguments
          are extracted.
+     static_kw_defaults: dict
+         If a key in this dictionary matches an item in `static_kw_methods`,
+         then these are the default values returned in the "defaults"
+         dictionary. This is used in cases where a base class does
+         implement some annotations, but the subclass does not actually
+         use them, because e.g. they are taken from a property, as is
+         the case for the mask postprocessing of segmenter classes.
      """
+     if static_kw_defaults is None:
+         static_kw_defaults = {}
      doc = class_obj.__doc__ or class_obj.__init__.__doc__
      info = {
-         "key": class_obj.key(),
+         "code": class_obj.get_ppid_code(),
          "doc": doc,
          "title": doc.split("\n")[0],
      }
@@ -60,19 +93,43 @@ def get_class_method_info(class_obj, static_kw_methods=None):
      for mm in static_kw_methods:
          meth = getattr(class_obj, mm)
          spec = inspect.getfullargspec(meth)
-         defau[mm] = spec.kwonlydefaults
+         if mm_defaults := static_kw_defaults.get(mm):
+             defau[mm] = mm_defaults
+         else:
+             defau[mm] = spec.kwonlydefaults or {}
          annot[mm] = spec.annotations
      info["defaults"] = defau
      info["annotations"] = annot
      return info
 
 
- def kwargs_to_ppid(cls, method, kwargs):
-     info = get_class_method_info(cls, [method])
+ def kwargs_to_ppid(cls: ClassWithPPIDCapabilities,
+                    method: str,
+                    kwargs: Dict,
+                    allow_invalid_keys: bool = True):
+     info = get_class_method_info(cls, [method, "__init__"])
 
      concat_strings = []
      if info["defaults"][method]:
          kwdefaults = info["defaults"][method]
+         kwdefaults_init = info["defaults"]["__init__"]
+         kw_false = (set(kwargs.keys())
+                     - set(kwdefaults.keys())
+                     - set(kwdefaults_init.keys()))
+         if kw_false:
+             # This should not have happened.
+             msg = (f"Invalid kwargs {kw_false} specified for method "
+                    f"'{method}'! Valid kwargs are "
+                    f"{sorted(kwdefaults.keys())}. If you wrote this "
+                    f"segmenter and had to implement `__init__`, make sure "
+                    f"that it accepts all kwonly-arguments its super class "
+                    f"accepts. If this is not the case, you are probably "
+                    f"passing invalid kwargs to the segmenter."
+                    )
+             if allow_invalid_keys:
+                 warnings.warn(msg, UserWarning)
+             else:
+                 raise KeyError(msg)
          kwannot = info["annotations"][method]
          kws = list(kwdefaults.keys())
          kws_abrv = get_unique_prefix(kws)
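
For context, the pipeline hash is now keyword-only and also covers the data reader (`dat_id`), so pipelines that differ only in, e.g., the data-reader settings hash differently. A rough sketch of the call; the individual id strings below are placeholders, not actual dcnum output:

    from dcnum.meta.ppid import compute_pipeline_hash

    pp_hash = compute_pipeline_hash(
        dat_id="hdf:p=0.2645",                  # placeholder id strings
        bg_id="sparsemed:k=200^s=1^t=0^f=0.8",
        seg_id="thresh:t=-6:cle=1^f=1^clo=2",
        feat_id="legacy:b=1^h=0^v=1",
        gate_id="norm:o=0^s=10",
    )
    # MD5 hex digest over "10|<dat_id>|<bg_id>|<seg_id>|<feat_id>|<gate_id>",
    # where "10" is the default DCNUM_PPID_GENERATION.
    print(pp_hash)
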
dcnum/read/__init__.py CHANGED
@@ -2,3 +2,4 @@
  from .cache import md5sum
  from .const import PROTECTED_FEATURES
  from .hdf5_data import HDF5Data, HDF5ImageCache, concatenated_hdf5_data
+ from .mapped import get_mapping_indices, get_mapped_object
dcnum/read/cache.py CHANGED
@@ -1,42 +1,68 @@
+ import abc
  import collections
  import functools
  import hashlib
  import pathlib
+ from typing import Tuple
+ import warnings
 
  import h5py
  import numpy as np
 
 
- class HDF5ImageCache:
+ class EmptyDatasetWarning(UserWarning):
+     """Used for files that contain no actual data"""
+     pass
+
+
+ class BaseImageChunkCache(abc.ABC):
      def __init__(self,
-                  h5ds: h5py.Dataset,
+                  shape: Tuple[int],
                   chunk_size: int = 1000,
-                  cache_size: int = 5,
-                  boolean: bool = False):
-         """An HDF5 image cache
-
-         Deformability cytometry data files commonly contain image stacks
-         that are chunked in various ways. Loading just a single image
-         can be time-consuming, because an entire HDF5 chunk has to be
-         loaded, decompressed and from that one image extracted. The
-         `HDF5ImageCache` class caches the chunks from the HDF5 files
-         into memory, making single-image-access very fast.
-         """
-         # TODO:
-         # - adjust chunking to multiples of the chunks in the dataset
-         #   (which will slightly speed up things)
-         chunk_size = min(h5ds.shape[0], chunk_size)
-         self.h5ds = h5ds
-         self.chunk_size = chunk_size
-         self.boolean = boolean
-         self.cache_size = cache_size
+                  cache_size: int = 2,
+                  ):
+         self.shape = shape
+         self._dtype = None
+         chunk_size = min(shape[0], chunk_size)
+         self._len = self.shape[0]
          #: This is a FILO cache for the chunks
          self.cache = collections.OrderedDict()
-         self.shape = h5ds.shape
          self.image_shape = self.shape[1:]
          self.chunk_shape = (chunk_size,) + self.shape[1:]
-         self._len = self.shape[0]
-         self.num_chunks = int(np.ceil(self._len / self.chunk_size))
+         self.chunk_size = chunk_size
+         self.cache_size = cache_size
+         self.num_chunks = int(np.ceil(self._len / (self.chunk_size or 1)))
+
+     def __getitem__(self, index):
+         if isinstance(index, (slice, list, np.ndarray)):
+             if isinstance(index, slice):
+                 indices = np.arange(index.start or 0,
+                                     index.stop or len(self),
+                                     index.step)
+             else:
+                 indices = index
+             array_out = np.empty((len(indices),) + self.image_shape,
+                                  dtype=self.dtype)
+             for ii, idx in enumerate(indices):
+                 array_out[ii] = self[idx]
+             return array_out
+         else:
+             chunk_index, sub_index = self._get_chunk_index_for_index(index)
+             return self.get_chunk(chunk_index)[sub_index]
+
+     def __len__(self):
+         return self._len
+
+     @property
+     def dtype(self):
+         """data type of the image data"""
+         if self._dtype is None:
+             self._dtype = self[0].dtype
+         return self._dtype
+
+     @abc.abstractmethod
+     def _get_chunk_data(self, chunk_slice):
+         """Implemented in subclass to obtain actual data"""
 
      def _get_chunk_index_for_index(self, index):
          if index < 0:
@@ -45,30 +71,19 @@ class HDF5ImageCache:
              raise IndexError(
                  f"Index {index} out of bounds for HDF5ImageCache "
                  f"of size {self._len}")
+         index = int(index)  # convert np.uint64 to int, so we get ints below
          chunk_index = index // self.chunk_size
          sub_index = index % self.chunk_size
          return chunk_index, sub_index
 
-     def __getitem__(self, index):
-         chunk_index, sub_index = self._get_chunk_index_for_index(index)
-         return self.get_chunk(chunk_index)[sub_index]
-
-     def __len__(self):
-         return self._len
-
      def get_chunk(self, chunk_index):
          """Return one chunk of images"""
          if chunk_index not in self.cache:
-             fslice = slice(self.chunk_size * chunk_index,
-                            self.chunk_size * (chunk_index + 1)
-                            )
-             data = self.h5ds[fslice]
-             if self.boolean:
-                 data = np.array(data, dtype=bool)
-             self.cache[chunk_index] = data
-             if len(self.cache) > self.cache_size:
+             if len(self.cache) >= self.cache_size:
                  # Remove the first item
                  self.cache.popitem(last=False)
+             data = self._get_chunk_data(self.get_chunk_slice(chunk_index))
+             self.cache[chunk_index] = data
          return self.cache[chunk_index]
 
      def get_chunk_size(self, chunk_index):
@@ -81,60 +96,77 @@ class HDF5ImageCache:
              raise IndexError(f"{self} only has {self.num_chunks} chunks!")
          return chunk_size
 
+     def get_chunk_slice(self, chunk_index):
+         """Return the slice corresponding to the chunk index"""
+         ch_slice = slice(self.chunk_size * chunk_index,
+                          self.chunk_size * (chunk_index + 1)
+                          )
+         return ch_slice
+
      def iter_chunks(self):
-         size = self.h5ds.shape[0]
          index = 0
          chunk = 0
          while True:
              yield chunk
             chunk += 1
              index += self.chunk_size
-             if index >= size:
+             if index >= self._len:
                  break
 
 
- class ImageCorrCache:
+ class HDF5ImageCache(BaseImageChunkCache):
      def __init__(self,
-                  image: HDF5ImageCache,
-                  image_bg: HDF5ImageCache):
-         self.image = image
-         self.image_bg = image_bg
-         self.chunk_size = image.chunk_size
-         self.num_chunks = image.num_chunks
-         self.h5ds = image.h5ds
-         self.shape = image.shape
-         self.chunk_shape = image.chunk_shape
-         #: This is a FILO cache for the corrected image chunks
-         self.cache = collections.OrderedDict()
-         self.cache_size = image.cache_size
+                  h5ds: h5py.Dataset,
+                  chunk_size: int = 1000,
+                  cache_size: int = 2,
+                  boolean: bool = False):
+         """An HDF5 image cache
 
-     def _get_chunk_index_for_index(self, index):
-         if index < 0:
-             index = len(self.h5ds) + index
-         chunk_index = index // self.chunk_size
-         sub_index = index % self.chunk_size
-         return chunk_index, sub_index
+         Deformability cytometry data files commonly contain image stacks
+         that are chunked in various ways. Loading just a single image
+         can be time-consuming, because an entire HDF5 chunk has to be
+         loaded, decompressed and from that one image extracted. The
+         `HDF5ImageCache` class caches the chunks from the HDF5 files
+         into memory, making single-image-access very fast.
+         """
+         super(HDF5ImageCache, self).__init__(
+             shape=h5ds.shape,
+             chunk_size=chunk_size,
+             cache_size=cache_size)
+         # TODO:
+         # - adjust chunking to multiples of the chunks in the dataset
+         #   (which might slightly speed up things)
+         self.h5ds = h5ds
+         self.boolean = boolean
 
-     def __getitem__(self, index):
-         chunk_index, sub_index = self._get_chunk_index_for_index(index)
-         return self.get_chunk(chunk_index)[sub_index]
+         if self._len == 0:
+             warnings.warn(f"Input image '{h5ds.name}' in "
+                           f"file {h5ds.file.filename} has zero length",
+                           EmptyDatasetWarning)
 
-     def __len__(self):
-         return len(self.image)
+     def _get_chunk_data(self, chunk_slice):
+         data = self.h5ds[chunk_slice]
+         if self.boolean:
+             data = np.array(data, dtype=bool)
+         return data
 
-     def get_chunk(self, chunk_index):
-         if chunk_index not in self.cache:
-             data = np.array(
-                 self.image.get_chunk(chunk_index), dtype=np.int16) \
-                 - self.image_bg.get_chunk(chunk_index)
-             self.cache[chunk_index] = data
-             if len(self.cache) > self.cache_size:
-                 # Remove the first item
-                 self.cache.popitem(last=False)
-         return self.cache[chunk_index]
 
-     def iter_chunks(self):
-         return self.image.iter_chunks()
+ class ImageCorrCache(BaseImageChunkCache):
+     def __init__(self,
+                  image: HDF5ImageCache,
+                  image_bg: HDF5ImageCache):
+         super(ImageCorrCache, self).__init__(
+             shape=image.shape,
+             chunk_size=image.chunk_size,
+             cache_size=image.cache_size)
+         self.image = image
+         self.image_bg = image_bg
+
+     def _get_chunk_data(self, chunk_slice):
+         data = np.array(
+             self.image._get_chunk_data(chunk_slice), dtype=np.int16) \
+             - self.image_bg._get_chunk_data(chunk_slice)
+         return data
 
 
  @functools.cache
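
A brief sketch of how the refactored caches fit together; it assumes an .rtdc/HDF5 file with the usual "events/image" and "events/image_bg" datasets:

    import h5py

    from dcnum.read.cache import HDF5ImageCache, ImageCorrCache

    with h5py.File("measurement.rtdc", "r") as h5:
        image = HDF5ImageCache(h5["events/image"])
        image_bg = HDF5ImageCache(h5["events/image_bg"])
        # Background-corrected images are computed chunk-wise on access via
        # ImageCorrCache._get_chunk_data (raw image minus background).
        corr = ImageCorrCache(image, image_bg)
        event0 = corr[0]           # single event; loads and caches one chunk
        events = corr[[3, 7, 11]]  # list indexing via BaseImageChunkCache.__getitem__
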
dcnum/read/const.py CHANGED
@@ -1,15 +1,17 @@
- #: Scalar features that apply to all events in a frame
+ #: Scalar features that apply to all events in a frame and which are
+ #: not computed for individual events.
  PROTECTED_FEATURES = [
-     "bg_med",
+     "bg_off",
      "flow_rate",
      "frame",
      "g_force",
-     "index_online",
      "pressure",
      "temp",
      "temp_amb",
-     "time"
+     "time",
  ]
 
+ # User-defined features may be anything, but if the user needs something
+ # very specific for the pipeline, having them protected is a nice feature.
  for ii in range(10):
      PROTECTED_FEATURES.append(f"userdef{ii}")