dcnum 0.17.2__py3-none-any.whl → 0.19.0__py3-none-any.whl

dcnum/logic/ctrl.py CHANGED
@@ -310,7 +310,17 @@ class DCNumJobRunner(threading.Thread):
  # Whether pipeline hash is invalid.
  ppid.compute_pipeline_hash(**datdict) != dathash
  # Whether the input file is the original output of the pipeline.
- or len(self.draw) != evyield)
+ or len(self.draw) != evyield
+ # If index mapping is defined, then we always redo the pipeline.
+ # If the pipeline hashes are identical and index mapping is not
+ # None, then both pipelines were done with index mapping.
+ # But applying the same pipeline with index mapping in series
+ # will lead to a different result in the second run (e.g. 1st
+ # pipeline run: take every 2nd event; 2nd pipeline run: take
+ # every second event -> results in every 4th event in output of
+ # second pipeline run).
+ or self.draw.index_mapping is not None
+ )
  # Do we have to recompute the background data? In addition to the
  # hash sanity check above, check the generation, input data,
  # and background pipeline identifiers.
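Note: the "every 2nd of every 2nd event is every 4th event" reasoning in the new comment can be checked with plain numpy (illustration only, not dcnum code):

    import numpy as np

    events = np.arange(12)
    first_run = events[::2]      # pipeline run 1: events 0, 2, 4, 6, 8, 10
    second_run = first_run[::2]  # pipeline run 2: events 0, 4, 8 -> every 4th original event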
@@ -382,6 +392,10 @@ class DCNumJobRunner(threading.Thread):
  hw.h5.attrs["pipeline:dcnum gate"] = self.ppdict["gate_id"]
  hw.h5.attrs["pipeline:dcnum hash"] = self.pphash
  hw.h5.attrs["pipeline:dcnum yield"] = self.event_count
+ # index mapping information
+ im = self.job.kwargs["data_kwargs"].get("index_mapping", None)
+ dim = HDF5Data.get_ppid_index_mapping(im)
+ hw.h5.attrs["pipeline:dcnum mapping"] = dim
  # regular metadata
  hw.h5.attrs["experiment:event count"] = self.event_count
  hw.h5.attrs["imaging:pixel size"] = self.draw.pixel_size
@@ -503,7 +517,7 @@ class DCNumJobRunner(threading.Thread):
  num_segmenters = 1
  elif seg_cls.hardware_processor == "cpu": # CPU segmenter
  # We could in principle set the number of slots to one and
- # jave both number of extractors and number of segmenters set
+ # have both number of extractors and number of segmenters set
  # to the total number of CPUs. However, we would need more RAM
  # (for caching the image data) and we also have more overhead.
  # Having two slots shared between all workers is more efficient.
@@ -522,10 +536,11 @@ class DCNumJobRunner(threading.Thread):
  slot_chunks = mp_spawn.Array("i", num_slots)
  slot_states = mp_spawn.Array("u", num_slots)

- # Initialize thread
+ # Initialize segmenter manager thread
  thr_segm = SegmenterManagerThread(
  segmenter=seg_cls(**self.job["segmenter_kwargs"]),
  image_data=imdat,
+ bg_off=self.dtin["bg_off"] if "bg_off" in self.dtin else None,
  slot_states=slot_states,
  slot_chunks=slot_chunks,
  debug=self.job["debug"],
@@ -13,5 +13,7 @@ class ExtendedJSONEncoder(json.JSONEncoder):
  return int(obj)
  elif isinstance(obj, np.bool_):
  return bool(obj)
+ elif isinstance(obj, slice):
+ return "PYTHON-SLICE", (obj.start, obj.stop, obj.step)
  # Let the base class default method raise the TypeError
  return json.JSONEncoder.default(self, obj)
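Note: with the new branch, a slice is encoded as a JSON array tagged with "PYTHON-SLICE". A minimal sketch, assuming `ExtendedJSONEncoder` is imported from its module (the module path is not shown in this diff):

    import json

    payload = {"index_mapping": slice(0, 100, 2)}
    print(json.dumps(payload, cls=ExtendedJSONEncoder))
    # {"index_mapping": ["PYTHON-SLICE", [0, 100, 2]]}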
dcnum/meta/ppid.py CHANGED
@@ -10,7 +10,7 @@ import warnings

  #: Increment this string if there are breaking changes that make
  #: previous pipelines unreproducible.
- DCNUM_PPID_GENERATION = "7"
+ DCNUM_PPID_GENERATION = "8"


  class ClassWithPPIDCapabilities(Protocol):
dcnum/read/__init__.py CHANGED
@@ -2,3 +2,4 @@
  from .cache import md5sum
  from .const import PROTECTED_FEATURES
  from .hdf5_data import HDF5Data, HDF5ImageCache, concatenated_hdf5_data
+ from .mapped import get_mapping_indices, get_mapped_object
dcnum/read/cache.py CHANGED
@@ -1,7 +1,9 @@
+ import abc
  import collections
  import functools
  import hashlib
  import pathlib
+ from typing import Tuple
  import warnings

  import h5py
@@ -13,41 +15,34 @@ class EmptyDatasetWarning(UserWarning):
  pass


- class HDF5ImageCache:
+ class BaseImageChunkCache(abc.ABC):
  def __init__(self,
- h5ds: h5py.Dataset,
+ shape: Tuple[int],
  chunk_size: int = 1000,
  cache_size: int = 2,
- boolean: bool = False):
- """An HDF5 image cache
-
- Deformability cytometry data files commonly contain image stacks
- that are chunked in various ways. Loading just a single image
- can be time-consuming, because an entire HDF5 chunk has to be
- loaded, decompressed and from that one image extracted. The
- `HDF5ImageCache` class caches the chunks from the HDF5 files
- into memory, making single-image-access very fast.
- """
- self.shape = h5ds.shape
+ ):
+ self.shape = shape
+ chunk_size = min(shape[0], chunk_size)
  self._len = self.shape[0]
- if self._len == 0:
- warnings.warn(f"Input image '{h5ds.name}' in "
- f"file {h5ds.file.filename} has zero length",
- EmptyDatasetWarning)
- # TODO:
- # - adjust chunking to multiples of the chunks in the dataset
- # (which might slightly speed up things)
- chunk_size = min(h5ds.shape[0], chunk_size)
- self.h5ds = h5ds
- self.chunk_size = chunk_size
- self.boolean = boolean
- self.cache_size = cache_size
  #: This is a FILO cache for the chunks
  self.cache = collections.OrderedDict()
  self.image_shape = self.shape[1:]
  self.chunk_shape = (chunk_size,) + self.shape[1:]
+ self.chunk_size = chunk_size
+ self.cache_size = cache_size
  self.num_chunks = int(np.ceil(self._len / (self.chunk_size or 1)))

+ def __getitem__(self, index):
+ chunk_index, sub_index = self._get_chunk_index_for_index(index)
+ return self.get_chunk(chunk_index)[sub_index]
+
+ def __len__(self):
+ return self._len
+
+ @abc.abstractmethod
+ def _get_chunk_data(self, chunk_slice):
+ """Implemented in subclass to obtain actual data"""
+
  def _get_chunk_index_for_index(self, index):
  if index < 0:
  index = self._len + index
@@ -59,26 +54,14 @@ class HDF5ImageCache:
  sub_index = index % self.chunk_size
  return chunk_index, sub_index

- def __getitem__(self, index):
- chunk_index, sub_index = self._get_chunk_index_for_index(index)
- return self.get_chunk(chunk_index)[sub_index]
-
- def __len__(self):
- return self._len
-
  def get_chunk(self, chunk_index):
  """Return one chunk of images"""
  if chunk_index not in self.cache:
- fslice = slice(self.chunk_size * chunk_index,
- self.chunk_size * (chunk_index + 1)
- )
- data = self.h5ds[fslice]
- if self.boolean:
- data = np.array(data, dtype=bool)
- self.cache[chunk_index] = data
- if len(self.cache) > self.cache_size:
+ if len(self.cache) >= self.cache_size:
  # Remove the first item
  self.cache.popitem(last=False)
+ data = self._get_chunk_data(self.get_chunk_slice(chunk_index))
+ self.cache[chunk_index] = data
  return self.cache[chunk_index]

  def get_chunk_size(self, chunk_index):
@@ -91,60 +74,77 @@ class HDF5ImageCache:
  raise IndexError(f"{self} only has {self.num_chunks} chunks!")
  return chunk_size

+ def get_chunk_slice(self, chunk_index):
+ """Return the slice corresponding to the chunk index"""
+ ch_slice = slice(self.chunk_size * chunk_index,
+ self.chunk_size * (chunk_index + 1)
+ )
+ return ch_slice
+
  def iter_chunks(self):
- size = self.h5ds.shape[0]
  index = 0
  chunk = 0
  while True:
  yield chunk
  chunk += 1
  index += self.chunk_size
- if index >= size:
+ if index >= self._len:
  break


- class ImageCorrCache:
+ class HDF5ImageCache(BaseImageChunkCache):
  def __init__(self,
- image: HDF5ImageCache,
- image_bg: HDF5ImageCache):
- self.image = image
- self.image_bg = image_bg
- self.chunk_size = image.chunk_size
- self.num_chunks = image.num_chunks
- self.h5ds = image.h5ds
- self.shape = image.shape
- self.chunk_shape = image.chunk_shape
- #: This is a FILO cache for the corrected image chunks
- self.cache = collections.OrderedDict()
- self.cache_size = image.cache_size
+ h5ds: h5py.Dataset,
+ chunk_size: int = 1000,
+ cache_size: int = 2,
+ boolean: bool = False):
+ """An HDF5 image cache

- def _get_chunk_index_for_index(self, index):
- if index < 0:
- index = len(self.h5ds) + index
- chunk_index = index // self.chunk_size
- sub_index = index % self.chunk_size
- return chunk_index, sub_index
+ Deformability cytometry data files commonly contain image stacks
+ that are chunked in various ways. Loading just a single image
+ can be time-consuming, because an entire HDF5 chunk has to be
+ loaded, decompressed and from that one image extracted. The
+ `HDF5ImageCache` class caches the chunks from the HDF5 files
+ into memory, making single-image-access very fast.
+ """
+ super(HDF5ImageCache, self).__init__(
+ shape=h5ds.shape,
+ chunk_size=chunk_size,
+ cache_size=cache_size)
+ # TODO:
+ # - adjust chunking to multiples of the chunks in the dataset
+ # (which might slightly speed up things)
+ self.h5ds = h5ds
+ self.boolean = boolean

- def __getitem__(self, index):
- chunk_index, sub_index = self._get_chunk_index_for_index(index)
- return self.get_chunk(chunk_index)[sub_index]
+ if self._len == 0:
+ warnings.warn(f"Input image '{h5ds.name}' in "
+ f"file {h5ds.file.filename} has zero length",
+ EmptyDatasetWarning)

- def __len__(self):
- return len(self.image)
+ def _get_chunk_data(self, chunk_slice):
+ data = self.h5ds[chunk_slice]
+ if self.boolean:
+ data = np.array(data, dtype=bool)
+ return data

- def get_chunk(self, chunk_index):
- if chunk_index not in self.cache:
- data = np.array(
- self.image.get_chunk(chunk_index), dtype=np.int16) \
- - self.image_bg.get_chunk(chunk_index)
- self.cache[chunk_index] = data
- if len(self.cache) > self.cache_size:
- # Remove the first item
- self.cache.popitem(last=False)
- return self.cache[chunk_index]

- def iter_chunks(self):
- return self.image.iter_chunks()
+ class ImageCorrCache(BaseImageChunkCache):
+ def __init__(self,
+ image: HDF5ImageCache,
+ image_bg: HDF5ImageCache):
+ super(ImageCorrCache, self).__init__(
+ shape=image.shape,
+ chunk_size=image.chunk_size,
+ cache_size=image.cache_size)
+ self.image = image
+ self.image_bg = image_bg
+
+ def _get_chunk_data(self, chunk_slice):
+ data = np.array(
+ self.image._get_chunk_data(chunk_slice), dtype=np.int16) \
+ - self.image_bg._get_chunk_data(chunk_slice)
+ return data


  @functools.cache
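Note: after this refactor, a chunk cache only needs to implement `_get_chunk_data`; slicing, caching, and eviction live in the base class. A minimal sketch (not dcnum code; it assumes `BaseImageChunkCache` is importable from `dcnum.read.cache`):

    import numpy as np
    from dcnum.read.cache import BaseImageChunkCache

    class ArrayChunkCache(BaseImageChunkCache):
        """Serve chunks from an in-memory array (hypothetical example)"""
        def __init__(self, arr, chunk_size=10, cache_size=2):
            super().__init__(shape=arr.shape,
                             chunk_size=chunk_size,
                             cache_size=cache_size)
            self.arr = arr

        def _get_chunk_data(self, chunk_slice):
            # `chunk_slice` is produced by `get_chunk_slice(chunk_index)`
            return self.arr[chunk_slice]

    stack = np.arange(25 * 4 * 4).reshape(25, 4, 4)
    cache = ArrayChunkCache(stack)
    assert cache.num_chunks == 3
    assert np.array_equal(cache[13], stack[13])  # item access goes through the chunk cache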
dcnum/read/const.py CHANGED
@@ -1,6 +1,7 @@
  #: Scalar features that apply to all events in a frame and which are
- #: not computed from image or image_bg data.
+ #: not computed for individual events.
  PROTECTED_FEATURES = [
+ "bg_off",
  "flow_rate",
  "frame",
  "g_force",
@@ -10,5 +11,7 @@ PROTECTED_FEATURES = [
  "time"
  ]

+ # User-defined features may be anything, but if the user needs something
+ # very specific for the pipeline, having them protected is a nice feature.
  for ii in range(10):
  PROTECTED_FEATURES.append(f"userdef{ii}")
dcnum/read/hdf5_data.py CHANGED
@@ -1,7 +1,9 @@
  from __future__ import annotations

+ import hashlib
  import io
  import json
+ import numbers
  import pathlib
  import tempfile
  from typing import Dict, BinaryIO, List
@@ -13,6 +15,7 @@ import numpy as np

  from .cache import HDF5ImageCache, ImageCorrCache, md5sum
  from .const import PROTECTED_FEATURES
+ from .mapped import get_mapped_object, get_mapping_indices


  class HDF5Data:
@@ -26,12 +29,47 @@ class HDF5Data:
  logs: Dict[List[str]] = None,
  tables: Dict[np.ndarray] = None,
  image_cache_size: int = 2,
+ index_mapping: int | slice | List | np.ndarray = None,
  ):
+ """
+
+ Parameters
+ ----------
+ path:
+ path to data file
+ pixel_size:
+ pixel size in µm
+ md5_5m:
+ MD5 sum of the first 5 MiB; computed if not provided
+ meta:
+ metadata dictionary; extracted from HDF5 attributes
+ if not provided
+ basins:
+ list of basin dictionaries; extracted from HDF5 attributes
+ if not provided
+ logs:
+ dictionary of logs; extracted from HDF5 attributes
+ if not provided
+ tables:
+ dictionary of tables; extracted from HDF5 attributes
+ if not provided
+ image_cache_size:
+ size of the image cache to use when accessing image data
+ index_mapping:
+ select only a subset of input events, transparently reducing the
+ size of the dataset, possible data types are
+ - int `N`: use the first `N` events
+ - slice: use the events defined by a slice
+ - list: list of integers specifying the event indices to use
+ Numpy indexing rules apply. E.g. to only process the first
+ 100 events, set this to `100` or `slice(0, 100)`.
+ """
  # Init is in __setstate__ so we can pickle this class
  # and use it for multiprocessing.
  if isinstance(path, h5py.File):
  self.h5 = path
  path = path.filename
+
  self.__setstate__({"path": path,
  "pixel_size": pixel_size,
  "md5_5m": md5_5m,
@@ -40,6 +78,7 @@ class HDF5Data:
  "logs": logs,
  "tables": tables,
  "image_cache_size": image_cache_size,
+ "index_mapping": index_mapping,
  })

  def __contains__(self, item):
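Note: a minimal usage sketch of the new `index_mapping` argument (file path and feature name are hypothetical; the file is assumed to contain at least 100 events):

    import pathlib
    from dcnum.read import HDF5Data

    # only the first 100 events of the file are exposed
    hd = HDF5Data(pathlib.Path("measurement.rtdc"), index_mapping=slice(0, 100))
    print(len(hd))          # 100
    deform = hd["deform"]   # scalar features come back already index-mapped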
@@ -53,7 +92,7 @@ class HDF5Data:

  def __getitem__(self, feat):
  if feat in ["image", "image_bg", "mask"]:
- data = self.get_image_cache(feat)
+ data = self.get_image_cache(feat) # already index-mapped
  if data is None:
  raise KeyError(f"Feature '{feat}' not found in {self}!")
  else:
@@ -62,19 +101,25 @@ class HDF5Data:
  return self._cache_scalar[feat]
  elif (feat in self.h5["events"]
  and len(self.h5["events"][feat].shape) == 1): # cache scalar
- self._cache_scalar[feat] = self.h5["events"][feat][:]
+ if self.index_mapping is None:
+ idx_map = slice(None) # no mapping indices, just slice
+ else:
+ idx_map = get_mapping_indices(self.index_mapping)
+ self._cache_scalar[feat] = self.h5["events"][feat][idx_map]
  return self._cache_scalar[feat]
  else:
  if feat in self.h5["events"]:
  # Not cached (possibly slow)
  warnings.warn(f"Feature {feat} not cached (possibly slow)")
- return self.h5["events"][feat]
+ return get_mapped_object(
+ obj=self.h5["events"][feat],
+ index_mapping=self.index_mapping)
  else:
  # Check the basins
  for idx in range(len(self.basins)):
  bn, bn_features = self.get_basin_data(idx)
  if bn_features and feat in bn_features:
- return bn[feat]
+ return bn[feat] # already index-mapped
  # If we got here, then the feature data does not exist.
  raise KeyError(f"Feature '{feat}' not found in {self}!")

@@ -86,13 +131,14 @@ class HDF5Data:
  "logs": self.logs,
  "tables": self.tables,
  "basins": self.basins,
- "image_cache_size": self.image.cache_size
+ "image_cache_size": self.image.cache_size,
+ "index_mapping": self.index_mapping,
  }

  def __setstate__(self, state):
  # Make sure these properties exist (we rely on __init__, because
  # we want this class to be pickable and __init__ is not called by
- # `pickle.load`.
+ # `pickle.load`).
  # Cached properties
  self._feats = None
  self._keys = None
@@ -116,7 +162,7 @@ class HDF5Data:
  if self.md5_5m is None:
  if isinstance(self.path, pathlib.Path):
  # 5MB md5sum of input file
- self.md5_5m = md5sum(self.path, count=80)
+ self.md5_5m = md5sum(self.path, blocksize=65536, count=80)
  else:
  self.md5_5m = str(uuid.uuid4()).replace("-", "")
  self.meta = state["meta"]
@@ -165,12 +211,17 @@ class HDF5Data:

  self.image_cache_size = state["image_cache_size"]

+ self.index_mapping = state["index_mapping"]
+
  if self.h5 is None:
  self.h5 = h5py.File(self.path, libver="latest")

  def __len__(self):
  if self._len is None:
- self._len = self.h5.attrs["experiment:event count"]
+ if self.index_mapping is not None:
+ self._len = get_mapping_indices(self.index_mapping).size
+ else:
+ self._len = self.h5.attrs["experiment:event count"]
  return self._len

  @property
@@ -244,7 +295,9 @@ class HDF5Data:
  self.h5.close()

  def get_ppid(self):
- return self.get_ppid_from_ppkw({"pixel_size": self.pixel_size})
+ return self.get_ppid_from_ppkw(
+ {"pixel_size": self.pixel_size,
+ "index_mapping": self.index_mapping})

  @classmethod
  def get_ppid_code(cls):
@@ -255,20 +308,60 @@ class HDF5Data:
  # Data does not really fit into the PPID scheme we use for the rest
  # of the pipeline. This implementation here is custom.
  code = cls.get_ppid_code()
- kwid = f"p={kwargs['pixel_size']:.8f}".rstrip("0")
+ # pixel size
+ ppid_ps = f"{kwargs['pixel_size']:.8f}".rstrip("0")
+ # index mapping
+ ppid_im = cls.get_ppid_index_mapping(kwargs.get("index_mapping", None))
+ kwid = "^".join([f"p={ppid_ps}", f"i={ppid_im}"])
  return ":".join([code, kwid])

+ @staticmethod
+ def get_ppid_index_mapping(index_mapping):
+ """Return the pipeline identifier part for index mapping"""
+ im = index_mapping
+ if im is None:
+ dim = "0"
+ elif isinstance(im, numbers.Integral):
+ dim = f"{im}"
+ elif isinstance(im, slice):
+ dim = (f"{im.start if im.start is not None else 'n'}"
+ + f"-{im.stop if im.stop is not None else 'n'}"
+ + f"-{im.step if im.step is not None else 'n'}"
+ )
+ elif isinstance(im, (list, np.ndarray)):
+ idhash = hashlib.md5(
+ np.array(im, dtype=np.uint32).tobytes()).hexdigest()
+ dim = f"h-{idhash[:8]}"
+ else:
+ dim = "unknown"
+ return dim
+
  @staticmethod
  def get_ppkw_from_ppid(dat_ppid):
  # Data does not fit in the PPID scheme we use, but we still
  # would like to pass pixel_size to __init__ if we need it.
- code, pp_dat_kwargs = dat_ppid.split(":")
+ code, kwargs_str = dat_ppid.split(":")
  if code != HDF5Data.get_ppid_code():
  raise ValueError(f"Could not find data method '{code}'!")
- p, val = pp_dat_kwargs.split("=")
- if p != "p":
- raise ValueError(f"Invalid parameter '{p}'!")
- return {"pixel_size": float(val)}
+ kwitems = kwargs_str.split("^")
+ kwargs = {}
+ for item in kwitems:
+ var, val = item.split("=")
+ if var == "p":
+ kwargs["pixel_size"] = float(val)
+ elif var == "i":
+ if val.startswith("h-") or val == "unknown":
+ raise ValueError(f"Cannot invert index mapping {val}")
+ elif val == "0":
+ kwargs["index_mapping"] = None
+ elif val.count("-"):
+ start, stop = [int(v) for v in val.split("-")]
+ kwargs["index_mapping"] = slice(start, stop)
+ else:
+ kwargs["index_mapping"] = int(val)
+ else:
+ raise ValueError(f"Invalid parameter '{var}'!")
+ return kwargs

  def get_basin_data(self, index):
  """Return HDF5Data info for a basin index in `self.basins`
@@ -298,7 +391,7 @@ class HDF5Data:
  if path is None:
  self._basin_data[index] = (None, None)
  else:
- h5dat = HDF5Data(path)
+ h5dat = HDF5Data(path, index_mapping=self.index_mapping)
  features = bn_dict.get("features")
  if features is None:
  # Only get the features from the actual HDF5 file.
@@ -336,7 +429,8 @@ class HDF5Data:

  if ds is not None:
  image = HDF5ImageCache(
- h5ds=ds,
+ h5ds=get_mapped_object(obj=ds,
+ index_mapping=self.index_mapping),
  cache_size=self.image_cache_size,
  boolean=feat == "mask")
  else:
@@ -386,6 +480,7 @@ def concatenated_hdf5_data(paths: List[pathlib.Path],
  - If one of the input files does not contain a feature from the first
  input `paths`, then a `ValueError` is raised. Use the `features`
  argument to specify which features you need instead.
+ - Basins are not considered.
  """
  h5kwargs = {"mode": "w", "libver": "latest"}
  if isinstance(path_out, (pathlib.Path, str)):
dcnum/read/mapped.py ADDED
@@ -0,0 +1,79 @@
+ import functools
+
+ import numbers
+
+ import h5py
+ import numpy as np
+
+
+ class MappedHDF5Dataset:
+ def __init__(self,
+ h5ds: h5py.Dataset,
+ mapping_indices: np.ndarray):
+ """An index-mapped object for accessing an HDF5 dataset
+
+ Parameters
+ ----------
+ h5ds: h5py.Dataset
+ HDF5 dataset from which to map data
+ mapping_indices: np.ndarray
+ numpy indexing array containing integer indices
+ """
+ self.h5ds = h5ds
+ self.mapping_indices = mapping_indices
+ self.shape = (mapping_indices.size,) + h5ds.shape[1:]
+
+ def __getitem__(self, idx):
+ if isinstance(idx, numbers.Integral):
+ return self.h5ds[self.mapping_indices[idx]]
+ else:
+ idx_mapped = self.mapping_indices[idx]
+ return self.h5ds[idx_mapped]
+
+
+ def get_mapping_indices(
+ index_mapping: numbers.Integral | slice | list | np.ndarray
+ ):
+ if isinstance(index_mapping, numbers.Integral):
+ return _get_mapping_indices_cached(index_mapping)
+ elif isinstance(index_mapping, slice):
+ return _get_mapping_indices_cached(
+ (index_mapping.start, index_mapping.stop, index_mapping.step))
+ elif isinstance(index_mapping, (np.ndarray, list)):
+ return np.array(index_mapping, dtype=np.uint32)
+ else:
+ raise ValueError(f"Invalid type for `index_mapping`: "
+ f"{type(index_mapping)} ({index_mapping})")
+
+
+ @functools.lru_cache(maxsize=100)
+ def _get_mapping_indices_cached(
+ index_mapping: numbers.Integral | tuple
+ ):
+ if isinstance(index_mapping, numbers.Integral):
+ return np.arange(index_mapping)
+ elif isinstance(index_mapping, tuple):
+ im_slice = slice(*index_mapping)
+ if im_slice.step is not None:
+ raise NotImplementedError("Slices with step not implemented yet")
+ if im_slice.stop is None or im_slice.start is None:
+ raise NotImplementedError(
+ "Slices must have start and stop defined")
+ return np.arange(im_slice.start, im_slice.stop)
+ elif isinstance(index_mapping, list):
+ return np.array(index_mapping, dtype=np.uint32)
+ else:
+ raise ValueError(f"Invalid type for cached `index_mapping`: "
+ f"{type(index_mapping)} ({index_mapping})")
+
+
+ def get_mapped_object(obj, index_mapping=None):
+ if index_mapping is None:
+ return obj
+ elif isinstance(obj, h5py.Dataset):
+ return MappedHDF5Dataset(
+ obj,
+ mapping_indices=get_mapping_indices(index_mapping))
+ else:
+ raise ValueError(f"No recipe to convert object of type {type(obj)} "
+ f"({obj}) to an index-mapped object")
dcnum/segm/segm_thresh.py CHANGED
@@ -16,7 +16,7 @@ class SegmentThresh(CPUSegmenter):
  Parameters
  ----------
  thresh: int
- grayscale threhold value for creating the mask image;
+ grayscale threshold value for creating the mask image;
  For a background-corrected image, pixels with values below
  this value are considered to be part of the mask.
  """
@@ -25,7 +25,7 @@ class SegmentThresh(CPUSegmenter):
  @staticmethod
  def segment_approach(image, *,
  thresh: float = -6):
- """Mask retrieval as it is done in Shape-In
+ """Mask retrieval using basic thresholding

  Parameters
  ----------
@@ -39,7 +39,7 @@ class SegmentThresh(CPUSegmenter):
  Returns
  -------
  mask: 2d boolean ndarray
- Mask image for the give index
+ Mask image for the given index
  """
  assert thresh < 0, "threshold values above zero not supported!"
  return image < thresh
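Note: the thresholding rule documented above amounts to a single comparison on the background-corrected image. A small sketch with a made-up array and the default `thresh=-6`:

    import numpy as np

    image_corr = np.array([[0, -2, -8],
                           [-1, -7, -3]], dtype=np.int16)
    mask = image_corr < -6  # pixels darker than the threshold form the mask
    # array([[False, False,  True],
    #        [False,  True, False]])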