dcnum 0.19.1-py3-none-any.whl → 0.20.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dcnum might be problematic.

dcnum/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '0.19.1'
- __version_tuple__ = version_tuple = (0, 19, 1)
+ __version__ = version = '0.20.1'
+ __version_tuple__ = version_tuple = (0, 20, 1)
dcnum/feat/event_extractor_manager_thread.py CHANGED
@@ -96,8 +96,8 @@ class EventExtractorManagerThread(threading.Thread):
  # If the writer_dq starts filling up, then this could lead to
  # an oom-kill signal. Stall for the writer to prevent this.
  ldq = len(self.writer_dq)
- if ldq > 100:
- stallsec = ldq / 100
+ if ldq > 1000:
+ stallsec = ldq / 1000
  self.logger.warning(
  f"Stalling {stallsec:.1f}s for slow writer")
  time.sleep(stallsec)
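
The threshold change above makes the manager stall only once more than 1000 chunks are queued, sleeping roughly one second per 1000 queued chunks. A standalone sketch of that backpressure rule (illustrative only; `writer_dq` and `logger` stand in for the thread's deque and logger):

    import time

    def stall_for_writer(writer_dq, logger):
        # Backpressure heuristic mirroring the hunk above: once the writer
        # deque holds more than 1000 queued chunks, sleep about one second
        # per 1000 entries so the writer can catch up.
        ldq = len(writer_dq)
        if ldq > 1000:
            stallsec = ldq / 1000
            logger.warning(f"Stalling {stallsec:.1f}s for slow writer")
            time.sleep(stallsec)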
dcnum/feat/feat_background/base.py CHANGED
@@ -62,8 +62,8 @@ class Background(abc.ABC):

  #: number of images in the input data
  self.image_count = None
- #: number of images that have been processed
- self.image_proc = mp_spawn.Value("L", 0)
+ #: fraction images that have been processed
+ self.image_proc = mp_spawn.Value("d", 0)

  #: HDF5Data instance for input data
  self.hdin = None
@@ -185,7 +185,7 @@ class Background(abc.ABC):
  if self.image_count == 0:
  return 0.
  else:
- return self.image_proc.value / self.image_count
+ return self.image_proc.value

  def process(self):
  # Delete any old background data
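
With this change, `image_proc` holds a fraction in [0, 1] in a shared double (typecode "d") instead of an absolute image count (typecode "L"), so the progress property can return the value directly. A minimal sketch of the shared-value mechanics, independent of dcnum (the batch numbers are made up):

    import multiprocessing as mp

    mp_spawn = mp.get_context("spawn")
    image_proc = mp_spawn.Value("d", 0)  # shared double holding a fraction

    # a background worker would bump the fraction as batches complete:
    batch_size, image_count = 500, 10000
    image_proc.value += batch_size / image_count
    print(f"{image_proc.value:.0%} done")  # -> 5% done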
dcnum/feat/feat_background/bg_copy.py CHANGED
@@ -9,17 +9,19 @@ class BackgroundCopy(Background):
  pass

  def process(self):
- """Perform median computation on entire input data"""
+ """Copy input data to output dataset"""
  if self.h5in != self.h5out:
- hin = self.hdin.image_bg.h5ds
- h5py.h5o.copy(src_loc=hin.parent.id,
- src_name=b"image_bg",
- dst_loc=self.h5out["events"].id,
- dst_name=b"image_bg",
- )
+ hin = self.hdin.h5
+ for feat in ["image_bg", "bg_off"]:
+ if feat in hin["events"]:
+ h5py.h5o.copy(src_loc=hin["events"].id,
+ src_name=feat.encode("utf-8"),
+ dst_loc=self.h5out["events"].id,
+ dst_name=feat.encode("utf-8"),
+ )

  # set progress to 100%
- self.image_proc.value = self.image_count
+ self.image_proc.value = 1

  def process_approach(self):
  # We do the copying in `process`, because we do not want to modify
dcnum/feat/feat_background/bg_roll_median.py CHANGED
@@ -184,7 +184,7 @@ class BackgroundRollMed(Background):
  num_remaining,
  axis=0)
  self.writer.store_feature_chunk("image_bg", last_chunk)
- self.image_proc.value += num_remaining
+ self.image_proc.value = 1

  def process_next_batch(self):
  """Process one batch of input data"""
@@ -223,7 +223,7 @@ class BackgroundRollMed(Background):
  )

  self.current_batch += 1
- self.image_proc.value += self.batch_size
+ self.image_proc.value += self.batch_size / self.image_count


  class WorkerRollMed(mp_spawn.Process):
dcnum/feat/feat_background/bg_sparse_median.py CHANGED
@@ -329,7 +329,7 @@ class BackgroundSparseMed(Background):
  # Fill up remainder of index array with last entry
  bg_idx[idx1:] = ii

- self.image_proc.value = self.image_count
+ self.image_proc.value = 1

  # Write background data
  pos = 0
@@ -393,7 +393,7 @@ class BackgroundSparseMed(Background):

  self.bg_images[ii] = self.shared_output.reshape(self.image_shape)

- self.image_proc.value = idx_stop
+ self.image_proc.value = idx_stop / self.image_count


  class WorkerSparseMed(mp_spawn.Process):
dcnum/logic/ctrl.py CHANGED
@@ -14,6 +14,7 @@ import traceback
  import uuid

  import h5py
+ import numpy as np

  from ..feat.feat_background.base import get_available_background_methods
  from ..feat.queue_event_extractor import QueueEventExtractor
@@ -21,10 +22,10 @@ from ..feat import gate
  from ..feat import EventExtractorManagerThread
  from ..segm import SegmenterManagerThread, get_available_segmenters
  from ..meta import ppid
- from ..read import HDF5Data
- from .._version import version_tuple
+ from ..read import HDF5Data, get_mapping_indices
+ from .._version import version, version_tuple
  from ..write import (
- DequeWriterThread, HDF5Writer, QueueCollectorThread,
+ DequeWriterThread, HDF5Writer, QueueCollectorThread, copy_features,
  copy_metadata, create_with_basins, set_default_filter_kwargs
  )

@@ -43,6 +44,7 @@ valid_states = [
  "setup",
  "background",
  "segmentation",
+ "plumbing",
  "cleanup",
  "done",
  "error",
@@ -79,8 +81,9 @@ class DCNumJobRunner(threading.Thread):
  # current job state
  self._state = "init"
  # overall progress [0, 1]
- self._progress_bg = None
- self._progress_ex = None
+ self._progress_bg = None # background
+ self._progress_ex = None # segmentation
+ self._progress_bn = None # creating basins
  # segmentation frame rate
  self._segm_rate = 0

@@ -237,8 +240,12 @@ class DCNumJobRunner(threading.Thread):
  # how much fractional time each processing step takes.
  bgw = 4 # fraction of background
  exw = 27 # fraction of segmentation and feature extraction
+ if self.job["basin_strategy"] == "drain":
+ drw = 15 # because data need to be copied
+ else:
+ drw = 1 # just creating the basins in output file
  clw = 1 # fraction of cleanup operations
- tot = bgw + exw + clw
+ tot = bgw + exw + drw + clw
  progress = 0
  st = self.state

@@ -247,15 +254,22 @@ class DCNumJobRunner(threading.Thread):
  # background already computed
  progress += bgw / tot
  elif self._progress_bg is not None:
- # This is the image count of the input dataset
- progress += bgw / tot * (self._progress_bg.value / len(self.draw))
+ # This is the image count of the input dataset.
+ progress += self._progress_bg.value * bgw / tot

  # segmentation
  if valid_states.index(st) > valid_states.index("segmentation"):
  # segmentation already done
  progress += exw / tot
  elif self._progress_ex is not None:
- progress += exw / tot * self._progress_ex
+ progress += self._progress_ex * exw / tot
+
+ # draining basins
+ if valid_states.index(st) > valid_states.index("plumbing"):
+ # plumbing already done
+ progress += drw / tot
+ if self._progress_bn is not None:
+ progress += self._progress_bn * drw / tot

  if self.state == "done":
  progress = 1
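
For orientation, the weights above translate into the following split of the overall progress bar (a worked sketch of the arithmetic, not code from the package):

    bgw, exw, clw = 4, 27, 1
    for strategy, drw in [("drain", 15), ("tap", 1)]:
        tot = bgw + exw + drw + clw
        print(strategy,
              {"background": round(bgw / tot, 2),
               "segmentation": round(exw / tot, 2),
               "plumbing": round(drw / tot, 2),
               "cleanup": round(clw / tot, 2)})
    # drain: tot = 47, segmentation ~0.57, plumbing ~0.32
    # tap:   tot = 33, segmentation ~0.82, plumbing ~0.03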
@@ -371,16 +385,20 @@ class DCNumJobRunner(threading.Thread):
  # Note any new actions that work on `self.path_temp_in` are not
  # reflected in `self.path_temp_out`.
  self.path_temp_in.rename(self.path_temp_out)
-
- self.state = "cleanup"
-
- # The user would normally expect the output file to be something
- # that is self-contained (copying the file wildly across file
- # systems and network shares should not impair feature availability).
- # Therefore, we copy any remaining basin-based features to the
- # temporary output file.
- if self.job["no_basins_in_output"]:
- self.task_transfer_basin_data()
+ # Since no segmentation was done, the output file now does not
+ # contain any events. This is not really what we wanted, but we
+ # can still store all features in the output file if required.
+ if self.job["basin_strategy"] == "drain":
+ orig_feats = []
+ for feat in self.draw.h5["events"].keys():
+ if isinstance(self.draw.h5["events"][feat], h5py.Dataset):
+ # copy_features does not support Groups
+ orig_feats.append(feat)
+ with h5py.File(self.path_temp_out, "a") as h5_dst:
+ copy_features(h5_src=self.draw.h5,
+ h5_dst=h5_dst,
+ features=orig_feats,
+ mapping=None)

  with HDF5Writer(self.path_temp_out) as hw:
  # pipeline metadata
@@ -433,7 +451,8 @@ class DCNumJobRunner(threading.Thread):
  with h5py.File(self.job["path_in"]) as h5_src:
  copy_metadata(h5_src=h5_src,
  h5_dst=hw.h5,
- # don't copy basins
+ # Don't copy basins, we would have to index-map
+ # them first.
  copy_basins=False)
  if redo_seg:
  # Store the correct measurement identifier. This is used to
@@ -450,6 +469,12 @@ class DCNumJobRunner(threading.Thread):
  mid_new = f"{mid_cur}_{mid_ap}" if mid_cur else mid_ap
  hw.h5.attrs["experiment:run identifier"] = mid_new

+ # Handle basin data according to the user's request
+ self.state = "plumbing"
+ self.task_enforce_basin_strategy()
+
+ self.state = "cleanup"
+
  trun = datetime.timedelta(seconds=round(time.monotonic() - time_start))
  self.logger.info(f"Run duration: {str(trun)}")
  self.logger.info(time.strftime("Run stop: %Y-%m-%d-%H.%M.%S",
@@ -491,6 +516,115 @@ class DCNumJobRunner(threading.Thread):
  bic.process()
  self.logger.info("Finished background computation")

+ def task_enforce_basin_strategy(self):
+ """Transfer basin data from input files to output if requested
+
+ The user specified the "basin_strategy" keyword argument in
+ `self.job`. If this is set to "drain", then copy all basin
+ information from the input file to the output file. If it
+ is set to "tap", then only create basins in the output file.
+ """
+ self._progress_bn = 0
+ t0 = time.perf_counter()
+ # We need to make sure that the features are correctly attributed
+ # from the input files. E.g. if the input file already has
+ # background images, but we recompute the background images, then
+ # we have to use the data from the recomputed background file.
+ # We achieve this by keeping a specific order and only copying those
+ # features that we don't already have in the output file.
+ feats_raw = [
+ # 1. background data from the temporary input image
+ # (this must come before draw [sic!])
+ [self.dtin.h5, ["image_bg", "bg_off"], "critical"],
+ # 2. frame-based scalar features from the raw input file
+ # (e.g. "temp" or "frame")
+ [self.draw.h5, self.draw.features_scalar_frame, "optional"],
+ # 3. image features from the input file
+ [self.draw.h5, ["image", "image_bg", "bg_off"], "optional"],
+ ]
+ with h5py.File(self.path_temp_out, "a") as hout:
+ hw = HDF5Writer(hout)
+ # First, we have to determine the basin mapping from input to
+ # output. This information is stored by the QueueCollectorThread
+ # in the "basinmap0" feature, ready to be used by us.
+ if "index_unmapped" in hout["events"]:
+ # The unmapped indices enumerate the events in the output file
+ # with indices from the mapped input file. E.g. if for the
+ # first image in the input file, two events are found and for
+ # the second image in the input file, three events are found,
+ # then this would contain [0, 0, 1, 1, 1, ...]. If the index
+ # mapping of the input file was set to slice(1, 100), then the
+ # first image would not be there, and we would have
+ # [1, 1, 1, ...].
+ idx_um = hout["events/index_unmapped"]
+
+ # If we want to convert this to an actual basinmap feature,
+ # then we have to convert those indices to indices that map
+ # to the original input HDF5 file.
+ raw_im = self.draw.index_mapping
+ if raw_im is None:
+ self.logger.info("Input file mapped with basinmap0")
+ # Create a hard link to save time and space
+ hout["events/basinmap0"] = hout["events/index_unmapped"]
+ basinmap = idx_um
+ else:
+ basinmap = get_mapping_indices(raw_im)[idx_um]
+ # Store the mapped basin data in the output file.
+ hw.store_feature_chunk("basinmap0", basinmap)
+ # We don't need them anymore.
+ del hout["events/index_unmapped"]
+
+ # Note that `size_raw != (len(self.draw))` [sic!]. The former
+ # is the size of the raw dataset and the latter is its mapped
+ # size!
+ size_raw = self.draw.h5.attrs["experiment:event count"]
+ if (len(basinmap) == size_raw
+ and np.all(basinmap == np.arange(size_raw))):
+ # This means that the images in the input overlap perfectly
+ # with the images in the output, i.e. a "copy" segmenter
+ # was used or something is very reproducible.
+ # We set basinmap to None to be more efficient.
+ basinmap = None
+
+ else:
+ # The input is identical to the output, because we are using
+ # the same pipeline identifier.
+ basinmap = None
+
+ for hin, feats, importance in feats_raw:
+ # Only consider features that are available in the input
+ # and that are not already in the output.
+ feats = [f for f in feats
+ if (f in hin["events"] and f not in hout["events"])]
+ if not feats:
+ continue
+ elif (self.job["basin_strategy"] == "drain"
+ or importance == "critical"):
+ # DRAIN: Copy all features over to the output file.
+ self.logger.debug(f"Transferring {feats} to output file")
+ copy_features(h5_src=hin,
+ h5_dst=hout,
+ features=feats,
+ mapping=basinmap)
+ else:
+ # TAP: Create basins for the "optional" features in the
+ # output file. Note that the "critical" features never
+ # reach this case.
+ self.logger.debug(f"Creating basin for {feats}")
+ # Relative and absolute paths.
+ pin = pathlib.Path(hin.filename).resolve()
+ pout = pathlib.Path(hout.filename).resolve().parent
+ paths = [pin, os.path.relpath(pin, pout)]
+ hw.store_basin(name="dcnum basin",
+ features=feats,
+ mapping=basinmap,
+ paths=paths,
+ description=f"Created with dcnum {version}",
+ )
+ self._progress_bn += 1 / len(feats_raw)
+ t_tot = time.perf_counter() - t0
+ self.logger.info(f"Enforcing basin strategy time: {t_tot:.1f}s")
+
  def task_segment_extract(self):
  self.logger.info("Starting segmentation and feature extraction")
  # Start writer thread
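
The conversion from `index_unmapped` to `basinmap0` above can be illustrated with a small, self-contained sketch (hypothetical numbers; `np.arange` stands in for `get_mapping_indices(raw_im)`):

    import numpy as np

    # Input HDF5Data opened with index_mapping=slice(1, 100): raw frame 0 is
    # skipped, so get_mapping_indices would yield [1, 2, ..., 99].
    mapping_indices = np.arange(1, 100)
    # Two events found for the first mapped frame, three for the second:
    idx_um = np.array([0, 0, 1, 1, 1], dtype=np.uint32)
    basinmap = mapping_indices[idx_um]  # indices into the *raw* input file
    print(basinmap)  # -> [1 1 2 2 2]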
@@ -629,21 +763,6 @@ class DCNumJobRunner(threading.Thread):

  self.logger.info("Finished segmentation and feature extraction")

- def task_transfer_basin_data(self):
- with h5py.File(self.path_temp_out, "a") as hout:
- hd = HDF5Data(hout)
- for ii, _ in enumerate(hd.basins):
- hindat, features = hd.get_basin_data(ii)
- for feat in features:
- if feat not in hout["events"]:
- self.logger.debug(
- f"Transferring {feat} to output file")
- h5py.h5o.copy(src_loc=hindat.h5["events"].id,
- src_name=feat.encode(),
- dst_loc=hout["events"].id,
- dst_name=feat.encode(),
- )
-

  def join_thread_helper(thr, timeout, retries, logger, name):
  for _ in range(retries):
dcnum/logic/job.py CHANGED
@@ -3,7 +3,8 @@ import copy
  import inspect
  import multiprocessing as mp
  import pathlib
- from typing import Dict
+ from typing import Dict, Literal
+ import warnings

  from ..feat import QueueEventExtractor
  from ..feat.feat_background.base import get_available_background_methods
@@ -27,10 +28,62 @@ class DCNumPipelineJob:
  feature_kwargs: Dict = None,
  gate_code: str = "norm",
  gate_kwargs: Dict = None,
- no_basins_in_output: bool = True,
+ basin_strategy: Literal["drain", "tap"] = "drain",
+ no_basins_in_output: bool = None,
  num_procs: int = None,
  debug: bool = False,
  ):
+ """Pipeline job recipe
+
+ Parameters
+ ----------
+ path_in: pathlib.Path | str
+ input data path
+ path_out: pathlib.Path | str
+ output data path
+ data_code: str
+ code of input data reader to use
+ data_kwargs: dict
+ keyword arguments for data reader
+ background_code: str
+ code of background data computer to use
+ background_kwargs: dict
+ keyword arguments for background data computer
+ segmenter_code: str
+ code of segmenter to use
+ segmenter_kwargs: dict
+ keyword arguments for segmenter
+ feature_code: str
+ code of feature extractor
+ feature_kwargs: dict
+ keyword arguments for feature extractor
+ gate_code: str
+ code for gating/event filtering class
+ gate_kwargs: dict
+ keyword arguments for gating/event filtering class
+ basin_strategy: str
+ strategy on how to handle event data; In principle, not all
+ events have to be stored in the output file if basins are
+ defined, linking back to the original file.
+ - You can "drain" all basins which means that the output file
+ will contain all features, but will also be very big.
+ - You can "tap" the basins, including the input file, which means
+ that the output file will be comparatively small.
+ no_basins_in_output: bool
+ Deprecated
+ num_procs: int
+ Number of processes to use
+ debug: bool
+ Whether to be verbose and use threads instead of processes
+ """
+ if no_basins_in_output is not None:
+ warnings.warn("The `no_basins_in_output` keyword argument is "
+ "deprecated. Please use `basin_strategy` instead.")
+ if no_basins_in_output:
+ basin_strategy = "drain"
+ else:
+ basin_strategy = "tap"
+
  #: initialize keyword arguments for this job
  self.kwargs = {}
  spec = inspect.getfullargspec(DCNumPipelineJob.__init__)
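
As a rough orientation for the new keyword argument, a job recipe could be set up as below. This is a minimal sketch only; it assumes `DCNumPipelineJob` is importable from `dcnum.logic` and that the job is then handed to `DCNumJobRunner` as usual (check the dcnum documentation for the exact runner API):

    import pathlib
    from dcnum.logic import DCNumPipelineJob

    # "tap" keeps the output small by creating basins that reference the
    # input file; "drain" (the default) copies all feature data over.
    job = DCNumPipelineJob(path_in=pathlib.Path("measurement.rtdc"),
                           path_out=pathlib.Path("measurement_dcn.rtdc"),
                           basin_strategy="tap")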
dcnum/read/__init__.py CHANGED
@@ -2,4 +2,4 @@
  from .cache import md5sum
  from .const import PROTECTED_FEATURES
  from .hdf5_data import HDF5Data, HDF5ImageCache, concatenated_hdf5_data
- from .mapped import get_mapping_indices, get_mapped_object
+ from .mapped import get_mapping_indices, get_mapped_object
dcnum/read/cache.py CHANGED
@@ -22,6 +22,7 @@ class BaseImageChunkCache(abc.ABC):
  cache_size: int = 2,
  ):
  self.shape = shape
+ self._dtype = None
  chunk_size = min(shape[0], chunk_size)
  self._len = self.shape[0]
  #: This is a FILO cache for the chunks
@@ -33,12 +34,32 @@ class BaseImageChunkCache(abc.ABC):
  self.num_chunks = int(np.ceil(self._len / (self.chunk_size or 1)))

  def __getitem__(self, index):
- chunk_index, sub_index = self._get_chunk_index_for_index(index)
- return self.get_chunk(chunk_index)[sub_index]
+ if isinstance(index, (slice, list, np.ndarray)):
+ if isinstance(index, slice):
+ indices = np.arange(index.start or 0,
+ index.stop or len(self),
+ index.step)
+ else:
+ indices = index
+ array_out = np.empty((len(indices),) + self.image_shape,
+ dtype=self.dtype)
+ for ii, idx in enumerate(indices):
+ array_out[ii] = self[idx]
+ return array_out
+ else:
+ chunk_index, sub_index = self._get_chunk_index_for_index(index)
+ return self.get_chunk(chunk_index)[sub_index]

  def __len__(self):
  return self._len

+ @property
+ def dtype(self):
+ """data type of the image data"""
+ if self._dtype is None:
+ self._dtype = self[0].dtype
+ return self._dtype
+
  @abc.abstractmethod
  def _get_chunk_data(self, chunk_slice):
  """Implemented in subclass to obtain actual data"""
@@ -50,6 +71,7 @@ class BaseImageChunkCache(abc.ABC):
  raise IndexError(
  f"Index {index} out of bounds for HDF5ImageCache "
  f"of size {self._len}")
+ index = int(index) # convert np.uint64 to int, so we get ints below
  chunk_index = index // self.chunk_size
  sub_index = index % self.chunk_size
  return chunk_index, sub_index
dcnum/read/const.py CHANGED
@@ -8,7 +8,7 @@ PROTECTED_FEATURES = [
  "pressure",
  "temp",
  "temp_amb",
- "time"
+ "time",
  ]

  # User-defined features may be anything, but if the user needs something
dcnum/read/hdf5_data.py CHANGED
@@ -186,25 +186,27 @@ class HDF5Data:
  if isinstance(self.meta[key], bytes):
  self.meta[key] = self.meta[key].decode("utf-8")
  # logs
- for key in h5.get("logs", []):
+ for key in sorted(h5.get("logs", {}).keys()):
  alog = list(h5["logs"][key])
  if alog:
  if isinstance(alog[0], bytes):
  alog = [ll.decode("utf") for ll in alog]
  self.logs[key] = alog
  # tables
- for tab in h5.get("tables", []):
+ for tab in sorted(h5.get("tables", {}).keys()):
  tabdict = {}
  for tkey in h5["tables"][tab].dtype.fields.keys():
  tabdict[tkey] = \
  np.array(h5["tables"][tab][tkey]).reshape(-1)
  self.tables[tab] = tabdict
  # basins
- for bnkey in h5.get("basins", []):
+ basins = []
+ for bnkey in h5.get("basins", {}).keys():
  bn_data = "\n".join(
  [s.decode() for s in h5["basins"][bnkey][:].tolist()])
  bn_dict = json.loads(bn_data)
- self.basins.append(bn_dict)
+ basins.append(bn_dict)
+ self.basins = sorted(basins, key=lambda x: x["name"])

  if state["pixel_size"] is not None:
  self.pixel_size = state["pixel_size"]
@@ -395,7 +397,22 @@ class HDF5Data:
  if path is None:
  self._basin_data[index] = (None, None)
  else:
- h5dat = HDF5Data(path, index_mapping=self.index_mapping)
+ feat_basinmap = bn_dict.get("mapping", None)
+ if feat_basinmap is None:
+ # This is NOT a mapped basin.
+ index_mapping = self.index_mapping
+ else:
+ # This is a mapped basin. Create an indexing list.
+ if self.index_mapping is None:
+ # The current dataset is not mapped.
+ basinmap_idx = slice(None)
+ else:
+ # The current dataset is also mapped.
+ basinmap_idx = get_mapping_indices(self.index_mapping)
+ basinmap = self.h5[f"events/{feat_basinmap}"]
+ index_mapping = basinmap[basinmap_idx]
+
+ h5dat = HDF5Data(path, index_mapping=index_mapping)
  features = bn_dict.get("features")
  if features is None:
  # Only get the features from the actual HDF5 file.
@@ -420,21 +437,27 @@ class HDF5Data:
  if feat not in self._image_cache:
  if f"events/{feat}" in self.h5:
  ds = self.h5[f"events/{feat}"]
+ idx_map = self.index_mapping
  else:
+ idx_map = None
  # search all basins
  for idx in range(len(self.basins)):
- bndat, features = self.get_basin_data(idx)
+ bn_dat, features = self.get_basin_data(idx)
  if features is not None:
  if feat in features:
- ds = bndat.h5[f"events/{feat}"]
+ # HDF5 dataset
+ ds = bn_dat.h5[f"events/{feat}"]
+ # Index mapping (taken from the basins which
+ # already includes the mapping from the current
+ # instance).
+ idx_map = bn_dat.index_mapping
  break
  else:
  ds = None

  if ds is not None:
  image = HDF5ImageCache(
- h5ds=get_mapped_object(obj=ds,
- index_mapping=self.index_mapping),
+ h5ds=get_mapped_object(obj=ds, index_mapping=idx_map),
  cache_size=self.image_cache_size,
  boolean=feat == "mask")
  else:
dcnum/read/mapped.py CHANGED
@@ -34,6 +34,16 @@ class MappedHDF5Dataset:
  def get_mapping_indices(
  index_mapping: numbers.Integral | slice | list | np.ndarray
  ):
+ """Return integer numpy array with mapping indices for a range
+
+ Parameters
+ ----------
+ index_mapping: numbers.Integral | slice | list | np.ndarray
+ Several options you have here:
+ - integer: results in np.arrange(integer)
+ - slice: results in np.arrange(slice.start, slice.stop, slice.step)
+ - list or np.ndarray: returns the input as unit32 array
+ """
  if isinstance(index_mapping, numbers.Integral):
  return _get_mapping_indices_cached(index_mapping)
  elif isinstance(index_mapping, slice):
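
Going by the docstring above, the three accepted input types behave roughly as follows (a sketch based on the docstring; the exact dtype of the returned array is an implementation detail):

    from dcnum.read import get_mapping_indices

    print(get_mapping_indices(4))                # like np.arange(4) -> [0 1 2 3]
    print(get_mapping_indices(slice(1, 10, 2)))  # like np.arange(1, 10, 2) -> [1 3 5 7 9]
    print(get_mapping_indices([5, 2, 2]))        # the list as an integer array -> [5 2 2]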
dcnum/write/__init__.py CHANGED
@@ -2,4 +2,5 @@
  from .deque_writer_thread import DequeWriterThread
  from .queue_collector_thread import EventStash, QueueCollectorThread
  from .writer import (
- HDF5Writer, copy_metadata, create_with_basins, set_default_filter_kwargs)
+ HDF5Writer, copy_features, copy_metadata, create_with_basins,
+ set_default_filter_kwargs)
dcnum/write/deque_writer_thread.py CHANGED
@@ -1,14 +1,17 @@
  import collections
+ import logging
  import pathlib
  import threading
  import time

+ import h5py
+
  from .writer import HDF5Writer


  class DequeWriterThread(threading.Thread):
  def __init__(self,
- path_out: pathlib.Path,
+ path_out: pathlib.Path | h5py.File,
  dq: collections.deque,
  ds_kwds: dict = None,
  mode: str = "a",
@@ -24,6 +27,7 @@ class DequeWriterThread(threading.Thread):
  using `popleft()`.
  """
  super(DequeWriterThread, self).__init__(*args, **kwargs)
+ self.logger = logging.getLogger("dcnum.write.DequeWriterThread")
  if mode == "w":
  path_out.unlink(missing_ok=True)
  self.writer = HDF5Writer(path_out, mode=mode, ds_kwds=ds_kwds)
@@ -40,17 +44,21 @@ class DequeWriterThread(threading.Thread):
  self.may_stop_loop = True

  def run(self):
+ time_tot = 0
  while True:
  ldq = len(self.dq)
  if self.must_stop_loop:
  break
  elif ldq:
+ t0 = time.perf_counter()
  for _ in range(ldq):
  feat, data = self.dq.popleft()
  self.writer.store_feature_chunk(feat=feat, data=data)
+ time_tot += time.perf_counter() - t0
  elif self.may_stop_loop:
  break
  else:
  # wait for the next item to arrive
  time.sleep(.1)
+ self.logger.info(f"Disk time: {time_tot:.1f}s")
  self.writer.close()
dcnum/write/queue_collector_thread.py CHANGED
@@ -245,20 +245,14 @@
  # the events that we just saved.
  indices = stash.indices_for_data

- # Write all the scalar features.
- for feat in self.data.features_scalar_frame:
- self.writer_dq.append((feat, self.data[feat][indices]))
-
- # Write the image and background data.
- imdat = np.zeros((stash.size,) + self.data.image.image_shape,
- dtype=np.uint8)
- bgdat = np.zeros((stash.size,) + self.data.image.image_shape,
- dtype=np.uint8)
- for ii, idx in enumerate(indices):
- imdat[ii] = self.data.image[idx]
- bgdat[ii] = self.data.image_bg[idx]
- self.writer_dq.append(("image", imdat))
- self.writer_dq.append(("image_bg", bgdat))
+ # This is the unmapped index from the input HDF5Data instance.
+ # Unmapped means that this only enumerates HDF5Data, but since
+ # HDF5Data can be mapped, the index does not necessarily enumerate
+ # the underlying HDF5 file. Later on, we will have to convert this
+ # to the correct "basinmap0" feature
+ # (see `DCNumJobRunner.task_enforce_basin_strategy`)
+ self.writer_dq.append(("index_unmapped",
+ np.array(indices, dtype=np.uint32)))

  # Write the number of events.
  self.writer_dq.append(("nevents",
dcnum/write/writer.py CHANGED
@@ -115,6 +115,7 @@ class HDF5Writer:
  paths: List[str | pathlib.Path],
  features: List[str] = None,
  description: str | None = None,
+ mapping: np.ndarray = None
  ):
  """Write an HDF5-based file basin

@@ -128,6 +129,9 @@
  list of features provided by `paths`
  description: str
  optional string describing the basin
+ mapping: 1D array
+ integer array with indices that map the basin dataset
+ to this dataset
  """
  bdat = {
  "description": description,
@@ -136,8 +140,38 @@
  "paths": [str(pp) for pp in paths],
  "type": "file",
  }
+ # Explicit features stored in basin file
  if features is not None and len(features):
  bdat["features"] = features
+ # Mapped basin information
+ if mapping is not None:
+ events = self.h5.require_group("events")
+ # Reserve a mapping feature for this dataset
+ for ii in range(10): # basinmap0 to basinmap9
+ bm_cand = f"basinmap{ii}"
+ if bm_cand in events:
+ # There is a basin mapping defined in the file. Check
+ # whether it is identical to ours.
+ if np.all(events[bm_cand] == mapping):
+ # Great, we are done here.
+ feat_basinmap = bm_cand
+ break
+ else:
+ # This mapping belongs to a different basin,
+ # try the next mapping.
+ continue
+ else:
+ # The mapping is not defined in the dataset, and we may
+ # write it to a new feature.
+ feat_basinmap = bm_cand
+ self.store_feature_chunk(feat=feat_basinmap, data=mapping)
+ break
+ else:
+ raise ValueError(
+ "You have exhausted the usage of mapped basins for "
+ "the current dataset. Please revise your analysis "
+ "pipeline.")
+ bdat["mapping"] = feat_basinmap
  bstring = json.dumps(bdat, indent=2)
  # basin key is its hash
  key = hashlib.md5(bstring.encode("utf-8",
@@ -266,6 +300,63 @@ def create_with_basins(
  )


+ def copy_features(h5_src: h5py.File,
+ h5_dst: h5py.File,
+ features: List[str],
+ mapping: np.ndarray = None,
+ ):
+ """Copy feature data from one HDF5 file to another
+
+ The feature must not exist in the destination file.
+
+ Parameters
+ ----------
+ h5_src: h5py.File
+ Input HDF5File containing `features` in the "events" group
+ h5_dst: h5py.File
+ Output HDF5File opened in write mode not containing `features`
+ features: List[str]
+ List of features to copy from source to destination
+ mapping: 1D array
+ If given, contains indices in the input file that should be
+ written to the output file. If set to None, all features are written.
+ """
+ ei = h5_src["events"]
+ eo = h5_dst.require_group("events")
+ # This is the size of the output dataset
+ size = h5_dst.attrs["experiment:event count"]
+ hw = HDF5Writer(h5_dst)
+ for feat in features:
+ if feat in eo:
+ raise ValueError(f"Output file {h5_dst.filename} already contains "
+ f"the feature {feat}.")
+ if not isinstance(ei[feat], h5py.Dataset):
+ raise NotImplementedError(
+ f"Only dataset-based features are supported here, not {feat}")
+ if mapping is None:
+ # Just copy the data as-is.
+ h5py.h5o.copy(src_loc=ei.id,
+ src_name=feat.encode(),
+ dst_loc=eo.id,
+ dst_name=feat.encode(),
+ )
+ else:
+ # Perform mapping and store the features in chunks to keep
+ # memory usage down.
+ dsi = ei[feat]
+ chunk_size = hw.get_best_nd_chunks(dsi[0].shape, dsi.dtype)[0]
+ start = 0
+ while start < size:
+ chunk_idx = mapping[start:start + chunk_size]
+ # h5py only supports indexing in increasing order
+ chunk_unique, order = np.unique(chunk_idx, return_inverse=True)
+ data_unique = dsi[chunk_unique]
+ data = data_unique[order]
+ hw.store_feature_chunk(feat, data)
+ # increment start
+ start += chunk_size
+
+
  def copy_metadata(h5_src: h5py.File,
  h5_dst: h5py.File,
  copy_basins=True):
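
The mapped branch of `copy_features` leans on `np.unique(..., return_inverse=True)` because h5py fancy indexing requires strictly increasing, duplicate-free indices. A small self-contained sketch with made-up numbers (plain numpy arrays standing in for the HDF5 datasets):

    import numpy as np

    source = np.array([10, 11, 12, 13, 14])   # stands in for the source dataset
    chunk_idx = np.array([3, 1, 1, 4])        # mapped indices for one output chunk
    # Read each needed source index once, in increasing order, then expand
    # back to the requested (possibly repeated, unsorted) order:
    chunk_unique, order = np.unique(chunk_idx, return_inverse=True)
    data_unique = source[chunk_unique]        # [11 13 14]
    data = data_unique[order]                 # [13 11 11 14] == source[chunk_idx]
    print(data)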
{dcnum-0.19.1.dist-info → dcnum-0.20.1.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dcnum
- Version: 0.19.1
+ Version: 0.20.1
  Summary: numerics toolbox for imaging deformability cytometry
  Author: Maximilian Schlögel, Paul Müller
  Maintainer-email: Paul Müller <dev@craban.de>
{dcnum-0.19.1.dist-info → dcnum-0.20.1.dist-info}/RECORD RENAMED
@@ -1,14 +1,14 @@
  dcnum/__init__.py,sha256=hcawIKS7utYiOyVhOAX9t7K3xYzP1b9862VV0b6qSrQ,74
- dcnum/_version.py,sha256=wOLHPF5OO0ubEMjzvaXg4CVhVL1uy5Ci-sH1WTZH0Dg,413
+ dcnum/_version.py,sha256=cyxBp0FYMpyeeOYYUcvD5Pt3djNYQojwuNgSo8-1Bp4,413
  dcnum/feat/__init__.py,sha256=jUJYWTD3VIoDNKrmryXbjHb1rGwYtK4b7VPWihYgUoo,325
- dcnum/feat/event_extractor_manager_thread.py,sha256=Ocid_t1awH6pOmurCmKYkC51XsXB0-DoN3fzjFDgE4c,7129
+ dcnum/feat/event_extractor_manager_thread.py,sha256=mAjPnS7K-ZmKbWolTNCnjXe3e-y5canNhf1l_GRYil0,7131
  dcnum/feat/gate.py,sha256=svbObmqpYdqPawpfrsEjTiUPJXf24GrNi8PXTKT-z44,7225
  dcnum/feat/queue_event_extractor.py,sha256=XhA930QVQ1Z4saisbcGTrEut-fSgwTyfDn6b9GzD4iY,15644
  dcnum/feat/feat_background/__init__.py,sha256=OTmMuazHNaSrZb2XW4cnJ6PlgJLbKrPbaidpEixYa0A,341
- dcnum/feat/feat_background/base.py,sha256=IYBFfsGXBfmFnZfD9QrmfrXbJtFSfVOS-v-u-uxSThs,7985
- dcnum/feat/feat_background/bg_copy.py,sha256=muU-6eTUf3HTA2K2asrLWcR_hbRMjdygZROmjNXCm7Q,923
- dcnum/feat/feat_background/bg_roll_median.py,sha256=j3seExcWkk6IeFIOO4zkj-LIA7ryig9bmGYYj_dDgoM,13173
- dcnum/feat/feat_background/bg_sparse_median.py,sha256=-CShs4UAuZd00rACaXTZj3dccKevhcSGRsILFgMPLWo,20705
+ dcnum/feat/feat_background/base.py,sha256=phZdyOrHQPjvYlw1JQ8DkdXw5H2-eE1LfLGqCAo1rlo,7965
+ dcnum/feat/feat_background/bg_copy.py,sha256=PK8x4_Uph-_A6uszZC5uhe1gD1dSRdHnDMEsN0HSGHA,1034
+ dcnum/feat/feat_background/bg_roll_median.py,sha256=EyjstMDXFBYuJB1lN6g4Uw7tPm434X3hXQxKSqvcoJ4,13175
+ dcnum/feat/feat_background/bg_sparse_median.py,sha256=ab7Boj7cmr6PBdTbyWTj_yNNJSfuowr7u-iSGW989WI,20709
  dcnum/feat/feat_brightness/__init__.py,sha256=o6AebVlmydwNgVF5kW6ITqJyFreoKrU3Ki_3EC8If-s,155
  dcnum/feat/feat_brightness/bright_all.py,sha256=vf8xaYBdKD24hHUXdkI0_S7nbr7m49KW6gvuWvbHDVg,4545
  dcnum/feat/feat_brightness/common.py,sha256=JX49EszYDmnvoOKXFVV1CalEIWRmOuY5EryNbqGbdac,156
@@ -20,29 +20,29 @@ dcnum/feat/feat_texture/__init__.py,sha256=6StM9S540UVtdFFR3bHa7nfCTomeVdoo7Uy9C
  dcnum/feat/feat_texture/common.py,sha256=COXHpXS-7DMouGu3WF83I76L02Sr7P9re4lxajh6g0E,439
  dcnum/feat/feat_texture/tex_all.py,sha256=eGjjNfPpfZw7FA_VNFCIMiU38KD0qcGbxLciYy-tCiA,4097
  dcnum/logic/__init__.py,sha256=7J3GrwJInNQbrLk61HRIV7X7p69TAIbMYpR34hh6u14,177
- dcnum/logic/ctrl.py,sha256=FvVXbrP7WqgYeDznep0KyfMck3cbCO8Yoli8P6clRPc,27956
- dcnum/logic/job.py,sha256=M0Q-Rfcm-zkTXTQc79W6YSNUjUlgmRPG0Ikbdn1aOpY,4608
+ dcnum/logic/ctrl.py,sha256=eaA_eO8X9c8wXFo35GwcMZEKQwzsVual7JTNL9f12y4,34412
+ dcnum/logic/job.py,sha256=cF4bPiEy5UkDmQN91Ku2yxBW0nXBEmKTNkIHlL2LT-U,6724
  dcnum/logic/json_encoder.py,sha256=cxMnqisbKEVf-rVcw6rK2BBAb6iz_hKFaGl81kK36lQ,571
  dcnum/meta/__init__.py,sha256=AVqRgyKXO1orKnE305h88IBvoZ1oz6X11HN1WP5nGvg,60
  dcnum/meta/paths.py,sha256=J_ikeHzd7gEeRgAKjuayz3x6q4h1fOiDadM-ZxhAGm4,1053
  dcnum/meta/ppid.py,sha256=Q3jg8lZt5tlGIby_-7rBqTANesMjJrmxASXZhsvBD_Y,7706
- dcnum/read/__init__.py,sha256=8uGj4YN7pDP4FO9TkZWXrpScwTLVWSEZexFq-TS9vsA,215
- dcnum/read/cache.py,sha256=kC2Y9hXA92ARQ2Vgm1kBFCU-s6TPE1tPYvpzWI0aPow,5619
- dcnum/read/const.py,sha256=8ih8rlWM7ntp8phrr9dh22hXXb210igSCatOSI9Ou30,463
- dcnum/read/hdf5_data.py,sha256=psMN2CGorU4uFO1nlGcpUxKFLZ9HaKCReTi7tVx50tg,22291
- dcnum/read/mapped.py,sha256=Oh1jH2yVqWBPomEf8vlGvsGOMc02ldapAAjgNo-bS7g,2676
+ dcnum/read/__init__.py,sha256=ksLdV8EkOU3EPje8teCOSehcUeGAZfg9TQ5ltuEUgls,216
+ dcnum/read/cache.py,sha256=lisrGG7AyvVitf0h92wh5FvYCsxa0pWyGcAyYwGP-LQ,6471
+ dcnum/read/const.py,sha256=GG9iyXDtEldvJYOBnhZjlimzIeBMAt4bSr2-xn2gzzc,464
+ dcnum/read/hdf5_data.py,sha256=Yyq02UTILc5ZgIQXpR9Y0wuX2WT8s0g23PraI7KxmJY,23489
+ dcnum/read/mapped.py,sha256=UryArlrIsHxjOyimBL2Nooi3r73zuGtnGdqdxa6PK_g,3076
  dcnum/segm/__init__.py,sha256=iiq_1A9DU5wMUcKnsZ53E7NyzCkbZCJeUDimzunE-OM,247
  dcnum/segm/segm_thresh.py,sha256=lMf-lso_O_5Q5lJiiIQdYkM3zlj4uwNz9cNvLxVMeXc,1396
  dcnum/segm/segmenter.py,sha256=gVzmP6CuwI9Qfk8GN_xWGu_xbtVTOhxIOWn-2yr_H1Y,12220
  dcnum/segm/segmenter_cpu.py,sha256=IzhPNQaO4TBh3EzZqLGaBAeRryfBKnld7Joe8qY4AB4,10690
  dcnum/segm/segmenter_gpu.py,sha256=Au1MQdAalVsmJ-cmb3OcCmEMBfXSDuJjdXJTGqEIcG8,1962
  dcnum/segm/segmenter_manager_thread.py,sha256=xQEioOkASlm8DTdG0RBtjCJP1cOuiyJAm4q2n1l_tfM,5710
- dcnum/write/__init__.py,sha256=Cpn3LqL18hh8OScUnGp_AnNfpWPpKW-oAJZH6ot7aRA,241
- dcnum/write/deque_writer_thread.py,sha256=KpJ6po8JPlM696MITN-bhNnWQcy9E-qlhg9g-uzoPZg,1710
- dcnum/write/queue_collector_thread.py,sha256=YQ6pvKNmCDf1C6HVx6gOA-q-FBoI6nkhOo-tAVYnyag,11906
- dcnum/write/writer.py,sha256=nlJfQCPoW2Wze72y_256G4qmgYMdh5mL0vpvqg7lSaU,11728
- dcnum-0.19.1.dist-info/LICENSE,sha256=YRChA1C8A2E-amJbudwMcbTCZy_HzmeY0hMIvduh1MM,1089
- dcnum-0.19.1.dist-info/METADATA,sha256=tWMwpFt4Nn8vs1H0aRTBWDmH3pej-O3gWHJ6ESkbvSw,2194
- dcnum-0.19.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- dcnum-0.19.1.dist-info/top_level.txt,sha256=Hmh38rgG_MFTVDpUDGuO2HWTSq80P585Het4COQzFTg,6
- dcnum-0.19.1.dist-info/RECORD,,
+ dcnum/write/__init__.py,sha256=QvWHeZmjHI18i-YlGYuzN3i7dVWY9UCReKchrJ-gif0,260
+ dcnum/write/deque_writer_thread.py,sha256=ao7F1yrVKyufgC4rC0Y2_Vt7snuT6KpI7W2qVxcjdhk,1994
+ dcnum/write/queue_collector_thread.py,sha256=d_WfdsZdFnFsiAY0zVMwUlA4juIMeiWYmE_-rezBQCE,11734
+ dcnum/write/writer.py,sha256=e6J8YVqhS7kzkpPIMoDMokJpqSy1WWNdOrwaJof1oVc,15601
+ dcnum-0.20.1.dist-info/LICENSE,sha256=YRChA1C8A2E-amJbudwMcbTCZy_HzmeY0hMIvduh1MM,1089
+ dcnum-0.20.1.dist-info/METADATA,sha256=8hfnqtJ-lrkKlXnbWBGqRK4bSDDb0C4zmQDB6Os8f-U,2194
+ dcnum-0.20.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ dcnum-0.20.1.dist-info/top_level.txt,sha256=Hmh38rgG_MFTVDpUDGuO2HWTSq80P585Het4COQzFTg,6
+ dcnum-0.20.1.dist-info/RECORD,,
File without changes