dcnum-0.13.2-py3-none-any.whl → dcnum-0.23.1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.


This version of dcnum might be problematic.

Files changed (55)
  1. dcnum/_version.py +2 -2
  2. dcnum/feat/__init__.py +2 -1
  3. dcnum/feat/event_extractor_manager_thread.py +67 -33
  4. dcnum/feat/feat_background/__init__.py +3 -12
  5. dcnum/feat/feat_background/base.py +80 -65
  6. dcnum/feat/feat_background/bg_copy.py +31 -0
  7. dcnum/feat/feat_background/bg_roll_median.py +38 -30
  8. dcnum/feat/feat_background/bg_sparse_median.py +96 -45
  9. dcnum/feat/feat_brightness/__init__.py +1 -0
  10. dcnum/feat/feat_brightness/bright_all.py +41 -6
  11. dcnum/feat/feat_contour/__init__.py +4 -0
  12. dcnum/feat/{feat_moments/mt_legacy.py → feat_contour/moments.py} +32 -8
  13. dcnum/feat/feat_contour/volume.py +174 -0
  14. dcnum/feat/feat_texture/__init__.py +1 -0
  15. dcnum/feat/feat_texture/tex_all.py +28 -1
  16. dcnum/feat/gate.py +92 -70
  17. dcnum/feat/queue_event_extractor.py +139 -70
  18. dcnum/logic/__init__.py +5 -0
  19. dcnum/logic/ctrl.py +794 -0
  20. dcnum/logic/job.py +184 -0
  21. dcnum/logic/json_encoder.py +19 -0
  22. dcnum/meta/__init__.py +1 -0
  23. dcnum/meta/paths.py +30 -0
  24. dcnum/meta/ppid.py +66 -9
  25. dcnum/read/__init__.py +1 -0
  26. dcnum/read/cache.py +109 -77
  27. dcnum/read/const.py +6 -4
  28. dcnum/read/hdf5_data.py +190 -31
  29. dcnum/read/mapped.py +87 -0
  30. dcnum/segm/__init__.py +6 -15
  31. dcnum/segm/segm_thresh.py +7 -14
  32. dcnum/segm/segm_torch/__init__.py +19 -0
  33. dcnum/segm/segm_torch/segm_torch_base.py +125 -0
  34. dcnum/segm/segm_torch/segm_torch_mpo.py +71 -0
  35. dcnum/segm/segm_torch/segm_torch_sto.py +88 -0
  36. dcnum/segm/segm_torch/torch_model.py +95 -0
  37. dcnum/segm/segm_torch/torch_postproc.py +93 -0
  38. dcnum/segm/segm_torch/torch_preproc.py +114 -0
  39. dcnum/segm/segmenter.py +245 -96
  40. dcnum/segm/segmenter_manager_thread.py +39 -28
  41. dcnum/segm/{segmenter_cpu.py → segmenter_mpo.py} +137 -43
  42. dcnum/segm/segmenter_sto.py +110 -0
  43. dcnum/write/__init__.py +3 -1
  44. dcnum/write/deque_writer_thread.py +15 -5
  45. dcnum/write/queue_collector_thread.py +14 -17
  46. dcnum/write/writer.py +225 -55
  47. {dcnum-0.13.2.dist-info → dcnum-0.23.1.dist-info}/METADATA +4 -2
  48. dcnum-0.23.1.dist-info/RECORD +55 -0
  49. {dcnum-0.13.2.dist-info → dcnum-0.23.1.dist-info}/WHEEL +1 -1
  50. dcnum/feat/feat_moments/__init__.py +0 -3
  51. dcnum/segm/segmenter_gpu.py +0 -45
  52. dcnum-0.13.2.dist-info/RECORD +0 -40
  53. /dcnum/feat/{feat_moments/ct_opencv.py → feat_contour/contour.py} +0 -0
  54. {dcnum-0.13.2.dist-info → dcnum-0.23.1.dist-info}/LICENSE +0 -0
  55. {dcnum-0.13.2.dist-info → dcnum-0.23.1.dist-info}/top_level.txt +0 -0
dcnum/logic/ctrl.py ADDED
@@ -0,0 +1,794 @@
+ import collections
+ import datetime
+ import hashlib
+ import json
+ import logging
+ from logging.handlers import QueueListener
+ import multiprocessing as mp
+ import os
+ import pathlib
+ import platform
+ import socket
+ import threading
+ import time
+ import traceback
+ import uuid
+
+ import h5py
+ import numpy as np
+
+ from ..feat.feat_background.base import get_available_background_methods
+ from ..feat.queue_event_extractor import QueueEventExtractor
+ from ..feat import gate
+ from ..feat import EventExtractorManagerThread
+ from ..segm import SegmenterManagerThread, get_available_segmenters
+ from ..meta import ppid
+ from ..read import HDF5Data, get_mapping_indices
+ from .._version import version, version_tuple
+ from ..write import (
+     DequeWriterThread, HDF5Writer, QueueCollectorThread, copy_features,
+     copy_metadata, create_with_basins, set_default_filter_kwargs
+ )
+
+ from .job import DCNumPipelineJob
+ from .json_encoder import ExtendedJSONEncoder
+
+ # Force using "spawn" method for multiprocessing, because we are using
+ # queues and threads and would end up with race conditions otherwise.
+ mp_spawn = mp.get_context("spawn")
+
+ #: valid states for a job runner. The states must be in logical order,
+ #: not in alphabetical order.
+ valid_states = [
+     "created",
+     "init",
+     "setup",
+     "background",
+     "segmentation",
+     "plumbing",
+     "cleanup",
+     "done",
+     "error",
+ ]
+
+
+ class DCNumJobRunner(threading.Thread):
+     def __init__(self,
+                  job: DCNumPipelineJob,
+                  tmp_suffix: str = None,
+                  *args, **kwargs):
+         """Run a pipeline as defined by a :class:`DCNumPipelineJob` instance
+
+         Parameters
+         ----------
+         job: DCNumPipelineJob
+             pipeline job to run
+         tmp_suffix: str
+             optional unique string for creating temporary files
+             (defaults to hostname)
+         """
+         super(DCNumJobRunner, self).__init__(*args, **kwargs)
+         self.error_tb = None
+         self.job = job
+         if tmp_suffix is None:
+             tmp_suffix = f"{socket.gethostname()}_{str(uuid.uuid4())[:5]}"
+         self.tmp_suffix = tmp_suffix
+         self.ppid, self.pphash, self.ppdict = job.get_ppid(ret_hash=True,
+                                                            ret_dict=True)
+         self.event_count = 0
+
+         self._data_raw = None
+         self._data_temp_in = None
+         # current job state
+         self._state = "init"
+         # overall progress [0, 1]
+         self._progress_bg = None  # background
+         self._progress_ex = None  # segmentation
+         self._progress_bn = None  # creating basins
+         # segmentation frame rate
+         self._segm_rate = 0
+
+         # Set up logging
+         # General logger for this job
+         self.main_logger = logging.getLogger("dcnum")
+         self.main_logger.setLevel(job["log_level"])
+         # Log file output in target directory
+         self.path_log = job["path_out"].with_suffix(".log")
+         self.path_log.parent.mkdir(exist_ok=True, parents=True)
+         self.path_log.unlink(missing_ok=True)
+         self._log_file_handler = logging.FileHandler(
+             filename=self.path_log,
+             encoding="utf-8",
+             delay=True,
+             errors="ignore",
+         )
+         # Set the log file handler level to DEBUG, so it logs everything
+         # presented to it.
+         self._log_file_handler.setLevel(logging.DEBUG)
+         fmt = logging.Formatter(
+             fmt="%(asctime)s %(levelname)s %(name)s: %(message)s",
+             datefmt='%H:%M:%S'
+         )
+         self._log_file_handler.setFormatter(fmt)
+         self.main_logger.addHandler(self._log_file_handler)
+         handlers = list(self.main_logger.handlers)
+
+         # Queue for subprocesses to log to
+         self.log_queue = mp_spawn.Queue()
+         self._qlisten = QueueListener(self.log_queue, *handlers,
+                                       respect_handler_level=True)
+         self._qlisten.start()
+
+         if job["debug"]:
+             self.main_logger.info("Note that in debugging mode, duplicate "
+                                   "log entries may appear (logs that are "
+                                   "recorded via queues)")
+
+         self.logger = logging.getLogger(f"dcnum.Runner-{self.pphash[:2]}")
+
+         # Sanity checks
+         for os_env in ["MKL_NUM_THREADS", "NUMBA_NUM_THREADS",
+                        "NUMEXPR_NUM_THREADS", "NUMPY_NUM_THREADS",
+                        "OPENBLAS_NUM_THREADS", "OMP_NUM_THREADS",
+                        "VECLIB_MAXIMUM_THREADS"]:
+             # You should disable multithreading for all major tools that
+             # use dcnum.logic. We don't want multithreading, because dcnum
+             # uses linear code and relies on multiprocessing for
+             # parallelization. This has to be done before importing numpy
+             # or any other library affected. In your scripts, you can use:
+             #
+             # os.environ.setdefault("MKL_NUM_THREADS", "1")
+             # os.environ.setdefault("NUMBA_NUM_THREADS", "1")
+             # os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
+             # os.environ.setdefault("NUMPY_NUM_THREADS", "1")
+             # os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
+             # os.environ.setdefault("OMP_NUM_THREADS", "1")
+             # os.environ.setdefault("VECLIB_MAXIMUM_THREADS", "1")
+             #
+             val_act = os.environ.get(os_env)
+             if val_act != "1":
+                 self.logger.warning(
+                     f"Make sure to set the environment variable {os_env} to "
+                     f"'1' (disables multithreading)! Other values will reduce "
+                     f"performance and your system may become unresponsive. "
+                     f"The current value is '{val_act}'.")
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         # If an error occurred, don't delete the log and basin files.
+         delete_temporary_files = exc_type is None
+         self.close(delete_temporary_files=delete_temporary_files)
+
+     @property
+     def draw(self) -> HDF5Data:
+         """Raw input data"""
+         if self._data_raw is None:
+             # Initialize with the proper kwargs (pixel_size)
+             self._data_raw = HDF5Data(self.job["path_in"],
+                                       **self.job["data_kwargs"])
+         return self._data_raw
+
+     @property
+     def dtin(self) -> HDF5Data:
+         """Input data with (corrected) background image"""
+         if self._data_temp_in is None:
+             if not self.path_temp_in.exists():
+                 # create basin-based input file
+                 create_with_basins(path_out=self.path_temp_in,
+                                    basin_paths=[self.draw.path])
+             # Initialize with the proper kwargs (pixel_size)
+             self._data_temp_in = HDF5Data(self.path_temp_in,
+                                           **self.job["data_kwargs"])
+             assert len(self._data_temp_in) > 0
+             assert "image_bg" in self._data_temp_in
+         return self._data_temp_in
+
+     @property
+     def path_temp_in(self):
+         po = pathlib.Path(self.job["path_out"])
+         return po.with_name(po.stem + f"_input_bb_{self.tmp_suffix}.rtdc~")
+
+     @property
+     def path_temp_out(self):
+         po = pathlib.Path(self.job["path_out"])
+         return po.with_name(po.stem + f"_output_{self.tmp_suffix}.rtdc~")
+
+     @property
+     def state(self):
+         return self._state
+
+     @state.setter
+     def state(self, state):
+         if state not in valid_states:
+             raise ValueError(f"Invalid state '{state}' specified!")
+         self._state = state
+
+     def close(self, delete_temporary_files=True):
+         if self._data_raw is not None:
+             self._data_raw.close()
+             self._data_raw = None
+         if self._data_temp_in is not None:
+             self._data_temp_in.close()
+             self._data_temp_in = None
+         # clean up logging
+         if self._log_file_handler in self.main_logger.handlers:
+             self.main_logger.removeHandler(self._log_file_handler)
+         self._log_file_handler.flush()
+         self._log_file_handler.close()
+         if self._qlisten is not None:
+             self._qlisten.stop()
+             self._qlisten = None
+         self.log_queue.cancel_join_thread()
+         self.log_queue.close()
+         if delete_temporary_files:
+             # Delete log file on disk
+             self.path_log.unlink(missing_ok=True)
+             # Delete temporary input file
+             self.path_temp_in.unlink(missing_ok=True)
+             # We don't have to delete self.path_temp_out, since this one
+             # is `rename`d to `self.job["path_out"]`.
+
+     def join(self, delete_temporary_files=True, *args, **kwargs):
+         super(DCNumJobRunner, self).join(*args, **kwargs)
+         # Close only after join
+         self.close(delete_temporary_files=delete_temporary_files)
+
+     def get_status(self):
+         # Compute the total progress. The following weights indicate
+         # how much fractional time each processing step takes.
+         bgw = 4  # fraction of background
+         exw = 27  # fraction of segmentation and feature extraction
+         if self.job["basin_strategy"] == "drain":
+             drw = 15  # because data need to be copied
+         else:
+             drw = 1  # just creating the basins in output file
+         clw = 1  # fraction of cleanup operations
+         tot = bgw + exw + drw + clw
+         progress = 0
+         st = self.state
+
+         # background
+         if valid_states.index(st) > valid_states.index("background"):
+             # background already computed
+             progress += bgw / tot
+         elif self._progress_bg is not None:
+             # This is the image count of the input dataset.
+             progress += self._progress_bg.value * bgw / tot
+
+         # segmentation
+         if valid_states.index(st) > valid_states.index("segmentation"):
+             # segmentation already done
+             progress += exw / tot
+         elif self._progress_ex is not None:
+             progress += self._progress_ex * exw / tot
+
+         # draining basins
+         if valid_states.index(st) > valid_states.index("plumbing"):
+             # plumbing already done
+             progress += drw / tot
+         if self._progress_bn is not None:
+             progress += self._progress_bn * drw / tot
+
+         if self.state == "done":
+             progress = 1
+
+         return {
+             "progress": progress,
+             "segm rate": self._segm_rate,
+             "state": self._state,
+         }
+
+     def run(self):
+         try:
+             self.run_pipeline()
+         except BaseException:
+             self.state = "error"
+             self.error_tb = traceback.format_exc()
+             if not self.is_alive():
+                 # Thread has not been started. This means we are not running
+                 # in a thread but in the main process. Raise the exception.
+                 raise
+
+     def run_pipeline(self):
+         """Execute the pipeline job"""
+         time_start = time.monotonic()
+         time_string = time.strftime("%Y-%m-%d-%H.%M.%S", time.gmtime())
+         self.logger.info(f"Run start: {time_string}")
+         if self.job["path_out"].exists():
+             raise FileExistsError(
+                 f"Output file {self.job['path_out']} already exists!")
+         # Make sure the output directory exists.
+         self.job["path_out"].parent.mkdir(parents=True, exist_ok=True)
+         self.state = "setup"
+         # First get a list of all pipeline IDs. If the input file has
+         # already been processed by dcnum, then we do not have to redo
+         # everything.
+         # Crucial here is the fact that we also compare the
+         # "pipeline:dcnum hash" in case individual steps of the pipeline
+         # have been run by a rogue data analyst.
+         datdict = {
+             "gen_id": self.draw.h5.attrs.get("pipeline:dcnum generation", "0"),
+             "dat_id": self.draw.h5.attrs.get("pipeline:dcnum data", "0"),
+             "bg_id": self.draw.h5.attrs.get("pipeline:dcnum background", "0"),
+             "seg_id": self.draw.h5.attrs.get("pipeline:dcnum segmenter", "0"),
+             "feat_id": self.draw.h5.attrs.get("pipeline:dcnum feature", "0"),
+             "gate_id": self.draw.h5.attrs.get("pipeline:dcnum gate", "0"),
+         }
+         # The hash of a potential previous pipeline run.
+         dathash = self.draw.h5.attrs.get("pipeline:dcnum hash", "0")
+         # The number of events extracted in a potential previous pipeline run.
+         evyield = self.draw.h5.attrs.get("pipeline:dcnum yield", -1)
+         redo_sanity = (
+             # Whether pipeline hash is invalid.
+             ppid.compute_pipeline_hash(**datdict) != dathash
+             # Whether the input file is the original output of the pipeline.
+             or len(self.draw) != evyield
+             # If index mapping is defined, then we always redo the pipeline.
+             # If the pipeline hashes are identical and index mapping is not
+             # None, then both pipelines were done with index mapping.
+             # But applying the same pipeline with index mapping in series
+             # will lead to a different result in the second run (e.g. 1st
+             # pipeline run: take every 2nd event; 2nd pipeline run: take
+             # every second event -> results in every 4th event in output of
+             # second pipeline run).
+             or self.draw.index_mapping is not None
+         )
+         # Do we have to recompute the background data? In addition to the
+         # hash sanity check above, check the generation, input data,
+         # and background pipeline identifiers.
+         redo_bg = (
+             "image_bg" not in self.draw
+             or (datdict["gen_id"] != self.ppdict["gen_id"])
+             or (datdict["dat_id"] != self.ppdict["dat_id"])
+             or (datdict["bg_id"] != self.ppdict["bg_id"]))
+
+         # Do we have to rerun segmentation and feature extraction? Check
+         # the segmentation, feature extraction, and gating pipeline
+         # identifiers.
+         redo_seg = (
+             redo_sanity
+             or redo_bg
+             or (datdict["seg_id"] != self.ppdict["seg_id"])
+             or (datdict["feat_id"] != self.ppdict["feat_id"])
+             or (datdict["gate_id"] != self.ppdict["gate_id"]))
+
+         self.state = "background"
+
+         if redo_bg:
+             # The 'image_bg' feature is written to `self.path_temp_in`.
+             # If `job["path_in"]` already has the correct 'image_bg'
+             # feature, then we never reach this case here
+             # (note that `self.path_temp_in` is basin-based).
+             self.task_background()
+
+         self.state = "segmentation"
+
+         # We have the input data covered, and we have to run the
+         # long-lasting segmentation and feature extraction step.
+         # We are taking into account two scenarios:
+         # A) The segmentation step is exactly the one given in the input
+         #    file. Here it is sufficient to use a basin-based
+         #    output file `self.path_temp_out`.
+         # B) Everything else (including background pipeline mismatch or
+         #    different segmenters); Here, we simply populate `path_temp_out`
+         #    with the data from the segmenter.
+         if redo_seg:
+             # scenario B (Note this implies `redo_bg`)
+             self.task_segment_extract()
+         else:
+             # scenario A
+             # Access the temporary input HDF5Data so that the underlying
+             # basin file is created and close it immediately afterward.
+             self.dtin.close()
+             self._data_temp_in = None
+             # Note any new actions that work on `self.path_temp_in` are not
+             # reflected in `self.path_temp_out`.
+             self.path_temp_in.rename(self.path_temp_out)
+             # Since no segmentation was done, the output file now does not
+             # contain any events. This is not really what we wanted, but we
+             # can still store all features in the output file if required.
+             if self.job["basin_strategy"] == "drain":
+                 orig_feats = []
+                 for feat in self.draw.h5["events"].keys():
+                     if isinstance(self.draw.h5["events"][feat], h5py.Dataset):
+                         # copy_features does not support Groups
+                         orig_feats.append(feat)
+                 with h5py.File(self.path_temp_out, "a") as h5_dst:
+                     copy_features(h5_src=self.draw.h5,
+                                   h5_dst=h5_dst,
+                                   features=orig_feats,
+                                   mapping=None)
+
+         with HDF5Writer(self.path_temp_out) as hw:
+             # pipeline metadata
+             hw.h5.attrs["pipeline:dcnum generation"] = self.ppdict["gen_id"]
+             hw.h5.attrs["pipeline:dcnum data"] = self.ppdict["dat_id"]
+             hw.h5.attrs["pipeline:dcnum background"] = self.ppdict["bg_id"]
+             hw.h5.attrs["pipeline:dcnum segmenter"] = self.ppdict["seg_id"]
+             hw.h5.attrs["pipeline:dcnum feature"] = self.ppdict["feat_id"]
+             hw.h5.attrs["pipeline:dcnum gate"] = self.ppdict["gate_id"]
+             hw.h5.attrs["pipeline:dcnum hash"] = self.pphash
+             hw.h5.attrs["pipeline:dcnum yield"] = self.event_count
+             # index mapping information
+             im = self.job.kwargs["data_kwargs"].get("index_mapping", None)
+             dim = HDF5Data.get_ppid_index_mapping(im)
+             hw.h5.attrs["pipeline:dcnum mapping"] = dim
+             # regular metadata
+             hw.h5.attrs["experiment:event count"] = self.event_count
+             hw.h5.attrs["imaging:pixel size"] = self.draw.pixel_size
+             # Add job information to resulting .rtdc file
+             hw.store_log(f"dcnum-job-{time_string}",
+                          json.dumps({
+                              "dcnum version": version_tuple,
+                              "job": self.job.__getstate__(),
+                              "pipeline": {"identifiers": self.ppdict,
+                                           "hash": self.pphash,
+                                           },
+                              "python": {
+                                  "build": ", ".join(platform.python_build()),
+                                  "implementation":
+                                      platform.python_implementation(),
+                                  "version": platform.python_version(),
+                              },
+                              "system": {
+                                  "info": platform.platform(),
+                                  "machine": platform.machine(),
+                                  "name": platform.system(),
+                                  "release": platform.release(),
+                                  "version": platform.version(),
+                              },
+                              "tasks": {"background": redo_bg,
+                                        "segmentation": redo_seg
+                                        },
+                          },
+                              indent=2,
+                              sort_keys=True,
+                              cls=ExtendedJSONEncoder,
+                          ).split("\n"))
+
+             # copy metadata/logs/tables from original file
+             with h5py.File(self.job["path_in"]) as h5_src:
+                 copy_metadata(h5_src=h5_src,
+                               h5_dst=hw.h5,
+                               # Don't copy basins, we would have to index-map
+                               # them first.
+                               copy_basins=False)
+             if redo_seg:
+                 # Store the correct measurement identifier. This is used to
+                 # identify this file as a correct basin in subsequent pipeline
+                 # steps, and it also makes sure that the original file cannot
+                 # become a basin by accident (we have different indexing).
+                 # This is the identifier appendix that we use to identify this
+                 # dataset. Note that we only override the run identifier when
+                 # segmentation did actually take place.
+                 mid_ap = f"dcn-{self.pphash[:7]}"
+                 # This is the current measurement identifier
+                 mid_cur = hw.h5.attrs.get("experiment:run identifier")
+                 if not mid_cur:
+                     # Compute a measurement identifier from the metadata
+                     m_time = hw.h5.attrs.get("experiment:time", "none")
+                     m_date = hw.h5.attrs.get("experiment:date", "none")
+                     m_sid = hw.h5.attrs.get("setup:identifier", "none")
+                     hasher = hashlib.md5(
+                         f"{m_time}_{m_date}_{m_sid}".encode("utf-8"))
+                     mid_cur = str(uuid.UUID(hex=hasher.hexdigest()))
+                 # The new measurement identifier is a combination of both.
+                 mid_new = f"{mid_cur}_{mid_ap}" if mid_cur else mid_ap
+                 hw.h5.attrs["experiment:run identifier"] = mid_new
+
+         # Handle basin data according to the user's request
+         self.state = "plumbing"
+         self.task_enforce_basin_strategy()
+
+         self.state = "cleanup"
+
+         trun = datetime.timedelta(seconds=round(time.monotonic() - time_start))
+         self.logger.info(f"Run duration: {str(trun)}")
+         self.logger.info(time.strftime("Run stop: %Y-%m-%d-%H.%M.%S",
+                                        time.gmtime()))
+         # Add the log file to the resulting .rtdc file
+         if self.path_log.exists():
+             with HDF5Writer(self.path_temp_out) as hw:
+                 hw.store_log(
+                     f"dcnum-log-{time_string}",
+                     self.path_log.read_text().strip().split("\n"))
+
+         # Rename the output file
+         self.path_temp_out.rename(self.job["path_out"])
+         self.state = "done"
+
+     def task_background(self):
+         """Perform background computation task
+
+         This populates the file `self.path_temp_in` with the 'image_bg'
+         feature.
+         """
+         self.logger.info("Starting background computation")
+         if self._data_temp_in is not None:
+             # Close the temporary input data file, so we can write to it.
+             self._data_temp_in.close()
+             self._data_temp_in = None
+         # Start background computation
+         bg_code = self.job["background_code"]
+         bg_cls = get_available_background_methods()[bg_code]
+         with bg_cls(
+                 input_data=self.job["path_in"],
+                 output_path=self.path_temp_in,
+                 # always compress, the disk is usually the bottleneck
+                 compress=True,
+                 num_cpus=self.job["num_procs"],
+                 # custom kwargs
+                 **self.job["background_kwargs"]) as bic:
+             self._progress_bg = bic.image_proc
+             bic.process()
+         self.logger.info("Finished background computation")
+
+     def task_enforce_basin_strategy(self):
+         """Transfer basin data from input files to output if requested
+
+         The user specified the "basin_strategy" keyword argument in
+         `self.job`. If this is set to "drain", then copy all basin
+         information from the input file to the output file. If it
+         is set to "tap", then only create basins in the output file.
+         """
+         self._progress_bn = 0
+         t0 = time.perf_counter()
+         # We need to make sure that the features are correctly attributed
+         # from the input files. E.g. if the input file already has
+         # background images, but we recompute the background images, then
+         # we have to use the data from the recomputed background file.
+         # We achieve this by keeping a specific order and only copying those
+         # features that we don't already have in the output file.
+         feats_raw = [
+             # 1. background data from the temporary input image
+             #    (this must come before draw [sic!])
+             [self.dtin.h5, ["image_bg", "bg_off"], "critical"],
+             # 2. frame-based scalar features from the raw input file
+             #    (e.g. "temp" or "frame")
+             [self.draw.h5, self.draw.features_scalar_frame, "optional"],
+             # 3. image features from the input file
+             [self.draw.h5, ["image", "image_bg", "bg_off"], "optional"],
+         ]
+         with h5py.File(self.path_temp_out, "a") as hout:
+             hw = HDF5Writer(hout)
+             # First, we have to determine the basin mapping from input to
+             # output. This information is stored by the QueueCollectorThread
+             # in the "basinmap0" feature, ready to be used by us.
+             if "index_unmapped" in hout["events"]:
+                 # The unmapped indices enumerate the events in the output file
+                 # with indices from the mapped input file. E.g. if for the
+                 # first image in the input file, two events are found and for
+                 # the second image in the input file, three events are found,
+                 # then this would contain [0, 0, 1, 1, 1, ...]. If the index
+                 # mapping of the input file was set to slice(1, 100), then the
+                 # first image would not be there, and we would have
+                 # [1, 1, 1, ...].
+                 idx_um = hout["events/index_unmapped"]
+
+                 # If we want to convert this to an actual basinmap feature,
+                 # then we have to convert those indices to indices that map
+                 # to the original input HDF5 file.
+                 raw_im = self.draw.index_mapping
+                 if raw_im is None:
+                     self.logger.info("Input file mapped with basinmap0")
+                     # Create a hard link to save time and space
+                     hout["events/basinmap0"] = hout["events/index_unmapped"]
+                     basinmap = idx_um
+                 else:
+                     basinmap = get_mapping_indices(raw_im)[idx_um]
+                     # Store the mapped basin data in the output file.
+                     hw.store_feature_chunk("basinmap0", basinmap)
+                 # We don't need them anymore.
+                 del hout["events/index_unmapped"]
+
+                 # Note that `size_raw != (len(self.draw))` [sic!]. The former
+                 # is the size of the raw dataset and the latter is its mapped
+                 # size!
+                 size_raw = self.draw.h5.attrs["experiment:event count"]
+                 if (len(basinmap) == size_raw
+                         and np.all(basinmap == np.arange(size_raw))):
+                     # This means that the images in the input overlap perfectly
+                     # with the images in the output, i.e. a "copy" segmenter
+                     # was used or something is very reproducible.
+                     # We set basinmap to None to be more efficient.
+                     basinmap = None
+
+             else:
+                 # The input is identical to the output, because we are using
+                 # the same pipeline identifier.
+                 basinmap = None
+
+             for hin, feats, importance in feats_raw:
+                 # Only consider features that are available in the input
+                 # and that are not already in the output.
+                 feats = [f for f in feats
+                          if (f in hin["events"] and f not in hout["events"])]
+                 if not feats:
+                     continue
+                 elif (self.job["basin_strategy"] == "drain"
+                         or importance == "critical"):
+                     # DRAIN: Copy all features over to the output file.
+                     self.logger.debug(f"Transferring {feats} to output file")
+                     copy_features(h5_src=hin,
+                                   h5_dst=hout,
+                                   features=feats,
+                                   mapping=basinmap)
+                 else:
+                     # TAP: Create basins for the "optional" features in the
+                     # output file. Note that the "critical" features never
+                     # reach this case.
+                     self.logger.debug(f"Creating basin for {feats}")
+                     # Relative and absolute paths.
+                     pin = pathlib.Path(hin.filename).resolve()
+                     pout = pathlib.Path(hout.filename).resolve().parent
+                     paths = [pin, os.path.relpath(pin, pout)]
+                     hw.store_basin(name="dcnum basin",
+                                    features=feats,
+                                    mapping=basinmap,
+                                    paths=paths,
+                                    description=f"Created with dcnum {version}",
+                                    )
+                 self._progress_bn += 1 / len(feats_raw)
+         t_tot = time.perf_counter() - t0
+         self.logger.info(f"Enforcing basin strategy time: {t_tot:.1f}s")
+
+     def task_segment_extract(self):
+         self.logger.info("Starting segmentation and feature extraction")
+         # Start writer thread
+         writer_dq = collections.deque()
+         ds_kwds = set_default_filter_kwargs()
+         thr_write = DequeWriterThread(
+             path_out=self.path_temp_out,
+             dq=writer_dq,
+             mode="w",
+             ds_kwds=ds_kwds,
+         )
+         thr_write.start()
+
+         # Start segmentation thread
+         seg_cls = get_available_segmenters()[self.job["segmenter_code"]]
+         if seg_cls.requires_background_correction:
+             imdat = self.dtin.image_corr
+         else:
+             imdat = self.dtin.image
+
+         if self.job["debug"]:
+             num_slots = 1
+             num_extractors = 1
+             num_segmenters = 1
+         elif seg_cls.hardware_processor == "cpu":  # MPO segmenter
+             # We could in principle set the number of slots to one and
+             # have both number of extractors and number of segmenters set
+             # to the total number of CPUs. However, we would need more RAM
+             # (for caching the image data) and we also have more overhead.
+             # Having two slots shared between all workers is more efficient.
+             num_slots = 2
+             # Split segmentation and feature extraction workers evenly.
+             num_extractors = self.job["num_procs"] // 2
+             num_segmenters = self.job["num_procs"] - num_extractors
+             # leave one CPU for the writer and the remaining Threads
+             num_segmenters -= 1
+         else:  # GPU segmenter
+             num_slots = 3
+             num_extractors = self.job["num_procs"]
+             # leave one CPU for the writer and the remaining Threads
+             num_extractors -= 1
+             num_segmenters = 1
+         num_extractors = max(1, num_extractors)
+         num_segmenters = max(1, num_segmenters)
+         self.job.kwargs["segmenter_kwargs"]["num_workers"] = num_segmenters
+         self.job.kwargs["segmenter_kwargs"]["debug"] = self.job["debug"]
+         slot_chunks = mp_spawn.Array("i", num_slots, lock=False)
+         slot_states = mp_spawn.Array("u", num_slots, lock=False)
+
+         self.logger.debug(f"Number of slots: {num_slots}")
+         self.logger.debug(f"Number of segmenters: {num_segmenters}")
+         self.logger.debug(f"Number of extractors: {num_extractors}")
+
+         # Initialize segmenter manager thread
+         thr_segm = SegmenterManagerThread(
+             segmenter=seg_cls(**self.job["segmenter_kwargs"]),
+             image_data=imdat,
+             bg_off=self.dtin["bg_off"] if "bg_off" in self.dtin else None,
+             slot_states=slot_states,
+             slot_chunks=slot_chunks,
+         )
+         thr_segm.start()
+
+         # Start feature extractor thread
+         fe_kwargs = QueueEventExtractor.get_init_kwargs(
+             data=self.dtin,
+             gate=gate.Gate(self.dtin, **self.job["gate_kwargs"]),
+             num_extractors=num_extractors,
+             log_queue=self.log_queue,
+             log_level=self.logger.level,
+         )
+         fe_kwargs["extract_kwargs"] = self.job["feature_kwargs"]
+
+         thr_feat = EventExtractorManagerThread(
+             slot_chunks=slot_chunks,
+             slot_states=slot_states,
+             fe_kwargs=fe_kwargs,
+             num_workers=num_extractors,
+             labels_list=thr_segm.labels_list,
+             writer_dq=writer_dq,
+             debug=self.job["debug"])
+         thr_feat.start()
+
+         # Start the data collection thread
+         thr_coll = QueueCollectorThread(
+             data=self.dtin,
+             event_queue=fe_kwargs["event_queue"],
+             writer_dq=writer_dq,
+             feat_nevents=fe_kwargs["feat_nevents"],
+             write_threshold=500,
+         )
+         thr_coll.start()
+
+         data_size = len(self.dtin)
+         t0 = time.monotonic()
+
+         # So in principle we are done here. We do not have to do anything
+         # besides monitoring the progress.
+         while True:
+             counted_frames = thr_coll.written_frames
+             self.event_count = thr_coll.written_events
+             td = time.monotonic() - t0
+             # set the current status
+             self._progress_ex = counted_frames / data_size
+             self._segm_rate = counted_frames / (td or 0.03)
+             time.sleep(.5)
+             if counted_frames == data_size:
+                 break
+
+         self.logger.debug("Flushing data to disk")
+
+         # join threads
+         join_thread_helper(thr=thr_segm,
+                            timeout=30,
+                            retries=10,
+                            logger=self.logger,
+                            name="segmentation")
+         # Join the collector thread before the feature extractors. On
+         # compute clusters, we had problems with joining the feature
+         # extractors, maybe because the event_queue was not depleted.
+         join_thread_helper(thr=thr_coll,
+                            timeout=600,
+                            retries=10,
+                            logger=self.logger,
+                            name="collector for writer")
+         join_thread_helper(thr=thr_feat,
+                            timeout=30,
+                            retries=10,
+                            logger=self.logger,
+                            name="feature extraction")
+         thr_write.finished_when_queue_empty()
+         join_thread_helper(thr=thr_write,
+                            timeout=600,
+                            retries=10,
+                            logger=self.logger,
+                            name="writer")
+
+         self.event_count = thr_coll.written_events
+         if self.event_count == 0:
+             self.logger.error(
+                 f"No events found in {self.draw.path}! Please check the "
+                 f"input file or revise your pipeline")
+
+         self.logger.info("Finished segmentation and feature extraction")
+
+
+ def join_thread_helper(thr, timeout, retries, logger, name):
+     for _ in range(retries):
+         thr.join(timeout=timeout)
+         if thr.is_alive():
+             logger.info(f"Waiting for '{name}' ({thr})")
+         else:
+             logger.debug(f"Joined thread '{name}'")
+             break
+     else:
+         logger.error(f"Failed to join thread '{name}'")
+         raise ValueError(f"Thread '{name}' ({thr}) did not join "
+                          f"within {timeout * retries}s!")