dcnum 0.13.3-py3-none-any.whl → 0.15.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



dcnum/logic/ctrl.py ADDED
@@ -0,0 +1,501 @@
+ import collections
+ import logging
+ from logging.handlers import QueueListener
+ import multiprocessing as mp
+ import os
+ import pathlib
+ import socket
+ import threading
+ import time
+ import uuid
+
+ import hdf5plugin
+ import h5py
+
+ from ..feat.feat_background.base import get_available_background_methods
+ from ..feat.queue_event_extractor import QueueEventExtractor
+ from ..feat import gate
+ from ..feat import EventExtractorManagerThread
+ from ..segm import SegmenterManagerThread, get_available_segmenters
+ from ..meta import ppid
+ from ..read import HDF5Data
+ from ..write import (
+     DequeWriterThread, HDF5Writer, QueueCollectorThread,
+     copy_metadata, create_with_basins,
+ )
+
+ from .job import DCNumPipelineJob
+
+
+ # Force the "spawn" method for multiprocessing, because we are using
+ # queues and threads and would otherwise end up with race conditions.
+ mp_spawn = mp.get_context("spawn")
+
+
+ class DCNumJobRunner(threading.Thread):
+     def __init__(self,
+                  job: DCNumPipelineJob,
+                  tmp_suffix: str = None,
+                  *args, **kwargs):
+         """Run a pipeline as defined by a :class:`DCNumPipelineJob` instance
+
+         Parameters
+         ----------
+         job: DCNumPipelineJob
+             pipeline job to run
+         tmp_suffix: str
+             optional unique string for creating temporary files
+             (defaults to the hostname plus a random suffix)
+         """
+         super(DCNumJobRunner, self).__init__(*args, **kwargs)
+         self.job = job
+         if tmp_suffix is None:
+             tmp_suffix = f"{socket.gethostname()}_{str(uuid.uuid4())[:5]}"
+         self.tmp_suffix = tmp_suffix
+         self.ppid, self.pphash, self.ppdict = job.get_ppid(ret_hash=True,
+                                                            ret_dict=True)
+         self.event_count = 0
+
+         self._data_raw = None
+         self._data_temp_in = None
+         # current job state
+         self._state = "init"
+         # overall progress [0, 1]
+         self._progress = 0
+         # segmentation frame rate
+         self._segm_rate = 0
+
+         # Set up logging
+         # General logger for this job
+         self.logger = logging.getLogger(__name__).getChild(
+             f"Runner-{self.pphash[:5]}")
+         self.logger.setLevel(
+             logging.DEBUG if job["debug"] else logging.WARNING)
+         # Log file output in target directory
+         self.path_log = job["path_out"].with_suffix(".log")
+         self.path_log.parent.mkdir(exist_ok=True, parents=True)
+         self.path_log.unlink(missing_ok=True)
+         self._log_file_handler = logging.FileHandler(
+             filename=self.path_log,
+             encoding="utf-8",
+             delay=True,
+             errors="ignore",
+         )
+         fmt = logging.Formatter(
+             "%(asctime)s %(levelname)s %(processName)s/%(threadName)s "
+             + "in %(name)s: %(message)s")
+         self._log_file_handler.setFormatter(fmt)
+         self.logger.addHandler(self._log_file_handler)
+         handlers = list(self.logger.handlers)
+         # Queue for subprocesses to log to
+         self.log_queue = mp_spawn.Queue()
+         self._qlisten = QueueListener(self.log_queue, *handlers)
+         self._qlisten.start()
+
+         # Sanity checks
+         for os_env in [
+                 "OMP_NUM_THREADS",
+                 "MKL_NUM_THREADS",
+                 "NUMEXPR_NUM_THREADS",
+                 "NUMBA_NUM_THREADS"]:
+             # You should disable multithreading for all major tools that
+             # use dcnum.logic. We don't want multithreading, because dcnum
+             # uses linear code and relies on multiprocessing for
+             # parallelization. This has to be done before importing numpy
+             # or any other affected library. In your scripts, you can use:
+             #
+             # os.environ.setdefault("OMP_NUM_THREADS", "1")
+             # os.environ.setdefault("MKL_NUM_THREADS", "1")
+             # os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
+             # os.environ.setdefault("NUMBA_NUM_THREADS", "1")
+             #
+             val_act = os.environ.get(os_env)
+             if val_act != "1":
+                 self.logger.warning(
+                     f"Make sure to set the environment variable {os_env} to "
+                     f"'1' (disables multithreading)! Other values will reduce "
+                     f"performance and your system may become unresponsive. "
+                     f"The current value is '{val_act}'.")
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         # If an error occurred, don't delete the log and basin files.
+         delete_temporary_files = exc_type is None
+         self.close(delete_temporary_files=delete_temporary_files)
+
+     @property
+     def draw(self) -> HDF5Data:
+         """Raw input data"""
+         if self._data_raw is None:
+             # Initialize with the proper kwargs (pixel_size)
+             self._data_raw = HDF5Data(self.job["path_in"],
+                                       **self.job["data_kwargs"])
+         return self._data_raw
+
+     @property
+     def dtin(self) -> HDF5Data:
+         """Input data with (corrected) background image"""
+         if self._data_temp_in is None:
+             if not self.path_temp_in.exists():
+                 # create basin-based input file
+                 create_with_basins(path_out=self.path_temp_in,
+                                    basin_paths=[self.draw.path])
+             # Initialize with the proper kwargs (pixel_size)
+             self._data_temp_in = HDF5Data(self.path_temp_in,
+                                           **self.job["data_kwargs"])
+             assert len(self._data_temp_in) > 0
+             assert "image_bg" in self._data_temp_in
+         return self._data_temp_in
+
+     @property
+     def path_temp_in(self):
+         po = pathlib.Path(self.job["path_out"])
+         return po.with_name(po.stem + f"_input_bb_{self.tmp_suffix}.rtdc~")
+
+     @property
+     def path_temp_out(self):
+         po = pathlib.Path(self.job["path_out"])
+         return po.with_name(po.stem + f"_output_{self.tmp_suffix}.rtdc~")
+
+     def close(self, delete_temporary_files=True):
+         if self._data_raw is not None:
+             self._data_raw.close()
+             self._data_raw = None
+         if self._data_temp_in is not None:
+             self._data_temp_in.close()
+             self._data_temp_in = None
+         # clean up logging
+         if self._log_file_handler in self.logger.handlers:
+             self.logger.removeHandler(self._log_file_handler)
+         self._log_file_handler.flush()
+         self._log_file_handler.close()
+         if self._qlisten is not None:
+             self._qlisten.stop()
+             self._qlisten = None
+         self.log_queue.cancel_join_thread()
+         self.log_queue.close()
+         if delete_temporary_files:
+             # Delete log file on disk
+             self.path_log.unlink(missing_ok=True)
+             # Delete temporary input file
+             self.path_temp_in.unlink(missing_ok=True)
+             # We don't have to delete self.path_temp_out, since this one
+             # is `rename`d to `self.job["path_out"]`.
+
+     def join(self, *args, **kwargs):
+         super(DCNumJobRunner, self).join(*args, **kwargs)
+         # Close only after join
+         self.close()
+
+     def get_status(self):
+         return {
+             "progress": self._progress,
+             "segm rate": self._segm_rate,
+             "state": self._state,
+         }
+
+     def run(self):
+         """Execute the pipeline job"""
+         if self.job["path_out"].exists():
+             raise FileExistsError(
+                 f"Output file {self.job['path_out']} already exists!")
+         self._state = "setup"
+         # First get a list of all pipeline IDs. If the input file has
+         # already been processed by dcnum, then we do not have to redo
+         # everything.
+         # Crucial here is the fact that we also compare the
+         # "pipeline:dcnum hash" in case individual steps of the pipeline
+         # have been run by a rogue data analyst.
+         datdict = {
+             "gen_id": self.draw.h5.attrs.get("pipeline:dcnum generation", "0"),
+             "dat_id": self.draw.h5.attrs.get("pipeline:dcnum data", "0"),
+             "bg_id": self.draw.h5.attrs.get("pipeline:dcnum background", "0"),
+             "seg_id": self.draw.h5.attrs.get("pipeline:dcnum segmenter", "0"),
+             "feat_id": self.draw.h5.attrs.get("pipeline:dcnum feature", "0"),
+             "gate_id": self.draw.h5.attrs.get("pipeline:dcnum gate", "0"),
+         }
+         # The hash of a potential previous pipeline run.
+         dathash = self.draw.h5.attrs.get("pipeline:dcnum hash", "0")
+         # The number of events extracted in a potential previous pipeline run.
+         evyield = self.draw.h5.attrs.get("pipeline:dcnum yield", -1)
+         redo_sanity = (
+             # Whether the pipeline hash is invalid.
+             ppid.compute_pipeline_hash(**datdict) != dathash
+             # Whether the input file is the original output of the pipeline.
+             or len(self.draw) != evyield
+         )
+         # Do we have to recompute the background data? In addition to the
+         # hash sanity check above, check the generation, input data,
+         # and background pipeline identifiers.
+         redo_bg = (
+             (datdict["gen_id"] != self.ppdict["gen_id"])
+             or (datdict["dat_id"] != self.ppdict["dat_id"])
+             or (datdict["bg_id"] != self.ppdict["bg_id"]))
+
+         # Do we have to rerun segmentation and feature extraction? Check
+         # the segmentation, feature extraction, and gating pipeline
+         # identifiers.
+         redo_seg = (
+             redo_sanity
+             or redo_bg
+             or (datdict["seg_id"] != self.ppdict["seg_id"])
+             or (datdict["feat_id"] != self.ppdict["feat_id"])
+             or (datdict["gate_id"] != self.ppdict["gate_id"]))
+
+         self._state = "background"
+
+         if redo_bg:
+             # The 'image_bg' feature is written to `self.path_temp_in`.
+             # If `job["path_in"]` already has the correct 'image_bg'
+             # feature, then we never reach this case here
+             # (note that `self.path_temp_in` is basin-based).
+             self.task_background()
+
+         self._progress = 0.1
+         self._state = "segmentation"
+
+         # We have the input data covered, and now we have to run the
+         # long-lasting segmentation and feature extraction step.
+         # We are taking into account two scenarios:
+         # A) The segmentation step is exactly the one given in the input
+         #    file. Here it is sufficient to use a basin-based
+         #    output file `self.path_temp_out`.
+         # B) Everything else (including background pipeline mismatch or
+         #    different segmenters); here, we simply populate `path_temp_out`
+         #    with the data from the segmenter.
+         if redo_seg:
+             # scenario B (note that `redo_bg` implies this branch)
+             self.task_segment_extract()
+         else:
+             # scenario A
+             # Access the temporary input HDF5Data so that the underlying
+             # basin file is created and close it immediately afterward.
+             self.dtin.close()
+             self._data_temp_in = None
+             # Note that any new actions that work on `self.path_temp_in`
+             # are not reflected in `self.path_temp_out`.
+             self.path_temp_in.rename(self.path_temp_out)
+
+         self._progress = 0.95
+         self._state = "cleanup"
+
+         # The user would normally expect the output file to be something
+         # that is self-contained (copying the file wildly across file
+         # systems and network shares should not impair feature availability).
+         # Therefore, we copy any remaining basin-based features to the
+         # temporary output file.
+         if self.job["no_basins_in_output"]:
+             self.task_transfer_basin_data()
+
+         with HDF5Writer(self.path_temp_out) as hw:
+             # pipeline metadata
+             hw.h5.attrs["pipeline:dcnum generation"] = self.ppdict["gen_id"]
+             hw.h5.attrs["pipeline:dcnum data"] = self.ppdict["dat_id"]
+             hw.h5.attrs["pipeline:dcnum background"] = self.ppdict["bg_id"]
+             hw.h5.attrs["pipeline:dcnum segmenter"] = self.ppdict["seg_id"]
+             hw.h5.attrs["pipeline:dcnum feature"] = self.ppdict["feat_id"]
+             hw.h5.attrs["pipeline:dcnum gate"] = self.ppdict["gate_id"]
+             hw.h5.attrs["pipeline:dcnum hash"] = self.pphash
+             hw.h5.attrs["pipeline:dcnum yield"] = self.event_count
+             # regular metadata
+             hw.h5.attrs["experiment:event count"] = self.event_count
+             hw.h5.attrs["imaging:pixel size"] = self.draw.pixel_size
+             if self.path_log.exists():
+                 # Add the log file to the resulting .rtdc file
+                 hw.store_log(
+                     time.strftime("dcnum-process-%Y-%m-%d-%H.%M.%S"),
+                     self.path_log.read_text().split("\n"))
+             # copy metadata/logs/tables from original file
+             with h5py.File(self.job["path_in"]) as h5_src:
+                 copy_metadata(h5_src=h5_src,
+                               h5_dst=hw.h5,
+                               # don't copy basins
+                               copy_basins=False)
+
+         # Rename the output file
+         self.path_temp_out.rename(self.job["path_out"])
+         self._progress = 1.0
+         self._state = "done"
+
+     def task_background(self):
+         """Perform the background computation task
+
+         This populates the file `self.path_temp_in` with the 'image_bg'
+         feature.
+         """
+         self.logger.info("Starting background computation")
+         if self._data_temp_in is not None:
+             # Close the temporary input data file, so we can write to it.
+             self._data_temp_in.close()
+             self._data_temp_in = None
+         # Start background computation
+         bg_code = self.job["background_code"]
+         bg_cls = get_available_background_methods()[bg_code]
+         with bg_cls(
+                 input_data=self.job["path_in"],
+                 output_path=self.path_temp_in,
+                 # always compress; the disk is usually the bottleneck
+                 compress=True,
+                 num_cpus=self.job["num_procs"],
+                 # custom kwargs
+                 **self.job["background_kwargs"]) as bic:
+
+             bic.process()
+         self.logger.info("Finished background computation")
+
+     def task_segment_extract(self):
+         self.logger.info("Starting segmentation and feature extraction")
+         # Start writer thread
+         writer_dq = collections.deque()
+         ds_kwds = dict(hdf5plugin.Zstd(clevel=5))
+         ds_kwds["fletcher32"] = True
+         thr_write = DequeWriterThread(
+             path_out=self.path_temp_out,
+             dq=writer_dq,
+             mode="w",
+             ds_kwds=ds_kwds,
+         )
+         thr_write.start()
+
+         # Start segmentation thread
+         seg_cls = get_available_segmenters()[self.job["segmenter_code"]]
+         if seg_cls.requires_background_correction:
+             imdat = self.dtin.image_corr
+         else:
+             imdat = self.dtin.image
+
+         if self.job["debug"]:
+             num_slots = 1
+             num_extractors = 1
+         elif seg_cls.hardware_processor == "cpu":  # CPU segmenter
+             num_slots = 2
+             num_extractors = self.job["num_procs"] // 2
+         else:  # GPU segmenter
+             num_slots = 3
+             num_extractors = self.job["num_procs"]
+         num_extractors = max(1, num_extractors)
+
+         slot_chunks = mp_spawn.Array("i", num_slots)
+         slot_states = mp_spawn.Array("u", num_slots)
+
+         # Initialize thread
+         thr_segm = SegmenterManagerThread(
+             segmenter=seg_cls(**self.job["segmenter_kwargs"]),
+             image_data=imdat,
+             slot_states=slot_states,
+             slot_chunks=slot_chunks,
+         )
+         thr_segm.start()
+
+         # Start feature extractor thread
+         fe_kwargs = QueueEventExtractor.get_init_kwargs(
+             data=self.dtin,
+             gate=gate.Gate(self.dtin, **self.job["gate_kwargs"]),
+             log_queue=self.log_queue)
+         fe_kwargs["extract_kwargs"] = self.job["feature_kwargs"]
+
+         thr_feat = EventExtractorManagerThread(
+             slot_chunks=slot_chunks,
+             slot_states=slot_states,
+             fe_kwargs=fe_kwargs,
+             num_workers=num_extractors,
+             labels_list=thr_segm.labels_list,
+             debug=self.job["debug"])
+         thr_feat.start()
+
+         # Start the data collection thread
+         thr_coll = QueueCollectorThread(
+             data=self.dtin,
+             event_queue=fe_kwargs["event_queue"],
+             writer_dq=writer_dq,
+             feat_nevents=fe_kwargs["feat_nevents"],
+             write_threshold=500,
+         )
+         thr_coll.start()
+
+         data_size = len(self.dtin)
+         t0 = time.monotonic()
+
+         # So in principle we are done here. We do not have to do anything
+         # besides monitoring the progress.
+         pmin = 0.1  # from background computation
+         pmax = 0.95  # 5% reserved for cleanup
+         while True:
+             counted_frames = thr_coll.written_frames
+             self.event_count = thr_coll.written_events
+             td = time.monotonic() - t0
+             # set the current status
+             self._progress = round(
+                 pmin + counted_frames / data_size * (pmax - pmin),
+                 3)
+             self._segm_rate = counted_frames / (td or 0.03)
+             time.sleep(.5)
+             if counted_frames == data_size:
+                 break
+
+         self.logger.debug("Flushing data to disk...")
+
+         # join threads
+         join_thread_helper(thr=thr_segm,
+                            timeout=30,
+                            retries=10,
+                            logger=self.logger,
+                            name="segmentation")
+         # Join the collector thread before the feature extractors. On
+         # compute clusters, we had problems with joining the feature
+         # extractors, maybe because the event_queue was not depleted.
+         join_thread_helper(thr=thr_coll,
+                            timeout=600,
+                            retries=10,
+                            logger=self.logger,
+                            name="collector for writer")
+         join_thread_helper(thr=thr_feat,
+                            timeout=30,
+                            retries=10,
+                            logger=self.logger,
+                            name="feature extraction")
+         thr_write.finished_when_queue_empty()
+         join_thread_helper(thr=thr_write,
+                            timeout=600,
+                            retries=10,
+                            logger=self.logger,
+                            name="writer")
+
+         self.event_count = thr_coll.written_events
+         if self.event_count == 0:
+             self.logger.error(
+                 f"No events found in {self.draw.path}! Please check the "
+                 f"input file or revise your pipeline.")
+
+         self.logger.info("Finished segmentation and feature extraction")
+
+     def task_transfer_basin_data(self):
+         with h5py.File(self.path_temp_out, "a") as hout:
+             hd = HDF5Data(hout)
+             for ii, _ in enumerate(hd.basins):
+                 hindat, features = hd.get_basin_data(ii)
+                 for feat in features:
+                     if feat not in hout["events"]:
+                         self.logger.debug(
+                             f"Transferring {feat} to output file.")
+                         h5py.h5o.copy(src_loc=hindat.h5["events"].id,
+                                       src_name=feat.encode(),
+                                       dst_loc=hout["events"].id,
+                                       dst_name=feat.encode(),
+                                       )
+
+
+ def join_thread_helper(thr, timeout, retries, logger, name):
+     for _ in range(retries):
+         thr.join(timeout=timeout)
+         if thr.is_alive():
+             logger.info(f"Waiting for '{name}' ({thr})")
+         else:
+             logger.info(f"Joined thread '{name}'")
+             break
+     else:
+         logger.error(f"Failed to join thread '{name}'")
+         raise ValueError(
+             f"Thread '{name}' ({thr}) did not join within {timeout*retries}s!")
dcnum/logic/job.py ADDED
@@ -0,0 +1,123 @@
+ import collections
+ import copy
+ import inspect
+ import multiprocessing as mp
+ import pathlib
+ from typing import Dict
+
+ from ..feat import QueueEventExtractor
+ from ..feat.feat_background.base import get_available_background_methods
+ from ..feat.gate import Gate
+ from ..meta.ppid import compute_pipeline_hash, DCNUM_PPID_GENERATION
+ from ..read import HDF5Data
+ from ..segm import get_available_segmenters
+
+
+ class DCNumPipelineJob:
+     def __init__(self,
+                  path_in: pathlib.Path | str,
+                  path_out: pathlib.Path | str = None,
+                  data_code: str = "hdf",
+                  data_kwargs: Dict = None,
+                  background_code: str = "sparsemed",
+                  background_kwargs: Dict = None,
+                  segmenter_code: str = "thresh",
+                  segmenter_kwargs: Dict = None,
+                  feature_code: str = "legacy",
+                  feature_kwargs: Dict = None,
+                  gate_code: str = "norm",
+                  gate_kwargs: Dict = None,
+                  no_basins_in_output: bool = True,
+                  num_procs: int = None,
+                  debug: bool = False,
+                  ):
+         #: initialize keyword arguments for this job
+         self.kwargs = {}
+         spec = inspect.getfullargspec(DCNumPipelineJob.__init__)
+         locs = locals()
+         for arg in spec.args:
+             if arg == "self":
+                 continue
+             value = locs[arg]
+             if value is None and spec.annotations[arg] is Dict:
+                 value = {}
+             self.kwargs[arg] = value
+         # Set default pixel size for this job
+         if "pixel_size" not in self.kwargs["data_kwargs"]:
+             # Extract from input file
+             with HDF5Data(path_in) as hd:
+                 self.kwargs["data_kwargs"]["pixel_size"] = hd.pixel_size
+         # Set default output path
+         if path_out is None:
+             pin = pathlib.Path(path_in)
+             path_out = pin.with_name(pin.stem + "_dcn.rtdc")
+         self.kwargs["path_out"] = pathlib.Path(path_out)
+         # Set default mask kwargs for segmenter
+         self.kwargs["segmenter_kwargs"].setdefault("kwargs_mask", {})
+         # Set default number of processes
+         if num_procs is None:
+             self.kwargs["num_procs"] = mp.cpu_count()
+
+     def __getitem__(self, item):
+         return copy.deepcopy(self.kwargs[item])
+
+     def __getstate__(self):
+         state = copy.deepcopy(self.kwargs)
+         return state
+
+     def __setstate__(self, state):
+         self.kwargs.clear()
+         self.kwargs.update(copy.deepcopy(state))
+
+     def assert_pp_codes(self):
+         """Sanity check of `self.kwargs`"""
+         # PPID classes with only one option
+         for cls, key in [
+                 (HDF5Data, "data_code"),
+                 (Gate, "gate_code"),
+                 (QueueEventExtractor, "feature_code"),
+         ]:
+             code_act = self.kwargs[key]
+             code_exp = cls.get_ppid_code()
+             if code_act != code_exp:
+                 raise ValueError(f"Invalid code '{code_act}' for '{key}', "
+                                  f"expected '{code_exp}'!")
+         # PPID classes with multiple options
+         for options, key in [
+                 (get_available_background_methods(), "background_code"),
+                 (get_available_segmenters(), "segmenter_code"),
+         ]:
+             code_act = self.kwargs[key]
+             if code_act not in options:
+                 raise ValueError(f"Invalid code '{code_act}' for '{key}', "
+                                  f"expected one of '{options}'!")
+
+     def get_ppid(self, ret_hash=False, ret_dict=False):
+         self.assert_pp_codes()
+         pp_hash_kw = collections.OrderedDict()
+         pp_hash_kw["gen_id"] = DCNUM_PPID_GENERATION
+         for pp_kw, cls, cls_kw in [
+                 ("dat_id", HDF5Data, "data_kwargs"),
+                 ("bg_id",
+                  get_available_background_methods()[
+                      self.kwargs["background_code"]],
+                  "background_kwargs"),
+                 ("seg_id",
+                  get_available_segmenters()[self.kwargs["segmenter_code"]],
+                  "segmenter_kwargs"),
+                 ("feat_id", QueueEventExtractor, "feature_kwargs"),
+                 ("gate_id", Gate, "gate_kwargs"),
+         ]:
+             pp_hash_kw[pp_kw] = cls.get_ppid_from_ppkw(self.kwargs[cls_kw])
+
+         ppid = "|".join(pp_hash_kw.values())
+
+         ret = [ppid]
+         if ret_hash:
+             pp_hash = compute_pipeline_hash(**pp_hash_kw)
+             ret.append(pp_hash)
+         if ret_dict:
+             ret.append(pp_hash_kw)
+         if len(ret) == 1:
+             ret = ret[0]
+         return ret
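
For clarity, the pipeline identifier built in get_ppid above is the generation ID plus the five stage IDs (data, background, segmenter, feature, gate) joined with "|", and the return type depends on the flags. A brief sketch inferred from the code, again with a placeholder input path:

    from dcnum.logic.job import DCNumPipelineJob

    job = DCNumPipelineJob(path_in="input.rtdc")  # placeholder

    # Default: just the "|"-joined identifier string.
    ppid = job.get_ppid()

    # With both flags set, a three-element list is returned; this is the
    # form unpacked in DCNumJobRunner.__init__:
    ppid, pphash, ppdict = job.get_ppid(ret_hash=True, ret_dict=True)
    assert list(ppdict) == ["gen_id", "dat_id", "bg_id",
                            "seg_id", "feat_id", "gate_id"]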