dcnum 0.14.0-py3-none-any.whl → 0.15.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- dcnum/_version.py +2 -2
- dcnum/feat/__init__.py +1 -0
- dcnum/feat/event_extractor_manager_thread.py +3 -0
- dcnum/feat/feat_background/__init__.py +2 -12
- dcnum/feat/feat_background/base.py +51 -33
- dcnum/feat/feat_brightness/__init__.py +1 -0
- dcnum/feat/feat_moments/__init__.py +1 -0
- dcnum/feat/feat_texture/__init__.py +1 -0
- dcnum/feat/gate.py +62 -41
- dcnum/feat/queue_event_extractor.py +80 -40
- dcnum/logic/__init__.py +4 -0
- dcnum/logic/ctrl.py +501 -0
- dcnum/logic/job.py +123 -0
- dcnum/meta/ppid.py +48 -7
- dcnum/read/hdf5_data.py +36 -1
- dcnum/segm/__init__.py +1 -13
- dcnum/segm/segm_thresh.py +1 -0
- dcnum/segm/segmenter.py +58 -17
- dcnum/segm/segmenter_cpu.py +2 -0
- dcnum/segm/segmenter_gpu.py +1 -0
- dcnum/write/deque_writer_thread.py +1 -1
- dcnum/write/writer.py +45 -4
- {dcnum-0.14.0.dist-info → dcnum-0.15.0.dist-info}/METADATA +1 -1
- dcnum-0.15.0.dist-info/RECORD +43 -0
- {dcnum-0.14.0.dist-info → dcnum-0.15.0.dist-info}/WHEEL +1 -1
- dcnum-0.14.0.dist-info/RECORD +0 -40
- {dcnum-0.14.0.dist-info → dcnum-0.15.0.dist-info}/LICENSE +0 -0
- {dcnum-0.14.0.dist-info → dcnum-0.15.0.dist-info}/top_level.txt +0 -0
dcnum/logic/ctrl.py
ADDED
@@ -0,0 +1,501 @@
+import collections
+import logging
+from logging.handlers import QueueListener
+import multiprocessing as mp
+import os
+import pathlib
+import socket
+import threading
+import time
+import uuid
+
+import hdf5plugin
+import h5py
+
+from ..feat.feat_background.base import get_available_background_methods
+from ..feat.queue_event_extractor import QueueEventExtractor
+from ..feat import gate
+from ..feat import EventExtractorManagerThread
+from ..segm import SegmenterManagerThread, get_available_segmenters
+from ..meta import ppid
+from ..read import HDF5Data
+from ..write import (
+    DequeWriterThread, HDF5Writer, QueueCollectorThread,
+    copy_metadata, create_with_basins,
+)
+
+from .job import DCNumPipelineJob
+
+
+# Force using "spawn" method for multiprocessing, because we are using
+# queues and threads and would end up with race conditions otherwise.
+mp_spawn = mp.get_context("spawn")
+
+
+class DCNumJobRunner(threading.Thread):
+    def __init__(self,
+                 job: DCNumPipelineJob,
+                 tmp_suffix: str = None,
+                 *args, **kwargs):
+        """Run a pipeline as defined by a :class:`DCNumPipelineJob` instance
+
+        Parameters
+        ----------
+        job: DCNumPipelineJob
+            pipeline job to run
+        tmp_suffix: str
+            optional unique string for creating temporary files
+            (defaults to hostname)
+        """
+        super(DCNumJobRunner, self).__init__(*args, **kwargs)
+        self.job = job
+        if tmp_suffix is None:
+            tmp_suffix = f"{socket.gethostname()}_{str(uuid.uuid4())[:5]}"
+        self.tmp_suffix = tmp_suffix
+        self.ppid, self.pphash, self.ppdict = job.get_ppid(ret_hash=True,
+                                                           ret_dict=True)
+        self.event_count = 0
+
+        self._data_raw = None
+        self._data_temp_in = None
+        # current job state
+        self._state = "init"
+        # overall progress [0, 1]
+        self._progress = 0
+        # segmentation frame rate
+        self._segm_rate = 0
+
+        # Set up logging
+        # General logger for this job
+        self.logger = logging.getLogger(__name__).getChild(
+            f"Runner-{self.pphash[:5]}")
+        self.logger.setLevel(
+            logging.DEBUG if job["debug"] else logging.WARNING)
+        # Log file output in target directory
+        self.path_log = job["path_out"].with_suffix(".log")
+        self.path_log.parent.mkdir(exist_ok=True, parents=True)
+        self.path_log.unlink(missing_ok=True)
+        self._log_file_handler = logging.FileHandler(
+            filename=self.path_log,
+            encoding="utf-8",
+            delay=True,
+            errors="ignore",
+        )
+        fmt = logging.Formatter(
+            "%(asctime)s %(levelname)s %(processName)s/%(threadName)s "
+            + "in %(name)s: %(message)s")
+        self._log_file_handler.setFormatter(fmt)
+        self.logger.addHandler(self._log_file_handler)
+        handlers = list(self.logger.handlers)
+        # Queue for subprocesses to log to
+        self.log_queue = mp_spawn.Queue()
+        self._qlisten = QueueListener(self.log_queue, *handlers)
+        self._qlisten.start()
+
+        # Sanity checks
+        for os_env in [
+                "OMP_NUM_THREADS",
+                "MKL_NUM_THREADS",
+                "NUMEXPR_NUM_THREADS",
+                "NUMBA_NUM_THREADS"]:
+            # You should disable multithreading for all major tools that
+            # use dcnum.logic. We don't want multithreading, because dcnum
+            # uses linear code and relies on multiprocessing for
+            # parallelization. This has to be done before importing numpy
+            # or any other library affected. In your scripts, you can use:
+            #
+            # os.environ.setdefault("OMP_NUM_THREADS", "1")
+            # os.environ.setdefault("MKL_NUM_THREADS", "1")
+            # os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
+            # os.environ.setdefault("NUMBA_NUM_THREADS", "1")
+            #
+            val_act = os.environ.get(os_env)
+            if val_act != "1":
+                self.logger.warning(
+                    f"Make sure to set the environment variable {os_env} to "
+                    f"'1' (disables multithreading)! Other values will reduce "
+                    f"performance and your system may become inresponsive. "
+                    f"The current value is '{val_act}'.")
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # If an error occurred, don't delete the log and basin files.
+        delete_temporary_files = exc_type is None
+        self.close(delete_temporary_files=delete_temporary_files)
+
+    @property
+    def draw(self) -> HDF5Data:
+        """Raw input data"""
+        if self._data_raw is None:
+            # Initialize with the proper kwargs (pixel_size)
+            self._data_raw = HDF5Data(self.job["path_in"],
+                                      **self.job["data_kwargs"])
+        return self._data_raw
+
+    @property
+    def dtin(self) -> HDF5Data:
+        """Input data with (corrected) background image"""
+        if self._data_temp_in is None:
+            if not self.path_temp_in.exists():
+                # create basin-based input file
+                create_with_basins(path_out=self.path_temp_in,
+                                   basin_paths=[self.draw.path])
+            # Initialize with the proper kwargs (pixel_size)
+            self._data_temp_in = HDF5Data(self.path_temp_in,
+                                          **self.job["data_kwargs"])
+            assert len(self._data_temp_in) > 0
+            assert "image_bg" in self._data_temp_in
+        return self._data_temp_in
+
+    @property
+    def path_temp_in(self):
+        po = pathlib.Path(self.job["path_out"])
+        return po.with_name(po.stem + f"_input_bb_{self.tmp_suffix}.rtdc~")
+
+    @property
+    def path_temp_out(self):
+        po = pathlib.Path(self.job["path_out"])
+        return po.with_name(po.stem + f"_output_{self.tmp_suffix}.rtdc~")
+
+    def close(self, delete_temporary_files=True):
+        if self._data_raw is not None:
+            self._data_raw.close()
+            self._data_raw = None
+        if self._data_temp_in is not None:
+            self._data_temp_in.close()
+            self._data_temp_in = None
+        # clean up logging
+        if self._log_file_handler in self.logger.handlers:
+            self.logger.removeHandler(self._log_file_handler)
+        self._log_file_handler.flush()
+        self._log_file_handler.close()
+        if self._qlisten is not None:
+            self._qlisten.stop()
+            self._qlisten = None
+        self.log_queue.cancel_join_thread()
+        self.log_queue.close()
+        if delete_temporary_files:
+            # Delete log file on disk
+            self.path_log.unlink(missing_ok=True)
+            # Delete temporary input file
+            self.path_temp_in.unlink(missing_ok=True)
+            # We don't have to delete self.path_temp_out, since this one
+            # is `rename`d to `self.jon["path_out"]`.
+
+    def join(self, *args, **kwargs):
+        super(DCNumJobRunner, self).join(*args, **kwargs)
+        # Close only after join
+        self.close()
+
+    def get_status(self):
+        return {
+            "progress": self._progress,
+            "segm rate": self._segm_rate,
+            "state": self._state,
+        }
+
+    def run(self):
+        """Execute the pipeline job"""
+        if self.job["path_out"].exists():
+            raise FileExistsError(
+                f"Output file {self.job['path_out']} already exists!")
+        self._state = "setup"
+        # First get a list of all pipeline IDs. If the input file has
+        # already been processed by dcnum, then we do not have to redo
+        # everything.
+        # Crucial here is the fact that we also compare the
+        # "pipeline:dcnum hash" in case individual steps of the pipeline
+        # have been run by a rogue data analyst.
+        datdict = {
+            "gen_id": self.draw.h5.attrs.get("pipeline:dcnum generation", "0"),
+            "dat_id": self.draw.h5.attrs.get("pipeline:dcnum data", "0"),
+            "bg_id": self.draw.h5.attrs.get("pipeline:dcnum background", "0"),
+            "seg_id": self.draw.h5.attrs.get("pipeline:dcnum segmenter", "0"),
+            "feat_id": self.draw.h5.attrs.get("pipeline:dcnum feature", "0"),
+            "gate_id": self.draw.h5.attrs.get("pipeline:dcnum gate", "0"),
+        }
+        # The hash of a potential previous pipeline run.
+        dathash = self.draw.h5.attrs.get("pipeline:dcnum hash", "0")
+        # The number of events extracted in a potential previous pipeline run.
+        evyield = self.draw.h5.attrs.get("pipeline:dcnum yield", -1)
+        redo_sanity = (
+            # Whether pipeline hash is invalid.
+            ppid.compute_pipeline_hash(**datdict) != dathash
+            # Whether the input file is the original output of the pipeline.
+            or len(self.draw) != evyield
+        )
+        # Do we have to recompute the background data? In addition to the
+        # hash sanity check above, check the generation, input data,
+        # and background pipeline identifiers.
+        redo_bg = (
+            (datdict["gen_id"] != self.ppdict["gen_id"])
+            or (datdict["dat_id"] != self.ppdict["dat_id"])
+            or (datdict["bg_id"] != self.ppdict["bg_id"]))
+
+        # Do we have to rerun segmentation and feature extraction? Check
+        # the segmentation, feature extraction, and gating pipeline
+        # identifiers.
+        redo_seg = (
+            redo_sanity
+            or redo_bg
+            or (datdict["seg_id"] != self.ppdict["seg_id"])
+            or (datdict["feat_id"] != self.ppdict["feat_id"])
+            or (datdict["gate_id"] != self.ppdict["gate_id"]))
+
+        self._state = "background"
+
+        if redo_bg:
+            # The 'image_bg' feature is written to `self.path_temp_in`.
+            # If `job["path_in"]` already has the correct 'image_bg'
+            # feature, then we never reach this case here
+            # (note that `self.path_temp_in` is basin-based).
+            self.task_background()
+
+        self._progress = 0.1
+        self._state = "segmentation"
+
+        # We have the input data covered, and we have to run the
+        # long-lasting segmentation and feature extraction step.
+        # We are taking into account two scenarios:
+        # A) The segmentation step is exactly the one given in the input
+        #    file. Here it is sufficient to use a basin-based
+        #    output file `self.path_temp_out`.
+        # B) Everything else (including background pipeline mismatch or
+        #    different segmenters); Here, we simply populate `path_temp_out`
+        #    with the data from the segmenter.
+        if redo_seg:
+            # scenario B (Note this implies `redo_bg`)
+            self.task_segment_extract()
+        else:
+            # scenario A
+            # Access the temporary input HDF5Data so that the underlying
+            # basin file is created and close it immediately afterward.
+            self.dtin.close()
+            self._data_temp_in = None
+            # Note any new actions that work on `self.path_temp_in` are not
+            # reflected in `self.path_temp_out`.
+            self.path_temp_in.rename(self.path_temp_out)
+
+        self._progress = 0.95
+        self._state = "cleanup"
+
+        # The user would normally expect the output file to be something
+        # that is self-contained (copying the file wildly across file
+        # systems and network shares should not impair feature availability).
+        # Therefore, we copy any remaining basin-based features to the
+        # temporary output file.
+        if self.job["no_basins_in_output"]:
+            self.task_transfer_basin_data()
+
+        with HDF5Writer(self.path_temp_out) as hw:
+            # pipeline metadata
+            hw.h5.attrs["pipeline:dcnum generation"] = self.ppdict["gen_id"]
+            hw.h5.attrs["pipeline:dcnum data"] = self.ppdict["dat_id"]
+            hw.h5.attrs["pipeline:dcnum background"] = self.ppdict["bg_id"]
+            hw.h5.attrs["pipeline:dcnum segmenter"] = self.ppdict["seg_id"]
+            hw.h5.attrs["pipeline:dcnum feature"] = self.ppdict["feat_id"]
+            hw.h5.attrs["pipeline:dcnum gate"] = self.ppdict["gate_id"]
+            hw.h5.attrs["pipeline:dcnum hash"] = self.pphash
+            hw.h5.attrs["pipeline:dcnum yield"] = self.event_count
+            # regular metadata
+            hw.h5.attrs["experiment:event count"] = self.event_count
+            hw.h5.attrs["imaging:pixel size"] = self.draw.pixel_size
+            if self.path_log.exists():
+                # Add the log file to the resulting .rtdc file
+                hw.store_log(
+                    time.strftime("dcnum-process-%Y-%m-%d-%H.%M.%S"),
+                    self.path_log.read_text().split("\n"))
+            # copy metadata/logs/tables from original file
+            with h5py.File(self.job["path_in"]) as h5_src:
+                copy_metadata(h5_src=h5_src,
+                              h5_dst=hw.h5,
+                              # don't copy basins
+                              copy_basins=False)
+
+        # Rename the output file
+        self.path_temp_out.rename(self.job["path_out"])
+        self._progress = 1.0
+        self._state = "done"
+
+    def task_background(self):
+        """Perform background computation task
+
+        This populates the file `self.path_temp_in` with the 'image_bg'
+        feature.
+        """
+        self.logger.info("Starting background computation")
+        if self._data_temp_in is not None:
+            # Close the temporary input data file, so we can write to it.
+            self._data_temp_in.close()
+            self._data_temp_in = None
+        # Start background computation
+        bg_code = self.job["background_code"]
+        bg_cls = get_available_background_methods()[bg_code]
+        with bg_cls(
+                input_data=self.job["path_in"],
+                output_path=self.path_temp_in,
+                # always compress, the disk is usually the bottleneck
+                compress=True,
+                num_cpus=self.job["num_procs"],
+                # custom kwargs
+                **self.job["background_kwargs"]) as bic:
+
+            bic.process()
+        self.logger.info("Finished background computation")
+
+    def task_segment_extract(self):
+        self.logger.info("Starting segmentation and feature extraction")
+        # Start writer thread
+        writer_dq = collections.deque()
+        ds_kwds = dict(hdf5plugin.Zstd(clevel=5))
+        ds_kwds["fletcher32"] = True
+        thr_write = DequeWriterThread(
+            path_out=self.path_temp_out,
+            dq=writer_dq,
+            mode="w",
+            ds_kwds=ds_kwds,
+        )
+        thr_write.start()
+
+        # Start segmentation thread
+        seg_cls = get_available_segmenters()[self.job["segmenter_code"]]
+        if seg_cls.requires_background_correction:
+            imdat = self.dtin.image_corr
+        else:
+            imdat = self.dtin.image
+
+        if self.job["debug"]:
+            num_slots = 1
+            num_extractors = 1
+        elif seg_cls.hardware_processor == "cpu":  # CPU segmenter
+            num_slots = 2
+            num_extractors = self.job["num_procs"] // 2
+        else:  # GPU segmenter
+            num_slots = 3
+            num_extractors = self.job["num_procs"]
+        num_extractors = max(1, num_extractors)
+
+        slot_chunks = mp_spawn.Array("i", num_slots)
+        slot_states = mp_spawn.Array("u", num_slots)
+
+        # Initialize thread
+        thr_segm = SegmenterManagerThread(
+            segmenter=seg_cls(**self.job["segmenter_kwargs"]),
+            image_data=imdat,
+            slot_states=slot_states,
+            slot_chunks=slot_chunks,
+        )
+        thr_segm.start()
+
+        # Start feature extractor thread
+        fe_kwargs = QueueEventExtractor.get_init_kwargs(
+            data=self.dtin,
+            gate=gate.Gate(self.dtin, **self.job["gate_kwargs"]),
+            log_queue=self.log_queue)
+        fe_kwargs["extract_kwargs"] = self.job["feature_kwargs"]
+
+        thr_feat = EventExtractorManagerThread(
+            slot_chunks=slot_chunks,
+            slot_states=slot_states,
+            fe_kwargs=fe_kwargs,
+            num_workers=num_extractors,
+            labels_list=thr_segm.labels_list,
+            debug=self.job["debug"])
+        thr_feat.start()
+
+        # Start the data collection thread
+        thr_coll = QueueCollectorThread(
+            data=self.dtin,
+            event_queue=fe_kwargs["event_queue"],
+            writer_dq=writer_dq,
+            feat_nevents=fe_kwargs["feat_nevents"],
+            write_threshold=500,
+        )
+        thr_coll.start()
+
+        data_size = len(self.dtin)
+        t0 = time.monotonic()
+
+        # So in principle we are done here. We do not have to do anything
+        # besides monitoring the progress.
+        pmin = 0.1  # from background computation
+        pmax = 0.95  # 5% reserved for cleanup
+        while True:
+            counted_frames = thr_coll.written_frames
+            self.event_count = thr_coll.written_events
+            td = time.monotonic() - t0
+            # set the current status
+            self._progress = round(
+                pmin + counted_frames / data_size * (pmax - pmin),
+                3)
+            self._segm_rate = counted_frames / (td or 0.03)
+            time.sleep(.5)
+            if counted_frames == data_size:
+                break
+
+        self.logger.debug("Flushing data to disk...")
+
+        # join threads
+        join_thread_helper(thr=thr_segm,
+                           timeout=30,
+                           retries=10,
+                           logger=self.logger,
+                           name="segmentation")
+        # Join the collector thread before the feature extractors. On
+        # compute clusters, we had problems with joining the feature
+        # extractors, maybe because the event_queue was not depleted.
+        join_thread_helper(thr=thr_coll,
+                           timeout=600,
+                           retries=10,
+                           logger=self.logger,
+                           name="collector for writer")
+        join_thread_helper(thr=thr_feat,
+                           timeout=30,
+                           retries=10,
+                           logger=self.logger,
+                           name="feature extraction")
+        thr_write.finished_when_queue_empty()
+        join_thread_helper(thr=thr_write,
+                           timeout=600,
+                           retries=10,
+                           logger=self.logger,
+                           name="writer")
+
+        self.event_count = thr_coll.written_events
+        if self.event_count == 0:
+            self.logger.error(
+                f"No events found in {self.draw.path}! Please check the "
+                f"input file or revise your pipeline.")
+
+        self.logger.info("Finished segmentation and feature extraction")
+
+    def task_transfer_basin_data(self):
+        with h5py.File(self.path_temp_out, "a") as hout:
+            hd = HDF5Data(hout)
+            for ii, _ in enumerate(hd.basins):
+                hindat, features = hd.get_basin_data(ii)
+                for feat in features:
+                    if feat not in hout["events"]:
+                        self.logger.debug(
+                            f"Transferring {feat} to output file.")
+                        h5py.h5o.copy(src_loc=hindat.h5["events"].id,
+                                      src_name=feat.encode(),
+                                      dst_loc=hout["events"].id,
+                                      dst_name=feat.encode(),
+                                      )
+
+
+def join_thread_helper(thr, timeout, retries, logger, name):
+    for _ in range(retries):
+        thr.join(timeout=timeout)
+        if thr.is_alive():
+            logger.info(f"Waiting for '{name}' ({thr}")
+        else:
+            logger.info(f"Joined thread '{name}'")
+            break
+    else:
+        logger.error(f"Failed to join thread '{name}'")
+        raise ValueError(
+            f"Thread '{name}' ({thr}) did not join within {timeout*retries}s!")
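Note that the sanity check in `DCNumJobRunner.__init__` can only warn about the `*_NUM_THREADS` environment variables; as the in-code comment explains, they must be set before numpy (or any other affected library) is imported. A minimal caller-side sketch of that advice (a hypothetical user script, not part of the package) could look like this:

import os

# Disable multithreading in numerical libraries *before* importing them;
# dcnum parallelizes via multiprocessing instead (see the comment in
# DCNumJobRunner.__init__ above).
for num_threads_var in ("OMP_NUM_THREADS", "MKL_NUM_THREADS",
                        "NUMEXPR_NUM_THREADS", "NUMBA_NUM_THREADS"):
    os.environ.setdefault(num_threads_var, "1")

import numpy as np  # noqa: E402,F401 -- imported only after the limits are set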
dcnum/logic/job.py
ADDED
@@ -0,0 +1,123 @@
+import collections
+import copy
+import inspect
+import multiprocessing as mp
+import pathlib
+from typing import Dict
+
+from ..feat import QueueEventExtractor
+from ..feat.feat_background.base import get_available_background_methods
+from ..feat.gate import Gate
+from ..meta.ppid import compute_pipeline_hash, DCNUM_PPID_GENERATION
+from ..read import HDF5Data
+from ..segm import get_available_segmenters
+
+
+class DCNumPipelineJob:
+    def __init__(self,
+                 path_in: pathlib.Path | str,
+                 path_out: pathlib.Path | str = None,
+                 data_code: str = "hdf",
+                 data_kwargs: Dict = None,
+                 background_code: str = "sparsemed",
+                 background_kwargs: Dict = None,
+                 segmenter_code: str = "thresh",
+                 segmenter_kwargs: Dict = None,
+                 feature_code: str = "legacy",
+                 feature_kwargs: Dict = None,
+                 gate_code: str = "norm",
+                 gate_kwargs: Dict = None,
+                 no_basins_in_output: bool = True,
+                 num_procs: int = None,
+                 debug: bool = False,
+                 ):
+        #: initialize keyword arguments for this job
+        self.kwargs = {}
+        spec = inspect.getfullargspec(DCNumPipelineJob.__init__)
+        locs = locals()
+        for arg in spec.args:
+            if arg == "self":
+                continue
+            value = locs[arg]
+            if value is None and spec.annotations[arg] is Dict:
+                value = {}
+            self.kwargs[arg] = value
+        # Set default pixel size for this job
+        if "pixel_size" not in self.kwargs["data_kwargs"]:
+            # Extract from input file
+            with HDF5Data(path_in) as hd:
+                self.kwargs["data_kwargs"]["pixel_size"] = hd.pixel_size
+        # Set default output path
+        if path_out is None:
+            pin = pathlib.Path(path_in)
+            path_out = pin.with_name(pin.stem + "_dcn.rtdc")
+        self.kwargs["path_out"] = pathlib.Path(path_out)
+        # Set default mask kwargs for segmenter
+        self.kwargs["segmenter_kwargs"].setdefault("kwargs_mask", {})
+        # Set default number of processes
+        if num_procs is None:
+            self.kwargs["num_procs"] = mp.cpu_count()
+
+    def __getitem__(self, item):
+        return copy.deepcopy(self.kwargs[item])
+
+    def __getstate__(self):
+        state = copy.deepcopy(self.kwargs)
+        return state
+
+    def __setstate__(self, state):
+        self.kwargs.clear()
+        self.kwargs.update(copy.deepcopy(state))
+
+    def assert_pp_codes(self):
+        """Sanity check of `self.kwargs`"""
+        # PPID classes with only one option
+        for cls, key in [
+            (HDF5Data, "data_code"),
+            (Gate, "gate_code"),
+            (QueueEventExtractor, "feature_code"),
+        ]:
+            code_act = self.kwargs[key]
+            code_exp = cls.get_ppid_code()
+            if code_act != code_exp:
+                raise ValueError(f"Invalid code '{code_act}' for '{key}', "
+                                 f"expected '{code_exp}'!")
+        # PPID classes with multiple options
+        for options, key in [
+            (get_available_background_methods(), "background_code"),
+            (get_available_segmenters(), "segmenter_code"),
+        ]:
+            code_act = self.kwargs[key]
+            if code_act not in options:
+                raise ValueError(f"Invalid code '{code_act}' for '{key}', "
+                                 f"expected one of '{options}'!")
+
+    def get_ppid(self, ret_hash=False, ret_dict=False):
+        self.assert_pp_codes()
+        pp_hash_kw = collections.OrderedDict()
+        pp_hash_kw["gen_id"] = DCNUM_PPID_GENERATION
+        for pp_kw, cls, cls_kw in [
+            ("dat_id", HDF5Data, "data_kwargs"),
+            ("bg_id",
+             get_available_background_methods()[
+                 self.kwargs["background_code"]],
+             "background_kwargs"),
+            ("seg_id",
+             get_available_segmenters()[self.kwargs["segmenter_code"]],
+             "segmenter_kwargs"),
+            ("feat_id", QueueEventExtractor, "feature_kwargs"),
+            ("gate_id", Gate, "gate_kwargs"),
+        ]:
+            pp_hash_kw[pp_kw] = cls.get_ppid_from_ppkw(self.kwargs[cls_kw])
+
+        ppid = "|".join(pp_hash_kw.values())
+
+        ret = [ppid]
+        if ret_hash:
+            pp_hash = compute_pipeline_hash(**pp_hash_kw)
+            ret.append(pp_hash)
+        if ret_dict:
+            ret.append(pp_hash_kw)
+        if len(ret) == 1:
+            ret = ret[0]
+        return ret
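Taken together, the two new modules define the high-level API added in 0.15.0: a `DCNumPipelineJob` holds the pipeline configuration, and a `DCNumJobRunner` (a `threading.Thread` subclass) executes it and reports progress. A minimal usage sketch, assuming both classes are re-exported from `dcnum.logic` (plausible given the four lines added to `dcnum/logic/__init__.py`, but not shown in this diff) and using a hypothetical input path:

import time

from dcnum.logic import DCNumPipelineJob, DCNumJobRunner  # assumed re-exports

# Configure the pipeline; unspecified options fall back to the defaults
# shown in job.py ("sparsemed" background, "thresh" segmenter, etc.).
job = DCNumPipelineJob(path_in="measurement.rtdc")  # hypothetical input file

runner = DCNumJobRunner(job=job)
runner.start()
while runner.is_alive():
    status = runner.get_status()  # {"progress": ..., "segm rate": ..., "state": ...}
    print(f"{status['state']}: {status['progress']:.0%}")
    time.sleep(1)
runner.join()  # join() also calls close(), cleaning up temporary files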