dcnum 0.16.1-py3-none-any.whl → 0.16.3-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.


dcnum/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.16.1'
16
- __version_tuple__ = version_tuple = (0, 16, 1)
15
+ __version__ = version = '0.16.3'
16
+ __version_tuple__ = version_tuple = (0, 16, 3)
dcnum/feat/event_extractor_manager_thread.py CHANGED
@@ -46,8 +46,6 @@ class EventExtractorManagerThread(threading.Thread):
46
46
  """
47
47
  super(EventExtractorManagerThread, self).__init__(
48
48
  name="EventExtractorManager", *args, **kwargs)
49
- if debug:
50
- fe_kwargs["close_queues"] = False
51
49
  self.logger = logging.getLogger(
52
50
  "dcnum.feat.EventExtractorManagerThread")
53
51
  #: Keyword arguments for class:`.EventExtractor`
@@ -83,9 +81,9 @@ class EventExtractorManagerThread(threading.Thread):
83
81
  for _ in range(self.num_workers)]
84
82
  [w.start() for w in workers]
85
83
 
84
+ num_slots = len(self.slot_states)
86
85
  chunks_processed = 0
87
86
  while True:
88
- num_slots = len(self.slot_states)
89
87
  cur_slot = 0
90
88
  unavailable_slots = 0
91
89
  # Check all slots for segmented labels
@@ -95,8 +93,10 @@ class EventExtractorManagerThread(threading.Thread):
95
93
  # - "s" the extractor processed the data and is waiting
96
94
  # for the segmenter
97
95
  if self.slot_states[cur_slot] == "e":
96
+ # The segmenter has something for us in this slot.
98
97
  break
99
98
  else:
99
+ # Try another slot.
100
100
  unavailable_slots += 1
101
101
  cur_slot = (cur_slot + 1) % num_slots
102
102
  if unavailable_slots >= num_slots:
@@ -136,15 +136,31 @@ class EventExtractorManagerThread(threading.Thread):
136
136
  if chunks_processed == self.data.image.num_chunks:
137
137
  break
138
138
 
139
- self.logger.debug("Waiting for event_queue to empty.")
140
139
  # Wait until the event queue is empty.
140
+ self.logger.debug("Waiting for event_queue to empty.")
141
141
  event_queue = self.fe_kwargs["event_queue"]
142
142
  while not event_queue.empty():
143
143
  # The collector thread is still sorting things out. Wait
144
144
  # before joining the threads.
145
- time.sleep(.1)
145
+ time.sleep(.05)
146
+
147
+ # Wait until log queue is empty
148
+ self.logger.debug("Waiting for log_queue to empty.")
149
+ log_queue = self.fe_kwargs["log_queue"]
150
+ while not log_queue.empty():
151
+ time.sleep(.05)
152
+
153
+ inv_masks = self.fe_kwargs["invalid_mask_counter"].value
154
+ if inv_masks:
155
+ self.logger.info(f"Encountered {inv_masks} invalid masks.")
156
+ inv_frac = inv_masks / len(self.data)
157
+ if inv_frac > 0.005: # warn above one half percent
158
+ self.logger.warning(f"Discarded {inv_frac:.1%} of the masks. "
159
+ f"Please check segmenter applicability.")
160
+
146
161
  self.logger.debug("Requesting extraction workers to join.")
147
162
  self.fe_kwargs["finalize_extraction"].value = True
148
163
  [w.join() for w in workers]
164
+
149
165
  self.logger.debug("Finished extraction.")
150
166
  self.logger.info(f"Extraction time: {self.t_count:.1f}s")
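The reworked shutdown sequence above now also drains the log queue and reports invalid masks, warning once more than half a percent of all masks were discarded. A small worked example of that threshold; the counts are made up for illustration:

    # Worked example of the 0.5 % warning threshold (counts are illustrative).
    inv_masks = 6_000         # value of the shared invalid_mask_counter
    num_images = 1_000_000    # len(self.data)
    inv_frac = inv_masks / num_images
    print(f"Encountered {inv_masks} invalid masks.")
    if inv_frac > 0.005:      # warn above one half percent
        print(f"Discarded {inv_frac:.1%} of the masks. "
              f"Please check segmenter applicability.")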
dcnum/feat/feat_background/base.py CHANGED
@@ -7,12 +7,16 @@ import uuid
7
7
  import warnings
8
8
 
9
9
  import h5py
10
- import hdf5plugin
11
10
  import numpy as np
12
11
 
13
12
  from ...meta import ppid
14
13
  from ...read import HDF5Data
15
- from ...write import create_with_basins
14
+ from ...write import create_with_basins, set_default_filter_kwargs
15
+
16
+
17
+ # All subprocesses should use 'spawn' to avoid issues with threads
18
+ # and 'fork' on POSIX systems.
19
+ mp_spawn = mp.get_context('spawn')
16
20
 
17
21
 
18
22
  class Background(abc.ABC):
@@ -55,12 +59,14 @@ class Background(abc.ABC):
55
59
  self.kwargs.update(kwargs)
56
60
 
57
61
  if num_cpus is None:
58
- num_cpus = mp.cpu_count()
62
+ num_cpus = mp_spawn.cpu_count()
59
63
  #: number of CPUs used
60
64
  self.num_cpus = num_cpus
61
65
 
62
- #: number of frames
63
- self.event_count = None
66
+ #: number of images in the input data
67
+ self.image_count = None
68
+ #: number of images that have been processed
69
+ self.image_proc = mp_spawn.Value("L", 0)
64
70
 
65
71
  #: HDF5Data instance for input data
66
72
  self.hdin = None
@@ -93,7 +99,7 @@ class Background(abc.ABC):
93
99
  #: shape of event images
94
100
  self.image_shape = self.input_data[0].shape
95
101
  #: total number of events
96
- self.event_count = len(self.input_data)
102
+ self.image_count = len(self.input_data)
97
103
 
98
104
  if self.h5out is None:
99
105
  if not output_path.exists():
@@ -105,19 +111,15 @@ class Background(abc.ABC):
105
111
  self.h5out = h5py.File(output_path, "a", libver="latest")
106
112
 
107
113
  # Initialize background data
108
- if compress:
109
- compression_kwargs = hdf5plugin.Zstd(clevel=5)
110
- else:
111
- compression_kwargs = {}
114
+ ds_kwargs = set_default_filter_kwargs(compression=compress)
112
115
  h5bg = self.h5out.require_dataset(
113
116
  "events/image_bg",
114
117
  shape=self.input_data.shape,
115
118
  dtype=np.uint8,
116
- chunks=(min(100, self.event_count),
119
+ chunks=(min(100, self.image_count),
117
120
  self.image_shape[0],
118
121
  self.image_shape[1]),
119
- fletcher32=True,
120
- **compression_kwargs,
122
+ **ds_kwargs,
121
123
  )
122
124
  h5bg.attrs.create('CLASS', np.string_('IMAGE'))
123
125
  h5bg.attrs.create('IMAGE_VERSION', np.string_('1.2'))
@@ -191,6 +193,13 @@ class Background(abc.ABC):
191
193
  ppid=pp_check_user_kwargs)
192
194
  return kwargs
193
195
 
196
+ def get_progress(self):
197
+ """Return progress of background computation, float in [0,1]"""
198
+ if self.image_count == 0:
199
+ return 0.
200
+ else:
201
+ return self.image_proc.value / self.image_count
202
+
194
203
  def process(self):
195
204
  self.process_approach()
196
205
  bg_ppid = self.get_ppid()
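With `event_count` renamed to `image_count` and the new shared `image_proc` counter, `Background.get_progress()` reports the fraction of processed images. A minimal sketch of polling it from a separate thread; the constructor keyword names (`input_data`, `output_path`) are assumptions, and the real constructors take more arguments than shown in this diff:

    import threading
    import time

    from dcnum.feat.feat_background.bg_sparse_median import BackgroundSparseMed

    # Keyword names below are assumptions for illustration.
    bic = BackgroundSparseMed(input_data="measurement.rtdc",
                              output_path="measurement_bg.rtdc")
    worker = threading.Thread(target=bic.process)
    worker.start()
    while worker.is_alive():
        # get_progress() returns image_proc / image_count, a float in [0, 1].
        print(f"background progress: {bic.get_progress():.0%}", end="\r")
        time.sleep(0.5)
    worker.join()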
dcnum/feat/feat_background/bg_roll_median.py CHANGED
@@ -1,16 +1,10 @@
1
- import multiprocessing as mp
2
1
  import queue
3
2
  import time
4
3
 
5
4
  import numpy as np
6
5
  from scipy import ndimage
7
6
 
8
- from .base import Background
9
-
10
-
11
- # All subprocesses should use 'spawn' to avoid issues with threads
12
- # and 'fork' on POSIX systems.
13
- mp_spawn = mp.get_context('spawn')
7
+ from .base import mp_spawn, Background
14
8
 
15
9
 
16
10
  class BackgroundRollMed(Background):
@@ -152,9 +146,9 @@ class BackgroundRollMed(Background):
152
146
  stop_in = (batch_index + 1) * self.batch_size + self.kernel_size
153
147
  stop_out = (batch_index + 1) * self.batch_size
154
148
 
155
- if stop_in > self.event_count:
156
- stop_in = self.event_count
157
- stop_out = self.event_count - self.kernel_size
149
+ if stop_in > self.image_count:
150
+ stop_in = self.image_count
151
+ stop_out = self.image_count - self.kernel_size
158
152
 
159
153
  slice_in = slice(start, stop_in)
160
154
  slice_out = slice(start, stop_out)
@@ -175,16 +169,14 @@ class BackgroundRollMed(Background):
175
169
 
176
170
  def process_approach(self):
177
171
  """Perform median computation on entire input data"""
178
- num_steps = int(np.ceil(self.event_count / self.batch_size))
172
+ num_steps = int(np.ceil(self.image_count / self.batch_size))
179
173
  for ii in range(num_steps):
180
- print(f"Computing background {ii/num_steps*100:.0f}%",
181
- end="\r", flush=True)
182
174
  self.process_next_batch()
183
175
  # Set the remaining kernel_size median values to the last one
184
176
  last_image = self.h5out["events/image_bg"][-self.kernel_size-1]
185
177
  for ii in range(self.kernel_size):
186
- self.h5out["events/image_bg"][self.event_count-ii-1] = last_image
187
- print("Computing background 100% ", flush=True)
178
+ self.h5out["events/image_bg"][self.image_count-ii-1] = last_image
179
+ self.image_proc.value = self.image_count
188
180
 
189
181
  def process_next_batch(self):
190
182
  """Process one batch of input data"""
@@ -221,6 +213,7 @@ class BackgroundRollMed(Background):
221
213
  *self.image_shape)
222
214
 
223
215
  self.current_batch += 1
216
+ self.image_proc.value += self.batch_size
224
217
 
225
218
 
226
219
  class MedianWorker(mp_spawn.Process):
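`bg_roll_median` now imports the shared spawn context from `base` and advances the shared `image_proc` value after every batch. A stand-alone sketch of that pattern (a spawn-context counter incremented by a worker process and read by the parent); all names here are illustrative:

    import multiprocessing as mp

    # Same start method as dcnum: 'spawn' avoids fork/thread issues on POSIX.
    mp_spawn = mp.get_context("spawn")


    def process_batches(progress, num_batches, batch_size):
        for _ in range(num_batches):
            # ... compute one batch of rolling-median background images ...
            with progress.get_lock():
                progress.value += batch_size


    if __name__ == "__main__":
        image_proc = mp_spawn.Value("L", 0)   # unsigned long counter
        p = mp_spawn.Process(target=process_batches, args=(image_proc, 10, 100))
        p.start()
        p.join()
        print(image_proc.value)  # 1000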
dcnum/feat/feat_background/bg_sparse_median.py CHANGED
@@ -1,19 +1,15 @@
1
1
  import logging
2
- import multiprocessing as mp
3
2
  import queue
4
3
  import time
5
4
 
6
5
  import numpy as np
7
6
  from scipy import ndimage
8
7
 
9
- from .base import Background
10
-
11
- logger = logging.getLogger(__name__)
8
+ from ...read import HDF5Data
12
9
 
10
+ from .base import mp_spawn, Background
13
11
 
14
- # All subprocesses should use 'spawn' to avoid issues with threads
15
- # and 'fork' on POSIX systems.
16
- mp_spawn = mp.get_context('spawn')
12
+ logger = logging.getLogger(__name__)
17
13
 
18
14
 
19
15
  class BackgroundSparseMed(Background):
@@ -96,27 +92,28 @@ class BackgroundSparseMed(Background):
96
92
  # time axis
97
93
  self.time = None
98
94
  if self.h5in is not None:
99
- if "time" in self.h5in["events"]:
95
+ hd = HDF5Data(self.h5in)
96
+ if "time" in hd:
100
97
  # use actual time from dataset
101
- self.time = self.h5in["/events/time"][:]
98
+ self.time = hd["time"][:]
102
99
  self.time -= self.time[0]
103
- elif "imaging:frame rate" in self.h5in.attrs:
104
- fr = self.h5in.attrs["imaging:frame rate"]
105
- if "frame" in self.h5in["/events"]:
100
+ elif "imaging:frame rate" in hd.meta:
101
+ fr = hd.meta["imaging:frame rate"]
102
+ if "frame" in hd:
106
103
  # compute time from frame rate and frame numbers
107
- self.time = self.h5in["/events/frame"] / fr
104
+ self.time = hd["frame"] / fr
108
105
  self.time -= self.time[0]
109
106
  else:
110
107
  # compute time using frame rate (approximate)
111
- dur = self.event_count / fr * 1.5
108
+ dur = self.image_count / fr * 1.5
112
109
  logger.info(f"Approximating duration: {dur/60:.1f}min")
113
- self.time = np.linspace(0, dur, self.event_count,
110
+ self.time = np.linspace(0, dur, self.image_count,
114
111
  endpoint=True)
115
112
  if self.time is None:
116
113
  # No HDF5 file or no information therein; Make an educated guess.
117
- dur = self.event_count / 3600 * 1.5
114
+ dur = self.image_count / 3600 * 1.5
118
115
  logger.info(f"Guessing duration: {dur/60:.1f}min")
119
- self.time = np.linspace(0, dur, self.event_count,
116
+ self.time = np.linspace(0, dur, self.image_count,
120
117
  endpoint=True)
121
118
 
122
119
  #: duration of the measurement
@@ -212,10 +209,7 @@ class BackgroundSparseMed(Background):
212
209
 
213
210
  # Compute initial background images (populates self.bg_images)
214
211
  for ii, ti in enumerate(self.step_times):
215
- print(f"Computing background {ii / self.step_times.size:.0%}",
216
- end="\r", flush=True)
217
212
  self.process_second(ii, ti)
218
- print("Computing background 100% ", flush=True)
219
213
 
220
214
  if self.frac_cleansing != 1:
221
215
  # The following algorithm finds background images that contain
@@ -277,7 +271,7 @@ class BackgroundSparseMed(Background):
277
271
  f"`thresh_cleansing` or `frac_cleansing`. The new "
278
272
  f"threshold is {thresh_fact / thresh}.")
279
273
 
280
- logger.info(f"Removed {frac_remove:.2%} of the background series")
274
+ logger.info(f"Cleansed {frac_remove:.2%}")
281
275
  step_times = self.step_times[used]
282
276
  bg_images = self.bg_images[used]
283
277
  else:
@@ -286,7 +280,7 @@ class BackgroundSparseMed(Background):
286
280
  bg_images = self.bg_images
287
281
 
288
282
  # Assign each frame to a certain background index
289
- bg_idx = np.zeros(self.event_count, dtype=int)
283
+ bg_idx = np.zeros(self.image_count, dtype=int)
290
284
  idx0 = 0
291
285
  idx1 = None
292
286
  for ii in range(len(step_times)):
@@ -298,21 +292,25 @@ class BackgroundSparseMed(Background):
298
292
  # Fill up remainder of index array with last entry
299
293
  bg_idx[idx1:] = ii
300
294
 
295
+ self.image_proc.value = self.image_count
296
+
301
297
  # Write background data
302
298
  pos = 0
303
299
  step = 1000
304
- while pos < self.event_count:
305
- stop = min(pos + step, self.event_count)
300
+ while pos < self.image_count:
301
+ stop = min(pos + step, self.image_count)
306
302
  cur_slice = slice(pos, stop)
307
303
  self.h5out["events/image_bg"][cur_slice] = \
308
304
  bg_images[bg_idx[cur_slice]]
309
305
  pos += step
310
306
 
311
- def process_second(self, ii, second):
307
+ def process_second(self,
308
+ ii: int,
309
+ second: float | int):
312
310
  idx_start = np.argmin(np.abs(second - self.time))
313
311
  idx_stop = idx_start + self.kernel_size
314
- if idx_stop >= self.event_count:
315
- idx_stop = self.event_count
312
+ if idx_stop >= self.image_count:
313
+ idx_stop = self.image_count
316
314
  idx_start = max(0, idx_stop - self.kernel_size)
317
315
  assert idx_stop - idx_start == self.kernel_size
318
316
 
@@ -347,6 +345,8 @@ class BackgroundSparseMed(Background):
347
345
 
348
346
  self.bg_images[ii] = self.shared_output.reshape(self.image_shape)
349
347
 
348
+ self.image_proc.value = idx_stop
349
+
350
350
 
351
351
  class MedianWorkerSingle(mp_spawn.Process):
352
352
  def __init__(self, job_queue, counter, shared_input, shared_output,
dcnum/feat/queue_event_extractor.py CHANGED
@@ -35,7 +35,8 @@ class QueueEventExtractor:
35
35
  feat_nevents: mp.Array,
36
36
  label_array: mp.Array,
37
37
  finalize_extraction: mp.Value,
38
- close_queues: bool = True,
38
+ invalid_mask_counter: mp.Value,
39
+ log_level: int = logging.INFO,
39
40
  extract_kwargs: dict = None,
40
41
  *args, **kwargs):
41
42
  """Base class for event extraction from label images
@@ -66,9 +67,10 @@ class QueueEventExtractor:
66
67
  finalize_extraction:
67
68
  Shared value indicating whether this worker should stop as
68
69
  soon as the `raw_queue` is empty.
69
- close_queues: bool
70
- Whether to close event and logging queues
71
- (set to False in debug mode)
70
+ invalid_mask_counter:
71
+ Counts masks labeled as invalid by the feature extractor
72
+ log_level:
73
+ Logging level to use
72
74
  extract_kwargs:
73
75
  Keyword arguments for the extraction process. See the
74
76
  keyword-only arguments in
@@ -85,7 +87,13 @@ class QueueEventExtractor:
85
87
  self.event_queue = event_queue
86
88
  #: queue for logging
87
89
  self.log_queue = log_queue
88
- self.close_queues = close_queues
90
+ #: invalid mask counter
91
+ self.invalid_mask_counter = invalid_mask_counter
92
+ # Logging needs to be set up after `start` is called, otherwise
93
+ # it looks like we have the same PID as the parent process. We
94
+ # are setting up logging in `run`.
95
+ self.logger = None
96
+ self.log_level = log_level
89
97
  #: Shared array of length `len(data)` into which the number of
90
98
  #: events per frame is written.
91
99
  self.feat_nevents = feat_nevents
@@ -100,15 +108,12 @@ class QueueEventExtractor:
100
108
  extract_kwargs.setdefault("haralick", True)
101
109
  #: Feature extraction keyword arguments.
102
110
  self.extract_kwargs = extract_kwargs
103
- # Logging needs to be set up after `start` is called, otherwise
104
- # it looks like we have the same PID as the parent process. We
105
- # are setting up logging in `run`.
106
- self.logger = None
107
111
 
108
112
  @staticmethod
109
113
  def get_init_kwargs(data: HDF5Data,
110
114
  gate: Gate,
111
115
  log_queue: mp.Queue,
116
+ log_level: int = logging.INFO,
112
117
  preselect: None = None,
113
118
  ptp_median: None = None):
114
119
  """Get initialization arguments for :cass:`.QueueEventExtractor`
@@ -125,7 +130,9 @@ class QueueEventExtractor:
125
130
  gate: HDF5Data
126
131
  Gating class to use
127
132
  log_queue: mp.Queue
128
- Queue for sending log messages
133
+ Queue the worker uses for sending log messages
134
+ log_level: int
135
+ Logging level to use in the worker process
129
136
  preselect, ptp_median:
130
137
  Deprecated
131
138
 
@@ -146,6 +153,7 @@ class QueueEventExtractor:
146
153
  warnings.warn("The `ptp_median` argument is deprecated!",
147
154
  DeprecationWarning)
148
155
 
156
+ # Note that the order must be identical to __init__
149
157
  args = collections.OrderedDict()
150
158
  args["data"] = data
151
159
  args["gate"] = gate
@@ -159,7 +167,8 @@ class QueueEventExtractor:
159
167
  np.ctypeslib.ctypes.c_int16,
160
168
  int(np.prod(data.image.chunk_shape)))
161
169
  args["finalize_extraction"] = mp_spawn.Value("b", False)
162
- args["close_queues"] = True
170
+ args["invalid_mask_counter"] = mp_spawn.Value("L", 0)
171
+ args["log_level"] = log_level
163
172
  return args
164
173
 
165
174
  def get_events_from_masks(self, masks, data_index, *,
@@ -207,8 +216,7 @@ class QueueEventExtractor:
207
216
  # over from gated_events to valid_events. According to our experience
208
217
  # invalid events happen rarely though.
209
218
  if np.any(invalid):
210
- self.logger.info(f"Discarded {np.sum(invalid)} events due to "
211
- "invalid segmentation.")
219
+ self.invalid_mask_counter.value += np.sum(invalid)
212
220
  for key in gated_events:
213
221
  valid_events[key] = gated_events[key][valid]
214
222
  else:
@@ -294,17 +302,27 @@ class QueueEventExtractor:
294
302
  """Main loop of worker process"""
295
303
  # Don't wait for these two queues when joining workers
296
304
  self.raw_queue.cancel_join_thread()
297
- self.log_queue.cancel_join_thread()
298
305
  #: logger sends all logs to `self.log_queue`
299
306
  self.logger = logging.getLogger(
300
307
  f"dcnum.feat.EventExtractor.{os.getpid()}")
308
+ self.logger.setLevel(self.log_level)
309
+ # Clear any handlers that might be set for this logger. This is
310
+ # important for the case when we are an instance of
311
+ # EventExtractorThread, because then all handlers from the main
312
+ # thread are inherited (as opposed to no handlers in the case
313
+ # of EventExtractorProcess).
314
+ self.logger.handlers.clear()
301
315
  queue_handler = QueueHandler(self.log_queue)
316
+ queue_handler.setLevel(self.log_level)
302
317
  self.logger.addHandler(queue_handler)
303
- self.logger.addFilter(DeduplicatingLoggingFilter())
304
- self.logger.debug(f"Running {self} in PID {os.getpid()}")
318
+ self.logger.info("Ready")
305
319
 
306
320
  mp_array = np.ctypeslib.as_array(
307
321
  self.label_array).reshape(self.data.image.chunk_shape)
322
+
323
+ # only close queues when we have created them ourselves.
324
+ close_queues = isinstance(self, EventExtractorProcess)
325
+
308
326
  while True:
309
327
  try:
310
328
  chunk_index, label_index = self.raw_queue.get(timeout=.03)
@@ -332,15 +350,21 @@ class QueueEventExtractor:
332
350
  self.event_queue.put((index, events))
333
351
 
334
352
  self.logger.debug(f"Finalizing `run` for PID {os.getpid()}, {self}")
335
- if self.close_queues:
353
+ if close_queues:
336
354
  # Explicitly close the event queue and join it
337
355
  self.event_queue.close()
338
356
  self.event_queue.join_thread()
339
357
  self.logger.debug(f"End of `run` for PID {os.getpid()}, {self}")
358
+
359
+ # Make sure everything gets written to the queue.
360
+ queue_handler.flush()
361
+
362
+ if close_queues:
340
363
  # Also close the logging queue. Note that not all messages might
341
364
  # arrive in the logging queue, since we called `cancel_join_thread`
342
365
  # earlier.
343
366
  self.log_queue.close()
367
+ self.log_queue.join_thread()
344
368
 
345
369
  @classmethod
346
370
  def get_ppid_from_kwargs(cls, kwargs):
@@ -362,17 +386,3 @@ class EventExtractorThread(QueueEventExtractor, threading.Thread):
362
386
  def __init__(self, *args, **kwargs):
363
387
  super(EventExtractorThread, self).__init__(
364
388
  name="EventExtractorThread", *args, **kwargs)
365
-
366
-
367
- class DeduplicatingLoggingFilter(logging.Filter):
368
- def __init__(self, *args, **kwargs):
369
- super(DeduplicatingLoggingFilter, self).__init__(*args, **kwargs)
370
- self._records = []
371
-
372
- def filter(self, record):
373
- """Return True if the record should be logged"""
374
- msg = record.getMessage()
375
- logged = msg in self._records
376
- if not logged:
377
- self._records.append(msg)
378
- return not logged
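The per-process `DeduplicatingLoggingFilter` is gone; each extractor now configures a `QueueHandler` at a configurable `log_level` inside `run()`, and the parent drains the queue with a `QueueListener` (see `ctrl.py` below). A minimal sketch of this standard-library pattern, independent of dcnum:

    import logging
    from logging.handlers import QueueHandler, QueueListener
    import multiprocessing as mp

    mp_spawn = mp.get_context("spawn")


    def worker(log_queue, log_level):
        # Logging is set up inside the worker, after `start` was called.
        logger = logging.getLogger("dcnum.feat.EventExtractor.example")
        logger.setLevel(log_level)
        logger.handlers.clear()
        handler = QueueHandler(log_queue)
        handler.setLevel(log_level)
        logger.addHandler(handler)
        logger.info("Ready")


    if __name__ == "__main__":
        log_queue = mp_spawn.Queue()
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        listener = QueueListener(log_queue, console, respect_handler_level=True)
        listener.start()
        p = mp_spawn.Process(target=worker, args=(log_queue, logging.INFO))
        p.start()
        p.join()
        listener.stop()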
dcnum/logic/ctrl.py CHANGED
@@ -1,15 +1,18 @@
1
1
  import collections
2
+ import datetime
3
+ import json
2
4
  import logging
3
5
  from logging.handlers import QueueListener
4
6
  import multiprocessing as mp
5
7
  import os
6
8
  import pathlib
9
+ import platform
7
10
  import socket
8
11
  import threading
9
12
  import time
13
+ import traceback
10
14
  import uuid
11
15
 
12
- import hdf5plugin
13
16
  import h5py
14
17
 
15
18
  from ..feat.feat_background.base import get_available_background_methods
@@ -19,18 +22,32 @@ from ..feat import EventExtractorManagerThread
19
22
  from ..segm import SegmenterManagerThread, get_available_segmenters
20
23
  from ..meta import ppid
21
24
  from ..read import HDF5Data
25
+ from .._version import version_tuple
22
26
  from ..write import (
23
27
  DequeWriterThread, HDF5Writer, QueueCollectorThread,
24
- copy_metadata, create_with_basins,
28
+ copy_metadata, create_with_basins, set_default_filter_kwargs
25
29
  )
26
30
 
27
31
  from .job import DCNumPipelineJob
28
-
32
+ from .json_encoder import ExtendedJSONEncoder
29
33
 
30
34
  # Force using "spawn" method for multiprocessing, because we are using
31
35
  # queues and threads and would end up with race conditions otherwise.
32
36
  mp_spawn = mp.get_context("spawn")
33
37
 
38
+ #: valid states for a job runner. The states must be in logical order,
39
+ #: not in alphabetical order.
40
+ valid_states = [
41
+ "created",
42
+ "init",
43
+ "setup",
44
+ "background",
45
+ "segmentation",
46
+ "cleanup",
47
+ "done",
48
+ "error",
49
+ ]
50
+
34
51
 
35
52
  class DCNumJobRunner(threading.Thread):
36
53
  def __init__(self,
@@ -48,6 +65,7 @@ class DCNumJobRunner(threading.Thread):
48
65
  (defaults to hostname)
49
66
  """
50
67
  super(DCNumJobRunner, self).__init__(*args, **kwargs)
68
+ self.error_tb = None
51
69
  self.job = job
52
70
  if tmp_suffix is None:
53
71
  tmp_suffix = f"{socket.gethostname()}_{str(uuid.uuid4())[:5]}"
@@ -61,16 +79,16 @@ class DCNumJobRunner(threading.Thread):
61
79
  # current job state
62
80
  self._state = "init"
63
81
  # overall progress [0, 1]
64
- self._progress = 0
82
+ self._progress_bg = None
83
+ self._progress_ex = None
65
84
  # segmentation frame rate
66
85
  self._segm_rate = 0
67
86
 
68
87
  # Set up logging
69
88
  # General logger for this job
70
- self.logger = logging.getLogger(__name__).getChild(
71
- f"Runner-{self.pphash[:5]}")
72
- self.logger.setLevel(
73
- logging.DEBUG if job["debug"] else logging.WARNING)
89
+ self.main_logger = logging.getLogger("dcnum")
90
+ self.main_logger.setLevel(
91
+ logging.DEBUG if job["debug"] else logging.INFO)
74
92
  # Log file output in target directory
75
93
  self.path_log = job["path_out"].with_suffix(".log")
76
94
  self.path_log.parent.mkdir(exist_ok=True, parents=True)
@@ -81,17 +99,30 @@ class DCNumJobRunner(threading.Thread):
81
99
  delay=True,
82
100
  errors="ignore",
83
101
  )
102
+ # Set the log file handler level to DEBUG, so it logs everything
103
+ # presented to it.
104
+ self._log_file_handler.setLevel(logging.DEBUG)
84
105
  fmt = logging.Formatter(
85
- "%(asctime)s %(levelname)s %(processName)s/%(threadName)s "
86
- + "in %(name)s: %(message)s")
106
+ fmt="%(asctime)s %(levelname)s %(name)s: %(message)s",
107
+ datefmt='%H:%M:%S'
108
+ )
87
109
  self._log_file_handler.setFormatter(fmt)
88
- self.logger.addHandler(self._log_file_handler)
89
- handlers = list(self.logger.handlers)
110
+ self.main_logger.addHandler(self._log_file_handler)
111
+ handlers = list(self.main_logger.handlers)
112
+
90
113
  # Queue for subprocesses to log to
91
114
  self.log_queue = mp_spawn.Queue()
92
- self._qlisten = QueueListener(self.log_queue, *handlers)
115
+ self._qlisten = QueueListener(self.log_queue, *handlers,
116
+ respect_handler_level=True)
93
117
  self._qlisten.start()
94
118
 
119
+ if job["debug"]:
120
+ self.main_logger.info("Note that in debugging mode, duplicate "
121
+ "log entries may appear (logs that are "
122
+ "recorded via queues).")
123
+
124
+ self.logger = logging.getLogger(f"dcnum.Runner-{self.pphash[:2]}")
125
+
95
126
  # Sanity checks
96
127
  for os_env in [
97
128
  "OMP_NUM_THREADS",
@@ -159,6 +190,16 @@ class DCNumJobRunner(threading.Thread):
159
190
  po = pathlib.Path(self.job["path_out"])
160
191
  return po.with_name(po.stem + f"_output_{self.tmp_suffix}.rtdc~")
161
192
 
193
+ @property
194
+ def state(self):
195
+ return self._state
196
+
197
+ @state.setter
198
+ def state(self, state):
199
+ if state not in valid_states:
200
+ raise ValueError(f"Invalid state '{state}' specified!")
201
+ self._state = state
202
+
162
203
  def close(self, delete_temporary_files=True):
163
204
  if self._data_raw is not None:
164
205
  self._data_raw.close()
@@ -167,8 +208,8 @@ class DCNumJobRunner(threading.Thread):
167
208
  self._data_temp_in.close()
168
209
  self._data_temp_in = None
169
210
  # clean up logging
170
- if self._log_file_handler in self.logger.handlers:
171
- self.logger.removeHandler(self._log_file_handler)
211
+ if self._log_file_handler in self.main_logger.handlers:
212
+ self.main_logger.removeHandler(self._log_file_handler)
172
213
  self._log_file_handler.flush()
173
214
  self._log_file_handler.close()
174
215
  if self._qlisten is not None:
@@ -184,26 +225,67 @@ class DCNumJobRunner(threading.Thread):
184
225
  # We don't have to delete self.path_temp_out, since this one
185
226
  # is `rename`d to `self.job["path_out"]`.
186
227
 
187
- def join(self, *args, **kwargs):
228
+ def join(self, delete_temporary_files=True, *args, **kwargs):
188
229
  super(DCNumJobRunner, self).join(*args, **kwargs)
189
230
  # Close only after join
190
- self.close()
231
+ self.close(delete_temporary_files=delete_temporary_files)
191
232
 
192
233
  def get_status(self):
234
+ # Compute the total progress. The following weights indicate
235
+ # how much fractional time each processing step takes.
236
+ bgw = 4 # fraction of background
237
+ exw = 27 # fraction of segmentation and feature extraction
238
+ clw = 1 # fraction of cleanup operations
239
+ tot = bgw + exw + clw
240
+ progress = 0
241
+ st = self.state
242
+
243
+ # background
244
+ if valid_states.index(st) > valid_states.index("background"):
245
+ # background already computed
246
+ progress += bgw / tot
247
+ elif self._progress_bg is not None:
248
+ # This is the image count of the input dataset
249
+ progress += bgw / tot * (self._progress_bg.value / len(self.draw))
250
+
251
+ # segmentation
252
+ if valid_states.index(st) > valid_states.index("segmentation"):
253
+ # segmentation already done
254
+ progress += exw / tot
255
+ elif self._progress_ex is not None:
256
+ progress += exw / tot * self._progress_ex
257
+
258
+ if self.state == "done":
259
+ progress = 1
260
+
193
261
  return {
194
- "progress": self._progress,
262
+ "progress": progress,
195
263
  "segm rate": self._segm_rate,
196
264
  "state": self._state,
197
265
  }
198
266
 
199
267
  def run(self):
268
+ try:
269
+ self.run_pipeline()
270
+ except BaseException:
271
+ self.state = "error"
272
+ self.error_tb = traceback.format_exc()
273
+ if not self.is_alive():
274
+ # Thread has not been started. This means we are not running
275
+ # in a thread but in the main process. Raise the exception.
276
+ raise
277
+
278
+ def run_pipeline(self):
200
279
  """Execute the pipeline job"""
280
+ time_start = time.monotonic()
281
+ time_string = time.strftime("%Y-%m-%d-%H.%M.%S", time.gmtime())
282
+ self.logger.info(f"Run start: {time_string}")
201
283
  if self.job["path_out"].exists():
202
284
  raise FileExistsError(
203
285
  f"Output file {self.job['path_out']} already exists!")
204
286
  # Make sure the output directory exists.
205
287
  self.job["path_out"].parent.mkdir(parents=True, exist_ok=True)
206
- self._state = "setup"
288
+ self.state = "setup"
207
289
  # First get a list of all pipeline IDs. If the input file has
208
290
  # already been processed by dcnum, then we do not have to redo
209
291
  # everything.
@@ -223,11 +305,10 @@ class DCNumJobRunner(threading.Thread):
223
305
  # The number of events extracted in a potential previous pipeline run.
224
306
  evyield = self.draw.h5.attrs.get("pipeline:dcnum yield", -1)
225
307
  redo_sanity = (
226
- # Whether pipeline hash is invalid.
227
- ppid.compute_pipeline_hash(**datdict) != dathash
228
- # Whether the input file is the original output of the pipeline.
229
- or len(self.draw) != evyield
230
- )
308
+ # Whether pipeline hash is invalid.
309
+ ppid.compute_pipeline_hash(**datdict) != dathash
310
+ # Whether the input file is the original output of the pipeline.
311
+ or len(self.draw) != evyield)
231
312
  # Do we have to recompute the background data? In addition to the
232
313
  # hash sanity check above, check the generation, input data,
233
314
  # and background pipeline identifiers.
@@ -246,7 +327,7 @@ class DCNumJobRunner(threading.Thread):
246
327
  or (datdict["feat_id"] != self.ppdict["feat_id"])
247
328
  or (datdict["gate_id"] != self.ppdict["gate_id"]))
248
329
 
249
- self._state = "background"
330
+ self.state = "background"
250
331
 
251
332
  if redo_bg:
252
333
  # The 'image_bg' feature is written to `self.path_temp_in`.
@@ -255,8 +336,7 @@ class DCNumJobRunner(threading.Thread):
255
336
  # (note that `self.path_temp_in` is basin-based).
256
337
  self.task_background()
257
338
 
258
- self._progress = 0.1
259
- self._state = "segmentation"
339
+ self.state = "segmentation"
260
340
 
261
341
  # We have the input data covered, and we have to run the
262
342
  # long-lasting segmentation and feature extraction step.
@@ -280,8 +360,7 @@ class DCNumJobRunner(threading.Thread):
280
360
  # reflected in `self.path_temp_out`.
281
361
  self.path_temp_in.rename(self.path_temp_out)
282
362
 
283
- self._progress = 0.95
284
- self._state = "cleanup"
363
+ self.state = "cleanup"
285
364
 
286
365
  # The user would normally expect the output file to be something
287
366
  # that is self-contained (copying the file wildly across file
@@ -304,11 +383,36 @@ class DCNumJobRunner(threading.Thread):
304
383
  # regular metadata
305
384
  hw.h5.attrs["experiment:event count"] = self.event_count
306
385
  hw.h5.attrs["imaging:pixel size"] = self.draw.pixel_size
307
- if self.path_log.exists():
308
- # Add the log file to the resulting .rtdc file
309
- hw.store_log(
310
- time.strftime("dcnum-process-%Y-%m-%d-%H.%M.%S"),
311
- self.path_log.read_text().split("\n"))
386
+ # Add job information to resulting .rtdc file
387
+ hw.store_log(f"dcnum-job-{time_string}",
388
+ json.dumps({
389
+ "dcnum version": version_tuple,
390
+ "job": self.job.__getstate__(),
391
+ "pipeline": {"identifiers": self.ppdict,
392
+ "hash": self.pphash,
393
+ },
394
+ "python": {
395
+ "build": ", ".join(platform.python_build()),
396
+ "implementation":
397
+ platform.python_implementation(),
398
+ "version": platform.python_version(),
399
+ },
400
+ "system": {
401
+ "info": platform.platform(),
402
+ "machine": platform.machine(),
403
+ "name": platform.system(),
404
+ "release": platform.release(),
405
+ "version": platform.version(),
406
+ },
407
+ "tasks": {"background": redo_bg,
408
+ "segmentation": redo_seg
409
+ },
410
+ },
411
+ indent=2,
412
+ sort_keys=True,
413
+ cls=ExtendedJSONEncoder,
414
+ ).split("\n"))
415
+
312
416
  # copy metadata/logs/tables from original file
313
417
  with h5py.File(self.job["path_in"]) as h5_src:
314
418
  copy_metadata(h5_src=h5_src,
@@ -330,10 +434,20 @@ class DCNumJobRunner(threading.Thread):
330
434
  mid_new = f"{mid_cur}_{mid_ap}" if mid_cur else mid_ap
331
435
  hw.h5.attrs["experiment:run identifier"] = mid_new
332
436
 
437
+ trun = datetime.timedelta(seconds=round(time.monotonic() - time_start))
438
+ self.logger.info(f"Run duration: {str(trun)}")
439
+ self.logger.info(time.strftime("Run stop: %Y-%m-%d-%H.%M.%S",
440
+ time.gmtime()))
441
+ # Add the log file to the resulting .rtdc file
442
+ if self.path_log.exists():
443
+ with HDF5Writer(self.path_temp_out) as hw:
444
+ hw.store_log(
445
+ f"dcnum-log-{time_string}",
446
+ self.path_log.read_text().strip().split("\n"))
447
+
333
448
  # Rename the output file
334
449
  self.path_temp_out.rename(self.job["path_out"])
335
- self._progress = 1.0
336
- self._state = "done"
450
+ self.state = "done"
337
451
 
338
452
  def task_background(self):
339
453
  """Perform background computation task
@@ -357,7 +471,7 @@ class DCNumJobRunner(threading.Thread):
357
471
  num_cpus=self.job["num_procs"],
358
472
  # custom kwargs
359
473
  **self.job["background_kwargs"]) as bic:
360
-
474
+ self._progress_bg = bic.image_proc
361
475
  bic.process()
362
476
  self.logger.info("Finished background computation")
363
477
 
@@ -365,14 +479,13 @@ class DCNumJobRunner(threading.Thread):
365
479
  self.logger.info("Starting segmentation and feature extraction")
366
480
  # Start writer thread
367
481
  writer_dq = collections.deque()
368
- ds_kwds = dict(hdf5plugin.Zstd(clevel=5))
369
- ds_kwds["fletcher32"] = True
482
+ ds_kwds = set_default_filter_kwargs()
370
483
  thr_write = DequeWriterThread(
371
484
  path_out=self.path_temp_out,
372
485
  dq=writer_dq,
373
486
  mode="w",
374
487
  ds_kwds=ds_kwds,
375
- )
488
+ )
376
489
  thr_write.start()
377
490
 
378
491
  # Start segmentation thread
@@ -385,13 +498,24 @@ class DCNumJobRunner(threading.Thread):
385
498
  if self.job["debug"]:
386
499
  num_slots = 1
387
500
  num_extractors = 1
501
+ num_segmenters = 1
388
502
  elif seg_cls.hardware_processor == "cpu": # CPU segmenter
503
+ # We could in principle set the number of slots to one and
504
+ # have both number of extractors and number of segmenters set
505
+ # to the total number of CPUs. However, we would need more RAM
506
+ # (for caching the image data) and we also have more overhead.
507
+ # Having two slots shared between all workers is more efficient.
389
508
  num_slots = 2
509
+ # Split segmentation and feature extraction workers evenly.
390
510
  num_extractors = self.job["num_procs"] // 2
511
+ num_segmenters = self.job["num_procs"] - num_extractors
391
512
  else: # GPU segmenter
392
513
  num_slots = 3
393
514
  num_extractors = self.job["num_procs"]
515
+ num_segmenters = 1
394
516
  num_extractors = max(1, num_extractors)
517
+ num_segmenters = max(1, num_segmenters)
518
+ self.job["segmenter_kwargs"]["num_workers"] = num_segmenters
395
519
 
396
520
  slot_chunks = mp_spawn.Array("i", num_slots)
397
521
  slot_states = mp_spawn.Array("u", num_slots)
@@ -410,7 +534,9 @@ class DCNumJobRunner(threading.Thread):
410
534
  fe_kwargs = QueueEventExtractor.get_init_kwargs(
411
535
  data=self.dtin,
412
536
  gate=gate.Gate(self.dtin, **self.job["gate_kwargs"]),
413
- log_queue=self.log_queue)
537
+ log_queue=self.log_queue,
538
+ log_level=logging.DEBUG if self.job["debug"] else logging.INFO,
539
+ )
414
540
  fe_kwargs["extract_kwargs"] = self.job["feature_kwargs"]
415
541
 
416
542
  thr_feat = EventExtractorManagerThread(
@@ -437,16 +563,12 @@ class DCNumJobRunner(threading.Thread):
437
563
 
438
564
  # So in principle we are done here. We do not have to do anything
439
565
  # besides monitoring the progress.
440
- pmin = 0.1 # from background computation
441
- pmax = 0.95 # 5% reserved for cleanup
442
566
  while True:
443
567
  counted_frames = thr_coll.written_frames
444
568
  self.event_count = thr_coll.written_events
445
569
  td = time.monotonic() - t0
446
570
  # set the current status
447
- self._progress = round(
448
- pmin + counted_frames / data_size * (pmax - pmin),
449
- 3)
571
+ self._progress_ex = counted_frames / data_size
450
572
  self._segm_rate = counted_frames / (td or 0.03)
451
573
  time.sleep(.5)
452
574
  if counted_frames == data_size:
@@ -510,9 +632,9 @@ def join_thread_helper(thr, timeout, retries, logger, name):
510
632
  if thr.is_alive():
511
633
  logger.info(f"Waiting for '{name}' ({thr}")
512
634
  else:
513
- logger.info(f"Joined thread '{name}'")
635
+ logger.debug(f"Joined thread '{name}'")
514
636
  break
515
637
  else:
516
638
  logger.error(f"Failed to join thread '{name}'")
517
- raise ValueError(
518
- f"Thread '{name}' ({thr}) did not join within {timeout*retries}s!")
639
+ raise ValueError(f"Thread '{name}' ({thr}) did not join "
640
+ f"within {timeout * retries}s!")
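`get_status()` now derives the overall progress from fixed step weights (background 4, segmentation/extraction 27, cleanup 1) instead of the previous hard-coded 0.1/0.95 checkpoints. A worked example of that weighting; the intermediate fractions are made up for illustration:

    bgw, exw, clw = 4, 27, 1   # weights copied from the diff above
    tot = bgw + exw + clw      # 32

    # Background finished, segmentation/extraction 60 % done:
    progress = bgw / tot + (exw / tot) * 0.60
    print(f"{progress:.3f}")   # 0.631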
dcnum/logic/json_encoder.py ADDED
@@ -0,0 +1,17 @@
1
+ import json
2
+ import numbers
3
+ import pathlib
4
+
5
+ import numpy as np
6
+
7
+
8
+ class ExtendedJSONEncoder(json.JSONEncoder):
9
+ def default(self, obj):
10
+ if isinstance(obj, pathlib.Path):
11
+ return str(obj)
12
+ elif isinstance(obj, numbers.Integral):
13
+ return int(obj)
14
+ elif isinstance(obj, np.bool_):
15
+ return bool(obj)
16
+ # Let the base class default method raise the TypeError
17
+ return json.JSONEncoder.default(self, obj)
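The new `ExtendedJSONEncoder` lets the job-description log written in `ctrl.py` serialize `pathlib.Path` objects, NumPy integers and NumPy booleans. A small usage sketch:

    import json
    import pathlib

    import numpy as np

    from dcnum.logic.json_encoder import ExtendedJSONEncoder

    print(json.dumps({"path_out": pathlib.Path("/data/out.rtdc"),
                      "num_procs": np.uint8(4),
                      "debug": np.bool_(False)},
                     cls=ExtendedJSONEncoder,
                     indent=2,
                     sort_keys=True))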
dcnum/read/cache.py CHANGED
@@ -11,7 +11,7 @@ class HDF5ImageCache:
11
11
  def __init__(self,
12
12
  h5ds: h5py.Dataset,
13
13
  chunk_size: int = 1000,
14
- cache_size: int = 5,
14
+ cache_size: int = 2,
15
15
  boolean: bool = False):
16
16
  """An HDF5 image cache
17
17
 
dcnum/read/hdf5_data.py CHANGED
@@ -25,7 +25,7 @@ class HDF5Data:
25
25
  basins: List[Dict[List[str] | str]] = None,
26
26
  logs: Dict[List[str]] = None,
27
27
  tables: Dict[np.ndarray] = None,
28
- image_cache_size: int = 5,
28
+ image_cache_size: int = 2,
29
29
  ):
30
30
  # Init is in __setstate__ so we can pickle this class
31
31
  # and use it for multiprocessing.
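Both defaults drop the image cache from five chunks to two, roughly halving the memory held per open dataset. A back-of-the-envelope estimate; the image dimensions are an assumption, since they vary between datasets:

    import numpy as np

    chunk_size = 1000              # images per cached chunk (dcnum default)
    image_shape = (80, 320)        # assumed height x width of one image
    bytes_per_chunk = chunk_size * np.prod(image_shape) * np.dtype(np.uint8).itemsize

    for cache_size in (5, 2):      # old default vs. new default
        print(f"{cache_size} chunks ≈ "
              f"{bytes_per_chunk * cache_size / 1024**2:.1f} MiB")
    # 5 chunks ≈ 122.1 MiB, 2 chunks ≈ 48.8 MiB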
dcnum/segm/segmenter_gpu.py CHANGED
@@ -15,6 +15,7 @@ class GPUSegmenter(Segmenter, abc.ABC):
15
15
 
16
16
  def __init__(self,
17
17
  *,
18
+ num_workers: int = None,
18
19
  kwargs_mask: Dict = None,
19
20
  debug: bool = False,
20
21
  **kwargs
@@ -31,6 +32,9 @@ class GPUSegmenter(Segmenter, abc.ABC):
31
32
  Additional, optional keyword arguments for `segment_approach`
32
33
  defined in the subclass.
33
34
  """
35
+ if num_workers not in [None, 1]:
36
+ raise ValueError(f"Number of workers must not be larger than 1 "
37
+ f"for GPU segmenter, got '{num_workers}'!")
34
38
  super(GPUSegmenter, self).__init__(kwargs_mask=kwargs_mask,
35
39
  debug=debug,
36
40
  **kwargs)
dcnum/segm/segmenter_manager_thread.py CHANGED
@@ -77,9 +77,9 @@ class SegmenterManagerThread(threading.Thread):
77
77
  self.debug = debug
78
78
 
79
79
  def run(self):
80
+ num_slots = len(self.slot_states)
80
81
  # We iterate over all the chunks of the image data.
81
82
  for chunk in self.image_data.iter_chunks():
82
- num_slots = len(self.slot_states)
83
83
  cur_slot = 0
84
84
  empty_slots = 0
85
85
  # Wait for a free slot to perform segmentation (compute labels)
@@ -89,8 +89,11 @@ class SegmenterManagerThread(threading.Thread):
89
89
  # - "s" the extractor processed the data and is waiting
90
90
  # for the segmenter
91
91
  if self.slot_states[cur_slot] != "e":
92
+ # It's the segmenter's turn. Note that we use '!= "e"',
93
+ # because the initial value is "\x00".
92
94
  break
93
95
  else:
96
+ # Try another slot.
94
97
  empty_slots += 1
95
98
  cur_slot = (cur_slot + 1) % num_slots
96
99
  if empty_slots >= num_slots:
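The added comments document the slot handshake: a slot in state "e" holds freshly segmented labels for the extractor, while anything else (including the initial "\x00") is free for the segmenter. An illustrative sketch of those two round-robin checks, not the actual dcnum implementation:

    import multiprocessing as mp

    mp_spawn = mp.get_context("spawn")
    slot_states = mp_spawn.Array("u", 2)   # initial values are "\x00"


    def next_slot_for_segmenter(states):
        """Return the index of a slot the segmenter may fill, else None."""
        for idx in range(len(states)):
            if states[idx] != "e":         # free ("\x00") or extracted ("s")
                return idx
        return None


    def next_slot_for_extractor(states):
        """Return the index of a slot holding labels to extract, else None."""
        for idx in range(len(states)):
            if states[idx] == "e":
                return idx
        return None


    print(next_slot_for_segmenter(slot_states))   # 0
    print(next_slot_for_extractor(slot_states))   # None (nothing segmented yet)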
dcnum/write/__init__.py CHANGED
@@ -1,4 +1,5 @@
1
1
  # flake8: noqa: F401
2
2
  from .deque_writer_thread import DequeWriterThread
3
3
  from .queue_collector_thread import EventStash, QueueCollectorThread
4
- from .writer import HDF5Writer, copy_metadata, create_with_basins
4
+ from .writer import (
5
+ HDF5Writer, copy_metadata, create_with_basins, set_default_filter_kwargs)
dcnum/write/queue_collector_thread.py CHANGED
@@ -184,7 +184,7 @@ class QueueCollectorThread(threading.Thread):
184
184
 
185
185
  if len(cur_nevents) == 0:
186
186
  self.logger.info(
187
- "Reached the end of the current dataset (frame "
187
+ "Reached dataset end (frame "
188
188
  # `last_idx` is the size of the dataset in the end,
189
189
  # because `len(cur_nevents)` is always added to it.
190
190
  f"{last_idx} of {len(self.feat_nevents)}).")
@@ -273,3 +273,6 @@ class QueueCollectorThread(threading.Thread):
273
273
 
274
274
  # Increment current frame index.
275
275
  last_idx += len(cur_nevents)
276
+
277
+ self.logger.info(f"Counted {self.written_events} events.")
278
+ self.logger.debug(f"Counted {self.written_frames} frames.")
dcnum/write/writer.py CHANGED
@@ -21,11 +21,7 @@ class HDF5Writer:
21
21
  """Write deformability cytometry HDF5 data"""
22
22
  self.h5 = h5py.File(path, mode=mode, libver="latest")
23
23
  self.events = self.h5.require_group("events")
24
- if ds_kwds is None:
25
- ds_kwds = {}
26
- for key, val in dict(hdf5plugin.Zstd(clevel=5)).items():
27
- ds_kwds.setdefault(key, val)
28
- ds_kwds.setdefault("fletcher32", True)
24
+ ds_kwds = set_default_filter_kwargs(ds_kwds)
29
25
  self.ds_kwds = ds_kwds
30
26
 
31
27
  def __enter__(self):
@@ -38,25 +34,24 @@ class HDF5Writer:
38
34
  self.h5.close()
39
35
 
40
36
  @staticmethod
41
- def get_best_nd_chunks(item_shape):
37
+ def get_best_nd_chunks(item_shape, feat_dtype=np.float64):
42
38
  """Return best chunks for image data
43
39
 
44
40
  Chunking has performance implications. It’s recommended to keep the
45
- total size of your chunks between 10 KiB and 1 MiB. This number defines
46
- the maximum chunk size as well as half the maximum cache size for each
47
- dataset.
41
+ total size of dataset chunks between 10 KiB and 1 MiB. This number
42
+ defines the maximum chunk size as well as half the maximum cache
43
+ size for each dataset.
48
44
  """
49
- num_bytes = 1024**2 # between 10KiB and 1 MiB
50
- if len(item_shape) == 0:
51
- # scalar feature
52
- chunk_size_int = 10000
53
- else:
54
- event_size = np.prod(item_shape) * np.dtype(np.uint8).itemsize
55
- chunk_size = num_bytes / event_size
56
- chunk_size_int = max(1, int(np.floor(chunk_size)))
45
+ # set image feature chunk size to approximately 1MiB
46
+ num_bytes = 1024 ** 2
47
+ event_size = np.prod(item_shape) * np.dtype(feat_dtype).itemsize
48
+ chunk_size = num_bytes / event_size
49
+ # Set minimum chunk size to 10 so that we can have at least some
50
+ # compression performance.
51
+ chunk_size_int = max(10, int(np.floor(chunk_size)))
57
52
  return tuple([chunk_size_int] + list(item_shape))
58
53
 
59
- def require_feature(self, feat, item_shape, dtype, ds_kwds=None):
54
+ def require_feature(self, feat, item_shape, feat_dtype, ds_kwds=None):
60
55
  """Create a new feature in the "events" group"""
61
56
 
62
57
  if ds_kwds is None:
@@ -67,9 +62,10 @@ class HDF5Writer:
67
62
  dset = self.events.create_dataset(
68
63
  feat,
69
64
  shape=tuple([0] + list(item_shape)),
70
- dtype=dtype,
65
+ dtype=feat_dtype,
71
66
  maxshape=tuple([None] + list(item_shape)),
72
- chunks=self.get_best_nd_chunks(item_shape),
67
+ chunks=self.get_best_nd_chunks(item_shape,
68
+ feat_dtype=feat_dtype),
73
69
  **ds_kwds)
74
70
  if len(item_shape) == 2:
75
71
  dset.attrs.create('CLASS', np.string_('IMAGE'))
@@ -137,7 +133,7 @@ class HDF5Writer:
137
133
  data = 255 * np.array(data, dtype=np.uint8)
138
134
  ds, offset = self.require_feature(feat=feat,
139
135
  item_shape=data.shape[1:],
140
- dtype=data.dtype)
136
+ feat_dtype=data.dtype)
141
137
  dsize = data.shape[0]
142
138
  ds.resize(offset + dsize, axis=0)
143
139
  ds[offset:offset + dsize] = data
@@ -249,10 +245,7 @@ def copy_metadata(h5_src: h5py.File,
249
245
  are not defined already are added.
250
246
  """
251
247
  # compress data
252
- ds_kwds = {}
253
- for key, val in dict(hdf5plugin.Zstd(clevel=5)).items():
254
- ds_kwds.setdefault(key, val)
255
- ds_kwds.setdefault("fletcher32", True)
248
+ ds_kwds = set_default_filter_kwargs()
256
249
  # set attributes
257
250
  src_attrs = dict(h5_src.attrs)
258
251
  for kk in src_attrs:
@@ -267,18 +260,31 @@ def copy_metadata(h5_src: h5py.File,
267
260
  h5_dst.require_group(topic)
268
261
  if key not in h5_dst[topic]:
269
262
  data = h5_src[topic][key][:]
270
- if data.dtype == np.dtype("O"):
271
- # convert variable-length strings to fixed-length
272
- max_length = max([len(line) for line in data])
273
- data = np.asarray(data, dtype=f"S{max_length}")
274
- ds = h5_dst[topic].create_dataset(
275
- name=key,
276
- data=data,
277
- **ds_kwds
278
- )
279
- # help with debugging and add some meta-metadata
280
- ds.attrs.update(h5_src[topic][key].attrs)
281
- soft_strings = [ds.attrs.get("software"),
282
- f"dcnum {version}"]
283
- soft_strings = [s for s in soft_strings if s is not None]
284
- ds.attrs["software"] = " | ".join(soft_strings)
263
+ if data.size: # ignore empty datasets
264
+ if data.dtype == np.dtype("O"):
265
+ # convert variable-length strings to fixed-length
266
+ max_length = max([len(line) for line in data])
267
+ data = np.asarray(data, dtype=f"S{max_length}")
268
+ ds = h5_dst[topic].create_dataset(
269
+ name=key,
270
+ data=data,
271
+ **ds_kwds
272
+ )
273
+ # help with debugging and add some meta-metadata
274
+ ds.attrs.update(h5_src[topic][key].attrs)
275
+ soft_strgs = [ds.attrs.get("software"),
276
+ f"dcnum {version}"]
277
+ soft_strgs = [s for s in soft_strgs if s is not None]
278
+ ds.attrs["software"] = " | ".join(soft_strgs)
279
+
280
+
281
+ def set_default_filter_kwargs(ds_kwds=None, compression=True):
282
+ if ds_kwds is None:
283
+ ds_kwds = {}
284
+ if compression:
285
+ # compression
286
+ for key, val in dict(hdf5plugin.Zstd(clevel=5)).items():
287
+ ds_kwds.setdefault(key, val)
288
+ # checksums
289
+ ds_kwds.setdefault("fletcher32", True)
290
+ return ds_kwds
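The new `set_default_filter_kwargs` centralizes the Zstd(clevel=5) plus fletcher32 defaults that were previously repeated in `HDF5Writer`, `copy_metadata`, `ctrl.py` and the background writer. A small sketch of passing its output to h5py (hdf5plugin, a dcnum dependency, must be importable so that the Zstd filter is registered):

    import h5py
    import numpy as np

    from dcnum.write import set_default_filter_kwargs

    # Zstd level 5 compression plus fletcher32 checksums by default;
    # pass compression=False to keep only the checksum filter.
    ds_kwds = set_default_filter_kwargs()

    with h5py.File("example_bg.h5", "w") as h5:
        h5.create_dataset("events/image_bg",
                          data=np.zeros((100, 80, 320), dtype=np.uint8),
                          chunks=(100, 80, 320),
                          **ds_kwds)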
dcnum-0.16.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dcnum
3
- Version: 0.16.1
3
+ Version: 0.16.3
4
4
  Summary: numerics toolbox for imaging deformability cytometry
5
5
  Author: Paul Müller
6
6
  Maintainer-email: Paul Müller <dev@craban.de>
dcnum-0.16.3.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
1
1
  dcnum/__init__.py,sha256=hcawIKS7utYiOyVhOAX9t7K3xYzP1b9862VV0b6qSrQ,74
2
- dcnum/_version.py,sha256=djn14s8hpDNt1rX_1-rosukF6WH__dfZ71HDUa63720,413
2
+ dcnum/_version.py,sha256=KgYmvPTSKj3FMTlG8S1wtqb14ST1EcsHHNSwLJp3foQ,413
3
3
  dcnum/feat/__init__.py,sha256=JqlgzOgDJhoTk8WVYcIiKTWq9EAM16_jGivzOtN6JGo,325
4
- dcnum/feat/event_extractor_manager_thread.py,sha256=V2idRAlC7bdsA8I40RAUkqz3jtWmTeb4cjPXpRjr8Ik,6145
4
+ dcnum/feat/event_extractor_manager_thread.py,sha256=ypsGEwmM_ohHCnnl8g1vpruezFAkH0drIU1AOngH5Bg,6837
5
5
  dcnum/feat/gate.py,sha256=srobj5p2RDr_S2SUtbwGbTKatnc_aPSndt0cR2P9zoY,7060
6
- dcnum/feat/queue_event_extractor.py,sha256=RYz0VNtV8OVGDFn9MrYWM5NaB4rXLiJch9MTKDnccs0,14453
6
+ dcnum/feat/queue_event_extractor.py,sha256=o7K4p5VNExnaO6lgnlHrVk_qPhbXzocUyFhUcoP7OAU,14970
7
7
  dcnum/feat/feat_background/__init__.py,sha256=OTmMuazHNaSrZb2XW4cnJ6PlgJLbKrPbaidpEixYa0A,341
8
- dcnum/feat/feat_background/base.py,sha256=N1SL5NCZ7gTS5AQONxEH31PFJBx0zvVjCaA4mprheuY,7974
8
+ dcnum/feat/feat_background/base.py,sha256=KA1H5giTyMBADex2-LmGbu7B1PEAKjiCUAvSF89WiZs,8375
9
9
  dcnum/feat/feat_background/bg_copy.py,sha256=aHabgizRuwIdOH8S850Cun9NsmpMzo4B3yHWv1aFNFI,645
10
- dcnum/feat/feat_background/bg_roll_median.py,sha256=3wbY_zoNx7zSs8ZgORQBXw0dfD_3Xai4h7-RyPETnoI,13048
11
- dcnum/feat/feat_background/bg_sparse_median.py,sha256=JzrAAOKOXJS1gI4y3RPTvDeNPUa2iliXZqlwADp4Edc,17840
10
+ dcnum/feat/feat_background/bg_roll_median.py,sha256=FfC3v1cX8mreLO971C_kTpFRBtuJP4Sv-Hj1Wj8yb3Q,12826
11
+ dcnum/feat/feat_background/bg_sparse_median.py,sha256=CDO8X7-7agBxTrC79lskt0zWTaSex6ouxUVfxImhgs4,17630
12
12
  dcnum/feat/feat_brightness/__init__.py,sha256=o6AebVlmydwNgVF5kW6ITqJyFreoKrU3Ki_3EC8If-s,155
13
13
  dcnum/feat/feat_brightness/bright_all.py,sha256=Z5b-xkw7g7ejMpbGmdUqrxGRymqFhAQsZ938gaGXk9Y,3102
14
14
  dcnum/feat/feat_brightness/common.py,sha256=JX49EszYDmnvoOKXFVV1CalEIWRmOuY5EryNbqGbdac,156
@@ -19,26 +19,27 @@ dcnum/feat/feat_texture/__init__.py,sha256=6StM9S540UVtdFFR3bHa7nfCTomeVdoo7Uy9C
19
19
  dcnum/feat/feat_texture/common.py,sha256=COXHpXS-7DMouGu3WF83I76L02Sr7P9re4lxajh6g0E,439
20
20
  dcnum/feat/feat_texture/tex_all.py,sha256=eGjjNfPpfZw7FA_VNFCIMiU38KD0qcGbxLciYy-tCiA,4097
21
21
  dcnum/logic/__init__.py,sha256=5hgAQMp2YGsqpWoeTQ9qxGAWfxPOKQjJsYyNsS49t0g,131
22
- dcnum/logic/ctrl.py,sha256=8yPhIOB9Pju2D8s65OlEDxXX1fu4iqHvo7MpSqUe7z8,21381
22
+ dcnum/logic/ctrl.py,sha256=aqXCH_yyrfifeAxmpW6Cg-FQIwTBjpElHbva60ghYpY,26655
23
23
  dcnum/logic/job.py,sha256=M0Q-Rfcm-zkTXTQc79W6YSNUjUlgmRPG0Ikbdn1aOpY,4608
24
+ dcnum/logic/json_encoder.py,sha256=dy44ArmdnxpUfxxONmKdIv-fde3aTXPjZDN0HPATaxs,467
24
25
  dcnum/meta/__init__.py,sha256=cQT_HN5yDKzMnZM8CUyNmeA68OhE3ENO_rvFmgDj95c,40
25
26
  dcnum/meta/ppid.py,sha256=_xUqJal4wBqgic2aRN3ZMMteTggHeYGs44nrYbTKlpQ,8107
26
27
  dcnum/read/__init__.py,sha256=iV2wrBMdwJgpXaphNiiAVybndDzTTv0CAGRNXyvxcLY,157
27
- dcnum/read/cache.py,sha256=mr2DBJZYgNIAiz64TQ4cgkPmRt8nJWBvgkOpaz-p6Yg,5467
28
+ dcnum/read/cache.py,sha256=0tMurtHOA7VnPNpfeAGi-dxWXfYhL5wmWuXb6ka_eEo,5467
28
29
  dcnum/read/const.py,sha256=SVlvEJiRIHyTyUlWG24_ogcnT5nTxCi0CRslNuNP56I,282
29
- dcnum/read/hdf5_data.py,sha256=jUPoXgn52eJZrF4uOpR2-fBLaQX9Ezw3tcuAlI5LnF8,18817
30
+ dcnum/read/hdf5_data.py,sha256=8g39CZoFIa2tUvizZt_vzMeoCUcTkkt3AkXK6MMN0iY,18817
30
31
  dcnum/segm/__init__.py,sha256=iiq_1A9DU5wMUcKnsZ53E7NyzCkbZCJeUDimzunE-OM,247
31
32
  dcnum/segm/segm_thresh.py,sha256=aLVTydPjbrgKDkZFY3Ew5CX-miwOw71meHfxcO5EjCc,1176
32
33
  dcnum/segm/segmenter.py,sha256=F3gCp-Z51F9GxdFYPF1CHjnbfgqnS0_g-34lJF2tMCM,10611
33
34
  dcnum/segm/segmenter_cpu.py,sha256=tCY105rVr9_0RIq2618qnF1ueHRj7UtuK_nUBoAg-nY,10743
34
- dcnum/segm/segmenter_gpu.py,sha256=aIBieTjcr4YaOHUhYiOOh_EpKUzQLBhanpBrtPSUL5k,1900
35
- dcnum/segm/segmenter_manager_thread.py,sha256=xtuk7gnk7xhoRoV_J97rrv7IR3JgeRvVewCDT-chqpk,5172
36
- dcnum/write/__init__.py,sha256=6vAQECatcd7DJMXFEuab1wdvEiaxisbY8_qmK5tzIwY,207
35
+ dcnum/segm/segmenter_gpu.py,sha256=tL2X5BN0jKmhC7wgfG0hygd-6UpG1ZCVuKe5OP1qde0,2133
36
+ dcnum/segm/segmenter_manager_thread.py,sha256=2znDaKedSueomcU1pbHtFmVcGoHzp--sf494VgJF_Tk,5342
37
+ dcnum/write/__init__.py,sha256=Cpn3LqL18hh8OScUnGp_AnNfpWPpKW-oAJZH6ot7aRA,241
37
38
  dcnum/write/deque_writer_thread.py,sha256=R4x3p-HZUls3upCBX3vV1VqSdSmaiHdrAswMJj_tVpk,1643
38
- dcnum/write/queue_collector_thread.py,sha256=BivSe5ZA-rTPH4sridXU1yFB6CP7LYzIFudLMbN481s,11793
39
- dcnum/write/writer.py,sha256=8DB4O14tXFisiMDdHawHBdQHOg_uXZkRFbgXNdWdCHQ,10167
40
- dcnum-0.16.1.dist-info/LICENSE,sha256=YRChA1C8A2E-amJbudwMcbTCZy_HzmeY0hMIvduh1MM,1089
41
- dcnum-0.16.1.dist-info/METADATA,sha256=4tTCYJ4d49W7OmwvhDriK295uKuPzCMgecWvUJvcksw,2172
42
- dcnum-0.16.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
43
- dcnum-0.16.1.dist-info/top_level.txt,sha256=Hmh38rgG_MFTVDpUDGuO2HWTSq80P585Het4COQzFTg,6
44
- dcnum-0.16.1.dist-info/RECORD,,
39
+ dcnum/write/queue_collector_thread.py,sha256=c0Z6uZfZ3B8xsTMCB5jglEukM5sesA9HgEawBk_YEUA,11910
40
+ dcnum/write/writer.py,sha256=Hr37OSDJGUpJJ4OufJHYYBanE26GiNwUPOMAt-5Yc2Y,10478
41
+ dcnum-0.16.3.dist-info/LICENSE,sha256=YRChA1C8A2E-amJbudwMcbTCZy_HzmeY0hMIvduh1MM,1089
42
+ dcnum-0.16.3.dist-info/METADATA,sha256=BChO0SWVq5w9ZEFvvP0KDsFd-T1WRbMZkhW2xCBeVWc,2172
43
+ dcnum-0.16.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
44
+ dcnum-0.16.3.dist-info/top_level.txt,sha256=Hmh38rgG_MFTVDpUDGuO2HWTSq80P585Het4COQzFTg,6
45
+ dcnum-0.16.3.dist-info/RECORD,,
File without changes