dcnum 0.17.0-py3-none-any.whl → 0.23.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dcnum has been flagged as possibly problematic.

Files changed (49)
  1. dcnum/_version.py +2 -2
  2. dcnum/feat/__init__.py +1 -1
  3. dcnum/feat/event_extractor_manager_thread.py +34 -25
  4. dcnum/feat/feat_background/base.py +22 -26
  5. dcnum/feat/feat_background/bg_copy.py +18 -12
  6. dcnum/feat/feat_background/bg_roll_median.py +20 -10
  7. dcnum/feat/feat_background/bg_sparse_median.py +55 -7
  8. dcnum/feat/feat_brightness/bright_all.py +41 -6
  9. dcnum/feat/feat_contour/__init__.py +4 -0
  10. dcnum/feat/{feat_moments/mt_legacy.py → feat_contour/moments.py} +32 -8
  11. dcnum/feat/feat_contour/volume.py +174 -0
  12. dcnum/feat/feat_texture/tex_all.py +28 -1
  13. dcnum/feat/gate.py +2 -2
  14. dcnum/feat/queue_event_extractor.py +30 -9
  15. dcnum/logic/ctrl.py +222 -48
  16. dcnum/logic/job.py +85 -2
  17. dcnum/logic/json_encoder.py +2 -0
  18. dcnum/meta/ppid.py +17 -3
  19. dcnum/read/__init__.py +1 -0
  20. dcnum/read/cache.py +100 -78
  21. dcnum/read/const.py +6 -4
  22. dcnum/read/hdf5_data.py +146 -23
  23. dcnum/read/mapped.py +87 -0
  24. dcnum/segm/__init__.py +6 -3
  25. dcnum/segm/segm_thresh.py +6 -18
  26. dcnum/segm/segm_torch/__init__.py +23 -0
  27. dcnum/segm/segm_torch/segm_torch_base.py +125 -0
  28. dcnum/segm/segm_torch/segm_torch_mpo.py +71 -0
  29. dcnum/segm/segm_torch/segm_torch_sto.py +88 -0
  30. dcnum/segm/segm_torch/torch_model.py +95 -0
  31. dcnum/segm/segm_torch/torch_postproc.py +93 -0
  32. dcnum/segm/segm_torch/torch_preproc.py +114 -0
  33. dcnum/segm/segmenter.py +181 -80
  34. dcnum/segm/segmenter_manager_thread.py +38 -30
  35. dcnum/segm/{segmenter_cpu.py → segmenter_mpo.py} +116 -44
  36. dcnum/segm/segmenter_sto.py +110 -0
  37. dcnum/write/__init__.py +2 -1
  38. dcnum/write/deque_writer_thread.py +9 -1
  39. dcnum/write/queue_collector_thread.py +8 -14
  40. dcnum/write/writer.py +128 -5
  41. {dcnum-0.17.0.dist-info → dcnum-0.23.2.dist-info}/METADATA +4 -2
  42. dcnum-0.23.2.dist-info/RECORD +55 -0
  43. {dcnum-0.17.0.dist-info → dcnum-0.23.2.dist-info}/WHEEL +1 -1
  44. dcnum/feat/feat_moments/__init__.py +0 -4
  45. dcnum/segm/segmenter_gpu.py +0 -64
  46. dcnum-0.17.0.dist-info/RECORD +0 -46
  47. dcnum/feat/{feat_moments/ct_opencv.py → feat_contour/contour.py} +0 -0
  48. {dcnum-0.17.0.dist-info → dcnum-0.23.2.dist-info}/LICENSE +0 -0
  49. {dcnum-0.17.0.dist-info → dcnum-0.23.2.dist-info}/top_level.txt +0 -0
dcnum/logic/ctrl.py CHANGED
@@ -1,5 +1,7 @@
  import collections
  import datetime
+ import hashlib
+ import importlib
  import json
  import logging
  from logging.handlers import QueueListener
@@ -14,6 +16,7 @@ import traceback
  import uuid

  import h5py
+ import numpy as np

  from ..feat.feat_background.base import get_available_background_methods
  from ..feat.queue_event_extractor import QueueEventExtractor
@@ -21,10 +24,10 @@ from ..feat import gate
  from ..feat import EventExtractorManagerThread
  from ..segm import SegmenterManagerThread, get_available_segmenters
  from ..meta import ppid
- from ..read import HDF5Data
- from .._version import version_tuple
+ from ..read import HDF5Data, get_mapping_indices
+ from .._version import version, version_tuple
  from ..write import (
- DequeWriterThread, HDF5Writer, QueueCollectorThread,
+ DequeWriterThread, HDF5Writer, QueueCollectorThread, copy_features,
  copy_metadata, create_with_basins, set_default_filter_kwargs
  )

@@ -43,6 +46,7 @@ valid_states = [
  "setup",
  "background",
  "segmentation",
+ "plumbing",
  "cleanup",
  "done",
  "error",
@@ -79,16 +83,16 @@ class DCNumJobRunner(threading.Thread):
  # current job state
  self._state = "init"
  # overall progress [0, 1]
- self._progress_bg = None
- self._progress_ex = None
+ self._progress_bg = None # background
+ self._progress_ex = None # segmentation
+ self._progress_bn = None # creating basins
  # segmentation frame rate
  self._segm_rate = 0

  # Set up logging
  # General logger for this job
  self.main_logger = logging.getLogger("dcnum")
- self.main_logger.setLevel(
- logging.DEBUG if job["debug"] else logging.INFO)
+ self.main_logger.setLevel(job["log_level"])
  # Log file output in target directory
  self.path_log = job["path_out"].with_suffix(".log")
  self.path_log.parent.mkdir(exist_ok=True, parents=True)
@@ -237,8 +241,12 @@
  # how much fractional time each processing step takes.
  bgw = 4 # fraction of background
  exw = 27 # fraction of segmentation and feature extraction
+ if self.job["basin_strategy"] == "drain":
+ drw = 15 # because data need to be copied
+ else:
+ drw = 1 # just creating the basins in output file
  clw = 1 # fraction of cleanup operations
- tot = bgw + exw + clw
+ tot = bgw + exw + drw + clw
  progress = 0
  st = self.state

@@ -247,15 +255,22 @@
  # background already computed
  progress += bgw / tot
  elif self._progress_bg is not None:
- # This is the image count of the input dataset
- progress += bgw / tot * (self._progress_bg.value / len(self.draw))
+ # This is the image count of the input dataset.
+ progress += self._progress_bg.value * bgw / tot

  # segmentation
  if valid_states.index(st) > valid_states.index("segmentation"):
  # segmentation already done
  progress += exw / tot
  elif self._progress_ex is not None:
- progress += exw / tot * self._progress_ex
+ progress += self._progress_ex * exw / tot
+
+ # draining basins
+ if valid_states.index(st) > valid_states.index("plumbing"):
+ # plumbing already done
+ progress += drw / tot
+ if self._progress_bn is not None:
+ progress += self._progress_bn * drw / tot

  if self.state == "done":
  progress = 1
@@ -310,12 +325,23 @@
  # Whether pipeline hash is invalid.
  ppid.compute_pipeline_hash(**datdict) != dathash
  # Whether the input file is the original output of the pipeline.
- or len(self.draw) != evyield)
+ or len(self.draw) != evyield
+ # If index mapping is defined, then we always redo the pipeline.
+ # If the pipeline hashes are identical and index mapping is not
+ # None, then both pipelines were done with index mapping.
+ # But applying the same pipeline with index mapping in series
+ # will lead to a different result in the second run (e.g. 1st
+ # pipeline run: take every 2nd event; 2nd pipeline run: take
+ # every second event -> results in every 4th event in output of
+ # second pipeline run).
+ or self.draw.index_mapping is not None
+ )
  # Do we have to recompute the background data? In addition to the
  # hash sanity check above, check the generation, input data,
  # and background pipeline identifiers.
  redo_bg = (
- (datdict["gen_id"] != self.ppdict["gen_id"])
+ "image_bg" not in self.draw
+ or (datdict["gen_id"] != self.ppdict["gen_id"])
  or (datdict["dat_id"] != self.ppdict["dat_id"])
  or (datdict["bg_id"] != self.ppdict["bg_id"]))

@@ -361,16 +387,20 @@
  # Note any new actions that work on `self.path_temp_in` are not
  # reflected in `self.path_temp_out`.
  self.path_temp_in.rename(self.path_temp_out)
-
- self.state = "cleanup"
-
- # The user would normally expect the output file to be something
- # that is self-contained (copying the file wildly across file
- # systems and network shares should not impair feature availability).
- # Therefore, we copy any remaining basin-based features to the
- # temporary output file.
- if self.job["no_basins_in_output"]:
- self.task_transfer_basin_data()
+ # Since no segmentation was done, the output file now does not
+ # contain any events. This is not really what we wanted, but we
+ # can still store all features in the output file if required.
+ if self.job["basin_strategy"] == "drain":
+ orig_feats = []
+ for feat in self.draw.h5["events"].keys():
+ if isinstance(self.draw.h5["events"][feat], h5py.Dataset):
+ # copy_features does not support Groups
+ orig_feats.append(feat)
+ with h5py.File(self.path_temp_out, "a") as h5_dst:
+ copy_features(h5_src=self.draw.h5,
+ h5_dst=h5_dst,
+ features=orig_feats,
+ mapping=None)

  with HDF5Writer(self.path_temp_out) as hw:
  # pipeline metadata
@@ -382,6 +412,10 @@
  hw.h5.attrs["pipeline:dcnum gate"] = self.ppdict["gate_id"]
  hw.h5.attrs["pipeline:dcnum hash"] = self.pphash
  hw.h5.attrs["pipeline:dcnum yield"] = self.event_count
+ # index mapping information
+ im = self.job.kwargs["data_kwargs"].get("index_mapping", None)
+ dim = HDF5Data.get_ppid_index_mapping(im)
+ hw.h5.attrs["pipeline:dcnum mapping"] = dim
  # regular metadata
  hw.h5.attrs["experiment:event count"] = self.event_count
  hw.h5.attrs["imaging:pixel size"] = self.draw.pixel_size
@@ -397,6 +431,16 @@
  "build": ", ".join(platform.python_build()),
  "implementation":
  platform.python_implementation(),
+ "libraries": get_library_versions_dict([
+ "cv2",
+ "h5py",
+ "mahotas",
+ "numba",
+ "numpy",
+ "scipy",
+ "skimage",
+ "torch",
+ ]),
  "version": platform.python_version(),
  },
  "system": {
@@ -419,7 +463,8 @@
  with h5py.File(self.job["path_in"]) as h5_src:
  copy_metadata(h5_src=h5_src,
  h5_dst=hw.h5,
- # don't copy basins
+ # Don't copy basins, we would have to index-map
+ # them first.
  copy_basins=False)
  if redo_seg:
  # Store the correct measurement identifier. This is used to
@@ -429,13 +474,27 @@
  # This is the identifier appendix that we use to identify this
  # dataset. Note that we only override the run identifier when
  # segmentation did actually take place.
- mid_ap = "dcn-" + self.pphash[:7]
- # This is the current measurement identifier (may be empty).
- mid_cur = hw.h5.attrs.get("experiment:run identifier", "")
+ mid_ap = f"dcn-{self.pphash[:7]}"
+ # This is the current measurement identifier
+ mid_cur = hw.h5.attrs.get("experiment:run identifier")
+ if not mid_cur:
+ # Compute a measurement identifier from the metadata
+ m_time = hw.h5.attrs.get("experiment:time", "none")
+ m_date = hw.h5.attrs.get("experiment:date", "none")
+ m_sid = hw.h5.attrs.get("setup:identifier", "none")
+ hasher = hashlib.md5(
+ f"{m_time}_{m_date}_{m_sid}".encode("utf-8"))
+ mid_cur = str(uuid.UUID(hex=hasher.hexdigest()))
  # The new measurement identifier is a combination of both.
  mid_new = f"{mid_cur}_{mid_ap}" if mid_cur else mid_ap
  hw.h5.attrs["experiment:run identifier"] = mid_new

+ # Handle basin data according to the user's request
+ self.state = "plumbing"
+ self.task_enforce_basin_strategy()
+
+ self.state = "cleanup"
+
  trun = datetime.timedelta(seconds=round(time.monotonic() - time_start))
  self.logger.info(f"Run duration: {str(trun)}")
  self.logger.info(time.strftime("Run stop: %Y-%m-%d-%H.%M.%S",
@@ -477,6 +536,115 @@
  bic.process()
  self.logger.info("Finished background computation")

+ def task_enforce_basin_strategy(self):
+ """Transfer basin data from input files to output if requested
+
+ The user specified the "basin_strategy" keyword argument in
+ `self.job`. If this is set to "drain", then copy all basin
+ information from the input file to the output file. If it
+ is set to "tap", then only create basins in the output file.
+ """
+ self._progress_bn = 0
+ t0 = time.perf_counter()
+ # We need to make sure that the features are correctly attributed
+ # from the input files. E.g. if the input file already has
+ # background images, but we recompute the background images, then
+ # we have to use the data from the recomputed background file.
+ # We achieve this by keeping a specific order and only copying those
+ # features that we don't already have in the output file.
+ feats_raw = [
+ # 1. background data from the temporary input image
+ # (this must come before draw [sic!])
+ [self.dtin.h5, ["image_bg", "bg_off"], "critical"],
+ # 2. frame-based scalar features from the raw input file
+ # (e.g. "temp" or "frame")
+ [self.draw.h5, self.draw.features_scalar_frame, "optional"],
+ # 3. image features from the input file
+ [self.draw.h5, ["image", "image_bg", "bg_off"], "optional"],
+ ]
+ with h5py.File(self.path_temp_out, "a") as hout:
+ hw = HDF5Writer(hout)
+ # First, we have to determine the basin mapping from input to
+ # output. This information is stored by the QueueCollectorThread
+ # in the "basinmap0" feature, ready to be used by us.
+ if "index_unmapped" in hout["events"]:
+ # The unmapped indices enumerate the events in the output file
+ # with indices from the mapped input file. E.g. if for the
+ # first image in the input file, two events are found and for
+ # the second image in the input file, three events are found,
+ # then this would contain [0, 0, 1, 1, 1, ...]. If the index
+ # mapping of the input file was set to slice(1, 100), then the
+ # first image would not be there, and we would have
+ # [1, 1, 1, ...].
+ idx_um = hout["events/index_unmapped"]
+
+ # If we want to convert this to an actual basinmap feature,
+ # then we have to convert those indices to indices that map
+ # to the original input HDF5 file.
+ raw_im = self.draw.index_mapping
+ if raw_im is None:
+ self.logger.info("Input file mapped with basinmap0")
+ # Create a hard link to save time and space
+ hout["events/basinmap0"] = hout["events/index_unmapped"]
+ basinmap = idx_um
+ else:
+ basinmap = get_mapping_indices(raw_im)[idx_um]
+ # Store the mapped basin data in the output file.
+ hw.store_feature_chunk("basinmap0", basinmap)
+ # We don't need them anymore.
+ del hout["events/index_unmapped"]
+
+ # Note that `size_raw != (len(self.draw))` [sic!]. The former
+ # is the size of the raw dataset and the latter is its mapped
+ # size!
+ size_raw = self.draw.h5.attrs["experiment:event count"]
+ if (len(basinmap) == size_raw
+ and np.all(basinmap == np.arange(size_raw))):
+ # This means that the images in the input overlap perfectly
+ # with the images in the output, i.e. a "copy" segmenter
+ # was used or something is very reproducible.
+ # We set basinmap to None to be more efficient.
+ basinmap = None
+
+ else:
+ # The input is identical to the output, because we are using
+ # the same pipeline identifier.
+ basinmap = None
+
+ for hin, feats, importance in feats_raw:
+ # Only consider features that are available in the input
+ # and that are not already in the output.
+ feats = [f for f in feats
+ if (f in hin["events"] and f not in hout["events"])]
+ if not feats:
+ continue
+ elif (self.job["basin_strategy"] == "drain"
+ or importance == "critical"):
+ # DRAIN: Copy all features over to the output file.
+ self.logger.debug(f"Transferring {feats} to output file")
+ copy_features(h5_src=hin,
+ h5_dst=hout,
+ features=feats,
+ mapping=basinmap)
+ else:
+ # TAP: Create basins for the "optional" features in the
+ # output file. Note that the "critical" features never
+ # reach this case.
+ self.logger.debug(f"Creating basin for {feats}")
+ # Relative and absolute paths.
+ pin = pathlib.Path(hin.filename).resolve()
+ pout = pathlib.Path(hout.filename).resolve().parent
+ paths = [pin, os.path.relpath(pin, pout)]
+ hw.store_basin(name="dcnum basin",
+ features=feats,
+ mapping=basinmap,
+ paths=paths,
+ description=f"Created with dcnum {version}",
+ )
+ self._progress_bn += 1 / len(feats_raw)
+ t_tot = time.perf_counter() - t0
+ self.logger.info(f"Enforcing basin strategy time: {t_tot:.1f}s")
+
  def task_segment_extract(self):
  self.logger.info("Starting segmentation and feature extraction")
  # Start writer thread
@@ -501,9 +669,9 @@
  num_slots = 1
  num_extractors = 1
  num_segmenters = 1
- elif seg_cls.hardware_processor == "cpu": # CPU segmenter
+ elif seg_cls.hardware_processor == "cpu": # MPO segmenter
  # We could in principle set the number of slots to one and
- # jave both number of extractors and number of segmenters set
+ # have both number of extractors and number of segmenters set
  # to the total number of CPUs. However, we would need more RAM
  # (for caching the image data) and we also have more overhead.
  # Having two slots shared between all workers is more efficient.
@@ -511,24 +679,32 @@
  # Split segmentation and feature extraction workers evenly.
  num_extractors = self.job["num_procs"] // 2
  num_segmenters = self.job["num_procs"] - num_extractors
+ # leave one CPU for the writer and the remaining Threads
+ num_segmenters -= 1
  else: # GPU segmenter
  num_slots = 3
  num_extractors = self.job["num_procs"]
+ # leave one CPU for the writer and the remaining Threads
+ num_extractors -= 1
  num_segmenters = 1
  num_extractors = max(1, num_extractors)
  num_segmenters = max(1, num_segmenters)
  self.job.kwargs["segmenter_kwargs"]["num_workers"] = num_segmenters
+ self.job.kwargs["segmenter_kwargs"]["debug"] = self.job["debug"]
+ slot_chunks = mp_spawn.Array("i", num_slots, lock=False)
+ slot_states = mp_spawn.Array("u", num_slots, lock=False)

- slot_chunks = mp_spawn.Array("i", num_slots)
- slot_states = mp_spawn.Array("u", num_slots)
+ self.logger.debug(f"Number of slots: {num_slots}")
+ self.logger.debug(f"Number of segmenters: {num_segmenters}")
+ self.logger.debug(f"Number of extractors: {num_extractors}")

- # Initialize thread
+ # Initialize segmenter manager thread
  thr_segm = SegmenterManagerThread(
  segmenter=seg_cls(**self.job["segmenter_kwargs"]),
  image_data=imdat,
+ bg_off=self.dtin["bg_off"] if "bg_off" in self.dtin else None,
  slot_states=slot_states,
  slot_chunks=slot_chunks,
- debug=self.job["debug"],
  )
  thr_segm.start()

@@ -538,7 +714,7 @@
  gate=gate.Gate(self.dtin, **self.job["gate_kwargs"]),
  num_extractors=num_extractors,
  log_queue=self.log_queue,
- log_level=logging.DEBUG if self.job["debug"] else logging.INFO,
+ log_level=self.logger.level,
  )
  fe_kwargs["extract_kwargs"] = self.job["feature_kwargs"]

@@ -614,20 +790,18 @@

  self.logger.info("Finished segmentation and feature extraction")

- def task_transfer_basin_data(self):
- with h5py.File(self.path_temp_out, "a") as hout:
- hd = HDF5Data(hout)
- for ii, _ in enumerate(hd.basins):
- hindat, features = hd.get_basin_data(ii)
- for feat in features:
- if feat not in hout["events"]:
- self.logger.debug(
- f"Transferring {feat} to output file")
- h5py.h5o.copy(src_loc=hindat.h5["events"].id,
- src_name=feat.encode(),
- dst_loc=hout["events"].id,
- dst_name=feat.encode(),
- )
+
+ def get_library_versions_dict(library_name_list):
+ version_dict = {}
+ for library_name in library_name_list:
+ try:
+ lib = importlib.import_module(library_name)
+ except BaseException:
+ version = None
+ else:
+ version = lib.__version__
+ version_dict[library_name] = version
+ return version_dict


  def join_thread_helper(thr, timeout, retries, logger, name):
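
Note: the new module-level helper `get_library_versions_dict` (added at the end of ctrl.py above) is what the runner uses to record the versions of key libraries in the output metadata. A minimal standalone sketch of the same logic; the library names and printed output below are illustrative only:

    import importlib

    def get_library_versions_dict(library_name_list):
        # Import each library and record its __version__; None if unavailable.
        version_dict = {}
        for library_name in library_name_list:
            try:
                lib = importlib.import_module(library_name)
            except BaseException:
                version = None
            else:
                version = lib.__version__
            version_dict[library_name] = version
        return version_dict

    print(get_library_versions_dict(["numpy", "h5py", "not_installed"]))
    # e.g. {'numpy': '<installed version>', 'h5py': '<installed version>', 'not_installed': None}
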
dcnum/logic/job.py CHANGED
@@ -1,9 +1,11 @@
  import collections
  import copy
  import inspect
+ import logging
  import multiprocessing as mp
  import pathlib
- from typing import Dict
+ from typing import Dict, Literal
+ import warnings

  from ..feat import QueueEventExtractor
  from ..feat.feat_background.base import get_available_background_methods
@@ -27,10 +29,66 @@ class DCNumPipelineJob:
  feature_kwargs: Dict = None,
  gate_code: str = "norm",
  gate_kwargs: Dict = None,
- no_basins_in_output: bool = True,
+ basin_strategy: Literal["drain", "tap"] = "drain",
+ no_basins_in_output: bool = None,
  num_procs: int = None,
+ log_level: int = logging.INFO,
  debug: bool = False,
  ):
+ """Pipeline job recipe
+
+ Parameters
+ ----------
+ path_in: pathlib.Path | str
+ input data path
+ path_out: pathlib.Path | str
+ output data path
+ data_code: str
+ code of input data reader to use
+ data_kwargs: dict
+ keyword arguments for data reader
+ background_code: str
+ code of background data computer to use
+ background_kwargs: dict
+ keyword arguments for background data computer
+ segmenter_code: str
+ code of segmenter to use
+ segmenter_kwargs: dict
+ keyword arguments for segmenter
+ feature_code: str
+ code of feature extractor
+ feature_kwargs: dict
+ keyword arguments for feature extractor
+ gate_code: str
+ code for gating/event filtering class
+ gate_kwargs: dict
+ keyword arguments for gating/event filtering class
+ basin_strategy: str
+ strategy on how to handle event data; In principle, not all
+ events have to be stored in the output file if basins are
+ defined, linking back to the original file.
+ - You can "drain" all basins which means that the output file
+ will contain all features, but will also be very big.
+ - You can "tap" the basins, including the input file, which means
+ that the output file will be comparatively small.
+ no_basins_in_output: bool
+ Deprecated
+ num_procs: int
+ Number of processes to use
+ log_level: int
+ Logging level to use.
+ debug: bool
+ Whether to set logging level to "DEBUG" and
+ use threads instead of processes
+ """
+ if no_basins_in_output is not None:
+ warnings.warn("The `no_basins_in_output` keyword argument is "
+ "deprecated. Please use `basin_strategy` instead.")
+ if no_basins_in_output:
+ basin_strategy = "drain"
+ else:
+ basin_strategy = "tap"
+
  #: initialize keyword arguments for this job
  self.kwargs = {}
  spec = inspect.getfullargspec(DCNumPipelineJob.__init__)
@@ -51,6 +109,9 @@
  if path_out is None:
  pin = pathlib.Path(path_in)
  path_out = pin.with_name(pin.stem + "_dcn.rtdc")
+ # Set logging level to DEBUG in debugging mode
+ if self.kwargs["debug"]:
+ self.kwargs["log_level"] = logging.DEBUG
  self.kwargs["path_out"] = pathlib.Path(path_out)
  # Set default mask kwargs for segmenter
  self.kwargs["segmenter_kwargs"].setdefault("kwargs_mask", {})
@@ -121,3 +182,25 @@
  if len(ret) == 1:
  ret = ret[0]
  return ret
+
+ def validate(self):
+ """Make sure the pipeline will run given the job kwargs
+
+ Returns
+ -------
+ True:
+ for testing convenience
+
+ Raises
+ ------
+ dcnum.segm.SegmenterNotApplicableError:
+ the segmenter is incompatible with the input path
+ """
+ # Check segmenter applicability applicability
+ seg_cls = get_available_segmenters()[self.kwargs["segmenter_code"]]
+ with HDF5Data(self.kwargs["path_in"]) as hd:
+ seg_cls.validate_applicability(
+ segmenter_kwargs=self.kwargs["segmenter_kwargs"],
+ logs=hd.logs,
+ meta=hd.meta)
+ return True
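
For orientation, a hypothetical job construction using the new keyword arguments introduced above; the import path, file name, and reliance on default pipeline codes are assumptions, not taken from this diff:

    import logging
    import pathlib

    from dcnum.logic.job import DCNumPipelineJob  # import path assumed

    job = DCNumPipelineJob(
        path_in=pathlib.Path("measurement.rtdc"),  # placeholder input file
        basin_strategy="tap",    # small output; optional features linked via basins
        log_level=logging.INFO,  # replaces the old debug-only switch
    )
    job.validate()  # raises SegmenterNotApplicableError if the segmenter does not apply
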
dcnum/logic/json_encoder.py CHANGED
@@ -13,5 +13,7 @@ class ExtendedJSONEncoder(json.JSONEncoder):
  return int(obj)
  elif isinstance(obj, np.bool_):
  return bool(obj)
+ elif isinstance(obj, slice):
+ return "PYTHON-SLICE", (obj.start, obj.stop, obj.step)
  # Let the base class default method raise the TypeError
  return json.JSONEncoder.default(self, obj)
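
The new branch makes Python `slice` objects JSON-serializable, presumably because index mappings (which may be slices) now appear in job metadata. A small sketch of the resulting encoding; the import path is assumed from the file layout:

    import json
    from dcnum.logic.json_encoder import ExtendedJSONEncoder  # path assumed

    print(json.dumps({"index_mapping": slice(1, 100, 2)}, cls=ExtendedJSONEncoder))
    # -> {"index_mapping": ["PYTHON-SLICE", [1, 100, 2]]}
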
dcnum/meta/ppid.py CHANGED
@@ -10,7 +10,7 @@ import warnings

  #: Increment this string if there are breaking changes that make
  #: previous pipelines unreproducible.
- DCNUM_PPID_GENERATION = "7"
+ DCNUM_PPID_GENERATION = "10"


  class ClassWithPPIDCapabilities(Protocol):
@@ -59,7 +59,9 @@ def convert_to_dtype(value, dtype):


  def get_class_method_info(class_obj: ClassWithPPIDCapabilities,
- static_kw_methods: List = None):
+ static_kw_methods: List = None,
+ static_kw_defaults: Dict = None,
+ ):
  """Return dictionary of class info with static keyword methods docs

  Parameters
@@ -69,7 +71,16 @@ def get_class_method_info(class_obj: ClassWithPPIDCapabilities,
  static_kw_methods: list of callable
  The methods to inspect; all kwargs-only keyword arguments
  are extracted.
+ static_kw_defaults: dict
+ If a key in this dictionary matches an item in `static_kw_methods`,
+ then these are the default values returned in the "defaults"
+ dictionary. This is used in cases where a base class does
+ implement some annotations, but the subclass does not actually
+ use them, because e.g. they are taken from a property such as is
+ the case for the mask postprocessing of segmenter classes.
  """
+ if static_kw_defaults is None:
+ static_kw_defaults = {}
  doc = class_obj.__doc__ or class_obj.__init__.__doc__
  info = {
  "code": class_obj.get_ppid_code(),
@@ -82,7 +93,10 @@
  for mm in static_kw_methods:
  meth = getattr(class_obj, mm)
  spec = inspect.getfullargspec(meth)
- defau[mm] = spec.kwonlydefaults or {}
+ if mm_defaults := static_kw_defaults.get(mm):
+ defau[mm] = mm_defaults
+ else:
+ defau[mm] = spec.kwonlydefaults or {}
  annot[mm] = spec.annotations
  info["defaults"] = defau
  info["annotations"] = annot
dcnum/read/__init__.py CHANGED
@@ -2,3 +2,4 @@
  from .cache import md5sum
  from .const import PROTECTED_FEATURES
  from .hdf5_data import HDF5Data, HDF5ImageCache, concatenated_hdf5_data
+ from .mapped import get_mapping_indices, get_mapped_object
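
The newly exported `get_mapping_indices` belongs to the index-mapping machinery used throughout ctrl.py. The reason mapped inputs always force a pipeline re-run (see the `redo_seg` comment in the ctrl.py diff) can be shown with plain NumPy; `get_mapping_indices` itself is not shown in this diff, so this is only a sketch of the composition effect:

    import numpy as np

    events = np.arange(16)
    first_pass = events[::2]       # run 1: keep every 2nd event
    second_pass = first_pass[::2]  # run 2 on run 1's output: every 2nd again
    print(second_pass)             # -> [ 0  4  8 12], i.e. every 4th original event
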