dcnum 0.17.2-py3-none-any.whl → 0.19.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dcnum/_version.py +2 -2
- dcnum/feat/__init__.py +1 -1
- dcnum/feat/feat_background/base.py +18 -22
- dcnum/feat/feat_background/bg_copy.py +8 -4
- dcnum/feat/feat_background/bg_roll_median.py +19 -9
- dcnum/feat/feat_background/bg_sparse_median.py +53 -5
- dcnum/feat/feat_brightness/bright_all.py +41 -6
- dcnum/feat/feat_contour/__init__.py +4 -0
- dcnum/feat/{feat_moments/mt_legacy.py → feat_contour/moments.py} +32 -8
- dcnum/feat/feat_contour/volume.py +174 -0
- dcnum/feat/queue_event_extractor.py +25 -4
- dcnum/logic/ctrl.py +18 -3
- dcnum/logic/json_encoder.py +2 -0
- dcnum/meta/ppid.py +1 -1
- dcnum/read/__init__.py +1 -0
- dcnum/read/cache.py +78 -78
- dcnum/read/const.py +4 -1
- dcnum/read/hdf5_data.py +112 -17
- dcnum/read/mapped.py +79 -0
- dcnum/segm/segm_thresh.py +3 -3
- dcnum/segm/segmenter.py +73 -42
- dcnum/segm/segmenter_cpu.py +5 -5
- dcnum/segm/segmenter_manager_thread.py +11 -2
- dcnum/write/writer.py +37 -5
- {dcnum-0.17.2.dist-info → dcnum-0.19.0.dist-info}/METADATA +1 -1
- dcnum-0.19.0.dist-info/RECORD +48 -0
- dcnum/feat/feat_moments/__init__.py +0 -4
- dcnum-0.17.2.dist-info/RECORD +0 -46
- /dcnum/feat/{feat_moments/ct_opencv.py → feat_contour/contour.py} +0 -0
- {dcnum-0.17.2.dist-info → dcnum-0.19.0.dist-info}/LICENSE +0 -0
- {dcnum-0.17.2.dist-info → dcnum-0.19.0.dist-info}/WHEEL +0 -0
- {dcnum-0.17.2.dist-info → dcnum-0.19.0.dist-info}/top_level.txt +0 -0
dcnum/logic/ctrl.py
CHANGED
@@ -310,7 +310,17 @@ class DCNumJobRunner(threading.Thread):
                 # Whether pipeline hash is invalid.
                 ppid.compute_pipeline_hash(**datdict) != dathash
                 # Whether the input file is the original output of the pipeline.
-                or len(self.draw) != evyield
+                or len(self.draw) != evyield
+                # If index mapping is defined, then we always redo the pipeline.
+                # If the pipeline hashes are identical and index mapping is not
+                # None, then both pipelines were done with index mapping.
+                # But applying the same pipeline with index mapping in series
+                # will lead to a different result in the second run (e.g. 1st
+                # pipeline run: take every 2nd event; 2nd pipeline run: take
+                # every second event -> results in every 4th event in output of
+                # second pipeline run).
+                or self.draw.index_mapping is not None
+            )
             # Do we have to recompute the background data? In addition to the
             # hash sanity check above, check the generation, input data,
             # and background pipeline identifiers.
@@ -382,6 +392,10 @@ class DCNumJobRunner(threading.Thread):
         hw.h5.attrs["pipeline:dcnum gate"] = self.ppdict["gate_id"]
         hw.h5.attrs["pipeline:dcnum hash"] = self.pphash
         hw.h5.attrs["pipeline:dcnum yield"] = self.event_count
+        # index mapping information
+        im = self.job.kwargs["data_kwargs"].get("index_mapping", None)
+        dim = HDF5Data.get_ppid_index_mapping(im)
+        hw.h5.attrs["pipeline:dcnum mapping"] = dim
         # regular metadata
         hw.h5.attrs["experiment:event count"] = self.event_count
         hw.h5.attrs["imaging:pixel size"] = self.draw.pixel_size
@@ -503,7 +517,7 @@ class DCNumJobRunner(threading.Thread):
             num_segmenters = 1
         elif seg_cls.hardware_processor == "cpu":  # CPU segmenter
             # We could in principle set the number of slots to one and
-            #
+            # have both number of extractors and number of segmenters set
             # to the total number of CPUs. However, we would need more RAM
             # (for caching the image data) and we also have more overhead.
             # Having two slots shared between all workers is more efficient.
@@ -522,10 +536,11 @@ class DCNumJobRunner(threading.Thread):
         slot_chunks = mp_spawn.Array("i", num_slots)
         slot_states = mp_spawn.Array("u", num_slots)

-        # Initialize thread
+        # Initialize segmenter manager thread
         thr_segm = SegmenterManagerThread(
             segmenter=seg_cls(**self.job["segmenter_kwargs"]),
             image_data=imdat,
+            bg_off=self.dtin["bg_off"] if "bg_off" in self.dtin else None,
             slot_states=slot_states,
             slot_chunks=slot_chunks,
             debug=self.job["debug"],
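The reasoning in the added comment can be illustrated with a short numpy sketch (illustration only, not dcnum code): applying the same index mapping twice in series does not reproduce a single mapped run, which is why an index-mapped pipeline is always recomputed.

import numpy as np

events = np.arange(16)
first_run = events[::2]      # 1st pipeline run: take every 2nd event -> 0, 2, 4, ...
second_run = first_run[::2]  # same mapping applied to the first run's output -> 0, 4, 8, 12
# The second run effectively selects every 4th event of the original data.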
dcnum/logic/json_encoder.py
CHANGED
@@ -13,5 +13,7 @@ class ExtendedJSONEncoder(json.JSONEncoder):
             return int(obj)
         elif isinstance(obj, np.bool_):
             return bool(obj)
+        elif isinstance(obj, slice):
+            return "PYTHON-SLICE", (obj.start, obj.stop, obj.step)
         # Let the base class default method raise the TypeError
         return json.JSONEncoder.default(self, obj)
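With the new branch, a slice no longer raises a TypeError during JSON serialization but is written as a tagged tuple. A minimal sketch of the expected behavior (assuming the encoder is imported from the module shown above):

import json
from dcnum.logic.json_encoder import ExtendedJSONEncoder

text = json.dumps({"index_mapping": slice(0, 100, 2)}, cls=ExtendedJSONEncoder)
# expected: {"index_mapping": ["PYTHON-SLICE", [0, 100, 2]]}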
dcnum/meta/ppid.py
CHANGED
dcnum/read/__init__.py
CHANGED
dcnum/read/cache.py
CHANGED
@@ -1,7 +1,9 @@
+import abc
 import collections
 import functools
 import hashlib
 import pathlib
+from typing import Tuple
 import warnings

 import h5py
@@ -13,41 +15,34 @@ class EmptyDatasetWarning(UserWarning):
     pass


-class HDF5ImageCache:
+class BaseImageChunkCache(abc.ABC):
     def __init__(self,
-
+                 shape: Tuple[int],
                  chunk_size: int = 1000,
                  cache_size: int = 2,
-
-
-
-        Deformability cytometry data files commonly contain image stacks
-        that are chunked in various ways. Loading just a single image
-        can be time-consuming, because an entire HDF5 chunk has to be
-        loaded, decompressed and from that one image extracted. The
-        `HDF5ImageCache` class caches the chunks from the HDF5 files
-        into memory, making single-image-access very fast.
-        """
-        self.shape = h5ds.shape
+                 ):
+        self.shape = shape
+        chunk_size = min(shape[0], chunk_size)
         self._len = self.shape[0]
-        if self._len == 0:
-            warnings.warn(f"Input image '{h5ds.name}' in "
-                          f"file {h5ds.file.filename} has zero length",
-                          EmptyDatasetWarning)
-        # TODO:
-        # - adjust chunking to multiples of the chunks in the dataset
-        #   (which might slightly speed up things)
-        chunk_size = min(h5ds.shape[0], chunk_size)
-        self.h5ds = h5ds
-        self.chunk_size = chunk_size
-        self.boolean = boolean
-        self.cache_size = cache_size
         #: This is a FILO cache for the chunks
         self.cache = collections.OrderedDict()
         self.image_shape = self.shape[1:]
         self.chunk_shape = (chunk_size,) + self.shape[1:]
+        self.chunk_size = chunk_size
+        self.cache_size = cache_size
         self.num_chunks = int(np.ceil(self._len / (self.chunk_size or 1)))

+    def __getitem__(self, index):
+        chunk_index, sub_index = self._get_chunk_index_for_index(index)
+        return self.get_chunk(chunk_index)[sub_index]
+
+    def __len__(self):
+        return self._len
+
+    @abc.abstractmethod
+    def _get_chunk_data(self, chunk_slice):
+        """Implemented in subclass to obtain actual data"""
+
     def _get_chunk_index_for_index(self, index):
         if index < 0:
             index = self._len + index
@@ -59,26 +54,14 @@ class HDF5ImageCache:
         sub_index = index % self.chunk_size
         return chunk_index, sub_index

-    def __getitem__(self, index):
-        chunk_index, sub_index = self._get_chunk_index_for_index(index)
-        return self.get_chunk(chunk_index)[sub_index]
-
-    def __len__(self):
-        return self._len
-
     def get_chunk(self, chunk_index):
         """Return one chunk of images"""
         if chunk_index not in self.cache:
-            fslice = slice(self.chunk_size * chunk_index,
-                           self.chunk_size * (chunk_index + 1)
-                           )
-            data = self.h5ds[fslice]
-            if self.boolean:
-                data = np.array(data, dtype=bool)
-            self.cache[chunk_index] = data
-            if len(self.cache) > self.cache_size:
+            if len(self.cache) >= self.cache_size:
                 # Remove the first item
                 self.cache.popitem(last=False)
+            data = self._get_chunk_data(self.get_chunk_slice(chunk_index))
+            self.cache[chunk_index] = data
         return self.cache[chunk_index]

     def get_chunk_size(self, chunk_index):
@@ -91,60 +74,77 @@ class HDF5ImageCache:
             raise IndexError(f"{self} only has {self.num_chunks} chunks!")
         return chunk_size

+    def get_chunk_slice(self, chunk_index):
+        """Return the slice corresponding to the chunk index"""
+        ch_slice = slice(self.chunk_size * chunk_index,
+                         self.chunk_size * (chunk_index + 1)
+                         )
+        return ch_slice
+
     def iter_chunks(self):
-        size = self.h5ds.shape[0]
         index = 0
         chunk = 0
         while True:
             yield chunk
             chunk += 1
             index += self.chunk_size
-            if index >= size:
+            if index >= self._len:
                 break


-class ImageCorrCache:
+class HDF5ImageCache(BaseImageChunkCache):
     def __init__(self,
-
-
-
-
-
-        self.num_chunks = image.num_chunks
-        self.h5ds = image.h5ds
-        self.shape = image.shape
-        self.chunk_shape = image.chunk_shape
-        #: This is a FILO cache for the corrected image chunks
-        self.cache = collections.OrderedDict()
-        self.cache_size = image.cache_size
+                 h5ds: h5py.Dataset,
+                 chunk_size: int = 1000,
+                 cache_size: int = 2,
+                 boolean: bool = False):
+        """An HDF5 image cache

-
-
-
-
-
-
+        Deformability cytometry data files commonly contain image stacks
+        that are chunked in various ways. Loading just a single image
+        can be time-consuming, because an entire HDF5 chunk has to be
+        loaded, decompressed and from that one image extracted. The
+        `HDF5ImageCache` class caches the chunks from the HDF5 files
+        into memory, making single-image-access very fast.
+        """
+        super(HDF5ImageCache, self).__init__(
+            shape=h5ds.shape,
+            chunk_size=chunk_size,
+            cache_size=cache_size)
+        # TODO:
+        # - adjust chunking to multiples of the chunks in the dataset
+        #   (which might slightly speed up things)
+        self.h5ds = h5ds
+        self.boolean = boolean

-
-
-
+        if self._len == 0:
+            warnings.warn(f"Input image '{h5ds.name}' in "
+                          f"file {h5ds.file.filename} has zero length",
+                          EmptyDatasetWarning)

-    def
-
+    def _get_chunk_data(self, chunk_slice):
+        data = self.h5ds[chunk_slice]
+        if self.boolean:
+            data = np.array(data, dtype=bool)
+        return data

-    def get_chunk(self, chunk_index):
-        if chunk_index not in self.cache:
-            data = np.array(
-                self.image.get_chunk(chunk_index), dtype=np.int16) \
-                - self.image_bg.get_chunk(chunk_index)
-            self.cache[chunk_index] = data
-            if len(self.cache) > self.cache_size:
-                # Remove the first item
-                self.cache.popitem(last=False)
-        return self.cache[chunk_index]

-
-
+class ImageCorrCache(BaseImageChunkCache):
+    def __init__(self,
+                 image: HDF5ImageCache,
+                 image_bg: HDF5ImageCache):
+        super(ImageCorrCache, self).__init__(
+            shape=image.shape,
+            chunk_size=image.chunk_size,
+            cache_size=image.cache_size)
+        self.image = image
+        self.image_bg = image_bg
+
+    def _get_chunk_data(self, chunk_slice):
+        data = np.array(
+            self.image._get_chunk_data(chunk_slice), dtype=np.int16) \
+            - self.image_bg._get_chunk_data(chunk_slice)
+        return data


 @functools.cache
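The refactoring moves the chunk caching logic into the abstract BaseImageChunkCache; HDF5ImageCache now only implements _get_chunk_data for an HDF5 dataset, and ImageCorrCache computes background-corrected chunks on demand. A minimal usage sketch (file name and dataset paths are placeholders):

import h5py
from dcnum.read.cache import HDF5ImageCache, ImageCorrCache

with h5py.File("input.rtdc") as h5:
    image = HDF5ImageCache(h5["events/image"], chunk_size=1000, cache_size=2)
    image_bg = HDF5ImageCache(h5["events/image_bg"], chunk_size=1000, cache_size=2)
    image_corr = ImageCorrCache(image, image_bg)
    # single-image access; the enclosing chunk is loaded and cached once
    first_corrected = image_corr[0]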
dcnum/read/const.py
CHANGED
@@ -1,6 +1,7 @@
 #: Scalar features that apply to all events in a frame and which are
-#: not computed
+#: not computed for individual events.
 PROTECTED_FEATURES = [
+    "bg_off",
     "flow_rate",
     "frame",
     "g_force",
@@ -10,5 +11,7 @@ PROTECTED_FEATURES = [
     "time"
 ]

+# User-defined features may be anything, but if the user needs something
+# very specific for the pipeline, having them protected is a nice feature.
 for ii in range(10):
     PROTECTED_FEATURES.append(f"userdef{ii}")
dcnum/read/hdf5_data.py
CHANGED
@@ -1,7 +1,9 @@
 from __future__ import annotations

+import hashlib
 import io
 import json
+import numbers
 import pathlib
 import tempfile
 from typing import Dict, BinaryIO, List
@@ -13,6 +15,7 @@ import numpy as np

 from .cache import HDF5ImageCache, ImageCorrCache, md5sum
 from .const import PROTECTED_FEATURES
+from .mapped import get_mapped_object, get_mapping_indices


 class HDF5Data:
@@ -26,12 +29,47 @@ class HDF5Data:
                  logs: Dict[List[str]] = None,
                  tables: Dict[np.ndarray] = None,
                  image_cache_size: int = 2,
+                 index_mapping: int | slice | List | np.ndarray = None,
                  ):
+        """
+
+        Parameters
+        ----------
+        path:
+            path to data file
+        pixel_size:
+            pixel size in µm
+        md5_5m:
+            MD5 sum of the first 5 MiB; computed if not provided
+        meta:
+            metadata dictionary; extracted from HDF5 attributes
+            if not provided
+        basins:
+            list of basin dictionaries; extracted from HDF5 attributes
+            if not provided
+        logs:
+            dictionary of logs; extracted from HDF5 attributes
+            if not provided
+        tables:
+            dictionary of tables; extracted from HDF5 attributes
+            if not provided
+        image_cache_size:
+            size of the image cache to use when accessing image data
+        index_mapping:
+            select only a subset of input events, transparently reducing the
+            size of the dataset, possible data types are
+            - int `N`: use the first `N` events
+            - slice: use the events defined by a slice
+            - list: list of integers specifying the event indices to use
+            Numpy indexing rules apply. E.g. to only process the first
+            100 events, set this to `100` or `slice(0, 100)`.
+        """
         # Init is in __setstate__ so we can pickle this class
         # and use it for multiprocessing.
         if isinstance(path, h5py.File):
             self.h5 = path
             path = path.filename
+
         self.__setstate__({"path": path,
                            "pixel_size": pixel_size,
                            "md5_5m": md5_5m,
@@ -40,6 +78,7 @@ class HDF5Data:
                            "logs": logs,
                            "tables": tables,
                            "image_cache_size": image_cache_size,
+                           "index_mapping": index_mapping,
                            })

     def __contains__(self, item):
@@ -53,7 +92,7 @@ class HDF5Data:

     def __getitem__(self, feat):
         if feat in ["image", "image_bg", "mask"]:
-            data = self.get_image_cache(feat)
+            data = self.get_image_cache(feat)  # already index-mapped
             if data is None:
                 raise KeyError(f"Feature '{feat}' not found in {self}!")
         else:
@@ -62,19 +101,25 @@ class HDF5Data:
                 return self._cache_scalar[feat]
             elif (feat in self.h5["events"]
                   and len(self.h5["events"][feat].shape) == 1):  # cache scalar
-                self.
+                if self.index_mapping is None:
+                    idx_map = slice(None)  # no mapping indices, just slice
+                else:
+                    idx_map = get_mapping_indices(self.index_mapping)
+                self._cache_scalar[feat] = self.h5["events"][feat][idx_map]
                 return self._cache_scalar[feat]
             else:
                 if feat in self.h5["events"]:
                     # Not cached (possibly slow)
                     warnings.warn(f"Feature {feat} not cached (possibly slow)")
-                    return
+                    return get_mapped_object(
+                        obj=self.h5["events"][feat],
+                        index_mapping=self.index_mapping)
                 else:
                     # Check the basins
                     for idx in range(len(self.basins)):
                         bn, bn_features = self.get_basin_data(idx)
                         if bn_features and feat in bn_features:
-                            return bn[feat]
+                            return bn[feat]  # already index-mapped
                     # If we got here, then the feature data does not exist.
                     raise KeyError(f"Feature '{feat}' not found in {self}!")

@@ -86,13 +131,14 @@ class HDF5Data:
                 "logs": self.logs,
                 "tables": self.tables,
                 "basins": self.basins,
-                "image_cache_size": self.image.cache_size
+                "image_cache_size": self.image.cache_size,
+                "index_mapping": self.index_mapping,
                 }

     def __setstate__(self, state):
         # Make sure these properties exist (we rely on __init__, because
         # we want this class to be pickable and __init__ is not called by
-        # `pickle.load
+        # `pickle.load`).
         # Cached properties
         self._feats = None
         self._keys = None
@@ -116,7 +162,7 @@ class HDF5Data:
         if self.md5_5m is None:
             if isinstance(self.path, pathlib.Path):
                 # 5MB md5sum of input file
-                self.md5_5m = md5sum(self.path, count=80)
+                self.md5_5m = md5sum(self.path, blocksize=65536, count=80)
             else:
                 self.md5_5m = str(uuid.uuid4()).replace("-", "")
         self.meta = state["meta"]
@@ -165,12 +211,17 @@ class HDF5Data:

         self.image_cache_size = state["image_cache_size"]

+        self.index_mapping = state["index_mapping"]
+
         if self.h5 is None:
             self.h5 = h5py.File(self.path, libver="latest")

     def __len__(self):
         if self._len is None:
-            self.
+            if self.index_mapping is not None:
+                self._len = get_mapping_indices(self.index_mapping).size
+            else:
+                self._len = self.h5.attrs["experiment:event count"]
         return self._len

     @property
@@ -244,7 +295,9 @@ class HDF5Data:
         self.h5.close()

     def get_ppid(self):
-        return self.get_ppid_from_ppkw(
+        return self.get_ppid_from_ppkw(
+            {"pixel_size": self.pixel_size,
+             "index_mapping": self.index_mapping})

     @classmethod
     def get_ppid_code(cls):
@@ -255,20 +308,60 @@ class HDF5Data:
         # Data does not really fit into the PPID scheme we use for the rest
         # of the pipeline. This implementation here is custom.
         code = cls.get_ppid_code()
-
+        # pixel size
+        ppid_ps = f"{kwargs['pixel_size']:.8f}".rstrip("0")
+        # index mapping
+        ppid_im = cls.get_ppid_index_mapping(kwargs.get("index_mapping", None))
+        kwid = "^".join([f"p={ppid_ps}", f"i={ppid_im}"])
         return ":".join([code, kwid])

+    @staticmethod
+    def get_ppid_index_mapping(index_mapping):
+        """Return the pipeline identifier part for index mapping"""
+        im = index_mapping
+        if im is None:
+            dim = "0"
+        elif isinstance(im, numbers.Integral):
+            dim = f"{im}"
+        elif isinstance(im, slice):
+            dim = (f"{im.start if im.start is not None else 'n'}"
+                   + f"-{im.stop if im.stop is not None else 'n'}"
+                   + f"-{im.step if im.step is not None else 'n'}"
+                   )
+        elif isinstance(im, (list, np.ndarray)):
+            idhash = hashlib.md5(
+                np.array(im, dtype=np.uint32).tobytes()).hexdigest()
+            dim = f"h-{idhash[:8]}"
+        else:
+            dim = "unknown"
+        return dim
+
     @staticmethod
     def get_ppkw_from_ppid(dat_ppid):
         # Data does not fit in the PPID scheme we use, but we still
         # would like to pass pixel_size to __init__ if we need it.
-        code,
+        code, kwargs_str = dat_ppid.split(":")
         if code != HDF5Data.get_ppid_code():
             raise ValueError(f"Could not find data method '{code}'!")
-
-
-
-
+        kwitems = kwargs_str.split("^")
+        kwargs = {}
+        for item in kwitems:
+            var, val = item.split("=")
+            if var == "p":
+                kwargs["pixel_size"] = float(val)
+            elif var == "i":
+                if val.startswith("h-") or val == "unknown":
+                    raise ValueError(f"Cannot invert index mapping {val}")
+                elif val == "0":
+                    kwargs["index_mapping"] = None
+                elif val.count("-"):
+                    start, stop = [int(v) for v in val.split("-")]
+                    kwargs["index_mapping"] = slice(start, stop)
+                else:
+                    kwargs["index_mapping"] = int(val)
+            else:
+                raise ValueError(f"Invalid parameter '{var}'!")
+        return kwargs

     def get_basin_data(self, index):
         """Return HDF5Data info for a basin index in `self.basins`
@@ -298,7 +391,7 @@ class HDF5Data:
             if path is None:
                 self._basin_data[index] = (None, None)
             else:
-                h5dat = HDF5Data(path)
+                h5dat = HDF5Data(path, index_mapping=self.index_mapping)
                 features = bn_dict.get("features")
                 if features is None:
                     # Only get the features from the actual HDF5 file.
@@ -336,7 +429,8 @@ class HDF5Data:

             if ds is not None:
                 image = HDF5ImageCache(
-                    h5ds=ds,
+                    h5ds=get_mapped_object(obj=ds,
+                                           index_mapping=self.index_mapping),
                     cache_size=self.image_cache_size,
                     boolean=feat == "mask")
             else:
@@ -386,6 +480,7 @@ def concatenated_hdf5_data(paths: List[pathlib.Path],
     - If one of the input files does not contain a feature from the first
       input `paths`, then a `ValueError` is raised. Use the `features`
       argument to specify which features you need instead.
+    - Basins are not considered.
     """
     h5kwargs = {"mode": "w", "libver": "latest"}
     if isinstance(path_out, (pathlib.Path, str)):
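The new index_mapping keyword and its pipeline-identifier encoding can be exercised directly. The return values below follow get_ppid_index_mapping as added above; the file path is a placeholder:

from dcnum.read.hdf5_data import HDF5Data

# only process the first 100 events of the input file
data = HDF5Data("input.rtdc", index_mapping=100)

# pipeline-identifier part for the index mapping
HDF5Data.get_ppid_index_mapping(None)           # "0" (no mapping)
HDF5Data.get_ppid_index_mapping(100)            # "100"
HDF5Data.get_ppid_index_mapping(slice(0, 100))  # "0-100-n"
HDF5Data.get_ppid_index_mapping([0, 2, 4])      # "h-" plus the first 8 hex digits of an md5 sum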
dcnum/read/mapped.py
ADDED
@@ -0,0 +1,79 @@
+import functools
+
+import numbers
+
+import h5py
+import numpy as np
+
+
+class MappedHDF5Dataset:
+    def __init__(self,
+                 h5ds: h5py.Dataset,
+                 mapping_indices: np.ndarray):
+        """An index-mapped object for accessing an HDF5 dataset
+
+        Parameters
+        ----------
+        h5ds: h5py.Dataset
+            HDF5 dataset from which to map data
+        mapping_indices: np.ndarray
+            numpy indexing array containing integer indices
+        """
+        self.h5ds = h5ds
+        self.mapping_indices = mapping_indices
+        self.shape = (mapping_indices.size,) + h5ds.shape[1:]
+
+    def __getitem__(self, idx):
+        if isinstance(idx, numbers.Integral):
+            return self.h5ds[self.mapping_indices[idx]]
+        else:
+            idx_mapped = self.mapping_indices[idx]
+            return self.h5ds[idx_mapped]
+
+
+def get_mapping_indices(
+        index_mapping: numbers.Integral | slice | list | np.ndarray
+        ):
+    if isinstance(index_mapping, numbers.Integral):
+        return _get_mapping_indices_cached(index_mapping)
+    elif isinstance(index_mapping, slice):
+        return _get_mapping_indices_cached(
+            (index_mapping.start, index_mapping.stop, index_mapping.step))
+    elif isinstance(index_mapping, (np.ndarray, list)):
+        return np.array(index_mapping, dtype=np.uint32)
+    else:
+        raise ValueError(f"Invalid type for `index_mapping`: "
+                         f"{type(index_mapping)} ({index_mapping})")
+
+
+@functools.lru_cache(maxsize=100)
+def _get_mapping_indices_cached(
+        index_mapping: numbers.Integral | tuple
+        ):
+    if isinstance(index_mapping, numbers.Integral):
+        return np.arange(index_mapping)
+    elif isinstance(index_mapping, tuple):
+        im_slice = slice(*index_mapping)
+        if im_slice.step is not None:
+            raise NotImplementedError("Slices with step not implemented yet")
+        if im_slice.stop is None or im_slice.start is None:
+            raise NotImplementedError(
+                "Slices must have start and stop defined")
+        return np.arange(im_slice.start, im_slice.stop)
+    elif isinstance(index_mapping, list):
+        return np.array(index_mapping, dtype=np.uint32)
+    else:
+        raise ValueError(f"Invalid type for cached `index_mapping`: "
+                         f"{type(index_mapping)} ({index_mapping})")
+
+
+def get_mapped_object(obj, index_mapping=None):
+    if index_mapping is None:
+        return obj
+    elif isinstance(obj, h5py.Dataset):
+        return MappedHDF5Dataset(
+            obj,
+            mapping_indices=get_mapping_indices(index_mapping))
+    else:
+        raise ValueError(f"No recipe to convert object of type {type(obj)} "
+                         f"({obj}) to an index-mapped object")
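The new module resolves the accepted index_mapping types (integer, slice, list/array) into integer index arrays and can wrap an h5py.Dataset so that only the mapped events are visible. A short sketch of the helpers added above:

from dcnum.read.mapped import get_mapping_indices, get_mapped_object

get_mapping_indices(5)            # array([0, 1, 2, 3, 4])
get_mapping_indices(slice(2, 6))  # array([2, 3, 4, 5])
get_mapping_indices([1, 3, 5])    # array([1, 3, 5], dtype=uint32)

# Wrapping an h5py.Dataset ds (e.g. h5["events/deform"]) yields a
# MappedHDF5Dataset that only exposes the mapped events:
# mapped = get_mapped_object(ds, index_mapping=slice(0, 100))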
dcnum/segm/segm_thresh.py
CHANGED
@@ -16,7 +16,7 @@ class SegmentThresh(CPUSegmenter):
         Parameters
         ----------
         thresh: int
-            grayscale
+            grayscale threshold value for creating the mask image;
             For a background-corrected image, pixels with values below
             this value are considered to be part of the mask.
         """
@@ -25,7 +25,7 @@ class SegmentThresh(CPUSegmenter):
     @staticmethod
     def segment_approach(image, *,
                          thresh: float = -6):
-        """Mask retrieval
+        """Mask retrieval using basic thresholding

         Parameters
         ----------
@@ -39,7 +39,7 @@ class SegmentThresh(CPUSegmenter):
         Returns
         -------
         mask: 2d boolean ndarray
-            Mask image for the
+            Mask image for the given index
         """
         assert thresh < 0, "threshold values above zero not supported!"
         return image < thresh
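For reference, the thresholding segmenter works on background-corrected images: pixels darker than the (negative) threshold form the mask. A small sketch using the static method documented above:

import numpy as np
from dcnum.segm.segm_thresh import SegmentThresh

image_corr = np.array([[0, -2, -10],
                       [-7, -1, 0]], dtype=np.int16)
mask = SegmentThresh.segment_approach(image_corr, thresh=-6)
# mask == [[False, False, True],
#          [True, False, False]]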