dcnum 0.17.0__py3-none-any.whl → 0.23.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dcnum/_version.py +2 -2
- dcnum/feat/__init__.py +1 -1
- dcnum/feat/event_extractor_manager_thread.py +34 -25
- dcnum/feat/feat_background/base.py +22 -26
- dcnum/feat/feat_background/bg_copy.py +18 -12
- dcnum/feat/feat_background/bg_roll_median.py +20 -10
- dcnum/feat/feat_background/bg_sparse_median.py +55 -7
- dcnum/feat/feat_brightness/bright_all.py +41 -6
- dcnum/feat/feat_contour/__init__.py +4 -0
- dcnum/feat/{feat_moments/mt_legacy.py → feat_contour/moments.py} +32 -8
- dcnum/feat/feat_contour/volume.py +174 -0
- dcnum/feat/feat_texture/tex_all.py +28 -1
- dcnum/feat/gate.py +2 -2
- dcnum/feat/queue_event_extractor.py +30 -9
- dcnum/logic/ctrl.py +199 -49
- dcnum/logic/job.py +63 -2
- dcnum/logic/json_encoder.py +2 -0
- dcnum/meta/ppid.py +17 -3
- dcnum/read/__init__.py +1 -0
- dcnum/read/cache.py +100 -78
- dcnum/read/const.py +6 -4
- dcnum/read/hdf5_data.py +146 -23
- dcnum/read/mapped.py +87 -0
- dcnum/segm/__init__.py +6 -3
- dcnum/segm/segm_thresh.py +6 -18
- dcnum/segm/segm_torch/__init__.py +19 -0
- dcnum/segm/segm_torch/segm_torch_base.py +125 -0
- dcnum/segm/segm_torch/segm_torch_mpo.py +71 -0
- dcnum/segm/segm_torch/segm_torch_sto.py +88 -0
- dcnum/segm/segm_torch/torch_model.py +95 -0
- dcnum/segm/segm_torch/torch_postproc.py +93 -0
- dcnum/segm/segm_torch/torch_preproc.py +114 -0
- dcnum/segm/segmenter.py +181 -80
- dcnum/segm/segmenter_manager_thread.py +38 -30
- dcnum/segm/{segmenter_cpu.py → segmenter_mpo.py} +116 -44
- dcnum/segm/segmenter_sto.py +110 -0
- dcnum/write/__init__.py +2 -1
- dcnum/write/deque_writer_thread.py +9 -1
- dcnum/write/queue_collector_thread.py +8 -14
- dcnum/write/writer.py +128 -5
- {dcnum-0.17.0.dist-info → dcnum-0.23.1.dist-info}/METADATA +4 -2
- dcnum-0.23.1.dist-info/RECORD +55 -0
- {dcnum-0.17.0.dist-info → dcnum-0.23.1.dist-info}/WHEEL +1 -1
- dcnum/feat/feat_moments/__init__.py +0 -4
- dcnum/segm/segmenter_gpu.py +0 -64
- dcnum-0.17.0.dist-info/RECORD +0 -46
- /dcnum/feat/{feat_moments/ct_opencv.py → feat_contour/contour.py} +0 -0
- {dcnum-0.17.0.dist-info → dcnum-0.23.1.dist-info}/LICENSE +0 -0
- {dcnum-0.17.0.dist-info → dcnum-0.23.1.dist-info}/top_level.txt +0 -0
dcnum/read/cache.py
CHANGED
@@ -1,7 +1,9 @@
+import abc
 import collections
 import functools
 import hashlib
 import pathlib
+from typing import Tuple
 import warnings
 
 import h5py
@@ -13,41 +15,55 @@ class EmptyDatasetWarning(UserWarning):
     pass
 
 
-class HDF5ImageCache:
+class BaseImageChunkCache(abc.ABC):
     def __init__(self,
-                 h5ds: h5py.Dataset,
+                 shape: Tuple[int],
                  chunk_size: int = 1000,
                  cache_size: int = 2,
-                 boolean: bool = False):
-        """An HDF5 image cache
-
-        Deformability cytometry data files commonly contain image stacks
-        that are chunked in various ways. Loading just a single image
-        can be time-consuming, because an entire HDF5 chunk has to be
-        loaded, decompressed and from that one image extracted. The
-        `HDF5ImageCache` class caches the chunks from the HDF5 files
-        into memory, making single-image-access very fast.
-        """
-        self.shape = h5ds.shape
+                 ):
+        self.shape = shape
+        self._dtype = None
+        chunk_size = min(shape[0], chunk_size)
         self._len = self.shape[0]
-        if self._len == 0:
-            warnings.warn(f"Input image '{h5ds.name}' in "
-                          f"file {h5ds.file.filename} has zero length",
-                          EmptyDatasetWarning)
-        # TODO:
-        # - adjust chunking to multiples of the chunks in the dataset
-        #   (which might slightly speed up things)
-        chunk_size = min(h5ds.shape[0], chunk_size)
-        self.h5ds = h5ds
-        self.chunk_size = chunk_size
-        self.boolean = boolean
-        self.cache_size = cache_size
         #: This is a FILO cache for the chunks
         self.cache = collections.OrderedDict()
         self.image_shape = self.shape[1:]
         self.chunk_shape = (chunk_size,) + self.shape[1:]
+        self.chunk_size = chunk_size
+        self.cache_size = cache_size
         self.num_chunks = int(np.ceil(self._len / (self.chunk_size or 1)))
 
+    def __getitem__(self, index):
+        if isinstance(index, (slice, list, np.ndarray)):
+            if isinstance(index, slice):
+                indices = np.arange(index.start or 0,
+                                    index.stop or len(self),
+                                    index.step)
+            else:
+                indices = index
+            array_out = np.empty((len(indices),) + self.image_shape,
+                                 dtype=self.dtype)
+            for ii, idx in enumerate(indices):
+                array_out[ii] = self[idx]
+            return array_out
+        else:
+            chunk_index, sub_index = self._get_chunk_index_for_index(index)
+            return self.get_chunk(chunk_index)[sub_index]
+
+    def __len__(self):
+        return self._len
+
+    @property
+    def dtype(self):
+        """data type of the image data"""
+        if self._dtype is None:
+            self._dtype = self[0].dtype
+        return self._dtype
+
+    @abc.abstractmethod
+    def _get_chunk_data(self, chunk_slice):
+        """Implemented in subclass to obtain actual data"""
+
     def _get_chunk_index_for_index(self, index):
         if index < 0:
             index = self._len + index
@@ -55,30 +71,19 @@ class HDF5ImageCache:
             raise IndexError(
                 f"Index {index} out of bounds for HDF5ImageCache "
                 f"of size {self._len}")
+        index = int(index)  # convert np.uint64 to int, so we get ints below
         chunk_index = index // self.chunk_size
         sub_index = index % self.chunk_size
         return chunk_index, sub_index
 
-    def __getitem__(self, index):
-        chunk_index, sub_index = self._get_chunk_index_for_index(index)
-        return self.get_chunk(chunk_index)[sub_index]
-
-    def __len__(self):
-        return self._len
-
     def get_chunk(self, chunk_index):
        """Return one chunk of images"""
        if chunk_index not in self.cache:
-            fslice = slice(self.chunk_size * chunk_index,
-                           self.chunk_size * (chunk_index + 1)
-                           )
-            data = self.h5ds[fslice]
-            if self.boolean:
-                data = np.array(data, dtype=bool)
-            self.cache[chunk_index] = data
-            if len(self.cache) > self.cache_size:
+            if len(self.cache) >= self.cache_size:
                 # Remove the first item
                 self.cache.popitem(last=False)
+            data = self._get_chunk_data(self.get_chunk_slice(chunk_index))
+            self.cache[chunk_index] = data
         return self.cache[chunk_index]
 
     def get_chunk_size(self, chunk_index):
@@ -91,60 +96,77 @@ class HDF5ImageCache:
             raise IndexError(f"{self} only has {self.num_chunks} chunks!")
         return chunk_size
 
+    def get_chunk_slice(self, chunk_index):
+        """Return the slice corresponding to the chunk index"""
+        ch_slice = slice(self.chunk_size * chunk_index,
+                         self.chunk_size * (chunk_index + 1)
+                         )
+        return ch_slice
+
     def iter_chunks(self):
-        size = self.h5ds.shape[0]
         index = 0
         chunk = 0
         while True:
             yield chunk
             chunk += 1
             index += self.chunk_size
-            if index >= size:
+            if index >= self._len:
                 break
 
 
-class ImageCorrCache:
+class HDF5ImageCache(BaseImageChunkCache):
     def __init__(self,
-
-
-
-
-
-        self.num_chunks = image.num_chunks
-        self.h5ds = image.h5ds
-        self.shape = image.shape
-        self.chunk_shape = image.chunk_shape
-        #: This is a FILO cache for the corrected image chunks
-        self.cache = collections.OrderedDict()
-        self.cache_size = image.cache_size
+                 h5ds: h5py.Dataset,
+                 chunk_size: int = 1000,
+                 cache_size: int = 2,
+                 boolean: bool = False):
+        """An HDF5 image cache
 
-
-
-
-
-
-
+        Deformability cytometry data files commonly contain image stacks
+        that are chunked in various ways. Loading just a single image
+        can be time-consuming, because an entire HDF5 chunk has to be
+        loaded, decompressed and from that one image extracted. The
+        `HDF5ImageCache` class caches the chunks from the HDF5 files
+        into memory, making single-image-access very fast.
+        """
+        super(HDF5ImageCache, self).__init__(
+            shape=h5ds.shape,
+            chunk_size=chunk_size,
+            cache_size=cache_size)
+        # TODO:
+        # - adjust chunking to multiples of the chunks in the dataset
+        #   (which might slightly speed up things)
+        self.h5ds = h5ds
+        self.boolean = boolean
 
-
-
-
+        if self._len == 0:
+            warnings.warn(f"Input image '{h5ds.name}' in "
+                          f"file {h5ds.file.filename} has zero length",
+                          EmptyDatasetWarning)
 
-    def
-
+    def _get_chunk_data(self, chunk_slice):
+        data = self.h5ds[chunk_slice]
+        if self.boolean:
+            data = np.array(data, dtype=bool)
+        return data
 
-    def get_chunk(self, chunk_index):
-        if chunk_index not in self.cache:
-            data = np.array(
-                self.image.get_chunk(chunk_index), dtype=np.int16) \
-                - self.image_bg.get_chunk(chunk_index)
-            self.cache[chunk_index] = data
-            if len(self.cache) > self.cache_size:
-                # Remove the first item
-                self.cache.popitem(last=False)
-        return self.cache[chunk_index]
 
-
-
+class ImageCorrCache(BaseImageChunkCache):
+    def __init__(self,
+                 image: HDF5ImageCache,
+                 image_bg: HDF5ImageCache):
+        super(ImageCorrCache, self).__init__(
+            shape=image.shape,
+            chunk_size=image.chunk_size,
+            cache_size=image.cache_size)
+        self.image = image
+        self.image_bg = image_bg
+
+    def _get_chunk_data(self, chunk_slice):
+        data = np.array(
+            self.image._get_chunk_data(chunk_slice), dtype=np.int16) \
+            - self.image_bg._get_chunk_data(chunk_slice)
+        return data
 
 
 @functools.cache
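The refactoring above splits the chunked-image caching into an abstract BaseImageChunkCache (chunk bookkeeping, fancy indexing, dtype detection) and concrete caches that only implement _get_chunk_data: HDF5ImageCache reads chunks from an HDF5 dataset, and ImageCorrCache subtracts background chunks on access. A minimal usage sketch based only on the signatures visible in this diff; the file name "example.rtdc" and the dataset paths under "events/" are assumptions, not part of the diff:

    import h5py
    from dcnum.read.cache import HDF5ImageCache, ImageCorrCache

    with h5py.File("example.rtdc", "r") as h5:  # hypothetical input file
        # Chunk-cached access to the raw and background image stacks
        image = HDF5ImageCache(h5ds=h5["events/image"],
                               chunk_size=1000, cache_size=2)
        image_bg = HDF5ImageCache(h5ds=h5["events/image_bg"])
        first = image[0]         # single image, served from a cached chunk
        some = image[[0, 2, 4]]  # lists/slices are assembled image by image
        # Background-corrected images are computed chunk-wise on access
        corr = ImageCorrCache(image=image, image_bg=image_bg)
        event5 = corr[5]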
dcnum/read/const.py
CHANGED
@@ -1,15 +1,17 @@
-#: Scalar features that apply to all events in a frame
+#: Scalar features that apply to all events in a frame and which are
+#: not computed for individual events.
 PROTECTED_FEATURES = [
-    "
+    "bg_off",
     "flow_rate",
     "frame",
     "g_force",
-    "index_online",
     "pressure",
     "temp",
     "temp_amb",
-    "time"
+    "time",
 ]
 
+# User-defined features may be anything, but if the user needs something
+# very specific for the pipeline, having them protected is a nice feature.
 for ii in range(10):
     PROTECTED_FEATURES.append(f"userdef{ii}")
dcnum/read/hdf5_data.py
CHANGED
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
+import hashlib
 import io
 import json
+import numbers
 import pathlib
 import tempfile
 from typing import Dict, BinaryIO, List
@@ -13,6 +15,7 @@ import numpy as np
 
 from .cache import HDF5ImageCache, ImageCorrCache, md5sum
 from .const import PROTECTED_FEATURES
+from .mapped import get_mapped_object, get_mapping_indices
 
 
 class HDF5Data:
@@ -26,12 +29,47 @@ class HDF5Data:
                 logs: Dict[List[str]] = None,
                 tables: Dict[np.ndarray] = None,
                 image_cache_size: int = 2,
+                 index_mapping: int | slice | List | np.ndarray = None,
                 ):
+        """
+
+        Parameters
+        ----------
+        path:
+            path to data file
+        pixel_size:
+            pixel size in µm
+        md5_5m:
+            MD5 sum of the first 5 MiB; computed if not provided
+        meta:
+            metadata dictionary; extracted from HDF5 attributes
+            if not provided
+        basins:
+            list of basin dictionaries; extracted from HDF5 attributes
+            if not provided
+        logs:
+            dictionary of logs; extracted from HDF5 attributes
+            if not provided
+        tables:
+            dictionary of tables; extracted from HDF5 attributes
+            if not provided
+        image_cache_size:
+            size of the image cache to use when accessing image data
+        index_mapping:
+            select only a subset of input events, transparently reducing the
+            size of the dataset, possible data types are
+            - int `N`: use the first `N` events
+            - slice: use the events defined by a slice
+            - list: list of integers specifying the event indices to use
+            Numpy indexing rules apply. E.g. to only process the first
+            100 events, set this to `100` or `slice(0, 100)`.
+        """
         # Init is in __setstate__ so we can pickle this class
         # and use it for multiprocessing.
         if isinstance(path, h5py.File):
             self.h5 = path
             path = path.filename
+
         self.__setstate__({"path": path,
                            "pixel_size": pixel_size,
                            "md5_5m": md5_5m,
@@ -40,6 +78,7 @@ class HDF5Data:
                            "logs": logs,
                            "tables": tables,
                            "image_cache_size": image_cache_size,
+                           "index_mapping": index_mapping,
                            })
 
     def __contains__(self, item):
@@ -53,7 +92,7 @@ class HDF5Data:
 
     def __getitem__(self, feat):
         if feat in ["image", "image_bg", "mask"]:
-            data = self.get_image_cache(feat)
+            data = self.get_image_cache(feat)  # already index-mapped
             if data is None:
                 raise KeyError(f"Feature '{feat}' not found in {self}!")
         else:
@@ -62,19 +101,25 @@ class HDF5Data:
             return self._cache_scalar[feat]
         elif (feat in self.h5["events"]
               and len(self.h5["events"][feat].shape) == 1):  # cache scalar
-            self._cache_scalar[feat] = self.h5["events"][feat][:]
+            if self.index_mapping is None:
+                idx_map = slice(None)  # no mapping indices, just slice
+            else:
+                idx_map = get_mapping_indices(self.index_mapping)
+            self._cache_scalar[feat] = self.h5["events"][feat][idx_map]
             return self._cache_scalar[feat]
         else:
             if feat in self.h5["events"]:
                 # Not cached (possibly slow)
                 warnings.warn(f"Feature {feat} not cached (possibly slow)")
-                return self.h5["events"][feat]
+                return get_mapped_object(
+                    obj=self.h5["events"][feat],
+                    index_mapping=self.index_mapping)
             else:
                 # Check the basins
                 for idx in range(len(self.basins)):
                     bn, bn_features = self.get_basin_data(idx)
                     if bn_features and feat in bn_features:
-                        return bn[feat]
+                        return bn[feat]  # already index-mapped
                 # If we got here, then the feature data does not exist.
                 raise KeyError(f"Feature '{feat}' not found in {self}!")
 
@@ -86,13 +131,14 @@ class HDF5Data:
             "logs": self.logs,
             "tables": self.tables,
             "basins": self.basins,
-            "image_cache_size": self.image.cache_size
+            "image_cache_size": self.image.cache_size,
+            "index_mapping": self.index_mapping,
         }
 
     def __setstate__(self, state):
         # Make sure these properties exist (we rely on __init__, because
         # we want this class to be pickable and __init__ is not called by
-        # `pickle.load
+        # `pickle.load`).
         # Cached properties
         self._feats = None
         self._keys = None
@@ -116,7 +162,7 @@ class HDF5Data:
         if self.md5_5m is None:
             if isinstance(self.path, pathlib.Path):
                 # 5MB md5sum of input file
-                self.md5_5m = md5sum(self.path, count=80)
+                self.md5_5m = md5sum(self.path, blocksize=65536, count=80)
             else:
                 self.md5_5m = str(uuid.uuid4()).replace("-", "")
         self.meta = state["meta"]
@@ -140,37 +186,44 @@ class HDF5Data:
             if isinstance(self.meta[key], bytes):
                 self.meta[key] = self.meta[key].decode("utf-8")
         # logs
-        for key in h5.get("logs",
+        for key in sorted(h5.get("logs", {}).keys()):
            alog = list(h5["logs"][key])
            if alog:
                if isinstance(alog[0], bytes):
                    alog = [ll.decode("utf") for ll in alog]
                self.logs[key] = alog
         # tables
-        for tab in h5.get("tables",
+        for tab in sorted(h5.get("tables", {}).keys()):
            tabdict = {}
            for tkey in h5["tables"][tab].dtype.fields.keys():
                tabdict[tkey] = \
                    np.array(h5["tables"][tab][tkey]).reshape(-1)
            self.tables[tab] = tabdict
         # basins
-
+        basins = []
+        for bnkey in h5.get("basins", {}).keys():
            bn_data = "\n".join(
                [s.decode() for s in h5["basins"][bnkey][:].tolist()])
            bn_dict = json.loads(bn_data)
-
+            basins.append(bn_dict)
+        self.basins = sorted(basins, key=lambda x: x["name"])
 
         if state["pixel_size"] is not None:
             self.pixel_size = state["pixel_size"]
 
         self.image_cache_size = state["image_cache_size"]
 
+        self.index_mapping = state["index_mapping"]
+
         if self.h5 is None:
             self.h5 = h5py.File(self.path, libver="latest")
 
     def __len__(self):
         if self._len is None:
-            self._len = self.h5.attrs["experiment:event count"]
+            if self.index_mapping is not None:
+                self._len = get_mapping_indices(self.index_mapping).size
+            else:
+                self._len = self.h5.attrs["experiment:event count"]
         return self._len
 
     @property
@@ -244,7 +297,9 @@ class HDF5Data:
         self.h5.close()
 
     def get_ppid(self):
-        return self.get_ppid_from_ppkw(
+        return self.get_ppid_from_ppkw(
+            {"pixel_size": self.pixel_size,
+             "index_mapping": self.index_mapping})
 
     @classmethod
     def get_ppid_code(cls):
@@ -255,20 +310,64 @@ class HDF5Data:
         # Data does not really fit into the PPID scheme we use for the rest
         # of the pipeline. This implementation here is custom.
         code = cls.get_ppid_code()
-
+        # pixel size
+        ppid_ps = f"{kwargs['pixel_size']:.8f}".rstrip("0")
+        # index mapping
+        ppid_im = cls.get_ppid_index_mapping(kwargs.get("index_mapping", None))
+        kwid = "^".join([f"p={ppid_ps}", f"i={ppid_im}"])
         return ":".join([code, kwid])
 
+    @staticmethod
+    def get_ppid_index_mapping(index_mapping):
+        """Return the pipeline identifier part for index mapping"""
+        im = index_mapping
+        if im is None:
+            dim = "0"
+        elif isinstance(im, numbers.Integral):
+            dim = f"{im}"
+        elif isinstance(im, slice):
+            dim = (f"{im.start if im.start is not None else 'n'}"
+                   + f"-{im.stop if im.stop is not None else 'n'}"
+                   + f"-{im.step if im.step is not None else 'n'}"
+                   )
+        elif isinstance(im, (list, np.ndarray)):
+            idhash = hashlib.md5(
+                np.array(im, dtype=np.uint32).tobytes()).hexdigest()
+            dim = f"h-{idhash[:8]}"
+        else:
+            dim = "unknown"
+        return dim
+
     @staticmethod
     def get_ppkw_from_ppid(dat_ppid):
         # Data does not fit in the PPID scheme we use, but we still
         # would like to pass pixel_size to __init__ if we need it.
-        code,
+        code, kwargs_str = dat_ppid.split(":")
         if code != HDF5Data.get_ppid_code():
             raise ValueError(f"Could not find data method '{code}'!")
-
-
-
-
+        kwitems = kwargs_str.split("^")
+        kwargs = {}
+        for item in kwitems:
+            var, val = item.split("=")
+            if var == "p":
+                kwargs["pixel_size"] = float(val)
+            elif var == "i":
+                if val.startswith("h-") or val == "unknown":
+                    raise ValueError(f"Cannot invert index mapping {val}")
+                elif val == "0":
+                    kwargs["index_mapping"] = None
+                elif val.count("-"):
+                    start, stop, step = val.split("-")
+                    kwargs["index_mapping"] = slice(
+                        None if start == "n" else int(start),
+                        None if stop == "n" else int(stop),
+                        None if step == "n" else int(step)
+                    )
+                else:
+                    kwargs["index_mapping"] = int(val)
+            else:
+                raise ValueError(f"Invalid parameter '{var}'!")
+        return kwargs
 
     def get_basin_data(self, index):
         """Return HDF5Data info for a basin index in `self.basins`
@@ -298,7 +397,22 @@ class HDF5Data:
         if path is None:
             self._basin_data[index] = (None, None)
         else:
-            h5dat = HDF5Data(path)
+            feat_basinmap = bn_dict.get("mapping", None)
+            if feat_basinmap is None:
+                # This is NOT a mapped basin.
+                index_mapping = self.index_mapping
+            else:
+                # This is a mapped basin. Create an indexing list.
+                if self.index_mapping is None:
+                    # The current dataset is not mapped.
+                    basinmap_idx = slice(None)
+                else:
+                    # The current dataset is also mapped.
+                    basinmap_idx = get_mapping_indices(self.index_mapping)
+                basinmap = self.h5[f"events/{feat_basinmap}"]
+                index_mapping = basinmap[basinmap_idx]
+
+            h5dat = HDF5Data(path, index_mapping=index_mapping)
             features = bn_dict.get("features")
             if features is None:
                 # Only get the features from the actual HDF5 file.
@@ -323,20 +437,27 @@ class HDF5Data:
         if feat not in self._image_cache:
             if f"events/{feat}" in self.h5:
                 ds = self.h5[f"events/{feat}"]
+                idx_map = self.index_mapping
             else:
+                idx_map = None
                 # search all basins
                 for idx in range(len(self.basins)):
-
+                    bn_dat, features = self.get_basin_data(idx)
                    if features is not None:
                        if feat in features:
-
+                            # HDF5 dataset
+                            ds = bn_dat.h5[f"events/{feat}"]
+                            # Index mapping (taken from the basins which
+                            # already includes the mapping from the current
+                            # instance).
+                            idx_map = bn_dat.index_mapping
                            break
                else:
                    ds = None
 
            if ds is not None:
                image = HDF5ImageCache(
-                    h5ds=ds,
+                    h5ds=get_mapped_object(obj=ds, index_mapping=idx_map),
                    cache_size=self.image_cache_size,
                    boolean=feat == "mask")
            else:
@@ -386,6 +507,7 @@ def concatenated_hdf5_data(paths: List[pathlib.Path],
    - If one of the input files does not contain a feature from the first
      input `paths`, then a `ValueError` is raised. Use the `features`
      argument to specify which features you need instead.
+    - Basins are not considered.
    """
    h5kwargs = {"mode": "w", "libver": "latest"}
    if isinstance(path_out, (pathlib.Path, str)):
@@ -432,6 +554,7 @@ def concatenated_hdf5_data(paths: List[pathlib.Path],
            if not isinstance(h5["events"][feat], h5py.Dataset):
                warnings.warn(
                    f"Ignoring {feat}; not implemented yet!")
+                continue
            if feat in ["frame", "time"]:
                continue
            shapes.setdefault(feat, []).append(