dclab 0.67.0__cp314-cp314-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dclab might be problematic. Click here for more details.

Files changed (142)
  1. dclab/__init__.py +41 -0
  2. dclab/_version.py +34 -0
  3. dclab/cached.py +97 -0
  4. dclab/cli/__init__.py +10 -0
  5. dclab/cli/common.py +237 -0
  6. dclab/cli/task_compress.py +126 -0
  7. dclab/cli/task_condense.py +223 -0
  8. dclab/cli/task_join.py +229 -0
  9. dclab/cli/task_repack.py +98 -0
  10. dclab/cli/task_split.py +154 -0
  11. dclab/cli/task_tdms2rtdc.py +186 -0
  12. dclab/cli/task_verify_dataset.py +75 -0
  13. dclab/definitions/__init__.py +79 -0
  14. dclab/definitions/feat_const.py +202 -0
  15. dclab/definitions/feat_logic.py +182 -0
  16. dclab/definitions/meta_const.py +252 -0
  17. dclab/definitions/meta_logic.py +111 -0
  18. dclab/definitions/meta_parse.py +94 -0
  19. dclab/downsampling.cpython-314-darwin.so +0 -0
  20. dclab/downsampling.pyx +230 -0
  21. dclab/external/__init__.py +4 -0
  22. dclab/external/packaging/LICENSE +3 -0
  23. dclab/external/packaging/LICENSE.APACHE +177 -0
  24. dclab/external/packaging/LICENSE.BSD +23 -0
  25. dclab/external/packaging/__init__.py +6 -0
  26. dclab/external/packaging/_structures.py +61 -0
  27. dclab/external/packaging/version.py +505 -0
  28. dclab/external/skimage/LICENSE +28 -0
  29. dclab/external/skimage/__init__.py +2 -0
  30. dclab/external/skimage/_find_contours.py +216 -0
  31. dclab/external/skimage/_find_contours_cy.cpython-314-darwin.so +0 -0
  32. dclab/external/skimage/_find_contours_cy.pyx +188 -0
  33. dclab/external/skimage/_pnpoly.cpython-314-darwin.so +0 -0
  34. dclab/external/skimage/_pnpoly.pyx +99 -0
  35. dclab/external/skimage/_shared/__init__.py +1 -0
  36. dclab/external/skimage/_shared/geometry.cpython-314-darwin.so +0 -0
  37. dclab/external/skimage/_shared/geometry.pxd +6 -0
  38. dclab/external/skimage/_shared/geometry.pyx +55 -0
  39. dclab/external/skimage/measure.py +7 -0
  40. dclab/external/skimage/pnpoly.py +53 -0
  41. dclab/external/statsmodels/LICENSE +35 -0
  42. dclab/external/statsmodels/__init__.py +6 -0
  43. dclab/external/statsmodels/nonparametric/__init__.py +1 -0
  44. dclab/external/statsmodels/nonparametric/_kernel_base.py +203 -0
  45. dclab/external/statsmodels/nonparametric/kernel_density.py +165 -0
  46. dclab/external/statsmodels/nonparametric/kernels.py +36 -0
  47. dclab/features/__init__.py +9 -0
  48. dclab/features/bright.py +81 -0
  49. dclab/features/bright_bc.py +93 -0
  50. dclab/features/bright_perc.py +63 -0
  51. dclab/features/contour.py +161 -0
  52. dclab/features/emodulus/__init__.py +339 -0
  53. dclab/features/emodulus/load.py +252 -0
  54. dclab/features/emodulus/lut_HE-2D-FEM-22.txt +16432 -0
  55. dclab/features/emodulus/lut_HE-3D-FEM-22.txt +1276 -0
  56. dclab/features/emodulus/lut_LE-2D-FEM-19.txt +13082 -0
  57. dclab/features/emodulus/pxcorr.py +135 -0
  58. dclab/features/emodulus/scale_linear.py +247 -0
  59. dclab/features/emodulus/viscosity.py +260 -0
  60. dclab/features/fl_crosstalk.py +95 -0
  61. dclab/features/inert_ratio.py +377 -0
  62. dclab/features/volume.py +242 -0
  63. dclab/http_utils.py +322 -0
  64. dclab/isoelastics/__init__.py +468 -0
  65. dclab/isoelastics/iso_HE-2D-FEM-22-area_um-deform.txt +2440 -0
  66. dclab/isoelastics/iso_HE-2D-FEM-22-volume-deform.txt +2635 -0
  67. dclab/isoelastics/iso_HE-3D-FEM-22-area_um-deform.txt +1930 -0
  68. dclab/isoelastics/iso_HE-3D-FEM-22-volume-deform.txt +2221 -0
  69. dclab/isoelastics/iso_LE-2D-FEM-19-area_um-deform.txt +2151 -0
  70. dclab/isoelastics/iso_LE-2D-FEM-19-volume-deform.txt +2250 -0
  71. dclab/isoelastics/iso_LE-2D-ana-18-area_um-deform.txt +1266 -0
  72. dclab/kde/__init__.py +1 -0
  73. dclab/kde/base.py +459 -0
  74. dclab/kde/contours.py +222 -0
  75. dclab/kde/methods.py +313 -0
  76. dclab/kde_contours.py +10 -0
  77. dclab/kde_methods.py +11 -0
  78. dclab/lme4/__init__.py +5 -0
  79. dclab/lme4/lme4_template.R +94 -0
  80. dclab/lme4/rsetup.py +204 -0
  81. dclab/lme4/wrapr.py +386 -0
  82. dclab/polygon_filter.py +398 -0
  83. dclab/rtdc_dataset/__init__.py +15 -0
  84. dclab/rtdc_dataset/check.py +902 -0
  85. dclab/rtdc_dataset/config.py +533 -0
  86. dclab/rtdc_dataset/copier.py +353 -0
  87. dclab/rtdc_dataset/core.py +896 -0
  88. dclab/rtdc_dataset/export.py +867 -0
  89. dclab/rtdc_dataset/feat_anc_core/__init__.py +24 -0
  90. dclab/rtdc_dataset/feat_anc_core/af_basic.py +75 -0
  91. dclab/rtdc_dataset/feat_anc_core/af_emodulus.py +160 -0
  92. dclab/rtdc_dataset/feat_anc_core/af_fl_max_ctc.py +133 -0
  93. dclab/rtdc_dataset/feat_anc_core/af_image_contour.py +113 -0
  94. dclab/rtdc_dataset/feat_anc_core/af_ml_class.py +102 -0
  95. dclab/rtdc_dataset/feat_anc_core/ancillary_feature.py +320 -0
  96. dclab/rtdc_dataset/feat_anc_ml/__init__.py +32 -0
  97. dclab/rtdc_dataset/feat_anc_plugin/__init__.py +3 -0
  98. dclab/rtdc_dataset/feat_anc_plugin/plugin_feature.py +329 -0
  99. dclab/rtdc_dataset/feat_basin.py +762 -0
  100. dclab/rtdc_dataset/feat_temp.py +102 -0
  101. dclab/rtdc_dataset/filter.py +263 -0
  102. dclab/rtdc_dataset/fmt_dcor/__init__.py +7 -0
  103. dclab/rtdc_dataset/fmt_dcor/access_token.py +52 -0
  104. dclab/rtdc_dataset/fmt_dcor/api.py +173 -0
  105. dclab/rtdc_dataset/fmt_dcor/base.py +299 -0
  106. dclab/rtdc_dataset/fmt_dcor/basin.py +73 -0
  107. dclab/rtdc_dataset/fmt_dcor/logs.py +26 -0
  108. dclab/rtdc_dataset/fmt_dcor/tables.py +66 -0
  109. dclab/rtdc_dataset/fmt_dict.py +103 -0
  110. dclab/rtdc_dataset/fmt_hdf5/__init__.py +6 -0
  111. dclab/rtdc_dataset/fmt_hdf5/base.py +192 -0
  112. dclab/rtdc_dataset/fmt_hdf5/basin.py +30 -0
  113. dclab/rtdc_dataset/fmt_hdf5/events.py +276 -0
  114. dclab/rtdc_dataset/fmt_hdf5/feat_defect.py +164 -0
  115. dclab/rtdc_dataset/fmt_hdf5/logs.py +33 -0
  116. dclab/rtdc_dataset/fmt_hdf5/tables.py +60 -0
  117. dclab/rtdc_dataset/fmt_hierarchy/__init__.py +11 -0
  118. dclab/rtdc_dataset/fmt_hierarchy/base.py +278 -0
  119. dclab/rtdc_dataset/fmt_hierarchy/events.py +146 -0
  120. dclab/rtdc_dataset/fmt_hierarchy/hfilter.py +140 -0
  121. dclab/rtdc_dataset/fmt_hierarchy/mapper.py +134 -0
  122. dclab/rtdc_dataset/fmt_http.py +102 -0
  123. dclab/rtdc_dataset/fmt_s3.py +354 -0
  124. dclab/rtdc_dataset/fmt_tdms/__init__.py +476 -0
  125. dclab/rtdc_dataset/fmt_tdms/event_contour.py +264 -0
  126. dclab/rtdc_dataset/fmt_tdms/event_image.py +220 -0
  127. dclab/rtdc_dataset/fmt_tdms/event_mask.py +62 -0
  128. dclab/rtdc_dataset/fmt_tdms/event_trace.py +146 -0
  129. dclab/rtdc_dataset/fmt_tdms/exc.py +37 -0
  130. dclab/rtdc_dataset/fmt_tdms/naming.py +151 -0
  131. dclab/rtdc_dataset/load.py +77 -0
  132. dclab/rtdc_dataset/meta_table.py +25 -0
  133. dclab/rtdc_dataset/writer.py +1019 -0
  134. dclab/statistics.py +226 -0
  135. dclab/util.py +176 -0
  136. dclab/warn.py +15 -0
  137. dclab-0.67.0.dist-info/METADATA +153 -0
  138. dclab-0.67.0.dist-info/RECORD +142 -0
  139. dclab-0.67.0.dist-info/WHEEL +6 -0
  140. dclab-0.67.0.dist-info/entry_points.txt +8 -0
  141. dclab-0.67.0.dist-info/licenses/LICENSE +283 -0
  142. dclab-0.67.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,762 @@
1
+ """
2
+ With basins, you can create analysis pipelines that result in output files
3
+ which, when opened in dclab, can access features stored in the input file
4
+ (without having to write those features to the output file).
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import abc
9
+ import logging
10
+ import numbers
11
+ import threading
12
+ from typing import Callable, Dict, List, Literal, Union
13
+ import uuid
14
+ import warnings
15
+ import weakref
16
+
17
+ import numpy as np
18
+
19
+ from ..util import copy_if_needed
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class BasinFeatureMissingWarning(UserWarning):
    """Used when a basin feature is defined but not stored"""
27
+
28
+
29
class CyclicBasinDependencyFoundWarning(UserWarning):
    """Used when a basin is defined in one of its sub-basins (cycle)"""
31
+
32
+
33
class IgnoringPerishableBasinTTL(UserWarning):
    """Used when refreshing a basin that does not support a TTL (time to live)"""
35
+
36
+
37
class BasinmapFeatureMissingError(KeyError):
    """Raised when one of the `basinmap` features is not defined"""
40
+
41
+
42
class BasinNotAvailableError(BaseException):
    """Raised in situations where the basin data is not available

    NOTE(review): derives from `BaseException` rather than `Exception`,
    presumably so that broad ``except Exception`` handlers do not
    silently swallow it — confirm before changing the base class.
    """
45
+
46
+
47
class BasinAvailabilityChecker(threading.Thread):
    """Daemon thread that performs the basin availability check

    Running :func:`Basin.is_available` in the background avoids
    blocking the caller while the check takes place.
    """

    def __init__(self, basin, *args, **kwargs):
        super().__init__(*args, daemon=True, **kwargs)
        self.basin = basin

    def run(self):
        # The call itself populates/uses whatever caching the basin does;
        # the return value is intentionally discarded here.
        self.basin.is_available()
56
+
57
+
58
class PerishableRecord:
    """Bookkeeping for perishable basins

    A perishable basin may stop working after some condition is met,
    e.g. a presigned S3 URL that expires after a given time. This
    record can tell whether the basin has perished and, if possible,
    refresh it (make it available again).
    """
    def __init__(self,
                 basin,
                 expiration_func: Callable = None,
                 expiration_kwargs: Dict = None,
                 refresh_func: Callable = None,
                 refresh_kwargs: Dict = None,
                 ):
        """
        Parameters
        ----------
        basin: Basin
            Instance of the perishable basin; stored as a weak proxy so
            this record does not keep the basin alive.
        expiration_func: callable
            Fast predicate deciding whether the basin has perished. It
            receives `basin` as its first argument and is called on
            every feature access. If you implement this in the time
            domain, use `time.time()` (an absolute measure) rather than
            `time.monotonic()`, which does not advance during system
            sleep. If a remote machine dictates the expiration time, it
            should also transmit the creation time to account for
            clock offsets.
        expiration_kwargs: dict
            Additional kwargs for `expiration_func`.
        refresh_func: callable
            Function used to refresh the `basin`; receives `basin` as
            its first argument.
        refresh_kwargs: dict
            Additional kwargs for `refresh_func`
        """
        # Store only a weak proxy to avoid reference cycles.
        self.basin = (basin if isinstance(basin, weakref.ProxyType)
                      else weakref.proxy(basin))
        self.expiration_func = expiration_func
        self.expiration_kwargs = expiration_kwargs or {}
        self.refresh_func = refresh_func
        self.refresh_kwargs = refresh_kwargs or {}

    def __repr__(self):
        if self.perished():
            state = "perished"
        else:
            state = "valid"
        return f"<PerishableRecord ({state}) at {hex(id(self))}>"

    def perished(self) -> Union[bool, None]:
        """Determine whether the basin has perished

        Returns
        -------
        state: bool or None
            True means the basin has perished, False means it has not,
            and None means we cannot tell (no `expiration_func` given).
        """
        if self.expiration_func is None:
            return None
        return self.expiration_func(self.basin, **self.expiration_kwargs)

    def refresh(self, extend_by: float = None) -> None:
        """Extend the lifetime of the associated perishable basin

        Parameters
        ----------
        extend_by: float
            Custom argument for extending the life of the basin,
            normally a lifetime. Only honored when the basin source
            supports it (i.e. "extend_by" is present in
            `refresh_kwargs`); otherwise a warning is emitted and the
            value is dropped.
        """
        if self.refresh_func is None:
            # Perishable, but there is no way to revive it.
            logger.error(f"Cannot refresh basin '{self.basin}'")
            return

        supports_ttl = "extend_by" in self.refresh_kwargs
        if extend_by and not supports_ttl:
            warnings.warn(
                "Parameter 'extend_by' ignored, because the basin "
                "source does not support it",
                IgnoringPerishableBasinTTL)
            extend_by = None

        refresh_kwargs = dict(self.refresh_kwargs)
        if extend_by is not None:
            refresh_kwargs["extend_by"] = extend_by

        self.refresh_func(self.basin, **refresh_kwargs)
        logger.info(f"Refreshed basin '{self.basin}'")

        # Drop the stale dataset so the next access reloads it.
        if self.basin._ds is not None:
            self.basin._ds.close()
            self.basin._ds = None
164
+
165
+
166
class Basin(abc.ABC):
    """A basin represents data from an external source

    The external data must be a valid RT-DC dataset, subclasses
    should ensure that the corresponding API is available.
    """
    # Registry keyed by basin format. Not referenced in this module;
    # presumably populated and consumed elsewhere in the package —
    # verify against the other rtdc_dataset modules.
    id_getters = {}

    def __init__(self,
                 location: str,
                 name: str = None,
                 description: str = None,
                 features: List[str] = None,
                 referrer_identifier: str = None,
                 basin_identifier: str = None,
                 mapping: Literal["same",
                                  "basinmap0",
                                  "basinmap1",
                                  "basinmap2",
                                  "basinmap3",
                                  "basinmap4",
                                  "basinmap5",
                                  "basinmap6",
                                  "basinmap7",
                                  "basinmap8",
                                  "basinmap9",
                                  ] = "same",
                 mapping_referrer: Dict = None,
                 ignored_basins: List[str] = None,
                 key: str = None,
                 perishable=False,
                 **kwargs):
        """

        Parameters
        ----------
        location: str
            Location of the basin, this can be a path or a URL, depending
            on the implementation of the subclass
        name: str
            Human-readable name of the basin
        description: str
            Lengthy description of the basin
        features: list of str
            List of features this basin provides; This list is enforced,
            even if the basin actually contains more features.
        referrer_identifier: str
            A measurement identifier against which to check the basin.
            If the basin mapping is "same", then this must match the
            identifier of the basin exactly, otherwise it must start
            with the basin identifier (e.g. "basin-id_referrer-sub-id").
            If this is set to None (default), there is no certainty
            that the downstream dataset is from the same measurement.
        basin_identifier: str
            A measurement identifier that must match the basin exactly.
            In contrast to `referrer_identifier`, the basin identifier is
            the identifier of the basin file. If `basin_identifier` is
            specified, the identifier of the basin must be identical to it.
        mapping: str
            Which type of mapping to use. This can be either "same"
            when the event list of the basin is identical to that
            of the dataset defining the basin, or one of the "basinmap"
            features (e.g. "basinmap1") in cases where the dataset consists
            of a subset of the events of the basin dataset. In the latter
            case, the feature defined by `mapping` must be present in the
            dataset and consist of integer-valued indices (starting at 0)
            for the basin dataset.
        mapping_referrer: dict-like
            Dict-like object from which "basinmap" features can be obtained
            in situations where `mapping != "same"`. This can be a simple
            dictionary of numpy arrays or e.g. an instance of
            :class:`.RTDCBase`.
        ignored_basins: list of str
            List of basins to ignore in subsequent basin instantiations
        key: str
            Unique key to identify this basin; normally computed from
            a JSON dump of the basin definition. A random string is used
            if None is specified.
        perishable: bool or PerishableRecord
            If this is not False, then it must be a :class:`.PerishableRecord`
            that holds the information about the expiration time, and that
            comes with a method `refresh` to extend the lifetime of the basin.
        kwargs:
            Additional keyword arguments passed to the `load_dataset`
            method of the `Basin` subclass.

        .. versionchanged: 0.58.0

            Added the `mapping` keyword argument to support basins
            with a superset of events.
        """
        #: location of the basin (e.g. path or URL)
        self.location = location
        #: user-defined name of the basin
        self.name = name
        #: lengthy description of the basin
        self.description = description
        # perishable record
        if isinstance(perishable, bool) and perishable:
            # Create an empty perishable record
            perishable = PerishableRecord(self)
        self.perishable = perishable
        # define key of the basin
        self.key = key or str(uuid.uuid4())
        # features this basin provides
        self._features = features
        #: measurement identifier of the referencing dataset
        self.referrer_identifier = referrer_identifier
        # identifier the basin itself must match exactly (may be None)
        self.basin_identifier = basin_identifier or None
        # tri-state cache for the identifier check in `verify_basin`:
        # None (not yet checked), True (verified), False (mismatch)
        self._identifiers_verification = None
        #: ignored basins
        self.ignored_basins = ignored_basins or []
        #: additional keyword arguments passed to the basin
        self.kwargs = kwargs
        #: Event mapping strategy. If this is "same", it means that the
        #: referring dataset and the basin dataset have identical event
        #: indices. If `mapping` is e.g. `basinmap1` then the mapping of the
        #: indices from the basin to the referring dataset is defined in
        #: `self.basinmap` (copied during initialization of this class from
        #: the array in the key `basinmap1` from the dict-like object
        #: `mapping_referrer`).
        self.mapping = mapping or "same"
        self._basinmap = None  # see `basinmap` property
        # Create a weakref to the original referrer: If it is an instance
        # of RTDCBase, then garbage collection can clean up properly and
        # the basin instance has no reason to exist without the referrer.
        if self.mapping != "same":
            self._basinmap_referrer = weakref.ref(mapping_referrer)
        else:
            self._basinmap_referrer = None
        # lazily-populated dataset instance (see the `ds` property)
        self._ds = None
        # perform availability check in separate thread
        # NOTE(review): this lock is never acquired in this module;
        # presumably reserved for subclasses — confirm.
        self._av_check_lock = threading.Lock()
        self._av_check = BasinAvailabilityChecker(self)
        self._av_check.start()

    def __repr__(self):
        try:
            feature_info = len(self.features)
        except BaseException:
            # accessing `features` may trigger availability checks or
            # dataset loading, either of which can fail
            feature_info = "unknown"
        options = [
            self.name,
            f"mapped {self.mapping}" if self.mapping != "same" else "",
            f"{feature_info} features",
            f"location {self.location}",
        ]
        opt_str = ", ".join([o for o in options if o])

        return f"<{self.__class__.__name__} ({opt_str}) at {hex(id(self))}>"

    def _assert_referrer_identifier(self):
        """Make sure the basin matches the measurement identifier

        Raises
        ------
        KeyError
            If the basin's measurement identifier does not match
            `self.referrer_identifier`
        """
        if not self.verify_basin(run_identifier=True):
            raise KeyError(f"Measurement identifier of basin {self.ds} "
                           f"({self.get_measurement_identifier()}) does "
                           f"not match {self.referrer_identifier}!")

    @property
    def basinmap(self):
        """Contains the indexing array in case of a mapped basin"""
        if self._basinmap is None:
            if self.mapping != "same":
                try:
                    # Fetch the mapping feature from the referrer.
                    # NOTE(review): RecursionError is presumably raised
                    # when basins cyclically reference each other while
                    # resolving this feature — confirm.
                    basinmap = self._basinmap_referrer()[self.mapping]
                except (KeyError, RecursionError):
                    raise BasinmapFeatureMissingError(
                        f"Could not find the feature '{self.mapping}' in the "
                        f"dataset or any of its basins. This suggests that "
                        f"this feature was never saved anywhere. Please check "
                        f"the input files.")
                #: `basinmap` is an integer array that maps the events from the
                #: basin to the events of the referring dataset.
                self._basinmap = np.array(basinmap,
                                          dtype=np.uint64,
                                          copy=True)
            else:
                self._basinmap = None
        return self._basinmap

    @property
    @abc.abstractmethod
    def basin_format(self):
        """Basin format (:class:`.RTDCBase` subclass), e.g. "hdf5" or "s3"
        """
        # to be implemented in subclasses

    @property
    @abc.abstractmethod
    def basin_type(self):
        """Storage type to use (e.g. "file" or "remote")"""
        # to be implemented in subclasses

    @property
    def ds(self):
        """The :class:`.RTDCBase` instance represented by the basin

        Raises
        ------
        BasinNotAvailableError
            If the basin is not available
        """
        if self.perishable and self.perishable.perished():
            # We have perished. Ask the PerishableRecord to refresh this
            # basin so we can access it again.
            self.perishable.refresh()
        if self._ds is None:
            if not self.is_available():
                raise BasinNotAvailableError(f"Basin {self} is not available!")
            self._ds = self.load_dataset(self.location, **self.kwargs)
            self._ds.ignore_basins(self.ignored_basins)
        return self._ds

    @property
    def features(self):
        """Features made available by the basin

        .. versionchanged: 0.56.0

            Return nested basin features
        """
        if self._features is None:
            if self.is_available():
                # If features are not specified already, either by previous
                # call to this method or during initialization from basin
                # definition, then make the innate and *all* the basin
                # features available.
                self._features = sorted(set(self.ds.features_innate
                                            + self.ds.features_basin))
            else:
                self._features = []
        return self._features

    def as_dict(self):
        """Return basin kwargs for :func:`RTDCWriter.store_basin`

        Note that each subclass of :class:`.RTDCBase` has its own
        implementation of :func:`.RTDCBase.basins_get_dicts` which
        returns a list of basin dictionaries that are used to
        instantiate the basins in :func:`RTDCBase.basins_enable`.
        This method here is only intended for usage with
        :func:`RTDCWriter.store_basin`.
        """
        return {
            "basin_name": self.name,
            "basin_type": self.basin_type,
            "basin_format": self.basin_format,
            "basin_locs": [self.location],
            "basin_descr": self.description,
            "basin_feats": self.features,
            "basin_map": self.basinmap,
            "perishable": bool(self.perishable),
        }

    def close(self):
        """Close any open file handles or connections"""
        if self._ds is not None:
            self._ds.close()
        # Give the daemon availability-check thread a short time to
        # finish; it is fine if it does not.
        self._av_check.join(0.5)

    def get_feature_data(self, feat):
        """Return an object representing feature data of the basin

        Raises
        ------
        KeyError
            If the basin does not pass the referrer-identifier check
            (see :func:`Basin._assert_referrer_identifier`)
        """
        self._assert_referrer_identifier()
        return self.ds[feat]

    def get_measurement_identifier(self):
        """Return the identifier of the basin dataset"""
        return self.ds.get_measurement_identifier()

    @abc.abstractmethod
    def is_available(self):
        """Return True if the basin is available"""

    @abc.abstractmethod
    def _load_dataset(self, location, **kwargs):
        """Subclasses should return an instance of :class:`.RTDCBase`"""

    def load_dataset(self, location, **kwargs):
        """Return an instance of :class:`.RTDCBase` for this basin

        If the basin mapping (`self.mapping`) is not the same as
        the referencing dataset, the returned dataset is wrapped
        in a :class:`BasinProxy` that maps event indices via
        `self.basinmap`.
        """
        ds = self._load_dataset(location, **kwargs)
        if self.mapping != "same":
            # The array `self.basinmap` may contain duplicate elements,
            # which is why we cannot use hierarchy children to access the
            # data (sometimes the data must be blown-up rather than gated).
            ds_bn = BasinProxy(ds=ds, basinmap=self.basinmap)
        else:
            ds_bn = ds
        return ds_bn

    def verify_basin(self, run_identifier=True, availability=True):
        """Check whether the basin is valid for the referring dataset

        Parameters
        ----------
        run_identifier: bool
            Whether to verify the basin's measurement identifier
            against `self.referrer_identifier` (result is cached in
            `self._identifiers_verification`)
        availability: bool
            Whether to check availability; passing False is deprecated

        Returns
        -------
        valid: bool
            True if all requested checks passed
        """
        if not availability:
            warnings.warn("The keyword argument 'availability' is "
                          "deprecated, because it can lead to long waiting "
                          "times with many unavailable basins.",
                          DeprecationWarning)
        if availability:
            check_avail = self.is_available()
        else:
            check_avail = True

        # Only check for run identifier if requested and if the availability
        # check did not fail.
        if run_identifier and check_avail:
            if self._identifiers_verification is None:
                # This is the measurement identifier of the basin.
                basin_identifier = self.get_measurement_identifier()

                # Perform a sanity check for the basin identifier.
                if (self.basin_identifier
                        and self.basin_identifier != basin_identifier):
                    # We should not proceed any further with this basin.
                    self._identifiers_verification = False
                    warnings.warn(
                        f"Basin identifier mismatch for {self}. Expected "
                        f"'{self.basin_identifier}', got '{basin_identifier}'")

                # NOTE(review): this branch runs even after the mismatch
                # above and overwrites `self._identifiers_verification`;
                # confirm whether the mismatch case should short-circuit.
                if self.referrer_identifier is None:
                    # No measurement identifier was presented by the
                    # referencing dataset. We are in the dark.
                    # Don't perform any checks.
                    self._identifiers_verification = True
                else:
                    if basin_identifier is None:
                        # Again, we are in the dark, because the basin dataset
                        # does not have an identifier. This is an undesirable
                        # situation, but there is nothing we can do about it.
                        self._identifiers_verification = True
                    else:
                        if self.mapping == "same":
                            # When we have identical mapping, then the
                            # measurement identifier has to match exactly.
                            verifier = str.__eq__
                        else:
                            # When we have non-identical mapping (e.g. exported
                            # data), then the measurement identifier has to
                            # partially match.
                            verifier = str.startswith
                        self._identifiers_verification = verifier(
                            self.referrer_identifier, basin_identifier)

            check_rid = self._identifiers_verification
        else:
            check_rid = True

        return check_rid and check_avail
510
+
511
+
512
class BasinProxy:
    def __init__(self, ds, basinmap):
        """Proxy for accessing data in basin datasets

        A basin proxy gives access to the data of an :class:`.RTDCBase`
        that is *mapped*: the indices defined for the basin do not
        coincide with the indices in the downstream dataset.

        Two situations are covered:

        1. Subset indexing: every event in the downstream dataset has
           exactly one corresponding event in the basin dataset (this
           could also be achieved via hierarchy children,
           :class:`RTDCHierarchy`).
        2. Blown indexing: several downstream events refer to the same
           basin event, i.e. the basin holds fewer events than the
           downstream dataset (e.g. a raw image series from which
           multiple events per frame were extracted).

        Parameters
        ----------
        ds: RTDCBase
            the basin dataset
        basinmap: np.ndarray
            1D integer indexing array that maps the events of the basin
            dataset to the downstream dataset
        """
        self.ds = ds
        self.basins_get_dicts = ds.basins_get_dicts
        self.basinmap = basinmap
        # lazily-populated cache of wrapped feature objects
        self._features = {}

    def __contains__(self, item):
        return item in self.ds

    def __getattr__(self, item):
        # Forward only a fixed allow-list of attributes to the wrapped
        # dataset; everything else is explicitly unsupported.
        forwarded = (
            "basins",
            "close",
            "features",
            "features_ancillary",
            "features_basin",
            "features_innate",
            "features_loaded",
            "features_local",
            "features_scalar",
            "get_measurement_identifier",
            "ignore_basins",
        )
        if item not in forwarded:
            raise AttributeError(
                f"BasinProxy does not implement {item}")
        return getattr(self.ds, item)

    def __getitem__(self, feat):
        try:
            return self._features[feat]
        except KeyError:
            # Contours are ragged and get their own (uncached) wrapper.
            wrapper_cls = (BasinProxyContour if feat == "contour"
                           else BasinProxyFeature)
            feat_obj = wrapper_cls(feat_obj=self.ds[feat],
                                   basinmap=self.basinmap)
            self._features[feat] = feat_obj
            return feat_obj

    def __len__(self):
        return len(self.basinmap)
580
+
581
+
582
class BasinProxyContour:
    def __init__(self, feat_obj, basinmap):
        """Wrap a contour feature, resolving mapped indices on access

        Contours are ragged (variable number of points per event), so
        no caching takes place and only integer indexing is supported.
        """
        self.feat_obj = feat_obj
        self.basinmap = basinmap
        self.is_scalar = False
        # middle dimension is ragged, hence np.nan
        self.shape = (len(basinmap), np.nan, 2)
        self.identifier = feat_obj.identifier

    def __getattr__(self, item):
        # only `dtype` is forwarded to the wrapped feature object
        if item == "dtype":
            return getattr(self.feat_obj, item)
        raise AttributeError(
            f"BasinProxyContour does not implement {item}")

    def __getitem__(self, index):
        if not isinstance(index, numbers.Integral):
            raise NotImplementedError(
                "Cannot index contours without anything else than integers.")
        # single index, cheap operation
        return self.feat_obj[self.basinmap[index]]

    def __len__(self):
        return self.shape[0]
610
+
611
+
612
class BasinProxyFeature(np.lib.mixins.NDArrayOperatorsMixin):
    def __init__(self, feat_obj, basinmap):
        """Wrap around a feature object, mapping it upon data access

        Parameters
        ----------
        feat_obj:
            Feature object of the basin dataset; must support
            `__getitem__`, `shape`, and `dtype`
        basinmap: np.ndarray
            1D integer array mapping events of the basin dataset to
            the downstream dataset
        """
        self.feat_obj = feat_obj
        self.basinmap = basinmap
        self._cache = None  # cached mapped data (scalar features only)
        self._shape = None
        self._size = None
        # scalar features store one value per event (1D storage)
        self.is_scalar = bool(len(self.feat_obj.shape) == 1)

    @property
    def shape(self):
        """Shape of the mapped feature data"""
        if self._shape is None:
            if self.is_scalar:
                self._shape = self.basinmap.shape
            else:
                self._shape = (self.basinmap.size,) + self.feat_obj.shape[1:]
        return self._shape

    @property
    def size(self):
        """Total number of elements of the mapped feature data"""
        if self._size is None:
            self._size = np.prod(self.shape)
        return self._size

    def __array__(self, dtype=None, copy=copy_if_needed, *args, **kwargs):
        """Return the mapped feature data as a numpy array

        Scalar features are fancy-indexed once and cached; non-scalar
        features (image, mask, etc.) are assembled event-by-event on
        every call and never cached (memory usage).
        """
        if not self.is_scalar:
            # This is dangerous territory in terms of memory usage,
            # which is why the result is not cached.
            out_arr = np.empty((len(self.basinmap),) + self.feat_obj.shape[1:],
                               dtype=dtype or self.feat_obj.dtype,
                               *args, **kwargs)
            for ii, idx in enumerate(self.basinmap):
                out_arr[ii] = self.feat_obj[idx]
            return out_arr
        # Fix: previously, once `_cache` was populated, scalar features
        # were routed into the per-event loop above on every call and
        # the cache was never reused; the requested `dtype` was also
        # ignored on the cached path.
        if self._cache is None:
            self._cache = self.feat_obj[:][self.basinmap]
        return np.array(self._cache, dtype=dtype, copy=copy)

    def __getattr__(self, item):
        # only `dtype` is forwarded to the wrapped feature object
        if item in [
            "dtype",
        ]:
            return getattr(self.feat_obj, item)
        else:
            raise AttributeError(
                f"BasinProxyFeature does not implement {item}")

    def __getitem__(self, index):
        if self._cache is None and isinstance(index, numbers.Integral):
            # single index, cheap operation
            return self.feat_obj[self.basinmap[index]]
        elif not self.is_scalar:
            # image, mask, etc
            if isinstance(index, slice) and index == slice(None):
                indices = self.basinmap
            else:
                indices = self.basinmap[index]
            out_arr = np.empty((len(indices),) + self.feat_obj.shape[1:],
                               dtype=self.feat_obj.dtype)
            for ii, idx in enumerate(indices):
                out_arr[ii] = self.feat_obj[idx]
            return out_arr
        else:
            # sets the cache if not already set
            return self.__array__()[index]

    def __len__(self):
        return len(self.basinmap)
680
+
681
+
682
def basin_priority_sorted_key(bdict: Dict):
    """Yield a sorting value for a given basin that can be used with `sorted`

    Basins are normally stored in random order in a dataset. This method
    brings them into correct order, prioritizing:

    - type: "file" over "remote"
    - format: "HTTP" over "S3" over "dcor"
    - mapping: "same" over anything else
    """
    type_rank = {
        "internal": "a",
        "file": "b",
        "remote": "c",
    }
    format_rank = {
        "h5dataset": "a",
        "hdf5": "b",
        "http": "c",
        "s3": "d",
        "dcor": "e",
    }
    # unknown types/formats sort last ("z")
    part_type = type_rank.get(bdict.get("type"), "z")
    part_format = format_rank.get(bdict.get("format"), "z")

    # old dicts don't have "mapping"
    mapping = bdict.get("mapping", "same")
    part_map = "a" if mapping == "same" else mapping

    return part_type + part_format + part_map
710
+
711
+
712
class InternalH5DatasetBasin(Basin):
    """Basin whose features live in an HDF5 group of the referrer's file

    The features are read from `self.location` inside the HDF5 file of
    the referring dataset (accessed via `mapping_referrer.h5file`).
    """
    basin_format = "h5dataset"
    basin_type = "internal"

    def __init__(self, *args, **kwargs):
        super(InternalH5DatasetBasin, self).__init__(*args, **kwargs)
        # Internal basins only make sense with a non-trivial mapping.
        if self.mapping == "same":
            raise ValueError(
                "'internal' basins must be instantiated with `mapping`. "
                "If you are not doing that, then you probably don't need "
                "them.")
        if self._features is None:
            raise ValueError("You must specify features when defining "
                             "internal basins.")
        # Redefine the features if necessary: keep only those actually
        # present in the HDF5 file, warning about any that are missing.
        h5root = self._basinmap_referrer().h5file
        available_features = []
        for feat in self._features:
            if self.location in h5root and feat in h5root[self.location]:
                available_features.append(feat)
            else:
                warnings.warn(
                    f"Feature '{feat}' is defined as an internal basin, "
                    f"but it cannot be found in '{self.location}'.",
                    BasinFeatureMissingWarning)
        # mutate the list in-place so external references stay valid
        self._features.clear()
        self._features += available_features

    def _load_dataset(self, location, **kwargs):
        """Return an :class:`RTDC_Dict` backed by the internal HDF5 group"""
        from .fmt_dict import RTDC_Dict
        # get the h5file object
        h5root = self._basinmap_referrer().h5file
        ds_dict = {}
        for feat in self.features:
            ds_dict[feat] = h5root[self.location][feat]
        return RTDC_Dict(ds_dict)

    def is_available(self):
        """Internal basins are available as long as features are defined"""
        return bool(self._features)

    def verify_basin(self, *args, **kwargs):
        """It's not necessary to verify internal basins"""
        return True
755
+
756
+
757
def get_basin_classes():
    """Return a mapping of basin format name to :class:`Basin` subclass"""
    return {b_cls.basin_format: b_cls
            for b_cls in Basin.__subclasses__()
            if hasattr(b_cls, "basin_format")}