dclab 0.67.0__cp314-cp314t-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dclab might be problematic. Click here for more details.

Files changed (142) hide show
  1. dclab/__init__.py +41 -0
  2. dclab/_version.py +34 -0
  3. dclab/cached.py +97 -0
  4. dclab/cli/__init__.py +10 -0
  5. dclab/cli/common.py +237 -0
  6. dclab/cli/task_compress.py +126 -0
  7. dclab/cli/task_condense.py +223 -0
  8. dclab/cli/task_join.py +229 -0
  9. dclab/cli/task_repack.py +98 -0
  10. dclab/cli/task_split.py +154 -0
  11. dclab/cli/task_tdms2rtdc.py +186 -0
  12. dclab/cli/task_verify_dataset.py +75 -0
  13. dclab/definitions/__init__.py +79 -0
  14. dclab/definitions/feat_const.py +202 -0
  15. dclab/definitions/feat_logic.py +182 -0
  16. dclab/definitions/meta_const.py +252 -0
  17. dclab/definitions/meta_logic.py +111 -0
  18. dclab/definitions/meta_parse.py +94 -0
  19. dclab/downsampling.cpython-314t-darwin.so +0 -0
  20. dclab/downsampling.pyx +230 -0
  21. dclab/external/__init__.py +4 -0
  22. dclab/external/packaging/LICENSE +3 -0
  23. dclab/external/packaging/LICENSE.APACHE +177 -0
  24. dclab/external/packaging/LICENSE.BSD +23 -0
  25. dclab/external/packaging/__init__.py +6 -0
  26. dclab/external/packaging/_structures.py +61 -0
  27. dclab/external/packaging/version.py +505 -0
  28. dclab/external/skimage/LICENSE +28 -0
  29. dclab/external/skimage/__init__.py +2 -0
  30. dclab/external/skimage/_find_contours.py +216 -0
  31. dclab/external/skimage/_find_contours_cy.cpython-314t-darwin.so +0 -0
  32. dclab/external/skimage/_find_contours_cy.pyx +188 -0
  33. dclab/external/skimage/_pnpoly.cpython-314t-darwin.so +0 -0
  34. dclab/external/skimage/_pnpoly.pyx +99 -0
  35. dclab/external/skimage/_shared/__init__.py +1 -0
  36. dclab/external/skimage/_shared/geometry.cpython-314t-darwin.so +0 -0
  37. dclab/external/skimage/_shared/geometry.pxd +6 -0
  38. dclab/external/skimage/_shared/geometry.pyx +55 -0
  39. dclab/external/skimage/measure.py +7 -0
  40. dclab/external/skimage/pnpoly.py +53 -0
  41. dclab/external/statsmodels/LICENSE +35 -0
  42. dclab/external/statsmodels/__init__.py +6 -0
  43. dclab/external/statsmodels/nonparametric/__init__.py +1 -0
  44. dclab/external/statsmodels/nonparametric/_kernel_base.py +203 -0
  45. dclab/external/statsmodels/nonparametric/kernel_density.py +165 -0
  46. dclab/external/statsmodels/nonparametric/kernels.py +36 -0
  47. dclab/features/__init__.py +9 -0
  48. dclab/features/bright.py +81 -0
  49. dclab/features/bright_bc.py +93 -0
  50. dclab/features/bright_perc.py +63 -0
  51. dclab/features/contour.py +161 -0
  52. dclab/features/emodulus/__init__.py +339 -0
  53. dclab/features/emodulus/load.py +252 -0
  54. dclab/features/emodulus/lut_HE-2D-FEM-22.txt +16432 -0
  55. dclab/features/emodulus/lut_HE-3D-FEM-22.txt +1276 -0
  56. dclab/features/emodulus/lut_LE-2D-FEM-19.txt +13082 -0
  57. dclab/features/emodulus/pxcorr.py +135 -0
  58. dclab/features/emodulus/scale_linear.py +247 -0
  59. dclab/features/emodulus/viscosity.py +260 -0
  60. dclab/features/fl_crosstalk.py +95 -0
  61. dclab/features/inert_ratio.py +377 -0
  62. dclab/features/volume.py +242 -0
  63. dclab/http_utils.py +322 -0
  64. dclab/isoelastics/__init__.py +468 -0
  65. dclab/isoelastics/iso_HE-2D-FEM-22-area_um-deform.txt +2440 -0
  66. dclab/isoelastics/iso_HE-2D-FEM-22-volume-deform.txt +2635 -0
  67. dclab/isoelastics/iso_HE-3D-FEM-22-area_um-deform.txt +1930 -0
  68. dclab/isoelastics/iso_HE-3D-FEM-22-volume-deform.txt +2221 -0
  69. dclab/isoelastics/iso_LE-2D-FEM-19-area_um-deform.txt +2151 -0
  70. dclab/isoelastics/iso_LE-2D-FEM-19-volume-deform.txt +2250 -0
  71. dclab/isoelastics/iso_LE-2D-ana-18-area_um-deform.txt +1266 -0
  72. dclab/kde/__init__.py +1 -0
  73. dclab/kde/base.py +459 -0
  74. dclab/kde/contours.py +222 -0
  75. dclab/kde/methods.py +313 -0
  76. dclab/kde_contours.py +10 -0
  77. dclab/kde_methods.py +11 -0
  78. dclab/lme4/__init__.py +5 -0
  79. dclab/lme4/lme4_template.R +94 -0
  80. dclab/lme4/rsetup.py +204 -0
  81. dclab/lme4/wrapr.py +386 -0
  82. dclab/polygon_filter.py +398 -0
  83. dclab/rtdc_dataset/__init__.py +15 -0
  84. dclab/rtdc_dataset/check.py +902 -0
  85. dclab/rtdc_dataset/config.py +533 -0
  86. dclab/rtdc_dataset/copier.py +353 -0
  87. dclab/rtdc_dataset/core.py +896 -0
  88. dclab/rtdc_dataset/export.py +867 -0
  89. dclab/rtdc_dataset/feat_anc_core/__init__.py +24 -0
  90. dclab/rtdc_dataset/feat_anc_core/af_basic.py +75 -0
  91. dclab/rtdc_dataset/feat_anc_core/af_emodulus.py +160 -0
  92. dclab/rtdc_dataset/feat_anc_core/af_fl_max_ctc.py +133 -0
  93. dclab/rtdc_dataset/feat_anc_core/af_image_contour.py +113 -0
  94. dclab/rtdc_dataset/feat_anc_core/af_ml_class.py +102 -0
  95. dclab/rtdc_dataset/feat_anc_core/ancillary_feature.py +320 -0
  96. dclab/rtdc_dataset/feat_anc_ml/__init__.py +32 -0
  97. dclab/rtdc_dataset/feat_anc_plugin/__init__.py +3 -0
  98. dclab/rtdc_dataset/feat_anc_plugin/plugin_feature.py +329 -0
  99. dclab/rtdc_dataset/feat_basin.py +762 -0
  100. dclab/rtdc_dataset/feat_temp.py +102 -0
  101. dclab/rtdc_dataset/filter.py +263 -0
  102. dclab/rtdc_dataset/fmt_dcor/__init__.py +7 -0
  103. dclab/rtdc_dataset/fmt_dcor/access_token.py +52 -0
  104. dclab/rtdc_dataset/fmt_dcor/api.py +173 -0
  105. dclab/rtdc_dataset/fmt_dcor/base.py +299 -0
  106. dclab/rtdc_dataset/fmt_dcor/basin.py +73 -0
  107. dclab/rtdc_dataset/fmt_dcor/logs.py +26 -0
  108. dclab/rtdc_dataset/fmt_dcor/tables.py +66 -0
  109. dclab/rtdc_dataset/fmt_dict.py +103 -0
  110. dclab/rtdc_dataset/fmt_hdf5/__init__.py +6 -0
  111. dclab/rtdc_dataset/fmt_hdf5/base.py +192 -0
  112. dclab/rtdc_dataset/fmt_hdf5/basin.py +30 -0
  113. dclab/rtdc_dataset/fmt_hdf5/events.py +276 -0
  114. dclab/rtdc_dataset/fmt_hdf5/feat_defect.py +164 -0
  115. dclab/rtdc_dataset/fmt_hdf5/logs.py +33 -0
  116. dclab/rtdc_dataset/fmt_hdf5/tables.py +60 -0
  117. dclab/rtdc_dataset/fmt_hierarchy/__init__.py +11 -0
  118. dclab/rtdc_dataset/fmt_hierarchy/base.py +278 -0
  119. dclab/rtdc_dataset/fmt_hierarchy/events.py +146 -0
  120. dclab/rtdc_dataset/fmt_hierarchy/hfilter.py +140 -0
  121. dclab/rtdc_dataset/fmt_hierarchy/mapper.py +134 -0
  122. dclab/rtdc_dataset/fmt_http.py +102 -0
  123. dclab/rtdc_dataset/fmt_s3.py +354 -0
  124. dclab/rtdc_dataset/fmt_tdms/__init__.py +476 -0
  125. dclab/rtdc_dataset/fmt_tdms/event_contour.py +264 -0
  126. dclab/rtdc_dataset/fmt_tdms/event_image.py +220 -0
  127. dclab/rtdc_dataset/fmt_tdms/event_mask.py +62 -0
  128. dclab/rtdc_dataset/fmt_tdms/event_trace.py +146 -0
  129. dclab/rtdc_dataset/fmt_tdms/exc.py +37 -0
  130. dclab/rtdc_dataset/fmt_tdms/naming.py +151 -0
  131. dclab/rtdc_dataset/load.py +77 -0
  132. dclab/rtdc_dataset/meta_table.py +25 -0
  133. dclab/rtdc_dataset/writer.py +1019 -0
  134. dclab/statistics.py +226 -0
  135. dclab/util.py +176 -0
  136. dclab/warn.py +15 -0
  137. dclab-0.67.0.dist-info/METADATA +153 -0
  138. dclab-0.67.0.dist-info/RECORD +142 -0
  139. dclab-0.67.0.dist-info/WHEEL +6 -0
  140. dclab-0.67.0.dist-info/entry_points.txt +8 -0
  141. dclab-0.67.0.dist-info/licenses/LICENSE +283 -0
  142. dclab-0.67.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,896 @@
1
+ """RT-DC dataset core classes and methods"""
2
+ import abc
3
+ import hashlib
4
+ import json
5
+ import os.path
6
+ import pathlib
7
+ import random
8
+ import traceback
9
+ from typing import Literal
10
+ import uuid
11
+ import warnings
12
+
13
+ import numpy as np
14
+
15
+ from .. import definitions as dfn
16
+ from .. import downsampling
17
+ from ..kde import KernelDensityEstimator
18
+ from ..kde import methods as kde_methods
19
+ from ..polygon_filter import PolygonFilter
20
+ from ..util import hashobj
21
+ from . import feat_basin
22
+ from .export import Export
23
+ from .feat_anc_core import FEATURES_RAPID, AncillaryFeature
24
+ from .filter import Filter
25
+
26
+
27
class FeatureShouldExistButNotFoundWarning(UserWarning):
    """Issued when a feature is advertised but its data cannot be retrieved"""
29
+
30
+
31
class LocalBasinForbiddenWarning(UserWarning):
    """Issued when a file-type basin is encountered but not allowed"""
33
+
34
+
35
class LogTransformWarning(UserWarning):
    """Warning category related to logarithmic transforms of feature data"""
37
+
38
+
39
+ class RTDCBase(abc.ABC):
40
    def __init__(self, identifier=None, enable_basins=True):
        """RT-DC measurement base class

        Parameters
        ----------
        identifier: str or None
            Unique identifier for this dataset. If None, a random
            identifier of the form ``mm-<format>_<7 hex digits>``
            is generated.
        enable_basins: bool
            Whether to retrieve basins (features stored upstream in
            other datasets) when they are first accessed via the
            ``basins`` property.

        Notes
        -----
        Besides the filter arrays for each data feature, there is a manual
        boolean filter array ``RTDCBase.filter.manual`` that can be edited
        by the user - a boolean value of ``False`` means that the event is
        excluded from all computations.
        """
        #: Local basins are basins that are defined on the user's file system.
        #: For reasons of data security (leaking data from a server or from a
        #: user's file system), dclab only allows remote basins (see
        #: :func:`basins_retrieve`) by default. This variable is set to True
        #: for the RTDC_HDF5 file format, because it implies the data are
        #: located on the user's computer.
        self._local_basins_allowed = False

        #: Dataset format (derived from class name, e.g. "RTDC_HDF5"
        #: becomes "hdf5")
        self.format = self.__class__.__name__.split("_")[-1].lower()

        # Cache attribute used for __len__()-function
        self._length = None
        # IDs of polygon filters applied to this dataset
        self._polygon_filter_ids = []
        # Events have the feature name as keys and contain nD ndarrays.
        self._events = {}
        # Ancillaries have the feature name as keys and a
        # tuple containing feature and hash as value.
        self._ancillaries = {}
        # Temporary features are defined by the user ad hoc at runtime.
        self._usertemp = {}
        # List of :class:`.Basin` for external features; populated lazily
        # by the `basins` property (None means "not yet retrieved").
        self._basins = None
        # List of basin identifiers that should be ignored, used to
        # avoid circular basin dependencies
        self._basins_ignored = []
        # List of all features available via basins (cached; None means
        # "not yet determined")
        self._basins_features = None
        #: Configuration of the measurement
        self.config = None
        #: Export functionalities; instance of
        #: :class:`dclab.rtdc_dataset.export.Export`.
        self.export = Export(self)
        # Filtering functionalities; instance of
        # :class:`dclab.rtdc_dataset.filter.Filter` (created lazily).
        self._ds_filter = None
        #: Dictionary of log files. Each log file is a list of strings
        #: (one string per line).
        self.logs = {}
        #: Dictionary of tables. Each table is an indexable compound numpy
        #: array.
        self.tables = {}
        #: Title of the measurement
        self.title = None
        #: Path or DCOR identifier of the dataset (set to "none"
        #: for :class:`RTDC_Dict`)
        self.path = None
        # Unique, random identifier
        if identifier is None:
            # Generate a unique, random identifier for this dataset
            rhex = [random.choice('0123456789abcdef') for _n in range(7)]
            self._identifier = "mm-{}_{}".format(self.format, "".join(rhex))
        else:
            self._identifier = identifier

        # Basins are initialized in the "basins" property function
        self._enable_basins = enable_basins
107
+
108
+ def __contains__(self, feat):
109
+ ct = False
110
+ if (feat in self._events
111
+ or feat in self._usertemp
112
+ or feat in self.features_basin):
113
+ ct = True
114
+ else:
115
+ # Check ancillary features data
116
+ if feat in self._ancillaries:
117
+ # already computed
118
+ ct = True
119
+ elif feat in AncillaryFeature.feature_names:
120
+ # get all instance of AncillaryFeature that
121
+ # check availability of the feature `feat`
122
+ instlist = AncillaryFeature.get_instances(feat)
123
+ for inst in instlist:
124
+ if inst.is_available(self):
125
+ # to be computed
126
+ ct = True
127
+ break
128
+ return ct
129
+
130
    def __enter__(self):
        """Context manager entry point; returns the dataset itself"""
        return self
132
+
133
+ def __exit__(self, type, value, tb):
134
+ self.close()
135
+
136
    def __getitem__(self, feat):
        """Return the data object for the feature `feat`

        Lookup order: innate events, temporary features, already-cached
        ancillary data, basin data (internal first, then file, then any
        other type), and finally ancillary features that still have to
        be computed.

        Raises
        ------
        KeyError
            If `feat` cannot be retrieved from any of these sources.
        """
        if feat in self._events:
            return self._events[feat]
        elif feat in self._usertemp:
            return self._usertemp[feat]
        # 1. Check for cached ancillary data (cheap, no computation)
        data = self._get_ancillary_feature_data(feat, no_compute=True)
        if data is not None:
            return data
        # 2. Check for h5dataset-based, file-based, or other basin data,
        # in that order.
        for basin_type in ["internal", "file", None]:
            data = self._get_basin_feature_data(feat, basin_type=basin_type)
            if data is not None:
                return data
        # 3. Check for ancillary features that can be computed
        data = self._get_ancillary_feature_data(feat)
        if data is not None:
            return data
        # `feat in self` may be True (e.g. a basin advertises the feature)
        # even though the data could not actually be fetched above.
        if feat in self:
            warnings.warn(f"The feature {feat} is supposedly defined in "
                          f"{self}, but I cannot get its data. Please "
                          f"make sure you have not defined any unreachable "
                          f"remote basins.",
                          FeatureShouldExistButNotFoundWarning)
        # Not here ¯\_(ツ)_/¯
        raise KeyError(f"Feature '{feat}' does not exist in {self}!")
163
+
164
+ def __iter__(self):
165
+ """An iterator over all valid scalar features"""
166
+ mycols = []
167
+ for col in self._feature_candidates:
168
+ if col in self:
169
+ mycols.append(col)
170
+ mycols.sort()
171
+ for col in mycols:
172
+ yield col
173
+
174
+ def __len__(self):
175
+ if self._length is None:
176
+ self._length = self._get_length()
177
+ return self._length
178
+
179
+ def _get_length(self):
180
+ # Try to get length from metadata.
181
+ length = self.config["experiment"].get("event count")
182
+ if length is not None:
183
+ return length
184
+ # Try to get the length from the feature sizes
185
+ keys = list(self._events.keys()) or self.features_basin
186
+ keys.sort()
187
+ for kk in keys:
188
+ length = len(self[kk])
189
+ if length:
190
+ return length
191
+ else:
192
+ raise ValueError(f"Could not determine size of dataset '{self}'.")
193
+
194
+ def __repr__(self):
195
+ repre = "<{} '{}' at {}".format(self.__class__.__name__,
196
+ self.identifier,
197
+ hex(id(self)))
198
+ if self.path != "none":
199
+ repre += " ({})>".format(self.path)
200
+ return repre
201
+
202
+ @property
203
+ def basins(self):
204
+ """Basins containing upstream features from other datasets"""
205
+ if self._basins is None:
206
+ if self._enable_basins:
207
+ self._basins = self.basins_retrieve()
208
+ else:
209
+ self._basins = []
210
+ return self._basins
211
+
212
+ @property
213
+ def filter(self):
214
+ """Filtering functionalities; instance of :class:`.Filter`"""
215
+ self._assert_filter()
216
+ return self._ds_filter
217
+
218
+ def _assert_filter(self):
219
+ if self._ds_filter is None:
220
+ self._ds_filter = Filter(self)
221
+
222
    def _get_ancillary_feature_data(self,
                                    feat: str,
                                    no_compute: bool = False):
        """Return feature data of ancillary features

        Parameters
        ----------
        feat: str
            Name of the feature
        no_compute: bool
            Whether to bother computing the feature. If it is not
            already computed, return None instead

        Returns
        -------
        data:
            The feature object (array-like) or None if it could not
            be found or was not computed.
        """
        data = None
        anhash = None
        if feat in AncillaryFeature.feature_names:
            # Try to find the feature in the ancillary features
            # (see feat_anc_core submodule for more information).
            # These features are cached in `self._ancillaries`.
            ancol = AncillaryFeature.available_features(self)
            if feat in ancol:
                # The feature is generally available.
                if feat in self._ancillaries:
                    # We have already computed the feature. Make sure that we
                    # have the updated one by checking the hash.
                    anhash = ancol[feat].hash(self)
                    if self._ancillaries[feat][0] == anhash:
                        # Use cached value
                        data = self._ancillaries[feat][1]
                # We either already have the ancillary feature or have to
                # compute it. We only compute it if we are asked to.
                if data is None and not no_compute:
                    # Reuse the hash computed above if available.
                    anhash = anhash or ancol[feat].hash(self)
                    # Compute new value
                    data_dict = ancol[feat].compute(self)
                    # `compute` may return several features at once;
                    # cache each of them under the same hash.
                    for okey in data_dict:
                        # Store computed value in `self._ancillaries`.
                        self._ancillaries[okey] = (anhash, data_dict[okey])
                    data = data_dict[feat]
        return data
268
+
269
+ def _get_basin_feature_data(
270
+ self,
271
+ feat: str,
272
+ basin_type: Literal["file", "internal", "remote", None] = None):
273
+ """Return feature data from basins
274
+
275
+ Parameters
276
+ ----------
277
+ feat: str
278
+ Name of the feature
279
+ basin_type: str or bool
280
+ The basin type to look at, which is either "file"-based
281
+ (e.g. local on disk), "remote"-based (e.g. S3), or
282
+ "internal"-type (e.g. h5py.Dataset inside the current HDF5 file).
283
+ Defaults to `None` which means no preference.
284
+
285
+ Returns
286
+ -------
287
+ data:
288
+ The feature object (array-like) or None if it could not
289
+ be found or was not computed.
290
+ """
291
+ data = None
292
+ if self.basins:
293
+ for bn in list(self.basins):
294
+ if basin_type is not None and basin_type != bn.basin_type:
295
+ # User asked for specific basin type
296
+ continue
297
+ try:
298
+ # There are all kinds of errors that may happen here.
299
+ # Note that `bn.features` can already trigger an
300
+ # availability check that may raise a ValueError.
301
+ # TODO:
302
+ # Introduce some kind of callback so the user knows
303
+ # why the data are not available. The current solution
304
+ # (fail silently) is not sufficiently transparent,
305
+ # especially when considering networking issues.
306
+ if feat in bn.features:
307
+ data = bn.get_feature_data(feat)
308
+ # The data are available, we may abort the search.
309
+ break
310
+ except (KeyError, OSError, PermissionError):
311
+ # Basin data not available
312
+ pass
313
+ except feat_basin.BasinNotAvailableError:
314
+ # remove the basin from the list
315
+ # TODO:
316
+ # Check whether this has an actual effect. It could be
317
+ # that due to some iterative process `self`
318
+ # gets re-initialized and we have to go through this
319
+ # again.
320
+ self._basins.remove(bn)
321
+ warnings.warn(
322
+ f"Removed unavailable basin {bn} from {self}")
323
+ except BaseException:
324
+ warnings.warn(f"Could not access {feat} in {self}:\n"
325
+ f"{traceback.format_exc()}")
326
+ pass
327
+ return data
328
+
329
+ @staticmethod
330
+ def get_kde_spacing(a, scale="linear", method=kde_methods.bin_width_doane,
331
+ method_kw=None, feat="undefined", ret_scaled=False):
332
+ """Convenience function for computing the contour spacing
333
+
334
+ Parameters
335
+ ----------
336
+ a: ndarray
337
+ feature data
338
+ scale: str
339
+ how the data should be scaled ("log" or "linear")
340
+ method: callable
341
+ KDE method to use (see `kde_methods` submodule)
342
+ method_kw: dict
343
+ keyword arguments to `method`
344
+ feat: str
345
+ feature name for debugging
346
+ ret_scaled: bool
347
+ whether to return the scaled array of `a`
348
+ """
349
+ return KernelDensityEstimator.get_spacing(
350
+ a=a,
351
+ scale=scale,
352
+ method=method,
353
+ method_kw=method_kw,
354
+ feat=feat,
355
+ ret_scaled=ret_scaled,
356
+ )
357
+
358
+ @property
359
+ def _feature_candidates(self):
360
+ """List of feature candidates for this dataset
361
+
362
+ Use with caution! Features in this list might not actually
363
+ be available. Always check against `__contains__`.
364
+ """
365
+ feats = list(self._events.keys())
366
+ feats += list(self._usertemp.keys())
367
+ feats += list(AncillaryFeature.feature_names)
368
+ feats += self.features_basin
369
+ feats = sorted(set(feats))
370
+ # exclude non-standard features
371
+ featsv = [ff for ff in feats if dfn.feature_exists(ff)]
372
+ return featsv
373
+
374
+ @property
375
+ def _filter(self):
376
+ """return the current filter boolean array"""
377
+ warnings.warn("RTDCBase._filter is deprecated. Please use "
378
+ + "RTDCBase.filter.all instead.",
379
+ DeprecationWarning)
380
+ return self.filter.all
381
+
382
+ @property
383
+ def _plot_filter(self):
384
+ raise NotImplementedError(
385
+ "RTDCBase._plot_filter has been removed in dclab 0.16.0. "
386
+ + "Please use the output of RTDCBase.downsample_scatter "
387
+ + "with the argument ret_mask instead.")
388
+
389
    @property
    def identifier(self):
        """Unique (unreproducible) identifier

        Either the value passed to the constructor or a randomly
        generated string of the form ``mm-<format>_<7 hex digits>``.
        """
        return self._identifier
393
+
394
+ @property
395
+ def features(self):
396
+ """All available features"""
397
+ features = []
398
+ for col in self._feature_candidates:
399
+ if col in self:
400
+ features.append(col)
401
+ features.sort()
402
+ return features
403
+
404
+ @property
405
+ def features_ancillary(self):
406
+ """All available ancillary features
407
+
408
+ This includes all ancillary features, excluding the features
409
+ that are already in `self.features_innate`. This means that
410
+ there may be overlap between `features_ancillary` and e.g.
411
+ `self.features_basin`.
412
+
413
+ .. versionadded:: 0.58.0
414
+
415
+ """
416
+ features_innate = self.features_innate
417
+ features_ancillary = []
418
+ for feat in AncillaryFeature.feature_names:
419
+ if feat not in features_innate and feat in self:
420
+ features_ancillary.append(feat)
421
+ return sorted(features_ancillary)
422
+
423
    @property
    def features_basin(self):
        """All features accessed via upstream basins from other locations"""
        # The result is cached in `self._basins_features` (None means
        # "not yet determined").
        if self._basins_features is None:
            if self.basins:
                features = []
                for bn in self.basins:
                    if bn.features and set(bn.features) <= set(features):
                        # We already have the features from a different basin.
                        # There might be a basin availability check going on
                        # somewhere, but we are not interested in it.
                        continue
                    # `is_available` may be expensive (e.g. network access).
                    if bn.is_available():
                        features += bn.features
                self._basins_features = sorted(set(features))
            else:
                self._basins_features = []
        return self._basins_features
441
+
442
+ @property
443
+ def features_innate(self):
444
+ """All features excluding ancillary, basin, or temporary features"""
445
+ innate = [ft for ft in self.features if ft in self._events]
446
+ return innate
447
+
448
+ @property
449
+ def features_loaded(self):
450
+ """All features that have been computed
451
+
452
+ This includes ancillary features and temporary features.
453
+
454
+ Notes
455
+ -----
456
+ Ancillary features that are computationally cheap to compute are
457
+ always included. They are defined in
458
+ :const:`dclab.rtdc_dataset.feat_anc_core.FEATURES_RAPID`.
459
+ """
460
+ features_loaded = self.features_local + self.features_innate
461
+ features_loaded += [f for f in self.features if f in FEATURES_RAPID]
462
+ return sorted(set(features_loaded))
463
+
464
    @property
    def features_local(self):
        """All features that are, with certainty, really fast to access

        Local features is a slimmed down version of `features_loaded`.
        Nothing needs to be computed, not even rapid features
        (:const:`dclab.rtdc_dataset.feat_anc_core.FEATURES_RAPID`).
        And features from remote sources that have not been downloaded
        already are excluded. Ancillary and temporary features that are
        available are included.
        """
        features_local = []
        # Note that the hierarchy format just calls its hparent's
        # `features_local`.
        if hasattr(self._events, "_cached_events"):
            # NOTE(review): `_cached_events` appears to be a cache kept by
            # an events-wrapper class — confirm against the fmt_* modules.
            features_local += list(self._events._cached_events.keys())

        if self.format == "hdf5":
            # HDF5 data are on the local file system, so innate
            # features count as local.
            features_local += list(self._events.keys())

        # Get into the basins.
        for bn in self.basins:
            if (bn.basin_format == "hdf5"
                    and bn.basin_type == "file"
                    and bn.is_available()):
                features_local += bn.ds.features_local
            elif bn._ds is not None:
                # The basin dataset was already instantiated; asking it
                # for its local features requires no extra setup.
                features_local += bn.ds.features_local

        # If they are here, then we use them:
        features_local += list(self._ancillaries.keys())
        features_local += list(self._usertemp.keys())

        return sorted(set(features_local))
498
+
499
+ @property
500
+ def features_scalar(self):
501
+ """All scalar features available"""
502
+ sclr = [ft for ft in self.features if dfn.scalar_feature_exists(ft)]
503
+ return sclr
504
+
505
    @property
    @abc.abstractmethod
    def hash(self):
        """Reproducible dataset hash (defined by derived classes)"""
        # Abstract: each file-format subclass computes its own hash.
509
+
510
+ def ignore_basins(self, basin_identifiers):
511
+ """Ignore these basin identifiers when looking for features
512
+
513
+ This is used to avoid circular basin dependencies.
514
+ """
515
+ self._basins_ignored += basin_identifiers
516
+
517
+ def apply_filter(self, force=None):
518
+ """Compute the filters for the dataset"""
519
+ if force is None:
520
+ force = []
521
+ self.filter.update(rtdc_ds=self, force=force)
522
+
523
+ def close(self):
524
+ """Close any open files or connections, including basins
525
+
526
+ If implemented in a subclass, the subclass must call this
527
+ method via `super`, otherwise basins are not closed. The
528
+ subclass is responsible for closing its specific file handles.
529
+ """
530
+ if self._basins:
531
+ for bn in self._basins:
532
+ bn.close()
533
+
534
    def get_downsampled_scatter(self, xax="area_um", yax="deform",
                                downsample=0, xscale="linear",
                                yscale="linear", remove_invalid=False,
                                ret_mask=False):
        """Downsampling by removing points at dense locations

        Parameters
        ----------
        xax: str
            Identifier for x axis (e.g. "area_um", "aspect", "deform")
        yax: str
            Identifier for y axis
        downsample: int
            Number of points to draw in the down-sampled plot.
            This number is either

            - >=1: exactly downsample to this number by randomly adding
              or removing points
            - 0  : do not perform downsampling
        xscale: str
            If set to "log", take the logarithm of the x-values before
            performing downsampling. This is useful when data are
            displayed on a log-scale. Defaults to "linear".
        yscale: str
            See `xscale`.
        remove_invalid: bool
            Remove nan and inf values before downsampling; if set to
            `True`, the actual number of samples returned might be
            smaller than `downsample` due to infinite or nan values
            (e.g. due to logarithmic scaling).
        ret_mask: bool
            If set to `True`, returns a boolean array of length
            `len(self)` where `True` values identify the filtered
            data.

        Returns
        -------
        xnew, ynew: 1d ndarray of length `N`
            Filtered data; `N` is either identical to `downsample`
            or smaller (if `remove_invalid==True`)
        mask: 1d boolean array of length `len(RTDCBase)`
            Array for identifying the downsampled data points
        """
        if downsample < 0:
            raise ValueError("`downsample` must be zero or positive!")

        downsample = int(downsample)
        xax = xax.lower()
        yax = yax.lower()

        # Get data (only events that pass the current filter)
        x = self[xax][self.filter.all]
        y = self[yax][self.filter.all]

        # Apply scale (no change for linear scale)
        xs = KernelDensityEstimator.apply_scale(x, xscale, xax)
        ys = KernelDensityEstimator.apply_scale(y, yscale, yax)

        _, _, idx = downsampling.downsample_grid(xs, ys,
                                                 samples=downsample,
                                                 remove_invalid=remove_invalid,
                                                 ret_idx=True)

        if ret_mask:
            # Mask is a boolean array of len(self); map the indices of
            # the filtered subset back to full-dataset positions.
            mask = np.zeros(len(self), dtype=bool)
            mids = np.where(self.filter.all)[0]
            mask[mids] = idx
            return x[idx], y[idx], mask
        else:
            return x[idx], y[idx]
605
+
606
+ def get_kde_contour(self, xax="area_um", yax="deform", xacc=None,
607
+ yacc=None, kde_type="histogram", kde_kwargs=None,
608
+ xscale="linear", yscale="linear"):
609
+ """Evaluate the kernel density estimate for contour plots
610
+
611
+ Parameters
612
+ ----------
613
+ xax: str
614
+ Identifier for X axis (e.g. "area_um", "aspect", "deform")
615
+ yax: str
616
+ Identifier for Y axis
617
+ xacc: float
618
+ Contour accuracy in x direction
619
+ yacc: float
620
+ Contour accuracy in y direction
621
+ kde_type: str
622
+ The KDE method to use
623
+ kde_kwargs: dict
624
+ Additional keyword arguments to the KDE method
625
+ xscale: str
626
+ If set to "log", take the logarithm of the x-values before
627
+ computing the KDE. This is useful when data are
628
+ displayed on a log-scale. Defaults to "linear".
629
+ yscale: str
630
+ See `xscale`.
631
+
632
+ Returns
633
+ -------
634
+ X, Y, Z : coordinates
635
+ The kernel density Z evaluated on a rectangular grid (X,Y).
636
+ """
637
+ kde_instance = KernelDensityEstimator(rtdc_ds=self)
638
+ xmesh, ymesh, density = kde_instance.get_raster(
639
+ xax=xax, yax=yax, xacc=xacc, yacc=yacc, kde_type=kde_type,
640
+ kde_kwargs=kde_kwargs, xscale=xscale, yscale=yscale
641
+ )
642
+
643
+ return xmesh, ymesh, density
644
+
645
+ def get_kde_scatter(self, xax="area_um", yax="deform", positions=None,
646
+ kde_type="histogram", kde_kwargs=None, xscale="linear",
647
+ yscale="linear"):
648
+ """Evaluate the kernel density estimate for scatter plots
649
+
650
+ Parameters
651
+ ----------
652
+ xax: str
653
+ Identifier for X axis (e.g. "area_um", "aspect", "deform")
654
+ yax: str
655
+ Identifier for Y axis
656
+ positions: list of two 1d ndarrays or ndarray of shape (2, N)
657
+ The positions where the KDE will be computed. Note that
658
+ the KDE estimate is computed from the points that
659
+ are set in `self.filter.all`.
660
+ kde_type: str
661
+ The KDE method to use, see :const:`.kde_methods.methods`
662
+ kde_kwargs: dict
663
+ Additional keyword arguments to the KDE method
664
+ xscale: str
665
+ If set to "log", take the logarithm of the x-values before
666
+ computing the KDE. This is useful when data are are
667
+ displayed on a log-scale. Defaults to "linear".
668
+ yscale: str
669
+ See `xscale`.
670
+
671
+ Returns
672
+ -------
673
+ density : 1d ndarray
674
+ The kernel density evaluated for the filtered data points.
675
+ """
676
+ kde_instance = KernelDensityEstimator(rtdc_ds=self)
677
+ density = kde_instance.get_scatter(
678
+ xax=xax, yax=yax, positions=positions, kde_type=kde_type,
679
+ kde_kwargs=kde_kwargs, xscale=xscale, yscale=yscale
680
+ )
681
+
682
+ return density
683
+
684
+ def basins_get_dicts(self):
685
+ """Return the list of dictionaries describing the dataset's basins"""
686
+ # Only implement this for classes that support this
687
+ return []
688
+
689
def basins_retrieve(self):
    """Load all basins available

    .. versionadded:: 0.54.0

    In dclab 0.51.0, we introduced basins, a simple way of combining
    HDF5-based datasets (including the :class:`.HDF5_S3` format).
    The idea is to be able to store parts of the dataset
    (e.g. images) in a separate file that could then be located
    someplace else (e.g. an S3 object store).

    If an RT-DC file has "basins" defined, then these are sought out and
    made available via the `features_basin` property.

    .. versionchanged:: 0.57.5

        "file"-type basins are only available for subclasses that
        set the `_local_basins_allowed` attribute to True.

    Returns
    -------
    basins: list
        Instantiated basin objects, sorted by basin priority.
    """
    basins = []
    bc = feat_basin.get_basin_classes()
    # Sort basins according to priority
    bdicts_srt = sorted(self.basins_get_dicts(),
                        key=feat_basin.basin_priority_sorted_key)
    # complement basin "key"s (we do the same in writer)
    for bdict in bdicts_srt:
        if "key" not in bdict:
            b_dat = json.dumps(bdict, indent=2, sort_keys=True).split("\n")
            bdict["key"] = hashobj(b_dat)

    # Keys of all basins defined here plus the already-ignored ones are
    # passed down to the basins so cyclic dependencies can be detected.
    bd_keys = [bd["key"] for bd in bdicts_srt]
    bd_keys += self._basins_ignored
    for bdict in bdicts_srt:
        if bdict["format"] not in bc:
            warnings.warn(f"Encountered unsupported basin "
                          f"format '{bdict['format']}'!")
            continue
        if bdict["key"] in self._basins_ignored:
            warnings.warn(
                f"Encountered cyclic basin dependency '{bdict['key']}'",
                feat_basin.CyclicBasinDependencyFoundWarning)
            continue

        # Basin initialization keyword arguments
        kwargs = {
            "name": bdict.get("name"),
            "description": bdict.get("description"),
            # Honor features intended by basin creator.
            "features": bdict.get("features"),
            # Which mapping we are using ("same", "basinmap1", ...)
            "mapping": bdict.get("mapping", "same"),
            # For non-identical mapping ("basinmap1", etc.), we
            # need the referring dataset.
            "mapping_referrer": self,
            # Make sure the measurement identifier is checked.
            "referrer_identifier": self.get_measurement_identifier(),
            # Make sure the basin identifier is checked.
            "basin_identifier": bdict.get("identifier"),
            # allow to ignore basins
            "ignored_basins": bd_keys,
            # basin key
            "key": bdict["key"],
            # whether the basin is perishable or not
            "perishable": bdict.get("perishable", False),
        }

        # Check whether this basin is supported and exists
        if bdict["type"] == "internal":
            b_cls = bc[bdict["format"]]
            # Internal basins reside within the current dataset, so we
            # add them without an additional availability check.
            bna = b_cls(bdict["paths"][0], **kwargs)
            basins.append(bna)
        elif bdict["type"] == "file":
            if not self._local_basins_allowed:
                warnings.warn(f"Basin type 'file' not allowed for format "
                              f"'{self.format}'",
                              LocalBasinForbiddenWarning)
                # stop processing this basin
                continue
            p_paths = list(bdict["paths"])
            # translate Windows and Unix relative paths
            for pi in list(p_paths):  # [sic] create a copy of the list
                if pi.count(".."):
                    # BUGFIX: `os.path.sep` is a SINGLE backslash on
                    # Windows; the previous comparison against the raw
                    # string r"\\" (two characters) could never match,
                    # so Windows path translation never happened and
                    # Unix translation only matched double backslashes.
                    if pi[2:].count("/") and os.path.sep == "\\":
                        # Windows: translate POSIX separators
                        p_paths.append(pi.replace("/", "\\"))
                    elif pi[2:].count("\\") and os.path.sep == "/":
                        # Unix: translate Windows separators
                        p_paths.append(pi.replace("\\", "/"))
            # perform the actual check
            for pp in p_paths:
                pp = pathlib.Path(pp)
                # Instantiate the proper basin class
                b_cls = bc[bdict["format"]]
                # Try absolute path
                bna = b_cls(pp, **kwargs)

                try:
                    absolute_exists = bna.verify_basin()
                except BaseException:
                    # Verification may fail for missing or broken
                    # files; fall through to the relative-path attempt.
                    pass
                else:
                    if absolute_exists:
                        basins.append(bna)
                        break
                # Try relative path
                this_path = pathlib.Path(self.path)
                if this_path.exists():

                    # Insert relative path
                    bnr = b_cls(this_path.parent / pp, **kwargs)
                    if bnr.verify_basin():
                        basins.append(bnr)
                        break
        elif bdict["type"] == "remote":
            for url in bdict["urls"]:
                # Instantiate the proper basin class
                b_cls = bc[bdict["format"]]
                bna = b_cls(url, **kwargs)
                # In contrast to file-type basins, we just add all remote
                # basins without checking first. We do not check for
                # the availability of remote basins, because they could
                # be temporarily inaccessible (unstable network connection)
                # and because checking the availability of remote basins
                # normally takes a lot of time.
                basins.append(bna)
        else:
            warnings.warn(
                f"Encountered unsupported basin type '{bdict['type']}'!")
    return basins
824
+
825
def get_measurement_identifier(self):
    """Return a unique measurement identifier

    Return the [experiment]:"run identifier" configuration key, if it
    exists. Otherwise, return the MD5 sum computed from the measurement
    time, date, and setup identifier.

    Returns `None` if no identifier could be found or computed.

    .. versionadded:: 0.51.0

    """
    exp_sec = self.config.get("experiment", {})
    identifier = exp_sec.get("run identifier", None)
    if identifier is not None:
        return identifier
    # Fall back to a hash of time, date, and setup identifier.
    # The trailing `or None` maps empty strings to None.
    mtime = exp_sec.get("time", None) or None
    mdate = exp_sec.get("date", None) or None
    setup_id = self.config.get("setup", {}).get("identifier", None) or None
    if mtime is None or mdate is None or setup_id is None:
        # Not enough metadata available to compute an identifier.
        return None
    digest = hashlib.md5(
        f"{mtime}_{mdate}_{setup_id}".encode("utf-8")).hexdigest()
    return str(uuid.UUID(hex=digest))
848
+
849
def polygon_filter_add(self, filt):
    """Associate a Polygon Filter with this instance

    Parameters
    ----------
    filt: int or instance of `PolygonFilter`
        The polygon filter to add
    """
    # make sure the filter is initialized before we modify its config
    self._assert_filter()  # [sic] initialize the filter if not done yet
    if isinstance(filt, PolygonFilter):
        unique_id = filt.unique_id
    elif isinstance(filt, (int, float)):
        unique_id = int(filt)
    else:
        raise ValueError(
            "`filt` must be a number or instance of PolygonFilter!")
    # register the filter's unique ID with this dataset
    self.config["filtering"]["polygon filters"].append(unique_id)
868
+
869
def polygon_filter_rm(self, filt):
    """Remove a polygon filter from this instance

    Parameters
    ----------
    filt: int or instance of `PolygonFilter`
        The polygon filter to remove
    """
    if isinstance(filt, PolygonFilter):
        unique_id = filt.unique_id
    elif isinstance(filt, (int, float)):
        unique_id = int(filt)
    else:
        raise ValueError(
            "`filt` must be a number or instance of PolygonFilter!")
    # drop the filter's unique ID from this dataset
    self.config["filtering"]["polygon filters"].remove(unique_id)
887
+
888
def reset_filter(self):
    """Reset the current filter"""
    # clear the filter instance itself
    self.filter.reset()
    # Restore the default filtering configuration while keeping the
    # hierarchy parent intact.
    parent = self.config["filtering"]["hierarchy parent"]
    self.config._init_default_filter_values()
    self.config["filtering"]["hierarchy parent"] = parent