dclab-0.62.11-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dclab might be problematic.

Files changed (137)
  1. dclab/__init__.py +23 -0
  2. dclab/_version.py +16 -0
  3. dclab/cached.py +97 -0
  4. dclab/cli/__init__.py +10 -0
  5. dclab/cli/common.py +237 -0
  6. dclab/cli/task_compress.py +126 -0
  7. dclab/cli/task_condense.py +223 -0
  8. dclab/cli/task_join.py +229 -0
  9. dclab/cli/task_repack.py +98 -0
  10. dclab/cli/task_split.py +154 -0
  11. dclab/cli/task_tdms2rtdc.py +186 -0
  12. dclab/cli/task_verify_dataset.py +75 -0
  13. dclab/definitions/__init__.py +79 -0
  14. dclab/definitions/feat_const.py +202 -0
  15. dclab/definitions/feat_logic.py +183 -0
  16. dclab/definitions/meta_const.py +252 -0
  17. dclab/definitions/meta_logic.py +111 -0
  18. dclab/definitions/meta_parse.py +94 -0
  19. dclab/downsampling.cp313-win_amd64.pyd +0 -0
  20. dclab/downsampling.pyx +230 -0
  21. dclab/external/__init__.py +4 -0
  22. dclab/external/packaging/LICENSE +3 -0
  23. dclab/external/packaging/LICENSE.APACHE +177 -0
  24. dclab/external/packaging/LICENSE.BSD +23 -0
  25. dclab/external/packaging/__init__.py +6 -0
  26. dclab/external/packaging/_structures.py +61 -0
  27. dclab/external/packaging/version.py +505 -0
  28. dclab/external/skimage/LICENSE +28 -0
  29. dclab/external/skimage/__init__.py +2 -0
  30. dclab/external/skimage/_find_contours.py +216 -0
  31. dclab/external/skimage/_find_contours_cy.cp313-win_amd64.pyd +0 -0
  32. dclab/external/skimage/_find_contours_cy.pyx +188 -0
  33. dclab/external/skimage/_pnpoly.cp313-win_amd64.pyd +0 -0
  34. dclab/external/skimage/_pnpoly.pyx +99 -0
  35. dclab/external/skimage/_shared/__init__.py +1 -0
  36. dclab/external/skimage/_shared/geometry.cp313-win_amd64.pyd +0 -0
  37. dclab/external/skimage/_shared/geometry.pxd +6 -0
  38. dclab/external/skimage/_shared/geometry.pyx +55 -0
  39. dclab/external/skimage/measure.py +7 -0
  40. dclab/external/skimage/pnpoly.py +53 -0
  41. dclab/external/statsmodels/LICENSE +35 -0
  42. dclab/external/statsmodels/__init__.py +6 -0
  43. dclab/external/statsmodels/nonparametric/__init__.py +1 -0
  44. dclab/external/statsmodels/nonparametric/_kernel_base.py +203 -0
  45. dclab/external/statsmodels/nonparametric/kernel_density.py +165 -0
  46. dclab/external/statsmodels/nonparametric/kernels.py +36 -0
  47. dclab/features/__init__.py +9 -0
  48. dclab/features/bright.py +81 -0
  49. dclab/features/bright_bc.py +93 -0
  50. dclab/features/bright_perc.py +63 -0
  51. dclab/features/contour.py +161 -0
  52. dclab/features/emodulus/__init__.py +339 -0
  53. dclab/features/emodulus/load.py +252 -0
  54. dclab/features/emodulus/lut_HE-2D-FEM-22.txt +16432 -0
  55. dclab/features/emodulus/lut_HE-3D-FEM-22.txt +1276 -0
  56. dclab/features/emodulus/lut_LE-2D-FEM-19.txt +13082 -0
  57. dclab/features/emodulus/pxcorr.py +135 -0
  58. dclab/features/emodulus/scale_linear.py +247 -0
  59. dclab/features/emodulus/viscosity.py +256 -0
  60. dclab/features/fl_crosstalk.py +95 -0
  61. dclab/features/inert_ratio.py +377 -0
  62. dclab/features/volume.py +242 -0
  63. dclab/http_utils.py +322 -0
  64. dclab/isoelastics/__init__.py +468 -0
  65. dclab/isoelastics/iso_HE-2D-FEM-22-area_um-deform.txt +2440 -0
  66. dclab/isoelastics/iso_HE-2D-FEM-22-volume-deform.txt +2635 -0
  67. dclab/isoelastics/iso_HE-3D-FEM-22-area_um-deform.txt +1930 -0
  68. dclab/isoelastics/iso_HE-3D-FEM-22-volume-deform.txt +2221 -0
  69. dclab/isoelastics/iso_LE-2D-FEM-19-area_um-deform.txt +2151 -0
  70. dclab/isoelastics/iso_LE-2D-FEM-19-volume-deform.txt +2250 -0
  71. dclab/isoelastics/iso_LE-2D-ana-18-area_um-deform.txt +1266 -0
  72. dclab/kde_contours.py +222 -0
  73. dclab/kde_methods.py +303 -0
  74. dclab/lme4/__init__.py +5 -0
  75. dclab/lme4/lme4_template.R +94 -0
  76. dclab/lme4/rsetup.py +204 -0
  77. dclab/lme4/wrapr.py +386 -0
  78. dclab/polygon_filter.py +398 -0
  79. dclab/rtdc_dataset/__init__.py +15 -0
  80. dclab/rtdc_dataset/check.py +902 -0
  81. dclab/rtdc_dataset/config.py +533 -0
  82. dclab/rtdc_dataset/copier.py +353 -0
  83. dclab/rtdc_dataset/core.py +1001 -0
  84. dclab/rtdc_dataset/export.py +737 -0
  85. dclab/rtdc_dataset/feat_anc_core/__init__.py +24 -0
  86. dclab/rtdc_dataset/feat_anc_core/af_basic.py +75 -0
  87. dclab/rtdc_dataset/feat_anc_core/af_emodulus.py +160 -0
  88. dclab/rtdc_dataset/feat_anc_core/af_fl_max_ctc.py +133 -0
  89. dclab/rtdc_dataset/feat_anc_core/af_image_contour.py +113 -0
  90. dclab/rtdc_dataset/feat_anc_core/af_ml_class.py +102 -0
  91. dclab/rtdc_dataset/feat_anc_core/ancillary_feature.py +320 -0
  92. dclab/rtdc_dataset/feat_anc_ml/__init__.py +32 -0
  93. dclab/rtdc_dataset/feat_anc_plugin/__init__.py +3 -0
  94. dclab/rtdc_dataset/feat_anc_plugin/plugin_feature.py +329 -0
  95. dclab/rtdc_dataset/feat_basin.py +550 -0
  96. dclab/rtdc_dataset/feat_temp.py +102 -0
  97. dclab/rtdc_dataset/filter.py +263 -0
  98. dclab/rtdc_dataset/fmt_dcor/__init__.py +7 -0
  99. dclab/rtdc_dataset/fmt_dcor/access_token.py +52 -0
  100. dclab/rtdc_dataset/fmt_dcor/api.py +111 -0
  101. dclab/rtdc_dataset/fmt_dcor/base.py +200 -0
  102. dclab/rtdc_dataset/fmt_dcor/basin.py +73 -0
  103. dclab/rtdc_dataset/fmt_dcor/logs.py +26 -0
  104. dclab/rtdc_dataset/fmt_dcor/tables.py +42 -0
  105. dclab/rtdc_dataset/fmt_dict.py +103 -0
  106. dclab/rtdc_dataset/fmt_hdf5/__init__.py +6 -0
  107. dclab/rtdc_dataset/fmt_hdf5/base.py +192 -0
  108. dclab/rtdc_dataset/fmt_hdf5/basin.py +30 -0
  109. dclab/rtdc_dataset/fmt_hdf5/events.py +257 -0
  110. dclab/rtdc_dataset/fmt_hdf5/feat_defect.py +164 -0
  111. dclab/rtdc_dataset/fmt_hdf5/logs.py +33 -0
  112. dclab/rtdc_dataset/fmt_hdf5/tables.py +30 -0
  113. dclab/rtdc_dataset/fmt_hierarchy/__init__.py +11 -0
  114. dclab/rtdc_dataset/fmt_hierarchy/base.py +278 -0
  115. dclab/rtdc_dataset/fmt_hierarchy/events.py +146 -0
  116. dclab/rtdc_dataset/fmt_hierarchy/hfilter.py +140 -0
  117. dclab/rtdc_dataset/fmt_hierarchy/mapper.py +134 -0
  118. dclab/rtdc_dataset/fmt_http.py +102 -0
  119. dclab/rtdc_dataset/fmt_s3.py +320 -0
  120. dclab/rtdc_dataset/fmt_tdms/__init__.py +476 -0
  121. dclab/rtdc_dataset/fmt_tdms/event_contour.py +264 -0
  122. dclab/rtdc_dataset/fmt_tdms/event_image.py +220 -0
  123. dclab/rtdc_dataset/fmt_tdms/event_mask.py +62 -0
  124. dclab/rtdc_dataset/fmt_tdms/event_trace.py +146 -0
  125. dclab/rtdc_dataset/fmt_tdms/exc.py +37 -0
  126. dclab/rtdc_dataset/fmt_tdms/naming.py +151 -0
  127. dclab/rtdc_dataset/load.py +72 -0
  128. dclab/rtdc_dataset/writer.py +985 -0
  129. dclab/statistics.py +203 -0
  130. dclab/util.py +156 -0
  131. dclab/warn.py +15 -0
  132. dclab-0.62.11.dist-info/LICENSE +343 -0
  133. dclab-0.62.11.dist-info/METADATA +146 -0
  134. dclab-0.62.11.dist-info/RECORD +137 -0
  135. dclab-0.62.11.dist-info/WHEEL +5 -0
  136. dclab-0.62.11.dist-info/entry_points.txt +8 -0
  137. dclab-0.62.11.dist-info/top_level.txt +1 -0
dclab/rtdc_dataset/core.py
@@ -0,0 +1,1001 @@
+ """RT-DC dataset core classes and methods"""
+ import abc
+ import hashlib
+ import json
+ import os.path
+ import pathlib
+ import traceback
+ from typing import Literal
+ import uuid
+ import random
+ import warnings
+
+ import numpy as np
+
+ from .. import definitions as dfn
+ from .. import downsampling
+ from ..polygon_filter import PolygonFilter
+ from .. import kde_methods
+ from ..util import hashobj
+
+ from .feat_anc_core import AncillaryFeature, FEATURES_RAPID
+ from . import feat_basin
+ from .export import Export
+ from .filter import Filter
+
+
+ class FeatureShouldExistButNotFoundWarning(UserWarning):
+     pass
+
+
+ class LogTransformWarning(UserWarning):
+     pass
+
+
+ class RTDCBase(abc.ABC):
+     def __init__(self, identifier=None, enable_basins=True):
+         """RT-DC measurement base class
+
+         Notes
+         -----
+         Besides the filter arrays for each data feature, there is a manual
+         boolean filter array ``RTDCBase.filter.manual`` that can be edited
+         by the user - a boolean value of ``False`` means that the event is
+         excluded from all computations.
+         """
+         #: Local basins are basins that are defined on the user's file system.
+         #: For reasons of data security (leaking data from a server or from a
+         #: user's file system), dclab only allows remote basins (see
+         #: :func:`basins_retrieve`) by default. This variable is set to True
+         #: for the RTDC_HDF5 file format, because it implies the data are
+         #: located on the user's computer.
+         self._local_basins_allowed = False
+
+         #: Dataset format (derived from class name)
+         self.format = self.__class__.__name__.split("_")[-1].lower()
+
+         # Cache attribute used for __len__()-function
+         self._length = None
+         self._polygon_filter_ids = []
+         # Events have the feature name as keys and contain nD ndarrays.
+         self._events = {}
+         # Ancillaries have the feature name as keys and a
+         # tuple containing feature and hash as value.
+         self._ancillaries = {}
+         # Temporary features are defined by the user ad hoc at runtime.
+         self._usertemp = {}
+         # List of :class:`.Basin` for external features
+         self._basins = None
+         # List of basin identifiers that should be ignored, used to
+         # avoid circular basin dependencies
+         self._basins_ignored = []
+         # List of all features available via basins
+         self._basins_features = None
+         #: Configuration of the measurement
+         self.config = None
+         #: Export functionalities; instance of
+         #: :class:`dclab.rtdc_dataset.export.Export`.
+         self.export = Export(self)
+         # Filtering functionalities; instance of
+         # :class:`dclab.rtdc_dataset.filter.Filter`.
+         self._ds_filter = None
+         #: Dictionary of log files. Each log file is a list of strings
+         #: (one string per line).
+         self.logs = {}
+         #: Dictionary of tables. Each table is an indexable compound numpy
+         #: array.
+         self.tables = {}
+         #: Title of the measurement
+         self.title = None
+         #: Path or DCOR identifier of the dataset (set to "none"
+         #: for :class:`RTDC_Dict`)
+         self.path = None
+         # Unique, random identifier
+         if identifier is None:
+             # Generate a unique, random identifier for this dataset
+             rhex = [random.choice('0123456789abcdef') for _n in range(7)]
+             self._identifier = "mm-{}_{}".format(self.format, "".join(rhex))
+         else:
+             self._identifier = identifier
+
+         # Basins are initialized in the "basins" property function
+         self._enable_basins = enable_basins
+
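The manual filter array mentioned in the docstring above is an editable numpy boolean array. A minimal sketch of excluding a single event (the file name is hypothetical):

    import dclab

    ds = dclab.new_dataset("data.rtdc")  # hypothetical path
    ds.filter.manual[0] = False          # exclude the first event
    ds.apply_filter()                    # recompute the combined filter
    assert not ds.filter.all[0]          # event 0 is excluded everywhere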
+     def __contains__(self, feat):
+         ct = False
+         if (feat in self._events
+                 or feat in self._usertemp
+                 or feat in self.features_basin):
+             ct = True
+         else:
+             # Check ancillary features data
+             if feat in self._ancillaries:
+                 # already computed
+                 ct = True
+             elif feat in AncillaryFeature.feature_names:
+                 # get all instances of AncillaryFeature that
+                 # check availability of the feature `feat`
+                 instlist = AncillaryFeature.get_instances(feat)
+                 for inst in instlist:
+                     if inst.is_available(self):
+                         # to be computed
+                         ct = True
+                         break
+         return ct
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, type, value, tb):
+         self.close()
+
+     def __getitem__(self, feat):
+         if feat in self._events:
+             return self._events[feat]
+         elif feat in self._usertemp:
+             return self._usertemp[feat]
+         # 1. Check for cached ancillary data
+         data = self._get_ancillary_feature_data(feat, no_compute=True)
+         if data is not None:
+             return data
+         # 2. Check for h5dataset-based, file-based, or other basin data,
+         # in that order.
+         for basin_type in ["internal", "file", None]:
+             data = self._get_basin_feature_data(feat, basin_type=basin_type)
+             if data is not None:
+                 return data
+         # 3. Check for ancillary features that can be computed
+         data = self._get_ancillary_feature_data(feat)
+         if data is not None:
+             return data
+         if feat in self:
+             warnings.warn(f"The feature {feat} is supposedly defined in "
+                           f"{self}, but I cannot get its data. Please "
+                           f"make sure you have not defined any unreachable "
+                           f"remote basins.",
+                           FeatureShouldExistButNotFoundWarning)
+         # Not here ¯\_(ツ)_/¯
+         raise KeyError(f"Feature '{feat}' does not exist in {self}!")
+
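Sketch of how `__contains__` and `__getitem__` divide the work: the `in` operator only checks availability (innate, temporary, basin, or computable ancillary data), while indexing resolves the data in the order documented above (file name hypothetical):

    import dclab

    with dclab.new_dataset("data.rtdc") as ds:  # hypothetical path
        if "deform" in ds:          # availability check, no computation
            deform = ds["deform"]   # resolves cached/basin/ancillary data
            print(deform.mean())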
+     def __iter__(self):
+         """An iterator over all valid scalar features"""
+         mycols = []
+         for col in self._feature_candidates:
+             if col in self:
+                 mycols.append(col)
+         mycols.sort()
+         for col in mycols:
+             yield col
+
+     def __len__(self):
+         if self._length is None:
+             self._length = self._get_length()
+         return self._length
+
+     def _get_length(self):
+         # Try to get length from metadata.
+         length = self.config["experiment"].get("event count")
+         if length is not None:
+             return length
+         # Try to get the length from the feature sizes
+         keys = list(self._events.keys()) or self.features_basin
+         keys.sort()
+         for kk in keys:
+             length = len(self[kk])
+             if length:
+                 return length
+         else:
+             raise ValueError(f"Could not determine size of dataset '{self}'.")
+
+     def __repr__(self):
+         repre = "<{} '{}' at {}".format(self.__class__.__name__,
+                                         self.identifier,
+                                         hex(id(self)))
+         if self.path != "none":
+             repre += " ({})>".format(self.path)
+         return repre
+
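`__iter__` together with `__len__` allows a quick inventory of a dataset; a sketch, assuming a local file:

    import dclab

    with dclab.new_dataset("data.rtdc") as ds:  # hypothetical path
        print(f"{len(ds)} events")
        for feat in ds:  # sorted scalar features that are available
            print(feat)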
+     @property
+     def basins(self):
+         """Basins containing upstream features from other datasets"""
+         if self._basins is None:
+             if self._enable_basins:
+                 self._basins = self.basins_retrieve()
+             else:
+                 self._basins = []
+         return self._basins
+
+     @property
+     def filter(self):
+         """Filtering functionalities; instance of :class:`.Filter`"""
+         self._assert_filter()
+         return self._ds_filter
+
+     def _assert_filter(self):
+         if self._ds_filter is None:
+             self._ds_filter = Filter(self)
+
+     def _get_ancillary_feature_data(self,
+                                     feat: str,
+                                     no_compute: bool = False):
+         """Return feature data of ancillary features
+
+         Parameters
+         ----------
+         feat: str
+             Name of the feature
+         no_compute: bool
+             Whether to bother computing the feature. If it is not
+             already computed, return None instead
+
+         Returns
+         -------
+         data:
+             The feature object (array-like) or None if it could not
+             be found or was not computed.
+         """
+         data = None
+         anhash = None
+         if feat in AncillaryFeature.feature_names:
+             # Try to find the feature in the ancillary features
+             # (see feat_anc_core submodule for more information).
+             # These features are cached in `self._ancillaries`.
+             ancol = AncillaryFeature.available_features(self)
+             if feat in ancol:
+                 # The feature is generally available.
+                 if feat in self._ancillaries:
+                     # We have already computed the feature. Make sure that we
+                     # have the updated one by checking the hash.
+                     anhash = ancol[feat].hash(self)
+                     if self._ancillaries[feat][0] == anhash:
+                         # Use cached value
+                         data = self._ancillaries[feat][1]
+                 # We either already have the ancillary feature or have to
+                 # compute it. We only compute it if we are asked to.
+                 if data is None and not no_compute:
+                     anhash = anhash or ancol[feat].hash(self)
+                     # Compute new value
+                     data_dict = ancol[feat].compute(self)
+                     for okey in data_dict:
+                         # Store computed value in `self._ancillaries`.
+                         self._ancillaries[okey] = (anhash, data_dict[okey])
+                     data = data_dict[feat]
+         return data
+
+     def _get_basin_feature_data(
+             self,
+             feat: str,
+             basin_type: Literal["file", "internal", "remote", None] = None):
+         """Return feature data from basins
+
+         Parameters
+         ----------
+         feat: str
+             Name of the feature
+         basin_type: str or None
+             The basin type to look at, which is either "file"-based
+             (e.g. local on disk), "remote"-based (e.g. S3), or
+             "internal"-type (e.g. h5py.Dataset inside the current HDF5
+             file). Defaults to `None`, which means no preference.
+
+         Returns
+         -------
+         data:
+             The feature object (array-like) or None if it could not
+             be found or was not computed.
+         """
+         data = None
+         if self.basins:
+             for bn in list(self.basins):
+                 if basin_type is not None and basin_type != bn.basin_type:
+                     # User asked for a specific basin type
+                     continue
+                 try:
+                     # There are all kinds of errors that may happen here.
+                     # Note that `bn.features` can already trigger an
+                     # availability check that may raise a ValueError.
+                     # TODO:
+                     # Introduce some kind of callback so the user knows
+                     # why the data are not available. The current solution
+                     # (fail silently) is not sufficiently transparent,
+                     # especially when considering networking issues.
+                     if feat in bn.features:
+                         data = bn.get_feature_data(feat)
+                         # The data are available, we may abort the search.
+                         break
+                 except (KeyError, OSError, PermissionError):
+                     # Basin data not available
+                     pass
+                 except feat_basin.BasinNotAvailableError:
+                     # remove the basin from the list
+                     # TODO:
+                     # Check whether this has an actual effect. It could be
+                     # that due to some iterative process `self`
+                     # gets re-initialized and we have to go through this
+                     # again.
+                     self._basins.remove(bn)
+                     warnings.warn(
+                         f"Removed unavailable basin {bn} from {self}")
+                 except BaseException:
+                     warnings.warn(f"Could not access {feat} in {self}:\n"
+                                   f"{traceback.format_exc()}")
+         return data
+
+     @staticmethod
+     def _apply_scale(a, scale, feat):
+         """Helper function for transforming an array to log-scale
+
+         Parameters
+         ----------
+         a: np.ndarray
+             Input array
+         scale: str
+             If set to "log", take the logarithm of `a`; if set to
+             "linear", return `a` unchanged.
+         feat: str
+             Feature name (required for debugging)
+
+         Returns
+         -------
+         b: np.ndarray
+             The scaled array
+
+         Notes
+         -----
+         If the scale is not "linear", then a new array is returned.
+         All warnings are suppressed when computing `np.log(a)`, as
+         `a` may have negative or nan values.
+         """
+         if scale == "linear":
+             b = a
+         elif scale == "log":
+             with warnings.catch_warnings(record=True) as w:
+                 warnings.simplefilter("always")
+                 b = np.log(a)
+                 if len(w):
+                     # Tell the user that the log-transformation issued
+                     # a warning.
+                     warnings.warn("Invalid values encountered in np.log "
+                                   "while scaling feature '{}'!".format(feat),
+                                   LogTransformWarning)
+         else:
+             raise ValueError("`scale` must be either 'linear' or 'log', "
+                              + "got '{}'!".format(scale))
+         return b
+
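The warning-capture idiom in `_apply_scale` can be reproduced with numpy alone; this standalone sketch shows why negative values trigger the re-raised warning:

    import warnings
    import numpy as np

    a = np.array([0.5, -1.0, 2.0])
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        b = np.log(a)   # np.log(-1.0) yields nan plus a RuntimeWarning
    print(len(w), b)    # the recorded warning signals invalid input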
+     @staticmethod
+     def get_kde_spacing(a, scale="linear", method=kde_methods.bin_width_doane,
+                         method_kw=None, feat="undefined", ret_scaled=False):
+         """Convenience function for computing the contour spacing
+
+         Parameters
+         ----------
+         a: ndarray
+             feature data
+         scale: str
+             how the data should be scaled ("log" or "linear")
+         method: callable
+             KDE method to use (see `kde_methods` submodule)
+         method_kw: dict
+             keyword arguments to `method`
+         feat: str
+             feature name for debugging
+         ret_scaled: bool
+             whether to return the scaled array of `a`
+         """
+         if method_kw is None:
+             method_kw = {}
+         # Apply scale (no change for linear scale)
+         asc = RTDCBase._apply_scale(a, scale, feat)
+         # Apply multiplicator
+         acc = method(asc, **method_kw)
+         if ret_scaled:
+             return acc, asc
+         else:
+             return acc
+
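A usage sketch for `get_kde_spacing`, here with the default Doane bin-width method on log-scaled deformation data (file name hypothetical):

    import dclab
    from dclab.rtdc_dataset import RTDCBase

    with dclab.new_dataset("data.rtdc") as ds:  # hypothetical path
        spacing = RTDCBase.get_kde_spacing(ds["deform"], scale="log",
                                           feat="deform")
        print(spacing)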
+     @property
+     def _feature_candidates(self):
+         """List of feature candidates for this dataset
+
+         Use with caution! Features in this list might not actually
+         be available. Always check against `__contains__`.
+         """
+         feats = list(self._events.keys())
+         feats += list(self._usertemp.keys())
+         feats += list(AncillaryFeature.feature_names)
+         feats += self.features_basin
+         feats = sorted(set(feats))
+         # exclude non-standard features
+         featsv = [ff for ff in feats if dfn.feature_exists(ff)]
+         return featsv
+
+     @property
+     def _filter(self):
+         """return the current filter boolean array"""
+         warnings.warn("RTDCBase._filter is deprecated. Please use "
+                       + "RTDCBase.filter.all instead.",
+                       DeprecationWarning)
+         return self.filter.all
+
+     @property
+     def _plot_filter(self):
+         raise NotImplementedError(
+             "RTDCBase._plot_filter has been removed in dclab 0.16.0. "
+             + "Please use the output of RTDCBase.downsample_scatter "
+             + "with the argument ret_mask instead.")
+
+     @property
+     def identifier(self):
+         """Unique (unreproducible) identifier"""
+         return self._identifier
+
+     @property
+     def features(self):
+         """All available features"""
+         features = []
+         for col in self._feature_candidates:
+             if col in self:
+                 features.append(col)
+         features.sort()
+         return features
+
+     @property
+     def features_ancillary(self):
+         """All available ancillary features
+
+         This includes all ancillary features, excluding the features
+         that are already in `self.features_innate`. This means that
+         there may be overlap between `features_ancillary` and e.g.
+         `self.features_basin`.
+
+         .. versionadded:: 0.58.0
+
+         """
+         features_innate = self.features_innate
+         features_ancillary = []
+         for feat in AncillaryFeature.feature_names:
+             if feat not in features_innate and feat in self:
+                 features_ancillary.append(feat)
+         return sorted(features_ancillary)
+
+     @property
+     def features_basin(self):
+         """All features accessed via upstream basins from other locations"""
+         if self._basins_features is None:
+             if self.basins:
+                 features = []
+                 for bn in self.basins:
+                     if bn.features and set(bn.features) <= set(features):
+                         # We already have the features from a different basin.
+                         # There might be a basin availability check going on
+                         # somewhere, but we are not interested in it.
+                         continue
+                     if bn.is_available():
+                         features += bn.features
+                 self._basins_features = sorted(set(features))
+             else:
+                 self._basins_features = []
+         return self._basins_features
+
+     @property
+     def features_innate(self):
+         """All features excluding ancillary, basin, or temporary features"""
+         innate = [ft for ft in self.features if ft in self._events]
+         return innate
+
+     @property
+     def features_loaded(self):
+         """All features that have been computed
+
+         This includes ancillary features and temporary features.
+
+         Notes
+         -----
+         Ancillary features that are computationally cheap to compute are
+         always included. They are defined in
+         :const:`dclab.rtdc_dataset.feat_anc_core.FEATURES_RAPID`.
+         """
+         features_loaded = self.features_local + self.features_innate
+         features_loaded += [f for f in self.features if f in FEATURES_RAPID]
+         return sorted(set(features_loaded))
+
+     @property
+     def features_local(self):
+         """All features that are, with certainty, really fast to access
+
+         Local features are a slimmed-down version of `features_loaded`.
+         Nothing needs to be computed, not even rapid features
+         (:const:`dclab.rtdc_dataset.feat_anc_core.FEATURES_RAPID`).
+         Features from remote sources that have not been downloaded
+         already are excluded. Ancillary and temporary features that are
+         available are included.
+         """
+         features_local = []
+         # Note that the hierarchy format just calls its hparent's
+         # `features_local`.
+         if hasattr(self._events, "_cached_events"):
+             features_local += list(self._events._cached_events.keys())
+
+         if self.format == "hdf5":
+             features_local += list(self._events.keys())
+
+         # Get into the basins.
+         for bn in self.basins:
+             if (bn.basin_format == "hdf5"
+                     and bn.basin_type == "file"
+                     and bn.is_available()):
+                 features_local += bn.ds.features_local
+             elif bn._ds is not None:
+                 features_local += bn.ds.features_local
+
+         # If they are here, then we use them:
+         features_local += list(self._ancillaries.keys())
+         features_local += list(self._usertemp.keys())
+
+         return sorted(set(features_local))
+
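The feature-group properties above are best understood side by side; a sketch (file name hypothetical):

    import dclab

    with dclab.new_dataset("data.rtdc") as ds:  # hypothetical path
        print(ds.features_innate)     # stored in the file itself
        print(ds.features_basin)      # reachable via basins
        print(ds.features_ancillary)  # computable on demand
        print(ds.features_local)      # guaranteed fast access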
+     @property
+     def features_scalar(self):
+         """All scalar features available"""
+         sclr = [ft for ft in self.features if dfn.scalar_feature_exists(ft)]
+         return sclr
+
+     @property
+     @abc.abstractmethod
+     def hash(self):
+         """Reproducible dataset hash (defined by derived classes)"""
+
+     def ignore_basins(self, basin_identifiers):
+         """Ignore these basin identifiers when looking for features
+
+         This is used to avoid circular basin dependencies.
+         """
+         self._basins_ignored += basin_identifiers
+
+     def apply_filter(self, force=None):
+         """Compute the filters for the dataset"""
+         if force is None:
+             force = []
+         self.filter.update(rtdc_ds=self, force=force)
+
+     def close(self):
+         """Close any open files or connections, including basins
+
+         If implemented in a subclass, the subclass must call this
+         method via `super`, otherwise basins are not closed. The
+         subclass is responsible for closing its specific file handles.
+         """
+         if self._basins:
+             for bn in self._basins:
+                 bn.close()
+
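Since `close` is wired into `__exit__` further up, the usual pattern is a `with` block; the box-filter limits below are hypothetical values:

    import dclab

    with dclab.new_dataset("data.rtdc") as ds:  # hypothetical path
        ds.config["filtering"]["deform min"] = 0.0  # hypothetical limits
        ds.config["filtering"]["deform max"] = 0.1
        ds.apply_filter()
        print(ds.filter.all.sum(), "events pass the filter")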
+     def get_downsampled_scatter(self, xax="area_um", yax="deform",
+                                 downsample=0, xscale="linear",
+                                 yscale="linear", remove_invalid=False,
+                                 ret_mask=False):
+         """Downsampling by removing points at dense locations
+
+         Parameters
+         ----------
+         xax: str
+             Identifier for x axis (e.g. "area_um", "aspect", "deform")
+         yax: str
+             Identifier for y axis
+         downsample: int
+             Number of points to draw in the down-sampled plot.
+             This number is either
+
+             - >=1: exactly downsample to this number by randomly adding
+               or removing points
+             - 0: do not perform downsampling
+         xscale: str
+             If set to "log", take the logarithm of the x-values before
+             performing downsampling. This is useful when data are
+             displayed on a log-scale. Defaults to "linear".
+         yscale: str
+             See `xscale`.
+         remove_invalid: bool
+             Remove nan and inf values before downsampling; if set to
+             `True`, the actual number of samples returned might be
+             smaller than `downsample` due to infinite or nan values
+             (e.g. due to logarithmic scaling).
+         ret_mask: bool
+             If set to `True`, returns a boolean array of length
+             `len(self)` where `True` values identify the filtered
+             data.
+
+         Returns
+         -------
+         xnew, ynew: 1d ndarray of length `N`
+             Filtered data; `N` is either identical to `downsample`
+             or smaller (if `remove_invalid==True`)
+         mask: 1d boolean array of length `len(RTDCBase)`
+             Array for identifying the downsampled data points
+         """
+         if downsample < 0:
+             raise ValueError("`downsample` must be zero or positive!")
+
+         downsample = int(downsample)
+         xax = xax.lower()
+         yax = yax.lower()
+
+         # Get data
+         x = self[xax][self.filter.all]
+         y = self[yax][self.filter.all]
+
+         # Apply scale (no change for linear scale)
+         xs = RTDCBase._apply_scale(x, xscale, xax)
+         ys = RTDCBase._apply_scale(y, yscale, yax)
+
+         _, _, idx = downsampling.downsample_grid(xs, ys,
+                                                  samples=downsample,
+                                                  remove_invalid=remove_invalid,
+                                                  ret_idx=True)
+
+         if ret_mask:
+             # Mask is a boolean array of len(self)
+             mask = np.zeros(len(self), dtype=bool)
+             mids = np.where(self.filter.all)[0]
+             mask[mids] = idx
+             return x[idx], y[idx], mask
+         else:
+             return x[idx], y[idx]
+
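A usage sketch for `get_downsampled_scatter` (file name and sample count hypothetical):

    import dclab

    with dclab.new_dataset("data.rtdc") as ds:  # hypothetical path
        x, y, mask = ds.get_downsampled_scatter(
            xax="area_um", yax="deform", downsample=1000,
            xscale="log", remove_invalid=True, ret_mask=True)
        # mask has len(ds) entries; x and y hold at most 1000 points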
+     def get_kde_contour(self, xax="area_um", yax="deform", xacc=None,
+                         yacc=None, kde_type="histogram", kde_kwargs=None,
+                         xscale="linear", yscale="linear"):
+         """Evaluate the kernel density estimate for contour plots
+
+         Parameters
+         ----------
+         xax: str
+             Identifier for X axis (e.g. "area_um", "aspect", "deform")
+         yax: str
+             Identifier for Y axis
+         xacc: float
+             Contour accuracy in x direction
+         yacc: float
+             Contour accuracy in y direction
+         kde_type: str
+             The KDE method to use
+         kde_kwargs: dict
+             Additional keyword arguments to the KDE method
+         xscale: str
+             If set to "log", take the logarithm of the x-values before
+             computing the KDE. This is useful when data are
+             displayed on a log-scale. Defaults to "linear".
+         yscale: str
+             See `xscale`.
+
+         Returns
+         -------
+         X, Y, Z: coordinates
+             The kernel density Z evaluated on a rectangular grid (X, Y).
+         """
+         if kde_kwargs is None:
+             kde_kwargs = {}
+         xax = xax.lower()
+         yax = yax.lower()
+         kde_type = kde_type.lower()
+         if kde_type not in kde_methods.methods:
+             raise ValueError("Not a valid kde type: {}!".format(kde_type))
+
+         # Get data
+         x = self[xax][self.filter.all]
+         y = self[yax][self.filter.all]
+
+         xacc_sc, xs = RTDCBase.get_kde_spacing(
+             a=x,
+             feat=xax,
+             scale=xscale,
+             method=kde_methods.bin_width_doane,
+             ret_scaled=True)
+
+         yacc_sc, ys = RTDCBase.get_kde_spacing(
+             a=y,
+             feat=yax,
+             scale=yscale,
+             method=kde_methods.bin_width_doane,
+             ret_scaled=True)
+
+         if xacc is None or xacc == 0:
+             xacc = xacc_sc / 5
+
+         if yacc is None or yacc == 0:
+             yacc = yacc_sc / 5
+
+         # Ignore infs and nans
+         bad = kde_methods.get_bad_vals(xs, ys)
+         xc = xs[~bad]
+         yc = ys[~bad]
+
+         xnum = int(np.ceil((xc.max() - xc.min()) / xacc))
+         ynum = int(np.ceil((yc.max() - yc.min()) / yacc))
+
+         xlin = np.linspace(xc.min(), xc.max(), xnum, endpoint=True)
+         ylin = np.linspace(yc.min(), yc.max(), ynum, endpoint=True)
+
+         xmesh, ymesh = np.meshgrid(xlin, ylin, indexing="ij")
+
+         kde_fct = kde_methods.methods[kde_type]
+         if len(x):
+             density = kde_fct(events_x=xs, events_y=ys,
+                               xout=xmesh, yout=ymesh,
+                               **kde_kwargs)
+         else:
+             density = np.array([])
+
+         # Convert mesh back to linear scale if applicable
+         if xscale == "log":
+             xmesh = np.exp(xmesh)
+         if yscale == "log":
+             ymesh = np.exp(ymesh)
+
+         return xmesh, ymesh, density
+
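A plotting sketch for `get_kde_contour`; matplotlib is used here only for illustration (file name hypothetical):

    import dclab
    import matplotlib.pyplot as plt

    with dclab.new_dataset("data.rtdc") as ds:  # hypothetical path
        X, Y, Z = ds.get_kde_contour(xax="area_um", yax="deform")
        plt.contour(X, Y, Z)
        plt.xlabel("area_um")
        plt.ylabel("deform")
        plt.show()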
+     def get_kde_scatter(self, xax="area_um", yax="deform", positions=None,
+                         kde_type="histogram", kde_kwargs=None, xscale="linear",
+                         yscale="linear"):
+         """Evaluate the kernel density estimate for scatter plots
+
+         Parameters
+         ----------
+         xax: str
+             Identifier for X axis (e.g. "area_um", "aspect", "deform")
+         yax: str
+             Identifier for Y axis
+         positions: list of two 1d ndarrays or ndarray of shape (2, N)
+             The positions where the KDE will be computed. Note that
+             the KDE estimate is computed from the points that
+             are set in `self.filter.all`.
+         kde_type: str
+             The KDE method to use, see :const:`.kde_methods.methods`
+         kde_kwargs: dict
+             Additional keyword arguments to the KDE method
+         xscale: str
+             If set to "log", take the logarithm of the x-values before
+             computing the KDE. This is useful when data are
+             displayed on a log-scale. Defaults to "linear".
+         yscale: str
+             See `xscale`.
+
+         Returns
+         -------
+         density: 1d ndarray
+             The kernel density evaluated for the filtered data points.
+         """
+         if kde_kwargs is None:
+             kde_kwargs = {}
+         xax = xax.lower()
+         yax = yax.lower()
+         kde_type = kde_type.lower()
+         if kde_type not in kde_methods.methods:
+             raise ValueError("Not a valid kde type: {}!".format(kde_type))
+
+         # Get data
+         x = self[xax][self.filter.all]
+         y = self[yax][self.filter.all]
+
+         # Apply scale (no change for linear scale)
+         xs = RTDCBase._apply_scale(x, xscale, xax)
+         ys = RTDCBase._apply_scale(y, yscale, yax)
+
+         if positions is None:
+             posx = None
+             posy = None
+         else:
+             posx = RTDCBase._apply_scale(positions[0], xscale, xax)
+             posy = RTDCBase._apply_scale(positions[1], yscale, yax)
+
+         kde_fct = kde_methods.methods[kde_type]
+         if len(x):
+             density = kde_fct(events_x=xs, events_y=ys,
+                               xout=posx, yout=posy,
+                               **kde_kwargs)
+         else:
+             density = np.array([])
+
+         return density
+
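A plotting sketch for `get_kde_scatter`: with `positions=None`, the density is evaluated at the filtered events themselves, so it can directly color a scatter plot (file name hypothetical):

    import dclab
    import matplotlib.pyplot as plt

    with dclab.new_dataset("data.rtdc") as ds:  # hypothetical path
        kde = ds.get_kde_scatter(xax="area_um", yax="deform")
        x = ds["area_um"][ds.filter.all]
        y = ds["deform"][ds.filter.all]
        plt.scatter(x, y, c=kde, s=3)
        plt.show()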
+     def basins_get_dicts(self):
+         """Return the list of dictionaries describing the dataset's basins"""
+         # Only implement this for classes that support this
+         return []
+
+     def basins_retrieve(self):
+         """Load all basins available
+
+         .. versionadded:: 0.54.0
+
+         In dclab 0.51.0, we introduced basins, a simple way of combining
+         HDF5-based datasets (including the :class:`.HDF5_S3` format).
+         The idea is to be able to store parts of the dataset
+         (e.g. images) in a separate file that could then be located
+         someplace else (e.g. an S3 object store).
+
+         If an RT-DC file has "basins" defined, then these are sought out and
+         made available via the `features_basin` property.
+
+         .. versionchanged:: 0.57.5
+
+             "file"-type basins are only available for subclasses that
+             set the `_local_basins_allowed` attribute to True.
+         """
+         basins = []
+         bc = feat_basin.get_basin_classes()
+         # Sort basins according to priority
+         bdicts_srt = sorted(self.basins_get_dicts(),
+                             key=feat_basin.basin_priority_sorted_key)
+         # complement basin "key"s (we do the same in writer)
+         for bdict in bdicts_srt:
+             if "key" not in bdict:
+                 b_dat = json.dumps(bdict, indent=2, sort_keys=True).split("\n")
+                 bdict["key"] = hashobj(b_dat)
+
+         bd_keys = [bd["key"] for bd in bdicts_srt]
+         bd_keys += self._basins_ignored
+         for bdict in bdicts_srt:
+             if bdict["format"] not in bc:
+                 warnings.warn(f"Encountered unsupported basin "
+                               f"format '{bdict['format']}'!")
+                 continue
+             if bdict["key"] in self._basins_ignored:
+                 warnings.warn(
+                     f"Encountered cyclic basin dependency '{bdict['key']}'",
+                     feat_basin.CyclicBasinDependencyFoundWarning)
+                 continue
+
+             # Basin initialization keyword arguments
+             kwargs = {
+                 "name": bdict.get("name"),
+                 "description": bdict.get("description"),
+                 # Honor features intended by basin creator.
+                 "features": bdict.get("features"),
+                 # Which mapping we are using ("same", "basinmap1", ...)
+                 "mapping": bdict.get("mapping", "same"),
+                 # For non-identical mapping ("basinmap1", etc.), we
+                 # need the referring dataset.
+                 "mapping_referrer": self,
+                 # Make sure the measurement identifier is checked.
+                 "measurement_identifier": self.get_measurement_identifier(),
+                 # allow to ignore basins
+                 "ignored_basins": bd_keys,
+                 # basin key
+                 "key": bdict["key"],
+             }
+
+             # Check whether this basin is supported and exists
+             if bdict["type"] == "internal":
+                 b_cls = bc[bdict["format"]]
+                 bna = b_cls(bdict["paths"][0], **kwargs)
+                 # In contrast to file-type basins, we just add all remote
+                 # basins without checking first. We do not check for
+                 # the availability of remote basins, because they could
+                 # be temporarily inaccessible (unstable network connection)
+                 # and because checking the availability of remote basins
+                 # normally takes a lot of time.
+                 basins.append(bna)
+             elif bdict["type"] == "file":
+                 if not self._local_basins_allowed:
+                     warnings.warn(f"Basin type 'file' not allowed for format "
+                                   f"'{self.format}'")
+                     # stop processing this basin
+                     continue
+                 p_paths = list(bdict["paths"])
+                 # translate Windows and Unix relative paths
+                 for pi in list(p_paths):  # [sic] create a copy of the list
+                     if pi.count(".."):
+                         if pi[2:].count("/") and os.path.sep == "\\":
+                             # Windows
+                             p_paths.append(pi.replace("/", "\\"))
+                         elif pi[2:].count("\\") and os.path.sep == "/":
+                             # Unix
+                             p_paths.append(pi.replace("\\", "/"))
+                 # perform the actual check
+                 for pp in p_paths:
+                     pp = pathlib.Path(pp)
+                     # Instantiate the proper basin class
+                     b_cls = bc[bdict["format"]]
+                     # Try absolute path
+                     bna = b_cls(pp, **kwargs)
+                     if bna.verify_basin():
+                         basins.append(bna)
+                         break
+                     # Try relative path
+                     this_path = pathlib.Path(self.path)
+                     if this_path.exists():
+                         # Insert relative path
+                         bnr = b_cls(this_path.parent / pp, **kwargs)
+                         if bnr.verify_basin():
+                             basins.append(bnr)
+                             break
+             elif bdict["type"] == "remote":
+                 for url in bdict["urls"]:
+                     # Instantiate the proper basin class
+                     b_cls = bc[bdict["format"]]
+                     bna = b_cls(url, **kwargs)
+                     # In contrast to file-type basins, we just add all remote
+                     # basins without checking first. We do not check for
+                     # the availability of remote basins, because they could
+                     # be temporarily inaccessible (unstable network connection)
+                     # and because checking the availability of remote basins
+                     # normally takes a lot of time.
+                     basins.append(bna)
+             else:
+                 warnings.warn(
+                     f"Encountered unsupported basin type '{bdict['type']}'!")
+         return basins
+
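For orientation, a basin dictionary as consumed by `basins_retrieve` might look as follows; all values are illustrative, not taken from a real file:

    bdict = {
        "type": "remote",                 # "file", "remote" or "internal"
        "format": "http",                 # must match a registered basin class
        "urls": ["https://example.com/data.rtdc"],  # hypothetical URL
        "features": ["image"],            # optional: restrict the feature set
        "mapping": "same",                # or "basinmap1", ... for subsets
    }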
+     def get_measurement_identifier(self):
+         """Return a unique measurement identifier
+
+         Return the [experiment]:"run identifier" configuration value, if it
+         exists. Otherwise, return the MD5 sum computed from the measurement
+         time, date, and setup identifier.
+
+         Returns `None` if no identifier could be found or computed.
+
+         .. versionadded:: 0.51.0
+
+         """
+         identifier = self.config.get("experiment", {}).get("run identifier",
+                                                            None)
+         if identifier is None:
+             time = self.config.get("experiment", {}).get("time", None)
+             date = self.config.get("experiment", {}).get("date", None)
+             sid = self.config.get("setup", {}).get("identifier", None)
+             if None not in [time, date, sid]:
+                 # only compute an identifier if all of the above are defined.
+                 hasher = hashlib.md5(f"{time}_{date}_{sid}".encode("utf-8"))
+                 identifier = str(uuid.UUID(hex=hasher.hexdigest()))
+         return identifier
+
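The fallback branch above can be mirrored with the standard library alone; the time, date, and setup values here are made up:

    import hashlib
    import uuid

    time, date, sid = "10:15:00", "2024-01-01", "setup-1"  # made-up values
    hasher = hashlib.md5(f"{time}_{date}_{sid}".encode("utf-8"))
    print(str(uuid.UUID(hex=hasher.hexdigest())))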
+     def polygon_filter_add(self, filt):
+         """Associate a Polygon Filter with this instance
+
+         Parameters
+         ----------
+         filt: int or instance of `PolygonFilter`
+             The polygon filter to add
+         """
+         self._assert_filter()  # [sic] initialize the filter if not done yet
+         if not isinstance(filt, (PolygonFilter, int, float)):
+             msg = "`filt` must be a number or instance of PolygonFilter!"
+             raise ValueError(msg)
+
+         if isinstance(filt, PolygonFilter):
+             uid = filt.unique_id
+         else:
+             uid = int(filt)
+         # append item
+         self.config["filtering"]["polygon filters"].append(uid)
+
+     def polygon_filter_rm(self, filt):
+         """Remove a polygon filter from this instance
+
+         Parameters
+         ----------
+         filt: int or instance of `PolygonFilter`
+             The polygon filter to remove
+         """
+         if not isinstance(filt, (PolygonFilter, int, float)):
+             msg = "`filt` must be a number or instance of PolygonFilter!"
+             raise ValueError(msg)
+
+         if isinstance(filt, PolygonFilter):
+             uid = filt.unique_id
+         else:
+             uid = int(filt)
+         # remove item
+         self.config["filtering"]["polygon filters"].remove(uid)
+
+     def reset_filter(self):
+         """Reset the current filter"""
+         # reset filter instance
+         self.filter.reset()
+         # reset configuration
+         # remember hierarchy parent
+         hp = self.config["filtering"]["hierarchy parent"]
+         self.config._init_default_filter_values()
+         self.config["filtering"]["hierarchy parent"] = hp