dclab 0.62.7__cp38-cp38-macosx_11_0_arm64.whl → 2.18.0__cp38-cp38-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dclab might be problematic. Click here for more details.
- dclab/_version.py +2 -2
- dclab/definitions/meta_const.py +1 -11
- dclab/downsampling.cpython-38-darwin.so +0 -0
- dclab/external/skimage/_find_contours_cy.cpython-38-darwin.so +0 -0
- dclab/external/skimage/_pnpoly.cpython-38-darwin.so +0 -0
- dclab/external/skimage/_shared/geometry.cpython-38-darwin.so +0 -0
- dclab/http_utils.py +2 -12
- dclab/lme4/__init__.py +4 -4
- dclab/lme4/rlibs.py +93 -0
- dclab/lme4/rsetup.py +153 -150
- dclab/lme4/wrapr.py +129 -93
- dclab/rtdc_dataset/check.py +6 -74
- dclab/rtdc_dataset/copier.py +14 -60
- dclab/rtdc_dataset/feat_basin.py +3 -21
- dclab/rtdc_dataset/fmt_hdf5/base.py +2 -7
- dclab/rtdc_dataset/fmt_hdf5/events.py +3 -3
- dclab/rtdc_dataset/fmt_hierarchy/base.py +1 -0
- dclab/rtdc_dataset/fmt_hierarchy/events.py +3 -4
- dclab/rtdc_dataset/linker.py +124 -0
- dclab/util.py +0 -6
- {dclab-0.62.7.dist-info → dclab-2.18.0.dist-info}/METADATA +4 -2
- {dclab-0.62.7.dist-info → dclab-2.18.0.dist-info}/RECORD +26 -25
- {dclab-0.62.7.dist-info → dclab-2.18.0.dist-info}/WHEEL +1 -1
- dclab/lme4/lme4_template.R +0 -94
- {dclab-0.62.7.dist-info → dclab-2.18.0.dist-info}/LICENSE +0 -0
- {dclab-0.62.7.dist-info → dclab-2.18.0.dist-info}/entry_points.txt +0 -0
- {dclab-0.62.7.dist-info → dclab-2.18.0.dist-info}/top_level.txt +0 -0
dclab/lme4/wrapr.py
CHANGED
|
@@ -1,19 +1,18 @@
|
|
|
1
1
|
"""R lme4 wrapper"""
|
|
2
|
-
import logging
|
|
3
2
|
import numbers
|
|
4
|
-
import
|
|
5
|
-
import tempfile
|
|
3
|
+
import warnings
|
|
6
4
|
|
|
7
|
-
import importlib_resources
|
|
8
5
|
import numpy as np
|
|
9
6
|
|
|
10
7
|
from .. import definitions as dfn
|
|
11
8
|
from ..rtdc_dataset.core import RTDCBase
|
|
12
9
|
|
|
10
|
+
from .rlibs import rpy2
|
|
13
11
|
from . import rsetup
|
|
14
12
|
|
|
15
13
|
|
|
16
|
-
|
|
14
|
+
class Lme4InstallWarning(UserWarning):
|
|
15
|
+
pass
|
|
17
16
|
|
|
18
17
|
|
|
19
18
|
class Rlme4(object):
|
|
@@ -39,12 +38,19 @@ class Rlme4(object):
|
|
|
39
38
|
#: list of [RTDCBase, column, repetition, chip_region]
|
|
40
39
|
self.data = []
|
|
41
40
|
|
|
41
|
+
#: model function
|
|
42
|
+
self.r_func_model = "feature ~ group + (1 + group | repetition)"
|
|
43
|
+
#: null model function
|
|
44
|
+
self.r_func_nullmodel = "feature ~ (1 + group | repetition)"
|
|
45
|
+
|
|
42
46
|
self.set_options(model=model, feature=feature)
|
|
43
47
|
|
|
44
48
|
# Make sure that lme4 is available
|
|
45
49
|
if not rsetup.has_lme4():
|
|
46
|
-
|
|
47
|
-
|
|
50
|
+
warnings.warn("Installing lme4, this may take a while!",
|
|
51
|
+
Lme4InstallWarning)
|
|
52
|
+
rsetup.install_lme4()
|
|
53
|
+
rsetup.import_lme4()
|
|
48
54
|
|
|
49
55
|
def add_dataset(self, ds, group, repetition):
|
|
50
56
|
"""Add a dataset to the analysis list
|
|
@@ -61,8 +67,8 @@ class Rlme4(object):
|
|
|
61
67
|
|
|
62
68
|
Notes
|
|
63
69
|
-----
|
|
64
|
-
- For each repetition, there must be a "treatment"
|
|
65
|
-
"control"
|
|
70
|
+
- For each repetition, there must be a "treatment" and a
|
|
71
|
+
"control" ``group``.
|
|
66
72
|
- If you would like to perform a differential feature analysis,
|
|
67
73
|
then you need to pass at least a reservoir and a channel
|
|
68
74
|
dataset (with same parameters for `group` and `repetition`).
|
|
@@ -96,10 +102,10 @@ class Rlme4(object):
|
|
|
96
102
|
The response variable is modeled using two linear mixed effect
|
|
97
103
|
models:
|
|
98
104
|
|
|
99
|
-
- model:
|
|
100
|
-
|
|
101
|
-
- the null model:
|
|
102
|
-
|
|
105
|
+
- model :const:`Rlme4.r_func_model` (random intercept +
|
|
106
|
+
random slope model)
|
|
107
|
+
- the null model :const:`Rlme4.r_func_nullmodel` (without
|
|
108
|
+
the fixed effect introduced by the "treatment" group).
|
|
103
109
|
|
|
104
110
|
Both models are compared in R using "anova" (from the
|
|
105
111
|
R-package "stats" :cite:`Everitt1992`) which performs a
|
|
@@ -127,16 +133,16 @@ class Rlme4(object):
|
|
|
127
133
|
results: dict
|
|
128
134
|
Dictionary with the results of the fitting process:
|
|
129
135
|
|
|
130
|
-
- "anova p-value": Anova
|
|
136
|
+
- "anova p-value": Anova likelihood ratio test (significance)
|
|
131
137
|
- "feature": name of the feature used for the analysis
|
|
132
138
|
``self.feature``
|
|
133
139
|
- "fixed effects intercept": Mean of ``self.feature`` for all
|
|
134
140
|
controls; In the case of the "glmer+loglink" model, the intercept
|
|
135
|
-
is already
|
|
141
|
+
is already backtransformed from log space.
|
|
136
142
|
- "fixed effects treatment": The fixed effect size between the mean
|
|
137
143
|
of the controls and the mean of the treatments relative to
|
|
138
144
|
"fixed effects intercept"; In the case of the "glmer+loglink"
|
|
139
|
-
model, the fixed effect is already
|
|
145
|
+
model, the fixed effect is already backtransformed from log
|
|
140
146
|
space.
|
|
141
147
|
- "fixed effects repetitions": The effects (intercept and
|
|
142
148
|
treatment) for each repetition. The first axis defines
|
|
@@ -153,10 +159,11 @@ class Rlme4(object):
|
|
|
153
159
|
- "model": model name used for the analysis ``self.model``
|
|
154
160
|
- "model converged": boolean indicating whether the model
|
|
155
161
|
converged
|
|
156
|
-
- "r
|
|
157
|
-
- "r model
|
|
158
|
-
- "r
|
|
159
|
-
- "r
|
|
162
|
+
- "r anova": Anova model (exposed from R)
|
|
163
|
+
- "r model summary": Summary of the model (exposed from R)
|
|
164
|
+
- "r model coefficients": Model coefficient table (exposed from R)
|
|
165
|
+
- "r stderr": errors and warnings from R
|
|
166
|
+
- "r stdout": standard output from R
|
|
160
167
|
"""
|
|
161
168
|
self.set_options(model=model, feature=feature)
|
|
162
169
|
self.check_data()
|
|
@@ -175,38 +182,105 @@ class Rlme4(object):
|
|
|
175
182
|
groups.append(dd[1])
|
|
176
183
|
repetitions.append(dd[2])
|
|
177
184
|
|
|
178
|
-
#
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
185
|
+
# Fire up R
|
|
186
|
+
with rsetup.AutoRConsole() as ac:
|
|
187
|
+
r = rpy2.robjects.r
|
|
188
|
+
|
|
189
|
+
# Load lme4
|
|
190
|
+
rpy2.robjects.packages.importr("lme4")
|
|
191
|
+
|
|
192
|
+
# Concatenate huge arrays for R
|
|
193
|
+
r_features = rpy2.robjects.FloatVector(np.concatenate(features))
|
|
194
|
+
_groups = []
|
|
195
|
+
_repets = []
|
|
196
|
+
for ii in range(len(features)):
|
|
197
|
+
_groups.append(np.repeat(groups[ii], len(features[ii])))
|
|
198
|
+
_repets.append(np.repeat(repetitions[ii], len(features[ii])))
|
|
199
|
+
r_groups = rpy2.robjects.StrVector(np.concatenate(_groups))
|
|
200
|
+
r_repetitions = rpy2.robjects.IntVector(np.concatenate(_repets))
|
|
201
|
+
|
|
202
|
+
# Register groups and repetitions
|
|
203
|
+
rpy2.robjects.globalenv["feature"] = r_features
|
|
204
|
+
rpy2.robjects.globalenv["group"] = r_groups
|
|
205
|
+
rpy2.robjects.globalenv["repetition"] = r_repetitions
|
|
206
|
+
|
|
207
|
+
# Create a dataframe which contains all the data
|
|
208
|
+
r_data = r["data.frame"](r_features, r_groups, r_repetitions)
|
|
209
|
+
|
|
210
|
+
# Random intercept and random slope model
|
|
211
|
+
if self.model == 'glmer+loglink':
|
|
212
|
+
r_model = r["glmer"](self.r_func_model, r_data,
|
|
213
|
+
family=r["Gamma"](link='log'))
|
|
214
|
+
r_nullmodel = r["glmer"](self.r_func_nullmodel, r_data,
|
|
215
|
+
family=r["Gamma"](link='log'))
|
|
216
|
+
else: # lmer
|
|
217
|
+
r_model = r["lmer"](self.r_func_model, r_data)
|
|
218
|
+
r_nullmodel = r["lmer"](self.r_func_nullmodel, r_data)
|
|
219
|
+
|
|
220
|
+
# Anova analysis (increase verbosity by making models global)
|
|
221
|
+
# Using anova is a very conservative way of determining
|
|
222
|
+
# p values.
|
|
223
|
+
rpy2.robjects.globalenv["Model"] = r_model
|
|
224
|
+
rpy2.robjects.globalenv["NullModel"] = r_nullmodel
|
|
225
|
+
r_anova = r("anova(Model, NullModel)")
|
|
226
|
+
try:
|
|
227
|
+
pvalue = r_anova.rx2["Pr(>Chisq)"][1]
|
|
228
|
+
except ValueError: # rpy2 2.9.4
|
|
229
|
+
pvalue = r_anova[7][1]
|
|
230
|
+
r_model_summary = r["summary"](r_model)
|
|
231
|
+
r_model_coefficients = r["coef"](r_model)
|
|
232
|
+
try:
|
|
233
|
+
fe_reps = np.array(r_model_coefficients.rx2["repetition"])
|
|
234
|
+
except ValueError: # rpy2 2.9.4
|
|
235
|
+
fe_reps = np.concatenate((
|
|
236
|
+
np.array(r_model_coefficients[0][0]).reshape(1, -1),
|
|
237
|
+
np.array(r_model_coefficients[0][1]).reshape(1, -1)),
|
|
238
|
+
axis=0)
|
|
239
|
+
|
|
240
|
+
r_effects = r["data.frame"](r["coef"](r_model_summary))
|
|
241
|
+
try:
|
|
242
|
+
fe_icept = r_effects.rx2["Estimate"][0]
|
|
243
|
+
fe_treat = r_effects.rx2["Estimate"][1]
|
|
244
|
+
except ValueError: # rpy2 2.9.4
|
|
245
|
+
fe_icept = r_effects[0][0]
|
|
246
|
+
fe_treat = r_effects[0][1]
|
|
247
|
+
if self.model == "glmer+loglink":
|
|
248
|
+
# transform back from log
|
|
249
|
+
fe_treat = np.exp(fe_icept + fe_treat) - np.exp(fe_icept)
|
|
250
|
+
fe_icept = np.exp(fe_icept)
|
|
251
|
+
fe_reps[:, 1] = np.exp(fe_reps[:, 0] + fe_reps[:, 1]) \
|
|
252
|
+
- np.exp(fe_reps[:, 0])
|
|
253
|
+
fe_reps[:, 0] = np.exp(fe_reps[:, 0])
|
|
254
|
+
|
|
255
|
+
# convergence
|
|
256
|
+
try:
|
|
257
|
+
lme4l = r_model_summary.rx2["optinfo"].rx2["conv"].rx2["lme4"]
|
|
258
|
+
except ValueError: # rpy2 2.9.4
|
|
259
|
+
lme4l = r_model_summary[17][3][1]
|
|
260
|
+
|
|
261
|
+
if lme4l and "code" in lme4l.names:
|
|
262
|
+
try:
|
|
263
|
+
conv_code = lme4l.rx2["code"]
|
|
264
|
+
except ValueError: # rpy2 2.9.4
|
|
265
|
+
conv_code = lme4l[0]
|
|
266
|
+
else:
|
|
267
|
+
conv_code = 0
|
|
268
|
+
|
|
269
|
+
ret_dict = {
|
|
270
|
+
"anova p-value": pvalue,
|
|
271
|
+
"feature": self.feature,
|
|
272
|
+
"fixed effects intercept": fe_icept,
|
|
273
|
+
"fixed effects treatment": fe_treat, # aka "fixed effect"
|
|
274
|
+
"fixed effects repetitions": fe_reps,
|
|
275
|
+
"is differential": self.is_differential(),
|
|
276
|
+
"model": self.model,
|
|
277
|
+
"model converged": conv_code == 0,
|
|
278
|
+
"r anova": r_anova,
|
|
279
|
+
"r model summary": r_model_summary,
|
|
280
|
+
"r model coefficients": r_model_coefficients,
|
|
281
|
+
"r stderr": ac.get_warnerrors(),
|
|
282
|
+
"r stdout": ac.get_prints(),
|
|
283
|
+
}
|
|
210
284
|
return ret_dict
|
|
211
285
|
|
|
212
286
|
def get_differential_dataset(self):
|
|
@@ -214,7 +288,7 @@ class Rlme4(object):
|
|
|
214
288
|
|
|
215
289
|
The most famous use case is differential deformation. The idea
|
|
216
290
|
is that you cannot tell what the difference in deformation
|
|
217
|
-
from channel to reservoir, because you never measure the
|
|
291
|
+
from channel to reservoir is, because you never measure the
|
|
218
292
|
same object in the reservoir and the channel. You usually just
|
|
219
293
|
have two distributions. Comparing distributions is possible
|
|
220
294
|
via bootstrapping. And then, instead of running the lme4
|
|
@@ -288,34 +362,6 @@ class Rlme4(object):
|
|
|
288
362
|
else:
|
|
289
363
|
return False
|
|
290
364
|
|
|
291
|
-
def parse_result(self, result):
|
|
292
|
-
resd = result.split("OUTPUT")
|
|
293
|
-
ret_dict = {}
|
|
294
|
-
for item in resd:
|
|
295
|
-
string = item.split("#*#")[0]
|
|
296
|
-
key, value = string.split(":", 1)
|
|
297
|
-
key = key.strip()
|
|
298
|
-
value = value.strip().replace("\n\n", "\n")
|
|
299
|
-
|
|
300
|
-
if key == "fixed effects repetitions":
|
|
301
|
-
rows = value.split("\n")[1:]
|
|
302
|
-
reps = []
|
|
303
|
-
for row in rows:
|
|
304
|
-
reps.append([float(vv) for vv in row.split()[1:]])
|
|
305
|
-
value = np.array(reps).transpose()
|
|
306
|
-
elif key == "model converged":
|
|
307
|
-
value = value == "TRUE"
|
|
308
|
-
elif value == "NA":
|
|
309
|
-
value = np.nan
|
|
310
|
-
else:
|
|
311
|
-
try:
|
|
312
|
-
value = float(value)
|
|
313
|
-
except ValueError:
|
|
314
|
-
pass
|
|
315
|
-
|
|
316
|
-
ret_dict[key] = value
|
|
317
|
-
return ret_dict
|
|
318
|
-
|
|
319
365
|
def set_options(self, model=None, feature=None):
|
|
320
366
|
"""Set analysis options"""
|
|
321
367
|
if model is not None:
|
|
@@ -326,16 +372,6 @@ class Rlme4(object):
|
|
|
326
372
|
self.feature = feature
|
|
327
373
|
|
|
328
374
|
|
|
329
|
-
def arr2str(a):
|
|
330
|
-
"""Convert an array to a string"""
|
|
331
|
-
if isinstance(a.dtype.type, np.integer):
|
|
332
|
-
return ",".join(str(dd) for dd in a.tolist())
|
|
333
|
-
elif a.dtype.type == np.str_:
|
|
334
|
-
return ",".join(f"'{dd}'" for dd in a.tolist())
|
|
335
|
-
else:
|
|
336
|
-
return ",".join(f"{dd:.16g}" for dd in a.tolist())
|
|
337
|
-
|
|
338
|
-
|
|
339
375
|
def bootstrapped_median_distributions(a, b, bs_iter=1000, rs=117):
|
|
340
376
|
"""Compute the bootstrapped distributions for two arrays.
|
|
341
377
|
|
|
@@ -345,7 +381,7 @@ def bootstrapped_median_distributions(a, b, bs_iter=1000, rs=117):
|
|
|
345
381
|
Input data
|
|
346
382
|
bs_iter: int
|
|
347
383
|
Number of bootstrapping iterations to perform
|
|
348
|
-
(
|
|
384
|
+
(output size).
|
|
349
385
|
rs: int
|
|
350
386
|
Random state seed for random number generator
|
|
351
387
|
|
|
@@ -360,7 +396,7 @@ def bootstrapped_median_distributions(a, b, bs_iter=1000, rs=117):
|
|
|
360
396
|
|
|
361
397
|
Notes
|
|
362
398
|
-----
|
|
363
|
-
From a
|
|
399
|
+
From a programmatic point of view, it would have been better
|
|
364
400
|
to implement this method for just one input array (because of
|
|
365
401
|
redundant code). However, due to historical reasons (testing
|
|
366
402
|
and comparability to Shape-Out 1), bootstrapping is done
|
dclab/rtdc_dataset/check.py
CHANGED
|
@@ -8,7 +8,6 @@ import numpy as np
|
|
|
8
8
|
|
|
9
9
|
from .copier import is_properly_compressed
|
|
10
10
|
from .core import RTDCBase
|
|
11
|
-
from .fmt_hdf5 import RTDC_HDF5
|
|
12
11
|
from .fmt_hierarchy import RTDC_Hierarchy
|
|
13
12
|
from .load import load_file
|
|
14
13
|
|
|
@@ -193,13 +192,14 @@ class IntegrityChecker(object):
|
|
|
193
192
|
else:
|
|
194
193
|
with warnings.catch_warnings(record=True) as ws:
|
|
195
194
|
warnings.simplefilter("always")
|
|
196
|
-
self.ds = load_file(path_or_ds
|
|
195
|
+
self.ds = load_file(path_or_ds)
|
|
197
196
|
for ww in ws:
|
|
198
197
|
self.warn_cues.append(ICue(
|
|
199
198
|
msg=f"{ww.category.__name__}: {ww.message}",
|
|
200
199
|
level="alert",
|
|
201
200
|
category="warning"))
|
|
202
201
|
self.finally_close = True
|
|
202
|
+
np.max(self.ds["index"])
|
|
203
203
|
|
|
204
204
|
def __enter__(self):
|
|
205
205
|
return self
|
|
@@ -260,24 +260,13 @@ class IntegrityChecker(object):
|
|
|
260
260
|
level="alert",
|
|
261
261
|
category="basin data",
|
|
262
262
|
))
|
|
263
|
-
|
|
264
|
-
if
|
|
263
|
+
for feat in bn["features"]:
|
|
264
|
+
if feat not in self.ds.h5file[bpaths[0]]:
|
|
265
265
|
cues.append(
|
|
266
|
-
ICue(msg="Missing internal basin
|
|
267
|
-
"'basin_events', although an internal "
|
|
268
|
-
"basin is defined",
|
|
266
|
+
ICue(msg=f"Missing internal basin feature {feat}",
|
|
269
267
|
level="violation",
|
|
270
268
|
category="basin data",
|
|
271
269
|
))
|
|
272
|
-
else:
|
|
273
|
-
for feat in bn["features"]:
|
|
274
|
-
if feat not in self.ds.h5file["basin_events"]:
|
|
275
|
-
cues.append(
|
|
276
|
-
ICue(msg=f"Missing internal basin "
|
|
277
|
-
f"feature {feat}",
|
|
278
|
-
level="violation",
|
|
279
|
-
category="basin data",
|
|
280
|
-
))
|
|
281
270
|
return cues
|
|
282
271
|
|
|
283
272
|
def check_compression(self, **kwargs):
|
|
@@ -330,32 +319,8 @@ class IntegrityChecker(object):
|
|
|
330
319
|
data=data))
|
|
331
320
|
return cues
|
|
332
321
|
|
|
333
|
-
def check_empty(self, **kwargs):
|
|
334
|
-
"""The dataset should contain events"""
|
|
335
|
-
cues = []
|
|
336
|
-
lends = len(self.ds)
|
|
337
|
-
if lends == 0:
|
|
338
|
-
cues.append(ICue(
|
|
339
|
-
msg="The dataset does not contain any events",
|
|
340
|
-
level="alert",
|
|
341
|
-
category="feature data"))
|
|
342
|
-
return cues
|
|
343
|
-
|
|
344
|
-
def check_external_links(self, **kwargs):
|
|
345
|
-
"""An HDF5 dataset should not contain external links"""
|
|
346
|
-
cues = []
|
|
347
|
-
if isinstance(self.ds, RTDC_HDF5):
|
|
348
|
-
has_external, h5object = hdf5_has_external(self.ds.h5file)
|
|
349
|
-
if has_external:
|
|
350
|
-
cues.append(ICue(
|
|
351
|
-
msg=f"The HDF5 file contains at least one external "
|
|
352
|
-
f"link: '{h5object}'",
|
|
353
|
-
level="violation",
|
|
354
|
-
category="format HDF5"))
|
|
355
|
-
return cues
|
|
356
|
-
|
|
357
322
|
def check_feat_index(self, **kwargs):
|
|
358
|
-
"""
|
|
323
|
+
"""Up until"""
|
|
359
324
|
cues = []
|
|
360
325
|
lends = len(self.ds)
|
|
361
326
|
if "index" in self.ds:
|
|
@@ -867,36 +832,3 @@ def check_dataset(path_or_ds):
|
|
|
867
832
|
elif cue.level == "violation":
|
|
868
833
|
viol.append(cue.msg)
|
|
869
834
|
return sorted(viol), sorted(aler), sorted(info)
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
def hdf5_has_external(h5):
|
|
873
|
-
"""Check recursively, whether an h5py object contains external data
|
|
874
|
-
|
|
875
|
-
External data includes binary data in external files, virtual
|
|
876
|
-
datasets, and external links.
|
|
877
|
-
|
|
878
|
-
Returns a tuple of either
|
|
879
|
-
|
|
880
|
-
- `(True, path_ext)` if the object contains external data
|
|
881
|
-
- `(False, None)` if this is not the case
|
|
882
|
-
|
|
883
|
-
where `path_ext` is the path to the group or dataset in `h5`.
|
|
884
|
-
|
|
885
|
-
.. versionadded:: 0.62.0
|
|
886
|
-
|
|
887
|
-
"""
|
|
888
|
-
for key in h5:
|
|
889
|
-
obj = h5[key]
|
|
890
|
-
if (obj.file != h5.file # not in same file
|
|
891
|
-
or (isinstance(obj, h5py.Dataset)
|
|
892
|
-
and (obj.is_virtual # virtual dataset
|
|
893
|
-
or obj.external))): # external dataset
|
|
894
|
-
# These are external data
|
|
895
|
-
return True, f"{h5.name}/{key}".replace("//", "/")
|
|
896
|
-
elif isinstance(obj, h5py.Group):
|
|
897
|
-
# Perform recursive check for external data
|
|
898
|
-
has_ext, path_ext = hdf5_has_external(obj)
|
|
899
|
-
if has_ext:
|
|
900
|
-
return True, path_ext
|
|
901
|
-
else:
|
|
902
|
-
return False, None
|
dclab/rtdc_dataset/copier.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""Helper methods for copying .rtdc data"""
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
|
-
import json
|
|
5
4
|
import re
|
|
6
5
|
from typing import List, Literal
|
|
7
6
|
|
|
@@ -11,10 +10,8 @@ import hdf5plugin
|
|
|
11
10
|
import numpy as np
|
|
12
11
|
|
|
13
12
|
from ..definitions import feature_exists, scalar_feature_exists
|
|
14
|
-
from ..util import hashobj
|
|
15
13
|
|
|
16
|
-
from .fmt_hdf5 import DEFECTIVE_FEATURES
|
|
17
|
-
from .writer import RTDCWriter
|
|
14
|
+
from .fmt_hdf5 import DEFECTIVE_FEATURES
|
|
18
15
|
|
|
19
16
|
|
|
20
17
|
def rtdc_copy(src_h5file: h5py.Group,
|
|
@@ -60,6 +57,19 @@ def rtdc_copy(src_h5file: h5py.Group,
|
|
|
60
57
|
events_src += list(src_h5file["basin_events"].keys())
|
|
61
58
|
events_src = sorted(set(events_src))
|
|
62
59
|
|
|
60
|
+
# basins
|
|
61
|
+
if include_basins and "basins" in src_h5file:
|
|
62
|
+
dst_h5file.require_group("basins")
|
|
63
|
+
for b_key in src_h5file["basins"]:
|
|
64
|
+
if b_key in dst_h5file["basins"]:
|
|
65
|
+
# This basin already exists.
|
|
66
|
+
continue
|
|
67
|
+
h5ds_copy(src_loc=src_h5file["basins"],
|
|
68
|
+
src_name=b_key,
|
|
69
|
+
dst_loc=dst_h5file["basins"],
|
|
70
|
+
dst_name=b_key,
|
|
71
|
+
recursive=False)
|
|
72
|
+
|
|
63
73
|
# logs
|
|
64
74
|
if include_logs and "logs" in src_h5file:
|
|
65
75
|
dst_h5file.require_group("logs")
|
|
@@ -120,12 +130,6 @@ def rtdc_copy(src_h5file: h5py.Group,
|
|
|
120
130
|
if feat in feature_iter:
|
|
121
131
|
feature_iter.remove(feat)
|
|
122
132
|
|
|
123
|
-
# copy basin definitions
|
|
124
|
-
if include_basins and "basins" in src_h5file:
|
|
125
|
-
basin_definition_copy(src_h5file=src_h5file,
|
|
126
|
-
dst_h5file=dst_h5file,
|
|
127
|
-
features_iter=feature_iter)
|
|
128
|
-
|
|
129
133
|
if feature_iter:
|
|
130
134
|
dst_h5file.require_group("events")
|
|
131
135
|
for feat in feature_iter:
|
|
@@ -166,56 +170,6 @@ def rtdc_copy(src_h5file: h5py.Group,
|
|
|
166
170
|
)
|
|
167
171
|
|
|
168
172
|
|
|
169
|
-
def basin_definition_copy(src_h5file, dst_h5file, features_iter):
|
|
170
|
-
"""Copy basin definitions `src_h5file["basins"]` to the new file
|
|
171
|
-
|
|
172
|
-
Normally, we would just use :func:`h5ds_copy` to copy basins from
|
|
173
|
-
one dataset to another. However, if we are e.g. only copying scalar
|
|
174
|
-
features, and there are non-scalar features in the internal basin,
|
|
175
|
-
then we must rewrite the basin definition of the internal basin.
|
|
176
|
-
|
|
177
|
-
The `features_iter` list of features defines which features are
|
|
178
|
-
relevant for the internal basin.
|
|
179
|
-
"""
|
|
180
|
-
dst_h5file.require_group("basins")
|
|
181
|
-
for b_key in src_h5file["basins"]:
|
|
182
|
-
if b_key in dst_h5file["basins"]:
|
|
183
|
-
# This basin already exists.
|
|
184
|
-
continue
|
|
185
|
-
# Load the basin information
|
|
186
|
-
basin_dicts = RTDC_HDF5.basin_get_dicts_from_h5file(src_h5file)
|
|
187
|
-
for bn in basin_dicts:
|
|
188
|
-
if bn["type"] == "internal":
|
|
189
|
-
# Make sure we define the internal features selected
|
|
190
|
-
feat_used = [f for f in bn["features"] if f in features_iter]
|
|
191
|
-
if len(feat_used) == 0:
|
|
192
|
-
# We don't have any internal features, don't write anything
|
|
193
|
-
continue
|
|
194
|
-
elif feat_used != bn["features"]:
|
|
195
|
-
bn["features"] = feat_used
|
|
196
|
-
rewrite = True
|
|
197
|
-
else:
|
|
198
|
-
rewrite = False
|
|
199
|
-
else:
|
|
200
|
-
# We do not have an internal basin, just copy everything
|
|
201
|
-
rewrite = False
|
|
202
|
-
|
|
203
|
-
if rewrite:
|
|
204
|
-
# Convert edited `bn` to JSON and write feature data
|
|
205
|
-
b_lines = json.dumps(bn, indent=2).split("\n")
|
|
206
|
-
key = hashobj(b_lines)
|
|
207
|
-
if key not in dst_h5file["basins"]:
|
|
208
|
-
with RTDCWriter(dst_h5file) as hw:
|
|
209
|
-
hw.write_text(dst_h5file["basins"], key, b_lines)
|
|
210
|
-
else:
|
|
211
|
-
# copy only
|
|
212
|
-
h5ds_copy(src_loc=src_h5file["basins"],
|
|
213
|
-
src_name=b_key,
|
|
214
|
-
dst_loc=dst_h5file["basins"],
|
|
215
|
-
dst_name=b_key,
|
|
216
|
-
recursive=False)
|
|
217
|
-
|
|
218
|
-
|
|
219
173
|
def h5ds_copy(src_loc, src_name, dst_loc, dst_name=None,
|
|
220
174
|
ensure_compression=True, recursive=True):
|
|
221
175
|
"""Copy an HDF5 Dataset from one group to another
|
dclab/rtdc_dataset/feat_basin.py
CHANGED
|
@@ -14,12 +14,6 @@ import weakref
|
|
|
14
14
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
|
|
17
|
-
from ..util import copy_if_needed
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class BasinFeatureMissingWarning(UserWarning):
|
|
21
|
-
"""Used when a badin feature is defined but not stored"""
|
|
22
|
-
|
|
23
17
|
|
|
24
18
|
class CyclicBasinDependencyFoundWarning(UserWarning):
|
|
25
19
|
"""Used when a basin is defined in one of its sub-basins"""
|
|
@@ -411,7 +405,7 @@ class BasinProxyFeature(np.lib.mixins.NDArrayOperatorsMixin):
|
|
|
411
405
|
self._cache = None
|
|
412
406
|
self.is_scalar = bool(len(self.feat_obj.shape) == 1)
|
|
413
407
|
|
|
414
|
-
def __array__(self, dtype=None, copy=
|
|
408
|
+
def __array__(self, dtype=None, copy=False, *args, **kwargs):
|
|
415
409
|
if self._cache is None and self.is_scalar:
|
|
416
410
|
self._cache = self.feat_obj[:][self.basinmap]
|
|
417
411
|
else:
|
|
@@ -502,19 +496,6 @@ class InternalH5DatasetBasin(Basin):
|
|
|
502
496
|
if self._features is None:
|
|
503
497
|
raise ValueError("You must specify features when defining "
|
|
504
498
|
"internal basins.")
|
|
505
|
-
# Redefine the features if necessary
|
|
506
|
-
h5root = self._basinmap_referrer().h5file
|
|
507
|
-
available_features = []
|
|
508
|
-
for feat in self._features:
|
|
509
|
-
if self.location in h5root and feat in h5root[self.location]:
|
|
510
|
-
available_features.append(feat)
|
|
511
|
-
else:
|
|
512
|
-
warnings.warn(
|
|
513
|
-
f"Feature '{feat}' is defined as an internal basin, "
|
|
514
|
-
f"but it cannot be found in '{self.location}'.",
|
|
515
|
-
BasinFeatureMissingWarning)
|
|
516
|
-
self._features.clear()
|
|
517
|
-
self._features += available_features
|
|
518
499
|
|
|
519
500
|
def _load_dataset(self, location, **kwargs):
|
|
520
501
|
from .fmt_dict import RTDC_Dict
|
|
@@ -526,7 +507,8 @@ class InternalH5DatasetBasin(Basin):
|
|
|
526
507
|
return RTDC_Dict(ds_dict)
|
|
527
508
|
|
|
528
509
|
def is_available(self):
|
|
529
|
-
|
|
510
|
+
h5root = self._basinmap_referrer().h5file
|
|
511
|
+
return self.location in h5root
|
|
530
512
|
|
|
531
513
|
def verify_basin(self, *args, **kwargs):
|
|
532
514
|
"""It's not necessary to verify internal basins"""
|
|
@@ -175,15 +175,10 @@ class RTDC_HDF5(RTDCBase):
|
|
|
175
175
|
|
|
176
176
|
def basins_get_dicts(self):
|
|
177
177
|
"""Return list of dicts for all basins defined in `self.h5file`"""
|
|
178
|
-
return self.basin_get_dicts_from_h5file(self.h5file)
|
|
179
|
-
|
|
180
|
-
@staticmethod
|
|
181
|
-
def basin_get_dicts_from_h5file(h5file):
|
|
182
|
-
"""Return list of dicts for all basins defined in `h5file`"""
|
|
183
178
|
basins = []
|
|
184
179
|
# Do not sort anything here, sorting is done in `RTDCBase`.
|
|
185
|
-
for bk in h5file.get("basins", []):
|
|
186
|
-
bdat = list(h5file["basins"][bk])
|
|
180
|
+
for bk in self.h5file.get("basins", []):
|
|
181
|
+
bdat = list(self.h5file["basins"][bk])
|
|
187
182
|
if isinstance(bdat[0], bytes):
|
|
188
183
|
bdat = [bi.decode("utf") for bi in bdat]
|
|
189
184
|
bdict = json.loads(" ".join(bdat))
|
|
@@ -7,7 +7,7 @@ import numbers
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
|
|
9
9
|
from ... import definitions as dfn
|
|
10
|
-
|
|
10
|
+
|
|
11
11
|
|
|
12
12
|
from . import feat_defect
|
|
13
13
|
|
|
@@ -140,7 +140,7 @@ class H5MaskEvent:
|
|
|
140
140
|
self.identifier = (self.h5dataset.file.filename, self.h5dataset.name)
|
|
141
141
|
self.dtype = np.dtype(bool)
|
|
142
142
|
|
|
143
|
-
def __array__(self, dtype=np.bool_, copy=
|
|
143
|
+
def __array__(self, dtype=np.bool_, copy=False, *args, **kwargs):
|
|
144
144
|
if dtype is not np.uint8:
|
|
145
145
|
warnings.warn("Please avoid calling the `__array__` method of the "
|
|
146
146
|
"`H5MaskEvent`. It may consume a lot of memory.",
|
|
@@ -180,7 +180,7 @@ class H5ScalarEvent(np.lib.mixins.NDArrayOperatorsMixin):
|
|
|
180
180
|
# attrs
|
|
181
181
|
self._ufunc_attrs = dict(self.h5ds.attrs)
|
|
182
182
|
|
|
183
|
-
def __array__(self, dtype=None, copy=
|
|
183
|
+
def __array__(self, dtype=None, copy=False, *args, **kwargs):
|
|
184
184
|
if self._array is None:
|
|
185
185
|
self._array = np.asarray(self.h5ds, *args, **kwargs)
|
|
186
186
|
return np.array(self._array, dtype=dtype, copy=copy)
|