dclab 0.62.11__cp39-cp39-macosx_11_0_arm64.whl → 2.18.0__cp39-cp39-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dclab might be problematic. Click here for more details.
- dclab/_version.py +2 -2
- dclab/definitions/meta_const.py +1 -11
- dclab/downsampling.cpython-39-darwin.so +0 -0
- dclab/external/skimage/_find_contours_cy.cpython-39-darwin.so +0 -0
- dclab/external/skimage/_pnpoly.cpython-39-darwin.so +0 -0
- dclab/external/skimage/_shared/geometry.cpython-39-darwin.so +0 -0
- dclab/http_utils.py +2 -12
- dclab/lme4/__init__.py +4 -4
- dclab/lme4/rlibs.py +93 -0
- dclab/lme4/rsetup.py +153 -150
- dclab/lme4/wrapr.py +129 -93
- dclab/rtdc_dataset/check.py +6 -74
- dclab/rtdc_dataset/copier.py +19 -73
- dclab/rtdc_dataset/core.py +2 -12
- dclab/rtdc_dataset/export.py +12 -16
- dclab/rtdc_dataset/feat_basin.py +3 -30
- dclab/rtdc_dataset/fmt_dcor/tables.py +4 -6
- dclab/rtdc_dataset/fmt_hdf5/base.py +2 -7
- dclab/rtdc_dataset/fmt_hdf5/events.py +3 -3
- dclab/rtdc_dataset/fmt_hierarchy/base.py +1 -0
- dclab/rtdc_dataset/fmt_hierarchy/events.py +3 -4
- dclab/rtdc_dataset/linker.py +124 -0
- dclab/rtdc_dataset/writer.py +11 -30
- dclab/util.py +0 -6
- {dclab-0.62.11.dist-info → dclab-2.18.0.dist-info}/METADATA +5 -3
- {dclab-0.62.11.dist-info → dclab-2.18.0.dist-info}/RECORD +30 -29
- {dclab-0.62.11.dist-info → dclab-2.18.0.dist-info}/WHEEL +1 -1
- dclab/lme4/lme4_template.R +0 -94
- {dclab-0.62.11.dist-info → dclab-2.18.0.dist-info}/LICENSE +0 -0
- {dclab-0.62.11.dist-info → dclab-2.18.0.dist-info}/entry_points.txt +0 -0
- {dclab-0.62.11.dist-info → dclab-2.18.0.dist-info}/top_level.txt +0 -0
dclab/lme4/wrapr.py
CHANGED
|
@@ -1,19 +1,18 @@
|
|
|
1
1
|
"""R lme4 wrapper"""
|
|
2
|
-
import logging
|
|
3
2
|
import numbers
|
|
4
|
-
import
|
|
5
|
-
import tempfile
|
|
3
|
+
import warnings
|
|
6
4
|
|
|
7
|
-
import importlib_resources
|
|
8
5
|
import numpy as np
|
|
9
6
|
|
|
10
7
|
from .. import definitions as dfn
|
|
11
8
|
from ..rtdc_dataset.core import RTDCBase
|
|
12
9
|
|
|
10
|
+
from .rlibs import rpy2
|
|
13
11
|
from . import rsetup
|
|
14
12
|
|
|
15
13
|
|
|
16
|
-
|
|
14
|
+
class Lme4InstallWarning(UserWarning):
|
|
15
|
+
pass
|
|
17
16
|
|
|
18
17
|
|
|
19
18
|
class Rlme4(object):
|
|
@@ -39,12 +38,19 @@ class Rlme4(object):
|
|
|
39
38
|
#: list of [RTDCBase, column, repetition, chip_region]
|
|
40
39
|
self.data = []
|
|
41
40
|
|
|
41
|
+
#: model function
|
|
42
|
+
self.r_func_model = "feature ~ group + (1 + group | repetition)"
|
|
43
|
+
#: null model function
|
|
44
|
+
self.r_func_nullmodel = "feature ~ (1 + group | repetition)"
|
|
45
|
+
|
|
42
46
|
self.set_options(model=model, feature=feature)
|
|
43
47
|
|
|
44
48
|
# Make sure that lme4 is available
|
|
45
49
|
if not rsetup.has_lme4():
|
|
46
|
-
|
|
47
|
-
|
|
50
|
+
warnings.warn("Installing lme4, this may take a while!",
|
|
51
|
+
Lme4InstallWarning)
|
|
52
|
+
rsetup.install_lme4()
|
|
53
|
+
rsetup.import_lme4()
|
|
48
54
|
|
|
49
55
|
def add_dataset(self, ds, group, repetition):
|
|
50
56
|
"""Add a dataset to the analysis list
|
|
@@ -61,8 +67,8 @@ class Rlme4(object):
|
|
|
61
67
|
|
|
62
68
|
Notes
|
|
63
69
|
-----
|
|
64
|
-
- For each repetition, there must be a "treatment"
|
|
65
|
-
"control"
|
|
70
|
+
- For each repetition, there must be a "treatment" and a
|
|
71
|
+
"control" ``group``.
|
|
66
72
|
- If you would like to perform a differential feature analysis,
|
|
67
73
|
then you need to pass at least a reservoir and a channel
|
|
68
74
|
dataset (with same parameters for `group` and `repetition`).
|
|
@@ -96,10 +102,10 @@ class Rlme4(object):
|
|
|
96
102
|
The response variable is modeled using two linear mixed effect
|
|
97
103
|
models:
|
|
98
104
|
|
|
99
|
-
- model:
|
|
100
|
-
|
|
101
|
-
- the null model:
|
|
102
|
-
|
|
105
|
+
- model :const:`Rlme4.r_func_model` (random intercept +
|
|
106
|
+
random slope model)
|
|
107
|
+
- the null model :const:`Rlme4.r_func_nullmodel` (without
|
|
108
|
+
the fixed effect introduced by the "treatment" group).
|
|
103
109
|
|
|
104
110
|
Both models are compared in R using "anova" (from the
|
|
105
111
|
R-package "stats" :cite:`Everitt1992`) which performs a
|
|
@@ -127,16 +133,16 @@ class Rlme4(object):
|
|
|
127
133
|
results: dict
|
|
128
134
|
Dictionary with the results of the fitting process:
|
|
129
135
|
|
|
130
|
-
- "anova p-value": Anova
|
|
136
|
+
- "anova p-value": Anova likelihood ratio test (significance)
|
|
131
137
|
- "feature": name of the feature used for the analysis
|
|
132
138
|
``self.feature``
|
|
133
139
|
- "fixed effects intercept": Mean of ``self.feature`` for all
|
|
134
140
|
controls; In the case of the "glmer+loglink" model, the intercept
|
|
135
|
-
is already
|
|
141
|
+
is already backtransformed from log space.
|
|
136
142
|
- "fixed effects treatment": The fixed effect size between the mean
|
|
137
143
|
of the controls and the mean of the treatments relative to
|
|
138
144
|
"fixed effects intercept"; In the case of the "glmer+loglink"
|
|
139
|
-
model, the fixed effect is already
|
|
145
|
+
model, the fixed effect is already backtransformed from log
|
|
140
146
|
space.
|
|
141
147
|
- "fixed effects repetitions": The effects (intercept and
|
|
142
148
|
treatment) for each repetition. The first axis defines
|
|
@@ -153,10 +159,11 @@ class Rlme4(object):
|
|
|
153
159
|
- "model": model name used for the analysis ``self.model``
|
|
154
160
|
- "model converged": boolean indicating whether the model
|
|
155
161
|
converged
|
|
156
|
-
- "r
|
|
157
|
-
- "r model
|
|
158
|
-
- "r
|
|
159
|
-
- "r
|
|
162
|
+
- "r anova": Anova model (exposed from R)
|
|
163
|
+
- "r model summary": Summary of the model (exposed from R)
|
|
164
|
+
- "r model coefficients": Model coefficient table (exposed from R)
|
|
165
|
+
- "r stderr": errors and warnings from R
|
|
166
|
+
- "r stdout": standard output from R
|
|
160
167
|
"""
|
|
161
168
|
self.set_options(model=model, feature=feature)
|
|
162
169
|
self.check_data()
|
|
@@ -175,38 +182,105 @@ class Rlme4(object):
|
|
|
175
182
|
groups.append(dd[1])
|
|
176
183
|
repetitions.append(dd[2])
|
|
177
184
|
|
|
178
|
-
#
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
185
|
+
# Fire up R
|
|
186
|
+
with rsetup.AutoRConsole() as ac:
|
|
187
|
+
r = rpy2.robjects.r
|
|
188
|
+
|
|
189
|
+
# Load lme4
|
|
190
|
+
rpy2.robjects.packages.importr("lme4")
|
|
191
|
+
|
|
192
|
+
# Concatenate huge arrays for R
|
|
193
|
+
r_features = rpy2.robjects.FloatVector(np.concatenate(features))
|
|
194
|
+
_groups = []
|
|
195
|
+
_repets = []
|
|
196
|
+
for ii in range(len(features)):
|
|
197
|
+
_groups.append(np.repeat(groups[ii], len(features[ii])))
|
|
198
|
+
_repets.append(np.repeat(repetitions[ii], len(features[ii])))
|
|
199
|
+
r_groups = rpy2.robjects.StrVector(np.concatenate(_groups))
|
|
200
|
+
r_repetitions = rpy2.robjects.IntVector(np.concatenate(_repets))
|
|
201
|
+
|
|
202
|
+
# Register groups and repetitions
|
|
203
|
+
rpy2.robjects.globalenv["feature"] = r_features
|
|
204
|
+
rpy2.robjects.globalenv["group"] = r_groups
|
|
205
|
+
rpy2.robjects.globalenv["repetition"] = r_repetitions
|
|
206
|
+
|
|
207
|
+
# Create a dataframe which contains all the data
|
|
208
|
+
r_data = r["data.frame"](r_features, r_groups, r_repetitions)
|
|
209
|
+
|
|
210
|
+
# Random intercept and random slope model
|
|
211
|
+
if self.model == 'glmer+loglink':
|
|
212
|
+
r_model = r["glmer"](self.r_func_model, r_data,
|
|
213
|
+
family=r["Gamma"](link='log'))
|
|
214
|
+
r_nullmodel = r["glmer"](self.r_func_nullmodel, r_data,
|
|
215
|
+
family=r["Gamma"](link='log'))
|
|
216
|
+
else: # lmer
|
|
217
|
+
r_model = r["lmer"](self.r_func_model, r_data)
|
|
218
|
+
r_nullmodel = r["lmer"](self.r_func_nullmodel, r_data)
|
|
219
|
+
|
|
220
|
+
# Anova analysis (increase verbosity by making models global)
|
|
221
|
+
# Using anova is a very conservative way of determining
|
|
222
|
+
# p values.
|
|
223
|
+
rpy2.robjects.globalenv["Model"] = r_model
|
|
224
|
+
rpy2.robjects.globalenv["NullModel"] = r_nullmodel
|
|
225
|
+
r_anova = r("anova(Model, NullModel)")
|
|
226
|
+
try:
|
|
227
|
+
pvalue = r_anova.rx2["Pr(>Chisq)"][1]
|
|
228
|
+
except ValueError: # rpy2 2.9.4
|
|
229
|
+
pvalue = r_anova[7][1]
|
|
230
|
+
r_model_summary = r["summary"](r_model)
|
|
231
|
+
r_model_coefficients = r["coef"](r_model)
|
|
232
|
+
try:
|
|
233
|
+
fe_reps = np.array(r_model_coefficients.rx2["repetition"])
|
|
234
|
+
except ValueError: # rpy2 2.9.4
|
|
235
|
+
fe_reps = np.concatenate((
|
|
236
|
+
np.array(r_model_coefficients[0][0]).reshape(1, -1),
|
|
237
|
+
np.array(r_model_coefficients[0][1]).reshape(1, -1)),
|
|
238
|
+
axis=0)
|
|
239
|
+
|
|
240
|
+
r_effects = r["data.frame"](r["coef"](r_model_summary))
|
|
241
|
+
try:
|
|
242
|
+
fe_icept = r_effects.rx2["Estimate"][0]
|
|
243
|
+
fe_treat = r_effects.rx2["Estimate"][1]
|
|
244
|
+
except ValueError: # rpy2 2.9.4
|
|
245
|
+
fe_icept = r_effects[0][0]
|
|
246
|
+
fe_treat = r_effects[0][1]
|
|
247
|
+
if self.model == "glmer+loglink":
|
|
248
|
+
# transform back from log
|
|
249
|
+
fe_treat = np.exp(fe_icept + fe_treat) - np.exp(fe_icept)
|
|
250
|
+
fe_icept = np.exp(fe_icept)
|
|
251
|
+
fe_reps[:, 1] = np.exp(fe_reps[:, 0] + fe_reps[:, 1]) \
|
|
252
|
+
- np.exp(fe_reps[:, 0])
|
|
253
|
+
fe_reps[:, 0] = np.exp(fe_reps[:, 0])
|
|
254
|
+
|
|
255
|
+
# convergence
|
|
256
|
+
try:
|
|
257
|
+
lme4l = r_model_summary.rx2["optinfo"].rx2["conv"].rx2["lme4"]
|
|
258
|
+
except ValueError: # rpy2 2.9.4
|
|
259
|
+
lme4l = r_model_summary[17][3][1]
|
|
260
|
+
|
|
261
|
+
if lme4l and "code" in lme4l.names:
|
|
262
|
+
try:
|
|
263
|
+
conv_code = lme4l.rx2["code"]
|
|
264
|
+
except ValueError: # rpy2 2.9.4
|
|
265
|
+
conv_code = lme4l[0]
|
|
266
|
+
else:
|
|
267
|
+
conv_code = 0
|
|
268
|
+
|
|
269
|
+
ret_dict = {
|
|
270
|
+
"anova p-value": pvalue,
|
|
271
|
+
"feature": self.feature,
|
|
272
|
+
"fixed effects intercept": fe_icept,
|
|
273
|
+
"fixed effects treatment": fe_treat, # aka "fixed effect"
|
|
274
|
+
"fixed effects repetitions": fe_reps,
|
|
275
|
+
"is differential": self.is_differential(),
|
|
276
|
+
"model": self.model,
|
|
277
|
+
"model converged": conv_code == 0,
|
|
278
|
+
"r anova": r_anova,
|
|
279
|
+
"r model summary": r_model_summary,
|
|
280
|
+
"r model coefficients": r_model_coefficients,
|
|
281
|
+
"r stderr": ac.get_warnerrors(),
|
|
282
|
+
"r stdout": ac.get_prints(),
|
|
283
|
+
}
|
|
210
284
|
return ret_dict
|
|
211
285
|
|
|
212
286
|
def get_differential_dataset(self):
|
|
@@ -214,7 +288,7 @@ class Rlme4(object):
|
|
|
214
288
|
|
|
215
289
|
The most famous use case is differential deformation. The idea
|
|
216
290
|
is that you cannot tell what the difference in deformation
|
|
217
|
-
from channel to reservoir, because you never measure the
|
|
291
|
+
from channel to reservoir is, because you never measure the
|
|
218
292
|
same object in the reservoir and the channel. You usually just
|
|
219
293
|
have two distributions. Comparing distributions is possible
|
|
220
294
|
via bootstrapping. And then, instead of running the lme4
|
|
@@ -288,34 +362,6 @@ class Rlme4(object):
|
|
|
288
362
|
else:
|
|
289
363
|
return False
|
|
290
364
|
|
|
291
|
-
def parse_result(self, result):
|
|
292
|
-
resd = result.split("OUTPUT")
|
|
293
|
-
ret_dict = {}
|
|
294
|
-
for item in resd:
|
|
295
|
-
string = item.split("#*#")[0]
|
|
296
|
-
key, value = string.split(":", 1)
|
|
297
|
-
key = key.strip()
|
|
298
|
-
value = value.strip().replace("\n\n", "\n")
|
|
299
|
-
|
|
300
|
-
if key == "fixed effects repetitions":
|
|
301
|
-
rows = value.split("\n")[1:]
|
|
302
|
-
reps = []
|
|
303
|
-
for row in rows:
|
|
304
|
-
reps.append([float(vv) for vv in row.split()[1:]])
|
|
305
|
-
value = np.array(reps).transpose()
|
|
306
|
-
elif key == "model converged":
|
|
307
|
-
value = value == "TRUE"
|
|
308
|
-
elif value == "NA":
|
|
309
|
-
value = np.nan
|
|
310
|
-
else:
|
|
311
|
-
try:
|
|
312
|
-
value = float(value)
|
|
313
|
-
except ValueError:
|
|
314
|
-
pass
|
|
315
|
-
|
|
316
|
-
ret_dict[key] = value
|
|
317
|
-
return ret_dict
|
|
318
|
-
|
|
319
365
|
def set_options(self, model=None, feature=None):
|
|
320
366
|
"""Set analysis options"""
|
|
321
367
|
if model is not None:
|
|
@@ -326,16 +372,6 @@ class Rlme4(object):
|
|
|
326
372
|
self.feature = feature
|
|
327
373
|
|
|
328
374
|
|
|
329
|
-
def arr2str(a):
|
|
330
|
-
"""Convert an array to a string"""
|
|
331
|
-
if isinstance(a.dtype.type, np.integer):
|
|
332
|
-
return ",".join(str(dd) for dd in a.tolist())
|
|
333
|
-
elif a.dtype.type == np.str_:
|
|
334
|
-
return ",".join(f"'{dd}'" for dd in a.tolist())
|
|
335
|
-
else:
|
|
336
|
-
return ",".join(f"{dd:.16g}" for dd in a.tolist())
|
|
337
|
-
|
|
338
|
-
|
|
339
375
|
def bootstrapped_median_distributions(a, b, bs_iter=1000, rs=117):
|
|
340
376
|
"""Compute the bootstrapped distributions for two arrays.
|
|
341
377
|
|
|
@@ -345,7 +381,7 @@ def bootstrapped_median_distributions(a, b, bs_iter=1000, rs=117):
|
|
|
345
381
|
Input data
|
|
346
382
|
bs_iter: int
|
|
347
383
|
Number of bootstrapping iterations to perform
|
|
348
|
-
(
|
|
384
|
+
(output size).
|
|
349
385
|
rs: int
|
|
350
386
|
Random state seed for random number generator
|
|
351
387
|
|
|
@@ -360,7 +396,7 @@ def bootstrapped_median_distributions(a, b, bs_iter=1000, rs=117):
|
|
|
360
396
|
|
|
361
397
|
Notes
|
|
362
398
|
-----
|
|
363
|
-
From a
|
|
399
|
+
From a programmatic point of view, it would have been better
|
|
364
400
|
to implement this method for just one input array (because of
|
|
365
401
|
redundant code). However, due to historical reasons (testing
|
|
366
402
|
and comparability to Shape-Out 1), bootstrapping is done
|
dclab/rtdc_dataset/check.py
CHANGED
|
@@ -8,7 +8,6 @@ import numpy as np
|
|
|
8
8
|
|
|
9
9
|
from .copier import is_properly_compressed
|
|
10
10
|
from .core import RTDCBase
|
|
11
|
-
from .fmt_hdf5 import RTDC_HDF5
|
|
12
11
|
from .fmt_hierarchy import RTDC_Hierarchy
|
|
13
12
|
from .load import load_file
|
|
14
13
|
|
|
@@ -193,13 +192,14 @@ class IntegrityChecker(object):
|
|
|
193
192
|
else:
|
|
194
193
|
with warnings.catch_warnings(record=True) as ws:
|
|
195
194
|
warnings.simplefilter("always")
|
|
196
|
-
self.ds = load_file(path_or_ds
|
|
195
|
+
self.ds = load_file(path_or_ds)
|
|
197
196
|
for ww in ws:
|
|
198
197
|
self.warn_cues.append(ICue(
|
|
199
198
|
msg=f"{ww.category.__name__}: {ww.message}",
|
|
200
199
|
level="alert",
|
|
201
200
|
category="warning"))
|
|
202
201
|
self.finally_close = True
|
|
202
|
+
np.max(self.ds["index"])
|
|
203
203
|
|
|
204
204
|
def __enter__(self):
|
|
205
205
|
return self
|
|
@@ -260,24 +260,13 @@ class IntegrityChecker(object):
|
|
|
260
260
|
level="alert",
|
|
261
261
|
category="basin data",
|
|
262
262
|
))
|
|
263
|
-
|
|
264
|
-
if
|
|
263
|
+
for feat in bn["features"]:
|
|
264
|
+
if feat not in self.ds.h5file[bpaths[0]]:
|
|
265
265
|
cues.append(
|
|
266
|
-
ICue(msg="Missing internal basin
|
|
267
|
-
"'basin_events', although an internal "
|
|
268
|
-
"basin is defined",
|
|
266
|
+
ICue(msg=f"Missing internal basin feature {feat}",
|
|
269
267
|
level="violation",
|
|
270
268
|
category="basin data",
|
|
271
269
|
))
|
|
272
|
-
else:
|
|
273
|
-
for feat in bn["features"]:
|
|
274
|
-
if feat not in self.ds.h5file["basin_events"]:
|
|
275
|
-
cues.append(
|
|
276
|
-
ICue(msg=f"Missing internal basin "
|
|
277
|
-
f"feature {feat}",
|
|
278
|
-
level="violation",
|
|
279
|
-
category="basin data",
|
|
280
|
-
))
|
|
281
270
|
return cues
|
|
282
271
|
|
|
283
272
|
def check_compression(self, **kwargs):
|
|
@@ -330,32 +319,8 @@ class IntegrityChecker(object):
|
|
|
330
319
|
data=data))
|
|
331
320
|
return cues
|
|
332
321
|
|
|
333
|
-
def check_empty(self, **kwargs):
|
|
334
|
-
"""The dataset should contain events"""
|
|
335
|
-
cues = []
|
|
336
|
-
lends = len(self.ds)
|
|
337
|
-
if lends == 0:
|
|
338
|
-
cues.append(ICue(
|
|
339
|
-
msg="The dataset does not contain any events",
|
|
340
|
-
level="alert",
|
|
341
|
-
category="feature data"))
|
|
342
|
-
return cues
|
|
343
|
-
|
|
344
|
-
def check_external_links(self, **kwargs):
|
|
345
|
-
"""An HDF5 dataset should not contain external links"""
|
|
346
|
-
cues = []
|
|
347
|
-
if isinstance(self.ds, RTDC_HDF5):
|
|
348
|
-
has_external, h5object = hdf5_has_external(self.ds.h5file)
|
|
349
|
-
if has_external:
|
|
350
|
-
cues.append(ICue(
|
|
351
|
-
msg=f"The HDF5 file contains at least one external "
|
|
352
|
-
f"link: '{h5object}'",
|
|
353
|
-
level="violation",
|
|
354
|
-
category="format HDF5"))
|
|
355
|
-
return cues
|
|
356
|
-
|
|
357
322
|
def check_feat_index(self, **kwargs):
|
|
358
|
-
"""
|
|
323
|
+
"""Up until"""
|
|
359
324
|
cues = []
|
|
360
325
|
lends = len(self.ds)
|
|
361
326
|
if "index" in self.ds:
|
|
@@ -867,36 +832,3 @@ def check_dataset(path_or_ds):
|
|
|
867
832
|
elif cue.level == "violation":
|
|
868
833
|
viol.append(cue.msg)
|
|
869
834
|
return sorted(viol), sorted(aler), sorted(info)
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
def hdf5_has_external(h5):
|
|
873
|
-
"""Check recursively, whether an h5py object contains external data
|
|
874
|
-
|
|
875
|
-
External data includes binary data in external files, virtual
|
|
876
|
-
datasets, and external links.
|
|
877
|
-
|
|
878
|
-
Returns a tuple of either
|
|
879
|
-
|
|
880
|
-
- `(True, path_ext)` if the object contains external data
|
|
881
|
-
- `(False, None)` if this is not the case
|
|
882
|
-
|
|
883
|
-
where `path_ext` is the path to the group or dataset in `h5`.
|
|
884
|
-
|
|
885
|
-
.. versionadded:: 0.62.0
|
|
886
|
-
|
|
887
|
-
"""
|
|
888
|
-
for key in h5:
|
|
889
|
-
obj = h5[key]
|
|
890
|
-
if (obj.file != h5.file # not in same file
|
|
891
|
-
or (isinstance(obj, h5py.Dataset)
|
|
892
|
-
and (obj.is_virtual # virtual dataset
|
|
893
|
-
or obj.external))): # external dataset
|
|
894
|
-
# These are external data
|
|
895
|
-
return True, f"{h5.name}/{key}".replace("//", "/")
|
|
896
|
-
elif isinstance(obj, h5py.Group):
|
|
897
|
-
# Perform recursive check for external data
|
|
898
|
-
has_ext, path_ext = hdf5_has_external(obj)
|
|
899
|
-
if has_ext:
|
|
900
|
-
return True, path_ext
|
|
901
|
-
else:
|
|
902
|
-
return False, None
|
dclab/rtdc_dataset/copier.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""Helper methods for copying .rtdc data"""
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
|
-
import json
|
|
5
4
|
import re
|
|
6
5
|
from typing import List, Literal
|
|
7
6
|
|
|
@@ -11,10 +10,8 @@ import hdf5plugin
|
|
|
11
10
|
import numpy as np
|
|
12
11
|
|
|
13
12
|
from ..definitions import feature_exists, scalar_feature_exists
|
|
14
|
-
from ..util import hashobj
|
|
15
13
|
|
|
16
|
-
from .fmt_hdf5 import DEFECTIVE_FEATURES
|
|
17
|
-
from .writer import RTDCWriter
|
|
14
|
+
from .fmt_hdf5 import DEFECTIVE_FEATURES
|
|
18
15
|
|
|
19
16
|
|
|
20
17
|
def rtdc_copy(src_h5file: h5py.Group,
|
|
@@ -47,7 +44,8 @@ def rtdc_copy(src_h5file: h5py.Group,
|
|
|
47
44
|
Add this prefix to the name of the logs and tables in `dst_h5file`.
|
|
48
45
|
"""
|
|
49
46
|
# metadata
|
|
50
|
-
|
|
47
|
+
for akey in src_h5file.attrs:
|
|
48
|
+
dst_h5file.attrs[akey] = src_h5file.attrs[akey]
|
|
51
49
|
|
|
52
50
|
# events in source file
|
|
53
51
|
if "events" in src_h5file:
|
|
@@ -59,6 +57,19 @@ def rtdc_copy(src_h5file: h5py.Group,
|
|
|
59
57
|
events_src += list(src_h5file["basin_events"].keys())
|
|
60
58
|
events_src = sorted(set(events_src))
|
|
61
59
|
|
|
60
|
+
# basins
|
|
61
|
+
if include_basins and "basins" in src_h5file:
|
|
62
|
+
dst_h5file.require_group("basins")
|
|
63
|
+
for b_key in src_h5file["basins"]:
|
|
64
|
+
if b_key in dst_h5file["basins"]:
|
|
65
|
+
# This basin already exists.
|
|
66
|
+
continue
|
|
67
|
+
h5ds_copy(src_loc=src_h5file["basins"],
|
|
68
|
+
src_name=b_key,
|
|
69
|
+
dst_loc=dst_h5file["basins"],
|
|
70
|
+
dst_name=b_key,
|
|
71
|
+
recursive=False)
|
|
72
|
+
|
|
62
73
|
# logs
|
|
63
74
|
if include_logs and "logs" in src_h5file:
|
|
64
75
|
dst_h5file.require_group("logs")
|
|
@@ -83,12 +94,11 @@ def rtdc_copy(src_h5file: h5py.Group,
|
|
|
83
94
|
# dst_loc=dst_h5file["tables"],
|
|
84
95
|
# dst_name=meta_prefix + tkey,
|
|
85
96
|
# recursive=False)
|
|
86
|
-
|
|
97
|
+
dst_h5file["tables"].create_dataset(
|
|
87
98
|
name=tkey,
|
|
88
99
|
data=src_h5file["tables"][tkey][:],
|
|
89
100
|
fletcher32=True,
|
|
90
101
|
**hdf5plugin.Zstd(clevel=5))
|
|
91
|
-
copy_table.attrs.update(src_h5file["tables"][tkey].attrs)
|
|
92
102
|
|
|
93
103
|
# events
|
|
94
104
|
if isinstance(features, list):
|
|
@@ -120,12 +130,6 @@ def rtdc_copy(src_h5file: h5py.Group,
|
|
|
120
130
|
if feat in feature_iter:
|
|
121
131
|
feature_iter.remove(feat)
|
|
122
132
|
|
|
123
|
-
# copy basin definitions
|
|
124
|
-
if include_basins and "basins" in src_h5file:
|
|
125
|
-
basin_definition_copy(src_h5file=src_h5file,
|
|
126
|
-
dst_h5file=dst_h5file,
|
|
127
|
-
features_iter=feature_iter)
|
|
128
|
-
|
|
129
133
|
if feature_iter:
|
|
130
134
|
dst_h5file.require_group("events")
|
|
131
135
|
for feat in feature_iter:
|
|
@@ -166,65 +170,6 @@ def rtdc_copy(src_h5file: h5py.Group,
|
|
|
166
170
|
)
|
|
167
171
|
|
|
168
172
|
|
|
169
|
-
def basin_definition_copy(src_h5file, dst_h5file, features_iter):
|
|
170
|
-
"""Copy basin definitions `src_h5file["basins"]` to the new file
|
|
171
|
-
|
|
172
|
-
Normally, we would just use :func:`h5ds_copy` to copy basins from
|
|
173
|
-
one dataset to another. However, if we are e.g. only copying scalar
|
|
174
|
-
features, and there are non-scalar features in the internal basin,
|
|
175
|
-
then we must rewrite the basin definition of the internal basin.
|
|
176
|
-
|
|
177
|
-
The `features_iter` list of features defines which features are
|
|
178
|
-
relevant for the internal basin.
|
|
179
|
-
"""
|
|
180
|
-
dst_h5file.require_group("basins")
|
|
181
|
-
# Load the basin information
|
|
182
|
-
basin_dicts = RTDC_HDF5.basin_get_dicts_from_h5file(src_h5file)
|
|
183
|
-
for bn in basin_dicts:
|
|
184
|
-
b_key = bn["key"]
|
|
185
|
-
|
|
186
|
-
if b_key in dst_h5file["basins"]:
|
|
187
|
-
# already stored therein
|
|
188
|
-
continue
|
|
189
|
-
|
|
190
|
-
# sanity check
|
|
191
|
-
if b_key not in src_h5file["basins"]:
|
|
192
|
-
raise ValueError(
|
|
193
|
-
f"Failed to parse basin information correctly. Source file "
|
|
194
|
-
f"{src_h5file} does not contain basin {b_key} which I got "
|
|
195
|
-
f"from `RTDC_HDF5.basin_get_dicts_from_h5file`.")
|
|
196
|
-
|
|
197
|
-
if bn["type"] == "internal":
|
|
198
|
-
# Make sure we define the internal features selected
|
|
199
|
-
feat_used = [f for f in bn["features"] if f in features_iter]
|
|
200
|
-
if len(feat_used) == 0:
|
|
201
|
-
# We don't have any internal features, don't write anything
|
|
202
|
-
continue
|
|
203
|
-
elif feat_used != bn["features"]:
|
|
204
|
-
bn["features"] = feat_used
|
|
205
|
-
rewrite = True
|
|
206
|
-
else:
|
|
207
|
-
rewrite = False
|
|
208
|
-
else:
|
|
209
|
-
# We do not have an internal basin, just copy everything
|
|
210
|
-
rewrite = False
|
|
211
|
-
|
|
212
|
-
if rewrite:
|
|
213
|
-
# Convert edited `bn` to JSON and write feature data
|
|
214
|
-
b_lines = json.dumps(bn, indent=2).split("\n")
|
|
215
|
-
key = hashobj(b_lines)
|
|
216
|
-
if key not in dst_h5file["basins"]:
|
|
217
|
-
with RTDCWriter(dst_h5file) as hw:
|
|
218
|
-
hw.write_text(dst_h5file["basins"], key, b_lines)
|
|
219
|
-
else:
|
|
220
|
-
# copy only
|
|
221
|
-
h5ds_copy(src_loc=src_h5file["basins"],
|
|
222
|
-
src_name=b_key,
|
|
223
|
-
dst_loc=dst_h5file["basins"],
|
|
224
|
-
dst_name=b_key,
|
|
225
|
-
recursive=False)
|
|
226
|
-
|
|
227
|
-
|
|
228
173
|
def h5ds_copy(src_loc, src_name, dst_loc, dst_name=None,
|
|
229
174
|
ensure_compression=True, recursive=True):
|
|
230
175
|
"""Copy an HDF5 Dataset from one group to another
|
|
@@ -312,7 +257,8 @@ def h5ds_copy(src_loc, src_name, dst_loc, dst_name=None,
|
|
|
312
257
|
for chunk in src.iter_chunks():
|
|
313
258
|
dst[chunk] = src[chunk]
|
|
314
259
|
# Also write all the attributes
|
|
315
|
-
|
|
260
|
+
for key in src.attrs:
|
|
261
|
+
dst.attrs[key] = src.attrs[key]
|
|
316
262
|
else:
|
|
317
263
|
# Copy the Dataset to the destination as-is.
|
|
318
264
|
h5py.h5o.copy(src_loc=src_loc.id,
|
dclab/rtdc_dataset/core.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""RT-DC dataset core classes and methods"""
|
|
2
2
|
import abc
|
|
3
3
|
import hashlib
|
|
4
|
-
import json
|
|
5
4
|
import os.path
|
|
6
5
|
import pathlib
|
|
7
6
|
import traceback
|
|
@@ -16,7 +15,6 @@ from .. import definitions as dfn
|
|
|
16
15
|
from .. import downsampling
|
|
17
16
|
from ..polygon_filter import PolygonFilter
|
|
18
17
|
from .. import kde_methods
|
|
19
|
-
from ..util import hashobj
|
|
20
18
|
|
|
21
19
|
from .feat_anc_core import AncillaryFeature, FEATURES_RAPID
|
|
22
20
|
from . import feat_basin
|
|
@@ -827,20 +825,14 @@ class RTDCBase(abc.ABC):
|
|
|
827
825
|
# Sort basins according to priority
|
|
828
826
|
bdicts_srt = sorted(self.basins_get_dicts(),
|
|
829
827
|
key=feat_basin.basin_priority_sorted_key)
|
|
830
|
-
|
|
831
|
-
for bdict in bdicts_srt:
|
|
832
|
-
if "key" not in bdict:
|
|
833
|
-
b_dat = json.dumps(bdict, indent=2, sort_keys=True).split("\n")
|
|
834
|
-
bdict["key"] = hashobj(b_dat)
|
|
835
|
-
|
|
836
|
-
bd_keys = [bd["key"] for bd in bdicts_srt]
|
|
828
|
+
bd_keys = [bd["key"] for bd in bdicts_srt if "key" in bd]
|
|
837
829
|
bd_keys += self._basins_ignored
|
|
838
830
|
for bdict in bdicts_srt:
|
|
839
831
|
if bdict["format"] not in bc:
|
|
840
832
|
warnings.warn(f"Encountered unsupported basin "
|
|
841
833
|
f"format '{bdict['format']}'!")
|
|
842
834
|
continue
|
|
843
|
-
if bdict["key"] in self._basins_ignored:
|
|
835
|
+
if "key" in bdict and bdict["key"] in self._basins_ignored:
|
|
844
836
|
warnings.warn(
|
|
845
837
|
f"Encountered cyclic basin dependency '{bdict['key']}'",
|
|
846
838
|
feat_basin.CyclicBasinDependencyFoundWarning)
|
|
@@ -861,8 +853,6 @@ class RTDCBase(abc.ABC):
|
|
|
861
853
|
"measurement_identifier": self.get_measurement_identifier(),
|
|
862
854
|
# allow to ignore basins
|
|
863
855
|
"ignored_basins": bd_keys,
|
|
864
|
-
# basin key
|
|
865
|
-
"key": bdict["key"],
|
|
866
856
|
}
|
|
867
857
|
|
|
868
858
|
# Check whether this basin is supported and exists
|