dclab-0.67.0-cp314-cp314t-macosx_10_13_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dclab might be problematic.
- dclab/__init__.py +41 -0
- dclab/_version.py +34 -0
- dclab/cached.py +97 -0
- dclab/cli/__init__.py +10 -0
- dclab/cli/common.py +237 -0
- dclab/cli/task_compress.py +126 -0
- dclab/cli/task_condense.py +223 -0
- dclab/cli/task_join.py +229 -0
- dclab/cli/task_repack.py +98 -0
- dclab/cli/task_split.py +154 -0
- dclab/cli/task_tdms2rtdc.py +186 -0
- dclab/cli/task_verify_dataset.py +75 -0
- dclab/definitions/__init__.py +79 -0
- dclab/definitions/feat_const.py +202 -0
- dclab/definitions/feat_logic.py +182 -0
- dclab/definitions/meta_const.py +252 -0
- dclab/definitions/meta_logic.py +111 -0
- dclab/definitions/meta_parse.py +94 -0
- dclab/downsampling.cpython-314t-darwin.so +0 -0
- dclab/downsampling.pyx +230 -0
- dclab/external/__init__.py +4 -0
- dclab/external/packaging/LICENSE +3 -0
- dclab/external/packaging/LICENSE.APACHE +177 -0
- dclab/external/packaging/LICENSE.BSD +23 -0
- dclab/external/packaging/__init__.py +6 -0
- dclab/external/packaging/_structures.py +61 -0
- dclab/external/packaging/version.py +505 -0
- dclab/external/skimage/LICENSE +28 -0
- dclab/external/skimage/__init__.py +2 -0
- dclab/external/skimage/_find_contours.py +216 -0
- dclab/external/skimage/_find_contours_cy.cpython-314t-darwin.so +0 -0
- dclab/external/skimage/_find_contours_cy.pyx +188 -0
- dclab/external/skimage/_pnpoly.cpython-314t-darwin.so +0 -0
- dclab/external/skimage/_pnpoly.pyx +99 -0
- dclab/external/skimage/_shared/__init__.py +1 -0
- dclab/external/skimage/_shared/geometry.cpython-314t-darwin.so +0 -0
- dclab/external/skimage/_shared/geometry.pxd +6 -0
- dclab/external/skimage/_shared/geometry.pyx +55 -0
- dclab/external/skimage/measure.py +7 -0
- dclab/external/skimage/pnpoly.py +53 -0
- dclab/external/statsmodels/LICENSE +35 -0
- dclab/external/statsmodels/__init__.py +6 -0
- dclab/external/statsmodels/nonparametric/__init__.py +1 -0
- dclab/external/statsmodels/nonparametric/_kernel_base.py +203 -0
- dclab/external/statsmodels/nonparametric/kernel_density.py +165 -0
- dclab/external/statsmodels/nonparametric/kernels.py +36 -0
- dclab/features/__init__.py +9 -0
- dclab/features/bright.py +81 -0
- dclab/features/bright_bc.py +93 -0
- dclab/features/bright_perc.py +63 -0
- dclab/features/contour.py +161 -0
- dclab/features/emodulus/__init__.py +339 -0
- dclab/features/emodulus/load.py +252 -0
- dclab/features/emodulus/lut_HE-2D-FEM-22.txt +16432 -0
- dclab/features/emodulus/lut_HE-3D-FEM-22.txt +1276 -0
- dclab/features/emodulus/lut_LE-2D-FEM-19.txt +13082 -0
- dclab/features/emodulus/pxcorr.py +135 -0
- dclab/features/emodulus/scale_linear.py +247 -0
- dclab/features/emodulus/viscosity.py +260 -0
- dclab/features/fl_crosstalk.py +95 -0
- dclab/features/inert_ratio.py +377 -0
- dclab/features/volume.py +242 -0
- dclab/http_utils.py +322 -0
- dclab/isoelastics/__init__.py +468 -0
- dclab/isoelastics/iso_HE-2D-FEM-22-area_um-deform.txt +2440 -0
- dclab/isoelastics/iso_HE-2D-FEM-22-volume-deform.txt +2635 -0
- dclab/isoelastics/iso_HE-3D-FEM-22-area_um-deform.txt +1930 -0
- dclab/isoelastics/iso_HE-3D-FEM-22-volume-deform.txt +2221 -0
- dclab/isoelastics/iso_LE-2D-FEM-19-area_um-deform.txt +2151 -0
- dclab/isoelastics/iso_LE-2D-FEM-19-volume-deform.txt +2250 -0
- dclab/isoelastics/iso_LE-2D-ana-18-area_um-deform.txt +1266 -0
- dclab/kde/__init__.py +1 -0
- dclab/kde/base.py +459 -0
- dclab/kde/contours.py +222 -0
- dclab/kde/methods.py +313 -0
- dclab/kde_contours.py +10 -0
- dclab/kde_methods.py +11 -0
- dclab/lme4/__init__.py +5 -0
- dclab/lme4/lme4_template.R +94 -0
- dclab/lme4/rsetup.py +204 -0
- dclab/lme4/wrapr.py +386 -0
- dclab/polygon_filter.py +398 -0
- dclab/rtdc_dataset/__init__.py +15 -0
- dclab/rtdc_dataset/check.py +902 -0
- dclab/rtdc_dataset/config.py +533 -0
- dclab/rtdc_dataset/copier.py +353 -0
- dclab/rtdc_dataset/core.py +896 -0
- dclab/rtdc_dataset/export.py +867 -0
- dclab/rtdc_dataset/feat_anc_core/__init__.py +24 -0
- dclab/rtdc_dataset/feat_anc_core/af_basic.py +75 -0
- dclab/rtdc_dataset/feat_anc_core/af_emodulus.py +160 -0
- dclab/rtdc_dataset/feat_anc_core/af_fl_max_ctc.py +133 -0
- dclab/rtdc_dataset/feat_anc_core/af_image_contour.py +113 -0
- dclab/rtdc_dataset/feat_anc_core/af_ml_class.py +102 -0
- dclab/rtdc_dataset/feat_anc_core/ancillary_feature.py +320 -0
- dclab/rtdc_dataset/feat_anc_ml/__init__.py +32 -0
- dclab/rtdc_dataset/feat_anc_plugin/__init__.py +3 -0
- dclab/rtdc_dataset/feat_anc_plugin/plugin_feature.py +329 -0
- dclab/rtdc_dataset/feat_basin.py +762 -0
- dclab/rtdc_dataset/feat_temp.py +102 -0
- dclab/rtdc_dataset/filter.py +263 -0
- dclab/rtdc_dataset/fmt_dcor/__init__.py +7 -0
- dclab/rtdc_dataset/fmt_dcor/access_token.py +52 -0
- dclab/rtdc_dataset/fmt_dcor/api.py +173 -0
- dclab/rtdc_dataset/fmt_dcor/base.py +299 -0
- dclab/rtdc_dataset/fmt_dcor/basin.py +73 -0
- dclab/rtdc_dataset/fmt_dcor/logs.py +26 -0
- dclab/rtdc_dataset/fmt_dcor/tables.py +66 -0
- dclab/rtdc_dataset/fmt_dict.py +103 -0
- dclab/rtdc_dataset/fmt_hdf5/__init__.py +6 -0
- dclab/rtdc_dataset/fmt_hdf5/base.py +192 -0
- dclab/rtdc_dataset/fmt_hdf5/basin.py +30 -0
- dclab/rtdc_dataset/fmt_hdf5/events.py +276 -0
- dclab/rtdc_dataset/fmt_hdf5/feat_defect.py +164 -0
- dclab/rtdc_dataset/fmt_hdf5/logs.py +33 -0
- dclab/rtdc_dataset/fmt_hdf5/tables.py +60 -0
- dclab/rtdc_dataset/fmt_hierarchy/__init__.py +11 -0
- dclab/rtdc_dataset/fmt_hierarchy/base.py +278 -0
- dclab/rtdc_dataset/fmt_hierarchy/events.py +146 -0
- dclab/rtdc_dataset/fmt_hierarchy/hfilter.py +140 -0
- dclab/rtdc_dataset/fmt_hierarchy/mapper.py +134 -0
- dclab/rtdc_dataset/fmt_http.py +102 -0
- dclab/rtdc_dataset/fmt_s3.py +354 -0
- dclab/rtdc_dataset/fmt_tdms/__init__.py +476 -0
- dclab/rtdc_dataset/fmt_tdms/event_contour.py +264 -0
- dclab/rtdc_dataset/fmt_tdms/event_image.py +220 -0
- dclab/rtdc_dataset/fmt_tdms/event_mask.py +62 -0
- dclab/rtdc_dataset/fmt_tdms/event_trace.py +146 -0
- dclab/rtdc_dataset/fmt_tdms/exc.py +37 -0
- dclab/rtdc_dataset/fmt_tdms/naming.py +151 -0
- dclab/rtdc_dataset/load.py +77 -0
- dclab/rtdc_dataset/meta_table.py +25 -0
- dclab/rtdc_dataset/writer.py +1019 -0
- dclab/statistics.py +226 -0
- dclab/util.py +176 -0
- dclab/warn.py +15 -0
- dclab-0.67.0.dist-info/METADATA +153 -0
- dclab-0.67.0.dist-info/RECORD +142 -0
- dclab-0.67.0.dist-info/WHEEL +6 -0
- dclab-0.67.0.dist-info/entry_points.txt +8 -0
- dclab-0.67.0.dist-info/licenses/LICENSE +283 -0
- dclab-0.67.0.dist-info/top_level.txt +1 -0
dclab/rtdc_dataset/copier.py
@@ -0,0 +1,353 @@
+"""Helper methods for copying .rtdc data"""
+from __future__ import annotations
+
+import json
+import re
+from typing import List, Literal
+
+import h5py
+import h5py.h5o
+import hdf5plugin
+import numpy as np
+
+from ..definitions import feature_exists, scalar_feature_exists
+from ..util import hashobj
+
+from .fmt_hdf5 import DEFECTIVE_FEATURES, RTDC_HDF5
+from .writer import RTDCWriter
+
+
+def rtdc_copy(src_h5file: h5py.Group,
+              dst_h5file: h5py.Group,
+              features: List[str] | Literal['all', 'scalar', 'none'] = "all",
+              include_basins: bool = True,
+              include_logs: bool = True,
+              include_tables: bool = True,
+              meta_prefix: str = ""):
+    """Create a compressed copy of an RT-DC file
+
+    Parameters
+    ----------
+    src_h5file: h5py.Group
+        Input HDF5 file
+    dst_h5file: h5py.Group
+        Output HDF5 file
+    features: list of strings or one of ['all', 'scalar', 'none']
+        If this is a list then it specifies the features that are copied from
+        `src_h5file` to `dst_h5file`. Alternatively, you may specify 'all'
+        (copy all features), 'scalar' (copy only scalar features), or 'none'
+        (don't copy any features).
+    include_basins: bool
+        Copy the basin information from `src_h5file` to `dst_h5file`.
+    include_logs: bool
+        Copy the logs from `src_h5file` to `dst_h5file`.
+    include_tables: bool
+        Copy the tables from `src_h5file` to `dst_h5file`.
+    meta_prefix: str
+        Add this prefix to the name of the logs and tables in `dst_h5file`.
+    """
+    # metadata
+    dst_h5file.attrs.update(src_h5file.attrs)
+
+    # events in source file
+    if "events" in src_h5file:
+        events_src = list(src_h5file["events"].keys())
+    else:
+        events_src = []
+
+    if include_basins and "basin_events" in src_h5file:
+        events_src += list(src_h5file["basin_events"].keys())
+        events_src = sorted(set(events_src))
+
+    # logs
+    if include_logs and "logs" in src_h5file:
+        dst_h5file.require_group("logs")
+        for l_key in src_h5file["logs"]:
+            h5ds_copy(src_loc=src_h5file["logs"],
+                      src_name=l_key,
+                      dst_loc=dst_h5file["logs"],
+                      dst_name=meta_prefix + l_key,
+                      recursive=False)
+
+    # tables
+    if include_tables and "tables" in src_h5file:
+        dst_h5file.require_group("tables")
+        for tkey in src_h5file["tables"]:
+            # There appears to be a problem with h5copy in some rare
+            # situations, so we do not use h5copy, but read and write
+            # the table data directly.
+            # https://github.com/HDFGroup/hdf5/issues/3214
+            # The following caused a Segmentation fault:
+            # h5ds_copy(src_loc=src_h5file["tables"],
+            #           src_name=tkey,
+            #           dst_loc=dst_h5file["tables"],
+            #           dst_name=meta_prefix + tkey,
+            #           recursive=False)
+            copy_table = dst_h5file["tables"].create_dataset(
+                name=tkey,
+                data=src_h5file["tables"][tkey][:],
+                fletcher32=True,
+                **hdf5plugin.Zstd(clevel=5))
+            copy_table.attrs.update(src_h5file["tables"][tkey].attrs)
+
+    # events
+    if isinstance(features, list):
+        feature_iter = features
+    elif features == "all":
+        feature_iter = events_src
+    elif features == "scalar":
+        feature_iter = [feat for feat in events_src
+                        if feature_exists(feat, scalar_only=True)]
+    elif features == "none":
+        feature_iter = []
+    else:
+        raise ValueError(f"`features` must be either a list of feature names "
+                         f"or one of 'all', 'scalar' or 'none', got "
+                         f"'{features}'")
+
+    # Additional check for basin features.
+    bn_regexp = re.compile("^basinmap[0-9]*$")  # future-proof regexp
+    src_basin_feats = [f for f in events_src if bn_regexp.match(f)]
+    if include_basins:
+        # Make sure all 'basinmap?' features are included in the output file.
+        for feat in src_basin_feats:
+            if feat not in feature_iter:
+                feature_iter.append(feat)
+    else:
+        # We do not need the basinmap features, because basins are
+        # stripped from the output file.
+        for feat in src_basin_feats:
+            if feat in feature_iter:
+                feature_iter.remove(feat)
+
+    # copy basin definitions
+    if include_basins and "basins" in src_h5file:
+        basin_definition_copy(src_h5file=src_h5file,
+                              dst_h5file=dst_h5file,
+                              features_iter=feature_iter)
+
+    if feature_iter:
+        dst_h5file.require_group("events")
+        for feat in feature_iter:
+            if not feature_exists(feat):
+                continue
+            elif feat in src_h5file["events"]:
+                # Skip all defective features. These are features that
+                # are known to be invalid (e.g. ancillary features that
+                # were computed falsely) and must be recomputed by dclab.
+                if feat in DEFECTIVE_FEATURES:
+                    defective = DEFECTIVE_FEATURES[feat](src_h5file)
+                    if defective:
+                        continue
+
+                dst = h5ds_copy(src_loc=src_h5file["events"],
+                                src_name=feat,
+                                dst_loc=dst_h5file["events"],
+                                recursive=True)
+                if scalar_feature_exists(feat):
+                    # complement min/max values for all scalar features
+                    for ufunc, attr in [(np.nanmin, "min"),
+                                        (np.nanmax, "max"),
+                                        (np.nanmean, "mean"),
+                                        ]:
+                        if attr not in dst.attrs:
+                            dst.attrs[attr] = ufunc(dst)
+
+            elif (include_basins
+                  and "basin_events" in src_h5file
+                  and feat in src_h5file["basin_events"]):
+                # Also copy internal basins which should have been defined
+                # in the "basin_events" group.
+                if feat in src_h5file["basin_events"]:
+                    h5ds_copy(src_loc=src_h5file["basin_events"],
+                              src_name=feat,
+                              dst_loc=dst_h5file.require_group("basin_events"),
+                              dst_name=feat
+                              )
+
+
+def basin_definition_copy(src_h5file, dst_h5file, features_iter):
+    """Copy basin definitions `src_h5file["basins"]` to the new file
+
+    Normally, we would just use :func:`h5ds_copy` to copy basins from
+    one dataset to another. However, if we are e.g. only copying scalar
+    features, and there are non-scalar features in the internal basin,
+    then we must rewrite the basin definition of the internal basin.
+
+    The `features_iter` list of features defines which features are
+    relevant for the internal basin.
+    """
+    dst_h5file.require_group("basins")
+    # Load the basin information
+    basin_dicts = RTDC_HDF5.basin_get_dicts_from_h5file(src_h5file)
+    for bn in basin_dicts:
+        b_key = bn["key"]
+
+        if b_key in dst_h5file["basins"]:
+            # already stored therein
+            continue
+
+        # sanity check
+        if b_key not in src_h5file["basins"]:
+            raise ValueError(
+                f"Failed to parse basin information correctly. Source file "
+                f"{src_h5file} does not contain basin {b_key} which I got "
+                f"from `RTDC_HDF5.basin_get_dicts_from_h5file`.")
+
+        if bn["type"] == "internal":
+            # Make sure we define the internal features selected
+            feat_used = [f for f in bn["features"] if f in features_iter]
+            if len(feat_used) == 0:
+                # We don't have any internal features, don't write anything
+                continue
+            elif feat_used != bn["features"]:
+                bn["features"] = feat_used
+                rewrite = True
+            else:
+                rewrite = False
+        else:
+            # We do not have an internal basin, just copy everything
+            rewrite = False
+
+        if rewrite:
+            # Convert edited `bn` to JSON and write feature data
+            b_lines = json.dumps(bn, indent=2).split("\n")
+            key = hashobj(b_lines)
+            if key not in dst_h5file["basins"]:
+                with RTDCWriter(dst_h5file) as hw:
+                    hw.write_text(dst_h5file["basins"], key, b_lines)
+        else:
+            # copy only
+            h5ds_copy(src_loc=src_h5file["basins"],
+                      src_name=b_key,
+                      dst_loc=dst_h5file["basins"],
+                      dst_name=b_key,
+                      recursive=False)
+
+
+def h5ds_copy(src_loc, src_name, dst_loc, dst_name=None,
+              ensure_compression=True, recursive=True):
+    """Copy an HDF5 Dataset from one group to another
+
+    Parameters
+    ----------
+    src_loc: h5py.H5Group
+        The source location
+    src_name: str
+        Name of the dataset in `src_loc`
+    dst_loc: h5py.H5Group
+        The destination location
+    dst_name: str
+        The name of the destination dataset, defaults to `src_name`
+    ensure_compression: bool
+        Whether to make sure that the data are compressed.
+        If disabled, then all data from the source will be
+        just copied and not compressed.
+    recursive: bool
+        Whether to recurse into HDF5 Groups (this is required e.g.
+        for copying the "trace" feature)
+
+    Returns
+    -------
+    dst: h5py.Dataset
+        The dataset `dst_loc[dst_name]`
+
+    Raises
+    ------
+    ValueError:
+        If the named source is not a h5py.Dataset
+    """
+    compression_kwargs = hdf5plugin.Zstd(clevel=5)
+    dst_name = dst_name or src_name
+    src = src_loc[src_name]
+    if isinstance(src, h5py.Dataset):
+        if ensure_compression and not is_properly_compressed(src):
+            # Chunk size larger than dataset size is not allowed
+            # in h5py's `make_new_dset`.
+            if src.shape[0] == 0:
+                # Ignore empty datasets (This sometimes happens with logs).
+                return
+            elif src.chunks and src.chunks[0] > src.shape[0]:
+                # The chunks in the input file are larger than the dataset
+                # shape. So we set the chunks to the shape. Here, we only
+                # check for the first axis (event count for feature data),
+                # because if the chunks vary in any other dimension then
+                # there is something fundamentally wrong with the input
+                # dataset (which we don't want to endorse, and where there
+                # could potentially be a lot of data put into ram).
+                chunks = list(src.chunks)
+                chunks[0] = src.shape[0]
+                chunks = tuple(chunks)
+            else:
+                # original chunk size is fine
+                chunks = src.chunks
+            # Variable length strings, compression, and fletcher32 are not
+            # a good combination. If we encounter any logs, then we have
+            # to write them with fixed-length strings.
+            # https://forum.hdfgroup.org/t/fletcher32-filter-on-variable-
+            # length-string-datasets-not-suitable-for-filters/9038/4
+            if src.dtype.kind == "O":
+                # We are looking at logs with variable length strings.
+                max_length = max([len(ii) for ii in src] + [100])
+                dtype = f"S{max_length}"
+                convert_to_s_fixed = True
+            else:
+                dtype = src.dtype
+                convert_to_s_fixed = False
+
+            # Manually create a compressed version of the dataset.
+            dst = dst_loc.create_dataset(name=dst_name,
+                                         shape=src.shape,
+                                         dtype=dtype,
+                                         chunks=chunks,
+                                         fletcher32=True,
+                                         **compression_kwargs
+                                         )
+            if convert_to_s_fixed:
+                # We are looking at old variable-length log strings.
+                dst[:] = src[:].astype(dtype)
+            elif chunks is None:
+                dst[:] = src[:]
+            else:
+                for chunk in src.iter_chunks():
+                    dst[chunk] = src[chunk]
+            # Also write all the attributes
+            dst.attrs.update(src.attrs)
+        else:
+            # Copy the Dataset to the destination as-is.
+            h5py.h5o.copy(src_loc=src_loc.id,
+                          src_name=src_name.encode(),
+                          dst_loc=dst_loc.id,
+                          dst_name=dst_name.encode(),
+                          )
+    elif recursive and isinstance(src, h5py.Group):
+        dst_rec = dst_loc.require_group(dst_name)
+        for key in src:
+            h5ds_copy(src_loc=src,
+                      src_name=key,
+                      dst_loc=dst_rec,
+                      ensure_compression=ensure_compression,
+                      recursive=recursive)
+    else:
+        raise ValueError(f"The object {src_name} in {src.file} is not "
+                         f"a dataset!")
+    return dst_loc[dst_name]
+
+
+def is_properly_compressed(h5obj):
+    """Check whether an HDF5 object is properly compressed
+
+    The compression check only returns True if the input file was
+    compressed with the Zstandard compression using compression
+    level 5 or higher.
+    """
+    # Since version 0.43.0, we use Zstandard compression
+    # which does not show up in the `compression`
+    # attribute of `obj`.
+    create_plist = h5obj.id.get_create_plist()
+    filter_args = create_plist.get_filter_by_id(32015)
+    if filter_args is not None and filter_args[1][0] >= 5:
+        properly_compressed = True
+    else:
+        properly_compressed = False
+    return properly_compressed
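
For orientation, the following is a minimal usage sketch of the rtdc_copy function introduced in dclab/rtdc_dataset/copier.py above. It is not part of the package diff: the file names "input.rtdc" and "output.rtdc" are placeholders, and the keyword arguments simply follow the signature shown in the diff.

import h5py

from dclab.rtdc_dataset.copier import rtdc_copy

# Copy only the scalar features (plus logs, tables, and basin
# definitions) from a source .rtdc file into a new, Zstandard-compressed
# .rtdc file. An .rtdc file is an HDF5 file, so h5py can open it directly.
with h5py.File("input.rtdc", "r") as src, \
        h5py.File("output.rtdc", "w") as dst:
    rtdc_copy(src_h5file=src,
              dst_h5file=dst,
              features="scalar",
              include_basins=True,
              include_logs=True,
              include_tables=True)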