dclab 0.67.0__cp314-cp314t-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dclab might be problematic.

Files changed (142)
  1. dclab/__init__.py +41 -0
  2. dclab/_version.py +34 -0
  3. dclab/cached.py +97 -0
  4. dclab/cli/__init__.py +10 -0
  5. dclab/cli/common.py +237 -0
  6. dclab/cli/task_compress.py +126 -0
  7. dclab/cli/task_condense.py +223 -0
  8. dclab/cli/task_join.py +229 -0
  9. dclab/cli/task_repack.py +98 -0
  10. dclab/cli/task_split.py +154 -0
  11. dclab/cli/task_tdms2rtdc.py +186 -0
  12. dclab/cli/task_verify_dataset.py +75 -0
  13. dclab/definitions/__init__.py +79 -0
  14. dclab/definitions/feat_const.py +202 -0
  15. dclab/definitions/feat_logic.py +182 -0
  16. dclab/definitions/meta_const.py +252 -0
  17. dclab/definitions/meta_logic.py +111 -0
  18. dclab/definitions/meta_parse.py +94 -0
  19. dclab/downsampling.cpython-314t-darwin.so +0 -0
  20. dclab/downsampling.pyx +230 -0
  21. dclab/external/__init__.py +4 -0
  22. dclab/external/packaging/LICENSE +3 -0
  23. dclab/external/packaging/LICENSE.APACHE +177 -0
  24. dclab/external/packaging/LICENSE.BSD +23 -0
  25. dclab/external/packaging/__init__.py +6 -0
  26. dclab/external/packaging/_structures.py +61 -0
  27. dclab/external/packaging/version.py +505 -0
  28. dclab/external/skimage/LICENSE +28 -0
  29. dclab/external/skimage/__init__.py +2 -0
  30. dclab/external/skimage/_find_contours.py +216 -0
  31. dclab/external/skimage/_find_contours_cy.cpython-314t-darwin.so +0 -0
  32. dclab/external/skimage/_find_contours_cy.pyx +188 -0
  33. dclab/external/skimage/_pnpoly.cpython-314t-darwin.so +0 -0
  34. dclab/external/skimage/_pnpoly.pyx +99 -0
  35. dclab/external/skimage/_shared/__init__.py +1 -0
  36. dclab/external/skimage/_shared/geometry.cpython-314t-darwin.so +0 -0
  37. dclab/external/skimage/_shared/geometry.pxd +6 -0
  38. dclab/external/skimage/_shared/geometry.pyx +55 -0
  39. dclab/external/skimage/measure.py +7 -0
  40. dclab/external/skimage/pnpoly.py +53 -0
  41. dclab/external/statsmodels/LICENSE +35 -0
  42. dclab/external/statsmodels/__init__.py +6 -0
  43. dclab/external/statsmodels/nonparametric/__init__.py +1 -0
  44. dclab/external/statsmodels/nonparametric/_kernel_base.py +203 -0
  45. dclab/external/statsmodels/nonparametric/kernel_density.py +165 -0
  46. dclab/external/statsmodels/nonparametric/kernels.py +36 -0
  47. dclab/features/__init__.py +9 -0
  48. dclab/features/bright.py +81 -0
  49. dclab/features/bright_bc.py +93 -0
  50. dclab/features/bright_perc.py +63 -0
  51. dclab/features/contour.py +161 -0
  52. dclab/features/emodulus/__init__.py +339 -0
  53. dclab/features/emodulus/load.py +252 -0
  54. dclab/features/emodulus/lut_HE-2D-FEM-22.txt +16432 -0
  55. dclab/features/emodulus/lut_HE-3D-FEM-22.txt +1276 -0
  56. dclab/features/emodulus/lut_LE-2D-FEM-19.txt +13082 -0
  57. dclab/features/emodulus/pxcorr.py +135 -0
  58. dclab/features/emodulus/scale_linear.py +247 -0
  59. dclab/features/emodulus/viscosity.py +260 -0
  60. dclab/features/fl_crosstalk.py +95 -0
  61. dclab/features/inert_ratio.py +377 -0
  62. dclab/features/volume.py +242 -0
  63. dclab/http_utils.py +322 -0
  64. dclab/isoelastics/__init__.py +468 -0
  65. dclab/isoelastics/iso_HE-2D-FEM-22-area_um-deform.txt +2440 -0
  66. dclab/isoelastics/iso_HE-2D-FEM-22-volume-deform.txt +2635 -0
  67. dclab/isoelastics/iso_HE-3D-FEM-22-area_um-deform.txt +1930 -0
  68. dclab/isoelastics/iso_HE-3D-FEM-22-volume-deform.txt +2221 -0
  69. dclab/isoelastics/iso_LE-2D-FEM-19-area_um-deform.txt +2151 -0
  70. dclab/isoelastics/iso_LE-2D-FEM-19-volume-deform.txt +2250 -0
  71. dclab/isoelastics/iso_LE-2D-ana-18-area_um-deform.txt +1266 -0
  72. dclab/kde/__init__.py +1 -0
  73. dclab/kde/base.py +459 -0
  74. dclab/kde/contours.py +222 -0
  75. dclab/kde/methods.py +313 -0
  76. dclab/kde_contours.py +10 -0
  77. dclab/kde_methods.py +11 -0
  78. dclab/lme4/__init__.py +5 -0
  79. dclab/lme4/lme4_template.R +94 -0
  80. dclab/lme4/rsetup.py +204 -0
  81. dclab/lme4/wrapr.py +386 -0
  82. dclab/polygon_filter.py +398 -0
  83. dclab/rtdc_dataset/__init__.py +15 -0
  84. dclab/rtdc_dataset/check.py +902 -0
  85. dclab/rtdc_dataset/config.py +533 -0
  86. dclab/rtdc_dataset/copier.py +353 -0
  87. dclab/rtdc_dataset/core.py +896 -0
  88. dclab/rtdc_dataset/export.py +867 -0
  89. dclab/rtdc_dataset/feat_anc_core/__init__.py +24 -0
  90. dclab/rtdc_dataset/feat_anc_core/af_basic.py +75 -0
  91. dclab/rtdc_dataset/feat_anc_core/af_emodulus.py +160 -0
  92. dclab/rtdc_dataset/feat_anc_core/af_fl_max_ctc.py +133 -0
  93. dclab/rtdc_dataset/feat_anc_core/af_image_contour.py +113 -0
  94. dclab/rtdc_dataset/feat_anc_core/af_ml_class.py +102 -0
  95. dclab/rtdc_dataset/feat_anc_core/ancillary_feature.py +320 -0
  96. dclab/rtdc_dataset/feat_anc_ml/__init__.py +32 -0
  97. dclab/rtdc_dataset/feat_anc_plugin/__init__.py +3 -0
  98. dclab/rtdc_dataset/feat_anc_plugin/plugin_feature.py +329 -0
  99. dclab/rtdc_dataset/feat_basin.py +762 -0
  100. dclab/rtdc_dataset/feat_temp.py +102 -0
  101. dclab/rtdc_dataset/filter.py +263 -0
  102. dclab/rtdc_dataset/fmt_dcor/__init__.py +7 -0
  103. dclab/rtdc_dataset/fmt_dcor/access_token.py +52 -0
  104. dclab/rtdc_dataset/fmt_dcor/api.py +173 -0
  105. dclab/rtdc_dataset/fmt_dcor/base.py +299 -0
  106. dclab/rtdc_dataset/fmt_dcor/basin.py +73 -0
  107. dclab/rtdc_dataset/fmt_dcor/logs.py +26 -0
  108. dclab/rtdc_dataset/fmt_dcor/tables.py +66 -0
  109. dclab/rtdc_dataset/fmt_dict.py +103 -0
  110. dclab/rtdc_dataset/fmt_hdf5/__init__.py +6 -0
  111. dclab/rtdc_dataset/fmt_hdf5/base.py +192 -0
  112. dclab/rtdc_dataset/fmt_hdf5/basin.py +30 -0
  113. dclab/rtdc_dataset/fmt_hdf5/events.py +276 -0
  114. dclab/rtdc_dataset/fmt_hdf5/feat_defect.py +164 -0
  115. dclab/rtdc_dataset/fmt_hdf5/logs.py +33 -0
  116. dclab/rtdc_dataset/fmt_hdf5/tables.py +60 -0
  117. dclab/rtdc_dataset/fmt_hierarchy/__init__.py +11 -0
  118. dclab/rtdc_dataset/fmt_hierarchy/base.py +278 -0
  119. dclab/rtdc_dataset/fmt_hierarchy/events.py +146 -0
  120. dclab/rtdc_dataset/fmt_hierarchy/hfilter.py +140 -0
  121. dclab/rtdc_dataset/fmt_hierarchy/mapper.py +134 -0
  122. dclab/rtdc_dataset/fmt_http.py +102 -0
  123. dclab/rtdc_dataset/fmt_s3.py +354 -0
  124. dclab/rtdc_dataset/fmt_tdms/__init__.py +476 -0
  125. dclab/rtdc_dataset/fmt_tdms/event_contour.py +264 -0
  126. dclab/rtdc_dataset/fmt_tdms/event_image.py +220 -0
  127. dclab/rtdc_dataset/fmt_tdms/event_mask.py +62 -0
  128. dclab/rtdc_dataset/fmt_tdms/event_trace.py +146 -0
  129. dclab/rtdc_dataset/fmt_tdms/exc.py +37 -0
  130. dclab/rtdc_dataset/fmt_tdms/naming.py +151 -0
  131. dclab/rtdc_dataset/load.py +77 -0
  132. dclab/rtdc_dataset/meta_table.py +25 -0
  133. dclab/rtdc_dataset/writer.py +1019 -0
  134. dclab/statistics.py +226 -0
  135. dclab/util.py +176 -0
  136. dclab/warn.py +15 -0
  137. dclab-0.67.0.dist-info/METADATA +153 -0
  138. dclab-0.67.0.dist-info/RECORD +142 -0
  139. dclab-0.67.0.dist-info/WHEEL +6 -0
  140. dclab-0.67.0.dist-info/entry_points.txt +8 -0
  141. dclab-0.67.0.dist-info/licenses/LICENSE +283 -0
  142. dclab-0.67.0.dist-info/top_level.txt +1 -0
dclab/rtdc_dataset/writer.py
@@ -0,0 +1,1019 @@
+from __future__ import annotations
+
+from collections.abc import Mapping
+import copy
+import json
+import os
+import pathlib
+from typing import Dict, List, Literal, Tuple
+import warnings
+
+import h5py
+import hdf5plugin
+import numpy as np
+
+from .. import definitions as dfn
+from ..util import hashobj
+from .._version import version
+
+from .feat_anc_plugin import PlugInFeature
+from .meta_table import MetaTable
+
+#: DEPRECATED (use `CHUNK_SIZE_BYTES` instead)
+CHUNK_SIZE = 100
+
+#: Chunk size in bytes for storing HDF5 datasets
+CHUNK_SIZE_BYTES = 1024**2  # 1 MiB
+
+#: features that should be written to the output file as uint32 values
+FEATURES_UINT32 = [
+    "fl1_max",
+    "fl1_npeaks",
+    "fl2_max",
+    "fl2_npeaks",
+    "fl3_max",
+    "fl3_npeaks",
+    "index",
+    "ml_class",
+    "nevents",
+]
+
+#: features that should be written to the output file as uint64 values
+FEATURES_UINT64 = [
+    "frame",
+]
+
+
+class RTDCWriter:
+    def __init__(self,
+                 path_or_h5file: str | pathlib.Path | h5py.File,
+                 mode: Literal['append', 'replace', 'reset'] = "append",
+                 compression_kwargs: Dict | Mapping = None,
+                 compression: str = "deprecated"):
+        """RT-DC data writer class
+
+        Parameters
+        ----------
+        path_or_h5file: str or pathlib.Path or h5py.Group
+            Path to an HDF5 file or an HDF5 file opened in write mode
+        mode: str
+            Defines how the data are stored:
+
+            - "append": append new feature data to existing h5py Datasets
+            - "replace": replace existing h5py Datasets with new features
+              (used for ancillary feature storage)
+            - "reset": do not keep any previous data
+        compression_kwargs: dict-like
+            Dictionary with the keys "compression" and "compression_opts"
+            which are passed to :func:`h5py.H5File.create_dataset`. The
+            default is Zstandard compression with the compression
+            level 5 `hdf5plugin.Zstd(clevel=5)`. To disable compression, use
+            `{"compression": None}`.
+        compression: str or None
+            Compression method used for data storage;
+            one of [None, "lzf", "gzip", "szip"].
+
+            .. deprecated:: 0.43.0
+                Use `compression_kwargs` instead.
+        """
+        if mode not in ["append", "replace", "reset"]:
+            raise ValueError(f"Invalid mode '{mode}'!")
+        if compression != "deprecated":
+            warnings.warn("The `compression` kwarg is deprecated in favor of "
+                          "`compression_kwargs`!",
+                          DeprecationWarning)
+            if compression_kwargs is not None:
+                raise ValueError("You may not specify `compression` and "
+                                 "`compression_kwargs` at the same time!")
+            # be backwards-compatible
+            compression_kwargs = {"compression": compression}
+        if compression_kwargs is None:
+            compression_kwargs = hdf5plugin.Zstd(clevel=5)
+
+        self.mode = mode
+        self.compression_kwargs = compression_kwargs
+        if isinstance(path_or_h5file, h5py.Group):
+            self.owns_path = False
+            self.path = pathlib.Path(path_or_h5file.file.filename)
+            self.h5file = path_or_h5file
+            if mode == "reset":
+                raise ValueError("'reset' mode incompatible with h5py.Group!")
+        else:
+            self.owns_path = True
+            self.path = pathlib.Path(path_or_h5file)
+            self.h5file = h5py.File(path_or_h5file,
+                                    mode=("w" if mode == "reset" else "a"))
+        #: unfortunate necessity, as `len(h5py.Group)` can be really slow
+        self._group_sizes = {}
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, tb):
+        # close the HDF5 file
+        try:
+            self.h5file.require_group("events")
+            if len(self.h5file["events"]):
+                self.rectify_metadata()
+            self.version_brand()
+        except BaseException:
+            raise
+        finally:
+            # This is guaranteed to run if any exception is raised.
+            self.close()
+
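For orientation, a minimal usage sketch (not part of the packaged file; the output name out.rtdc and the metadata values are invented). It relies on the context-manager protocol above and spells out the default Zstandard compression described in the constructor docstring:

    import hdf5plugin
    import numpy as np
    from dclab import RTDCWriter

    # "reset" discards any existing file content; Zstandard level 5 is the
    # default compression and is only written out here for illustration.
    with RTDCWriter("out.rtdc", mode="reset",
                    compression_kwargs=hdf5plugin.Zstd(clevel=5)) as hw:
        hw.store_metadata({"experiment": {"sample": "demo", "run index": 1}})
        hw.store_feature("deform", np.random.uniform(0, 0.2, size=100))
    # on exit, rectify_metadata() and version_brand() run automatically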
+    @staticmethod
+    def get_best_nd_chunks(item_shape, item_dtype=np.float64):
+        """Return best chunks for HDF5 datasets
+
+        Chunking has performance implications. It’s recommended to keep the
+        total size of dataset chunks between 10 KiB and 1 MiB. This number
+        defines the maximum chunk size as well as half the maximum cache
+        size for each dataset.
+        """
+        # Note that `np.prod(()) == 1`
+        event_size = np.prod(item_shape) * np.dtype(item_dtype).itemsize
+
+        chunk_size = CHUNK_SIZE_BYTES / event_size
+        # Set minimum chunk size to 10 so that we can have at least some
+        # compression performance.
+        chunk_size_int = max(10, int(np.floor(chunk_size)))
+        return tuple([chunk_size_int] + list(item_shape))
+
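A quick sanity check of the chunking arithmetic above (the shapes are illustrative, not taken from the diff): with a 1 MiB budget, a 90x250 uint8 image occupies 22500 bytes per event and yields 46 events per chunk, while a scalar float64 feature yields 131072 events per chunk.

    import numpy as np
    from dclab import RTDCWriter

    # 1048576 / (90 * 250 * 1) = 46.6..., floored to 46
    print(RTDCWriter.get_best_nd_chunks((90, 250), np.uint8))  # (46, 90, 250)
    # 1048576 / 8 = 131072
    print(RTDCWriter.get_best_nd_chunks((), np.float64))       # (131072,)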
+    def close(self):
+        """Close the underlying HDF5 file if a path was given during init"""
+        if self.owns_path:
+            self.h5file.close()
+
+    def rectify_metadata(self):
+        """Autocomplete the metadata of the RTDC-measurement
+
+        The following configuration keys are updated:
+
+        - experiment:event count
+        - fluorescence:samples per event
+        - imaging: roi size x (if image or mask is given)
+        - imaging: roi size y (if image or mask is given)
+
+        The following configuration keys are added if not present:
+
+        - fluorescence:channel count
+        """
+        # set event count
+        feats = sorted(self.h5file.get("events", {}).keys())
+        if feats:
+            self.h5file.attrs["experiment:event count"] = len(
+                self.h5file["events"][feats[0]])
+        else:
+            raise ValueError(f"No features in '{self.path}'!")
+
+        # ignore empty features in the checks further below
+        for feat in feats[:]:  # iterate over a copy of the list
+            obj = self.h5file["events"][feat]
+            if ((isinstance(obj, h5py.Dataset) and obj.shape[0] == 0)  # ds
+                    or len(obj) == 0):  # groups
+                feats.remove(feat)
+
+        # set samples per event
+        if "trace" in feats:
+            traces = list(self.h5file["events"]["trace"].keys())
+            trsize = self.h5file["events"]["trace"][traces[0]].shape[1]
+            self.h5file.attrs["fluorescence:samples per event"] = trsize
+
+        # set channel count
+        chcount = sum(
+            ["fl1_max" in feats, "fl2_max" in feats, "fl3_max" in feats])
+        if chcount:
+            if "fluorescence:channel count" not in self.h5file.attrs:
+                self.h5file.attrs["fluorescence:channel count"] = chcount
+
+        # set roi size x/y
+        if "image" in feats:
+            shape = self.h5file["events"]["image"][0].shape
+        elif "mask" in feats:
+            shape = self.h5file["events"]["mask"][0].shape
+        else:
+            shape = None
+        if shape is not None:
+            # update shape
+            self.h5file.attrs["imaging:roi size x"] = shape[1]
+            self.h5file.attrs["imaging:roi size y"] = shape[0]
+
+    def store_basin(self,
+                    basin_name: str,
+                    basin_type: Literal['file', 'internal', 'remote'],
+                    basin_format: str,
+                    basin_locs: List[str | pathlib.Path],
+                    basin_descr: str | None = None,
+                    basin_feats: List[str] = None,
+                    basin_map: np.ndarray | Tuple[str, np.ndarray] = None,
+                    basin_id: str = None,
+                    internal_data: Dict | h5py.Group = None,
+                    verify: bool = True,
+                    perishable: bool = False,
+                    ):
+        """Write basin information
+
+        Parameters
+        ----------
+        basin_name: str
+            basin name; Names do not have to be unique.
+        basin_type: str
+            basin type (file or remote); Files are paths accessible by the
+            operating system (including e.g. network shares) whereas
+            remote locations normally require an active internet connection.
+        basin_format: str
+            The basin format must match the ``format`` property of an
+            :class:`.RTDCBase` subclass (e.g. "hdf5" or "dcor")
+        basin_locs: list
+            location of the basin as a string or (optionally)
+            a ``pathlib.Path``
+        basin_descr: str
+            optional string describing the basin
+        basin_feats: list of str
+            list of features this basin provides; You may use this to
+            restrict access to features for a specific basin.
+        basin_map: np.ndarray or tuple of (str, np.ndarray)
+            If this is an integer numpy array, it defines the mapping
+            of event indices from the basin dataset to the referring dataset
+            (the dataset being written to disk). Normally, the basinmap
+            feature used for storing the mapping information is inferred
+            from the currently defined basinmap features. However, if you
+            are incepting basins, then this might not be sufficient, and you
+            have to specify explicitly which basinmap feature to use. In such
+            a case, you may specify a tuple `(feature_name, mapping_array)`
+            where `feature_name` is the explicit mapping name, e.g.
+            `"basinmap3"`.
+        basin_id: str
+            Identifier of the basin. This is the string returned by
+            :meth:`.RTDCBase.get_measurement_identifier`. This is
+            a unique string that identifies the data within a basin.
+            If not specified and `verify=True`, this value is automatically
+            taken from the basin file.
+        internal_data: dict or instance of h5py.Group
+            A dictionary or an `h5py.Group` containing the basin data.
+            The data are copied to the "basin_events" group, if
+            `internal_data` is not an `h5py.Group` in the current HDF5 file.
+            This must be specified when storing internal basins, and it
+            must not be specified for any other basin type.
+        verify: bool
+            Whether to verify the basin before storing it; You might have
+            to set this to False if you would like to write a basin that is
+            e.g. temporarily not available
+        perishable: bool
+            Whether the basin is perishable. If this is True, then a
+            warning will be issued, because perishable basins may become
+            inaccessible later (e.g. a time-based URL for private S3 data).
+
+        Returns
+        -------
+        basin_hash: str
+            hash of the basin which serves as the name of the HDF5 dataset
+            stored in the output file
+
+        .. versionadded:: 0.58.0
+        """
+        if perishable:
+            warnings.warn(f"Storing perishable basin {basin_name}")
+        if basin_type == "internal":
+            if internal_data is None:
+                raise ValueError(
+                    "When writing an internal basin, you must specify "
+                    "`internal_data` which is either a dictionary of numpy "
+                    "arrays or an `h5py.Group` containing the relevant "
+                    "datasets.")
+            if (isinstance(internal_data, dict)
+                    or (isinstance(internal_data, h5py.Group)
+                        and internal_data.file != self.h5file)):
+                # The data are not yet stored in this HDF5 file
+                for feat in basin_feats:
+                    igroup = self.h5file.require_group("basin_events")
+                    if feat in igroup:
+                        raise ValueError(f"The feature '{feat}' already "
+                                         f"exists in the 'basin_events' group")
+                    self.write_ndarray(group=igroup,
+                                       name=feat,
+                                       data=internal_data[feat])
+                # just override it with the default
+                basin_locs = ["basin_events"]
+            elif verify:
+                # Verify the existence of the data inside this HDF5 file
+                if basin_locs != ["basin_events"]:
+                    warnings.warn("You specified an uncommon location for "
+                                  f"your internal basins: {basin_locs}. "
+                                  f"Please use 'basin_events' instead.")
+                for feat in basin_feats:
+                    if feat not in self.h5file[basin_locs[0]]:
+                        raise ValueError(f"Could not find feature '{feat}' in "
+                                         f"the group [{basin_locs[0]}]")
+
+        # Expand optional tuple for basin_map
+        if isinstance(basin_map, (list, tuple)) and len(basin_map) == 2:
+            basin_map_name, basin_map = basin_map
+        else:
+            basin_map_name = None
+
+        if verify and basin_type in ["file", "remote"]:
+            # We have to import this here to avoid circular imports
+            from .load import new_dataset
+            # Make sure the basin can be opened by dclab, verify its ID
+            ref_id = self.h5file.attrs.get("experiment:run identifier")
+            for loc in basin_locs:
+                with new_dataset(loc) as ds:
+                    # We can open the file, which is great.
+                    # Compare the IDs.
+                    bn_id = ds.get_measurement_identifier()
+                    # Check whether `basin_id` matches the actual basin
+                    if basin_id:
+                        if basin_id != bn_id:
+                            raise ValueError(
+                                f"Measurement identifier mismatch for "
+                                f"{loc}: got {bn_id}, expected {basin_id=})!")
+                    else:
+                        # If `basin_id` was not specified, set it here for
+                        # user convenience.
+                        basin_id = bn_id or None
+                    # Check whether the referrer ID matches the basin ID.
+                    if ref_id:
+                        if not (bn_id == ref_id
+                                or (basin_map is not None
+                                    and ref_id.startswith(bn_id))):
+                            raise ValueError(
+                                f"Measurement identifier mismatch between "
+                                f"{self.path} ({ref_id}) and {loc} ({bn_id})!")
+        if basin_feats:
+            for feat in basin_feats:
+                if not dfn.feature_exists(feat):
+                    raise ValueError(f"Invalid feature: '{feat}'")
+        if basin_map is not None:
+            if (not isinstance(basin_map, np.ndarray)
+                    or basin_map.dtype != np.uint64):
+                raise ValueError(
+                    "The array specified in `basin_map` argument must be "
+                    "a numpy array with the dtype `np.uint64`!")
+
+        # determine the basinmap to use
+        if basin_map is not None:
+            self.h5file.require_group("events")
+            if basin_map_name is None:
+                # We have to determine the basin_map_name to use for this
+                # mapped basin.
+                for ii in range(10):  # basinmap0 to basinmap9
+                    bm_cand = f"basinmap{ii}"
+                    if bm_cand in self.h5file["events"]:
+                        # There is a basin mapping defined in the file. Check
+                        # whether it is identical to ours.
+                        if np.all(self.h5file["events"][bm_cand] == basin_map):
+                            # Great, we are done here.
+                            basin_map_name = bm_cand
+                            break
+                        else:
+                            # This mapping belongs to a different basin,
+                            # try the next mapping.
+                            continue
+                    else:
+                        # The mapping is not defined in the dataset, and we may
+                        # write it to a new feature.
+                        basin_map_name = bm_cand
+                        self.store_feature(feat=basin_map_name, data=basin_map)
+                        break
+                else:
+                    raise ValueError(
+                        "You have exhausted the usage of mapped basins for "
+                        "the current dataset. Please revise your analysis "
+                        "pipeline.")
+            else:
+                if basin_map_name not in self.h5file["events"]:
+                    # Write the explicit basin mapping into the file.
+                    self.store_feature(feat=basin_map_name, data=basin_map)
+                elif not np.all(
+                        self.h5file["events"][basin_map_name] == basin_map):
+                    # This is a sanity check that we have to perform.
+                    raise ValueError(
+                        f"The basin mapping feature {basin_map_name} you "
+                        f"specified explicitly already exists in "
+                        f"{self.h5file} and they do not match. I assume "
+                        f"you are trying to explicitly write to a basinmap "
+                        f"that is already used elsewhere.")
+        else:
+            # Classic, simple case
+            basin_map_name = "same"
+
+        b_data = {
+            "description": basin_descr,
+            "format": basin_format,
+            "name": basin_name,
+            "type": basin_type,
+            "features": None if basin_feats is None else sorted(basin_feats),
+            "mapping": basin_map_name,
+            "perishable": perishable,
+            "identifier": basin_id,
+        }
+        if basin_type == "file":
+            flocs = []
+            for pp in basin_locs:
+                pp = pathlib.Path(pp)
+                if verify:
+                    flocs.append(str(pp.resolve()))
+                    # Also store the relative path for user convenience.
+                    # Don't use pathlib.Path.relative_to, because that
+                    # only has `walk_up` since Python 3.12.
+                    # Also, just look in subdirectories which simplifies
+                    # path resolution.
+                    this_parent = str(self.path.parent) + os.sep
+                    path_parent = str(pp.parent) + os.sep
+                    if path_parent.startswith(this_parent):
+                        flocs.append(str(pp).replace(this_parent, "", 1))
+                else:
+                    # We already did (or did not upon user request) verify
+                    # the path. Just pass it on to the list.
+                    flocs.append(str(pp))
+            b_data["paths"] = flocs
+        elif basin_type == "internal":
+            b_data["paths"] = basin_locs
+        elif basin_type == "remote":
+            b_data["urls"] = [str(p) for p in basin_locs]
+        else:
+            raise ValueError(f"Unknown basin type '{basin_type}'")
+
+        b_lines = json.dumps(b_data, indent=2, sort_keys=True).split("\n")
+        basins = self.h5file.require_group("basins")
+        key = hashobj(b_lines)
+        if key not in basins:
+            self.write_text(basins, key, b_lines)
+        return key
+
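A hedged sketch of how a mapped file basin could be stored (file names, feature values, and the 50-event subset are invented; verify=False is used because the referenced file may not exist in this sketch):

    import numpy as np
    from dclab import RTDCWriter

    with RTDCWriter("subset.rtdc", mode="reset") as hw:
        hw.store_metadata({"experiment": {"sample": "demo", "run index": 1}})
        hw.store_feature("deform", np.zeros(50))
        # The uint64 array maps the 50 events written here to event
        # indices in the basin file "original.rtdc".
        key = hw.store_basin(basin_name="raw data",
                             basin_type="file",
                             basin_format="hdf5",
                             basin_locs=["original.rtdc"],
                             basin_descr="link to the full measurement",
                             basin_map=np.arange(50, dtype=np.uint64),
                             verify=False)
        print(key)  # hash naming the entry in the "basins" group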
+    def store_feature(self, feat, data, shape=None):
+        """Write feature data
+
+        Parameters
+        ----------
+        feat: str
+            feature name
+        data: np.ndarray or list or dict
+            feature data
+        shape: tuple of int
+            For non-scalar features, this is the shape of the
+            feature for one event (e.g. `(90, 250)` for an "image").
+            Usually, you do not have to specify this value, but you
+            do need it in case of plugin features that don't have
+            the "feature shape" set or in case of temporary features.
+            If you don't specify it, then the shape is guessed based
+            on the data you provide and a UserWarning will be issued.
+        """
+        if not dfn.feature_exists(feat):
+            raise ValueError(f"Undefined feature '{feat}'!")
+
+        events = self.h5file.require_group("events")
+
+        # replace data?
+        if feat in events and self.mode == "replace":
+            if feat == "trace":
+                for tr_name in data.keys():
+                    if tr_name in events[feat]:
+                        del events[feat][tr_name]
+            else:
+                del events[feat]
+
+        if feat in FEATURES_UINT32:
+            dtype = np.uint32
+        elif feat in FEATURES_UINT64:
+            dtype = np.uint64
+        else:
+            dtype = None
+
+        if feat == "index":
+            # By design, the index must be a simple enumeration.
+            # We enforce that by not trusting the user. If you need
+            # a different index, please take a look at the index_online
+            # feature.
+            nev = len(data)
+            if "index" in events:
+                nev0 = len(events["index"])
+            else:
+                nev0 = 0
+            self.write_ndarray(group=events,
+                               name="index",
+                               data=np.arange(nev0 + 1, nev0 + nev + 1),
+                               dtype=dtype)
+        elif dfn.scalar_feature_exists(feat):
+            self.write_ndarray(group=events,
+                               name=feat,
+                               data=np.atleast_1d(data),
+                               dtype=dtype)
+        elif feat == "contour":
+            self.write_ragged(group=events, name=feat, data=data)
+        elif feat in ["image", "image_bg", "mask", "qpi_oah", "qpi_oah_bg"]:
+            self.write_image_grayscale(group=events,
+                                       name=feat,
+                                       data=data,
+                                       is_boolean=(feat == "mask"))
+        elif feat in ["qpi_amp", "qpi_pha"]:
+            self.write_image_float32(group=events,
+                                     name=feat,
+                                     data=data)
+        elif feat == "trace":
+            for tr_name in data.keys():
+                # verify trace names
+                if tr_name not in dfn.FLUOR_TRACES:
+                    raise ValueError(f"Unknown trace key: '{tr_name}'!")
+                # write trace
+                self.write_ndarray(group=events.require_group("trace"),
+                                   name=tr_name,
+                                   data=np.atleast_2d(data[tr_name]),
+                                   dtype=dtype
+                                   )
+        else:
+            if not shape:
+                # OK, so we are dealing with a plugin feature or a temporary
+                # feature here. Now, we don't know the exact shape of that
+                # feature, but we give the user the option to advertise
+                # the shape of the feature in the plugin.
+                # First, try to obtain the shape from the PluginFeature
+                # (if that exists).
+                for pf in PlugInFeature.get_instances(feat):
+                    if isinstance(pf, PlugInFeature):
+                        shape = pf.plugin_feature_info.get("feature shape")
+                        if shape is not None:
+                            break  # This is good.
+                else:
+                    # Temporary features will have to live with this warning.
+                    warnings.warn(
+                        "There is no information about the shape of the "
+                        + f"feature '{feat}'. I am going out on a limb "
+                        + "for you and assume that you are storing "
+                        + "multiple events at a time. If this works, "
+                        + f"you could put the shape `{data[0].shape}` "
+                        + 'in the `info["feature shapes"]` key of '
+                        + "your plugin feature.")
+                    shape = data.shape[1:]
+            if shape == data.shape:
+                data = data.reshape(1, *shape)
+            elif shape == data.shape[1:]:
+                pass
+            else:
+                raise ValueError(f"Bad shape for {feat}! Expected {shape}, "
+                                 + f"but got {data.shape[1:]}!")
+            self.write_ndarray(group=events, name=feat, data=data, dtype=dtype)
+
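A usage sketch for the dispatch logic above (feature values and sizes are invented); scalar features, integer features, images and fluorescence traces each take a different branch:

    import numpy as np
    from dclab import RTDCWriter

    rng = np.random.default_rng(42)
    with RTDCWriter("events.rtdc", mode="reset") as hw:
        hw.store_metadata({"experiment": {"sample": "demo", "run index": 1}})
        # scalar feature (stored as float64)
        hw.store_feature("area_um", rng.uniform(20, 200, size=10))
        # "frame" is listed in FEATURES_UINT64
        hw.store_feature("frame", np.arange(10))
        # non-scalar feature: ten 80x320 grayscale images
        hw.store_feature("image", rng.integers(0, 255, (10, 80, 320)))
        # fluorescence traces are passed as a dict of named channels
        hw.store_feature("trace", {"fl1_raw": rng.integers(0, 4096, (10, 177))})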
+    def store_log(self, name, lines):
+        """Write log data
+
+        Parameters
+        ----------
+        name: str
+            name of the log entry
+        lines: list of str or str
+            the text lines of the log
+        """
+        log_group = self.h5file.require_group("logs")
+        self.write_text(group=log_group, name=name, lines=lines)
+
+    def store_metadata(self, meta):
+        """Store RT-DC metadata
+
+        Parameters
+        ----------
+        meta: dict-like
+            The metadata to store. Each key depicts a metadata section
+            name whose data is given as a dictionary, e.g.::
+
+                meta = {"imaging": {"exposure time": 20,
+                                    "flash duration": 2,
+                                    ...
+                                    },
+                        "setup": {"channel width": 20,
+                                  "chip region": "channel",
+                                  ...
+                                  },
+                        ...
+                        }
+
+            Only section key names and key values therein registered
+            in dclab are allowed and are converted to the pre-defined
+            dtype. Only sections from the
+            :const:`dclab.definitions.CFG_METADATA` dictionary are
+            stored. If you have custom metadata, you can use the "user"
+            section.
+        """
+        meta = copy.deepcopy(meta)
+        # Ignore/remove tdms section
+        meta.pop("fmt_tdms", None)
+        # Check meta data
+        for sec in meta:
+            if sec == "user":
+                # user-defined metadata are always written.
+                # Any errors (incompatibilities with HDF5 attributes)
+                # are the user's responsibility
+                continue
+            elif sec not in dfn.CFG_METADATA:
+                # only allow writing of meta data that are not editable
+                # by the user (not dclab.dfn.CFG_ANALYSIS)
+                raise ValueError(
+                    f"Meta data section not defined in dclab: {sec}")
+            for ck in meta[sec]:
+                if not dfn.config_key_exists(sec, ck):
+                    raise ValueError(
+                        f"Meta key not defined in dclab: {sec}:{ck}")
+
+        # update version
+        old_version = meta.get("setup", {}).get("software version", "")
+        new_version = self.version_brand(
+            old_version=old_version or None,
+            write_attribute=False
+        )
+        meta.setdefault("setup", {})["software version"] = new_version
+
+        # Write metadata
+        for sec in meta:
+            for ck in meta[sec]:
+                idk = f"{sec}:{ck}"
+                value = meta[sec][ck]
+                if isinstance(value, bytes):
+                    # We never store byte attribute values.
+                    # In this case, `convfunc` should be `str` or `lcstr` or
+                    # somesuch. But we don't test that, because no other
+                    # datatype competes with str for bytes.
+                    value = value.decode("utf-8")
+                if sec == "user":
+                    # store user-defined metadata as-is
+                    self.h5file.attrs[idk] = value
+                else:
+                    # pipe the metadata through the hard-coded converter
+                    # functions
+                    convfunc = dfn.get_config_value_func(sec, ck)
+                    self.h5file.attrs[idk] = convfunc(value)
+
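A sketch of storing metadata and a log (section and key names follow dclab.definitions.CFG_METADATA; the values themselves are invented):

    from dclab import RTDCWriter

    meta = {
        "experiment": {"sample": "demo", "run index": 1},
        "setup": {"channel width": 20, "chip region": "channel"},
        "user": {"operator": "jane doe"},   # free-form "user" section
    }
    with RTDCWriter("annotated.rtdc") as hw:   # default mode is "append"
        hw.store_metadata(meta)
        hw.store_log("processing", ["first line", "second line"])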
+    def store_table(self, name, cmp_array, h5_attrs=None):
+        """Store a compound array table
+
+        Tables are semi-metadata. They may contain information collected
+        during a measurement (but with a lower temporal resolution) or
+        other tabular data relevant for a dataset. Tables have named
+        columns. Therefore, they can be represented as a numpy recarray,
+        and they should be stored as such in an HDF5 file (compound dataset).
+
+        Parameters
+        ----------
+        name: str
+            Name of the table
+        cmp_array: np.recarray, h5py.Dataset, np.ndarray, or dict
+            If a np.recarray or h5py.Dataset are provided, then they
+            are written as-is to the file. If a dictionary is provided,
+            then the dictionary is converted into a numpy recarray.
+            If a numpy array is provided, then the array is written
+            as a raw table (no column names) to the file.
+        h5_attrs: dict, optional
+            Attributes to store alongside the corresponding HDF5 dataset
+        """
+        if h5_attrs is None:
+            h5_attrs = {}
+
+        # Convert MetaTable to numpy data
+        if isinstance(cmp_array, MetaTable):
+            h5_attrs.update(cmp_array.meta)
+            cmp_array = cmp_array.__array__()
+
+        # Handle individual cases
+        if isinstance(cmp_array, np.recarray):
+            # A table is a compound array (np.recarray). If we are here,
+            # this means that the user passed an instance of np.recarray.
+            pass
+        elif isinstance(cmp_array, h5py.Dataset):
+            # An instance of h5py.Dataset (which we trust to be a proper
+            # compound dataset at this point). No additional steps needed.
+            h5_attrs.update(cmp_array.attrs)
+            pass
+        elif isinstance(cmp_array, np.ndarray):
+            # A numpy array was passed. This usually means we have something
+            # that we can look at, so we add image tags.
+            h5_attrs['CLASS'] = np.bytes_('IMAGE')
+            h5_attrs['IMAGE_VERSION'] = np.bytes_('1.2')
+            h5_attrs['IMAGE_SUBCLASS'] = np.bytes_('IMAGE_GRAYSCALE')
+            pass
+        elif isinstance(cmp_array, dict):
+            # The user passed a dict which we now have to convert to a
+            # compound dataset. We do this for the user's convenience.
+            # The user should not need to wade through these steps:
+            columns = list(cmp_array.keys())
+            # Everything should be floats in a table.
+            ds_dt = np.dtype({'names': columns,
+                              'formats': [np.float64] * len(columns)})
+            # We trust the user to provide a dictionary with one-dimensional
+            # lists or arrays of the same length.
+            tabsize = len(cmp_array[columns[0]])
+            tab_data = np.zeros((tabsize, len(columns)))
+            for ii, tab in enumerate(columns):
+                tab_data[:, ii] = cmp_array[tab]
+            # Now create a new compound array (discarding the old dict)
+            cmp_array = np.rec.array(tab_data, dtype=ds_dt)
+        else:
+            raise NotImplementedError(
+                f"Cannot convert {type(cmp_array)} to table!")
+
+        # data
+        group = self.h5file.require_group("tables")
+        tab = group.create_dataset(
+            name,
+            data=cmp_array,
+            fletcher32=True,
+            **self.compression_kwargs)
+
+        # metadata
+        if h5_attrs:
+            tab.attrs.update(h5_attrs)
+
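A sketch of the dict branch above (column names and values are invented); the dictionary is converted to a float64 compound array before writing:

    import numpy as np
    from dclab import RTDCWriter

    monitoring = {
        "time": np.linspace(0, 60, 7),
        "temperature": [22.1, 22.3, 22.4, 22.4, 22.5, 22.6, 22.6],
    }
    with RTDCWriter("tables.rtdc") as hw:
        # the extra HDF5 attribute is an arbitrary example value
        hw.store_table("monitoring", monitoring,
                       h5_attrs={"COLUMNS": "time,temperature"})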
+    def version_brand(self, old_version=None, write_attribute=True):
+        """Perform version branding
+
+        Append a " | dclab X.Y.Z" to the "setup:software version"
+        attribute.
+
+        Parameters
+        ----------
+        old_version: str or None
+            By default, the version string is taken from the HDF5 file.
+            If set to a string, then this version is used instead.
+        write_attribute: bool
+            If True (default), write the version string to the
+            "setup:software version" attribute
+        """
+        if old_version is None:
+            old_version = self.h5file.attrs.get("setup:software version", "")
+        if isinstance(old_version, bytes):
+            old_version = old_version.decode("utf-8")
+        version_chain = [vv.strip() for vv in old_version.split("|")]
+        version_chain = [vv for vv in version_chain if vv]
+        cur_version = "dclab {}".format(version)
+
+        if version_chain:
+            if version_chain[-1] != cur_version:
+                version_chain.append(cur_version)
+        else:
+            version_chain = [cur_version]
+        new_version = " | ".join(version_chain)
+        if write_attribute:
+            self.h5file.attrs["setup:software version"] = new_version
+        else:
+            return new_version
+
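A small sketch of the branding behavior (the pre-existing version string is invented):

    from dclab import RTDCWriter

    with RTDCWriter("branded.rtdc") as hw:
        hw.h5file.attrs["setup:software version"] = "Some Acquisition 2.0"
        hw.version_brand()
        print(hw.h5file.attrs["setup:software version"])
        # e.g. "Some Acquisition 2.0 | dclab 0.67.0"; calling it again
        # does not append the dclab entry a second time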
+    def write_image_float32(self, group, name, data):
+        """Write 32bit floating point image array
+
+        This function wraps :func:`RTDCWriter.write_ndarray`
+        and adds image attributes to the HDF5 file so HDFView
+        can display the images properly.
+
+        Parameters
+        ----------
+        group: h5py.Group
+            parent group
+        name: str
+            name of the dataset
+        data: np.ndarray or list of np.ndarray
+            image data
+        """
+        if isinstance(data, (list, tuple)):
+            # images may be in lists
+            data = np.atleast_2d(data)
+
+        if len(data.shape) == 2:
+            # put single event in 3D array
+            data = data[np.newaxis]
+
+        dset = self.write_ndarray(group=group, name=name, data=data,
+                                  dtype=np.float32)
+
+        # Create and set image attributes:
+        # HDFView recognizes this as a series of images.
+        # Use np.bytes_ as per
+        # https://docs.h5py.org/en/stable/strings.html#compatibility
+        dset.attrs.create('CLASS', np.bytes_('IMAGE'))
+        dset.attrs.create('IMAGE_VERSION', np.bytes_('1.2'))
+        dset.attrs.create('IMAGE_SUBCLASS', np.bytes_('IMAGE_GRAYSCALE'))
+
+    def write_image_grayscale(self, group, name, data, is_boolean):
+        """Write grayscale image data to an HDF5 dataset
+
+        This function wraps :func:`RTDCWriter.write_ndarray`
+        and adds image attributes to the HDF5 file so HDFView
+        can display the images properly.
+
+        Parameters
+        ----------
+        group: h5py.Group
+            parent group
+        name: str
+            name of the dataset
+        data: np.ndarray or list of np.ndarray
+            image data
+        is_boolean: bool
+            whether the input data is of boolean nature
+            (e.g. mask data) - if so, data are converted to uint8
+        """
+        if isinstance(data, (list, tuple)):
+            # images may be in lists
+            data = np.atleast_2d(data)
+
+        if len(data.shape) == 2:
+            # put single event in 3D array
+            data = data.reshape(1, data.shape[0], data.shape[1])
+
+        if is_boolean:
+            # convert binary (mask) data to uint8
+            if data.__class__.__name__ == "H5MaskEvent":
+                # (if we use `isinstance`, we get circular imports)
+                # Be smart and directly write back the original data
+                # (otherwise we would convert to bool and back to uint8).
+                data = data.h5dataset
+            elif data.dtype == bool:
+                # Convert binary input mask data to uint8 with max range
+                data = np.asarray(data, dtype=np.uint8) * 255
+
+        dset = self.write_ndarray(group=group, name=name, data=data,
+                                  dtype=np.uint8)
+
+        # Create and set image attributes:
+        # HDFView recognizes this as a series of images.
+        # Use np.bytes_ as per
+        # https://docs.h5py.org/en/stable/strings.html#compatibility
+        dset.attrs.create('CLASS', np.bytes_('IMAGE'))
+        dset.attrs.create('IMAGE_VERSION', np.bytes_('1.2'))
+        dset.attrs.create('IMAGE_SUBCLASS', np.bytes_('IMAGE_GRAYSCALE'))
+
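A sketch of writing boolean mask images via the public store_feature path, which dispatches to write_image_grayscale with is_boolean=True (shapes and values invented):

    import numpy as np
    from dclab import RTDCWriter

    masks = np.zeros((5, 80, 320), dtype=bool)
    masks[:, 30:50, 100:220] = True   # a rectangular dummy mask per event
    with RTDCWriter("masks.rtdc", mode="reset") as hw:
        hw.store_feature("mask", masks)
        mask_ds = hw.h5file["events"]["mask"]
        # boolean input was rescaled to uint8 (0/255) and tagged for HDFView
        print(mask_ds.dtype, mask_ds.attrs["CLASS"])   # uint8 b'IMAGE'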
+    def write_ndarray(self, group, name, data, dtype=None):
+        """Write n-dimensional array data to an HDF5 dataset
+
+        It is assumed that the shape of the array data is correct,
+        i.e. that the shape of `data` is
+        (number_events, feat_shape_1, ..., feat_shape_n).
+
+        Parameters
+        ----------
+        group: h5py.Group
+            parent group
+        name: str
+            name of the dataset
+        data: np.ndarray
+            data
+        dtype: dtype
+            the dtype to use for storing the data
+            (defaults to `data.dtype`)
+        """
+        if len(data) == 0:
+            raise ValueError(f"Empty data object for '{name}'")
+
+        if name not in group:
+            chunks = self.get_best_nd_chunks(item_shape=data.shape[1:],
+                                             item_dtype=data.dtype)
+            maxshape = tuple([None] + list(data.shape)[1:])
+            dset = group.create_dataset(
+                name,
+                shape=data.shape,
+                dtype=dtype or data.dtype,
+                maxshape=maxshape,
+                chunks=chunks,
+                fletcher32=True,
+                **self.compression_kwargs)
+            offset = 0
+        else:
+            dset = group[name]
+            offset = dset.shape[0]
+            dset.resize(offset + data.shape[0], axis=0)
+        if len(data.shape) == 1:
+            # store scalar data in one go
+            dset[offset:] = data
+            # store ufunc data for min/max
+            for uname, ufunc in [("min", np.nanmin),
+                                 ("max", np.nanmax)]:
+                val_a = dset.attrs.get(uname, None)
+                if val_a is not None:
+                    val_b = ufunc(data)
+                    val = ufunc([val_a, val_b])
+                else:
+                    val = ufunc(dset)
+                dset.attrs[uname] = val
+            # store ufunc data for mean (weighted with size)
+            mean_a = dset.attrs.get("mean", None)
+            if mean_a is not None:
+                num_a = offset
+                mean_b = np.nanmean(data)
+                num_b = data.size
+                mean = (mean_a * num_a + mean_b * num_b) / (num_a + num_b)
+            else:
+                mean = np.nanmean(dset)
+            dset.attrs["mean"] = mean
+        else:
+            chunk_size = dset.chunks[0]
+            # populate higher-dimensional data in chunks
+            # (reduces file size, memory usage, and saves time)
+            num_chunks = len(data) // chunk_size
+            for ii in range(num_chunks):
+                start = ii * chunk_size
+                stop = start + chunk_size
+                dset[offset+start:offset+stop] = data[start:stop]
+            # write remainder (if applicable)
+            num_remain = len(data) % chunk_size
+            if num_remain:
+                start_e = num_chunks * chunk_size
+                stop_e = start_e + num_remain
+                dset[offset+start_e:offset+stop_e] = data[start_e:stop_e]
+        return dset
+
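A sketch of the append behavior of write_ndarray (values invented): the dataset is created resizable on the first call and extended on the second, while the "min", "max" and "mean" attributes are kept up to date for scalar data:

    import numpy as np
    from dclab import RTDCWriter

    with RTDCWriter("append.rtdc", mode="reset") as hw:
        events = hw.h5file.require_group("events")
        hw.write_ndarray(events, "deform", np.full(100, 0.1))
        hw.write_ndarray(events, "deform", np.full(50, 0.4))
        dset = events["deform"]
        print(dset.shape)                      # (150,)
        print(dset.attrs["min"], dset.attrs["max"], dset.attrs["mean"])
        # 0.1 0.4 0.2 (up to floating-point rounding)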
+    def write_ragged(self, group, name, data):
+        """Write ragged data (i.e. list of arrays of different lengths)
+
+        Ragged array data (e.g. contour data) are stored in
+        a separate group and each entry becomes an HDF5 dataset.
+
+        Parameters
+        ----------
+        group: h5py.Group
+            parent group
+        name: str
+            name of the group holding the per-event datasets
+        data: list of np.ndarray or np.ndarray
+            the data in a list
+        """
+        if isinstance(data, np.ndarray) and len(data.shape) == 2:
+            # place single event in list
+            data = [data]
+        grp = group.require_group(name)
+        # The following case is just a workaround for the very slow
+        # `len(grp)` which makes things horrible if you are storing
+        # contour data one-by-one. The only downside of this is that
+        # we have to keep track of the length of the group. But I
+        # think that is OK, since everything is very private here.
+        # - Paul (2021-10-18)
+        if grp not in self._group_sizes:
+            self._group_sizes[grp] = len(grp)
+        curid = self._group_sizes[grp]
+        for ii, cc in enumerate(data):
+            grp.create_dataset("{}".format(curid + ii),
+                               data=cc,
+                               fletcher32=True,
+                               chunks=cc.shape,
+                               **self.compression_kwargs)
+            self._group_sizes[grp] += 1
+
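A sketch of ragged (contour) storage (coordinates invented): each event's contour has a different number of points, so every event becomes its own small dataset inside the "contour" group:

    import numpy as np
    from dclab import RTDCWriter

    contours = [np.array([[10, 10], [10, 20], [20, 20]]),
                np.array([[5, 5], [5, 15], [15, 15], [15, 5]])]
    with RTDCWriter("contours.rtdc", mode="reset") as hw:
        hw.store_feature("contour", contours)   # dispatches to write_ragged
        print(sorted(hw.h5file["events"]["contour"].keys()))   # ['0', '1']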
+    def write_text(self, group, name, lines):
+        """Write text to an HDF5 dataset
+
+        Text data are written as a fixed-length string dataset.
+
+        Parameters
+        ----------
+        group: h5py.Group
+            parent group
+        name: str
+            name of the dataset containing the text
+        lines: list of str or str
+            the text, line by line
+        """
+        # replace text?
+        if name in group and self.mode == "replace":
+            del group[name]
+
+        # handle strings
+        if isinstance(lines, (str, bytes)):
+            lines = [lines]
+
+        lnum = len(lines)
+        # Determine the maximum line length and use fixed-length strings,
+        # because compression and fletcher32 filters won't work with
+        # variable length strings.
+        # https://github.com/h5py/h5py/issues/1948
+        # 100 is the recommended maximum and the default, because if
+        # `mode` is e.g. "append", then this line may not be the longest.
+        max_length = 100
+        lines_as_bytes = []
+        for line in lines:
+            # convert lines to bytes
+            if not isinstance(line, bytes):
+                lbytes = line.encode("UTF-8")
+            else:
+                lbytes = line
+            max_length = max(max_length, len(lbytes))
+            lines_as_bytes.append(lbytes)
+
+        if name not in group:
+            # Create the dataset
+            txt_dset = group.create_dataset(
+                name,
+                shape=(lnum,),
+                dtype=f"S{max_length}",
+                maxshape=(None,),
+                chunks=True,
+                fletcher32=True,
+                **self.compression_kwargs)
+            line_offset = 0
+        else:
+            # TODO: test whether fixed length is long enough!
+            # Resize the dataset
+            txt_dset = group[name]
+            line_offset = txt_dset.shape[0]
+            txt_dset.resize(line_offset + lnum, axis=0)
+
+        # Write the text data line-by-line
+        for ii, lbytes in enumerate(lines_as_bytes):
+            txt_dset[line_offset + ii] = lbytes