cfdb 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cfdb/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """CF conventions multi-dimensional array database on top of Booklet"""
+ from cfdb.main import open_dataset, open_edataset
+ from cfdb.utils import compute_scale_and_offset
+ from rechunkit import guess_chunk_shape
+
+ __version__ = '0.1.0'
cfdb/combine.py ADDED
@@ -0,0 +1,501 @@
+ """
+ Created on 2022-09-30.
+
+ @author: Mike K
+ """
+ import h5py
+ import io
+ import os
+ import numpy as np
+ import xarray as xr
+ # from time import time
+ # import numcodecs
+ import hdf5plugin
+ from typing import Union, List
+ import pathlib
+ import copy
+
+ from . import utils
+ # import utils
+
+ ##############################################
+ ### Parameters
+
+
+
+ ##############################################
+ ### Functions
+
+
+ ###################################################
+ ### Class
+
+
+ class Combine(object):
+     """
+     Class to load and combine one or more HDF5 data files (or xarray datasets) with optional filters. The combined data can then be exported to an HDF5 file, file object, or xr.Dataset.
+
+     Parameters
+     ----------
+     data : str, pathlib.Path, io.BytesIO, xr.Dataset, or list of str, pathlib.Path, io.BytesIO, bytes, or xr.Dataset
+         The input data must be paths to HDF5 file(s), io.BytesIO objects, bytes objects, or xr.Datasets (or some combination of those).
+     group : str or None
+         The group or group path within the hdf5 file(s) to the datasets.
+
+     Returns
+     -------
+     Combine instance
+     """
+     def __init__(self, data: Union[List[Union[str, pathlib.Path, io.BytesIO, xr.Dataset]], Union[str, pathlib.Path, io.BytesIO, xr.Dataset]], group=None):
+         """
+         Class to load and combine one or more HDF5 data files (or xarray datasets) with optional filters. The combined data can then be exported to an HDF5 file, file object, or xr.Dataset.
+
+         Parameters
+         ----------
+         data : str, pathlib.Path, io.BytesIO, xr.Dataset, or list of str, pathlib.Path, io.BytesIO, bytes, or xr.Dataset
+             The input data must be paths to HDF5 file(s), io.BytesIO objects, bytes objects, or xr.Datasets (or some combination of those).
+         group : str or None
+             The group or group path within the hdf5 file(s) to the datasets.
+
+         Returns
+         -------
+         Combine instance
+         """
+         ## Read paths input into the appropriate file objects
+         if isinstance(data, list):
+             data1 = data
+         else:
+             data1 = [data]
+
+         ## Get encodings
+         encodings = utils.get_encodings(data1, group)
+
+         ## Get attrs
+         attrs, global_attrs = utils.get_attrs(data1, group)
+
+         ## Get the extended coords
+         coords_dict = utils.extend_coords(data1, encodings, group)
+
+         ## Index the variables across the files
+         vars_dict, is_regular_dict = utils.index_variables(data1, coords_dict, encodings, group)
+
+         ## Assign attributes
+         self._files = data1
+         self._group = group
+         self._coords_dict = coords_dict
+         self._data_vars_dict = vars_dict
+         self._attrs = attrs
+         self._global_attrs = global_attrs
+         self._encodings = encodings
+         self._is_regular_dict = is_regular_dict
+
+
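A minimal usage sketch (not part of the original module; the file and variable names below are hypothetical): building a Combine from two HDF5 files and inspecting what was indexed.

from cfdb.combine import Combine

# Any mix of paths, io.BytesIO objects, bytes, or xr.Datasets can be passed.
comb = Combine(['results_2020.h5', 'results_2021.h5'])

# Summaries of the indexed coordinates and data variables (shapes, dims, encodings).
print(comb.coords())
print(comb.data_vars())
print(comb)  # repr renders an empty xr.Dataset mirroring the combined output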
+     def _build_empty_ds(self):
+         """
+         Build an empty xr.Dataset that mirrors the combined coordinates, data variables, attributes, and encodings.
+         """
+         if self._data_vars_dict:
+
+             ## Get all of the coords associated with the existing data vars
+             all_data_coords = set()
+             for ds in self._data_vars_dict:
+                 for dim in self._data_vars_dict[ds]['dims']:
+                     all_data_coords.add(dim)
+
+             ## Create empty xr.Dataset
+             data_vars = {}
+             for k, v in self._data_vars_dict.items():
+                 if 'datetime' in v['dtype_decoded']:
+                     data_vars[k] = (v['dims'], np.empty(v['shape'], dtype=np.dtype('datetime64[ns]')))
+                 else:
+                     data_vars[k] = (v['dims'], np.empty(v['shape'], dtype=v['dtype_decoded']))
+
+             coords = {}
+             for k, v in self._coords_dict.items():
+                 if k in all_data_coords:
+                     coords[k] = utils.decode_data(v, **self._encodings[k])
+
+             xr_ds = xr.Dataset(data_vars=data_vars, coords=coords, attrs=self._global_attrs)
+
+             for ds_name, attr in self._attrs.items():
+                 if ds_name in xr_ds:
+                     xr_ds[ds_name].attrs = attr
+             for ds_name, enc in self._encodings.items():
+                 if ds_name in xr_ds:
+                     xr_ds[ds_name].encoding = enc
+
+         else:
+             xr_ds = xr.Dataset()
+
+         return xr_ds
+
+
+     def __repr__(self):
+         """
+         Represent the instance as the repr of the equivalent empty xr.Dataset.
+         """
+         xr_ds = self._build_empty_ds()
+
+         return xr_ds.__repr__()
+
+
+     def sel(self, selection: dict=None, include_coords: list=None, exclude_coords: list=None, include_data_vars: list=None, exclude_data_vars: list=None):
+         """
+         Filter the data by a selection, include, and exclude. Returns a new Combine instance. The selection parameter is very similar to xarray's .sel method.
+
+         Parameters
+         ----------
+         selection : dict
+             A dict of coordinate names to filter values. Three types of filter values are supported: slice instances (the best and preferred option), a list/np.ndarray of coordinate values, or a bool np.ndarray with the same length as the coordinate data.
+         include_coords : list
+             A list of coordinates to include in the output. Only data variables with included coordinates will be included in the output.
+         exclude_coords : list
+             A list of coordinates to exclude from the output. Only data variables whose coordinates have not been excluded will be included in the output.
+         include_data_vars : list
+             A list of data variables to include in the output. Only coordinates used by the included data variables will be kept.
+         exclude_data_vars : list
+             A list of data variables to exclude from the output. Only coordinates used by the remaining data variables will be kept.
+
+         Returns
+         -------
+         Combine instance
+         """
+         c = self.copy()
+         if selection is not None:
+             utils.filter_coords(c._coords_dict, selection, self._encodings)
+             vars_dict, is_regular_dict = utils.index_variables(self._files, c._coords_dict, c._encodings, self._group)
+
+             c._data_vars_dict = vars_dict
+             c._is_regular_dict = is_regular_dict
+
+         if include_coords is not None:
+             coords_rem_list = []
+             for k in list(c._coords_dict.keys()):
+                 if k not in include_coords:
+                     _ = c._coords_dict.pop(k)
+                     coords_rem_list.append(k)
+
+             if coords_rem_list:
+                 for k in list(c._data_vars_dict.keys()):
+                     for coord in coords_rem_list:
+                         if coord in c._data_vars_dict[k]['dims']:
+                             c._data_vars_dict.pop(k)
+                             break
+
+         if exclude_coords is not None:
+             coords_rem_list = []
+             for k in list(c._coords_dict.keys()):
+                 if k in exclude_coords:
+                     _ = c._coords_dict.pop(k)
+                     coords_rem_list.append(k)
+
+             if coords_rem_list:
+                 for k in list(c._data_vars_dict.keys()):
+                     for coord in coords_rem_list:
+                         if coord in c._data_vars_dict[k]['dims']:
+                             c._data_vars_dict.pop(k)
+                             break
+
+         if include_data_vars is not None:
+             c._data_vars_dict = {k: v for k, v in c._data_vars_dict.items() if k in include_data_vars}
+
+             include_dims = set()
+             for k, v in c._data_vars_dict.items():
+                 include_dims.update(set(v['dims']))
+
+             for k in list(c._coords_dict.keys()):
+                 if k not in include_dims:
+                     _ = c._coords_dict.pop(k)
+
+         if exclude_data_vars is not None:
+             c._data_vars_dict = {k: v for k, v in c._data_vars_dict.items() if k not in exclude_data_vars}
+
+             include_dims = set()
+             for k, v in c._data_vars_dict.items():
+                 include_dims.update(set(v['dims']))
+
+             for k in list(c._coords_dict.keys()):
+                 if k not in include_dims:
+                     _ = c._coords_dict.pop(k)
+
+         return c
+
+
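Continuing the sketch above (illustrative only; the 'time', 'lat', and 'precip' names are hypothetical and assume a datetime coordinate and a float coordinate exist in the source files), a selection combines a dict of coordinate filters with include/exclude lists:

import numpy as np

# Slice a datetime coordinate and a float coordinate, and keep one data variable.
subset = comb.sel(
    selection={
        'time': slice(np.datetime64('2021-01-01'), np.datetime64('2021-02-01')),
        'lat': slice(-45.0, -40.0),
        },
    include_data_vars=['precip'],
    )

# subset is a new, filtered Combine instance; comb is left unchanged.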
+     def copy(self):
+         """
+         Deep copy a Combine instance.
+         """
+         c = copy.deepcopy(self)
+
+         return c
+
+
+     def coords(self):
+         """
+         A summary of the coordinates.
+         """
+         coords_summ = {}
+         for k, v in self._coords_dict.items():
+             encs = copy.deepcopy(self._encodings[k])
+             coords_summ[k] = {'shape': v.shape}
+             coords_summ[k].update(encs)
+
+         return coords_summ
+
+
+     def data_vars(self):
+         """
+         A summary of the data variables.
+         """
+         vars_summ = {}
+         for k, v in self._data_vars_dict.items():
+             encs = copy.deepcopy(self._encodings[k])
+             vars_summ[k] = {k1: v1 for k1, v1 in v.items() if k1 in ['dims', 'shape']}
+             vars_summ[k].update(encs)
+
+         return vars_summ
+
+
+     def variables(self):
+         """
+         A summary of all coordinates and data variables.
+         """
+         coords_summ = self.coords()
+         vars_summ = self.data_vars()
+
+         coords_summ.update(vars_summ)
+
+         return coords_summ
+
+
+     def to_hdf5(self, output: Union[str, pathlib.Path, io.BytesIO], group=None, chunks=None, unlimited_dims=None, compression='lzf', libver='earliest'):
+         """
+         Method to output the filtered data to an HDF5 file or file object.
+
+         Parameters
+         ----------
+         output : str, pathlib.Path, or io.BytesIO
+             The output path of the new combined hdf5 file.
+         group : str or None
+             The group or group path within the hdf5 file to save the datasets.
+         chunks : dict of tuples
+             The chunks per dataset. Must be a dictionary of dataset names with tuple values of appropriate dimensions. A value of None will perform auto-chunking.
+         unlimited_dims : str, list of str, or None
+             The dimension/dimensions that should be assigned as "unlimited" in the hdf5 file.
+         compression : str or None
+             The compression used for the chunks in the hdf5 files. Must be one of gzip, lzf, zstd, lz4, or None. gzip is compatible with any hdf5 installation (not only h5py), so this should be used if interoperability across platforms is important. lzf is compatible with any h5py installation, so if only python users will need to access these files then this is a better option than gzip. zstd and lz4 require the hdf5plugin python package, but zstd is the best compression option if users have access to the hdf5plugin package. None applies no compression and is generally not recommended except in niche situations.
+         libver : str or None
+             The hdf5 library version according to h5py. This is for advanced users only. See https://docs.h5py.org/en/stable/high/file.html#version-bounding.
+
+         Returns
+         -------
+         None
+         """
+         ## Check if there's anything to save
+         if self._coords_dict:
+
+             ## Set up initial parameters
+             if isinstance(unlimited_dims, str):
+                 unlimited_dims = [unlimited_dims]
+             elif unlimited_dims is None:
+                 unlimited_dims = []
+
+             compressor = utils.get_compressor(compression)
+
+             ## Create new file
+             with h5py.File(output, 'w', libver=libver, rdcc_nbytes=3*1024*1024, track_order=True) as nf:
+
+                 if isinstance(group, str):
+                     nf1 = nf.create_group(group, track_order=True)
+                 else:
+                     nf1 = nf
+
+                 ## Add the coords as datasets
+                 # dim_id = 0
+                 for coord, arr in self._coords_dict.items():
+                     # if coord == 'time':
+                     #     break
+                     shape = arr.shape
+                     dtype = self._encodings[coord]['dtype']
+
+                     maxshape = tuple([s if coord not in unlimited_dims else None for s in shape])
+
+                     chunks1 = utils.guess_chunk(shape, maxshape, dtype)
+
+                     if isinstance(chunks, dict):
+                         if coord in chunks:
+                             chunks1 = chunks[coord]
+
+                     if dtype == 'object':
+                         coord_dtype = h5py.string_dtype()
+                     else:
+                         coord_dtype = dtype
+
+                     ds = nf1.create_dataset(coord, shape, chunks=chunks1, maxshape=maxshape, dtype=coord_dtype, track_order=True, **compressor)
+
+                     ds[:] = arr
+
+                     ds.make_scale(coord)
+                     ds.dims[0].label = coord
+
+                     # ds.attrs['_Netcdf4Dimid'] = dim_id
+                     # dim_id += 1
+                     # ds.attrs['DIMENSION_LABELS'] = coord
+
+                 ## Add the variables as datasets
+                 vars_dict = self._data_vars_dict
+
+                 for var_name in vars_dict:
+                     shape = vars_dict[var_name]['shape']
+                     dims = vars_dict[var_name]['dims']
+                     # nc_coords = np.zeros(len(dims), dtype='int32')
+                     # nc_labels = np.zeros(len(dims), dtype='object')
+                     maxshape = tuple([s if dims[i] not in unlimited_dims else None for i, s in enumerate(shape)])
+
+                     chunks1 = utils.guess_chunk(shape, maxshape, vars_dict[var_name]['dtype'])
+
+                     if isinstance(chunks, dict):
+                         if var_name in chunks:
+                             chunks1 = chunks[var_name]
+
+                     if len(shape) == 0:
+                         chunks1 = None
+                         compressor1 = {}
+                         vars_dict[var_name]['fillvalue'] = None
+                         maxshape = None
+                     else:
+                         compressor1 = compressor
+
+                     if vars_dict[var_name]['dtype'] == 'object':
+                         ds_dtype = h5py.string_dtype()
+                     else:
+                         ds_dtype = vars_dict[var_name]['dtype']
+
+                     ds = nf1.create_dataset(var_name, shape, chunks=chunks1, maxshape=maxshape, dtype=ds_dtype, fillvalue=vars_dict[var_name]['fillvalue'], track_order=True, **compressor1)
+
+                     for i, dim in enumerate(dims):
+                         ds.dims[i].attach_scale(nf1[dim])
+                         ds.dims[i].label = dim
+                         # dim_id = nf1[dim].attrs['_Netcdf4Dimid']
+                         # nc_coords[i] = dim_id
+                         # nc_labels[i] = dim
+
+                     # ds.attrs['_Netcdf4Coordinates'] = nc_coords
+                     # ds.attrs['_Netcdf4Dimid'] = 4
+                     # ds.attrs['DIMENSION_LABELS'] = nc_labels
+
+                     ds_vars = vars_dict[var_name]
+
+                     n_files = len(ds_vars['data'])
+                     mean_ds_file_size = utils.product(shape)/n_files
+
+                     # Load data by file if no chunks are assigned
+                     if ds.chunks is None:
+                         for i in ds_vars['data']:
+                             with utils.open_file(self._files[i], self._group) as file:
+                                 ds_old = file[var_name]
+
+                                 if isinstance(ds_old, xr.DataArray):
+                                     ds[()] = utils.encode_data(ds_old.values, **self._encodings[var_name])
+                                 else:
+                                     ds[()] = ds_old[()]
+                     else:
+                         # If files are big and regular fill by file
+                         if self._is_regular_dict[var_name] and (mean_ds_file_size > (3 * utils.product(ds.chunks))):
+                             utils.fill_ds_by_files(ds, self._files, ds_vars, var_name, self._group, self._encodings)
+                         # Otherwise fill by chunk
+                         else:
+                             utils.fill_ds_by_chunks(ds, self._files, ds_vars, var_name, self._group, self._encodings)
+
+                 ## Assign attrs and encodings
+                 for ds_name, attr in self._attrs.items():
+                     if ds_name in nf1:
+                         nf1[ds_name].attrs.update(attr)
+
+                 for ds_name, encs in self._encodings.items():
+                     if ds_name in nf1:
+                         for f, enc in encs.items():
+                             nf1[ds_name].attrs.update({f: enc})
+
+                 # nf1.attrs['_NCProperties'] = b'version=2,hdf5=1.12.2,h5py=3.7.0'
+                 nf1.attrs.update(self._global_attrs)
+                 # nf1.attrs.update({'unlimited_dims': ''})
+
+             if isinstance(output, io.BytesIO):
+                 output.seek(0)
+
+         else:
+             print('No data to save')
+
+
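Continuing the same sketch (the output path, chunk shape, and names are hypothetical), exporting the filtered data to an HDF5 file or an in-memory file object:

import io

# zstd gives the best compression but requires the hdf5plugin package.
subset.to_hdf5(
    'combined_results.h5',
    chunks={'precip': (100, 10, 10)},
    unlimited_dims='time',
    compression='zstd',
    )

# Writing to an io.BytesIO object works the same way.
buf = io.BytesIO()
subset.to_hdf5(buf)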
+     def to_xarray(self, **kwargs):
+         """
+         Write the combined data to an in-memory HDF5 file (io.BytesIO), which is then opened by xr.open_dataset using the h5netcdf engine.
+
+         Parameters
+         ----------
+         **kwargs
+             Any kwargs that can be passed to xr.open_dataset EXCEPT engine.
+
+         Returns
+         -------
+         xr.Dataset
+         """
+         if self._coords_dict:
+             b1 = io.BytesIO()
+
+             self.to_hdf5(b1)
+
+             xr_ds = xr.open_dataset(b1, engine='h5netcdf', **kwargs)
+         else:
+             xr_ds = xr.Dataset()
+
+         return xr_ds
+
+
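As a short illustrative follow-on to the sketch above, the same filtered data can be materialised directly as an xarray Dataset:

xr_ds = subset.to_xarray()
print(xr_ds)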
+ ## Backwards compatibility
+ H5 = Combine
+
+ ################################################
+ ### Convenience functions
+
+
+ def xr_to_hdf5(data: Union[List[xr.Dataset], xr.Dataset], output: Union[str, pathlib.Path, io.BytesIO], group=None, chunks=None, unlimited_dims=None, compression='lzf'):
+     """
+     Convenience function to take one or more xr.Datasets and output the data to an HDF5 file or file object.
+
+     Parameters
+     ----------
+     data : xr.Dataset, or list of xr.Dataset
+         The input data as xr.Datasets.
+     output : str, pathlib.Path, or io.BytesIO
+         The output path of the new combined hdf5 file.
+     group : str or None
+         The group or group path within the hdf5 file to save the datasets.
+     chunks : dict of tuples
+         The chunks per dataset. Must be a dictionary of dataset names with tuple values of appropriate dimensions. A value of None will perform auto-chunking.
+     unlimited_dims : str, list of str, or None
+         The dimension/dimensions that should be assigned as "unlimited" in the hdf5 file.
+     compression : str
+         The compression used for the chunks in the hdf5 files. Must be one of gzip, lzf, zstd, or None. gzip is compatible with any hdf5 installation (not only h5py), so this should be used if interoperability across platforms is important. lzf is compatible with any h5py installation, so if only python users will need to access these files then this is a better option than gzip. zstd requires the hdf5plugin python package, but is the best compression option if users have access to the hdf5plugin package. None applies no compression and is generally not recommended except in niche situations.
+
+     Returns
+     -------
+     None
+     """
+     Combine(data).to_hdf5(output, group, chunks, unlimited_dims, compression)
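An illustrative call (the file names are hypothetical, and zstd assumes hdf5plugin is installed) that combines two yearly xr.Datasets into one HDF5 file:

import xarray as xr
from cfdb.combine import xr_to_hdf5

ds_2020 = xr.open_dataset('results_2020.nc')
ds_2021 = xr.open_dataset('results_2021.nc')

xr_to_hdf5([ds_2020, ds_2021], 'combined_results.h5', compression='zstd')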
+
+
+ ######################################
+ ### Testing