cfdb 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cfdb/__init__.py +6 -0
- cfdb/combine.py +501 -0
- cfdb/core.py +1232 -0
- cfdb/creation.py +345 -0
- cfdb/data_models.py +189 -0
- cfdb/indexers.py +452 -0
- cfdb/main.py +857 -0
- cfdb/support_classes.py +1187 -0
- cfdb/utils.py +2079 -0
- cfdb-0.1.0.dist-info/METADATA +57 -0
- cfdb-0.1.0.dist-info/RECORD +13 -0
- cfdb-0.1.0.dist-info/WHEEL +4 -0
- cfdb-0.1.0.dist-info/licenses/LICENSE +16 -0
cfdb/main.py
ADDED
@@ -0,0 +1,857 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 7 11:25:06 2025

@author: mike
"""
import booklet
from typing import Union
import pathlib
import msgspec
import weakref
from copy import deepcopy

try:
    import h5netcdf
    import_h5netcdf = True
except ImportError:
    import_h5netcdf = False

try:
    import ebooklet
    import_ebooklet = True
except ImportError:
    import_ebooklet = False

from . import utils, indexers, data_models, creation, support_classes as sc
# import utils, indexers, data_models, creation, support_classes as sc


############################################
### Parameters




############################################
### Functions




############################################
### Classes


class DatasetBase:

    # def __bool__(self):
    #     """

    #     """
    #     return self._file.__bool__()

    def __iter__(self):
        for key in self.var_names:
            yield key

    def __len__(self):
        return len(self.var_names)

    def __contains__(self, key):
        return key in self.var_names

    # def get(self, var_name):
    #     """

    #     """
    #     if not isinstance(var_name, str):
    #         raise TypeError('var_name must be a string.')

    #     if var_name not in self:
    #         raise ValueError(f'The Variable {var_name} does not exist.')

    #     if self._sel is not None:
    #         if var_name not in self._sel:
    #             raise ValueError(f'The Variable {var_name} does not exist in view.')

    #     if var_name not in self._var_cache:
    #         var_meta = self._sys_meta.variables[var_name]
    #         if isinstance(var_meta, data_models.DataVariable):
    #             var = sc.DataVariable(var_name, self)
    #         else:
    #             var = sc.Coordinate(var_name, self)
    #         self._var_cache[var_name] = var

    #     if self._sel is None:
    #         return self._var_cache[var_name]
    #     else:
    #         return self._var_cache[var_name][self._sel[var_name]]

    #     var_meta = self._sys_meta.variables[var_name]
    #     if isinstance(var_meta, data_models.DataVariable):
    #         var = sc.DataVariable(var_name, self)
    #     else:
    #         var = sc.Coordinate(var_name, self)

    #     return var


    def __getitem__(self, key):
        return self.get(key)

    # def __setitem__(self, key, value):
    #     if isinstance(value, sc.Variable):
    #         setattr(self, key, value)
    #     else:
    #         raise TypeError('Assigned value must be a Variable or Coordinate object.')

    def __delitem__(self, key):
        if key not in self:
            raise KeyError(key)

        if not self.writable:
            raise ValueError('Dataset is not writable.')

        # Check if the object to delete is a coordinate
        # And if it is, check that no variables are attached to it
        if isinstance(self[key], sc.Coordinate):
            for var_name, var in self._sys_meta.variables.items():
                if isinstance(var, data_models.DataVariable):
                    if key in var.coords:
                        raise ValueError(f'{key} is a coordinate of {var_name}. You must delete all variables associated with a coordinate before you can delete the coordinate.')

        # Delete all chunks from file
        var = self[key]
        coord_origins = var.get_coord_origins()

        slices = indexers.index_combo_all(None, coord_origins, var.shape)
        for target_chunk, source_chunk, blt_key in indexers.slices_to_chunks_keys(slices, var.name, var.chunk_shape):
            try:
                del self._blt[blt_key]
            except KeyError:
                pass

        # Delete the attrs key
        try:
            del self._blt[sc.attrs_key.format(var_name=key)]
        except KeyError:
            pass

        # Delete in cache
        try:
            del self._var_cache[key]
        except KeyError:
            pass

        # Delete the instance in the sys meta
        del self._sys_meta.variables[key]
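
    # Usage sketch (editor's note, not part of the original module): deleting a
    # variable removes its chunks, attributes, cache entry, and metadata. A
    # coordinate can only be deleted after every data variable that uses it.
    # The dataset and variable names below are hypothetical.
    #
    #     del ds['precip']   # delete the data variable first
    #     del ds['time']     # then its coordinate can be deleted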

    # def sync(self):
    #     """

    #     """
    #     old_meta = msgspec.convert(self._blt.get_metadata(), data_models.SysMeta)
    #     if old_meta != self._meta:
    #         self._blt.set_metadata(msgspec.to_builtins(self._meta))
    #     self._blt.sync()

    def __bool__(self):
        return self.is_open


    def __repr__(self):
        """
        Return a summary of the file contents.
        """
        return utils.file_summary(self)


    def sel(self, sel: dict):
        """
        Filter the dataset variables by a selection of the coordinate positions.
        """
        ## Checks on input
        coord_names = self.coord_names
        for key in sel:
            if key not in coord_names:
                raise KeyError(f'The coordinate {key} does not exist in the dataset.')

        ## Create selections per coord
        _sel = {}
        for coord_name in coord_names:
            coord = self[coord_name]
            if coord_name in sel:
                slices = indexers.index_combo_all(sel[coord_name], coord.get_coord_origins(), coord.shape)
            else:
                slices = indexers.index_combo_all(None, coord.get_coord_origins(), coord.shape)
            _sel[coord_name] = slices

        ## Create selections for data vars
        data_var_names = self.data_var_names
        for data_var_name in data_var_names:
            data_var = self[data_var_name]
            data_var_sel = tuple(_sel[coord_name][0] for coord_name in data_var.coord_names)
            _sel[data_var_name] = data_var_sel

        ## Init DatasetView
        return DatasetView(self, _sel)
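
    # Usage sketch (editor's note, not part of the original module): positional
    # selection returns a read-only DatasetView. The coordinate name and slice
    # below are hypothetical.
    #
    #     view = ds.sel({'time': slice(0, 24)})
    #     print(view.data_var_names)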

    def sel_loc(self, sel: dict):
        """
        Filter the dataset variables by a selection of the coordinate locations.
        """
        ## Checks on input
        coord_names = self.coord_names
        for key in sel:
            if key not in coord_names:
                raise KeyError(f'The coordinate {key} does not exist in the dataset.')

        ## Create selections per coord
        _sel = {}
        for coord_name in coord_names:
            coord = self[coord_name]
            if coord_name in sel:
                slices = indexers.index_combo_all(indexers.loc_index_combo_all(sel[coord_name], (coord,)), coord.get_coord_origins(), coord.shape)
            else:
                slices = indexers.index_combo_all(None, coord.get_coord_origins(), coord.shape)
            _sel[coord_name] = slices

        ## Create selections for data vars
        data_var_names = self.data_var_names
        for data_var_name in data_var_names:
            data_var = self[data_var_name]
            data_var_sel = tuple(_sel[coord_name][0] for coord_name in data_var.coord_names)
            _sel[data_var_name] = data_var_sel

        ## Init DatasetView
        return DatasetView(self, _sel)
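
    # Usage sketch (editor's note, not part of the original module): sel_loc is
    # the label-based counterpart of sel; coordinate values are mapped to
    # positions via indexers.loc_index_combo_all. The values are hypothetical.
    #
    #     view = ds.sel_loc({'latitude': slice(-45.0, -40.0)})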

    # def to_pandas(self):
    #     """
    #     Convert the entire file into a pandas DataFrame.
    #     """
    #     if not import_pandas:
    #         raise ImportError('pandas could not be imported.')

    #     # TODO: This feels wrong...but it works...
    #     result = None
    #     for var_name in self.data_vars:
    #         if result is None:
    #             result = self[var_name].to_pandas().to_frame()
    #         else:
    #             result = result.join(self[var_name].to_pandas().to_frame(), how='outer')

    #     self.close()

    #     return result


    # def to_xarray(self, **kwargs):
    #     """
    #     Closes the file and opens it in xarray.

    #     Parameters
    #     ----------
    #     kwargs
    #         Any kwargs that can be passed to xr.open_dataset.

    #     Returns
    #     -------
    #     xr.Dataset
    #     """
    #     if not import_xarray:
    #         raise ImportError('xarray could not be imported.')

    #     filename = pathlib.Path(self.filename)

    #     if filename.is_file():
    #         self.close()
    #     else:
    #         temp_file = tempfile.NamedTemporaryFile()
    #         filename = temp_file.name
    #         self.to_file(filename)
    #         self.close()

    #     x1 = xr.open_dataset(filename, **kwargs)

    #     return x1


    def copy(self, file_path):
        """
        Copy the dataset to a new file. Returns the new dataset opened for writing.
        """
        kwargs = dict(n_buckets=self._blt._n_buckets, buffer_size=self._blt._write_buffer_size)

        new_ds = open_dataset(file_path, 'n', compression=self.compression, compression_level=self.compression_level, **kwargs)

        for coord in self.coords:
            new_coord = new_ds.create.coord.like(coord.name, coord, True)
            new_coord.attrs.update(coord.attrs.data)

        for data_var in self.data_vars:
            new_data_var = new_ds.create.data_var.like(data_var.name, data_var)
            new_data_var.attrs.update(data_var.attrs.data)
            for write_chunk, data in data_var.iter_chunks(False):
                new_data_var.set(write_chunk, data, False)

        new_ds.attrs.update(self.attrs.data)

        return new_ds
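
    # Usage sketch (editor's note, not part of the original module): copy writes
    # every coordinate, data variable, and attribute into a fresh file. The path
    # is hypothetical.
    #
    #     new_ds = ds.copy('backup.cfdb')
    #     new_ds.close()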

    def to_netcdf4(self, file_path: Union[str, pathlib.Path], compression: str='gzip', **file_kwargs):
        """
        Save a dataset to a netcdf4 file using h5netcdf.
        """
        if not import_h5netcdf:
            raise ImportError('h5netcdf must be installed to save files to netcdf4.')

        h5 = h5netcdf.File(file_path, 'w', **file_kwargs)

        # dims/coords
        for coord in self.coords:
            name = coord.name
            h5.dimensions[name] = coord.shape[0]
            coord_len = coord.shape[0]
            chunk_len = coord.chunk_shape[0]
            if chunk_len > coord_len:
                chunk_shape = (coord_len,)
            else:
                chunk_shape = (chunk_len,)

            h5_coord = h5.create_variable(name, (name,), coord.dtype_encoded, compression=compression, chunks=chunk_shape, fillvalue=coord.fillvalue)
            attrs = deepcopy(coord.attrs.data)
            dtype_decoded, dtype_encoded = utils.parse_dtype_names(coord.dtype_decoded, coord.dtype_encoded)
            if coord.step is not None:
                attrs['step'] = coord.step
            if coord.scale_factor is not None:
                attrs['scale_factor'] = coord.scale_factor
            elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
                attrs['scale_factor'] = 1
            if coord.add_offset is not None:
                attrs['add_offset'] = coord.add_offset
            elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
                attrs['add_offset'] = 0
            if coord.dtype_decoded.kind == 'M':
                units = utils.parse_cf_time_units(coord.dtype_decoded)
                calendar = "proleptic_gregorian"
                attrs['units'] = units
                attrs['calendar'] = calendar
                attrs['standard_name'] = 'time'

            attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded, '_FillValue': coord.fillvalue})
            h5_coord.attrs.update(attrs)

            for write_chunk, data in coord.iter_chunks(decoded=False):
                h5_coord[write_chunk] = data

        # Data vars
        for data_var in self.data_vars:
            name = data_var.name
            chunk_shape = []
            for s, cs in zip(data_var.shape, data_var.chunk_shape):
                if cs > s:
                    chunk_shape.append(s)
                else:
                    chunk_shape.append(cs)

            h5_data_var = h5.create_variable(name, data_var.coord_names, data_var.dtype_encoded, compression=compression, chunks=tuple(chunk_shape), fillvalue=data_var.fillvalue)
            attrs = deepcopy(data_var.attrs.data)
            dtype_decoded, dtype_encoded = utils.parse_dtype_names(data_var.dtype_decoded, data_var.dtype_encoded)
            if data_var.scale_factor is not None:
                attrs['scale_factor'] = data_var.scale_factor
            elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
                attrs['scale_factor'] = 1
            if data_var.add_offset is not None:
                attrs['add_offset'] = data_var.add_offset
            elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
                attrs['add_offset'] = 0
            if data_var.dtype_decoded.kind == 'M':
                units = utils.parse_cf_time_units(data_var.dtype_decoded)
                calendar = "proleptic_gregorian"
                attrs['units'] = units
                attrs['calendar'] = calendar
                attrs['standard_name'] = 'time'

            attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded, '_FillValue': data_var.fillvalue})
            h5_data_var.attrs.update(attrs)

            for write_chunk, data in data_var.iter_chunks(decoded=False):
                h5_data_var[write_chunk] = data

        # Add global attrs
        h5.attrs.update(self.attrs.data)

        h5.close()
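
    # Usage sketch (editor's note, not part of the original module): export only
    # needs a target path; compression defaults to gzip and requires h5netcdf.
    # The path is hypothetical.
    #
    #     ds.to_netcdf4('export.nc')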


class Dataset(DatasetBase):
    """
    A cfdb dataset stored in a single local booklet file.
    """
    def __init__(self, file_path, open_blt, create, compression, compression_level):
        """
        Compression can be either zstd, lz4, or None (though there's little reason to use None).
        """
        self._blt = open_blt
        self.writable = self._blt.writable
        self.file_path = file_path
        self.is_open = True

        if hasattr(self._blt, 'load_items'):
            self._has_load_items = True
        else:
            self._has_load_items = False

        ## Set/Get system metadata
        if create:
            # Checks
            compression = compression.lower()
            if compression not in utils.compression_options:
                raise ValueError(f'compression must be one of {utils.compression_options}.')
            if compression_level is None:
                compression_level = utils.default_compression_levels[compression]
            elif not isinstance(compression_level, int):
                raise ValueError('compression_level must be either None or an int.')

            self._sys_meta = data_models.SysMeta(object_type='Dataset', compression=data_models.Compressor(compression), compression_level=compression_level, variables={})
            self._blt.set_metadata(msgspec.to_builtins(self._sys_meta))

        else:
            self._sys_meta = msgspec.convert(self._blt.get_metadata(), data_models.SysMeta)

        self.compression = self._sys_meta.compression.value
        self.compression_level = self._sys_meta.compression_level
        self._compressor = sc.Compressor(self.compression, self.compression_level)

        self._finalizers = [weakref.finalize(self, utils.dataset_finalizer, self._blt, self._sys_meta)]

        self.attrs = sc.Attributes(self._blt, '_', self.writable, self._finalizers)

        self._var_cache = weakref.WeakValueDictionary()

        if self.writable:
            self.create = creation.Creator(self)


    def get(self, var_name):
        """
        Get a variable contained within the dataset.
        """
        if not isinstance(var_name, str):
            raise TypeError('var_name must be a string.')

        if var_name not in self:
            raise ValueError(f'The Variable {var_name} does not exist.')

        # if self._sel is not None:
        #     if var_name not in self._sel:
        #         raise ValueError(f'The Variable {var_name} does not exist in view.')

        if var_name not in self._var_cache:
            var_meta = self._sys_meta.variables[var_name]
            if isinstance(var_meta, data_models.DataVariable):
                var = sc.DataVariable(var_name, self)
            else:
                var = sc.Coordinate(var_name, self)
            self._var_cache[var_name] = var

        return self._var_cache[var_name]

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def close(self):
        """
        Close the database.
        """
        # self.sync()
        for finalizer in reversed(self._finalizers):
            finalizer()
        self.is_open = False


    @property
    def var_names(self):
        """
        Return a tuple of all the variable names (coordinate and data variables).
        """
        return tuple(self._sys_meta.variables.keys())

    @property
    def coord_names(self):
        """
        Return a tuple of all the coordinate names.
        """
        return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.CoordinateVariable))


    @property
    def data_var_names(self):
        """
        Return a tuple of all the data variable names.
        """
        return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.DataVariable))


    @property
    def coords(self):
        """
        Return a tuple of coords.
        """
        return tuple(self[coord_name] for coord_name in self.coord_names)

    @property
    def data_vars(self):
        """
        Return a tuple of data variables.
        """
        return tuple(self[var_name] for var_name in self.data_var_names)

    @property
    def variables(self):
        """
        Return a tuple of variables.
        """
        return tuple(self[var_name] for var_name in self.var_names)

    def prune(self, timestamp=None, reindex=False):
        """
        Prune deleted data from the file and return the number of removed items. If a timestamp is passed, keys/values older than the timestamp are also removed. The reindex parameter controls reindexing of the booklet file: False does no reindexing, True increases n_buckets to a preassigned value, and an int sets n_buckets explicitly. True can only be used if the default n_buckets was used at the original initialisation.
        """
        return self._blt.prune(timestamp, reindex)
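
    # Usage sketch (editor's note, not part of the original module): pruning
    # after deletions reclaims the space held by deleted chunks. The variable
    # name is hypothetical.
    #
    #     del ds['old_var']
    #     n_removed = ds.prune(reindex=True)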

    # def sync(self):
    #     """

    #     """
    #     self._blt.sync()




class DatasetView(DatasetBase):
    """
    A read-only view of a Dataset produced by the sel and sel_loc methods.
    """
    def __init__(self, dataset, sel):
        """
        Not meant to be instantiated directly; use Dataset.sel or Dataset.sel_loc.
        """
        self._dataset = dataset
        self._sel = sel
        self._blt = dataset._blt
        self._has_load_items = dataset._has_load_items
        self.writable = False
        self.file_path = dataset.file_path
        self._sys_meta = dataset._sys_meta
        self._compressor = dataset._compressor
        self.compression = dataset.compression
        self.compression_level = dataset.compression_level
        self.attrs = dataset.attrs
        self._var_cache = dataset._var_cache


    def get(self, var_name):
        """
        Get a variable contained within the dataset.
        """
        if self._sel is not None:
            if var_name not in self._sel:
                raise ValueError(f'The Variable {var_name} does not exist in view.')

        return self._dataset.get(var_name)[self._sel[var_name]]


    @property
    def is_open(self):
        return self._dataset.is_open

    @property
    def var_names(self):
        """
        Return a tuple of all the variable names (coordinate and data variables).
        """
        return tuple(self._sel.keys())

    @property
    def coord_names(self):
        """
        Return a tuple of all the coordinate names.
        """
        return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.CoordinateVariable) if k in self._sel)

    @property
    def data_var_names(self):
        """
        Return a tuple of all the data variable names.
        """
        return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.DataVariable) if k in self._sel)

    @property
    def coords(self):
        return tuple(self[coord_name][self._sel[coord_name]] for coord_name in self.coord_names if coord_name in self._sel)

    @property
    def data_vars(self):
        return tuple(self[var_name][self._sel[var_name]] for var_name in self.data_var_names if var_name in self._sel)

    @property
    def variables(self):
        return tuple(self[var_name][self._sel[var_name]] for var_name in self.var_names if var_name in self._sel)



class EDataset(Dataset):
    """
    A Dataset that is linked with a remote S3 database via ebooklet.
    """
    def changes(self):
        """
        Return a Change object of the changes that have occurred during this session.
        """
        return self._blt.changes()

    def delete_remote(self):
        """
        Completely delete the remote dataset, but keep the local dataset.
        """
        self._blt.delete_remote()

    def copy_remote(self, remote_conn: ebooklet.S3Connection):
        """
        Copy the entire remote dataset to another remote location. The new location must be empty.
        """
        self._blt.copy_remote(remote_conn)
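
    # Usage sketch (editor's note, not part of the original module): EDataset
    # adds remote operations on top of Dataset. The S3Connection parameters are
    # elided because they depend on the remote.
    #
    #     conn = ebooklet.S3Connection(...)   # connection details omitted
    #     eds.copy_remote(conn)               # new remote location must be empty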


#######################################################
### Open functions


def open_dataset(file_path: Union[str, pathlib.Path], flag: str = "r", compression: str='zstd', compression_level: int=None, **kwargs):
    """
    Open a cfdb dataset. This uses the python package booklet for managing data in a single file.

    Parameters
    ----------
    file_path: str or pathlib.Path
        It must be a path to a local file location. If you want to use a tempfile, then use the name from the NamedTemporaryFile initialized class.
    flag: str
        Flag associated with how the file is opened according to the dbm style. See below for details.
    compression: str
        The compression algorithm used for compressing all data. Must be either zstd or lz4. The option zstd has a really good combination of compression ratio and speed, while lz4 has a stronger emphasis on speed (and is lightning fast). Default is zstd.
    compression_level: int or None
        The compression level used by the compression algorithm. Setting this to None uses the default, which is 1 for both compression options.
    kwargs
        Any kwargs that can be passed to booklet.open.

    Returns
    -------
    cfdb.Dataset

    The optional *flag* argument can be:

    +---------+-------------------------------------------+
    | Value   | Meaning                                   |
    +=========+===========================================+
    | ``'r'`` | Open existing database for reading only   |
    |         | (default)                                 |
    +---------+-------------------------------------------+
    | ``'w'`` | Open existing database for reading and    |
    |         | writing                                   |
    +---------+-------------------------------------------+
    | ``'c'`` | Open database for reading and writing,    |
    |         | creating it if it doesn't exist           |
    +---------+-------------------------------------------+
    | ``'n'`` | Always create a new, empty database, open |
    |         | for reading and writing                   |
    +---------+-------------------------------------------+
    """
    if 'n_buckets' not in kwargs:
        kwargs['n_buckets'] = utils.default_n_buckets

    fp = pathlib.Path(file_path)
    fp_exists = fp.exists()
    open_blt = booklet.open(file_path, flag, key_serializer='str', **kwargs)

    if not fp_exists or flag == 'n':
        create = True
    else:
        create = False

    return Dataset(file_path, open_blt, create, compression, compression_level)
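

# Usage sketch (editor's note, not part of the original module): create a new
# dataset, then reopen it read-only. The file path is hypothetical.
#
#     with open_dataset('example.cfdb', flag='n') as ds:
#         ...   # build coordinates and data variables via ds.create
#     ds = open_dataset('example.cfdb')   # flag='r' by default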


def open_edataset(remote_conn: Union[ebooklet.S3Connection, str, dict],
                  file_path: Union[str, pathlib.Path],
                  flag: str = "r",
                  compression: str='zstd',
                  compression_level: int=1,
                  **kwargs):
    """
    Open a cfdb dataset that is linked with a remote S3 database.

    Parameters
    ----------
    remote_conn: S3Connection, str, or dict
        The object to connect to a remote. It can be an S3Connection object, an http url string, or a dict with the parameters for initializing an S3Connection object.
    file_path: str or pathlib.Path
        It must be a path to a local file location. If you want to use a tempfile, then use the name from the NamedTemporaryFile initialized class.
    flag: str
        Flag associated with how the file is opened according to the dbm style. See below for details.
    compression: str
        The compression algorithm used for compressing all data. Must be either zstd or lz4. The option zstd has a really good combination of compression ratio and speed, while lz4 has a stronger emphasis on speed (and is lightning fast). Default is zstd.
    compression_level: int or None
        The compression level used by the compression algorithm. Setting this to None uses the default, which is 1 for both compression options.
    kwargs
        Any kwargs that can be passed to ebooklet.open.

    Returns
    -------
    cfdb.EDataset

    The optional *flag* argument can be:

    +---------+-------------------------------------------+
    | Value   | Meaning                                   |
    +=========+===========================================+
    | ``'r'`` | Open existing database for reading only   |
    |         | (default)                                 |
    +---------+-------------------------------------------+
    | ``'w'`` | Open existing database for reading and    |
    |         | writing                                   |
    +---------+-------------------------------------------+
    | ``'c'`` | Open database for reading and writing,    |
    |         | creating it if it doesn't exist           |
    +---------+-------------------------------------------+
    | ``'n'`` | Always create a new, empty database, open |
    |         | for reading and writing                   |
    +---------+-------------------------------------------+
    """
    if not import_ebooklet:
        raise ImportError('ebooklet must be installed to open ebooklets.')

    if 'n_buckets' not in kwargs:
        kwargs['n_buckets'] = utils.default_n_buckets

    fp = pathlib.Path(file_path)
    fp_exists = fp.exists()
    open_blt = ebooklet.open(remote_conn, file_path, flag, **kwargs)

    if (not fp_exists or flag == 'n') and open_blt.writable:
        create = True
    else:
        create = False

    return EDataset(file_path, open_blt, create, compression, compression_level)
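

# Usage sketch (editor's note, not part of the original module): open a local
# file linked to a remote via an http url string, per the docstring above. The
# url and path are hypothetical.
#
#     eds = open_edataset('https://example-bucket.s3.amazonaws.com/db', 'local.cfdb')
#     eds.close()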