cfdb 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cfdb/__init__.py +2 -1
- cfdb/indexers.py +0 -3
- cfdb/main.py +134 -161
- cfdb/support_classes.py +52 -16
- cfdb/tools.py +427 -0
- cfdb/utils.py +25 -0
- cfdb-0.1.1.dist-info/METADATA +204 -0
- cfdb-0.1.1.dist-info/RECORD +14 -0
- cfdb-0.1.0.dist-info/METADATA +0 -57
- cfdb-0.1.0.dist-info/RECORD +0 -13
- {cfdb-0.1.0.dist-info → cfdb-0.1.1.dist-info}/WHEEL +0 -0
- {cfdb-0.1.0.dist-info → cfdb-0.1.1.dist-info}/licenses/LICENSE +0 -0
cfdb/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 """CF conventions multi-dimensional array database on top of Booklet"""
 from cfdb.main import open_dataset, open_edataset
 from cfdb.utils import compute_scale_and_offset
+from cfdb.tools import netcdf4_to_cfdb, cfdb_to_netcdf4
 from rechunkit import guess_chunk_shape

-__version__ = '0.1.0'
+__version__ = '0.1.1'
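The new top-level imports expose the netcdf4 converters added in cfdb/tools.py (below) at the package root. A minimal usage sketch, assuming the optional h5netcdf extra is installed; the file paths are hypothetical:

```python
import cfdb

# Both helpers raise ImportError if h5netcdf is not installed.
cfdb.netcdf4_to_cfdb('/path/to/source.nc', '/path/to/new.cfdb')
cfdb.cfdb_to_netcdf4('/path/to/new.cfdb', '/path/to/roundtrip.nc')
```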
cfdb/indexers.py
CHANGED
@@ -287,11 +287,8 @@ def slices_to_chunks_keys(slices, var_name, var_chunk_shape, clip_ends=True):
     """
     starts = tuple(s.start for s in slices)
     stops = tuple(s.stop for s in slices)
-    # chunk_iter1 = rechunkit.chunk_range(starts, stops, var_chunk_shape, clip_ends=False)
     chunk_iter2 = rechunkit.chunk_range(starts, stops, var_chunk_shape, clip_ends=clip_ends)
-    # for full_chunk, partial_chunk in zip(chunk_iter1, chunk_iter2):
     for partial_chunk in chunk_iter2:
-        # starts_chunk = tuple(s.start for s in full_chunk)
         starts_chunk = tuple((pc.start//cs) * cs for cs, pc in zip(var_chunk_shape, partial_chunk))
         new_key = utils.make_var_chunk_key(var_name, starts_chunk)

cfdb/main.py
CHANGED
@@ -6,7 +6,7 @@ Created on Tue Jan 7 11:25:06 2025
 @author: mike
 """
 import booklet
-from typing import Union
+from typing import Union, List
 import pathlib
 import msgspec
 import weakref
@@ -62,50 +62,10 @@ class DatasetBase:
     def __contains__(self, key):
         return key in self.var_names

-    # def get(self, var_name):
-    #     """
-
-    #     """
-    #     if not isinstance(var_name, str):
-    #         raise TypeError('var_name must be a string.')
-
-    #     if var_name not in self:
-    #         raise ValueError(f'The Variable {var_name} does not exist.')
-
-    #     if self._sel is not None:
-    #         if var_name not in self._sel:
-    #             raise ValueError(f'The Variable {var_name} does not exist in view.')
-
-    #     if var_name not in self._var_cache:
-    #         var_meta = self._sys_meta.variables[var_name]
-    #         if isinstance(var_meta, data_models.DataVariable):
-    #             var = sc.DataVariable(var_name, self)
-    #         else:
-    #             var = sc.Coordinate(var_name, self)
-    #         self._var_cache[var_name] = var
-
-    #     if self._sel is None:
-    #         return self._var_cache[var_name]
-    #     else:
-    #         return self._var_cache[var_name][self._sel[var_name]]
-
-    #     var_meta = self._sys_meta.variables[var_name]
-    #     if isinstance(var_meta, data_models.DataVariable):
-    #         var = sc.DataVariable(var_name, self)
-    #     else:
-    #         var = sc.Coordinate(var_name, self)
-
-    #     return var
-

     def __getitem__(self, key):
         return self.get(key)

-    # def __setitem__(self, key, value):
-    #     if isinstance(value, sc.Variable):
-    #         setattr(self, key, value)
-    #     else:
-    #         raise TypeError('Assigned value must be a Variable or Coordinate object.')

     def __delitem__(self, key):
         if key not in self:
@@ -168,8 +128,29 @@ class DatasetBase:
         """
         return utils.file_summary(self)

+    @property
+    def coords(self):
+        """
+        Return a tuple of coords.
+        """
+        return tuple(self[coord_name] for coord_name in self.coord_names)

-
+    @property
+    def data_vars(self):
+        """
+        Return a tuple of data variables.
+        """
+        return tuple(self[var_name] for var_name in self.data_var_names)
+
+    @property
+    def variables(self):
+        """
+        Return a tuple of variables.
+        """
+        return tuple(self[var_name] for var_name in self.var_names)
+
+
+    def select(self, sel: dict):
         """
         Filter the dataset variables by a selection of the coordinate positions.
         """
@@ -200,9 +181,9 @@ class DatasetBase:
         return DatasetView(self, _sel)


-    def
+    def select_loc(self, sel: dict):
         """
-        Filter the dataset variables by a selection of the coordinate locations.
+        Filter the dataset variables by a selection of the coordinate locations/values.
         """
         ## Checks on input
         coord_names = self.coord_names
@@ -282,7 +263,7 @@ class DatasetBase:
         # return x1


-    def copy(self, file_path):
+    def copy(self, file_path: Union[str, pathlib.Path], include_data_vars: List[str]=None, exclude_data_vars: List[str]=None):
         """

         """
@@ -290,14 +271,18 @@ class DatasetBase:

         new_ds = open_dataset(file_path, 'n', compression=self.compression, compression_level=self.compression_level, **kwargs)

-
-
+        data_var_names, coord_names = utils.filter_var_names(self, include_data_vars, exclude_data_vars)
+
+        for coord_name in coord_names:
+            coord = self[coord_name]
+            new_coord = new_ds.create.coord.like(coord_name, coord, True)
             new_coord.attrs.update(coord.attrs.data)

-        for
-
+        for data_var_name in data_var_names:
+            data_var = self[data_var_name]
+            new_data_var = new_ds.create.data_var.like(data_var_name, data_var)
             new_data_var.attrs.update(data_var.attrs.data)
-            for write_chunk, data in data_var.iter_chunks(False):
+            for write_chunk, data in data_var.iter_chunks(decoded=False):
                 new_data_var.set(write_chunk, data, False)

         new_ds.attrs.update(self.attrs.data)
@@ -305,90 +290,99 @@ class DatasetBase:
         return new_ds


-    def to_netcdf4(self, file_path: Union[str, pathlib.Path], compression: str='gzip', **file_kwargs):
+    def to_netcdf4(self, file_path: Union[str, pathlib.Path], compression: str='gzip', include_data_vars: List[str]=None, exclude_data_vars: List[str]=None, **file_kwargs):
         """
         Save a dataset to a netcdf4 file using h5netcdf.
         """
         if not import_h5netcdf:
             raise ImportError('h5netcdf must be installed to save files to netcdf4.')

-
-
-
-
-
-
-
-
-
-
-
-            chunk_shape = (chunk_len,)
-
-            h5_coord = h5.create_variable(name, (name,), coord.dtype_encoded, compression=compression, chunks=chunk_shape, fillvalue=coord.fillvalue)
-            attrs = deepcopy(coord.attrs.data)
-            dtype_decoded, dtype_encoded = utils.parse_dtype_names(coord.dtype_decoded, coord.dtype_encoded)
-            if coord.step is not None:
-                attrs['step'] = coord.step
-            if coord.scale_factor is not None:
-                attrs['scale_factor'] = coord.scale_factor
-            elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
-                attrs['scale_factor'] = 1
-            if coord.add_offset is not None:
-                attrs['add_offset'] = coord.add_offset
-            elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
-                attrs['add_offset'] = 0
-            if coord.dtype_decoded.kind == 'M':
-                units = utils.parse_cf_time_units(coord.dtype_decoded)
-                calendar = "proleptic_gregorian"
-                attrs['units'] = units
-                attrs['calendar'] = calendar
-                attrs['standard_name'] = 'time'
-
-            attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded, '_FillValue': coord.fillvalue})
-            h5_coord.attrs.update(attrs)
-
-            for write_chunk, data in coord.iter_chunks(decoded=False):
-                h5_coord[write_chunk] = data
-
-        # Data vars
-        for data_var in self.data_vars:
-            name = data_var.name
-            chunk_shape = []
-            for s, cs in zip(data_var.shape, data_var.chunk_shape):
-                if cs > s:
-                    chunk_shape.append(s)
+        data_var_names, coord_names = utils.filter_var_names(self, include_data_vars, exclude_data_vars)
+
+        with h5netcdf.File(file_path, 'w', **file_kwargs) as h5:
+            # dims/coords
+            for coord_name in coord_names:
+                coord = self[coord_name]
+                h5.dimensions[coord_name] = coord.shape[0]
+                coord_len = coord.shape[0]
+                chunk_len = coord.chunk_shape[0]
+                if chunk_len > coord_len:
+                    chunk_shape = (coord_len,)
                 else:
-                    chunk_shape
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    chunk_shape = (chunk_len,)
+
+                h5_coord = h5.create_variable(coord_name, (coord_name,), coord.dtype_encoded, compression=compression, chunks=chunk_shape, fillvalue=coord.fillvalue)
+                attrs = deepcopy(coord.attrs.data)
+                dtype_decoded, dtype_encoded = utils.parse_dtype_names(coord.dtype_decoded, coord.dtype_encoded)
+                if coord.step is not None:
+                    attrs['step'] = coord.step
+                if coord.scale_factor is not None:
+                    attrs['scale_factor'] = coord.scale_factor
+                elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
+                    attrs['scale_factor'] = 1
+                if coord.add_offset is not None:
+                    attrs['add_offset'] = coord.add_offset
+                elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
+                    attrs['add_offset'] = 0
+                if coord.dtype_decoded.kind == 'M':
+                    units = utils.parse_cf_time_units(coord.dtype_decoded)
+                    calendar = "proleptic_gregorian"
+                    attrs['units'] = units
+                    attrs['calendar'] = calendar
+                    attrs['standard_name'] = 'time'
+
+                if coord.fillvalue is not None:
+                    attrs['_FillValue'] = coord.fillvalue
+
+                attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded})
+                try:
+                    h5_coord.attrs.update(attrs)
+                except Exception as err:
+                    print(attrs)
+                    raise err
+
+                for write_chunk, data in coord.iter_chunks(decoded=False):
+                    h5_coord[write_chunk] = data
+
+            # Data vars
+            for data_var_name in data_var_names:
+                data_var = self[data_var_name]
+                chunk_shape = []
+                for s, cs in zip(data_var.shape, data_var.chunk_shape):
+                    if cs > s:
+                        chunk_shape.append(s)
+                    else:
+                        chunk_shape.append(cs)
+
+                h5_data_var = h5.create_variable(data_var_name, data_var.coord_names, data_var.dtype_encoded, compression=compression, chunks=tuple(chunk_shape), fillvalue=data_var.fillvalue)
+                attrs = deepcopy(data_var.attrs.data)
+                dtype_decoded, dtype_encoded = utils.parse_dtype_names(data_var.dtype_decoded, data_var.dtype_encoded)
+                if data_var.scale_factor is not None:
+                    attrs['scale_factor'] = data_var.scale_factor
+                elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
+                    attrs['scale_factor'] = 1
+                if data_var.add_offset is not None:
+                    attrs['add_offset'] = data_var.add_offset
+                elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
+                    attrs['add_offset'] = 0
+                if data_var.dtype_decoded.kind == 'M':
+                    units = utils.parse_cf_time_units(data_var.dtype_decoded)
+                    calendar = "proleptic_gregorian"
+                    attrs['units'] = units
+                    attrs['calendar'] = calendar
+                    attrs['standard_name'] = 'time'
+
+                if coord.fillvalue is not None:
+                    attrs['_FillValue'] = data_var.fillvalue
+
+                attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded})
+                h5_data_var.attrs.update(attrs)
+
+                for write_chunk, data in data_var.iter_chunks(decoded=False):
+                    h5_data_var[write_chunk] = data
+
+            # Add global attrs
+            h5.attrs.update(self.attrs.data)


 class Dataset(DatasetBase):
@@ -401,7 +395,7 @@ class Dataset(DatasetBase):
         """
         self._blt = open_blt
         self.writable = self._blt.writable
-        self.file_path = file_path
+        self.file_path = pathlib.Path(file_path)
         self.is_open = True

         if hasattr(self._blt, 'load_items'):
@@ -503,27 +497,6 @@ class Dataset(DatasetBase):
         return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.DataVariable))


-    @property
-    def coords(self):
-        """
-        Return a tuple of coords.
-        """
-        return tuple(self[coord_name] for coord_name in self.coord_names)
-
-    @property
-    def data_vars(self):
-        """
-        Return a tuple of data variables.
-        """
-        return tuple(self[var_name] for var_name in self.data_var_names)
-
-    @property
-    def variables(self):
-        """
-        Return a tuple of variables.
-        """
-        return tuple(self[var_name] for var_name in self.var_names)
-
     def prune(self, timestamp=None, reindex=False):
         """
         Prunes deleted data from the file. Returns the number of removed items. The method can also prune remove keys/values older than the timestamp. The user can also reindex the booklet file. False does no reindexing, True increases the n_buckets to a preassigned value, or an int of the n_buckets. True can only be used if the default n_buckets were used at original initialisation.
@@ -597,17 +570,17 @@ class DatasetView(DatasetBase):
         """
         return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.DataVariable) if k in self._sel)

-    @property
-    def coords(self):
-        return tuple(self[coord_name][self._sel[coord_name]] for coord_name in self.coord_names if coord_name in self._sel)
+    # @property
+    # def coords(self):
+    #     return tuple(self[coord_name][self._sel[coord_name]] for coord_name in self.coord_names if coord_name in self._sel)

-    @property
-    def data_vars(self):
-        return tuple(self[var_name][self._sel[var_name]] for var_name in self.data_var_names if var_name in self._sel)
+    # @property
+    # def data_vars(self):
+    #     return tuple(self[var_name][self._sel[var_name]] for var_name in self.data_var_names if var_name in self._sel)

-    @property
-    def variables(self):
-        return tuple(self[var_name][self._sel[var_name]] for var_name in self.var_names if var_name in self._sel)
+    # @property
+    # def variables(self):
+    #     return tuple(self[var_name][self._sel[var_name]] for var_name in self.var_names if var_name in self._sel)



@@ -689,7 +662,7 @@ def open_dataset(file_path: Union[str, pathlib.Path], flag: str = "r", compressi
     else:
         create = False

-    return Dataset(
+    return Dataset(fp, open_blt, create, compression, compression_level)


 def open_edataset(remote_conn: Union[ebooklet.S3Connection, str, dict],
@@ -699,7 +672,7 @@ def open_edataset(remote_conn: Union[ebooklet.S3Connection, str, dict],
                   compression_level: int=1,
                   **kwargs):
     """
-    Open a cfdb that is linked with a remote S3 database. 
+    Open a cfdb that is linked with a remote S3 database.

     Parameters
     -----------
@@ -754,7 +727,7 @@ def open_edataset(remote_conn: Union[ebooklet.S3Connection, str, dict],
     else:
         create = False

-    return EDataset(
+    return EDataset(fp, open_blt, create, compression, compression_level)


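The copy and to_netcdf4 methods above gain include_data_vars/exclude_data_vars filters in 0.1.1. A hedged sketch of how the new keywords might be used, assuming a dataset containing a data variable named 'temperature' (the variable name and paths are hypothetical):

```python
import cfdb

with cfdb.open_dataset('/path/to/data.cfdb') as ds:
    # Copy only 'temperature' (plus the coords it uses) into a new cfdb file.
    new_ds = ds.copy('/path/to/subset.cfdb', include_data_vars=['temperature'])
    new_ds.close()

    # Export everything except 'temperature' to netcdf4 (requires h5netcdf).
    ds.to_netcdf4('/path/to/subset.nc', exclude_data_vars=['temperature'])
```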
cfdb/support_classes.py
CHANGED
@@ -65,6 +65,7 @@ class Rechunker:
             shape of the chunk
         """
         chunk_shape = rechunkit.guess_chunk_shape(self._var.shape, self._var.dtype_encoded, target_chunk_size)
+
         return chunk_shape

     def calc_ideal_read_chunk_shape(self, target_chunk_shape: Tuple[int, ...]):
@@ -556,14 +557,16 @@ class Variable:
        # TODO


-    def iter_chunks(self, decoded=True):
+    def iter_chunks(self, include_data=True, decoded=True):
         """
-        Iterate through the chunks of the variable and return numpy arrays associated with the index slices. This should be the main way for users to get large amounts of data from a variable. The "ends" of the data will be clipped to the shape of the variable (i.e. not all chunks will be the chunk_shape).
+        Iterate through the chunks of the variable and return numpy arrays associated with the index slices (Optional). This should be the main way for users to get large amounts of data from a variable. The "ends" of the data will be clipped to the shape of the variable (i.e. not all chunks will be the chunk_shape).

         Parameters
         ----------
         decoded: bool
             Should the data be decoded?
+        include_data: bool
+            Should the data be included in the output?

         Returns
         -------
@@ -577,19 +580,29 @@ class Variable:
         blank = self._make_blank_chunk_array(decoded)

         slices = indexers.index_combo_all(self._sel, coord_origins, self.shape)
-
-
-
-
-
-
-
-
-            data = self._encoder.decode(self._encoder.from_bytes(b1))
+
+        if include_data:
+            for target_chunk, source_chunk, blt_key in indexers.slices_to_chunks_keys(slices, self.name, self.chunk_shape):
+                # print(target_chunk, source_chunk, blt_key)
+                b1 = self._blt.get(blt_key)
+                if b1 is None:
+                    blank_slices = tuple(slice(0, sc.stop - sc.start) for sc in source_chunk)
+                    yield target_chunk, blank[blank_slices]
                 else:
-
+                    if decoded:
+                        data = self._encoder.decode(self._encoder.from_bytes(b1))
+                    else:
+                        data = self._encoder.from_bytes(b1)

-
+                    yield target_chunk, data[source_chunk]
+        else:
+            starts = tuple(s.start for s in slices)
+            stops = tuple(s.stop for s in slices)
+            chunk_iter2 = rechunkit.chunk_range(starts, stops, self.chunk_shape)
+            for partial_chunk in chunk_iter2:
+                target_chunk = tuple(slice(s.start - start, s.stop - start) for start, s in zip(starts, partial_chunk))
+
+                yield target_chunk

     def __iter__(self):
         return self.iter_chunks()
@@ -859,7 +872,7 @@ class Coordinate(CoordinateView):

     def append(self, data):
         """
-        Append data to the end of the coordinate. The extra length will be added to the associated data variables with the fillvalue. 
+        Append data to the end of the coordinate. The extra length will be added to the associated data variables with the fillvalue.
         """
         if not self.writable:
             raise ValueError('Dataset is not writable.')
@@ -947,6 +960,29 @@ class DataVariableView(Variable):
             self._blt.set(blt_key, self._encoder.to_bytes(new_data))


+    # def set_chunk(self, sel, data, encode=True):
+    #     """
+    #     Set the first chunk associated with the selection.
+    #     """
+    #     if not self.writable:
+    #         raise ValueError('Dataset is not writable.')
+
+    #     if sel is None:
+    #         sel = self._sel
+    #     coord_origins = self.get_coord_origins()
+    #     slices = indexers.index_combo_all(sel, coord_origins, self.shape)
+    #     starts_chunk = tuple((pc.start//cs) * cs for cs, pc in zip(self.chunk_shape, slices))
+    #     chunk_stop = tuple(min(cs, s - sc) for cs, sc, s in zip(self.chunk_shape, starts_chunk, self.shape))
+    #     if data.shape != chunk_stop:
+    #         raise ValueError(f'The shape of this chunk should be {chunk_stop}, but the data passed is {data.shape}')
+
+    #     blt_key = utils.make_var_chunk_key(self.name, starts_chunk)
+    #     if encode:
+    #         self._blt.set(blt_key, self._encoder.to_bytes(self._encoder.encode(data)))
+    #     else:
+    #         self._blt.set(blt_key, self._encoder.to_bytes(data))
+
+
     def __setitem__(self, sel, data):
         """

@@ -954,14 +990,14 @@ class DataVariableView(Variable):
         self.set(sel, data)


-    def groupby(self, coord_names: Iterable, max_mem: int=2**27, decoded=True):
+    def groupby(self, coord_names: Union[str, Iterable], max_mem: int=2**27, decoded=True):
         """
         This method takes one or more coord names to group by and returns a generator. This generator will return chunks of data according to these groupings with the associated tuple of slices. The more max_mem provided, the more efficient the chunking.
         This is effectively the rechunking method where each coord name supplied is set to 1 and all other coords are set to their full their full length.

         Parameters
         ----------
-        coord_names: Iterable
+        coord_names: str or Iterable
            The coord names to group by.
         max_mem: int
            The max allocated memory to perform the chunking operation in bytes. This will only be as large as necessary for an optimum size chunk for the rechunking.
cfdb/tools.py
ADDED
@@ -0,0 +1,427 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jul 17 09:04:49 2025
+
+@author: mike
+"""
+import numpy as np
+import rechunkit
+import copy
+from typing import List, Union
+import pathlib
+
+try:
+    import h5netcdf
+    import_h5netcdf = True
+except ImportError:
+    import_h5netcdf = False
+
+from . import utils, main, indexers, support_classes as sc
+# import utils, main, indexers, support_classes as sc
+
+##########################################
+### Parameters
+
+inv_time_units_dict = {value: key for key, value in utils.time_units_dict.items()}
+
+
+
+#########################################
+### Functions
+
+
+class H5DataVarReader:
+    """
+
+    """
+    def __init__(self, h5_data_var, inverted_coords, shape):
+        """
+
+        """
+        self.is_inverted = any(inverted_coords)
+        self.data_var = h5_data_var
+        self.inverted_coords = inverted_coords
+        self.shape = shape
+
+    def get(self, slices):
+        """
+
+        """
+        if self.is_inverted:
+            source_slices = tuple(slice(s - cs.stop, s - cs.start) if inverted else cs for inverted, cs, s in zip(self.inverted_coords, slices, self.shape))
+            data = np.flip(self.data_var[source_slices], np.nonzero(self.inverted_coords)[0])
+        else:
+            data = self.data_var[slices]
+
+        return data
+
+
+def filter_var_names_h5(h5, include_data_vars, exclude_data_vars):
+    """
+
+    """
+    coord_names_all = set(h5.dims)
+    data_var_names_all = set(h5.variables).difference(coord_names_all)
+
+    if include_data_vars is not None:
+        if isinstance(include_data_vars, str):
+            include_data_vars = [include_data_vars]
+        data_var_names = set(include_data_vars)
+        if not data_var_names.isubset(data_var_names_all):
+            raise ValueError(f'{data_var_names} is not a subset of {data_var_names_all}')
+    elif exclude_data_vars is not None:
+        if isinstance(exclude_data_vars, str):
+            exclude_data_vars = [exclude_data_vars]
+        data_var_names = data_var_names_all.difference(set(exclude_data_vars))
+    else:
+        data_var_names = data_var_names_all
+
+    coord_names = set()
+    for data_var_name in data_var_names:
+        data_var = h5[data_var_name]
+        coord_names.update(data_var.dimensions)
+
+    return data_var_names, coord_names
+
+
+def parse_attrs(attrs):
+    """
+
+    """
+    input_params = {}
+    for attr, value in copy.deepcopy(attrs).items():
+        if attr == 'scale_factor':
+            input_params['scale_factor'] = float(attrs.pop(attr))
+        elif attr == 'add_offset':
+            input_params['add_offset'] = float(attrs.pop(attr))
+        elif attr == '_FillValue':
+            if value is not None:
+                input_params['fillvalue'] = int(attrs.pop(attr))
+        elif attr == 'missing_value':
+            del attrs['missing_value']
+        elif isinstance(value, np.bytes_):
+            attrs[attr] = str(value.astype(str))
+        elif isinstance(value, np.floating):
+            attrs[attr] = float(value)
+        elif isinstance(value, np.integer):
+            attrs[attr] = int(value)
+        elif isinstance(value, np.str_):
+            attrs[attr] = str(value)
+
+    return attrs, input_params
+
+
+def parse_cf_dates(units, dtype_encoded):
+    """
+
+    """
+    if ' since ' in units:
+        freq, start_date = units.split(' since ')
+        freq_code = inv_time_units_dict[freq]
+        origin_date = np.datetime64(start_date, freq_code)
+        unix_date = np.datetime64('1970-01-01', freq_code)
+        # origin_diff = (unix_date - origin_date).astype(dtype_encoded)
+        units = f'{freq} since {str(unix_date)}'
+        if freq_code not in ('M', 'D', 'h', 'm'):
+            dtype_encoded = np.dtype('int64')
+        dtype_decoded = origin_date.dtype
+    else:
+        dtype_decoded = dtype_encoded
+        origin_date = None
+
+    return units, dtype_decoded, dtype_encoded, origin_date
+
+
+def netcdf4_to_cfdb(nc_path: Union[str, pathlib.Path], cfdb_path: Union[str, pathlib.Path], sel: dict=None, sel_loc: dict=None, include_data_vars: List[str]=None, exclude_data_vars: List[str]=None, max_mem: int=2**27, **kwargs):
+    """
+    Simple function to convert a netcdf4 to a cfdb. Selection options are also available. The h5netcdf package must be installed to read netcdf4 files.
+
+    Parameters
+    ----------
+    nc_path: str or pathlib.Path
+        The source netcdf4 file to be converted.
+    cfdb_path: str or pathlib.Path
+        The target path for the cfdb.
+    sel: dict
+        Selection by coordinate indexes.
+    sel_loc: dict
+        Selection by coordinate values.
+    max_mem: int
+        The max memory in bytes if required when coordinates are in decending order (and must be resorted in ascending order).
+    kwargs
+        Any kwargs that can be passed to the cfdb.open_dataset function.
+
+    Returns
+    -------
+    None
+    """
+    if not import_h5netcdf:
+        raise ImportError('h5netcdf must be installed to save files to netcdf4.')
+
+    if (sel is not None) and (sel_loc is not None):
+        raise ValueError('Only one of sel or sel_loc can be passed, not both.')
+
+    ## Get the coordinates data
+    inverted_coords = []
+    # coords_data = {}
+    sel_dict = {}
+    with main.open_dataset(cfdb_path, 'n', **kwargs) as ds:
+        with h5netcdf.File(nc_path, 'r') as h5:
+            dims = tuple(h5.dims)
+
+            ## Check the selection inputs
+            if isinstance(sel, dict):
+                for key in sel:
+                    if key not in dims:
+                        raise ValueError(f'{key} is not a dimension in the dataset.')
+            elif isinstance(sel_loc, dict):
+                for key in sel_loc:
+                    if key not in dims:
+                        raise ValueError(f'{key} is not a dimension in the dataset.')
+
+            data_var_names, coord_names = filter_var_names_h5(h5, include_data_vars, exclude_data_vars)
+
+            for dim in coord_names:
+                h5_coord = h5[dim]
+                dtype_encoded = h5_coord.dtype
+                attrs = dict(h5_coord.attrs)
+                attrs, input_params = parse_attrs(attrs)
+
+                if 'scale_factor' in input_params:
+                    dtype_decoded = np.dtype('float64')
+                elif 'units' in attrs:
+                    units, dtype_decoded, dtype_encoded, origin_date = parse_cf_dates(attrs['units'], dtype_encoded)
+                    attrs['units'] = units
+                else:
+                    dtype_decoded = dtype_encoded
+
+                input_params['dtype_decoded'] = dtype_decoded
+                input_params['dtype_encoded'] = dtype_encoded
+
+                # chunk_start = (0,)
+                shape = h5_coord.shape
+                chunk_shape = h5_coord.chunks
+                if chunk_shape is None:
+                    chunk_shape = rechunkit.guess_chunk_shape(shape, dtype_encoded)
+
+                input_params['chunk_shape'] = chunk_shape
+
+                data = h5_coord[()]
+                h5_coord_diff = np.diff(data)
+                if h5_coord_diff[0] > 0:
+                    order_check = np.all(h5_coord_diff > 0)
+                    inverted = False
+                else:
+                    order_check = np.all(h5_coord_diff < 0)
+                    inverted = True
+
+                inverted_coords.append(inverted)
+
+                if not order_check:
+                    raise ValueError('Either the coordinate values are not increasing/decreasing or they are not unique.')
+
+                data = h5_coord[()]
+
+                if inverted:
+                    data.sort()
+
+                ## Decode data if necessary
+                if dtype_decoded.kind == 'M':
+                    data = data + origin_date
+                elif 'scale_factor' in input_params:
+                    if 'add_offset' in input_params:
+                        add_offset = input_params['add_offset']
+                    else:
+                        add_offset = None
+                    if 'fillvalue' in input_params:
+                        fillvalue = input_params['fillvalue']
+                    else:
+                        fillvalue = None
+                    encoding = sc.Encoding(chunk_shape, dtype_decoded, dtype_encoded, fillvalue, input_params['scale_factor'], add_offset, None)
+
+                    data = encoding.decode(data)
+
+                ## Selection
+                if isinstance(sel, dict):
+                    if dim in sel:
+                        slices = indexers.index_combo_one(sel[dim], (0,), shape, 0)
+                        data = data[slices]
+                    else:
+                        slices = indexers.slice_none((0,), shape, 0)
+
+                elif isinstance(sel_loc, dict):
+                    if dim in sel_loc:
+                        idx = indexers.loc_index_combo_one(sel_loc[dim], data)
+                        slices = indexers.index_combo_one(idx, (0,), shape, 0)
+                        data = data[slices]
+                    else:
+                        slices = indexers.slice_none((0,), shape, 0)
+                else:
+                    slices = indexers.slice_none((0,), shape, 0)
+
+                sel_dict[dim] = slices
+
+                ## Create coord
+                coord = ds.create.coord.generic(dim, data=data, **input_params)
+                coord.attrs.update(attrs)
+
+                # coords_data[dim] = {'data': data, 'attrs': attrs, 'input_params': input_params}
+
+            ## Data Vars
+            inverted_coords = tuple(inverted_coords)
+            # is_inverted = any(inverted_coords)
+
+            for var_name in data_var_names:
+                h5_var = h5[var_name]
+                dtype_encoded = h5_var.dtype
+                attrs = dict(h5_var.attrs)
+                attrs, input_params = parse_attrs(attrs)
+
+                if 'scale_factor' in input_params:
+                    dtype_decoded = np.dtype('float64')
+                elif 'units' in attrs:
+                    units, dtype_decoded, dtype_encoded, origin_date = parse_cf_dates(attrs['units'], dtype_encoded)
+                    attrs['units'] = units
+                else:
+                    dtype_decoded = dtype_encoded
+
+                var_sel = tuple(sel_dict[dim] for dim in h5_var.dimensions)
+
+                # chunk_start = tuple(s.start for s in var_sel)
+                # shape = tuple(s.stop - s.start for s in var_sel)
+                # chunk_start = tuple(0 for i in range(len(h5_var.shape)))
+                shape = h5_var.shape
+                chunk_shape = h5_var.chunks
+                if chunk_shape is None:
+                    chunk_shape = rechunkit.guess_chunk_shape(shape, dtype_encoded)
+
+                data_var = ds.create.data_var.generic(var_name, h5_var.dimensions, dtype_decoded=dtype_decoded, dtype_encoded=dtype_encoded, chunk_shape=chunk_shape, **input_params)
+                data_var.attrs.update(attrs)
+
+                h5_reader = H5DataVarReader(h5_var, inverted_coords, shape)
+
+                chunks_iter = rechunkit.rechunker(h5_reader.get, shape, dtype_encoded, chunk_shape, chunk_shape, max_mem, var_sel)
+                for chunk_slices, encoded_data in chunks_iter:
+                    if not np.all(encoded_data == data_var.fillvalue):
+                        data_var.set(chunk_slices, encoded_data, False)
+
+
+                # chunks_iter = rechunkit.chunk_range(chunk_start, shape, chunk_shape)
+                # for chunk_slices in chunks_iter:
+                #     if is_inverted:
+                #         source_slices = tuple(slice(s - cs.stop, s - cs.start) if inverted else cs for inverted, cs, s in zip(inverted_coords, chunk_slices, shape))
+                #         data = np.flip(h5_var[source_slices], np.nonzero(inverted_coords)[0])
+                #     else:
+                #         data = h5_var[chunk_slices]
+                #     if not np.all(data == data_var.fillvalue):
+                #         # data_var.set_chunk(chunk_slices, data, False)
+                #         data_var.set(chunk_slices, data, False)
+
+            ds.attrs.update(dict(h5.attrs))
+
+
+def cfdb_to_netcdf4(cfdb_path: Union[str, pathlib.Path], nc_path: Union[str, pathlib.Path], compression: str='gzip', sel: dict=None, sel_loc: dict=None, include_data_vars: List[str]=None, exclude_data_vars: List[str]=None, **kwargs):
+    """
+    Simple function to convert a cfdb to a netcdf4. Selection options are also available. The h5netcdf package must be installed to write netcdf4 files.
+
+    Parameters
+    ----------
+    cfdb_path: str or pathlib.Path
+        The source path of the cfdb to be converted.
+    nc_path: str or pathlib.Path
+        The target path for the netcdf4 file.
+    sel: dict
+        Selection by coordinate indexes.
+    sel_loc: dict
+        Selection by coordinate values.
+    max_mem: int
+        The max memory in bytes if required when coordinates are in decending order (and must be resorted in ascending order).
+    kwargs
+        Any kwargs that can be passed to the h5netcdf.File function.
+
+    Returns
+    -------
+    None
+    """
+    if not import_h5netcdf:
+        raise ImportError('h5netcdf must be installed to save files to netcdf4.')
+
+    if (sel is not None) and (sel_loc is not None):
+        raise ValueError('Only one of sel or sel_loc can be passed, not both.')
+
+    with main.open_dataset(cfdb_path) as ds:
+        if isinstance(sel, dict):
+            ds_view = ds.select(sel)
+        elif isinstance(sel_loc, dict):
+            ds_view = ds.select_loc(sel_loc)
+        else:
+            ds_view = ds
+
+        ds_view.to_netcdf4(nc_path, compression=compression, include_data_vars=include_data_vars, exclude_data_vars=exclude_data_vars, **kwargs)
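A hedged usage sketch of the two new converters, assuming h5netcdf is installed and using hypothetical paths and variable names; sel subsets by coordinate index, sel_loc by coordinate value, and only one of the two may be given:

```python
from cfdb import netcdf4_to_cfdb, cfdb_to_netcdf4

# Convert a netcdf4 file, keeping only the 'precip' data variable and the
# first 100 positions along the 'time' coordinate.
netcdf4_to_cfdb('/path/to/source.nc', '/path/to/subset.cfdb',
                sel={'time': slice(0, 100)}, include_data_vars=['precip'])

# Round-trip back to netcdf4, selecting by coordinate values instead.
cfdb_to_netcdf4('/path/to/subset.cfdb', '/path/to/out.nc',
                sel_loc={'latitude': slice(-45.0, -40.0)})
```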
cfdb/utils.py
CHANGED
@@ -233,6 +233,30 @@ default_attrs = dict(
 ### Functions


+def filter_var_names(ds, include_data_vars, exclude_data_vars):
+    """
+
+    """
+    if include_data_vars is not None:
+        if isinstance(include_data_vars, str):
+            include_data_vars = [include_data_vars]
+        data_var_names = set(include_data_vars)
+    elif exclude_data_vars is not None:
+        if isinstance(exclude_data_vars, str):
+            exclude_data_vars = [exclude_data_vars]
+        data_var_names_all = set(ds.data_var_names)
+        data_var_names = data_var_names_all.difference(set(exclude_data_vars))
+    else:
+        data_var_names = set(ds.data_var_names)
+
+    coord_names = set()
+    for data_var_name in data_var_names:
+        data_var = ds[data_var_name]
+        coord_names.update(data_var.coord_names)
+
+    return data_var_names, coord_names
+
+
 def parse_cf_time_units(dtype_decoded):
     """

@@ -1959,6 +1983,7 @@ def file_summary(ds):
         dim_name = var.name
         dtype_name = var.dtype_decoded
         dim_len = var.shape[0]
+        # print(var.data)
         first_value = format_value(var.data[0])
         last_value = format_value(var.data[-1])
         spacing = value_indent - name_indent - len(dim_name)
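The new utils.filter_var_names helper backs the include/exclude keywords: it resolves the requested data variable names and collects the coordinates they depend on. A rough sketch of the intended behaviour, with a hypothetical dataset and variable name:

```python
import cfdb
from cfdb import utils

with cfdb.open_dataset('/path/to/data.cfdb') as ds:
    # Keep only 'temperature'; coord_names ends up holding the coords it uses.
    data_var_names, coord_names = utils.filter_var_names(ds, ['temperature'], None)
    print(data_var_names, coord_names)
```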
cfdb-0.1.1.dist-info/METADATA
ADDED
@@ -0,0 +1,204 @@
+Metadata-Version: 2.4
+Name: cfdb
+Version: 0.1.1
+Summary: CF conventions multi-dimensional array storage on top of Booklet
+Project-URL: Documentation, https://mullenkamp.github.io/cfdb/
+Project-URL: Source, https://github.com/mullenkamp/cfdb
+Author-email: mullenkamp <mullenkamp1@gmail.com>
+License-File: LICENSE
+Classifier: Programming Language :: Python :: 3 :: Only
+Requires-Python: >=3.10
+Requires-Dist: booklet>=0.9.2
+Requires-Dist: cftime
+Requires-Dist: lz4
+Requires-Dist: msgspec
+Requires-Dist: numpy
+Requires-Dist: rechunkit>=0.1.0
+Requires-Dist: zstandard
+Provides-Extra: ebooklet
+Requires-Dist: ebooklet>=0.5.10; extra == 'ebooklet'
+Provides-Extra: netcdf4
+Requires-Dist: h5netcdf; extra == 'netcdf4'
+Description-Content-Type: text/markdown
+
+# cfdb
+
+<p align="center">
+    <em>CF conventions multi-dimensional array storage on top of Booklet</em>
+</p>
+
+[](https://github.com/mullenkamp/cfdb/actions)
+[](https://codecov.io/gh/mullenkamp/cfdb)
+[](https://badge.fury.io/py/cfdb)
+
+---
+
+**Source Code**: <a href="https://github.com/mullenkamp/cfdb" target="_blank">https://github.com/mullenkamp/cfbdb</a>
+
+---
+## Introduction
+cfdb is a pure python database for managing labeled multi-dimensional arrays that mostly follows the [CF conventions](https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html). It is an alternative to netcdf4 and [xarray](https://docs.xarray.dev/). It builds upon the [Booklet](https://github.com/mullenkamp/booklet) for the underlying local file storage and [EBooklet](https://github.com/mullenkamp/ebooklet) to sync and share on any S3 system. It has been designed to follow the programming style of opening a file, iteratively read data, iteratively write data, then closing the file.
+It is thread-safe on reads and writes (using thread locks) and multiprocessing-safe (using file locks) including on the S3 remote (using object locking).
+
+When an error occurs, cfdb will try to properly close the file and remove the file (object) locks. This will not sync any changes, so the user will lose any changes that were not synced. There will be circumstances that can occur that will not properly close the file, so care still needs to be made.
+
+
+## Installation
+
+Install via pip:
+
+```
+pip install cfdb
+```
+
+I'll probably put it on conda-forge once I feel appropriately motivated...
+
+## Usage
+### Opening a file/dataset
+Usage starts off by opening the file (and closing the file when done):
+```python
+import cfdb
+import numpy as np
+
+file_path = '/path/to/file.cfdb'
+
+ds = cfdb.open_dataset(file_path, flag='n')
+# Do fancy stuff
+ds.close()
+```
+
+By default, files will be open for read-only, so we need to specify that we want to write (in this case, 'n' is to open for write and replace the existing file with a new one). There are also some compression options, and those are described in the doc strings. Other kwargs from [Booklet](https://github.com/mullenkamp/booklet?tab=readme-ov-file#usage) can be passed to open_dataset.
+
+The dataset can also be opened with the context manager like so:
+```python
+with cfdb.open_dataset(file_path, flag='n') as ds:
+    print(ds)
+```
+This is generally encouraged as this will ensure that the file is closed properly and file locks are removed.
+
+### Variables
+In the [CF conventions](https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html#dimensions), variables are the objects that store data. These can be 1 dimensional or many dimensional. The dimensions are the labels of 1-D variables (like latitude or time). These 1-D variables are called coordinate variables (or coordinates) with the same name as their associated dimension. All variables that use these coordinates as their dimension labels are called data variables. The combination of multiple data variables with their coordinates in a single file is called a dataset.
+
+#### Coordinates
+Since all data variables must have coordinates, the coordinates must be created before data variables are created.
+
+Coordinates in cfdb are more similar to the definition by the earlier [COARDS conventions](https://ferret.pmel.noaa.gov/Ferret/documentation/coards-netcdf-conventions) than the latter CF conventions. Coordinate values must be unique, sorted in ascending order (a partial consequence to np.sort), and cannot have null (or np.nan) values. The CF conventions do not have those limitations, but these limitations are good! Coordinates must also be only 1-D.
+
+Coordinates can be created using the generic creation method, or templates can be used for some of the more common dimensions (like latitude, longitude, and time):
+```python
+lat_data = np.linspace(0, 19.9, 200, dtype='float32')
+
+with cfdb.open_dataset(file_path, flag='n') as ds:
+    lat_coord = ds.create.coord.latitude(data=lat_data, chunk_shape=(20,))
+    print(lat_coord)
+```
+When creating coordinates, the user can pass a np.ndarray as data and cfdb will figure out the rest (especially when using a creation template). Otherwise, a coordinate can be created without any data input and the data can be appended later:
+```python
+with cfdb.open_dataset(file_path, flag='n') as ds:
+    lat_coord = ds.create.coord.latitude(chunk_shape=(20,))
+    lat_coord.append(lat_data)
+    print(lat_coord.data)
+```
+Coordinate data can either be appended or prepended, but keep in mind the limitations described above! And once assigned, coordinate values cannot be changed. At some point, I'll implement the ability to shrink the size of coordinates, but for now they can only be expanded. As seen in the above example, the .data method will return the entire variable data as a single np.ndarray. Coordinates always hold the entire data in memory, while data variables never do. On disk, all data are stored as chunks, whether it's coordinates or data variables.
+
+Let's add another coordinate for fun:
+```python
+time_data = np.linspace(0, 199, 200, dtype='datetime64[D]')
+
+with cfdb.open_dataset(file_path, flag='w') as ds:
+    time_coord = ds.create.coord.time(data=time_data, dtype_decoded=time_data.dtype, dtype_encoded='int32')
+    print(time_coord)
+```
+A time variable works similarly to other numpy dtypes, but you can assign the precision of the datetime object within the brackets (shown as [D] for days). Look at the [numpy datetime reference page](https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units) for all of the frequency codes. Do not use a frequency code finer than "ns". Encoding a datetime64 dtype to an int32 is possible down to the "m" (minute) resolution (with a max year of 6053), but all higher frequency codes should use int64.
+
+#### Data Variables
+Data variables are created in a similar way as coordinates except that you cannot pass data on creation and you must pass a tuple of the coordinate names to link the coordinates to the data variable:
+```python
+data_var_data = np.linspace(0, 3999.9, 40000, dtype='float64').reshape(200, 200)
+name = 'data_var'
+coords = ('latitude', 'time')
+dtype_encoded = 'int32'
+scale_factor = 0.1
+
+with cfdb.open_dataset(file_path, flag='w') as ds:
+    data_var = ds.create.data_var.generic(name, coords, data_var_data.dtype, dtype_encoded, scale_factor=scale_factor)
+    data_var[:] = data_var_data
+    data_var.attrs['test'] = ['test attributes']
+    print(data_var)
+```
+Since there are no data variable templates (yet), we need to use the generic creation method. If no fillvalue or chunk_shape is passed, then cfdb figures them out for you.
+
+Assigning data to data variables is different to coordinates. Data variables can only be expanded via the coordinates themselves. Assignment and selection is performed by the [basic numpy indexing](https://numpy.org/doc/stable/user/basics.indexing.html#basic-indexing), but not the [advanced indexing](https://numpy.org/doc/stable/user/basics.indexing.html#advanced-indexing).
+
+The example shown above is the simplest way of assigning data to a data variable, but it's not a preferred method when datasets are very large. The recommended way to write (and read) data is to iterate over the chunks:
+
+```python
+with cfdb.open_dataset(file_path, flag='w') as ds:
+    data_var = ds[name]
+    for chunk_slices in data_var.iter_chunks(include_data=False):
+        data_var[chunk_slices] = data_var_data[chunk_slices]
+```
+
+This is a bit of a contrived example given that data_var_data is a single in-memory numpy array, but in many cases your data source will be much larger or in many pieces. The chunk_slices is a tuple of index slices that the data chunk covers. It is the same indexing that can be passed to a numpy ndarray.
+
+Reading data uses the same "iter_chunks" method. This ensures that memory usage is kept to a minimum:
+
+```python
+with cfdb.open_dataset(file_path, flag='r') as ds:
+    data_var = ds[name]
+    for chunk_slices, data in data_var.iter_chunks():
+        print(chunk_slices)
+        print(data.shape)
+```
+
+There's a groupby method that works similarly to the iter_chunks method except that it requires one or more coordinate names (like pandas or xarray):
+
+```python
+with cfdb.open_dataset(file_path, flag='r') as ds:
+    data_var = ds[name]
+    for slices, data in data_var.groupby('latitude'):
+        print(slices)
+        print(data.shape)
+```
+
+#### Rechunking
+All data for variables are stored as chunks of data. For example, the shape of your data may be 2000 x 2000, but the data are stored in 100 x 100 chunks. This is done for a variety of reasons including the ability to compress data. When a variable is created, either the user can define their own chunk shape or cfdb will determine the chunk shape automatically.
+
+The chunk shape defined in the variable might be good for some use cases but not others. The user might have specific use cases where they want a specific chunking; for example the groupby operation listed in the last example. In that example, the user wanted to iterate over each latitude but with all of the other coordinates (in this case the full time coordinate). A groupby operation is a common rechunking example, but the user might need chunks in many different shapes.
+
+The [rechunkit package](https://github.com/mullenkamp/rechunkit) is used under the hood to rechunk the data in cfdb. It is exposed in cfdb via the "rechunker" method in a variable. The Rechunker class has several methods to help the user decide the chunk shape.
+
+```python
+new_chunk_shape = (41, 41)
+
+with cfdb.open_dataset(file_path) as ds:
+    data_var = ds[name]
+    rechunker = data_var.rechunker()
+    alt_chunk_shape = rechunker.guess_chunk_shape(2**8)
+    n_chunks = rechunker.calc_n_chunks()
+    print(n_chunks)
+    n_reads, n_writes = rechunker.calc_n_reads_rechunker(new_chunk_shape)
+    print(n_reads, n_writes)
+    rechunk = rechunker.rechunk(new_chunk_shape)
+
+    for slices, data in rechunk:
+        print(slices)
+        print(data.shape)
+```
+
+#### Serializers
+The datasets can be serialized to netcdf4 via the to_netcdf4 method. You must have the [h5netcdf package](https://h5netcdf.org/) installed for netcdf4. It can also be copied to another cfdb file.
+
+```python
+with open_dataset(file_path) as ds:
+    new_ds = ds.copy(new_file_path)
+    print(new_ds)
+    new_ds.close()
+    ds.to_netcdf4(nc_file_path)
+```
+
+
+
+## License
+
+This project is licensed under the terms of the Apache Software License 2.0.
cfdb-0.1.1.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
+cfdb/__init__.py,sha256=jkHqBmh0aBkjWX3demwH4eh-P9YypPEnFH5ztXXInnc,289
+cfdb/combine.py,sha256=B1CHZ0NOW4O5j_5NYxAHB76X1A5O3HcZwjNGNx_gfEA,19084
+cfdb/core.py,sha256=IMFGhed5pa2zoYlm7reu1TeCQ6nt3sMmy5cE0LcAb2A,37337
+cfdb/creation.py,sha256=hoR0MVEhbcxKT1JnZ2rK1fUAofxOQT0okKmLYh0PBAY,10686
+cfdb/data_models.py,sha256=AtwtH2Uyo84GucW52aX0AzpG3Sbge41F5lrPuRxSLoY,2166
+cfdb/indexers.py,sha256=BvkQLpdm2EM64ZbSjW9ByXfeUoBZ1V-YKNVVvtAy1HY,10462
+cfdb/main.py,sha256=3HoJr8ZZFD3KIPfSUrQTXdJ9xo9I1vcjfQUWvEbmkv8,26020
+cfdb/support_classes.py,sha256=di0pnspL4O4YL5eKJnGhIOFWdk7D3WWH2ltPziqORtM,36456
+cfdb/tools.py,sha256=1hE8Qja-JdFpi_XTGSBuANRujELd2s4uYbSUCAl3Big,13725
+cfdb/utils.py,sha256=sm7oeCxyrtByRlxc8NV52kBMehHwRJMOIhwpeiAmCYY,74114
+cfdb-0.1.1.dist-info/METADATA,sha256=80jGlWL4ONgx8jZ88MV9YsqgbL_EW504flAG2kRkSsg,11513
+cfdb-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+cfdb-0.1.1.dist-info/licenses/LICENSE,sha256=hNqpp2O-F2qp4ozzNN86q1sxnAeFDLNoylHyJK_aiYI,586
+cfdb-0.1.1.dist-info/RECORD,,
cfdb-0.1.0.dist-info/METADATA
DELETED
@@ -1,57 +0,0 @@
-Metadata-Version: 2.4
-Name: cfdb
-Version: 0.1.0
-Summary: CF conventions multi-dimensional array storage on top of Booklet
-Project-URL: Documentation, https://mullenkamp.github.io/cfdb/
-Project-URL: Source, https://github.com/mullenkamp/cfdb
-Author-email: mullenkamp <mullenkamp1@gmail.com>
-License-File: LICENSE
-Classifier: Programming Language :: Python :: 3 :: Only
-Requires-Python: >=3.10
-Requires-Dist: booklet>=0.9.2
-Requires-Dist: cftime
-Requires-Dist: lz4
-Requires-Dist: msgspec
-Requires-Dist: numpy
-Requires-Dist: rechunkit>=0.1.0
-Requires-Dist: zstandard
-Provides-Extra: ebooklet
-Requires-Dist: ebooklet>=0.5.10; extra == 'ebooklet'
-Provides-Extra: netcdf4
-Requires-Dist: h5netcdf; extra == 'netcdf4'
-Description-Content-Type: text/markdown
-
-# cfdb
-
-<p align="center">
-    <em>CF conventions multi-dimensional array storage on top of Booklet</em>
-</p>
-
-[](https://github.com/mullenkamp/cfdb/actions)
-[](https://codecov.io/gh/mullenkamp/cfdb)
-[](https://badge.fury.io/py/cfdb)
-
----
-
-**Documentation**: <a href="https://mullenkamp.github.io/cfdb/" target="_blank">https://mullenkamp.github.io/cfdb/</a>
-
-**Source Code**: <a href="https://github.com/mullenkamp/cfdb" target="_blank">https://github.com/mullenkamp/cfbdb</a>
-
----
-
-## Development
-
-### Coordinate variables
-Must be 1D.
-They should have an "ordered" parameter (bool) that defined whether the coord should always be ordered. Int, float, and datetime should default to True. Only string and category dtypes should default to False.
-There should be a "regular" parameter (bool) with an associated "step" parameter (int or float). It should work similarly to np.arange. Only ints, floats, and datetimes can use this.
-~~Should I add a "unique" parameter (bool)? Maybe I should just enforce this normally?~~ It should enforce uniqueness in the coords.
-There can be a groupby method datasets that would use the rechunker. The rechunker would have the groupby dims set to 1 and the other dims set to the full length.
-
-#### Multi-dimensional coords
-It is possible to create a composite index from multiple 1D coords. But it seems best to implement this type of thing on top of sqlite (or something equivalent).
-Keeping each coord 1D makes implementations quite a bit simpler.
-
-## License
-
-This project is licensed under the terms of the Apache Software License 2.0.
cfdb-0.1.0.dist-info/RECORD
DELETED
@@ -1,13 +0,0 @@
-cfdb/__init__.py,sha256=r2CzHI87AZOW0HsVhl0HpN0-Mjh34eB9WG2sCUK4kiA,233
-cfdb/combine.py,sha256=B1CHZ0NOW4O5j_5NYxAHB76X1A5O3HcZwjNGNx_gfEA,19084
-cfdb/core.py,sha256=IMFGhed5pa2zoYlm7reu1TeCQ6nt3sMmy5cE0LcAb2A,37337
-cfdb/creation.py,sha256=hoR0MVEhbcxKT1JnZ2rK1fUAofxOQT0okKmLYh0PBAY,10686
-cfdb/data_models.py,sha256=AtwtH2Uyo84GucW52aX0AzpG3Sbge41F5lrPuRxSLoY,2166
-cfdb/indexers.py,sha256=Vl0PS44mV4_6IUvPGZIIsd0qQniM3iAtntwe8bhqDrk,10683
-cfdb/main.py,sha256=L23zO_glrsOg8e5Vx2Guef3UOKNOw9KFW0Ray0uGqrQ,26372
-cfdb/support_classes.py,sha256=qoSVC7eX8I_A8xHA8jLnjLD9211bc3Va9HXvo_uct0A,34806
-cfdb/utils.py,sha256=ZEMmvUTa1h-FBCdfBx-oL5xVH7fDbXAObwqYjqeKQGk,73296
-cfdb-0.1.0.dist-info/METADATA,sha256=n8_UtgGhkHZgC3MxxaNcFfr2-682Fxc3RP1FDQ43fik,2528
-cfdb-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-cfdb-0.1.0.dist-info/licenses/LICENSE,sha256=hNqpp2O-F2qp4ozzNN86q1sxnAeFDLNoylHyJK_aiYI,586
-cfdb-0.1.0.dist-info/RECORD,,
{cfdb-0.1.0.dist-info → cfdb-0.1.1.dist-info}/WHEEL
File without changes
{cfdb-0.1.0.dist-info → cfdb-0.1.1.dist-info}/licenses/LICENSE
File without changes