cfdb 0.1.0-py3-none-any.whl → 0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cfdb/__init__.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """CF conventions multi-dimensional array database on top of Booklet"""
2
2
  from cfdb.main import open_dataset, open_edataset
3
3
  from cfdb.utils import compute_scale_and_offset
4
+ from cfdb.tools import netcdf4_to_cfdb, cfdb_to_netcdf4
4
5
  from rechunkit import guess_chunk_shape
5
6
 
6
- __version__ = '0.1.0'
7
+ __version__ = '0.1.1'
cfdb/indexers.py CHANGED
@@ -287,11 +287,8 @@ def slices_to_chunks_keys(slices, var_name, var_chunk_shape, clip_ends=True):
287
287
  """
288
288
  starts = tuple(s.start for s in slices)
289
289
  stops = tuple(s.stop for s in slices)
290
- # chunk_iter1 = rechunkit.chunk_range(starts, stops, var_chunk_shape, clip_ends=False)
291
290
  chunk_iter2 = rechunkit.chunk_range(starts, stops, var_chunk_shape, clip_ends=clip_ends)
292
- # for full_chunk, partial_chunk in zip(chunk_iter1, chunk_iter2):
293
291
  for partial_chunk in chunk_iter2:
294
- # starts_chunk = tuple(s.start for s in full_chunk)
295
292
  starts_chunk = tuple((pc.start//cs) * cs for cs, pc in zip(var_chunk_shape, partial_chunk))
296
293
  new_key = utils.make_var_chunk_key(var_name, starts_chunk)
297
294
 
cfdb/main.py CHANGED
@@ -6,7 +6,7 @@ Created on Tue Jan 7 11:25:06 2025
6
6
  @author: mike
7
7
  """
8
8
  import booklet
9
- from typing import Union
9
+ from typing import Union, List
10
10
  import pathlib
11
11
  import msgspec
12
12
  import weakref
@@ -62,50 +62,10 @@ class DatasetBase:
62
62
  def __contains__(self, key):
63
63
  return key in self.var_names
64
64
 
65
- # def get(self, var_name):
66
- # """
67
-
68
- # """
69
- # if not isinstance(var_name, str):
70
- # raise TypeError('var_name must be a string.')
71
-
72
- # if var_name not in self:
73
- # raise ValueError(f'The Variable {var_name} does not exist.')
74
-
75
- # if self._sel is not None:
76
- # if var_name not in self._sel:
77
- # raise ValueError(f'The Variable {var_name} does not exist in view.')
78
-
79
- # if var_name not in self._var_cache:
80
- # var_meta = self._sys_meta.variables[var_name]
81
- # if isinstance(var_meta, data_models.DataVariable):
82
- # var = sc.DataVariable(var_name, self)
83
- # else:
84
- # var = sc.Coordinate(var_name, self)
85
- # self._var_cache[var_name] = var
86
-
87
- # if self._sel is None:
88
- # return self._var_cache[var_name]
89
- # else:
90
- # return self._var_cache[var_name][self._sel[var_name]]
91
-
92
- # var_meta = self._sys_meta.variables[var_name]
93
- # if isinstance(var_meta, data_models.DataVariable):
94
- # var = sc.DataVariable(var_name, self)
95
- # else:
96
- # var = sc.Coordinate(var_name, self)
97
-
98
- # return var
99
-
100
65
 
101
66
  def __getitem__(self, key):
102
67
  return self.get(key)
103
68
 
104
- # def __setitem__(self, key, value):
105
- # if isinstance(value, sc.Variable):
106
- # setattr(self, key, value)
107
- # else:
108
- # raise TypeError('Assigned value must be a Variable or Coordinate object.')
109
69
 
110
70
  def __delitem__(self, key):
111
71
  if key not in self:
@@ -168,8 +128,29 @@ class DatasetBase:
168
128
  """
169
129
  return utils.file_summary(self)
170
130
 
131
+ @property
132
+ def coords(self):
133
+ """
134
+ Return a tuple of coords.
135
+ """
136
+ return tuple(self[coord_name] for coord_name in self.coord_names)
171
137
 
172
- def sel(self, sel: dict):
138
+ @property
139
+ def data_vars(self):
140
+ """
141
+ Return a tuple of data variables.
142
+ """
143
+ return tuple(self[var_name] for var_name in self.data_var_names)
144
+
145
+ @property
146
+ def variables(self):
147
+ """
148
+ Return a tuple of variables.
149
+ """
150
+ return tuple(self[var_name] for var_name in self.var_names)
151
+
152
+
153
+ def select(self, sel: dict):
173
154
  """
174
155
  Filter the dataset variables by a selection of the coordinate positions.
175
156
  """
@@ -200,9 +181,9 @@ class DatasetBase:
200
181
  return DatasetView(self, _sel)
201
182
 
202
183
 
203
- def sel_loc(self, sel: dict):
184
+ def select_loc(self, sel: dict):
204
185
  """
205
- Filter the dataset variables by a selection of the coordinate locations.
186
+ Filter the dataset variables by a selection of the coordinate locations/values.
206
187
  """
207
188
  ## Checks on input
208
189
  coord_names = self.coord_names
@@ -282,7 +263,7 @@ class DatasetBase:
282
263
  # return x1
283
264
 
284
265
 
285
- def copy(self, file_path):
266
+ def copy(self, file_path: Union[str, pathlib.Path], include_data_vars: List[str]=None, exclude_data_vars: List[str]=None):
286
267
  """
287
268
 
288
269
  """
@@ -290,14 +271,18 @@ class DatasetBase:
290
271
 
291
272
  new_ds = open_dataset(file_path, 'n', compression=self.compression, compression_level=self.compression_level, **kwargs)
292
273
 
293
- for coord in self.coords:
294
- new_coord = new_ds.create.coord.like(coord.name, coord, True)
274
+ data_var_names, coord_names = utils.filter_var_names(self, include_data_vars, exclude_data_vars)
275
+
276
+ for coord_name in coord_names:
277
+ coord = self[coord_name]
278
+ new_coord = new_ds.create.coord.like(coord_name, coord, True)
295
279
  new_coord.attrs.update(coord.attrs.data)
296
280
 
297
- for data_var in self.data_vars:
298
- new_data_var = new_ds.create.data_var.like(data_var.name, data_var)
281
+ for data_var_name in data_var_names:
282
+ data_var = self[data_var_name]
283
+ new_data_var = new_ds.create.data_var.like(data_var_name, data_var)
299
284
  new_data_var.attrs.update(data_var.attrs.data)
300
- for write_chunk, data in data_var.iter_chunks(False):
285
+ for write_chunk, data in data_var.iter_chunks(decoded=False):
301
286
  new_data_var.set(write_chunk, data, False)
302
287
 
303
288
  new_ds.attrs.update(self.attrs.data)
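The new include_data_vars/exclude_data_vars filtering on copy, together with the sel → select rename above, can be combined along these lines (a hedged sketch; the file paths and the 'temperature' variable name are placeholders, not names from the package):

```python
import cfdb

with cfdb.open_dataset('/path/to/source.cfdb') as ds:
    # Positional selection; sel() is renamed to select() in 0.1.1
    view = ds.select({'time': slice(0, 100)})
    print(view)

    # Copy one data variable (plus the coords it uses) to a new file
    new_ds = ds.copy('/path/to/subset.cfdb', include_data_vars=['temperature'])
    new_ds.close()

    # Export everything except that variable to netcdf4 (requires h5netcdf)
    ds.to_netcdf4('/path/to/subset.nc', exclude_data_vars=['temperature'])
```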
@@ -305,90 +290,99 @@ class DatasetBase:
305
290
  return new_ds
306
291
 
307
292
 
308
- def to_netcdf4(self, file_path: Union[str, pathlib.Path], compression: str='gzip', **file_kwargs):
293
+ def to_netcdf4(self, file_path: Union[str, pathlib.Path], compression: str='gzip', include_data_vars: List[str]=None, exclude_data_vars: List[str]=None, **file_kwargs):
309
294
  """
310
295
  Save a dataset to a netcdf4 file using h5netcdf.
311
296
  """
312
297
  if not import_h5netcdf:
313
298
  raise ImportError('h5netcdf must be installed to save files to netcdf4.')
314
299
 
315
- h5 = h5netcdf.File(file_path, 'w', **file_kwargs)
316
-
317
- # dims/coords
318
- for coord in self.coords:
319
- name = coord.name
320
- h5.dimensions[name] = coord.shape[0]
321
- coord_len = coord.shape[0]
322
- chunk_len = coord.chunk_shape[0]
323
- if chunk_len > coord_len:
324
- chunk_shape = (coord_len,)
325
- else:
326
- chunk_shape = (chunk_len,)
327
-
328
- h5_coord = h5.create_variable(name, (name,), coord.dtype_encoded, compression=compression, chunks=chunk_shape, fillvalue=coord.fillvalue)
329
- attrs = deepcopy(coord.attrs.data)
330
- dtype_decoded, dtype_encoded = utils.parse_dtype_names(coord.dtype_decoded, coord.dtype_encoded)
331
- if coord.step is not None:
332
- attrs['step'] = coord.step
333
- if coord.scale_factor is not None:
334
- attrs['scale_factor'] = coord.scale_factor
335
- elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
336
- attrs['scale_factor'] = 1
337
- if coord.add_offset is not None:
338
- attrs['add_offset'] = coord.add_offset
339
- elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
340
- attrs['add_offset'] = 0
341
- if coord.dtype_decoded.kind == 'M':
342
- units = utils.parse_cf_time_units(coord.dtype_decoded)
343
- calendar = "proleptic_gregorian"
344
- attrs['units'] = units
345
- attrs['calendar'] = calendar
346
- attrs['standard_name'] = 'time'
347
-
348
- attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded, '_FillValue': coord.fillvalue})
349
- h5_coord.attrs.update(attrs)
350
-
351
- for write_chunk, data in coord.iter_chunks(decoded=False):
352
- h5_coord[write_chunk] = data
353
-
354
- # Data vars
355
- for data_var in self.data_vars:
356
- name = data_var.name
357
- chunk_shape = []
358
- for s, cs in zip(data_var.shape, data_var.chunk_shape):
359
- if cs > s:
360
- chunk_shape.append(s)
300
+ data_var_names, coord_names = utils.filter_var_names(self, include_data_vars, exclude_data_vars)
301
+
302
+ with h5netcdf.File(file_path, 'w', **file_kwargs) as h5:
303
+ # dims/coords
304
+ for coord_name in coord_names:
305
+ coord = self[coord_name]
306
+ h5.dimensions[coord_name] = coord.shape[0]
307
+ coord_len = coord.shape[0]
308
+ chunk_len = coord.chunk_shape[0]
309
+ if chunk_len > coord_len:
310
+ chunk_shape = (coord_len,)
361
311
  else:
362
- chunk_shape.append(cs)
363
-
364
- h5_data_var = h5.create_variable(name, data_var.coord_names, data_var.dtype_encoded, compression=compression, chunks=tuple(chunk_shape), fillvalue=data_var.fillvalue)
365
- attrs = deepcopy(data_var.attrs.data)
366
- dtype_decoded, dtype_encoded = utils.parse_dtype_names(data_var.dtype_decoded, data_var.dtype_encoded)
367
- if data_var.scale_factor is not None:
368
- attrs['scale_factor'] = data_var.scale_factor
369
- elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
370
- attrs['scale_factor'] = 1
371
- if data_var.add_offset is not None:
372
- attrs['add_offset'] = data_var.add_offset
373
- elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
374
- attrs['add_offset'] = 0
375
- if data_var.dtype_decoded.kind == 'M':
376
- units = utils.parse_cf_time_units(data_var.dtype_decoded)
377
- calendar = "proleptic_gregorian"
378
- attrs['units'] = units
379
- attrs['calendar'] = calendar
380
- attrs['standard_name'] = 'time'
381
-
382
- attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded, '_FillValue': data_var.fillvalue})
383
- h5_data_var.attrs.update(attrs)
384
-
385
- for write_chunk, data in data_var.iter_chunks(decoded=False):
386
- h5_data_var[write_chunk] = data
387
-
388
- # Add global attrs
389
- h5.attrs.update(self.attrs.data)
390
-
391
- h5.close()
312
+ chunk_shape = (chunk_len,)
313
+
314
+ h5_coord = h5.create_variable(coord_name, (coord_name,), coord.dtype_encoded, compression=compression, chunks=chunk_shape, fillvalue=coord.fillvalue)
315
+ attrs = deepcopy(coord.attrs.data)
316
+ dtype_decoded, dtype_encoded = utils.parse_dtype_names(coord.dtype_decoded, coord.dtype_encoded)
317
+ if coord.step is not None:
318
+ attrs['step'] = coord.step
319
+ if coord.scale_factor is not None:
320
+ attrs['scale_factor'] = coord.scale_factor
321
+ elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
322
+ attrs['scale_factor'] = 1
323
+ if coord.add_offset is not None:
324
+ attrs['add_offset'] = coord.add_offset
325
+ elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
326
+ attrs['add_offset'] = 0
327
+ if coord.dtype_decoded.kind == 'M':
328
+ units = utils.parse_cf_time_units(coord.dtype_decoded)
329
+ calendar = "proleptic_gregorian"
330
+ attrs['units'] = units
331
+ attrs['calendar'] = calendar
332
+ attrs['standard_name'] = 'time'
333
+
334
+ if coord.fillvalue is not None:
335
+ attrs['_FillValue'] = coord.fillvalue
336
+
337
+ attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded})
338
+ try:
339
+ h5_coord.attrs.update(attrs)
340
+ except Exception as err:
341
+ print(attrs)
342
+ raise err
343
+
344
+ for write_chunk, data in coord.iter_chunks(decoded=False):
345
+ h5_coord[write_chunk] = data
346
+
347
+ # Data vars
348
+ for data_var_name in data_var_names:
349
+ data_var = self[data_var_name]
350
+ chunk_shape = []
351
+ for s, cs in zip(data_var.shape, data_var.chunk_shape):
352
+ if cs > s:
353
+ chunk_shape.append(s)
354
+ else:
355
+ chunk_shape.append(cs)
356
+
357
+ h5_data_var = h5.create_variable(data_var_name, data_var.coord_names, data_var.dtype_encoded, compression=compression, chunks=tuple(chunk_shape), fillvalue=data_var.fillvalue)
358
+ attrs = deepcopy(data_var.attrs.data)
359
+ dtype_decoded, dtype_encoded = utils.parse_dtype_names(data_var.dtype_decoded, data_var.dtype_encoded)
360
+ if data_var.scale_factor is not None:
361
+ attrs['scale_factor'] = data_var.scale_factor
362
+ elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
363
+ attrs['scale_factor'] = 1
364
+ if data_var.add_offset is not None:
365
+ attrs['add_offset'] = data_var.add_offset
366
+ elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
367
+ attrs['add_offset'] = 0
368
+ if data_var.dtype_decoded.kind == 'M':
369
+ units = utils.parse_cf_time_units(data_var.dtype_decoded)
370
+ calendar = "proleptic_gregorian"
371
+ attrs['units'] = units
372
+ attrs['calendar'] = calendar
373
+ attrs['standard_name'] = 'time'
374
+
375
+ if coord.fillvalue is not None:
376
+ attrs['_FillValue'] = data_var.fillvalue
377
+
378
+ attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded})
379
+ h5_data_var.attrs.update(attrs)
380
+
381
+ for write_chunk, data in data_var.iter_chunks(decoded=False):
382
+ h5_data_var[write_chunk] = data
383
+
384
+ # Add global attrs
385
+ h5.attrs.update(self.attrs.data)
392
386
 
393
387
 
394
388
  class Dataset(DatasetBase):
@@ -401,7 +395,7 @@ class Dataset(DatasetBase):
401
395
  """
402
396
  self._blt = open_blt
403
397
  self.writable = self._blt.writable
404
- self.file_path = file_path
398
+ self.file_path = pathlib.Path(file_path)
405
399
  self.is_open = True
406
400
 
407
401
  if hasattr(self._blt, 'load_items'):
@@ -503,27 +497,6 @@ class Dataset(DatasetBase):
503
497
  return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.DataVariable))
504
498
 
505
499
 
506
- @property
507
- def coords(self):
508
- """
509
- Return a tuple of coords.
510
- """
511
- return tuple(self[coord_name] for coord_name in self.coord_names)
512
-
513
- @property
514
- def data_vars(self):
515
- """
516
- Return a tuple of data variables.
517
- """
518
- return tuple(self[var_name] for var_name in self.data_var_names)
519
-
520
- @property
521
- def variables(self):
522
- """
523
- Return a tuple of variables.
524
- """
525
- return tuple(self[var_name] for var_name in self.var_names)
526
-
527
500
  def prune(self, timestamp=None, reindex=False):
528
501
  """
529
502
  Prunes deleted data from the file. Returns the number of removed items. The method can also remove keys/values older than the timestamp. The user can also reindex the booklet file. False does no reindexing, True increases the n_buckets to a preassigned value, and an int sets n_buckets directly. True can only be used if the default n_buckets were used at original initialisation.
@@ -597,17 +570,17 @@ class DatasetView(DatasetBase):
597
570
  """
598
571
  return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.DataVariable) if k in self._sel)
599
572
 
600
- @property
601
- def coords(self):
602
- return tuple(self[coord_name][self._sel[coord_name]] for coord_name in self.coord_names if coord_name in self._sel)
573
+ # @property
574
+ # def coords(self):
575
+ # return tuple(self[coord_name][self._sel[coord_name]] for coord_name in self.coord_names if coord_name in self._sel)
603
576
 
604
- @property
605
- def data_vars(self):
606
- return tuple(self[var_name][self._sel[var_name]] for var_name in self.data_var_names if var_name in self._sel)
577
+ # @property
578
+ # def data_vars(self):
579
+ # return tuple(self[var_name][self._sel[var_name]] for var_name in self.data_var_names if var_name in self._sel)
607
580
 
608
- @property
609
- def variables(self):
610
- return tuple(self[var_name][self._sel[var_name]] for var_name in self.var_names if var_name in self._sel)
581
+ # @property
582
+ # def variables(self):
583
+ # return tuple(self[var_name][self._sel[var_name]] for var_name in self.var_names if var_name in self._sel)
611
584
 
612
585
 
613
586
 
@@ -689,7 +662,7 @@ def open_dataset(file_path: Union[str, pathlib.Path], flag: str = "r", compressi
689
662
  else:
690
663
  create = False
691
664
 
692
- return Dataset(file_path, open_blt, create, compression, compression_level)
665
+ return Dataset(fp, open_blt, create, compression, compression_level)
693
666
 
694
667
 
695
668
  def open_edataset(remote_conn: Union[ebooklet.S3Connection, str, dict],
@@ -699,7 +672,7 @@ def open_edataset(remote_conn: Union[ebooklet.S3Connection, str, dict],
699
672
  compression_level: int=1,
700
673
  **kwargs):
701
674
  """
702
- Open a cfdb that is linked with a remote S3 database.
675
+ Open a cfdb that is linked with a remote S3 database.
703
676
 
704
677
  Parameters
705
678
  -----------
@@ -754,7 +727,7 @@ def open_edataset(remote_conn: Union[ebooklet.S3Connection, str, dict],
754
727
  else:
755
728
  create = False
756
729
 
757
- return EDataset(file_path, open_blt, create, compression, compression_level)
730
+ return EDataset(fp, open_blt, create, compression, compression_level)
758
731
 
759
732
 
760
733
 
cfdb/support_classes.py CHANGED
@@ -65,6 +65,7 @@ class Rechunker:
65
65
  shape of the chunk
66
66
  """
67
67
  chunk_shape = rechunkit.guess_chunk_shape(self._var.shape, self._var.dtype_encoded, target_chunk_size)
68
+
68
69
  return chunk_shape
69
70
 
70
71
  def calc_ideal_read_chunk_shape(self, target_chunk_shape: Tuple[int, ...]):
@@ -556,14 +557,16 @@ class Variable:
556
557
  # TODO
557
558
 
558
559
 
559
- def iter_chunks(self, decoded=True):
560
+ def iter_chunks(self, include_data=True, decoded=True):
560
561
  """
561
- Iterate through the chunks of the variable and return numpy arrays associated with the index slices. This should be the main way for users to get large amounts of data from a variable. The "ends" of the data will be clipped to the shape of the variable (i.e. not all chunks will be the chunk_shape).
562
+ Iterate through the chunks of the variable and optionally return numpy arrays associated with the index slices. This should be the main way for users to get large amounts of data from a variable. The "ends" of the data will be clipped to the shape of the variable (i.e. not all chunks will be the chunk_shape).
562
563
 
563
564
  Parameters
564
565
  ----------
565
566
  decoded: bool
566
567
  Should the data be decoded?
568
+ include_data: bool
569
+ Should the data be included in the output?
567
570
 
568
571
  Returns
569
572
  -------
@@ -577,19 +580,29 @@ class Variable:
577
580
  blank = self._make_blank_chunk_array(decoded)
578
581
 
579
582
  slices = indexers.index_combo_all(self._sel, coord_origins, self.shape)
580
- for target_chunk, source_chunk, blt_key in indexers.slices_to_chunks_keys(slices, self.name, self.chunk_shape):
581
- # print(target_chunk, source_chunk, blt_key)
582
- b1 = self._blt.get(blt_key)
583
- if b1 is None:
584
- blank_slices = tuple(slice(0, sc.stop - sc.start) for sc in source_chunk)
585
- yield target_chunk, blank[blank_slices]
586
- else:
587
- if decoded:
588
- data = self._encoder.decode(self._encoder.from_bytes(b1))
583
+
584
+ if include_data:
585
+ for target_chunk, source_chunk, blt_key in indexers.slices_to_chunks_keys(slices, self.name, self.chunk_shape):
586
+ # print(target_chunk, source_chunk, blt_key)
587
+ b1 = self._blt.get(blt_key)
588
+ if b1 is None:
589
+ blank_slices = tuple(slice(0, sc.stop - sc.start) for sc in source_chunk)
590
+ yield target_chunk, blank[blank_slices]
589
591
  else:
590
- data = self._encoder.from_bytes(b1)
592
+ if decoded:
593
+ data = self._encoder.decode(self._encoder.from_bytes(b1))
594
+ else:
595
+ data = self._encoder.from_bytes(b1)
591
596
 
592
- yield target_chunk, data[source_chunk]
597
+ yield target_chunk, data[source_chunk]
598
+ else:
599
+ starts = tuple(s.start for s in slices)
600
+ stops = tuple(s.stop for s in slices)
601
+ chunk_iter2 = rechunkit.chunk_range(starts, stops, self.chunk_shape)
602
+ for partial_chunk in chunk_iter2:
603
+ target_chunk = tuple(slice(s.start - start, s.stop - start) for start, s in zip(starts, partial_chunk))
604
+
605
+ yield target_chunk
593
606
 
594
607
  def __iter__(self):
595
608
  return self.iter_chunks()
@@ -859,7 +872,7 @@ class Coordinate(CoordinateView):
859
872
 
860
873
  def append(self, data):
861
874
  """
862
- Append data to the end of the coordinate. The extra length will be added to the associated data variables with the fillvalue.
875
+ Append data to the end of the coordinate. The extra length will be added to the associated data variables with the fillvalue.
863
876
  """
864
877
  if not self.writable:
865
878
  raise ValueError('Dataset is not writable.')
@@ -947,6 +960,29 @@ class DataVariableView(Variable):
947
960
  self._blt.set(blt_key, self._encoder.to_bytes(new_data))
948
961
 
949
962
 
963
+ # def set_chunk(self, sel, data, encode=True):
964
+ # """
965
+ # Set the first chunk associated with the selection.
966
+ # """
967
+ # if not self.writable:
968
+ # raise ValueError('Dataset is not writable.')
969
+
970
+ # if sel is None:
971
+ # sel = self._sel
972
+ # coord_origins = self.get_coord_origins()
973
+ # slices = indexers.index_combo_all(sel, coord_origins, self.shape)
974
+ # starts_chunk = tuple((pc.start//cs) * cs for cs, pc in zip(self.chunk_shape, slices))
975
+ # chunk_stop = tuple(min(cs, s - sc) for cs, sc, s in zip(self.chunk_shape, starts_chunk, self.shape))
976
+ # if data.shape != chunk_stop:
977
+ # raise ValueError(f'The shape of this chunk should be {chunk_stop}, but the data passed is {data.shape}')
978
+
979
+ # blt_key = utils.make_var_chunk_key(self.name, starts_chunk)
980
+ # if encode:
981
+ # self._blt.set(blt_key, self._encoder.to_bytes(self._encoder.encode(data)))
982
+ # else:
983
+ # self._blt.set(blt_key, self._encoder.to_bytes(data))
984
+
985
+
950
986
  def __setitem__(self, sel, data):
951
987
  """
952
988
 
@@ -954,14 +990,14 @@ class DataVariableView(Variable):
954
990
  self.set(sel, data)
955
991
 
956
992
 
957
- def groupby(self, coord_names: Iterable, max_mem: int=2**27, decoded=True):
993
+ def groupby(self, coord_names: Union[str, Iterable], max_mem: int=2**27, decoded=True):
958
994
  """
959
995
  This method takes one or more coord names to group by and returns a generator. This generator will return chunks of data according to these groupings with the associated tuple of slices. The more max_mem provided, the more efficient the chunking.
960
996
  This is effectively the rechunking method where each coord name supplied is set to 1 and all other coords are set to their full length.
961
997
 
962
998
  Parameters
963
999
  ----------
964
- coord_names: Iterable
1000
+ coord_names: str or Iterable
965
1001
  The coord names to group by.
966
1002
  max_mem: int
967
1003
  The max allocated memory to perform the chunking operation in bytes. This will only be as large as necessary for an optimum size chunk for the rechunking.
cfdb/tools.py ADDED
@@ -0,0 +1,427 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Thu Jul 17 09:04:49 2025
5
+
6
+ @author: mike
7
+ """
8
+ import numpy as np
9
+ import rechunkit
10
+ import copy
11
+ from typing import List, Union
12
+ import pathlib
13
+
14
+ try:
15
+ import h5netcdf
16
+ import_h5netcdf = True
17
+ except ImportError:
18
+ import_h5netcdf = False
19
+
20
+ from . import utils, main, indexers, support_classes as sc
21
+ # import utils, main, indexers, support_classes as sc
22
+
23
+ ##########################################
24
+ ### Parameters
25
+
26
+ inv_time_units_dict = {value: key for key, value in utils.time_units_dict.items()}
27
+
28
+
29
+
30
+ #########################################
31
+ ### Functions
32
+
33
+
34
+ class H5DataVarReader:
35
+ """
36
+
37
+ """
38
+ def __init__(self, h5_data_var, inverted_coords, shape):
39
+ """
40
+
41
+ """
42
+ self.is_inverted = any(inverted_coords)
43
+ self.data_var = h5_data_var
44
+ self.inverted_coords = inverted_coords
45
+ self.shape = shape
46
+
47
+ def get(self, slices):
48
+ """
49
+
50
+ """
51
+ if self.is_inverted:
52
+ source_slices = tuple(slice(s - cs.stop, s - cs.start) if inverted else cs for inverted, cs, s in zip(self.inverted_coords, slices, self.shape))
53
+ data = np.flip(self.data_var[source_slices], np.nonzero(self.inverted_coords)[0])
54
+ else:
55
+ data = self.data_var[slices]
56
+
57
+ return data
58
+
59
+
60
+ def filter_var_names_h5(h5, include_data_vars, exclude_data_vars):
61
+ """
62
+
63
+ """
64
+ coord_names_all = set(h5.dims)
65
+ data_var_names_all = set(h5.variables).difference(coord_names_all)
66
+
67
+ if include_data_vars is not None:
68
+ if isinstance(include_data_vars, str):
69
+ include_data_vars = [include_data_vars]
70
+ data_var_names = set(include_data_vars)
71
+ if not data_var_names.issubset(data_var_names_all):
72
+ raise ValueError(f'{data_var_names} is not a subset of {data_var_names_all}')
73
+ elif exclude_data_vars is not None:
74
+ if isinstance(exclude_data_vars, str):
75
+ exclude_data_vars = [exclude_data_vars]
76
+ data_var_names = data_var_names_all.difference(set(exclude_data_vars))
77
+ else:
78
+ data_var_names = data_var_names_all
79
+
80
+ coord_names = set()
81
+ for data_var_name in data_var_names:
82
+ data_var = h5[data_var_name]
83
+ coord_names.update(data_var.dimensions)
84
+
85
+ return data_var_names, coord_names
86
+
87
+
88
+ def parse_attrs(attrs):
89
+ """
90
+
91
+ """
92
+ input_params = {}
93
+ for attr, value in copy.deepcopy(attrs).items():
94
+ if attr == 'scale_factor':
95
+ input_params['scale_factor'] = float(attrs.pop(attr))
96
+ elif attr == 'add_offset':
97
+ input_params['add_offset'] = float(attrs.pop(attr))
98
+ elif attr == '_FillValue':
99
+ if value is not None:
100
+ input_params['fillvalue'] = int(attrs.pop(attr))
101
+ elif attr == 'missing_value':
102
+ del attrs['missing_value']
103
+ elif isinstance(value, np.bytes_):
104
+ attrs[attr] = str(value.astype(str))
105
+ elif isinstance(value, np.floating):
106
+ attrs[attr] = float(value)
107
+ elif isinstance(value, np.integer):
108
+ attrs[attr] = int(value)
109
+ elif isinstance(value, np.str_):
110
+ attrs[attr] = str(value)
111
+
112
+ return attrs, input_params
113
+
114
+
115
+ def parse_cf_dates(units, dtype_encoded):
116
+ """
117
+
118
+ """
119
+ if ' since ' in units:
120
+ freq, start_date = units.split(' since ')
121
+ freq_code = inv_time_units_dict[freq]
122
+ origin_date = np.datetime64(start_date, freq_code)
123
+ unix_date = np.datetime64('1970-01-01', freq_code)
124
+ # origin_diff = (unix_date - origin_date).astype(dtype_encoded)
125
+ units = f'{freq} since {str(unix_date)}'
126
+ if freq_code not in ('M', 'D', 'h', 'm'):
127
+ dtype_encoded = np.dtype('int64')
128
+ dtype_decoded = origin_date.dtype
129
+ else:
130
+ dtype_decoded = dtype_encoded
131
+ origin_date = None
132
+
133
+ return units, dtype_decoded, dtype_encoded, origin_date
134
+
135
+
136
+ def netcdf4_to_cfdb(nc_path: Union[str, pathlib.Path], cfdb_path: Union[str, pathlib.Path], sel: dict=None, sel_loc: dict=None, include_data_vars: List[str]=None, exclude_data_vars: List[str]=None, max_mem: int=2**27, **kwargs):
137
+ """
138
+ Simple function to convert a netcdf4 to a cfdb. Selection options are also available. The h5netcdf package must be installed to read netcdf4 files.
139
+
140
+ Parameters
141
+ ----------
142
+ nc_path: str or pathlib.Path
143
+ The source netcdf4 file to be converted.
144
+ cfdb_path: str or pathlib.Path
145
+ The target path for the cfdb.
146
+ sel: dict
147
+ Selection by coordinate indexes.
148
+ sel_loc: dict
149
+ Selection by coordinate values.
150
+ max_mem: int
151
+ The max memory in bytes if required when coordinates are in descending order (and must be resorted in ascending order).
152
+ kwargs
153
+ Any kwargs that can be passed to the cfdb.open_dataset function.
154
+
155
+ Returns
156
+ -------
157
+ None
158
+ """
159
+ if not import_h5netcdf:
160
+ raise ImportError('h5netcdf must be installed to save files to netcdf4.')
161
+
162
+ if (sel is not None) and (sel_loc is not None):
163
+ raise ValueError('Only one of sel or sel_loc can be passed, not both.')
164
+
165
+ ## Get the coordinates data
166
+ inverted_coords = []
167
+ # coords_data = {}
168
+ sel_dict = {}
169
+ with main.open_dataset(cfdb_path, 'n', **kwargs) as ds:
170
+ with h5netcdf.File(nc_path, 'r') as h5:
171
+ dims = tuple(h5.dims)
172
+
173
+ ## Check the selection inputs
174
+ if isinstance(sel, dict):
175
+ for key in sel:
176
+ if key not in dims:
177
+ raise ValueError(f'{key} is not a dimension in the dataset.')
178
+ elif isinstance(sel_loc, dict):
179
+ for key in sel_loc:
180
+ if key not in dims:
181
+ raise ValueError(f'{key} is not a dimension in the dataset.')
182
+
183
+ data_var_names, coord_names = filter_var_names_h5(h5, include_data_vars, exclude_data_vars)
184
+
185
+ for dim in coord_names:
186
+ h5_coord = h5[dim]
187
+ dtype_encoded = h5_coord.dtype
188
+ attrs = dict(h5_coord.attrs)
189
+ attrs, input_params = parse_attrs(attrs)
190
+
191
+ if 'scale_factor' in input_params:
192
+ dtype_decoded = np.dtype('float64')
193
+ elif 'units' in attrs:
194
+ units, dtype_decoded, dtype_encoded, origin_date = parse_cf_dates(attrs['units'], dtype_encoded)
195
+ attrs['units'] = units
196
+ else:
197
+ dtype_decoded = dtype_encoded
198
+
199
+ input_params['dtype_decoded'] = dtype_decoded
200
+ input_params['dtype_encoded'] = dtype_encoded
201
+
202
+ # chunk_start = (0,)
203
+ shape = h5_coord.shape
204
+ chunk_shape = h5_coord.chunks
205
+ if chunk_shape is None:
206
+ chunk_shape = rechunkit.guess_chunk_shape(shape, dtype_encoded)
207
+
208
+ input_params['chunk_shape'] = chunk_shape
209
+
210
+ data = h5_coord[()]
211
+ h5_coord_diff = np.diff(data)
212
+ if h5_coord_diff[0] > 0:
213
+ order_check = np.all(h5_coord_diff > 0)
214
+ inverted = False
215
+ else:
216
+ order_check = np.all(h5_coord_diff < 0)
217
+ inverted = True
218
+
219
+ inverted_coords.append(inverted)
220
+
221
+ if not order_check:
222
+ raise ValueError('Either the coordinate values are not increasing/decreasing or they are not unique.')
223
+
224
+ data = h5_coord[()]
225
+
226
+ if inverted:
227
+ data.sort()
228
+
229
+ ## Decode data if necessary
230
+ if dtype_decoded.kind == 'M':
231
+ data = data + origin_date
232
+ elif 'scale_factor' in input_params:
233
+ if 'add_offset' in input_params:
234
+ add_offset = input_params['add_offset']
235
+ else:
236
+ add_offset = None
237
+ if 'fillvalue' in input_params:
238
+ fillvalue = input_params['fillvalue']
239
+ else:
240
+ fillvalue = None
241
+ encoding = sc.Encoding(chunk_shape, dtype_decoded, dtype_encoded, fillvalue, input_params['scale_factor'], add_offset, None)
242
+
243
+ data = encoding.decode(data)
244
+
245
+ ## Selection
246
+ if isinstance(sel, dict):
247
+ if dim in sel:
248
+ slices = indexers.index_combo_one(sel[dim], (0,), shape, 0)
249
+ data = data[slices]
250
+ else:
251
+ slices = indexers.slice_none((0,), shape, 0)
252
+
253
+ elif isinstance(sel_loc, dict):
254
+ if dim in sel_loc:
255
+ idx = indexers.loc_index_combo_one(sel_loc[dim], data)
256
+ slices = indexers.index_combo_one(idx, (0,), shape, 0)
257
+ data = data[slices]
258
+ else:
259
+ slices = indexers.slice_none((0,), shape, 0)
260
+ else:
261
+ slices = indexers.slice_none((0,), shape, 0)
262
+
263
+ sel_dict[dim] = slices
264
+
265
+ ## Create coord
266
+ coord = ds.create.coord.generic(dim, data=data, **input_params)
267
+ coord.attrs.update(attrs)
268
+
269
+ # coords_data[dim] = {'data': data, 'attrs': attrs, 'input_params': input_params}
270
+
271
+ ## Data Vars
272
+ inverted_coords = tuple(inverted_coords)
273
+ # is_inverted = any(inverted_coords)
274
+
275
+ for var_name in data_var_names:
276
+ h5_var = h5[var_name]
277
+ dtype_encoded = h5_var.dtype
278
+ attrs = dict(h5_var.attrs)
279
+ attrs, input_params = parse_attrs(attrs)
280
+
281
+ if 'scale_factor' in input_params:
282
+ dtype_decoded = np.dtype('float64')
283
+ elif 'units' in attrs:
284
+ units, dtype_decoded, dtype_encoded, origin_date = parse_cf_dates(attrs['units'], dtype_encoded)
285
+ attrs['units'] = units
286
+ else:
287
+ dtype_decoded = dtype_encoded
288
+
289
+ var_sel = tuple(sel_dict[dim] for dim in h5_var.dimensions)
290
+
291
+ # chunk_start = tuple(s.start for s in var_sel)
292
+ # shape = tuple(s.stop - s.start for s in var_sel)
293
+ # chunk_start = tuple(0 for i in range(len(h5_var.shape)))
294
+ shape = h5_var.shape
295
+ chunk_shape = h5_var.chunks
296
+ if chunk_shape is None:
297
+ chunk_shape = rechunkit.guess_chunk_shape(shape, dtype_encoded)
298
+
299
+ data_var = ds.create.data_var.generic(var_name, h5_var.dimensions, dtype_decoded=dtype_decoded, dtype_encoded=dtype_encoded, chunk_shape=chunk_shape, **input_params)
300
+ data_var.attrs.update(attrs)
301
+
302
+ h5_reader = H5DataVarReader(h5_var, inverted_coords, shape)
303
+
304
+ chunks_iter = rechunkit.rechunker(h5_reader.get, shape, dtype_encoded, chunk_shape, chunk_shape, max_mem, var_sel)
305
+ for chunk_slices, encoded_data in chunks_iter:
306
+ if not np.all(encoded_data == data_var.fillvalue):
307
+ data_var.set(chunk_slices, encoded_data, False)
308
+
309
+
310
+ # chunks_iter = rechunkit.chunk_range(chunk_start, shape, chunk_shape)
311
+ # for chunk_slices in chunks_iter:
312
+ # if is_inverted:
313
+ # source_slices = tuple(slice(s - cs.stop, s - cs.start) if inverted else cs for inverted, cs, s in zip(inverted_coords, chunk_slices, shape))
314
+ # data = np.flip(h5_var[source_slices], np.nonzero(inverted_coords)[0])
315
+ # else:
316
+ # data = h5_var[chunk_slices]
317
+ # if not np.all(data == data_var.fillvalue):
318
+ # # data_var.set_chunk(chunk_slices, data, False)
319
+ # data_var.set(chunk_slices, data, False)
320
+
321
+ ds.attrs.update(dict(h5.attrs))
322
+
323
+
324
+ def cfdb_to_netcdf4(cfdb_path: Union[str, pathlib.Path], nc_path: Union[str, pathlib.Path], compression: str='gzip', sel: dict=None, sel_loc: dict=None, include_data_vars: List[str]=None, exclude_data_vars: List[str]=None, **kwargs):
325
+ """
326
+ Simple function to convert a cfdb to a netcdf4. Selection options are also available. The h5netcdf package must be installed to write netcdf4 files.
327
+
328
+ Parameters
329
+ ----------
330
+ cfdb_path: str or pathlib.Path
331
+ The source path of the cfdb to be converted.
332
+ nc_path: str or pathlib.Path
333
+ The target path for the netcdf4 file.
334
+ sel: dict
335
+ Selection by coordinate indexes.
336
+ sel_loc: dict
337
+ Selection by coordinate values.
338
+ max_mem: int
339
+ The max memory in bytes if required when coordinates are in descending order (and must be resorted in ascending order).
340
+ kwargs
341
+ Any kwargs that can be passed to the h5netcdf.File function.
342
+
343
+ Returns
344
+ -------
345
+ None
346
+ """
347
+ if not import_h5netcdf:
348
+ raise ImportError('h5netcdf must be installed to save files to netcdf4.')
349
+
350
+ if (sel is not None) and (sel_loc is not None):
351
+ raise ValueError('Only one of sel or sel_loc can be passed, not both.')
352
+
353
+ with main.open_dataset(cfdb_path) as ds:
354
+ if isinstance(sel, dict):
355
+ ds_view = ds.select(sel)
356
+ elif isinstance(sel_loc, dict):
357
+ ds_view = ds.select_loc(sel_loc)
358
+ else:
359
+ ds_view = ds
360
+
361
+ ds_view.to_netcdf4(nc_path, compression=compression, include_data_vars=include_data_vars, exclude_data_vars=exclude_data_vars, **kwargs)
362
+
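A hedged usage sketch of the two converters defined in this new module (the file paths and the 'precip' variable name are placeholders):

```python
from cfdb import netcdf4_to_cfdb, cfdb_to_netcdf4

# netcdf4 -> cfdb, keeping only the first 24 positions along the time dimension
netcdf4_to_cfdb('input.nc', 'converted.cfdb', sel={'time': slice(0, 24)})

# cfdb -> netcdf4, dropping one data variable
cfdb_to_netcdf4('converted.cfdb', 'roundtrip.nc', exclude_data_vars=['precip'])
```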
cfdb/utils.py CHANGED
@@ -233,6 +233,30 @@ default_attrs = dict(
233
233
  ### Functions
234
234
 
235
235
 
236
+ def filter_var_names(ds, include_data_vars, exclude_data_vars):
237
+ """
238
+
239
+ """
240
+ if include_data_vars is not None:
241
+ if isinstance(include_data_vars, str):
242
+ include_data_vars = [include_data_vars]
243
+ data_var_names = set(include_data_vars)
244
+ elif exclude_data_vars is not None:
245
+ if isinstance(exclude_data_vars, str):
246
+ exclude_data_vars = [exclude_data_vars]
247
+ data_var_names_all = set(ds.data_var_names)
248
+ data_var_names = data_var_names_all.difference(set(exclude_data_vars))
249
+ else:
250
+ data_var_names = set(ds.data_var_names)
251
+
252
+ coord_names = set()
253
+ for data_var_name in data_var_names:
254
+ data_var = ds[data_var_name]
255
+ coord_names.update(data_var.coord_names)
256
+
257
+ return data_var_names, coord_names
258
+
259
+
236
260
  def parse_cf_time_units(dtype_decoded):
237
261
  """
238
262
 
@@ -1959,6 +1983,7 @@ def file_summary(ds):
1959
1983
  dim_name = var.name
1960
1984
  dtype_name = var.dtype_decoded
1961
1985
  dim_len = var.shape[0]
1986
+ # print(var.data)
1962
1987
  first_value = format_value(var.data[0])
1963
1988
  last_value = format_value(var.data[-1])
1964
1989
  spacing = value_indent - name_indent - len(dim_name)
@@ -0,0 +1,204 @@
1
+ Metadata-Version: 2.4
2
+ Name: cfdb
3
+ Version: 0.1.1
4
+ Summary: CF conventions multi-dimensional array storage on top of Booklet
5
+ Project-URL: Documentation, https://mullenkamp.github.io/cfdb/
6
+ Project-URL: Source, https://github.com/mullenkamp/cfdb
7
+ Author-email: mullenkamp <mullenkamp1@gmail.com>
8
+ License-File: LICENSE
9
+ Classifier: Programming Language :: Python :: 3 :: Only
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: booklet>=0.9.2
12
+ Requires-Dist: cftime
13
+ Requires-Dist: lz4
14
+ Requires-Dist: msgspec
15
+ Requires-Dist: numpy
16
+ Requires-Dist: rechunkit>=0.1.0
17
+ Requires-Dist: zstandard
18
+ Provides-Extra: ebooklet
19
+ Requires-Dist: ebooklet>=0.5.10; extra == 'ebooklet'
20
+ Provides-Extra: netcdf4
21
+ Requires-Dist: h5netcdf; extra == 'netcdf4'
22
+ Description-Content-Type: text/markdown
23
+
24
+ # cfdb
25
+
26
+ <p align="center">
27
+ <em>CF conventions multi-dimensional array storage on top of Booklet</em>
28
+ </p>
29
+
30
+ [![build](https://github.com/mullenkamp/cfdb/workflows/Build/badge.svg)](https://github.com/mullenkamp/cfdb/actions)
31
+ [![codecov](https://codecov.io/gh/mullenkamp/cfdb/branch/master/graph/badge.svg)](https://codecov.io/gh/mullenkamp/cfdb)
32
+ [![PyPI version](https://badge.fury.io/py/cfdb.svg)](https://badge.fury.io/py/cfdb)
33
+
34
+ ---
35
+
36
+ **Source Code**: <a href="https://github.com/mullenkamp/cfdb" target="_blank">https://github.com/mullenkamp/cfdb</a>
37
+
38
+ ---
39
+ ## Introduction
40
+ cfdb is a pure python database for managing labeled multi-dimensional arrays that mostly follows the [CF conventions](https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html). It is an alternative to netcdf4 and [xarray](https://docs.xarray.dev/). It builds upon [Booklet](https://github.com/mullenkamp/booklet) for the underlying local file storage and [EBooklet](https://github.com/mullenkamp/ebooklet) to sync and share on any S3 system. It has been designed to follow the programming style of opening a file, iteratively reading data, iteratively writing data, then closing the file.
41
+ It is thread-safe on reads and writes (using thread locks) and multiprocessing-safe (using file locks) including on the S3 remote (using object locking).
42
+
43
+ When an error occurs, cfdb will try to properly close the file and remove the file (object) locks. This will not sync any changes, so the user will lose any changes that were not synced. There may be circumstances in which the file is not properly closed, so care still needs to be taken.
44
+
45
+
46
+ ## Installation
47
+
48
+ Install via pip:
49
+
50
+ ```
51
+ pip install cfdb
52
+ ```
53
+
54
+ I'll probably put it on conda-forge once I feel appropriately motivated...
55
+
56
+ ## Usage
57
+ ### Opening a file/dataset
58
+ Usage starts off by opening the file (and closing the file when done):
59
+ ```python
60
+ import cfdb
61
+ import numpy as np
62
+
63
+ file_path = '/path/to/file.cfdb'
64
+
65
+ ds = cfdb.open_dataset(file_path, flag='n')
66
+ # Do fancy stuff
67
+ ds.close()
68
+ ```
69
+
70
+ By default, files are opened read-only, so we need to specify that we want to write (in this case, 'n' opens the file for writing and replaces any existing file with a new one). There are also some compression options, which are described in the doc strings. Other kwargs from [Booklet](https://github.com/mullenkamp/booklet?tab=readme-ov-file#usage) can be passed to open_dataset.
71
+
72
+ The dataset can also be opened with the context manager like so:
73
+ ```python
74
+ with cfdb.open_dataset(file_path, flag='n') as ds:
75
+ print(ds)
76
+ ```
77
+ This is generally encouraged as this will ensure that the file is closed properly and file locks are removed.
78
+
79
+ ### Variables
80
+ In the [CF conventions](https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html#dimensions), variables are the objects that store data. They can be one-dimensional or multi-dimensional. Dimensions are labeled by 1-D variables (like latitude or time); these 1-D variables are called coordinate variables (or coordinates) and have the same name as their associated dimension. All variables that use these coordinates as their dimension labels are called data variables. The combination of multiple data variables with their coordinates in a single file is called a dataset.
81
+
82
+ #### Coordinates
83
+ Since all data variables must have coordinates, the coordinates must be created before data variables are created.
84
+
85
+ Coordinates in cfdb are closer to the definition in the earlier [COARDS conventions](https://ferret.pmel.noaa.gov/Ferret/documentation/coards-netcdf-conventions) than to the later CF conventions. Coordinate values must be unique, sorted in ascending order (partly a consequence of using np.sort), and cannot have null (or np.nan) values. The CF conventions do not have those limitations, but these limitations are good! Coordinates must also be only 1-D.
86
+
87
+ Coordinates can be created using the generic creation method, or templates can be used for some of the more common dimensions (like latitude, longitude, and time):
88
+ ```python
89
+ lat_data = np.linspace(0, 19.9, 200, dtype='float32')
90
+
91
+ with cfdb.open_dataset(file_path, flag='n') as ds:
92
+ lat_coord = ds.create.coord.latitude(data=lat_data, chunk_shape=(20,))
93
+ print(lat_coord)
94
+ ```
95
+ When creating coordinates, the user can pass a np.ndarray as data and cfdb will figure out the rest (especially when using a creation template). Otherwise, a coordinate can be created without any data input and the data can be appended later:
96
+ ```python
97
+ with cfdb.open_dataset(file_path, flag='n') as ds:
98
+ lat_coord = ds.create.coord.latitude(chunk_shape=(20,))
99
+ lat_coord.append(lat_data)
100
+ print(lat_coord.data)
101
+ ```
102
+ Coordinate data can either be appended or prepended, but keep in mind the limitations described above! And once assigned, coordinate values cannot be changed. At some point, I'll implement the ability to shrink the size of coordinates, but for now they can only be expanded. As seen in the above example, the .data property returns the entire variable data as a single np.ndarray. Coordinates always hold their entire data in memory, while data variables never do. On disk, all data are stored as chunks, whether they belong to coordinates or data variables.
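+
+ A minimal sketch of prepending, assuming a prepend method that mirrors append as described above (the prepended values must keep the coordinate sorted and unique):
+
+ ```python
+ with cfdb.open_dataset(file_path, flag='w') as ds:
+     lat_coord = ds['latitude']
+     lat_coord.prepend(np.linspace(-5, -0.1, 50, dtype='float32'))
+     print(lat_coord.data)
+ ```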
103
+
104
+ Let's add another coordinate for fun:
105
+ ```python
106
+ time_data = np.linspace(0, 199, 200, dtype='datetime64[D]')
107
+
108
+ with cfdb.open_dataset(file_path, flag='w') as ds:
109
+ time_coord = ds.create.coord.time(data=time_data, dtype_decoded=time_data.dtype, dtype_encoded='int32')
110
+ print(time_coord)
111
+ ```
112
+ A time variable works similarly to other numpy dtypes, but you can assign the precision of the datetime object within the brackets (shown as [D] for days). Look at the [numpy datetime reference page](https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units) for all of the frequency codes. Do not use a frequency code finer than "ns". Encoding a datetime64 dtype to an int32 is possible down to the "m" (minute) resolution (with a max year of 6053), but all higher frequency codes should use int64.
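+
+ For example, a second-precision time coordinate should be encoded as int64 (a hedged sketch using the same creation call as above; the file path is a placeholder):
+
+ ```python
+ time_s = np.arange('2020-01-01', '2020-01-02', dtype='datetime64[s]')
+
+ with cfdb.open_dataset('/path/to/file_s.cfdb', flag='n') as ds:
+     time_coord = ds.create.coord.time(data=time_s, dtype_decoded=time_s.dtype, dtype_encoded='int64')
+     print(time_coord)
+ ```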
113
+
114
+ #### Data Variables
115
+ Data variables are created in a similar way as coordinates except that you cannot pass data on creation and you must pass a tuple of the coordinate names to link the coordinates to the data variable:
116
+ ```python
117
+ data_var_data = np.linspace(0, 3999.9, 40000, dtype='float64').reshape(200, 200)
118
+ name = 'data_var'
119
+ coords = ('latitude', 'time')
120
+ dtype_encoded = 'int32'
121
+ scale_factor = 0.1
122
+
123
+ with cfdb.open_dataset(file_path, flag='w') as ds:
124
+ data_var = ds.create.data_var.generic(name, coords, data_var_data.dtype, dtype_encoded, scale_factor=scale_factor)
125
+ data_var[:] = data_var_data
126
+ data_var.attrs['test'] = ['test attributes']
127
+ print(data_var)
128
+ ```
129
+ Since there are no data variable templates (yet), we need to use the generic creation method. If no fillvalue or chunk_shape is passed, then cfdb figures them out for you.
130
+
131
+ Assigning data to data variables is different to coordinates. Data variables can only be expanded via the coordinates themselves. Assignment and selection is performed by the [basic numpy indexing](https://numpy.org/doc/stable/user/basics.indexing.html#basic-indexing), but not the [advanced indexing](https://numpy.org/doc/stable/user/basics.indexing.html#advanced-indexing).
132
+
133
+ The example shown above is the simplest way of assigning data to a data variable, but it's not a preferred method when datasets are very large. The recommended way to write (and read) data is to iterate over the chunks:
134
+
135
+ ```python
136
+ with cfdb.open_dataset(file_path, flag='w') as ds:
137
+ data_var = ds[name]
138
+ for chunk_slices in data_var.iter_chunks(include_data=False):
139
+ data_var[chunk_slices] = data_var_data[chunk_slices]
140
+ ```
141
+
142
+ This is a bit of a contrived example given that data_var_data is a single in-memory numpy array, but in many cases your data source will be much larger or in many pieces. chunk_slices is a tuple of index slices that the data chunk covers; it is the same indexing that can be passed to a numpy ndarray.
143
+
144
+ Reading data uses the same "iter_chunks" method. This ensures that memory usage is kept to a minimum:
145
+
146
+ ```python
147
+ with cfdb.open_dataset(file_path, flag='r') as ds:
148
+ data_var = ds[name]
149
+ for chunk_slices, data in data_var.iter_chunks():
150
+ print(chunk_slices)
151
+ print(data.shape)
152
+ ```
153
+
154
+ There's a groupby method that works similarly to the iter_chunks method except that it requires one or more coordinate names (like pandas or xarray):
155
+
156
+ ```python
157
+ with cfdb.open_dataset(file_path, flag='r') as ds:
158
+ data_var = ds[name]
159
+ for slices, data in data_var.groupby('latitude'):
160
+ print(slices)
161
+ print(data.shape)
162
+ ```
163
+
164
+ #### Rechunking
165
+ All data for variables are stored as chunks of data. For example, the shape of your data may be 2000 x 2000, but the data are stored in 100 x 100 chunks. This is done for a variety of reasons including the ability to compress data. When a variable is created, either the user can define their own chunk shape or cfdb will determine the chunk shape automatically.
166
+
167
+ The chunk shape defined on the variable might be good for some use cases but not others. The user might want a different chunking for a specific operation; for example, the groupby operation in the last example, where the user wanted to iterate over each latitude while keeping the full length of the other coordinates (in this case time). A groupby operation is a common rechunking example, but the user might need chunks in many different shapes.
168
+
169
+ The [rechunkit package](https://github.com/mullenkamp/rechunkit) is used under the hood to rechunk the data in cfdb. It is exposed in cfdb via the "rechunker" method on a variable. The Rechunker class has several methods to help the user decide on a chunk shape.
170
+
171
+ ```python
172
+ new_chunk_shape = (41, 41)
173
+
174
+ with cfdb.open_dataset(file_path) as ds:
175
+ data_var = ds[name]
176
+ rechunker = data_var.rechunker()
177
+ alt_chunk_shape = rechunker.guess_chunk_shape(2**8)
178
+ n_chunks = rechunker.calc_n_chunks()
179
+ print(n_chunks)
180
+ n_reads, n_writes = rechunker.calc_n_reads_rechunker(new_chunk_shape)
181
+ print(n_reads, n_writes)
182
+ rechunk = rechunker.rechunk(new_chunk_shape)
183
+
184
+ for slices, data in rechunk:
185
+ print(slices)
186
+ print(data.shape)
187
+ ```
188
+
189
+ #### Serializers
190
+ Datasets can be serialized to netcdf4 via the to_netcdf4 method; you must have the [h5netcdf package](https://h5netcdf.org/) installed for netcdf4 support. A dataset can also be copied to another cfdb file via the copy method.
191
+
192
+ ```python
193
+ with cfdb.open_dataset(file_path) as ds:
194
+ new_ds = ds.copy(new_file_path)
195
+ print(new_ds)
196
+ new_ds.close()
197
+ ds.to_netcdf4(nc_file_path)
198
+ ```
199
+
200
+
201
+
202
+ ## License
203
+
204
+ This project is licensed under the terms of the Apache Software License 2.0.
@@ -0,0 +1,14 @@
1
+ cfdb/__init__.py,sha256=jkHqBmh0aBkjWX3demwH4eh-P9YypPEnFH5ztXXInnc,289
2
+ cfdb/combine.py,sha256=B1CHZ0NOW4O5j_5NYxAHB76X1A5O3HcZwjNGNx_gfEA,19084
3
+ cfdb/core.py,sha256=IMFGhed5pa2zoYlm7reu1TeCQ6nt3sMmy5cE0LcAb2A,37337
4
+ cfdb/creation.py,sha256=hoR0MVEhbcxKT1JnZ2rK1fUAofxOQT0okKmLYh0PBAY,10686
5
+ cfdb/data_models.py,sha256=AtwtH2Uyo84GucW52aX0AzpG3Sbge41F5lrPuRxSLoY,2166
6
+ cfdb/indexers.py,sha256=BvkQLpdm2EM64ZbSjW9ByXfeUoBZ1V-YKNVVvtAy1HY,10462
7
+ cfdb/main.py,sha256=3HoJr8ZZFD3KIPfSUrQTXdJ9xo9I1vcjfQUWvEbmkv8,26020
8
+ cfdb/support_classes.py,sha256=di0pnspL4O4YL5eKJnGhIOFWdk7D3WWH2ltPziqORtM,36456
9
+ cfdb/tools.py,sha256=1hE8Qja-JdFpi_XTGSBuANRujELd2s4uYbSUCAl3Big,13725
10
+ cfdb/utils.py,sha256=sm7oeCxyrtByRlxc8NV52kBMehHwRJMOIhwpeiAmCYY,74114
11
+ cfdb-0.1.1.dist-info/METADATA,sha256=80jGlWL4ONgx8jZ88MV9YsqgbL_EW504flAG2kRkSsg,11513
12
+ cfdb-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
13
+ cfdb-0.1.1.dist-info/licenses/LICENSE,sha256=hNqpp2O-F2qp4ozzNN86q1sxnAeFDLNoylHyJK_aiYI,586
14
+ cfdb-0.1.1.dist-info/RECORD,,
@@ -1,57 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: cfdb
3
- Version: 0.1.0
4
- Summary: CF conventions multi-dimensional array storage on top of Booklet
5
- Project-URL: Documentation, https://mullenkamp.github.io/cfdb/
6
- Project-URL: Source, https://github.com/mullenkamp/cfdb
7
- Author-email: mullenkamp <mullenkamp1@gmail.com>
8
- License-File: LICENSE
9
- Classifier: Programming Language :: Python :: 3 :: Only
10
- Requires-Python: >=3.10
11
- Requires-Dist: booklet>=0.9.2
12
- Requires-Dist: cftime
13
- Requires-Dist: lz4
14
- Requires-Dist: msgspec
15
- Requires-Dist: numpy
16
- Requires-Dist: rechunkit>=0.1.0
17
- Requires-Dist: zstandard
18
- Provides-Extra: ebooklet
19
- Requires-Dist: ebooklet>=0.5.10; extra == 'ebooklet'
20
- Provides-Extra: netcdf4
21
- Requires-Dist: h5netcdf; extra == 'netcdf4'
22
- Description-Content-Type: text/markdown
23
-
24
- # cfdb
25
-
26
- <p align="center">
27
- <em>CF conventions multi-dimensional array storage on top of Booklet</em>
28
- </p>
29
-
30
- [![build](https://github.com/mullenkamp/cfdb/workflows/Build/badge.svg)](https://github.com/mullenkamp/cfdb/actions)
31
- [![codecov](https://codecov.io/gh/mullenkamp/cfdb/branch/master/graph/badge.svg)](https://codecov.io/gh/mullenkamp/cfdb)
32
- [![PyPI version](https://badge.fury.io/py/cfdb.svg)](https://badge.fury.io/py/cfdb)
33
-
34
- ---
35
-
36
- **Documentation**: <a href="https://mullenkamp.github.io/cfdb/" target="_blank">https://mullenkamp.github.io/cfdb/</a>
37
-
38
- **Source Code**: <a href="https://github.com/mullenkamp/cfdb" target="_blank">https://github.com/mullenkamp/cfbdb</a>
39
-
40
- ---
41
-
42
- ## Development
43
-
44
- ### Coordinate variables
45
- Must be 1D.
46
- They should have an "ordered" parameter (bool) that defined whether the coord should always be ordered. Int, float, and datetime should default to True. Only string and category dtypes should default to False.
47
- There should be a "regular" parameter (bool) with an associated "step" parameter (int or float). It should work similarly to np.arange. Only ints, floats, and datetimes can use this.
48
- ~~Should I add a "unique" parameter (bool)? Maybe I should just enforce this normally?~~ It should enforce uniqueness in the coords.
49
- There can be a groupby method datasets that would use the rechunker. The rechunker would have the groupby dims set to 1 and the other dims set to the full length.
50
-
51
- #### Multi-dimensional coords
52
- It is possible to create a composite index from multiple 1D coords. But it seems best to implement this type of thing on top of sqlite (or something equivalent).
53
- Keeping each coord 1D makes implementations quite a bit simpler.
54
-
55
- ## License
56
-
57
- This project is licensed under the terms of the Apache Software License 2.0.
@@ -1,13 +0,0 @@
1
- cfdb/__init__.py,sha256=r2CzHI87AZOW0HsVhl0HpN0-Mjh34eB9WG2sCUK4kiA,233
2
- cfdb/combine.py,sha256=B1CHZ0NOW4O5j_5NYxAHB76X1A5O3HcZwjNGNx_gfEA,19084
3
- cfdb/core.py,sha256=IMFGhed5pa2zoYlm7reu1TeCQ6nt3sMmy5cE0LcAb2A,37337
4
- cfdb/creation.py,sha256=hoR0MVEhbcxKT1JnZ2rK1fUAofxOQT0okKmLYh0PBAY,10686
5
- cfdb/data_models.py,sha256=AtwtH2Uyo84GucW52aX0AzpG3Sbge41F5lrPuRxSLoY,2166
6
- cfdb/indexers.py,sha256=Vl0PS44mV4_6IUvPGZIIsd0qQniM3iAtntwe8bhqDrk,10683
7
- cfdb/main.py,sha256=L23zO_glrsOg8e5Vx2Guef3UOKNOw9KFW0Ray0uGqrQ,26372
8
- cfdb/support_classes.py,sha256=qoSVC7eX8I_A8xHA8jLnjLD9211bc3Va9HXvo_uct0A,34806
9
- cfdb/utils.py,sha256=ZEMmvUTa1h-FBCdfBx-oL5xVH7fDbXAObwqYjqeKQGk,73296
10
- cfdb-0.1.0.dist-info/METADATA,sha256=n8_UtgGhkHZgC3MxxaNcFfr2-682Fxc3RP1FDQ43fik,2528
11
- cfdb-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
12
- cfdb-0.1.0.dist-info/licenses/LICENSE,sha256=hNqpp2O-F2qp4ozzNN86q1sxnAeFDLNoylHyJK_aiYI,586
13
- cfdb-0.1.0.dist-info/RECORD,,
File without changes