cfdb 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cfdb/core.py ADDED
@@ -0,0 +1,1232 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Sat Oct 14 15:11:20 2023
5
+
6
+ @author: mike
7
+ """
8
+ import h5py
9
+ import io
10
+ import os
11
+ import numpy as np
12
+ from typing import Union, List
13
+ import pathlib
14
+ import copy
15
+ import uuid6 as uuid
16
+ import tempfile
17
+
18
+ try:
19
+ import fcntl
20
+ import_fcntl = True
21
+ except ImportError:
22
+ import_fcntl = False
23
+
24
+ try:
25
+ import pandas as pd
26
+ import_pandas = True
27
+ except ImportError:
28
+ import_pandas = False
29
+
30
+ try:
31
+ import xarray as xr
32
+ import_xarray = True
33
+ except ImportError:
34
+ import_xarray = False
35
+
36
+
37
+ # from . import utils, indexers
38
+ import utils, indexers
39
+
40
+
41
+ h5py.get_config().track_order = True
42
+
43
+ ###################################################
44
+ ### Parameters
45
+
46
+ name_indent = 4
47
+ value_indent = 20
48
+
49
+ ###################################################
50
+ ### Helper functions
51
+
52
+
53
def format_value(value):
    """
    Render a scalar for display: ints verbatim, floats to two decimal
    places, everything else returned unchanged.
    """
    if isinstance(value, (float, np.floating)):
        return f'{value:.2f}'
    if isinstance(value, (int, np.integer)):
        return str(value)
    return value
63
+
64
+
65
def append_summary(summary, summ_dict):
    """
    Append one '\\n<name><padding><value>' line per dict item to summary,
    padding names out to the module-level value_indent column (minimum one
    space), and return the extended string.
    """
    for key, value in summ_dict.items():
        pad = max(value_indent - len(key), 1)
        summary += '\n' + key + ' ' * pad + value
    return summary
77
+
78
+
79
def data_variable_summary(ds):
    """
    Build the repr string for a DataVariable: a header, one line per
    coordinate (length, decoded dtype, first value), then the attributes.
    Returns a closed-variable message when ds is falsy (file closed).
    """
    if ds:
        summ_dict = {'name': ds.name, 'dims order': '(' + ', '.join(ds.coords) + ')', 'chunk size': str(ds.chunks)}

        summary = """<cfbooklet.DataVariable>"""

        summary = append_summary(summary, summ_dict)

        summary += """\nCoordinates:"""

        for dim_name in ds.coords:
            dim = ds.file[dim_name]
            dtype_name = dim.encoding['dtype_decoded']
            dim_len = dim.shape[0]
            # Decoded first value shown as a preview of the coordinate.
            first_value = format_value(dim[0])
            spacing = value_indent - name_indent - len(dim_name)
            if spacing < 1:
                spacing = 1
            dim_str = f"""\n {dim_name}""" + """ """ * spacing
            dim_str += f"""({dim_len}) {dtype_name} {first_value} ..."""
            summary += dim_str

        attrs_summary = make_attrs_repr(ds.attrs, name_indent, value_indent, 'Attributes')
        summary += """\n""" + attrs_summary

    else:
        summary = """DataVariable is closed"""

    return summary
111
+
112
+
113
def coordinate_summary(ds):
    """
    Build the repr string for a Coordinate: a header, summary fields
    (chunk size, length, first/last values), then the attributes.
    Returns a closed-variable message when ds is falsy (file closed).
    """
    if ds:
        name = ds.name
        dim_len = ds.shape[0]
        # dtype_name = ds.dtype.name
        # dtype_decoded = ds.encoding['dtype_decoded']

        # Preview of the first and last decoded values.
        first_value = format_value(ds.data[0])
        last_value = format_value(ds.data[-1])

        # summ_dict = {'name': name, 'dtype encoded': dtype_name, 'dtype decoded': dtype_decoded, 'chunk size': str(ds.chunks), 'dim length': str(dim_len), 'values': f"""{first_value} ... {last_value}"""}
        summ_dict = {'name': name, 'chunk size': str(ds.chunks), 'dim length': str(dim_len), 'values': f"""{first_value} ... {last_value}"""}

        summary = """<cfbooklet.CoordVariable>"""

        summary = append_summary(summary, summ_dict)

        attrs_summary = make_attrs_repr(ds.attrs, name_indent, value_indent, 'Attributes')
        summary += """\n""" + attrs_summary
    else:
        summary = """CoordVariable is closed"""

    return summary
139
+
140
+
141
def make_attrs_repr(attrs, name_indent, value_indent, header):
    """
    Build a multi-line repr: a '<header>:' line followed by one indented
    '<key><padding><value>' line per attribute, padding keys out to the
    value_indent column (minimum one space).
    """
    lines = [f'{header}:']
    for key, value in attrs.items():
        pad = max(value_indent - name_indent - len(key), 1)
        lines.append(f' {key}' + ' ' * pad + f'{value}')
    return '\n'.join(lines)
151
+
152
+
153
def create_h5py_data_variable(file, name: str, dims: Union[str, tuple, list], shape: Union[tuple, list], encoding: dict, data=None, **kwargs):
    """
    Create an h5py dataset for a data variable and attach its dimension scales.

    Parameters
    ----------
    file : File
        Open cfdb File wrapper; the dataset is created in file._file.
    name : str
        Name of the new dataset.
    dims : str, tuple, or list
        Coordinate name(s), one per axis of shape. Each must already exist
        in the file.
    shape : tuple or list
        Shape of the new dataset; each axis length must equal the length of
        its coordinate.
    encoding : dict
        Encoding parameters; 'dtype' is the stored dtype, and the full dict
        is passed to utils.encode_data when data is supplied.
    data : array-like or None
        Optional initial data; encoded before the dataset is created.
    **kwargs
        Extra h5py create_dataset keyword arguments (e.g. chunks, maxshape).

    Returns
    -------
    h5py.Dataset

    Raises
    ------
    ValueError
        If a dim is missing from the file or its length does not match shape.
    """
    # NOTE: the original annotations used plain tuples, e.g. (str, tuple, list),
    # which are not valid PEP 484 type hints; replaced with Union.
    dtype = encoding['dtype']

    ## Check that dims already exist and that the dim lengths match
    if isinstance(dims, str):
        dims = [dims]

    for i, dim in enumerate(dims):
        if dim not in file:
            raise ValueError(f'{dim} not in File')

        dim_len = file._file[dim].shape[0]
        if dim_len != shape[i]:
            raise ValueError(f'{dim} does not have the same length as the input data/shape dim.')

    ## Make chunks
    if 'chunks' not in kwargs:
        maxshape = kwargs.get('maxshape', shape)
        kwargs.setdefault('chunks', utils.guess_chunk(shape, maxshape, dtype))

    ## Create variable
    if data is None:
        ds = file._file.create_dataset(name, shape, dtype=dtype, track_order=True, **kwargs)
    else:
        ## Encode data before creating variable
        data = utils.encode_data(data, **encoding)

        ds = file._file.create_dataset(name, dtype=dtype, data=data, track_order=True, **kwargs)

    # Attach each axis to its coordinate (hdf5 dimension scale) and label it.
    for i, dim in enumerate(dims):
        ds.dims[i].attach_scale(file._file[dim])
        ds.dims[i].label = dim

    return ds
193
+
194
+
195
def create_h5py_coordinate(file, name: str, data, shape: Union[tuple, list], encoding: dict, **kwargs):
    """
    Create a 1-D h5py dataset, encode the data into it, and register it as
    an hdf5 dimension scale (a coordinate).

    Parameters
    ----------
    file : File
        Open cfdb File wrapper; the dataset is created in file._file.
    name : str
        Name of the new coordinate.
    data : array-like
        Coordinate values; encoded with utils.encode_data before storage.
    shape : tuple or list
        Must be 1-D (a single-element shape).
    encoding : dict
        Encoding parameters; 'dtype' is the stored dtype.
    **kwargs
        Extra h5py create_dataset keyword arguments (e.g. chunks, maxshape).

    Returns
    -------
    h5py.Dataset

    Raises
    ------
    ValueError
        If shape is not 1-D.
    """
    # NOTE: the original annotation used a plain tuple, (tuple, list), which
    # is not a valid PEP 484 type hint; replaced with Union.
    if len(shape) != 1:
        raise ValueError('The shape of a coordinate must be 1-D.')

    dtype = encoding['dtype']

    ## Make chunks
    if 'chunks' not in kwargs:
        maxshape = kwargs.get('maxshape', shape)
        kwargs.setdefault('chunks', utils.guess_chunk(shape, maxshape, dtype))

    ## Encode data before creating variable/coordinate
    data = utils.encode_data(data, **encoding)

    ## Make Variable
    ds = file._file.create_dataset(name, dtype=dtype, data=data, track_order=True, **kwargs)

    # Register as a dimension scale so data variables can attach to it.
    ds.make_scale(name)
    ds.dims[0].label = name

    return ds
226
+
227
+
228
def copy_data_variable(to_file, from_variable, name, include_data=True, include_attrs=True, **kwargs):
    """
    Copy a DataVariable into another (writable) file.

    Creation properties (chunks, compression, maxshape, etc.) are taken from
    the source dataset unless overridden via kwargs. When include_data is
    True, chunks are copied directly at the raw-bytes level (no decode or
    re-compress round trip).

    Parameters
    ----------
    to_file : File
        Destination file (must be writable).
    from_variable : DataVariable
        Source variable to copy.
    name : str
        Name of the new variable in the destination file.
    include_data : bool
        Copy the chunk data, not just the dataset definition.
    include_attrs : bool
        Copy the hdf5 attributes as well.
    **kwargs
        Overrides passed through to dataset creation.

    Returns
    -------
    DataVariable
    """
    other1 = from_variable._dataset
    # Inherit the source's creation properties unless explicitly overridden.
    for k in ('chunks', 'compression',
            'compression_opts', 'scaleoffset', 'shuffle', 'fletcher32',
            'fillvalue'):
        kwargs.setdefault(k, getattr(other1, k))

    # A 'compression' attr (e.g. an hdf5plugin codec name) takes precedence
    # over the dataset's built-in compression settings.
    # NOTE(review): `compression` is never used after this block; the else
    # branch only reads the kwarg seeded by setdefault above.
    if 'compression' in other1.attrs:
        compression = other1.attrs['compression']
        kwargs.update(**utils.get_compressor(compression))
    else:
        compression = kwargs['compression']

    # TODO: more elegant way to pass these (dcpl to create_variable?)
    dcpl = other1.id.get_create_plist()
    kwargs.setdefault('track_times', dcpl.get_obj_track_times())
    # kwargs.setdefault('track_order', dcpl.get_attr_creation_order() > 0)

    # Special case: the maxshape property always exists, but if we pass it
    # to create_variable, the new variable will automatically get chunked
    # layout. So we copy it only if it is different from shape.
    if other1.maxshape != other1.shape:
        kwargs.setdefault('maxshape', other1.maxshape)

    encoding = from_variable.encoding._encoding.copy()
    shape = from_variable.shape

    ds0 = create_h5py_data_variable(to_file, name, tuple(dim.label for dim in other1.dims), shape, encoding, **kwargs)

    if include_data:
        # Directly copy chunks using write_direct_chunk: moves the raw
        # (already compressed) chunk bytes without decoding/re-encoding.
        for chunk in ds0.iter_chunks():
            chunk_starts = tuple(c.start for c in chunk)
            filter_mask, data = other1.id.read_direct_chunk(chunk_starts)
            ds0.id.write_direct_chunk(chunk_starts, data, filter_mask)

    ds = DataVariable(ds0, to_file, encoding)
    if include_attrs:
        ds.attrs.update(from_variable.attrs)

    return ds
272
+
273
+
274
def copy_coordinate(to_file, from_coordinate, name, include_attrs=True, **kwargs):
    """
    Copy a Coordinate into another (writable) file.

    Creation properties (chunks, compression, maxshape, etc.) are taken from
    the source dataset unless overridden via kwargs. Coordinate data is
    always copied (passed to create_h5py_coordinate).

    Parameters
    ----------
    to_file : File
        Destination file (must be writable).
    from_coordinate : Coordinate
        Source coordinate to copy.
    name : str
        Name of the new coordinate in the destination file.
    include_attrs : bool
        Copy the hdf5 attributes as well.
    **kwargs
        Overrides passed through to dataset creation.

    Returns
    -------
    Coordinate
    """
    other1 = from_coordinate._dataset
    # Inherit the source's creation properties unless explicitly overridden.
    for k in ('chunks', 'compression',
            'compression_opts', 'scaleoffset', 'shuffle', 'fletcher32',
            'fillvalue'):
        kwargs.setdefault(k, getattr(other1, k))

    # A 'compression' attr (e.g. an hdf5plugin codec name) takes precedence
    # over the dataset's built-in compression settings.
    # NOTE(review): `compression` is never used after this block; the else
    # branch only reads the kwarg seeded by setdefault above.
    if 'compression' in other1.attrs:
        compression = other1.attrs['compression']
        kwargs.update(**utils.get_compressor(compression))
    else:
        compression = kwargs['compression']

    # TODO: more elegant way to pass these (dcpl to create_variable?)
    dcpl = other1.id.get_create_plist()
    kwargs.setdefault('track_times', dcpl.get_obj_track_times())
    # kwargs.setdefault('track_order', dcpl.get_attr_creation_order() > 0)

    # Special case: the maxshape property always exists, but if we pass it
    # to create_variable, the new variable will automatically get chunked
    # layout. So we copy it only if it is different from shape.
    if other1.maxshape != other1.shape:
        kwargs.setdefault('maxshape', other1.maxshape)

    encoding = from_coordinate.encoding._encoding.copy()
    shape = from_coordinate.shape

    ds0 = create_h5py_coordinate(to_file, name, from_coordinate.data, shape, encoding, **kwargs)

    ds = Coordinate(ds0, to_file, encoding)
    if include_attrs:
        ds.attrs.update(from_coordinate.attrs)

    return ds
311
+
312
+
313
def prepare_encodings_for_variables(dtype_encoded, dtype_decoded, scale_factor, add_offset, fillvalue, units, calendar):
    """
    Assemble an encoding dict from the given parameters, dropping any that
    are None. datetime64 decoded dtypes get default units/calendar when not
    supplied and are always stored as int64.
    """
    candidates = {
        'dtype': dtype_encoded,
        'dtype_encoded': dtype_encoded,
        'missing_value': fillvalue,
        '_FillValue': fillvalue,
        'add_offset': add_offset,
        'scale_factor': scale_factor,
        'units': units,
        'calendar': calendar,
        }
    encoding = {key: val for key, val in candidates.items() if val is not None}

    if 'datetime64' in dtype_decoded:
        encoding.setdefault('units', 'seconds since 1970-01-01')
        encoding.setdefault('calendar', 'gregorian')
        encoding['dtype'] = 'int64'

    return encoding
330
+
331
+
332
def file_summary(file):
    """
    Build the repr string for a File: header (name, size, writability), one
    line per coordinate, one line per data variable, then global attributes.
    Returns a closed-file message when file is falsy.
    """
    if file:
        file_path = pathlib.Path(file.filename)
        # In-memory (core driver) files have no on-disk size.
        if file_path.exists() and file_path.is_file():
            file_size = file_path.stat().st_size*0.000001
            file_size_str = """{file_size:.1f} MB""".format(file_size=file_size)
        else:
            file_size_str = """NA"""

        summ_dict = {'file name': file_path.name, 'file size': file_size_str, 'writable': str(file.writable)}

        summary = """<hdf5tools.File>"""

        summary = append_summary(summary, summ_dict)

        summary += """\nCoordinates:"""

        for dim_name in file.coords:
            dim = file[dim_name]
            dtype_name = dim.encoding['dtype_decoded']
            dim_len = dim.shape[0]
            # Decoded first value shown as a preview.
            first_value = format_value(dim[0])
            spacing = value_indent - name_indent - len(dim_name)
            if spacing < 1:
                spacing = 1
            dim_str = f"""\n {dim_name}""" + """ """ * spacing
            dim_str += f"""({dim_len}) {dtype_name} {first_value} ..."""
            summary += dim_str

        summary += """\nData Variables:"""

        for ds_name in file.data_vars:
            ds = file[ds_name]
            dtype_name = ds.encoding['dtype_decoded']
            shape = ds.shape
            dims = ', '.join(ds.coords)
            # First element of the (possibly N-D) array as a preview.
            first_value = format_value(ds[tuple(0 for i in range(len(shape)))])
            spacing = value_indent - name_indent - len(ds_name)
            if spacing < 1:
                spacing = 1
            ds_str = f"""\n {ds_name}""" + """ """ * spacing
            ds_str += f"""({dims}) {dtype_name} {first_value} ..."""
            summary += ds_str

        attrs_summary = make_attrs_repr(file.attrs, name_indent, value_indent, 'Attributes')
        summary += """\n""" + attrs_summary
    else:
        summary = """File is closed"""

    return summary
385
+
386
+ ###################################################
387
+ ### Classes
388
+
389
+
390
class Attributes:
    """
    Mapping-style wrapper around an h5py.AttributeManager that hides the
    internal encoding attributes (utils.ignore_attrs) from iteration while
    still allowing direct get/set access to any key.
    """
    def __init__(self, attrs: h5py.AttributeManager):
        self._attrs = attrs

    def get(self, key, default=None):
        return self._attrs.get(key, default)

    def __getitem__(self, key):
        return self._attrs[key]

    def __setitem__(self, key, value):
        self._attrs[key] = value

    def clear(self):
        self._attrs.clear()

    def keys(self):
        # Encoding bookkeeping attrs are hidden from iteration.
        yield from (key for key in self._attrs.keys() if key not in utils.ignore_attrs)

    def values(self):
        yield from (value for key, value in self._attrs.items() if key not in utils.ignore_attrs)

    def items(self):
        yield from ((key, value) for key, value in self._attrs.items() if key not in utils.ignore_attrs)

    def pop(self, key, default=None):
        return self._attrs.pop(key, default)

    def update(self, other=()):
        self._attrs.update(other)

    def create(self, key, data, shape=None, dtype=None):
        self._attrs.create(key, data, shape, dtype)

    def modify(self, key, value):
        self._attrs.modify(key, value)

    def __delitem__(self, key):
        del self._attrs[key]

    def __contains__(self, key):
        return key in self._attrs

    def __iter__(self):
        return iter(self._attrs)

    def __repr__(self):
        return make_attrs_repr(self, name_indent, value_indent, 'Attributes')
447
+
448
+
449
class Encoding:
    """
    Dict-like view of a variable's encoding parameters (dtypes, scale/offset,
    fill value, units, calendar). When the file is writable, every mutation
    is mirrored into the dataset's hdf5 attributes so the encoding persists.
    """
    def __init__(self, attrs: h5py.AttributeManager, dtype, writable, encoding: dict=None):
        # Build the encoding either from the dataset's attrs or from an
        # explicitly supplied encoding dict, then normalize it.
        if encoding is None:
            enc = utils.get_encoding_data_from_attrs(attrs)
        else:
            enc = utils.get_encoding_data_from_attrs(encoding)
        enc = utils.process_encoding(enc, dtype)
        enc = utils.assign_dtype_decoded(enc)
        self._encoding = enc
        # Persist the resolved encoding into the hdf5 attrs when writable.
        if writable:
            attrs.update(enc)
        self._attrs = attrs
        self._writable = writable

    def get(self, key, default=None):
        return self._encoding.get(key, default)

    def __getitem__(self, key):
        return self._encoding[key]

    def __setitem__(self, key, value):
        # Only recognized encoding fields may be set; mirror to attrs when writable.
        if key in utils.enc_fields:
            self._encoding[key] = value
            if self._writable:
                self._attrs[key] = value
        else:
            raise ValueError(f'key must be one of {utils.enc_fields}.')

    def clear(self):
        # Snapshot keys first: the attrs deletions below need them after
        # the in-memory dict is emptied.
        keys = list(self._encoding.keys())
        self._encoding.clear()
        if self._writable:
            for key in keys:
                del self._attrs[key]

    def keys(self):
        return self._encoding.keys()

    def values(self):
        return self._encoding.values()

    def items(self):
        return self._encoding.items()

    def pop(self, key, default=None):
        # Remove from the persisted attrs first (if present), then the dict.
        if self._writable:
            if key in self._attrs:
                del self._attrs[key]
        return self._encoding.pop(key, default)

    def update(self, other=()):
        # Unknown (non-encoding) keys are silently ignored.
        key_values = {**other}
        for key, value in key_values.items():
            if key in utils.enc_fields:
                self._encoding[key] = value
                if self._writable:
                    self._attrs[key] = value

    def __delitem__(self, key):
        del self._encoding[key]
        if self._writable:
            del self._attrs[key]

    def __contains__(self, key):
        return key in self._encoding

    def __iter__(self):
        return self._encoding.__iter__()

    def __repr__(self):
        return make_attrs_repr(self, name_indent, value_indent, 'Encodings')

    def encode(self, values):
        # Convert decoded (user-facing) values into their stored representation.
        return utils.encode_data(np.asarray(values), **self._encoding)

    def decode(self, values):
        # results = utils.decode_data(values, **self._encoding)

        # if results.ndim == 0:
        #     return results[()]
        # else:
        #     return results

        # Convert stored values back into decoded (user-facing) values.
        return utils.decode_data(np.asarray(values), **self._encoding)
536
+
537
+
538
class Variable:
    """
    Base wrapper around an h5py.Dataset that provides decoded reads/encoded
    writes, attribute and encoding views, and label-based indexing via .loc.
    """
    def __init__(self, dataset: h5py.Dataset, file, encoding: dict=None):
        """
        Parameters
        ----------
        dataset : h5py.Dataset
            The underlying hdf5 dataset.
        file : File
            The parent File; this variable is also registered on it as an
            attribute under its own name.
        encoding : dict or None
            Encoding parameters; when None they are read from dataset attrs.
        """
        self._dataset = dataset
        self.coords = tuple(dim.label for dim in dataset.dims)
        self.ndim = dataset.ndim
        self.dtype = dataset.dtype
        self.chunks = dataset.chunks
        self.name = dataset.name.split('/')[-1]
        self.file = file
        # Register on the parent file so file.<name> and file[<name>] work.
        setattr(file, self.name, self)
        self.attrs = Attributes(dataset.attrs)
        self.encoding = Encoding(dataset.attrs, dataset.dtype, file.writable, encoding)
        self.loc = indexers.LocationIndexer(self)

    @property
    def shape(self):
        return self._dataset.shape

    @property
    def size(self):
        return self._dataset.size

    @property
    def nbytes(self):
        return self._dataset.nbytes

    @property
    def maxshape(self):
        return self._dataset.maxshape

    @property
    def fillvalue(self):
        return self._dataset.fillvalue

    def reshape(self, new_shape, axis=None):
        """ Reshape the dataset, or the specified axis.

        The dataset must be stored in chunked format; it can be resized up to
        the "maximum shape" (keyword maxshape) specified at creation time.
        The rank of the dataset cannot be changed.

        "shape" should be a shape tuple, or if an axis is specified, an integer.

        BEWARE: This functions differently than the NumPy resize() method!
        The data is not "reshuffled" to fit in the new shape; each axis is
        grown or shrunk independently. The coordinates of existing data are
        fixed.
        """
        self._dataset.resize(new_shape, axis)

    def __getitem__(self, key):
        # Decode on read so callers always see decoded values.
        return self.encoding.decode(self._dataset[key])

    def __setitem__(self, key, value):
        # Encode on write.
        self._dataset[key] = self.encoding.encode(value)

    def iter_chunks(self, sel=None):
        return self._dataset.iter_chunks(sel)

    def __bool__(self):
        return self._dataset.__bool__()

    def len(self):
        return self._dataset.len()

    def sel(self, selection: dict, **file_kwargs):
        """
        Copy a label-based selection of this variable (and its coordinates)
        into a newly created file.

        Parameters
        ----------
        selection : dict or None
            Mapping of coordinate name -> label selection. None copies the
            whole variable.
        **file_kwargs
            Passed to File; mode is forced to 'w'.

        Returns
        -------
        DataVariable
            The new variable in the newly created file.

        Raises
        ------
        KeyError
            If a selection key is not one of this variable's coordinates.
        """
        dims = np.array(self.coords)

        ## Checks
        # BUG FIX: keys must be bound even when selection is None; previously
        # selection=None raised NameError at the `dim in keys` test below.
        keys = ()
        if selection is not None:
            keys = tuple(selection.keys())
            for key in keys:
                if key not in dims:
                    raise KeyError(f'{key} is not in the coordinates.')

        ## Create file
        file_kwargs['mode'] = 'w'
        new_file = File(**file_kwargs)

        ## Iterate through the coordinates
        for dim_name in dims:
            old_dim = self.file[dim_name]

            if selection is not None and dim_name in selection:
                data = old_dim.loc[selection[dim_name]]
            else:
                data = old_dim.data

            new_dim = new_file.create_coordinate(dim_name, data, encoding=old_dim.encoding._encoding)
            new_dim.attrs.update(old_dim.attrs)

        ## Copy this variable's (possibly selected) data
        # TODO: Make the variable copy when doing a selection more RAM efficient
        ds_sel = []
        for dim in dims:
            if dim in keys:
                ds_sel.append(selection[dim])
            else:
                ds_sel.append(None)

        data = self.loc[tuple(ds_sel)]
        new_ds = new_file.create_data_variable(self.name, self.coords, data=data, encoding=self.encoding._encoding)
        new_ds.attrs.update(self.attrs)

        return new_ds
660
+
661
+
662
class Coordinate(Variable):
    """
    A one-dimensional labelled array registered as an hdf5 dimension scale,
    to which data variable axes attach.
    """
    @property
    def data(self):
        """Return all decoded coordinate values."""
        return self[()]

    def copy(self, to_file=None, name: str=None, include_attrs=True, **kwargs):
        """
        Copy this coordinate to another file, or to this file under a new name.
        """
        if (to_file is None) and (name is None):
            raise ValueError('If to_file is None, then a name must be passed and it must be different from the original.')

        target = self.file if to_file is None else to_file
        new_name = self.name if name is None else name

        return copy_coordinate(target, self, new_name, include_attrs=include_attrs, **kwargs)

    def __repr__(self):
        """Text summary of the coordinate."""
        return coordinate_summary(self)

    def to_pandas(self):
        """
        Convert the coordinate into a pandas Index (requires pandas).
        """
        if not import_pandas:
            raise ImportError('pandas could not be imported.')

        return pd.Index(self.data, name=self.name)
703
+
704
+
705
+ # def to_xarray(self):
706
+ # """
707
+
708
+ # """
709
+
710
+
711
class DataVariable(Variable):
    """
    A labelled N-D array whose axes are attached to coordinates.
    """
    def to_pandas(self):
        """
        Convert into a pandas Series indexed by the cartesian product of the
        coordinates (requires pandas).
        """
        if not import_pandas:
            raise ImportError('pandas could not be imported.')

        coord_values = [self.file[dim].data for dim in self.coords]
        pd_index = pd.MultiIndex.from_product(coord_values, names=self.coords)

        series = pd.Series(self[()].flatten(), index=pd_index)
        series.name = self.name

        return series

    def to_xarray(self, **kwargs):
        """
        Convert into an xarray DataArray (requires xarray).
        """
        if not import_xarray:
            raise ImportError('xarray could not be imported.')

        return xr.DataArray(
            data=self[()],
            coords=[self.file[dim].data for dim in self.coords],
            dims=self.coords,
            name=self.name,
            attrs=self.attrs,
            )

    def copy(self, to_file=None, name: str=None, include_data=True, include_attrs=True, **kwargs):
        """
        Copy this data variable to another file, or to this file under a new name.
        """
        if (to_file is None) and (name is None):
            raise ValueError('If to_file is None, then a name must be passed and it must be different from the original.')

        target = self.file if to_file is None else to_file
        new_name = self.name if name is None else name

        return copy_data_variable(target, self, new_name, include_data=include_data, include_attrs=include_attrs, **kwargs)

    def __repr__(self):
        """Text summary of the data variable."""
        return data_variable_summary(self)
770
+
771
+
772
+ class File:
773
+ """
774
+
775
+ """
776
    def __init__(self, name: Union[str, pathlib.Path, io.BytesIO]=None, mode: str='r', compression: str='lzf', write_lock=False, **kwargs):
        """
        The top level hdf5 file object for managing cf conventions data.
        Variables are all labeled arrays. Coordinates are a type of variable that is a one-dimensional labelled array associated with a dimension of a data variable. Data variables are variables have one or more dimensions and must have coordinates assigned to their dimensions.

        Parameters
        ----------
        name : str, pathlib.Path, io.BytesIO, or None
            A str or pathlib.Path object to a file on disk, a BytesIO object, or None. If None, it will create an in-memory hdf5 File.
        mode : str
            The typical python open mode. r for read, r+/a/x for read and write, w for create new file to write.
        compression : str or None
            The default compression for all variables used for the chunks in the hdf5 files. These can be changed individually at variable creation. Must be one of gzip, lzf, zstd, lz4, or None. gzip is compatible with any hdf5 installation (not only h5py), so this should be used if interoperability across platforms is important. lzf is compatible with any h5py installation, so if only python users will need to access these files then this is a better option than gzip. zstd and lz4 require the hdf5plugin python package, but zstd is the best compression option if users have access to the hdf5plugin package. None has no compression and is generally not recommended except in niche situations.
        write_lock : bool
            Lock the file (using fcntl.flock) during write operations. Only use this when using multithreading or multiprocessing and you want to write to the same file. You probably shouldn't perform read operations during the writes.
        **kwargs
            Any other kwargs that will be passed to the h5py.File object.
        """
        writable = True if (mode.lower() in ['r+', 'w', 'a', 'w-', 'x']) else False

        # Default chunk-cache size for the file (2 MiB).
        if 'rdcc_nbytes' not in kwargs:
            kwargs['rdcc_nbytes'] = 2**21
        lock_fileno = None
        if name is None:
            # In-memory file: random name, core driver, no backing store,
            # always opened writable.
            name = uuid.uuid4().hex[:16]
            kwargs.setdefault('driver', 'core')
            if 'backing_store' not in kwargs:
                kwargs.setdefault('backing_store', False)
            file = h5py.File(name=name, track_order=True, mode='w', **kwargs)
            writable = True
        else:
            # write_lock only applies to writable on-disk files on platforms
            # with fcntl; hdf5's own locking is disabled in favor of flock.
            if write_lock and writable and import_fcntl:
                lock_fileno = os.open(name, os.O_RDONLY)
                fcntl.flock(lock_fileno, fcntl.LOCK_EX)

                file = h5py.File(name=name, mode=mode, track_order=True, locking=False, **kwargs)
            else:
                file = h5py.File(name=name, mode=mode, track_order=True, **kwargs)

        self._file = file
        self.mode = mode
        self.writable = writable
        self.filename = file.filename
        self.compression = compression
        self.lock_fileno = lock_fileno
        self.driver = file.driver

        # Wrap every existing dataset; the wrappers register themselves as
        # attributes on this File (see Variable.__init__).
        for ds_name in file:
            ds = file[ds_name]
            if utils.is_scale(ds):
                Coordinate(ds, self)
            else:
                DataVariable(ds, self)

        self.attrs = Attributes(file.attrs)
831
+
832
+
833
+ @property
834
+ def variables(self):
835
+ """
836
+ Return a tuple of all the variables (coords and data variables).
837
+ """
838
+ variables = [var for var in self]
839
+ return tuple(variables)
840
+
841
+ @property
842
+ def coords(self):
843
+ """
844
+ Return a tuple of all the coordinates.
845
+ """
846
+ coords = []
847
+ for name in self:
848
+ if isinstance(self[name], Coordinate):
849
+ coords.append(name)
850
+ return tuple(coords)
851
+
852
+ @property
853
+ def data_vars(self):
854
+ """
855
+ Return a tuple of all the data variables.
856
+ """
857
+ data_vars = []
858
+ for name in self:
859
+ if isinstance(self[name], DataVariable):
860
+ data_vars.append(name)
861
+ return tuple(data_vars)
862
+
863
+
864
+ def __bool__(self):
865
+ """
866
+
867
+ """
868
+ return self._file.__bool__()
869
+
870
+ def __iter__(self):
871
+ return self._file.__iter__()
872
+
873
+ def __len__(self):
874
+ return len(self._file)
875
+
876
+ def __contains__(self, key):
877
+ return key in self._file
878
+
879
+ def __getitem__(self, key):
880
+ if isinstance(key, str):
881
+ if key in self._file:
882
+ return getattr(self, key)
883
+ else:
884
+ raise KeyError(key)
885
+ else:
886
+ raise TypeError('key must be a string.')
887
+
888
+ def __setitem__(self, key, value):
889
+ if isinstance(value, Variable):
890
+ setattr(self, key, value)
891
+ else:
892
+ raise TypeError('Assigned value must be a Variable or Coordinate object.')
893
+
894
+ def __delitem__(self, key):
895
+ try:
896
+ if key not in self:
897
+ raise KeyError(key)
898
+
899
+ # Check if the object to delete is a coordinate
900
+ # And if it is, check that no variables are attached to it
901
+ if isinstance(self[key], Coordinate):
902
+ for ds_name in self.data_vars:
903
+ if key in self[ds_name].coords:
904
+ raise ValueError(f'{key} is a coordinate of {ds_name}. You must delete all variables associated with a coordinate before you can delete the coordinate.')
905
+
906
+ del self._file[key]
907
+ delattr(self, key)
908
+ except Exception as err:
909
+ raise err
910
+
911
    def __enter__(self):
        # Support "with File(...) as f:" usage.
        return self
913
+
914
    def __exit__(self, *args):
        # self._file.__exit__()
        # Close the file (and release any write lock) on context exit.
        self.close()
917
+
918
    def close(self):
        """Close the hdf5 file and release the flock write lock, if one was taken."""
        self._file.close()
        if self.lock_fileno is not None:
            # lock_fileno is only set when write_lock=True on a writable
            # on-disk file (see __init__).
            fcntl.flock(self.lock_fileno, fcntl.LOCK_UN)
            os.close(self.lock_fileno)
923
+
924
    def flush(self):
        """
        Flush buffered data of the underlying hdf5 file out to disk.
        """
        self._file.flush()
929
+
930
+
931
    def __repr__(self):
        """
        Text summary of the file: coordinates, data variables, and attributes.
        """
        return file_summary(self)
936
+
937
+ def intersect(self, coords: dict=None, include_dims: list=None, exclude_dims: list=None, include_variables: list=None, exclude_variables: list=None, **file_kwargs):
938
+ """
939
+
940
+ """
941
+ ## Check for coordinate names in input
942
+ dims = np.asarray(self.coords)
943
+
944
+ if coords is not None:
945
+ keys = tuple(coords.keys())
946
+ for key in keys:
947
+ if key not in dims:
948
+ raise KeyError(f'{key} is not in the coordinates.')
949
+
950
+ if include_dims is not None:
951
+ include_dims_check = np.isin(include_dims, dims)
952
+ if not include_dims_check.all():
953
+ no_dims = ', '.join(include_dims[np.where(include_dims_check)[0].tolist()])
954
+ raise KeyError(f'{no_dims} are not in dims.')
955
+
956
+ if exclude_dims is not None:
957
+ exclude_dims_check = np.isin(exclude_dims, dims)
958
+ if not exclude_dims_check.all():
959
+ no_dims = ', '.join(exclude_dims[np.where(exclude_dims_check)[0].tolist()])
960
+ raise KeyError(f'{no_dims} are not in dims.')
961
+
962
+ ## Check if variables exist
963
+ variables = np.array(self.data_vars)
964
+
965
+ if include_variables is not None:
966
+ include_variables_check = np.isin(include_variables, variables)
967
+ if not include_variables_check.all():
968
+ no_variables = ', '.join(include_variables[np.where(include_variables_check)[0].tolist()])
969
+ raise KeyError(f'{no_variables} are not in variables.')
970
+
971
+ if exclude_variables is not None:
972
+ exclude_variables_check = np.isin(exclude_variables, variables)
973
+ if not exclude_variables_check.all():
974
+ no_variables = ', '.join(exclude_variables[np.where(exclude_variables_check)[0].tolist()])
975
+ raise KeyError(f'{no_variables} are not in variables.')
976
+
977
+ ## Filter dims
978
+ if include_dims is not None:
979
+ dims = dims[np.isin(dims, include_dims)]
980
+ if exclude_dims is not None:
981
+ dims = dims[~np.isin(dims, exclude_dims)]
982
+
983
+ ## Filter variables
984
+ if include_variables is not None:
985
+ variables = variables[np.isin(variables, include_variables)]
986
+ if exclude_variables is not None:
987
+ variables = variables[~np.isin(variables, exclude_variables)]
988
+
989
+ for ds_name in copy.deepcopy(variables):
990
+ ds = self[ds_name]
991
+ ds_dims = np.asarray(ds.coords)
992
+ dims_check = np.isin(ds_dims, dims).all()
993
+ if not dims_check:
994
+ variables = np.delete(variables, np.where(variables == ds_name)[0])
995
+
996
+ ## Create file
997
+ file_kwargs['mode'] = 'w'
998
+ new_file = File(**file_kwargs)
999
+
1000
+ ## Iterate through the coordinates
1001
+ for dim_name in dims:
1002
+ old_dim = self[dim_name]
1003
+
1004
+ if coords is not None:
1005
+ if dim_name in coords:
1006
+ data = old_dim.loc[coords[dim_name]]
1007
+ else:
1008
+ data = old_dim.data
1009
+ else:
1010
+ data = old_dim.data
1011
+
1012
+ new_dim = new_file.create_coordinate(dim_name, data, encoding=old_dim.encoding._encoding)
1013
+ new_dim.attrs.update(old_dim.attrs)
1014
+
1015
+ ## Iterate through the old variables
1016
+ # TODO: Make the variable copy when doing a selection more RAM efficient
1017
+ for ds_name in variables:
1018
+ old_ds = self[ds_name]
1019
+
1020
+ if coords is not None:
1021
+ ds_dims = old_ds.coords
1022
+
1023
+ ds_sel = []
1024
+ for dim in ds_dims:
1025
+ if dim in keys:
1026
+ ds_sel.append(coords[dim])
1027
+ else:
1028
+ ds_sel.append(None)
1029
+
1030
+ data = old_ds.loc[tuple(ds_sel)]
1031
+ new_ds = new_file.create_data_variable(ds_name, old_ds.coords, data=data, encoding=old_ds.encoding._encoding)
1032
+ new_ds.attrs.update(old_ds.attrs)
1033
+ else:
1034
+ new_ds = old_ds.copy(new_file)
1035
+
1036
+ ## Add global attrs
1037
+ # new_file.attrs.update(self.attrs)
1038
+
1039
+ return new_file
1040
+
1041
+
1042
+ def to_pandas(self):
1043
+ """
1044
+ Convert the entire file into a pandas DataFrame.
1045
+ """
1046
+ if not import_pandas:
1047
+ raise ImportError('pandas could not be imported.')
1048
+
1049
+ # TODO: This feels wrong...but it works...
1050
+ result = None
1051
+ for var_name in self.data_vars:
1052
+ if result is None:
1053
+ result = self[var_name].to_pandas().to_frame()
1054
+ else:
1055
+ result = result.join(self[var_name].to_pandas().to_frame(), how='outer')
1056
+
1057
+ self.close()
1058
+
1059
+ return result
1060
+
1061
+
1062
+ def to_xarray(self, **kwargs):
1063
+ """
1064
+ Closes the file and opens it in xarray.
1065
+
1066
+ Parameters
1067
+ ----------
1068
+ kwargs
1069
+ Any kwargs that can be passed to xr.open_dataset.
1070
+
1071
+ Returns
1072
+ -------
1073
+ xr.Dataset
1074
+ """
1075
+ if not import_xarray:
1076
+ raise ImportError('xarray could not be imported.')
1077
+
1078
+ filename = pathlib.Path(self.filename)
1079
+
1080
+ if filename.is_file():
1081
+ self.close()
1082
+ else:
1083
+ temp_file = tempfile.NamedTemporaryFile()
1084
+ filename = temp_file.name
1085
+ self.to_file(filename)
1086
+ self.close()
1087
+
1088
+ x1 = xr.open_dataset(filename, **kwargs)
1089
+
1090
+ return x1
1091
+
1092
+
1093
+ def to_file(self, name: Union[str, pathlib.Path, io.BytesIO], compression: str='lzf', **file_kwargs):
1094
+ """
1095
+ Like copy, but must be a file path and will not be returned.
1096
+ """
1097
+ file = self.copy(name, compression, **file_kwargs)
1098
+ file.close()
1099
+
1100
+
1101
+ def copy(self, name: Union[str, pathlib.Path, io.BytesIO]=None, compression: str='lzf', **file_kwargs):
1102
+ """
1103
+ Copy a file object. kwargs can be any parameter for File.
1104
+ """
1105
+ # kwargs.setdefault('mode', 'w')
1106
+ file = File(name, mode='w', compression=compression, **file_kwargs)
1107
+
1108
+ ## Create coordinates
1109
+ for dim_name in self.coords:
1110
+ dim = self[dim_name]
1111
+ _ = copy_coordinate(file, dim, dim_name)
1112
+
1113
+ ## Create variables
1114
+ for ds_name in self.data_vars:
1115
+ ds = self[ds_name]
1116
+ _ = copy_data_variable(file, ds, ds_name)
1117
+
1118
+ return file
1119
+
1120
+
1121
+ def create_coordinate(self, name, data, dtype_encoded=None, dtype_decoded=None, scale_factor=None, add_offset=None, fillvalue=None, units=None, calendar=None, **kwargs):
1122
+ """
1123
+
1124
+ """
1125
+ if 'compression' not in kwargs:
1126
+ compression = self.compression
1127
+ compressor = utils.get_compressor(compression)
1128
+ kwargs.update({**compressor})
1129
+ else:
1130
+ compression = kwargs['compression']
1131
+
1132
+ data = np.asarray(data)
1133
+
1134
+ dtype_decoded, shape = utils.get_dtype_shape(data, dtype=dtype_decoded, shape=None)
1135
+
1136
+ if dtype_encoded is None:
1137
+ dtype_encoded = dtype_decoded
1138
+
1139
+ encoding = prepare_encodings_for_variables(dtype_encoded, dtype_decoded, scale_factor, add_offset, fillvalue, units, calendar)
1140
+
1141
+ coordinate = create_h5py_coordinate(self, name, data, shape, encoding, **kwargs)
1142
+ dim = Coordinate(coordinate, self, encoding)
1143
+ dim.encoding['compression'] = str(compression)
1144
+
1145
+ return dim
1146
+
1147
+
1148
+ def create_data_variable(self, name: str, dims: (str, tuple, list), shape: (tuple, list)=None, data=None, dtype_encoded=None, dtype_decoded=None, scale_factor=None, add_offset=None, fillvalue=None, units=None, calendar=None, **kwargs):
1149
+ """
1150
+ Add auto_encode option to determine the scale and offset automatically from the desired dtype? No, but provide the tool to allow the user to do it beforehand if they want.
1151
+ """
1152
+ if 'compression' not in kwargs:
1153
+ compression = self.compression
1154
+ compressor = utils.get_compressor(compression)
1155
+ kwargs.update({**compressor})
1156
+ else:
1157
+ compression = kwargs['compression']
1158
+
1159
+ if data is not None:
1160
+ data = np.asarray(data)
1161
+
1162
+ dtype_decoded, shape = utils.get_dtype_shape(data, dtype_decoded, shape)
1163
+
1164
+ if dtype_encoded is None:
1165
+ dtype_encoded = dtype_decoded
1166
+
1167
+ encoding = prepare_encodings_for_variables(dtype_encoded, dtype_decoded, scale_factor, add_offset, fillvalue, units, calendar)
1168
+
1169
+ ds0 = create_h5py_data_variable(self, name, dims, shape, encoding, data, **kwargs)
1170
+ ds = DataVariable(ds0, self, encoding)
1171
+ ds.encoding['compression'] = str(compression)
1172
+
1173
+ return ds
1174
+
1175
+
1176
+ def create_data_variable_like(self, from_data_var: DataVariable, name: str, include_data: bool=False, include_attrs: bool=False, **kwargs):
1177
+ """ Create a variable similar to `other`.
1178
+
1179
+ name
1180
+ Name of the variable (absolute or relative). Provide None to make
1181
+ an anonymous variable.
1182
+ from_variable
1183
+ The variable which the new variable should mimic. All properties, such
1184
+ as shape, dtype, chunking, ... will be taken from it, but no data
1185
+ or attributes are being copied.
1186
+
1187
+ Any variable keywords (see create_variable) may be provided, including
1188
+ shape and dtype, in which case the provided values take precedence over
1189
+ those from `other`.
1190
+ """
1191
+ ds = copy_data_variable(self, from_data_var, name, include_data, include_attrs, **kwargs)
1192
+
1193
+ return ds
1194
+
1195
+
1196
+
1197
+
1198
+
1199
+
1200
+
1201
+
1202
+
1203
+
1204
+
1205
+
1206
+
1207
+
1208
+
1209
+
1210
+
1211
+
1212
+
1213
+
1214
+
1215
+
1216
+
1217
+
1218
+
1219
+
1220
+
1221
+
1222
+
1223
+
1224
+
1225
+
1226
+
1227
+
1228
+
1229
+
1230
+
1231
+
1232
+