cfdb 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cfdb/utils.py ADDED
@@ -0,0 +1,2079 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Fri Sep 30 19:52:08 2022
5
+
6
+ @author: mike
7
+ """
8
+ import io
9
+ import pathlib
10
+ # import h5py
11
+ import os
12
+ import numpy as np
13
+ import msgspec
14
+ import re
15
+ from copy import deepcopy
16
+ # import xarray as xr
17
+ # from time import time
18
+ # from datetime import datetime
19
+ import cftime
20
+ import math
21
+ import rechunkit
22
+ from typing import Set, Optional, Dict, Tuple, List, Union, Any
23
+ # import zstandard as zstd
24
+ # import lz4
25
+ import booklet
26
+
27
+ # import dateutil.parser as dparser
28
+ # import numcodecs
29
+ # import hdf5plugin
30
+
31
+ from . import data_models
32
+ # import data_models
33
+
34
+ ########################################################
35
+ ### Parameters
36
+
37
+
38
+ CHUNK_BASE = 32*1024 # Multiplier by which chunks are adjusted
39
+ CHUNK_MIN = 32*1024 # Soft lower limit (32k)
40
+ CHUNK_MAX = 3*1024**2 # Hard upper limit (3M)
41
+
42
+ time_str_conversion = {'days': 'datetime64[D]',
43
+ 'hours': 'datetime64[h]',
44
+ 'minutes': 'datetime64[m]',
45
+ 'seconds': 'datetime64[s]',
46
+ 'milliseconds': 'datetime64[ms]',
47
+ 'microseconds': 'datetime64[us]',
48
+ 'nanoseconds': 'datetime64[ns]'}
49
+
50
+ # enc_fields = ('units', 'calendar', 'dtype', 'missing_value', '_FillValue', 'add_offset', 'scale_factor', 'dtype_decoded', 'dtype_encoded', 'compression')
51
+
52
+ fillvalue_dict = {'int8': -128, 'int16': -32768, 'int32': -2147483648, 'int64': -9223372036854775808, 'float32': np.nan, 'float64': np.nan, 'str': ''}
53
+
54
+ var_chunk_key_str = '{var_name}!{dims}'
55
+
56
+ attrs_key_str = '_{var_name}.attrs'
57
+
58
+ name_indent = 4
59
+ value_indent = 20
60
+ var_name_regex = "^[a-zA-Z][a-zA-Z0-9_]*$"
61
+ var_name_pattern = re.compile(var_name_regex)
62
+
63
+ time_units_dict = {
64
+ 'M': 'months',
65
+ 'D': 'days',
66
+ 'h': 'hours',
67
+ 'm': 'minutes',
68
+ 's': 'seconds',
69
+ 'ms': 'milliseconds',
70
+ 'us': 'microseconds',
71
+ 'ns': 'nanoseconds',
72
+ }
73
+
74
+ compression_options = ('zstd', 'lz4')
75
+ default_compression_levels = {'zstd': 1, 'lz4': 1}
76
+ default_n_buckets = 144013
77
+
78
+ default_params = {'lon': {'name': 'longitude', 'dtype_encoded': 'int32', 'fillvalue': -2147483648, 'scale_factor': 0.0000001, 'dtype_decoded': 'float32'},
79
+ 'lat': {'name': 'latitude', 'dtype_encoded': 'int32', 'fillvalue': -2147483648, 'scale_factor': 0.0000001, 'dtype_decoded': 'float32'},
80
+ 'height': {'name': 'height', 'dtype_encoded': 'int32', 'fillvalue': -2147483648, 'scale_factor': 0.001, 'dtype_decoded': 'float32'},
81
+ 'altitude': {'name': 'altitude', 'dtype_encoded': 'int32', 'fillvalue': -2147483648, 'scale_factor': 0.001, 'dtype_decoded': 'float32'},
82
+ 'time': {'name': 'time', 'dtype_encoded': 'int64', 'dtype_decoded': 'datetime64[s]'},
83
+ 'modified_date': {'name': 'modified_date', 'dtype_encoded': 'datetime64[us]', 'dtype_decoded': 'datetime64[us]'},
84
+ 'band': {'name': 'band', 'dtype_decoded': 'uint8', 'dtype_encoded': 'uint8', 'fillvalue': 0},
85
+ # 'chunk_day': {'dtype_encoded': 'int32'},
86
+ # 'chunk_date': {'fillvalue': -99999999, 'units': "days since 1970-01-01 00:00:00"},
87
+ 'censor_code': {'name': 'censor_code', 'dtype_decoded': 'uint8', 'dtype_encoded': 'uint8', 'fillvalue': 0},
88
+ # 'bore_top_of_screen': {'dtype_encoded': 'int16', 'fillvalue': 9999, 'scale_factor': 0.1},
89
+ # 'bore_bottom_of_screen': {'dtype_encoded': 'int16', 'fillvalue': 9999, 'scale_factor': 0.1},
90
+ # 'bore_depth': {'dtype_encoded': 'int16', 'fillvalue': -9999, 'scale_factor': 0.1},
91
+ # 'reference_level': {'dtype_encoded': 'int16', 'fillvalue': -9999, 'scale_factor': 1},
92
+ }
93
+
94
+ # base_attrs = {'station_id': {'cf_role': "timeseries_id", 'description': 'The unique ID associated with the geometry for a single result.'},
95
+ # 'lat': {'standard_name': "latitude", 'units': "degrees_north"},
96
+ # 'lon': {'standard_name': "longitude", 'units': "degrees_east"},
97
+ # 'altitude': {'standard_name': 'surface_altitude', 'long_name': 'height above the geoid to the lower boundary of the atmosphere', 'units': 'm'},
98
+ # 'geometry': {'long_name': 'The hexadecimal encoding of the Well-Known Binary (WKB) geometry', 'crs_EPSG': 4326},
99
+ # 'station_geometry': {'long_name': 'The hexadecimal encoding of the Well-Known Binary (WKB) station geometry', 'crs_EPSG': 4326},
100
+ # 'height': {'standard_name': 'height', 'long_name': 'vertical distance above the surface', 'units': 'm', 'positive': 'up'},
101
+ # 'time': {'standard_name': 'time', 'long_name': 'start_time'}, 'name': {'long_name': 'station name'},
102
+ # 'ref': {'long_name': 'station reference id given by the owner'}, 'modified_date': {'long_name': 'last modified date'},
103
+ # 'band': {'long_name': 'band number'},
104
+ # 'chunk_date': {'long_name': 'chunking date'},
105
+ # 'chunk_day': {'long_name': 'chunking day', 'description': 'The chunk day is the number of days after 1970-01-01. Can be negative for days before 1970-01-01 with a minimum of -106751, which is 1677-09-22 (minimum possible date). The maximum value is 106751.'},
106
+ # 'chunk_hash': {'long_name': 'chunk hash', 'description': 'The unique hash of the results parameter for comparison purposes.'},
107
+ # 'chunk_id': {'long_name': 'chunk id', 'description': 'The unique id of the results chunk associated with the specific station.'},
108
+ # 'censor_code': {'long_name': 'data censor code', 'standard_name': 'status_flag', 'flag_values': '0 1 2 3 4 5', 'flag_meanings': 'greater_than less_than not_censored non-detect present_but_not_quantified unknown'},
109
+ # 'bore_top_of_screen': {'long_name': 'bore top of screen', 'description': 'The depth to the top of the screen from the reference level.', 'units': 'm', 'positive': 'down'},
110
+ # 'bore_bottom_of_screen': {'long_name': 'bore bottom of screen', 'description': 'The depth to the bottom of the screen from the reference level.', 'units': 'm', 'positive': 'down'},
111
+ # 'bore_depth': {'long_name': 'bore depth', 'description': 'The depth of the bore from the reference level.', 'units': 'm', 'positive': 'down'},
112
+ # 'alt_name': {'long_name': 'Alternative name', 'description': 'The alternative name for the station'},
113
+ # 'reference_level': {'long_name': 'The bore reference level', 'description': 'The bore reference level for measurements.', 'units': 'mm', 'positive': 'up'}
114
+ # }
115
+
116
+ default_attrs = dict(
117
+ lat={
118
+ 'long_name': 'latitude',
119
+ 'units': 'degrees_north',
120
+ 'standard_name': 'latitude',
121
+ 'axis': 'Y',
122
+ },
123
+ lon={
124
+ 'long_name': 'longitude',
125
+ 'units': 'degrees_east',
126
+ 'standard_name': 'longitude',
127
+ 'axis': 'X',
128
+ },
129
+ height={
130
+ 'long_name': 'height',
131
+ 'units': 'm',
132
+ 'standard_name': 'height',
133
+ 'positive': 'up',
134
+ 'axis': 'Z',
135
+ },
136
+ altitude={
137
+ 'long_name': 'altitude',
138
+ 'units': 'm',
139
+ 'standard_name': 'altitude',
140
+ 'positive': 'up',
141
+ 'axis': 'Z',
142
+ },
143
+ time={
144
+ 'long_name': 'time',
145
+ # 'units': 'seconds since 1970-01-01 00:00:00',
146
+ 'standard_name': 'time',
147
+ # 'calendar': 'proleptic_gregorian',
148
+ 'axis': 'T',
149
+ },
150
+ )
151
+
152
+ #########################################################
153
+ ### Classes
154
+
155
+
156
+
157
+
158
+ # class ChunkIterator:
159
+ # """
160
+ # Class to iterate through list of chunks of a given dataset
161
+ # """
162
+ # def __init__(self, chunks, shape, source_sel=None):
163
+ # self._shape = shape
164
+ # rank = len(shape)
165
+
166
+ # # if not dset.chunks:
167
+ # # # can only use with chunked datasets
168
+ # # raise TypeError("Chunked dataset required")
169
+
170
+ # self._layout = chunks
171
+ # if source_sel is None:
172
+ # # select over entire dataset
173
+ # slices = []
174
+ # for dim in range(rank):
175
+ # slices.append(slice(0, self._shape[dim]))
176
+ # self._sel = tuple(slices)
177
+ # else:
178
+ # if isinstance(source_sel, slice):
179
+ # self._sel = (source_sel,)
180
+ # else:
181
+ # self._sel = source_sel
182
+ # if len(self._sel) != rank:
183
+ # raise ValueError("Invalid selection - selection region must have same rank as dataset")
184
+ # self._chunk_index = []
185
+ # for dim in range(rank):
186
+ # s = self._sel[dim]
187
+ # if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start:
188
+ # raise ValueError("Invalid selection - selection region must be within dataset space")
189
+ # index = s.start // self._layout[dim]
190
+ # self._chunk_index.append(index)
191
+
192
+ # def __iter__(self):
193
+ # return self
194
+
195
+ # def __next__(self):
196
+ # rank = len(self._shape)
197
+ # slices = []
198
+ # if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop:
199
+ # # ran past the last chunk, end iteration
200
+ # raise StopIteration()
201
+
202
+ # for dim in range(rank):
203
+ # s = self._sel[dim]
204
+ # start = self._chunk_index[dim] * self._layout[dim]
205
+ # stop = (self._chunk_index[dim] + 1) * self._layout[dim]
206
+ # # adjust the start if this is an edge chunk
207
+ # if start < s.start:
208
+ # start = s.start
209
+ # if stop > s.stop:
210
+ # stop = s.stop # trim to end of the selection
211
+ # s = slice(start, stop, 1)
212
+ # slices.append(s)
213
+
214
+ # # bump up the last index and carry forward if we run outside the selection
215
+ # dim = rank - 1
216
+ # while dim >= 0:
217
+ # s = self._sel[dim]
218
+ # self._chunk_index[dim] += 1
219
+
220
+ # chunk_end = self._chunk_index[dim] * self._layout[dim]
221
+ # if chunk_end < s.stop:
222
+ # # we still have room to extend along this dimension
223
+ # return tuple(slices)
224
+
225
+ # if dim > 0:
226
+ # # reset to the start and continue iterating with higher dimension
227
+ # self._chunk_index[dim] = 0
228
+ # dim -= 1
229
+ # return tuple(slices)
230
+
231
+
232
+ #########################################################
233
+ ### Functions
234
+
235
+
236
+ def parse_cf_time_units(dtype_decoded):
237
+ """
238
+ Convert a numpy datetime64 dtype into a CF-style time units string, e.g. 'seconds since 1970-01-01T00:00:00'.
239
+ """
240
+ np_time_str = dtype_decoded.str.split('[')[1].split(']')[0]
241
+ time_name = time_units_dict[np_time_str]
242
+ datetime = np.datetime64('1970-01-01', np_time_str)
243
+ units = f'{time_name} since {datetime}'
244
+
245
+ return units
246
+
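+ # Illustrative example (not part of the original source): for a seconds-resolution
+ # datetime dtype, the units string is built from the numpy frequency code.
+ # parse_cf_time_units(np.dtype('datetime64[s]')) -> 'seconds since 1970-01-01T00:00:00'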
247
+
248
+ def min_max_dates_per_bit_len(n_bits):
249
+ """
250
+ Return the min and max representable dates for each numpy frequency code ('D', 'h', 'm', 's') given an integer bit length (16, 32, or 64).
251
+ """
252
+ n_bits_options = (16, 32, 64)
253
+ if n_bits not in n_bits_options:
254
+ raise ValueError(f'n_bits must be one of {n_bits_options}')
255
+
256
+ freq_codes = ('D', 'h', 'm', 's')
257
+ res_dict = {}
258
+ for code in freq_codes:
259
+ int_len = int(2**n_bits*.5)
260
+ min_date = np.datetime64(-int_len + 1, code).astype(str)
261
+ max_date = np.datetime64(int_len - 1, code).astype(str)
262
+ res_dict[code] = (min_date, max_date)
263
+
264
+ return res_dict
265
+
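+ # Illustrative example (not part of the original source): with 32 bits the seconds
+ # entry covers the classic unix int32 range.
+ # min_max_dates_per_bit_len(32)['s'] -> ('1901-12-13T20:45:53', '2038-01-19T03:14:07')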
266
+
267
+ def dataset_finalizer(blt_file, sys_meta):
268
+ """
269
+ Finalizer that writes the system metadata to the booklet file (only if it changed) and closes the file.
270
+ """
271
+ old_meta_data = blt_file.get_metadata()
272
+ if old_meta_data is not None:
273
+ old_meta = msgspec.convert(old_meta_data, data_models.SysMeta)
274
+ if old_meta != sys_meta:
275
+ blt_file.set_metadata(msgspec.to_builtins(sys_meta))
276
+ else:
277
+ blt_file.set_metadata(msgspec.to_builtins(sys_meta))
278
+
279
+ blt_file.close()
280
+
281
+
282
+ def attrs_finalizer(blt_file, attrs, var_name, writeable):
283
+ """
284
+ Finalizer that writes a variable's attributes to the booklet file if the file is writeable and the attributes have changed.
285
+ """
286
+ if attrs and writeable:
287
+ key = attrs_key_str.format(var_name=var_name)
288
+ old_attrs = blt_file.get(key)
289
+ if old_attrs is not None:
290
+ old_attrs = msgspec.json.decode(old_attrs)
291
+ if old_attrs != attrs:
292
+ blt_file.set(key, msgspec.json.encode(attrs))
293
+ else:
294
+ blt_file.set(key, msgspec.json.encode(attrs))
295
+
296
+
297
+ def compute_scale_and_offset(min_value: Union[int, float, np.number], max_value: Union[int, float, np.number], dtype: Union[np.dtype, str]):
298
+ """
299
+ Computes the scale (slope) and offset for a dataset using a min value, max value, and the required np.dtype. It leaves one value at the lower extreme to use for the nan fillvalue.
300
+ These are the min values set aside for the fillvalue (up to 64 bits).
301
+ int8: -128
302
+ int16: -32768
303
+ int32: -2147483648
304
+ int64: -9223372036854775808
305
+
306
+ Unsigned integers are allowed and a value of 0 is set aside for the fillvalue.
307
+
308
+ Parameters
309
+ ----------
310
+ min_value : int or float
311
+ The min value of the dataset.
312
+ max_value : int or float
313
+ The max value of the dataset.
314
+ dtype : np.dtype
315
+ The data type that you want to shrink the data down to.
316
+
317
+ Returns
318
+ -------
319
+ scale, offset as floats
320
+ """
321
+ if isinstance(dtype, str):
322
+ dtype = np.dtype(dtype)
323
+ bits = dtype.itemsize * 8
324
+ data_range = max_value - min_value
325
+ target_range = 2**bits - 2
326
+ slope = data_range / target_range
327
+
328
+ if bits < 64:
329
+ target_min = -(2**(bits - 1) - 1)
330
+ else:
331
+ target_min = -(2**(bits - 1) - 1000)
332
+
333
+ # if bits < 64:
334
+ # target_range = 2**bits - 2
335
+ # target_min = -(2**(bits - 1) - 1)
336
+ # slope = data_range / target_range
337
+ # else:
338
+ # data_power = int(math.log10(data_range))
339
+ # target_range = 2**bits
340
+ # target_power = int(math.log10(target_range))
341
+ # target_min = -10**(target_power - 1)
342
+ # slope = 10**-(target_power - data_power)
343
+
344
+ # Correction if the dtype is unsigned
345
+ if dtype.kind == 'u':
346
+ target_min = 1
347
+
348
+ offset = min_value - (slope*target_min)
349
+
350
+ return slope, offset
351
+
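+ # Illustrative example (not part of the original source): packing floats in the range
+ # 0-100 into int16 uses the 65534 available steps (one int16 value is reserved for the
+ # fillvalue), so compute_scale_and_offset(0.0, 100.0, 'int16') returns a slope of
+ # 100/65534 (~0.0015259) and an offset of ~49.999. Decoding with
+ # decoded = encoded*slope + offset maps -32767 back to 0.0 and 32767 back to 100.0.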
352
+
353
+ def check_var_name(var_name):
354
+ """
355
+ Function to test if the user-supplied var name is allowed.
356
+ """
357
+ if isinstance(var_name, str):
358
+ if len(var_name) <= 256:
359
+ if var_name_pattern.match(var_name):
360
+ return True
361
+ return False
362
+
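+ # Illustrative examples (not part of the original source): names must start with a
+ # letter, contain only letters, digits, and underscores, and be at most 256 characters.
+ # check_var_name('air_temp') -> True
+ # check_var_name('2m_temp') -> False (starts with a digit)
+ # check_var_name('air temp') -> False (contains a space)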
363
+
364
+ def coord_data_step_check(data: np.ndarray, dtype_decoded: np.dtype, step: int | float | bool = False):
365
+ """
366
+ Check that the coord data is regularly spaced and determine the step. Returns the step and the number of steps between the first and last values.
367
+ """
368
+ # diff = np.diff(data)
369
+ if isinstance(step, bool):
370
+ diff = np.diff(data)
371
+ if dtype_decoded.kind == 'f':
372
+ step = float(np.round(diff[0], 5))
373
+ if not np.allclose(step, diff):
374
+ raise ValueError('step is set to True, but the data does not seem to be regular.')
375
+ # data = np.linspace(data[0], data[-1], len(diff) + 1, dtype=dtype_decoded)
376
+ else:
377
+ step = int(diff[0])
378
+
379
+ if not np.all(np.equal(step, diff)):
380
+ raise ValueError('step is set to True, but the data does not seem to be regular.')
381
+ elif isinstance(step, (float, np.floating)):
382
+ if step <= 0:
383
+ raise ValueError('step must be greater than 0.')
384
+ # if not np.allclose(step, diff):
385
+ # raise ValueError('step does not seem to be the interval of the data.')
386
+ step = float(round(step, 5))
387
+ # num = round((data[-1] - data[0])/step, 5)
388
+ # if not num.is_integer():
389
+ # raise ValueError('The step is not a multiple of the difference between the first and last values of the data.')
390
+
391
+ # data = np.linspace(data[0], data[-1], int(num) + 1, dtype=dtype_decoded)
392
+ elif isinstance(step, (int, np.integer)):
393
+ if step <= 0:
394
+ raise ValueError('step must be greater than 0.')
395
+ # if not np.all(np.equal(step, diff)):
396
+ # raise ValueError('step is set to True, but the data does not seem to be regular.')
397
+ step = int(step)
398
+
399
+ # data = np.linspace(data[0], data[-1], int(num) + 1, dtype=dtype_decoded)
400
+ else:
401
+ raise TypeError('step must be a bool, int, or float. The int or float must be greater than 0.')
402
+
403
+ num = round((data[-1] - data[0])/step, 5)
404
+ if not num.is_integer():
405
+ raise ValueError('The difference between the first and last values of the data is not a multiple of the step.')
406
+
407
+ return step, int(num)
408
+
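+ # Illustrative example (not part of the original source): regularly spaced float
+ # coords with step=True get the step from the first difference.
+ # coord_data_step_check(np.array([0.0, 0.5, 1.0, 1.5], dtype='float32'), np.dtype('float32'), True) -> (0.5, 3)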
409
+
410
+ def init_coord_data_checks(data: np.ndarray, step: int | float | bool, dtype_decoded, shape):
411
+ """
412
+ Checks performed on the initial data assigned to a coordinate: it must be 1D and unique, and numeric/datetime data is sorted. Returns the step (or None).
413
+ """
414
+ # dtype_decoded = data.dtype
415
+ # shape = data.shape
416
+
417
+ if len(shape) > 1:
418
+ raise ValueError('Coordinates must be 1D.')
419
+
420
+ if len(np.unique(data)) < shape[0]:
421
+ raise ValueError('The data for coords must be unique.')
422
+
423
+ if dtype_decoded.kind in ('f', 'u', 'i', 'M'):
424
+ data.sort()
425
+ if step:
426
+ step, num = coord_data_step_check(data, dtype_decoded, step)
427
+ # data = np.linspace(data[0], data[-1], num + 1, dtype=dtype_decoded)
428
+ else:
429
+ step = None
430
+ else:
431
+ step = None
432
+
433
+ return step
434
+
435
+
436
+ def append_coord_data_checks(new_data: np.ndarray, source_data: np.ndarray, source_dtype_decoded: np.dtype = None, source_step: int | float | None = None):
437
+ """
438
+ Checks performed when appending data to an existing coordinate. All new values must be greater than the existing values. Returns the combined data.
439
+ """
440
+ # new_shape = new_data.shape
441
+ # new_dtype_decoded = new_data.dtype
442
+ new_data = np.asarray(new_data, dtype=source_dtype_decoded)
443
+
444
+ # if source_dtype_decoded != new_dtype_decoded:
445
+ # raise TypeError('The data dtype does not match the originally assigned dtype.')
446
+
447
+ # print(source_data)
448
+
449
+ if source_data.size > 0:
450
+ if source_dtype_decoded.kind != 'U':
451
+ last = source_data[-1]
452
+
453
+ if not np.all(last < new_data):
454
+ raise ValueError('Appending requires that all values are greater than the existing values.')
455
+
456
+ new_data.sort()
457
+ if source_step:
458
+ new_step, new_num = coord_data_step_check(new_data, source_dtype_decoded, source_step)
459
+
460
+ new_data = np.linspace(source_data[0], new_data[-1], len(source_data) + new_num + 1, dtype=source_dtype_decoded)
461
+ else:
462
+ new_data = np.append(source_data, new_data)
463
+
464
+ else:
465
+ s1 = set(source_data)
466
+ s1.update(set(new_data))
467
+ if len(s1) != (len(source_data) + len(new_data)):
468
+ raise ValueError('The data for coords must be unique.')
469
+
470
+ new_data = np.append(source_data, new_data)
471
+
472
+ else:
473
+ _ = init_coord_data_checks(new_data, source_step, source_dtype_decoded, new_data.shape)
474
+
475
+ return new_data
476
+
477
+
478
+ def prepend_coord_data_checks(new_data: np.ndarray, source_data: np.ndarray, source_dtype_decoded: np.dtype = None, source_step: int | float | None = None):
479
+ """
480
+ Checks performed when prepending data to an existing coordinate. All new values must be less than the existing values. Returns the combined data.
481
+ """
482
+ # new_shape = new_data.shape
483
+ # new_dtype_decoded = new_data.dtype
484
+ new_data = np.asarray(new_data, dtype=source_dtype_decoded)
485
+
486
+ # if source_dtype_decoded != new_dtype_decoded:
487
+ # raise TypeError('The data dtype does not match the originally assigned dtype.')
488
+
489
+ if source_data.size > 0:
490
+ if source_dtype_decoded.kind != 'U':
491
+ first = source_data[0]
492
+
493
+ if not np.all(first > new_data):
494
+ raise ValueError('Prepending requires that all values are less than the existing values.')
495
+
496
+ new_data.sort()
497
+ if source_step:
498
+ new_step, new_num = coord_data_step_check(new_data, source_dtype_decoded, source_step)
499
+
500
+ new_data = np.linspace(new_data[0], source_data[-1], len(source_data) + new_num + 1, dtype=source_dtype_decoded)
501
+ else:
502
+ new_data = np.append(new_data, source_data)
503
+ else:
504
+ s1 = set(source_data)
505
+ s1.update(set(new_data))
506
+ if len(s1) != (len(source_data) + len(new_data)):
507
+ raise ValueError('The data for coords must be unique.')
508
+
509
+ new_data = np.append(new_data, source_data)
510
+
511
+ else:
512
+ _ = init_coord_data_checks(new_data, source_step, source_dtype_decoded, new_data.shape)
513
+
514
+ return new_data
515
+
516
+
517
+ # def coord_data_checks(data: np.ndarray, step: int | float | bool = False, source_data: np.ndarray = None, source_dtype_decoded: np.dtype = None, source_step: int | float | None = None):
518
+ # """
519
+
520
+ # """
521
+ # shape = data.shape
522
+ # dtype_decoded = data.dtype
523
+
524
+ # if dtype_decoded is not None:
525
+ # if source_dtype_decoded != dtype_decoded:
526
+ # raise TypeError('The data dtype does not match the originally assigned dtype.')
527
+
528
+ # if source_data:
529
+ # s1 = set(source_data)
530
+ # s1.update(set(data))
531
+ # if len(s1) != (len(source_data) + len(data)):
532
+ # raise ValueError('The data for coords must be unique.')
533
+ # elif len(np.unique(data)) < shape[0]:
534
+ # raise ValueError('The data for coords must be unique.')
535
+
536
+ # if dtype_decoded.kind in ('f', 'u', 'i', 'M'):
537
+ # data.sort()
538
+ # if step:
539
+ # diff = np.diff(data)
540
+ # if isinstance(step, bool):
541
+ # if dtype_decoded == 'f':
542
+ # step = float(np.round(diff[0], 5))
543
+ # if not np.allclose(step, diff):
544
+ # raise ValueError('step is set to True, but the data does not seem to be regular.')
545
+ # # data = np.linspace(data[0], data[-1], len(diff) + 1, dtype=dtype_decoded)
546
+ # else:
547
+ # step = int(diff[0])
548
+
549
+ # if not np.all(np.equal(step, diff)):
550
+ # raise ValueError('step is set to True, but the data does not seem to be regular.')
551
+ # elif isinstance(step, (float, np.floating)):
552
+ # if step <= 0:
553
+ # raise ValueError('step must be greater than 0.')
554
+ # # if not np.allclose(step, diff):
555
+ # # raise ValueError('step does not seem to be the interval of the data.')
556
+ # step = float(round(step, 5))
557
+ # num = round((data[-1] - data[0])/step, 5)
558
+ # if not num.is_integer():
559
+ # raise ValueError('The step is not a multiple of the difference between the first and last values of the data.')
560
+
561
+ # data = np.linspace(data[0], data[-1], int(num) + 1, dtype=dtype_decoded)
562
+ # elif isinstance(step, (int, np.integer)):
563
+ # if step <= 0:
564
+ # raise ValueError('step must be greater than 0.')
565
+ # # if not np.all(np.equal(step, diff)):
566
+ # # raise ValueError('step is set to True, but the data does not seem to be regular.')
567
+ # step = int(step)
568
+ # num = round((data[-1] - data[0])/step, 5)
569
+ # if not num.is_integer():
570
+ # raise ValueError('The step is not a multiple of the difference between the first and last values of the data.')
571
+
572
+ # data = np.linspace(data[0], data[-1], int(num) + 1, dtype=dtype_decoded)
573
+ # else:
574
+ # raise TypeError('step must be a bool, int, or float. The int or float must be greater than 0.')
575
+ # else:
576
+ # step = None
577
+ # else:
578
+ # step = None
579
+
580
+ # return data, step
581
+
582
+
583
+ def parse_dtypes(dtype_decoded, dtype_encoded):
584
+ """
585
+ Coerce dtype_decoded and dtype_encoded to np.dtype objects. If dtype_encoded is not supplied, it defaults to dtype_decoded.
586
+ """
587
+ dtype_decoded = np.dtype(dtype_decoded)
588
+
589
+ # if dtype_decoded.kind == 'M':
590
+ # dtype_encoded = np.dtype('int64')
591
+
592
+ if isinstance(dtype_encoded, str):
593
+ dtype_encoded = np.dtype(dtype_encoded)
594
+
595
+ elif not isinstance(dtype_encoded, np.dtype):
596
+ dtype_encoded = dtype_decoded
597
+
598
+ return dtype_decoded, dtype_encoded
599
+
600
+
601
+ def parse_dtype_names(dtype_decoded, dtype_encoded):
602
+ """
603
+ Return the string names of the decoded and encoded dtypes (the .str form for unicode dtypes, .name otherwise).
604
+ """
605
+ if dtype_encoded.kind == 'U':
606
+ dtype_decoded_name = dtype_decoded.str
607
+ dtype_encoded_name = dtype_encoded.str
608
+ else:
609
+ dtype_decoded_name = dtype_decoded.name
610
+ dtype_encoded_name = dtype_encoded.name
611
+
612
+ return dtype_decoded_name, dtype_encoded_name
613
+
614
+
615
+ def parse_fillvalue(fillvalue, dtype_encoded):
616
+ """
617
+ Validate a user-supplied fillvalue against dtype_encoded, or assign a default fillvalue based on the dtype kind.
618
+ """
619
+ ## Fillvalue
620
+ kind = dtype_encoded.kind
621
+ if fillvalue is not None:
622
+ fillvalue_dtype = np.dtype(type(fillvalue))
623
+
624
+ if kind == 'u' and fillvalue_dtype.kind == 'i':
625
+ if fillvalue < 0:
626
+ raise ValueError('The dtype_encoded is an unsigned integer, but the fillvalue is < 0.')
627
+ elif fillvalue_dtype.kind != kind:
628
+ raise ValueError('The fillvalue dtype is not the same as the dtype_encoded dtype.')
629
+ else:
630
+ if kind == 'u':
631
+ fillvalue = 0
632
+ elif kind == 'f':
633
+ fillvalue = None
634
+ elif kind == 'U':
635
+ fillvalue = ''
636
+ elif kind == 'i':
637
+ fillvalue = fillvalue_dict[dtype_encoded.name]
638
+ elif kind == 'M':
639
+ fillvalue = None
640
+ else:
641
+ raise TypeError('Unknown/unsupported data type.')
642
+
643
+ return fillvalue
644
+
645
+
646
+ def parse_scale_offset(scale_factor, add_offset, dtype_decoded):
647
+ """
648
+ Validate scale_factor and add_offset (scale_factor defaults to 1 if only add_offset is given). They only apply when dtype_decoded is a float.
649
+ """
650
+ ## Scale and offset
651
+ if scale_factor is None and isinstance(add_offset, (int, float, np.number)):
652
+ scale_factor = 1
653
+ # if isinstance(scale_factor, (int, float, np.number)) and add_offset is None:
654
+ # add_offset = 0
655
+
656
+ if isinstance(scale_factor, (int, float, np.number)) and dtype_decoded.kind != 'f':
657
+ raise ValueError('scale_factor and add_offset only apply to floats.')
658
+
659
+ return scale_factor, add_offset
660
+
661
+
662
+ def parse_coord_inputs(name: str, data: np.ndarray | None = None, chunk_shape: Tuple[int] | None = None, dtype_decoded: str | np.dtype | None = None, dtype_encoded: str | np.dtype | None = None, fillvalue: Union[int, float, str] = None, scale_factor: Union[float, int, None] = None, add_offset: Union[float, int, None] = None, step: int | float | bool = False):
663
+ """
664
+ Process and validate the inputs for creating a coordinate variable. Returns the name and the CoordinateVariable metadata object.
665
+ """
666
+ ## Check var name
667
+ if not check_var_name(name):
668
+ raise ValueError(f'{name} is not a valid variable name.')
669
+
670
+ ## Check data, shape, dtype, and step
671
+ if isinstance(data, np.ndarray):
672
+ dtype_decoded = data.dtype
673
+
674
+ step = init_coord_data_checks(data, step, dtype_decoded, data.shape)
675
+
676
+ # if dtype_decoded.kind == 'M':
677
+ # dtype_encoded = dtype_decoded
678
+
679
+ ## dtype encoding
680
+ dtype_decoded, dtype_encoded = parse_dtypes(dtype_decoded, dtype_encoded)
681
+
682
+ else:
683
+ ## dtype encoding
684
+ dtype_decoded, dtype_encoded = parse_dtypes(dtype_decoded, dtype_encoded)
685
+
686
+ if dtype_decoded.kind in ('u', 'i') and isinstance(step, (float, np.floating)):
687
+ if not step.is_integer():
688
+ raise ValueError('If the dtype_decoded is an integer, then step must be an integer.')
689
+ else:
690
+ step = int(step)
691
+ elif isinstance(step, bool):
692
+ if step:
693
+ raise TypeError('If data is not passed, then step cannot be set to True')
694
+ else:
695
+ step = None
696
+ elif isinstance(step, (int, np.integer)):
+ step = int(step)
+ elif isinstance(step, (float, np.floating)):
697
+ step = float(round(step, 5))
698
+ else:
699
+ raise TypeError('step must be a bool, int, or float. The int or float must be greater than 0.')
700
+
701
+ ## Guess the chunk_shape from the dtype
702
+ if isinstance(chunk_shape, tuple):
703
+ if not all([isinstance(c, int) for c in chunk_shape]):
704
+ raise TypeError('chunk_shape must be a tuple of ints.')
705
+ elif chunk_shape is None:
706
+ chunk_shape = rechunkit.guess_chunk_shape((1000000,), dtype_encoded, 2**20)
707
+ else:
708
+ raise TypeError('chunk_shape must be either a tuple of ints or None.')
709
+
710
+ ## fillvalue
711
+ fillvalue = parse_fillvalue(fillvalue, dtype_encoded)
712
+
713
+ ## Scale and offset
714
+ scale_factor, add_offset = parse_scale_offset(scale_factor, add_offset, dtype_decoded)
715
+
716
+ ## Save metadata
717
+ dtype_decoded_name, dtype_encoded_name = parse_dtype_names(dtype_decoded, dtype_encoded)
718
+
719
+ # enc = data_models.Encoding(dtype_encoded=dtype_encoded_name, dtype_decoded=dtype_decoded_name, fillvalue=fillvalue, scale_factor=scale_factor, add_offset=add_offset)
720
+
721
+ var = data_models.CoordinateVariable(shape=(0,), chunk_shape=chunk_shape, origin=0, step=step, dtype_encoded=dtype_encoded_name, dtype_decoded=dtype_decoded_name, fillvalue=fillvalue, scale_factor=scale_factor, add_offset=add_offset)
722
+
723
+ return name, var
724
+
725
+
726
+ def parse_var_inputs(sys_meta: data_models.SysMeta, name: str, coords: Tuple[str,...], dtype_decoded: str | np.dtype, dtype_encoded: str | np.dtype | None = None, chunk_shape: Tuple[int] | None = None, fillvalue: Union[int, float, str] = None, scale_factor: Union[float, int, None] = None, add_offset: Union[float, int, None] = None):
727
+ """
728
+ Function to process the inputs to a variable creation function.
729
+ """
730
+ ## Check var name
731
+ if not check_var_name(name):
732
+ raise ValueError(f'{name} is not a valid variable name.')
733
+
734
+ if name in sys_meta.variables:
735
+ raise ValueError(f"Dataset already contains the variable {name}.")
736
+
737
+ ## Check shape and dtype
738
+ if len(coords) == 0:
739
+ raise ValueError('coords must have at least one value.')
740
+
741
+ shape = []
742
+ for coord_name in coords:
743
+ if not isinstance(coord_name, str):
744
+ raise TypeError('coords must contain strings of the coordinate names.')
745
+ if coord_name not in sys_meta.variables:
746
+ raise ValueError(f'{coord_name} is not in the list of coordinates.')
747
+ else:
748
+ coord = sys_meta.variables[coord_name]
749
+ shape.append(coord.shape[0])
750
+
751
+ ## dtypes
752
+ dtype_decoded, dtype_encoded = parse_dtypes(dtype_decoded, dtype_encoded)
753
+
754
+ ## Guess the chunk_shape from the dtype
755
+ if isinstance(chunk_shape, tuple):
756
+ if not all([isinstance(c, int) for c in chunk_shape]):
757
+ raise TypeError('chunk_shape must be a tuple of ints.')
758
+ elif chunk_shape is None:
759
+ chunk_shape = rechunkit.guess_chunk_shape(shape, dtype_encoded, 2**21)
760
+ else:
761
+ raise TypeError('chunk_shape must be either a tuple of ints or None.')
762
+
763
+ ## fillvalue
764
+ fillvalue = parse_fillvalue(fillvalue, dtype_encoded)
765
+
766
+ ## Scale and offset
767
+ scale_factor, add_offset = parse_scale_offset(scale_factor, add_offset, dtype_decoded)
768
+
769
+ ## Save metadata
770
+ dtype_decoded_name, dtype_encoded_name = parse_dtype_names(dtype_decoded, dtype_encoded)
771
+
772
+ # enc = data_models.Encoding(dtype_encoded=dtype_encoded_name, dtype_decoded=dtype_decoded_name, fillvalue=fillvalue, scale_factor=scale_factor, add_offset=add_offset)
773
+
774
+ var = data_models.DataVariable(coords=tuple(coords), chunk_shape=chunk_shape, dtype_encoded=dtype_encoded_name, dtype_decoded=dtype_decoded_name, fillvalue=fillvalue, scale_factor=scale_factor, add_offset=add_offset)
775
+
776
+ return name, var
777
+
778
+
779
+ # def encode_datetime(data, units=None, calendar='gregorian'):
780
+ # """
781
+
782
+ # """
783
+ # if units is None:
784
+ # output = data.astype('datetime64[s]').astype('int64')
785
+ # else:
786
+ # if '1970-01-01' in units:
787
+ # time_unit = units.split()[0]
788
+ # output = data.astype(time_str_conversion[time_unit]).astype('int64')
789
+ # else:
790
+ # output = cftime.date2num(data.astype('datetime64[s]').tolist(), units, calendar)
791
+
792
+ # return output
793
+
794
+
795
+ def decode_datetime(data, units=None, calendar='gregorian'):
796
+ """
797
+ Decode integer time values to datetime64. Units referenced to 1970-01-01 use a direct numpy cast; other units go through cftime.
798
+ """
799
+ if units is None:
800
+ output = data.astype('datetime64[s]')
801
+ else:
802
+ if '1970-01-01' in units:
803
+ time_unit = units.split()[0]
804
+ output = data.astype(time_str_conversion[time_unit])
805
+ else:
806
+ output = cftime.num2pydate(data, units, calendar).astype('datetime64[s]')
807
+
808
+ return output
809
+
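+ # Illustrative example (not part of the original source): integer hours since the
+ # 1970 epoch decode through a direct numpy cast.
+ # decode_datetime(np.array([0, 24]), 'hours since 1970-01-01 00:00:00')
+ # -> array(['1970-01-01T00', '1970-01-02T00'], dtype='datetime64[h]')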
810
+
811
+ # def encode_data(data, dtype_encoded, fillvalue, add_offset, scale_factor, compressor) -> bytes:
812
+ # """
813
+
814
+ # """
815
+ # if 'datetime64' in data.dtype.name:
816
+ # data = data.astype('int64')
817
+
818
+ # elif isinstance(scale_factor, (int, float, np.number)):
819
+ # # precision = int(np.abs(np.log10(val['scale_factor'])))
820
+ # data = np.round((data - add_offset)/scale_factor)
821
+
822
+ # if isinstance(fillvalue, (int, np.number)):
823
+ # data[np.isnan(data)] = fillvalue
824
+
825
+ # # if (data.dtype.name != dtype_encoded) or (data.dtype.name == 'object'):
826
+ # # data = data.astype(dtype_encoded)
827
+ # # print(data)
828
+ # data = data.astype(dtype_encoded)
829
+
830
+ # return compressor.compress(data.tobytes())
831
+
832
+
833
+ # def decode_data(data: bytes, dtype_encoded, dtype_decoded, missing_value, add_offset=0, scale_factor=None, units=None, calendar=None, **kwargs) -> np.ndarray:
834
+ # """
835
+
836
+ # """
837
+ # data = np.frombuffer(data, dtype=dtype_encoded)
838
+
839
+ # if isinstance(calendar, str):
840
+ # data = decode_datetime(data, units, calendar)
841
+
842
+ # elif isinstance(scale_factor, (int, float, np.number)):
843
+ # data = data.astype(dtype_decoded)
844
+
845
+ # if isinstance(missing_value, (int, np.number)):
846
+ # if isinstance(data, np.number):
847
+ # if data == missing_value:
848
+ # data = np.nan
849
+ # else:
850
+ # data[data == missing_value] = np.nan
851
+
852
+ # data = (data * scale_factor) + add_offset
853
+
854
+ # elif (data.dtype.name != dtype_decoded) or (data.dtype.name == 'object'):
855
+ # data = data.astype(dtype_decoded)
856
+
857
+ # return data
858
+
859
+
860
+ # def get_encoding_data_from_attrs(attrs):
861
+ # """
862
+
863
+ # """
864
+ # encoding = {}
865
+ # for f, v in attrs.items():
866
+ # if f in enc_fields:
867
+ # if isinstance(v, bytes):
868
+ # encoding[f] = v.decode()
869
+ # elif isinstance(v, np.ndarray):
870
+ # if len(v) == 1:
871
+ # encoding[f] = v[0]
872
+ # else:
873
+ # raise ValueError('encoding is an ndarray with len > 1.')
874
+ # else:
875
+ # encoding[f] = v
876
+
877
+ # return encoding
878
+
879
+
880
+ # def get_encoding_data_from_xr(data):
881
+ # """
882
+
883
+ # """
884
+ # attrs = {f: v for f, v in data.attrs.items() if (f in enc_fields) and (f not in ignore_attrs)}
885
+ # encoding = {f: v for f, v in data.encoding.items() if (f in enc_fields) and (f not in ignore_attrs)}
886
+
887
+ # attrs.update(encoding)
888
+
889
+ # return attrs
890
+
891
+
892
+ # def process_encoding(encoding, dtype):
893
+ # """
894
+
895
+ # """
896
+ # if (dtype.name == 'object') or ('str' in dtype.name):
897
+ # # encoding['dtype'] = h5py.string_dtype()
898
+ # encoding['dtype'] = 'object'
899
+ # elif ('datetime64' in dtype.name): # which means it's an xr.DataArray
900
+ # encoding['dtype'] = 'int64'
901
+ # encoding['calendar'] = 'gregorian'
902
+ # encoding['units'] = 'seconds since 1970-01-01 00:00:00'
903
+ # encoding['missing_value'] = missing_value_dict['int64']
904
+ # encoding['_FillValue'] = encoding['missing_value']
905
+
906
+ # elif 'calendar' in encoding: # Which means it's not an xr.DataArray
907
+ # encoding['dtype'] = 'int64'
908
+ # if 'units' not in encoding:
909
+ # encoding['units'] = 'seconds since 1970-01-01 00:00:00'
910
+ # encoding['missing_value'] = missing_value_dict['int64']
911
+ # encoding['_FillValue'] = encoding['missing_value']
912
+
913
+ # if 'dtype' not in encoding:
914
+ # if np.issubdtype(dtype, np.floating):
915
+ # # scale, offset = compute_scale_and_offset(min_value, max_value, n)
916
+ # raise ValueError('float dtypes must have encoding data to encode to int.')
917
+ # encoding['dtype'] = dtype.name
918
+ # elif not isinstance(encoding['dtype'], str):
919
+ # encoding['dtype'] = encoding['dtype'].name
920
+
921
+ # if 'scale_factor' in encoding:
922
+ # if not isinstance(encoding['scale_factor'], (int, float, np.number)):
923
+ # raise TypeError('scale_factor must be an int or float.')
924
+
925
+ # if not 'int' in encoding['dtype']:
926
+ # raise ValueError('If scale_factor is assigned, then the dtype must be an integer.')
927
+ # if 'add_offset' not in encoding:
928
+ # encoding['add_offset'] = 0
929
+ # elif not isinstance(encoding['add_offset'], (int, float, np.number)):
930
+ # raise ValueError('add_offset must be a number.')
931
+
932
+ # if 'int' in encoding['dtype']:
933
+ # if ('_FillValue' in encoding) and ('missing_value' not in encoding):
934
+ # encoding['missing_value'] = encoding['_FillValue']
935
+ # if ('_FillValue' not in encoding) and ('missing_value' in encoding):
936
+ # encoding['_FillValue'] = encoding['missing_value']
937
+
938
+ # # if 'missing_value' not in encoding:
939
+ # # encoding['missing_value'] = missing_value_dict[encoding['dtype'].name]
940
+ # # encoding['_FillValue'] = encoding['missing_value']
941
+
942
+ # return encoding
943
+
944
+
945
+ # def assign_dtype_decoded(encoding):
946
+ # """
947
+
948
+ # """
949
+ # if encoding['dtype'] == 'object':
950
+ # encoding['dtype_decoded'] = encoding['dtype']
951
+ # elif ('calendar' in encoding) and ('units' in encoding):
952
+ # encoding['dtype_decoded'] = 'datetime64[s]'
953
+
954
+ # if 'scale_factor' in encoding:
955
+
956
+ # # if isinstance(encoding['scale_factor'], (int, np.integer)):
957
+ # # encoding['dtype_decoded'] = np.dtype('float32')
958
+ # if np.dtype(encoding['dtype']).itemsize > 2:
959
+ # encoding['dtype_decoded'] = 'float64'
960
+ # else:
961
+ # encoding['dtype_decoded'] = 'float32'
962
+
963
+ # if 'dtype_decoded' not in encoding:
964
+ # encoding['dtype_decoded'] = encoding['dtype']
965
+
966
+ # return encoding
967
+
968
+
969
+ def make_var_chunk_key(var_name, chunk_start):
970
+ """
971
+ Build the blt file key for a variable chunk from the variable name and the chunk start indices.
972
+ """
973
+ dims = '.'.join(map(str, chunk_start))
974
+ var_chunk_key = var_chunk_key_str.format(var_name=var_name, dims=dims)
975
+
976
+ return var_chunk_key
977
+
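+ # Illustrative example (not part of the original source): the key combines the variable
+ # name and the chunk start indices.
+ # make_var_chunk_key('temperature', (0, 240)) -> 'temperature!0.240'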
978
+
979
+ # def write_chunk(blt_file, var_name, chunk_start_pos, data_chunk_bytes):
980
+ # """
981
+
982
+ # """
983
+ # dims = '.'.join(map(str, chunk_start_pos))
984
+ # var_chunk_key = var_chunk_key_str.format(var_name=var_name, dims=dims)
985
+
986
+ # # var_name, dims = var_chunk_key.split('!')
987
+ # # chunk_start_pos = tuple(map(int, dims.split('.')))
988
+
989
+ # blt_file[var_chunk_key] = data_chunk_bytes
990
+
991
+
992
+ # def write_init_data(blt_file, var_name, var_meta, data, compressor):
993
+ # """
994
+
995
+ # """
996
+ # dtype_decoded = np.dtype(var_meta.encoding.dtype_decoded)
997
+ # fillvalue = dtype_decoded.type(var_meta.encoding.fillvalue)
998
+ # dtype_encoded = np.dtype(var_meta.encoding.dtype_encoded)
999
+ # add_offset = var_meta.encoding.add_offset
1000
+ # scale_factor = var_meta.encoding.scale_factor
1001
+
1002
+ # mem_arr1 = np.full(var_meta.chunk_shape, fill_value=fillvalue, dtype=dtype_encoded)
1003
+
1004
+ # chunk_iter = rechunker.chunk_range(var_meta.origin, var_meta.shape, var_meta.chunk_shape, clip_ends=True)
1005
+ # for chunk in chunk_iter:
1006
+ # # print(chunk)
1007
+ # mem_arr2 = mem_arr1.copy()
1008
+ # mem_chunk = tuple(slice(0, s.stop - s.start) for s in chunk)
1009
+ # mem_arr2[mem_chunk] = data[chunk]
1010
+
1011
+ # chunk_start_pos = tuple(s.start for s in chunk)
1012
+ # # print(mem_arr2)
1013
+ # data_chunk_bytes = encode_data(mem_arr2, dtype_encoded, fillvalue_encoded, add_offset, scale_factor, compressor)
1014
+
1015
+ # write_chunk(blt_file, var_name, chunk_start_pos, data_chunk_bytes)
1016
+
1017
+
1018
+ # def coord_init(name, shape, chunk_shape, enc, sys_meta, step):
1019
+ # """
1020
+
1021
+ # """
1022
+ # ## Update sys_meta
1023
+ # if name in sys_meta.variables:
1024
+ # raise ValueError(f'Dataset already contains the variable {name}.')
1025
+
1026
+ # var = data_models.Variable(shape=shape, chunk_shape=chunk_shape, origin=(0,), coords=(name,), is_coord=True, encoding=enc, step=step)
1027
+
1028
+ # sys_meta.variables[name] = var
1029
+
1030
+ # # if data is not None:
1031
+ # # write_init_data(blt_file, name, var, data, compressor)
1032
+
1033
+
1034
+ def check_coords(coords, shape, sys_meta):
1035
+ """
1036
+ Check that each coord exists, is a coordinate variable, and that its length matches the corresponding size in shape.
1037
+ """
1038
+ # exist_coords = set(sys_meta.variables.keys())
1039
+ # new_coords = set(coords)
1040
+ # diff_coords = new_coords.difference(exist_coords)
1041
+
1042
+ # if diff_coords:
1043
+ # raise ValueError(f'{diff_coords} does not exist. Create the coord(s) before creating the data variable.')
1044
+
1045
+ if len(coords) != len(shape):
1046
+ raise ValueError(f'The coords length ({len(coords)}) != the shape length ({len(shape)})')
1047
+
1048
+ for coord, size in zip(coords, shape):
1049
+ if coord not in sys_meta.variables:
1050
+ raise ValueError(f'{coord} does not exist. Create the coord before creating the data variable.')
1051
+
1052
+ exist_coord = sys_meta.variables[coord]
1053
+
1054
+ if not exist_coord.is_coord:
1055
+ raise TypeError(f'{coord} must be a coord. This is a data variable.')
1056
+
1057
+ if size != exist_coord.shape[0]:
1058
+ raise ValueError(f'The {coord} shape length ({size}) != existing coord length ({exist_coord.shape[0]})')
1059
+
1060
+
1061
+ # def init_file(file_path: Union[str, pathlib.Path], flag: str = "r", compression='zstd', compression_level=1, **kwargs):
1062
+ # """
1063
+
1064
+ # """
1065
+ # if 'n_buckets' not in kwargs:
1066
+ # kwargs['n_buckets'] = default_n_buckets
1067
+
1068
+ # fp = pathlib.Path(file_path)
1069
+ # fp_exists = fp.exists()
1070
+ # blt = booklet.open(file_path, flag, key_serializer='str', **kwargs)
1071
+ # writable = blt.writable
1072
+ # file_path = fp
1073
+
1074
+ # ## Set/Get system metadata
1075
+ # if not fp_exists or flag in ('n', 'c'):
1076
+ # # Checks
1077
+ # if compression.lower() not in compression_options:
1078
+ # raise ValueError(f'compression must be one of {compression_options}.')
1079
+
1080
+ # sys_meta = data_models.SysMeta(object_type='Dataset', compression=data_models.Compressor(compression), compression_level=compression_level, variables={})
1081
+ # blt.set_metadata(msgspec.to_builtins(sys_meta))
1082
+
1083
+ # else:
1084
+ # sys_meta = msgspec.convert(blt.get_metadata(), data_models.SysMeta)
1085
+
1086
+ # compression = sys_meta.compression.value
1087
+ # compression_level = sys_meta.compression_level
1088
+ # compressor = sc.Compressor(compression, compression_level)
1089
+
1090
+ # finalizers = [weakref.finalize(self, utils.dataset_finalizer, self._blt, self._sys_meta))]
1091
+
1092
+
1093
+ # def data_var_init(name, coords, shape, chunk_shape, enc, sys_meta):
1094
+ # """
1095
+
1096
+ # """
1097
+ # ## Update sys_meta
1098
+ # if name in sys_meta.variables:
1099
+ # raise ValueError(f'Dataset already contains the variable {name}.')
1100
+
1101
+ # var = data_models.Variable(shape=shape, chunk_shape=chunk_shape, origin=0, coords=coords, is_coord=False, encoding=enc)
1102
+
1103
+ # sys_meta.variables[name] = var
1104
+
1105
+
1106
+ # def extend_coords(files, encodings, group):
1107
+ # """
1108
+
1109
+ # """
1110
+ # coords_dict = {}
1111
+
1112
+ # for file1 in files:
1113
+ # with open_file(file1, group) as file:
1114
+ # if isinstance(file, xr.Dataset):
1115
+ # ds_list = list(file.coords)
1116
+ # else:
1117
+ # ds_list = [ds_name for ds_name in file.keys() if is_scale(file[ds_name])]
1118
+
1119
+ # for ds_name in ds_list:
1120
+ # ds = file[ds_name]
1121
+
1122
+ # if isinstance(file, xr.Dataset):
1123
+ # data = encode_data(ds.values, **encodings[ds_name])
1124
+ # else:
1125
+ # if ds.dtype.name == 'object':
1126
+ # data = ds[:].astype(str).astype(h5py.string_dtype())
1127
+ # else:
1128
+ # data = ds[:]
1129
+
1130
+ # # Check for nan values in numeric types
1131
+ # dtype = data.dtype
1132
+ # if np.issubdtype(dtype, np.integer):
1133
+ # nan_value = missing_value_dict[dtype.name]
1134
+ # if nan_value in data:
1135
+ # raise ValueError(f'{ds_name} has nan values. Floats and integers coordinates cannot have nan values. Check the encoding values if the original values are floats.')
1136
+
1137
+ # if ds_name in coords_dict:
1138
+ # coords_dict[ds_name] = np.union1d(coords_dict[ds_name], data)
1139
+ # else:
1140
+ # coords_dict[ds_name] = data
1141
+
1142
+ # return coords_dict
1143
+
1144
+
1145
+ # def index_variables(files, coords_dict, encodings, group):
1146
+ # """
1147
+
1148
+ # """
1149
+ # vars_dict = {}
1150
+ # is_regular_dict = {}
1151
+
1152
+ # for i, file1 in enumerate(files):
1153
+ # with open_file(file1, group) as file:
1154
+ # # if i == 77:
1155
+ # # break
1156
+
1157
+ # if isinstance(file, xr.Dataset):
1158
+ # ds_list = list(file.data_vars)
1159
+ # else:
1160
+ # ds_list = [ds_name for ds_name in file.keys() if not is_scale(file[ds_name])]
1161
+
1162
+ # _ = [is_regular_dict.update({ds_name: True}) for ds_name in ds_list if ds_name not in is_regular_dict]
1163
+
1164
+ # for ds_name in ds_list:
1165
+ # ds = file[ds_name]
1166
+
1167
+ # var_enc = encodings[ds_name]
1168
+
1169
+ # dims = []
1170
+ # global_index = {}
1171
+ # local_index = {}
1172
+ # remove_ds = False
1173
+
1174
+ # for dim in ds.dims:
1175
+ # if isinstance(ds, xr.DataArray):
1176
+ # dim_name = dim
1177
+ # dim_data = encode_data(ds[dim_name].values, **encodings[dim_name])
1178
+ # else:
1179
+ # dim_name = dim[0].name.split('/')[-1]
1180
+ # if dim[0].dtype.name == 'object':
1181
+ # dim_data = dim[0][:].astype(str).astype(h5py.string_dtype())
1182
+ # else:
1183
+ # dim_data = dim[0][:]
1184
+
1185
+ # dims.append(dim_name)
1186
+
1187
+ # # global_arr_index = np.searchsorted(coords_dict[dim_name], dim_data)
1188
+ # # local_arr_index = np.isin(dim_data, coords_dict[dim_name], assume_unique=True).nonzero()[0]
1189
+ # values, global_arr_index, local_arr_index = np.intersect1d(coords_dict[dim_name], dim_data, assume_unique=True, return_indices=True)
1190
+
1191
+ # if len(global_arr_index) > 0:
1192
+
1193
+ # global_index[dim_name] = global_arr_index
1194
+ # local_index[dim_name] = local_arr_index
1195
+
1196
+ # if is_regular_dict[ds_name]:
1197
+ # if (not is_regular_index(global_arr_index)) or (not is_regular_index(local_arr_index)):
1198
+ # is_regular_dict[ds_name] = False
1199
+ # else:
1200
+ # remove_ds = True
1201
+ # break
1202
+
1203
+ # if remove_ds:
1204
+ # if ds_name in vars_dict:
1205
+ # if i in vars_dict[ds_name]['data']:
1206
+ # del vars_dict[ds_name]['data'][i]
1207
+
1208
+ # else:
1209
+ # dict1 = {'dims_order': tuple(i for i in range(len(dims))), 'global_index': global_index, 'local_index': local_index}
1210
+
1211
+ # if ds_name in vars_dict:
1212
+ # if not np.in1d(vars_dict[ds_name]['dims'], dims).all():
1213
+ # raise ValueError('dims are not consistent between the same named dataset: ' + ds_name)
1214
+ # # if vars_dict[ds_name]['dtype'] != ds.dtype:
1215
+ # # raise ValueError('dtypes are not consistent between the same named dataset: ' + ds_name)
1216
+
1217
+ # dims_order = [vars_dict[ds_name]['dims'].index(dim) for dim in dims]
1218
+ # dict1['dims_order'] = tuple(dims_order)
1219
+
1220
+ # vars_dict[ds_name]['data'][i] = dict1
1221
+ # else:
1222
+ # shape = tuple([coords_dict[dim_name].shape[0] for dim_name in dims])
1223
+
1224
+ # if 'missing_value' in var_enc:
1225
+ # fillvalue = var_enc['missing_value']
1226
+ # else:
1227
+ # fillvalue = None
1228
+
1229
+ # vars_dict[ds_name] = {'data': {i: dict1}, 'dims': tuple(dims), 'shape': shape, 'dtype': var_enc['dtype'], 'fillvalue': fillvalue, 'dtype_decoded': var_enc['dtype_decoded']}
1230
+
1231
+ # return vars_dict, is_regular_dict
1232
+
1233
+
1234
+ # def filter_coords(coords_dict, selection, encodings):
1235
+ # """
1236
+
1237
+ # """
1238
+ # for coord, sel in selection.items():
1239
+ # if coord not in coords_dict:
1240
+ # raise ValueError(coord + ' is not one of the coordinates.')
1241
+
1242
+ # coord_data = decode_data(coords_dict[coord], **encodings[coord])
1243
+
1244
+ # if isinstance(sel, slice):
1245
+ # if 'datetime64' in coord_data.dtype.name:
1246
+ # # if not isinstance(sel.start, (str, np.datetime64)):
1247
+ # # raise TypeError('Input for datetime selection should be either a datetime string or np.datetime64.')
1248
+
1249
+ # if sel.start is not None:
1250
+ # start = np.datetime64(sel.start, 's')
1251
+ # else:
1252
+ # start = np.datetime64(coord_data[0] - 1, 's')
1253
+
1254
+ # if sel.stop is not None:
1255
+ # end = np.datetime64(sel.stop, 's')
1256
+ # else:
1257
+ # end = np.datetime64(coord_data[-1] + 1, 's')
1258
+
1259
+ # bool_index = (start <= coord_data) & (coord_data < end)
1260
+ # else:
1261
+ # bool_index = (sel.start <= coord_data) & (coord_data < sel.stop)
1262
+
1263
+ # else:
1264
+ # if isinstance(sel, (int, float)):
1265
+ # sel = [sel]
1266
+
1267
+ # try:
1268
+ # sel1 = np.array(sel)
1269
+ # except:
1270
+ # raise TypeError('selection input could not be coerced to an ndarray.')
1271
+
1272
+ # if sel1.dtype.name == 'bool':
1273
+ # if sel1.shape[0] != coord_data.shape[0]:
1274
+ # raise ValueError('The boolean array does not have the same length as the coord array.')
1275
+ # bool_index = sel1
1276
+ # else:
1277
+ # bool_index = np.in1d(coord_data, sel1)
1278
+
1279
+ # new_coord_data = encode_data(coord_data[bool_index], **encodings[coord])
1280
+
1281
+ # coords_dict[coord] = new_coord_data
1282
+
1283
+
1284
+ # def guess_chunk_shape(shape: Tuple[int, ...], dtype: np.dtype, target_chunk_size: int = 2**21) -> Tuple[int, ...]:
1285
+ # """
1286
+ # Guess an appropriate chunk layout for a dataset, given its shape and
1287
+ # the size of each element in bytes. Will allocate chunks only as large
1288
+ # as target_chunk_size. Chunks are generally close to some power-of-2 fraction of
1289
+ # each axis, slightly favoring bigger values for the last index.
1290
+ # """
1291
+ # ndims = len(shape)
1292
+
1293
+ # if ndims > 0:
1294
+
1295
+ # if not all(isinstance(v, int) for v in shape):
1296
+ # raise TypeError('All values in the shape must be ints.')
1297
+
1298
+ # chunks = np.array(shape, dtype='=f8')
1299
+ # if not np.all(np.isfinite(chunks)):
1300
+ # raise ValueError("Illegal value in chunk tuple")
1301
+
1302
+ # dtype = np.dtype(dtype)
1303
+ # typesize = dtype.itemsize
1304
+
1305
+ # idx = 0
1306
+ # while True:
1307
+ # chunk_bytes = math.prod(chunks)*typesize
1308
+
1309
+ # if (chunk_bytes < target_chunk_size or \
1310
+ # abs(chunk_bytes - target_chunk_size)/target_chunk_size < 0.5):
1311
+ # break
1312
+
1313
+ # if math.prod(chunks) == 1:
1314
+ # break
1315
+
1316
+ # chunks[idx%ndims] = math.ceil(chunks[idx%ndims] / 2.0)
1317
+ # idx += 1
1318
+
1319
+ # return tuple(int(x) for x in chunks)
1320
+ # else:
1321
+ # return None
1322
+
1323
+
1324
+ # def guess_chunk_hdf5(shape, maxshape, dtype, chunk_max=2**21):
1325
+ # """ Guess an appropriate chunk layout for a dataset, given its shape and
1326
+ # the size of each element in bytes. Will allocate chunks only as large
1327
+ # as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of
1328
+ # each axis, slightly favoring bigger values for the last index.
1329
+ # Undocumented and subject to change without warning.
1330
+ # """
1331
+ # ndims = len(shape)
1332
+
1333
+ # if ndims > 0:
1334
+
1335
+ # # For unlimited dimensions we have to guess 1024
1336
+ # shape1 = []
1337
+ # for i, x in enumerate(maxshape):
1338
+ # if x is None:
1339
+ # if shape[i] > 1024:
1340
+ # shape1.append(shape[i])
1341
+ # else:
1342
+ # shape1.append(1024)
1343
+ # else:
1344
+ # shape1.append(x)
1345
+
1346
+ # shape = tuple(shape1)
1347
+
1348
+ # # ndims = len(shape)
1349
+ # # if ndims == 0:
1350
+ # # raise ValueError("Chunks not allowed for scalar datasets.")
1351
+
1352
+ # chunks = np.array(shape, dtype='=f8')
1353
+ # if not np.all(np.isfinite(chunks)):
1354
+ # raise ValueError("Illegal value in chunk tuple")
1355
+
1356
+ # # Determine the optimal chunk size in bytes using a PyTables expression.
1357
+ # # This is kept as a float.
1358
+ # typesize = np.dtype(dtype).itemsize
1359
+ # # dset_size = np.prod(chunks)*typesize
1360
+ # # target_size = CHUNK_BASE * (2**np.log10(dset_size/(1024.*1024)))
1361
+
1362
+ # # if target_size > CHUNK_MAX:
1363
+ # # target_size = CHUNK_MAX
1364
+ # # elif target_size < CHUNK_MIN:
1365
+ # # target_size = CHUNK_MIN
1366
+
1367
+ # target_size = chunk_max
1368
+
1369
+ # idx = 0
1370
+ # while True:
1371
+ # # Repeatedly loop over the axes, dividing them by 2. Stop when:
1372
+ # # 1a. We're smaller than the target chunk size, OR
1373
+ # # 1b. We're within 50% of the target chunk size, AND
1374
+ # # 2. The chunk is smaller than the maximum chunk size
1375
+
1376
+ # chunk_bytes = math.prod(chunks)*typesize
1377
+
1378
+ # if (chunk_bytes < target_size or \
1379
+ # abs(chunk_bytes - target_size)/target_size < 0.5):
1380
+ # break
1381
+
1382
+ # if math.prod(chunks) == 1:
1383
+ # break
1384
+
1385
+ # chunks[idx%ndims] = math.ceil(chunks[idx%ndims] / 2.0)
1386
+ # idx += 1
1387
+
1388
+ # return tuple(int(x) for x in chunks)
1389
+ # else:
1390
+ # return None
1391
+
1392
+
1393
+ # def guess_chunk_time(shape, maxshape, dtype, time_index, chunk_max=3*2**20):
1394
+ # """ Guess an appropriate chunk layout for a dataset, given its shape and
1395
+ # the size of each element in bytes. Will allocate chunks only as large
1396
+ # as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of
1397
+ # each axis, slightly favoring bigger values for the last index.
1398
+ # Undocumented and subject to change without warning.
1399
+ # """
1400
+ # ndims = len(shape)
1401
+
1402
+ # if ndims > 0:
1403
+
1404
+ # # For unlimited dimensions we have to guess 1024
1405
+ # shape1 = []
1406
+ # for i, x in enumerate(maxshape):
1407
+ # if x is None:
1408
+ # if shape[i] > 1024:
1409
+ # shape1.append(shape[i])
1410
+ # else:
1411
+ # shape1.append(1024)
1412
+ # else:
1413
+ # shape1.append(x)
1414
+
1415
+ # shape = tuple(shape1)
1416
+
1417
+ # chunks = np.array(shape, dtype='=f8')
1418
+ # if not np.all(np.isfinite(chunks)):
1419
+ # raise ValueError("Illegal value in chunk tuple")
1420
+
1421
+ # # Determine the optimal chunk size in bytes using a PyTables expression.
1422
+ # # This is kept as a float.
1423
+ # typesize = np.dtype(dtype).itemsize
1424
+
1425
+ # target_size = chunk_max
1426
+
1427
+ # while True:
1428
+ # # Repeatedly loop over the axes, dividing them by 2. Stop when:
1429
+ # # 1a. We're smaller than the target chunk size, OR
1430
+ # # 1b. We're within 50% of the target chunk size, AND
1431
+ # # 2. The chunk is smaller than the maximum chunk size
1432
+
1433
+ # chunk_bytes = math.prod(chunks)*typesize
1434
+
1435
+ # if (chunk_bytes < target_size or \
1436
+ # abs(chunk_bytes - target_size)/target_size < 0.5):
1437
+ # break
1438
+
1439
+ # if chunks[time_index] == 1:
1440
+ # break
1441
+
1442
+ # chunks[time_index] = np.ceil(chunks[time_index] / 2.0)
1443
+
1444
+ # return tuple(int(x) for x in chunks)
1445
+ # else:
1446
+ # return None
1447
+
1448
+
1449
+ def cartesian(arrays, out=None):
1450
+ """
1451
+ Generate a cartesian product of input arrays.
1452
+
1453
+ Parameters
1454
+ ----------
1455
+ arrays : list of array-like
1456
+ 1-D arrays to form the cartesian product of.
1457
+ out : ndarray
1458
+ Array to place the cartesian product in.
1459
+
1460
+ Returns
1461
+ -------
1462
+ out : ndarray
1463
+ 2-D array of shape (M, len(arrays)) containing cartesian products
1464
+ formed of input arrays.
1465
+
1466
+ Examples
1467
+ --------
1468
+ >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))
1469
+ array([[1, 4, 6],
1470
+ [1, 4, 7],
1471
+ [1, 5, 6],
1472
+ [1, 5, 7],
1473
+ [2, 4, 6],
1474
+ [2, 4, 7],
1475
+ [2, 5, 6],
1476
+ [2, 5, 7],
1477
+ [3, 4, 6],
1478
+ [3, 4, 7],
1479
+ [3, 5, 6],
1480
+ [3, 5, 7]])
1481
+
1482
+ """
1483
+
1484
+ arrays = [np.asarray(x) for x in arrays]
1485
+ dtype = arrays[0].dtype
1486
+
1487
+ n = np.prod([x.size for x in arrays])
1488
+ if out is None:
1489
+ out = np.zeros([n, len(arrays)], dtype=dtype)
1490
+
1491
+ m = int(n / arrays[0].size)
1492
+ out[:,0] = np.repeat(arrays[0], m)
1493
+ if arrays[1:]:
1494
+ cartesian(arrays[1:], out=out[0:m, 1:])
1495
+ for j in range(1, arrays[0].size):
1496
+ out[j*m:(j+1)*m, 1:] = out[0:m, 1:]
1497
+
1498
+ return out
1499
+
1500
+
1501
+ # def get_compressor(name: str = None):
1502
+ # """
1503
+
1504
+ # """
1505
+ # if name is None:
1506
+ # compressor = {}
1507
+ # elif name.lower() == 'none':
1508
+ # compressor = {}
1509
+ # elif name.lower() == 'gzip':
1510
+ # compressor = {'compression': name}
1511
+ # elif name.lower() == 'lzf':
1512
+ # compressor = {'compression': name}
1513
+ # elif name.lower() == 'zstd':
1514
+ # compressor = hdf5plugin.Zstd(1)
1515
+ # elif name.lower() == 'lz4':
1516
+ # compressor = hdf5plugin.LZ4()
1517
+ # else:
1518
+ # raise ValueError('name must be one of gzip, lzf, zstd, lz4, or None.')
1519
+
1520
+ # return compressor
1521
+
1522
+
1523
+ # def fill_ds_by_chunks(ds, files, ds_vars, var_name, group, encodings):
1524
+ # """
1525
+
1526
+ # """
1527
+ # dims = ds_vars['dims']
1528
+ # if ds_vars['fillvalue'] is None:
1529
+ # fillvalue = -99
1530
+ # else:
1531
+ # fillvalue = ds_vars['fillvalue']
1532
+
1533
+ # for chunk in ds.iter_chunks():
1534
+ # chunk_size1 = tuple(c.stop - c.start for c in chunk)
1535
+ # chunk_arr = np.full(chunk_size1, fill_value=fillvalue, dtype=ds_vars['dtype'], order='C')
1536
+ # for i_file, data in ds_vars['data'].items():
1537
+ # # if i_file == 9:
1538
+ # # break
1539
+ # g_bool_index = [(chunk[i].start <= data['global_index'][dim]) & (data['global_index'][dim] < chunk[i].stop) for i, dim in enumerate(dims)]
1540
+ # bool1 = all([a.any() for a in g_bool_index])
1541
+ # if bool1:
1542
+ # l_slices = {}
1543
+ # for i, dim in enumerate(dims):
1544
+ # w = g_bool_index[i]
1545
+ # l_index = data['local_index'][dim][w]
1546
+ # if is_regular_index(l_index):
1547
+ # l_slices[dim] = slice(l_index[0], l_index[-1] + 1, None)
1548
+ # else:
1549
+ # l_slices[dim] = l_index
1550
+
1551
+ # if tuple(range(len(dims))) == data['dims_order']:
1552
+ # transpose_order = None
1553
+ # else:
1554
+ # transpose_order = tuple(data['dims_order'].index(i) for i in range(len(data['dims_order'])))
1555
+
1556
+ # with open_file(files[i_file], group) as f:
1557
+ # if isinstance(f, xr.Dataset):
1558
+ # l_data = encode_data(f[var_name][tuple(l_slices.values())].values, **encodings[var_name])
1559
+ # else:
1560
+ # l_data = f[var_name][tuple(l_slices.values())]
1561
+
1562
+ # if transpose_order is not None:
1563
+ # l_data = l_data.transpose(transpose_order)
1564
+
1565
+ # g_chunk_index = []
1566
+ # for i, dim in enumerate(dims):
1567
+ # s1 = data['global_index'][dim][g_bool_index[i]] - chunk[i].start
1568
+ # if is_regular_index(s1):
1569
+ # s1 = slice(s1[0], s1[-1] + 1, None)
1570
+ # g_chunk_index.append(s1)
1571
+ # chunk_arr[tuple(g_chunk_index)] = l_data
1572
+
1573
+ # ## Save chunk to new dataset
1574
+ # ds[chunk] = chunk_arr
1575
+
1576
+
1577
+ # def fill_ds_by_files(ds, files, ds_vars, var_name, group, encodings):
1578
+ # """
1579
+ # Currently the implementation is simple. It loads one entire input file into the ds. It would be nice to chunk the file before loading to handle very large input files.
1580
+ # """
1581
+ # dims = ds_vars['dims']
1582
+ # dtype = ds_vars['dtype']
1583
+
1584
+ # for i_file, data in ds_vars['data'].items():
1585
+ # dims_order = data['dims_order']
1586
+ # g_index_start = tuple(data['global_index'][dim][0] for dim in dims)
1587
+
1588
+ # if tuple(range(len(dims))) == data['dims_order']:
1589
+ # transpose_order = None
1590
+ # else:
1591
+ # transpose_order = tuple(dims_order.index(i) for i in range(len(dims_order)))
1592
+ # g_index_start = tuple(g_index_start[i] for i in dims_order)
1593
+
1594
+ # file_shape = tuple(len(arr) for dim, arr in data['local_index'].items())
1595
+ # chunk_size = guess_chunk(file_shape, file_shape, dtype, 2**27)
1596
+ # chunk_iter = ChunkIterator(chunk_size, file_shape)
1597
+
1598
+ # with open_file(files[i_file], group) as f:
1599
+ # for chunk in chunk_iter:
1600
+ # # g_chunk_slices = []
1601
+ # # l_slices = []
1602
+ # # for dim in dims:
1603
+ # # g_index = data['global_index'][dim]
1604
+ # # g_chunk_slices.append(slice(g_index[0], g_index[-1] + 1, None))
1605
+
1606
+ # # l_index = data['local_index'][dim]
1607
+ # # l_slices.append(slice(l_index[0], l_index[-1] + 1, None))
1608
+
1609
+ # if isinstance(f, xr.Dataset):
1610
+ # l_data = encode_data(f[var_name][chunk].values, **encodings[var_name])
1611
+ # else:
1612
+ # l_data = f[var_name][chunk]
1613
+
1614
+ # if transpose_order is not None:
1615
+ # l_data = l_data.transpose(transpose_order)
1616
+
1617
+ # g_chunk_slices = tuple(slice(g_index_start[i] + s.start, g_index_start[i] + s.stop, 1) for i, s in enumerate(chunk))
1618
+
1619
+ # ds[g_chunk_slices] = l_data
1620
+
1621
+
1622
+ # def get_dtype_shape(data=None, dtype=None, shape=None):
1623
+ # """
1624
+
1625
+ # """
1626
+ # if data is None:
1627
+ # if (shape is None) or (dtype is None):
1628
+ # raise ValueError('shape and dtype must be passed or data must be passed.')
1629
+ # if not isinstance(dtype, str):
1630
+ # dtype = dtype.name
1631
+ # else:
1632
+ # shape = data.shape
1633
+ # dtype = data.dtype.name
1634
+
1635
+ # return dtype, shape
1636
+
1637
+
1638
+ # def is_var_name(name):
1639
+ # """
1640
+
1641
+ # """
1642
+ # res = var_name_pattern.search(name)
1643
+ # if res:
1644
+ # return True
1645
+ # else:
1646
+ # return False
1647
+
1648
+
1649
+ def format_value(value):
1650
+ """
1651
+ Format a scalar for summary display: integers unchanged, floats to two decimal places, other values passed through.
1652
+ """
1653
+ if isinstance(value, (int, np.integer)):
1654
+ return str(value)
1655
+ elif isinstance(value, (float, np.floating)):
1656
+ return f'{value:.2f}'
1657
+ else:
1658
+ return value
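+
+ # Illustrative behaviour (editor's sketch, not in the original source):
+ # format_value(5)        -> '5'
+ # format_value(3.14159)  -> '3.14'
+ # format_value('metres') -> 'metres'  (non-numeric values pass through unchanged)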
1659
+
1660
+
1661
+ def append_summary(summary, summ_dict):
1662
+ """
1663
+ Append key/value pairs to a summary string, padding each key so the values line up at the value_indent column.
1664
+ """
1665
+ for key, value in summ_dict.items():
1666
+ spacing = value_indent - len(key)
1667
+ if spacing < 1:
1668
+ spacing = 1
1669
+
1670
+ summary += f"""\n{key}""" + """ """ * spacing + value
1671
+
1672
+ return summary
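+
+ # Editor's sketch: append_summary('', {'name': 'precip'}) returns '\nname' padded with spaces
+ # out to the value_indent column, followed by 'precip', so all values line up in one column.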
1673
+
1674
+
1675
+ def data_variable_summary(ds):
1676
+ """
1677
+ Build the repr/summary string for a data variable (coordinates and attributes included), or a 'closed' message if the file is closed.
1678
+ """
1679
+ type1 = type(ds)
1680
+
1681
+ if ds:
1682
+ summ_dict = {'name': ds.name, 'dims order': '(' + ', '.join(ds.coord_names) + ')', 'shape': str(ds.shape), 'chunk shape': str(ds.chunk_shape)}
1683
+
1684
+ summary = f"""<cfdb.{type1.__name__}>"""
1685
+
1686
+ summary = append_summary(summary, summ_dict)
1687
+
1688
+ summary += """\nCoordinates:"""
1689
+
1690
+ for coord in ds.coords:
1691
+ coord_name = coord.name
1692
+ dtype_name = coord.dtype_decoded
1693
+ dim_len = coord.shape[0]
1694
+ first_value = format_value(coord.data[0])
1695
+ spacing = value_indent - name_indent - len(coord_name)
1696
+ if spacing < 1:
1697
+ spacing = 1
1698
+ dim_str = f"""\n {coord_name}""" + """ """ * spacing
1699
+ dim_str += f"""({dim_len}) {dtype_name} {first_value} ..."""
1700
+ summary += dim_str
1701
+
1702
+ attrs_summary = make_attrs_repr(ds.attrs, name_indent, value_indent, 'Attributes')
1703
+ summary += """\n""" + attrs_summary
1704
+
1705
+ else:
1706
+ summary = f"""<cfdb.{type1.__name__} is closed>"""
1707
+
1708
+ return summary
1709
+
1710
+
1711
+ def coordinate_summary(ds):
1712
+ """
1713
+ Build the repr/summary string for a coordinate variable, including its first and last values, or a 'closed' message if the file is closed.
1714
+ """
1715
+ type1 = type(ds)
1716
+
1717
+ if ds:
1718
+ name = ds.name
1719
+ # dim_len = ds.ndims
1720
+ # dtype_name = ds.dtype.name
1721
+ # dtype_decoded = ds.encoding['dtype_decoded']
1722
+ data = ds.data
1723
+ if len(data) > 0:
1724
+ first_value = format_value(ds.data[0])
1725
+ last_value = format_value(ds.data[-1])
1726
+ else:
1727
+ first_value = ''
1728
+ last_value = ''
1729
+
1730
+ # summ_dict = {'name': name, 'dtype encoded': dtype_name, 'dtype decoded': dtype_decoded, 'chunk size': str(ds.chunks), 'dim length': str(dim_len), 'values': f"""{first_value} ... {last_value}"""}
1731
+ summ_dict = {'name': name, 'shape': str(ds.shape), 'chunk shape': str(ds.chunk_shape), 'values': f"""{first_value} ... {last_value}"""}
1732
+
1733
+ summary = f"""<cfdb.{type1.__name__}>"""
1734
+
1735
+ summary = append_summary(summary, summ_dict)
1736
+
1737
+ attrs_summary = make_attrs_repr(ds.attrs, name_indent, value_indent, 'Attributes')
1738
+ summary += """\n""" + attrs_summary
1739
+ else:
1740
+ summary = f"""<cfdb.{type1.__name__} is closed>"""
1741
+
1742
+ return summary
1743
+
1744
+
1745
+ def make_attrs_repr(attrs, name_indent, value_indent, header):
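+ """
+ Render a dict of attributes as an aligned, human-readable block under the given header.
+ """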
1746
+ summary = f"""{header}:"""
1747
+ for key, value in attrs.items():
1748
+ spacing = value_indent - name_indent - len(key)
1749
+ if spacing < 1:
1750
+ spacing = 1
1751
+ line_str = f"""\n {key}""" + """ """ * spacing + f"""{value}"""
1752
+ summary += line_str
1753
+
1754
+ return summary
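+
+ # Editor's sketch of the rendered block (illustrative; column widths depend on name_indent/value_indent):
+ # Attributes:
+ #     units           m
+ #     long_name       height above ground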
1755
+
1756
+
1757
+ # def create_h5py_data_variable(file, name: str, dims: (str, tuple, list), shape: (tuple, list), encoding: dict, data=None, **kwargs):
1758
+ # """
1759
+
1760
+ # """
1761
+ # dtype = encoding['dtype']
1762
+
1763
+ # ## Check if dims already exist and if the dim lengths match
1764
+ # if isinstance(dims, str):
1765
+ # dims = [dims]
1766
+
1767
+ # for i, dim in enumerate(dims):
1768
+ # if dim not in file:
1769
+ # raise ValueError(f'{dim} not in File')
1770
+
1771
+ # dim_len = file._file[dim].shape[0]
1772
+ # if dim_len != shape[i]:
1773
+ # raise ValueError(f'{dim} does not have the same length as the input data/shape dim.')
1774
+
1775
+ # ## Make chunks
1776
+ # if 'chunks' not in kwargs:
1777
+ # if 'maxshape' in kwargs:
1778
+ # maxshape = kwargs['maxshape']
1779
+ # else:
1780
+ # maxshape = shape
1781
+ # kwargs.setdefault('chunks', utils.guess_chunk(shape, maxshape, dtype))
1782
+
1783
+ # ## Create variable
1784
+ # if data is None:
1785
+ # ds = file._file.create_dataset(name, shape, dtype=dtype, track_order=True, **kwargs)
1786
+ # else:
1787
+ # ## Encode data before creating variable
1788
+ # data = utils.encode_data(data, **encoding)
1789
+
1790
+ # ds = file._file.create_dataset(name, dtype=dtype, data=data, track_order=True, **kwargs)
1791
+
1792
+ # for i, dim in enumerate(dims):
1793
+ # ds.dims[i].attach_scale(file._file[dim])
1794
+ # ds.dims[i].label = dim
1795
+
1796
+ # return ds
1797
+
1798
+
1799
+ # def create_h5py_coordinate(file, name: str, data, shape: (tuple, list), encoding: dict, **kwargs):
1800
+ # """
1801
+
1802
+ # """
1803
+ # if len(shape) != 1:
1804
+ # raise ValueError('The shape of a coordinate must be 1-D.')
1805
+
1806
+ # dtype = encoding['dtype']
1807
+
1808
+ # ## Make chunks
1809
+ # if 'chunks' not in kwargs:
1810
+ # if 'maxshape' in kwargs:
1811
+ # maxshape = kwargs['maxshape']
1812
+ # else:
1813
+ # maxshape = shape
1814
+ # kwargs.setdefault('chunks', utils.guess_chunk(shape, maxshape, dtype))
1815
+
1816
+ # ## Encode data before creating variable/coordinate
1817
+ # # print(encoding)
1818
+ # data = utils.encode_data(data, **encoding)
1819
+
1820
+ # # print(data)
1821
+ # # print(dtype)
1822
+
1823
+ # ## Make Variable
1824
+ # ds = file._file.create_dataset(name, dtype=dtype, data=data, track_order=True, **kwargs)
1825
+
1826
+ # ds.make_scale(name)
1827
+ # ds.dims[0].label = name
1828
+
1829
+ # return ds
1830
+
1831
+
1832
+ # def copy_data_variable(to_file, from_variable, name, include_data=True, include_attrs=True, **kwargs):
1833
+ # """
1834
+
1835
+ # """
1836
+ # other1 = from_variable._dataset
1837
+ # for k in ('chunks', 'compression',
1838
+ # 'compression_opts', 'scaleoffset', 'shuffle', 'fletcher32',
1839
+ # 'fillvalue'):
1840
+ # kwargs.setdefault(k, getattr(other1, k))
1841
+
1842
+ # if 'compression' in other1.attrs:
1843
+ # compression = other1.attrs['compression']
1844
+ # kwargs.update(**utils.get_compressor(compression))
1845
+ # else:
1846
+ # compression = kwargs['compression']
1847
+
1848
+ # # TODO: more elegant way to pass these (dcpl to create_variable?)
1849
+ # dcpl = other1.id.get_create_plist()
1850
+ # kwargs.setdefault('track_times', dcpl.get_obj_track_times())
1851
+ # # kwargs.setdefault('track_order', dcpl.get_attr_creation_order() > 0)
1852
+
1853
+ # # Special case: the maxshape property always exists, but if we pass it
1854
+ # # to create_variable, the new variable will automatically get chunked
1855
+ # # layout. So we copy it only if it is different from shape.
1856
+ # if other1.maxshape != other1.shape:
1857
+ # kwargs.setdefault('maxshape', other1.maxshape)
1858
+
1859
+ # encoding = from_variable.encoding._encoding.copy()
1860
+ # shape = from_variable.shape
1861
+
1862
+ # ds0 = create_h5py_data_variable(to_file, name, tuple(dim.label for dim in other1.dims), shape, encoding, **kwargs)
1863
+
1864
+ # if include_data:
1865
+ # # Directly copy chunks using write_direct_chunk
1866
+ # for chunk in ds0.iter_chunks():
1867
+ # chunk_starts = tuple(c.start for c in chunk)
1868
+ # filter_mask, data = other1.id.read_direct_chunk(chunk_starts)
1869
+ # ds0.id.write_direct_chunk(chunk_starts, data, filter_mask)
1870
+
1871
+ # ds = DataVariable(ds0, to_file, encoding)
1872
+ # if include_attrs:
1873
+ # ds.attrs.update(from_variable.attrs)
1874
+
1875
+ # return ds
1876
+
1877
+
1878
+ # def copy_coordinate(to_file, from_coordinate, name, include_attrs=True, **kwargs):
1879
+ # """
1880
+
1881
+ # """
1882
+ # other1 = from_coordinate._dataset
1883
+ # for k in ('chunks', 'compression',
1884
+ # 'compression_opts', 'scaleoffset', 'shuffle', 'fletcher32',
1885
+ # 'fillvalue'):
1886
+ # kwargs.setdefault(k, getattr(other1, k))
1887
+
1888
+ # if 'compression' in other1.attrs:
1889
+ # compression = other1.attrs['compression']
1890
+ # kwargs.update(**utils.get_compressor(compression))
1891
+ # else:
1892
+ # compression = kwargs['compression']
1893
+
1894
+ # # TODO: more elegant way to pass these (dcpl to create_variable?)
1895
+ # dcpl = other1.id.get_create_plist()
1896
+ # kwargs.setdefault('track_times', dcpl.get_obj_track_times())
1897
+ # # kwargs.setdefault('track_order', dcpl.get_attr_creation_order() > 0)
1898
+
1899
+ # # Special case: the maxshape property always exists, but if we pass it
1900
+ # # to create_variable, the new variable will automatically get chunked
1901
+ # # layout. So we copy it only if it is different from shape.
1902
+ # if other1.maxshape != other1.shape:
1903
+ # kwargs.setdefault('maxshape', other1.maxshape)
1904
+
1905
+ # encoding = from_coordinate.encoding._encoding.copy()
1906
+ # shape = from_coordinate.shape
1907
+
1908
+ # ds0 = create_h5py_coordinate(to_file, name, from_coordinate.data, shape, encoding, **kwargs)
1909
+
1910
+ # ds = Coordinate(ds0, to_file, encoding)
1911
+ # if include_attrs:
1912
+ # ds.attrs.update(from_coordinate.attrs)
1913
+
1914
+ # return ds
1915
+
1916
+
1917
+ # def prepare_encodings_for_variables(dtype_encoded, dtype_decoded, scale_factor, add_offset, fillvalue, units, calendar):
1918
+ # """
1919
+
1920
+ # """
1921
+ # encoding = {'dtype': dtype_encoded, 'dtype_encoded': dtype_encoded, 'missing_value': fillvalue, '_FillValue': fillvalue, 'add_offset': add_offset, 'scale_factor': scale_factor, 'units': units, 'calendar': calendar}
1922
+ # for key, value in copy.deepcopy(encoding).items():
1923
+ # if value is None:
1924
+ # del encoding[key]
1925
+
1926
+ # if 'datetime64' in dtype_decoded:
1927
+ # if 'units' not in encoding:
1928
+ # encoding['units'] = 'seconds since 1970-01-01'
1929
+ # if 'calendar' not in encoding:
1930
+ # encoding['calendar'] = 'gregorian'
1931
+ # encoding['dtype'] = 'int64'
1932
+
1933
+ # return encoding
1934
+
1935
+
1936
+ def file_summary(ds):
1937
+ """
1938
+ Build the repr/summary string for an open file/dataset: file info, coordinates, data variables, and attributes.
1939
+ """
1940
+ type1 = type(ds)
1941
+
1942
+ if ds:
1943
+ file_path = ds.file_path
1944
+ if file_path.exists() and file_path.is_file():
1945
+ file_size = file_path.stat().st_size*0.000001
1946
+ file_size_str = """{file_size:.1f} MB""".format(file_size=file_size)
1947
+ else:
1948
+ file_size_str = """NA"""
1949
+
1950
+ summ_dict = {'file name': file_path.name, 'file size': file_size_str, 'writable': str(ds.writable)}
1951
+
1952
+ summary = f"""<cfdb.{type1.__name__}>"""
1953
+
1954
+ summary = append_summary(summary, summ_dict)
1955
+
1956
+ summary += """\nCoordinates:"""
1957
+
1958
+ for var in ds.coords:
1959
+ dim_name = var.name
1960
+ dtype_name = var.dtype_decoded
1961
+ dim_len = var.shape[0]
1962
+ first_value = format_value(var.data[0])
1963
+ last_value = format_value(var.data[-1])
1964
+ spacing = value_indent - name_indent - len(dim_name)
1965
+ if spacing < 1:
1966
+ spacing = 1
1967
+ dim_str = f"""\n {dim_name}""" + """ """ * spacing
1968
+ dim_str += f"""({dim_len}) {dtype_name} {first_value} ... {last_value}"""
1969
+ summary += dim_str
1970
+
1971
+ summary += """\nData Variables:"""
1972
+
1973
+ for dv in ds.data_vars:
1974
+ dv_name = dv.name
1975
+ dtype_name = dv.dtype_decoded
1976
+ # shape = dv.shape
1977
+ dims = ', '.join(dv.coord_names)
1978
+ # first_value = format_value(dv[tuple(0 for i in range(len(shape)))])
1979
+ spacing = value_indent - name_indent - len(dv_name)
1980
+ if spacing < 1:
1981
+ spacing = 1
1982
+ ds_str = f"""\n {dv_name}""" + """ """ * spacing
1983
+ ds_str += f"""({dims}) {dtype_name}"""
1984
+ summary += ds_str
1985
+
1986
+ attrs_summary = make_attrs_repr(ds.attrs, name_indent, value_indent, 'Attributes')
1987
+ summary += """\n""" + attrs_summary
1988
+ else:
1989
+ summary = f"""<cfdb.{type1.__name__} is closed>"""
1990
+
1991
+ return summary
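+
+ # Editor's sketch of the overall layout (illustrative names and values only; the header shows
+ # whatever class type(ds) is):
+ # <cfdb.Dataset>
+ # file name           example.cfdb
+ # file size           1.2 MB
+ # writable            False
+ # Coordinates:
+ #     time            (365) datetime64[s] 2020-01-01 ... 2020-12-30
+ # Data Variables:
+ #     precip          (time) float32
+ # Attributes:
+ #     title           example dataset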
1992
+
1993
+
1994
+ def get_var_params(name, kwargs):
1995
+ """
1996
+ Merge the default parameters for a known variable name with user overrides and return (canonical_name, params).
1997
+ """
1998
+ params = deepcopy(default_params[name])
1999
+ params.update(kwargs)
2000
+
2001
+ name = params.pop('name')
2002
+
2003
+ return name, params
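+
+ # Editor's note: the lookup name selects an entry in default_params, the caller's kwargs
+ # override those defaults, and the canonical 'name' entry is popped off and returned separately.
+ # A hypothetical call: var_name, params = get_var_params('height', {'scale_factor': 0.01})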
2004
+
2005
+
2006
+
2007
+
2008
+
2009
+
2010
+
2011
+
2012
+
2013
+
2014
+
2015
+
2016
+
2017
+
2018
+
2019
+
2020
+
2021
+
2022
+
2023
+
2024
+
2025
+
2026
+
2027
+
2028
+
2029
+
2030
+
2031
+
2032
+
2033
+
2034
+
2035
+
2036
+
2037
+
2038
+
2039
+
2040
+
2041
+
2042
+
2043
+
2044
+
2045
+
2046
+
2047
+
2048
+
2049
+
2050
+
2051
+
2052
+
2053
+
2054
+
2055
+
2056
+
2057
+
2058
+
2059
+
2060
+
2061
+
2062
+
2063
+
2064
+
2065
+
2066
+
2067
+
2068
+
2069
+
2070
+
2071
+
2072
+
2073
+
2074
+
2075
+
2076
+
2077
+
2078
+
2079
+