cfdb 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cfdb/__init__.py +6 -0
- cfdb/combine.py +501 -0
- cfdb/core.py +1232 -0
- cfdb/creation.py +345 -0
- cfdb/data_models.py +189 -0
- cfdb/indexers.py +452 -0
- cfdb/main.py +857 -0
- cfdb/support_classes.py +1187 -0
- cfdb/utils.py +2079 -0
- cfdb-0.1.0.dist-info/METADATA +57 -0
- cfdb-0.1.0.dist-info/RECORD +13 -0
- cfdb-0.1.0.dist-info/WHEEL +4 -0
- cfdb-0.1.0.dist-info/licenses/LICENSE +16 -0
cfdb/utils.py
ADDED
@@ -0,0 +1,2079 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 30 19:52:08 2022

@author: mike
"""
import io
import pathlib
# import h5py
import os
import numpy as np
import msgspec
import re
from copy import deepcopy
# import xarray as xr
# from time import time
# from datetime import datetime
import cftime
import math
import rechunkit
from typing import Set, Optional, Dict, Tuple, List, Union, Any
# import zstandard as zstd
# import lz4
import booklet

# import dateutil.parser as dparser
# import numcodecs
# import hdf5plugin

from . import data_models
# import data_models

########################################################
### Parmeters


CHUNK_BASE = 32*1024    # Multiplier by which chunks are adjusted
CHUNK_MIN = 32*1024     # Soft lower limit (32k)
CHUNK_MAX = 3*1024**2   # Hard upper limit (4M)

time_str_conversion = {'days': 'datetime64[D]',
                       'hours': 'datetime64[h]',
                       'minutes': 'datetime64[m]',
                       'seconds': 'datetime64[s]',
                       'milliseconds': 'datetime64[ms]',
                       'microseconds': 'datetime64[us]',
                       'nanoseconds': 'datetime64[ns]'}

# enc_fields = ('units', 'calendar', 'dtype', 'missing_value', '_FillValue', 'add_offset', 'scale_factor', 'dtype_decoded', 'dtype_encoded', 'compression')

fillvalue_dict = {'int8': -128, 'int16': -32768, 'int32': -2147483648, 'int64': -9223372036854775808, 'float32': np.nan, 'float64': np.nan, 'str': ''}

var_chunk_key_str = '{var_name}!{dims}'

attrs_key_str = '_{var_name}.attrs'

name_indent = 4
value_indent = 20
var_name_regex = "^[a-zA-Z][a-zA-Z0-9_]*$"
var_name_pattern = re.compile(var_name_regex)

time_units_dict = {
    'M': 'months',
    'D': 'days',
    'h': 'hours',
    'm': 'minutes',
    's': 'seconds',
    'ms': 'milliseconds',
    'us': 'microseconds',
    'ns': 'nanoseconds',
    }

compression_options = ('zstd', 'lz4')
default_compression_levels = {'zstd': 1, 'lz4': 1}
default_n_buckets = 144013

default_params = {'lon': {'name': 'longitude', 'dtype_encoded': 'int32', 'fillvalue': -2147483648, 'scale_factor': 0.0000001, 'dtype_decoded': 'float32'},
                  'lat': {'name': 'latitude', 'dtype_encoded': 'int32', 'fillvalue': -2147483648, 'scale_factor': 0.0000001, 'dtype_decoded': 'float32'},
                  'height': {'name': 'height', 'dtype_encoded': 'int32', 'fillvalue': -2147483648, 'scale_factor': 0.001, 'dtype_decoded': 'float32'},
                  'altitude': {'name': 'altitude', 'dtype_encoded': 'int32', 'fillvalue': -2147483648, 'scale_factor': 0.001, 'dtype_decoded': 'float32'},
                  'time': {'name': 'time', 'dtype_encoded': 'int64', 'dtype_decoded': 'datetime64[s]'},
                  'modified_date': {'name': 'modified_date', 'dtype_encoded': 'datetime64[us]', 'dtype_decoded': 'datetime64[us]'},
                  'band': {'name': 'band', 'dtype_decoded': 'uint8', 'dtype_encoded': 'uint8', 'fillvalue': 0},
                  # 'chunk_day': {'dtype_encoded': 'int32'},
                  # 'chunk_date': {'fillvalue': -99999999, 'units': "days since 1970-01-01 00:00:00"},
                  'censor_code': {'name': 'censor_code', 'dtype_decoded': 'uint8', 'dtype_encoded': 'uint8', 'fillvalue': 0},
                  # 'bore_top_of_screen': {'dtype_encoded': 'int16', 'fillvalue': 9999, 'scale_factor': 0.1},
                  # 'bore_bottom_of_screen': {'dtype_encoded': 'int16', 'fillvalue': 9999, 'scale_factor': 0.1},
                  # 'bore_depth': {'dtype_encoded': 'int16', 'fillvalue': -9999, 'scale_factor': 0.1},
                  # 'reference_level': {'dtype_encoded': 'int16', 'fillvalue': -9999, 'scale_factor': 1},
                  }

# base_attrs = {'station_id': {'cf_role': "timeseries_id", 'description': 'The unique ID associated with the geometry for a single result.'},
# 'lat': {'standard_name': "latitude", 'units': "degrees_north"},
# 'lon': {'standard_name': "longitude", 'units': "degrees_east"},
# 'altitude': {'standard_name': 'surface_altitude', 'long_name': 'height above the geoid to the lower boundary of the atmosphere', 'units': 'm'},
# 'geometry': {'long_name': 'The hexadecimal encoding of the Well-Known Binary (WKB) geometry', 'crs_EPSG': 4326},
# 'station_geometry': {'long_name': 'The hexadecimal encoding of the Well-Known Binary (WKB) station geometry', 'crs_EPSG': 4326},
# 'height': {'standard_name': 'height', 'long_name': 'vertical distance above the surface', 'units': 'm', 'positive': 'up'},
# 'time': {'standard_name': 'time', 'long_name': 'start_time'}, 'name': {'long_name': 'station name'},
# 'ref': {'long_name': 'station reference id given by the owner'}, 'modified_date': {'long_name': 'last modified date'},
# 'band': {'long_name': 'band number'},
# 'chunk_date': {'long_name': 'chunking date'},
# 'chunk_day': {'long_name': 'chunking day', 'description': 'The chunk day is the number of days after 1970-01-01. Can be negative for days before 1970-01-01 with a minimum of -106751, which is 1677-09-22 (minimum possible date). The maximum value is 106751.'},
# 'chunk_hash': {'long_name': 'chunk hash', 'description': 'The unique hash of the results parameter for comparison purposes.'},
# 'chunk_id': {'long_name': 'chunk id', 'description': 'The unique id of the results chunk associated with the specific station.'},
# 'censor_code': {'long_name': 'data censor code', 'standard_name': 'status_flag', 'flag_values': '0 1 2 3 4 5', 'flag_meanings': 'greater_than less_than not_censored non-detect present_but_not_quantified unknown'},
# 'bore_top_of_screen': {'long_name': 'bore top of screen', 'description': 'The depth to the top of the screen from the reference level.', 'units': 'm', 'positive': 'down'},
# 'bore_bottom_of_screen': {'long_name': 'bore bottom of screen', 'description': 'The depth to the bottom of the screen from the reference level.', 'units': 'm', 'positive': 'down'},
# 'bore_depth': {'long_name': 'bore depth', 'description': 'The depth of the bore from the reference level.', 'units': 'm', 'positive': 'down'},
# 'alt_name': {'long_name': 'Alternative name', 'description': 'The alternative name for the station'},
# 'reference_level': {'long_name': 'The bore reference level', 'description': 'The bore reference level for measurements.', 'units': 'mm', 'positive': 'up'}
# }

default_attrs = dict(
    lat={
        'long_name': 'latitude',
        'units': 'degrees_north',
        'standard_name': 'latitude',
        'axis': 'Y',
        },
    lon={
        'long_name': 'longitude',
        'units': 'degrees_east',
        'standard_name': 'longitude',
        'axis': 'X',
        },
    height={
        'long_name': 'height',
        'units': 'm',
        'standard_name': 'height',
        'positive': 'up',
        'axis': 'Z',
        },
    altitude={
        'long_name': 'altitude',
        'units': 'm',
        'standard_name': 'altitude',
        'positive': 'up',
        'axis': 'Z',
        },
    time={
        'long_name': 'time',
        # 'units': 'seconds since 1970-01-01 00:00:00',
        'standard_name': 'time',
        # 'calendar': 'proleptic_gregorian',
        'axis': 'T',
        },
    )

#########################################################
### Classes




# class ChunkIterator:
# """
# Class to iterate through list of chunks of a given dataset
# """
# def __init__(self, chunks, shape, source_sel=None):
# self._shape = shape
# rank = len(shape)

# # if not dset.chunks:
# # # can only use with chunked datasets
# # raise TypeError("Chunked dataset required")

# self._layout = chunks
# if source_sel is None:
# # select over entire dataset
# slices = []
# for dim in range(rank):
# slices.append(slice(0, self._shape[dim]))
# self._sel = tuple(slices)
# else:
# if isinstance(source_sel, slice):
# self._sel = (source_sel,)
# else:
# self._sel = source_sel
# if len(self._sel) != rank:
# raise ValueError("Invalid selection - selection region must have same rank as dataset")
# self._chunk_index = []
# for dim in range(rank):
# s = self._sel[dim]
# if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start:
# raise ValueError("Invalid selection - selection region must be within dataset space")
# index = s.start // self._layout[dim]
# self._chunk_index.append(index)

# def __iter__(self):
# return self

# def __next__(self):
# rank = len(self._shape)
# slices = []
# if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop:
# # ran past the last chunk, end iteration
# raise StopIteration()

# for dim in range(rank):
# s = self._sel[dim]
# start = self._chunk_index[dim] * self._layout[dim]
# stop = (self._chunk_index[dim] + 1) * self._layout[dim]
# # adjust the start if this is an edge chunk
# if start < s.start:
# start = s.start
# if stop > s.stop:
# stop = s.stop # trim to end of the selection
# s = slice(start, stop, 1)
# slices.append(s)

# # bump up the last index and carry forward if we run outside the selection
# dim = rank - 1
# while dim >= 0:
# s = self._sel[dim]
# self._chunk_index[dim] += 1

# chunk_end = self._chunk_index[dim] * self._layout[dim]
# if chunk_end < s.stop:
# # we still have room to extend along this dimensions
# return tuple(slices)

# if dim > 0:
# # reset to the start and continue iterating with higher dimension
# self._chunk_index[dim] = 0
# dim -= 1
# return tuple(slices)


#########################################################
### Functions


def parse_cf_time_units(dtype_decoded):
    """

    """
    np_time_str = dtype_decoded.str.split('[')[1].split(']')[0]
    time_name = time_units_dict[np_time_str]
    datetime = np.datetime64('1970-01-01', np_time_str)
    units = f'{time_name} since {datetime}'

    return units
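
A quick illustration of the helper above (an editor's sketch, not part of the released file): it derives a CF-style units string from the precision of a NumPy datetime64 dtype.

```python
import numpy as np

# '<M8[s]' -> 's' -> 'seconds' -> 'seconds since 1970-01-01T00:00:00'
parse_cf_time_units(np.dtype('datetime64[s]'))
```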


def min_max_dates_per_bit_len(n_bits):
    """

    """
    n_bits_options = (16, 32, 64)
    if n_bits not in n_bits_options:
        raise ValueError(f'n_bits must be one of {n_bits_options}')

    freq_codes = ('D', 'h', 'm', 's')
    res_dict = {}
    for code in freq_codes:
        int_len = int(2**n_bits*.5)
        min_date = np.datetime64(-int_len + 1, code).astype(str)
        max_date = np.datetime64(int_len - 1, code).astype(str)
        res_dict[code] = (min_date, max_date)

    return res_dict
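
For orientation (editor's sketch, values approximate): with `n_bits=32` the representable window per frequency code is the classic signed 32-bit epoch range, e.g. for seconds:

```python
res = min_max_dates_per_bit_len(32)
# res['s'] is roughly ('1901-12-13T20:45:53', '2038-01-19T03:14:07'),
# i.e. +/- (2**31 - 1) seconds around the 1970-01-01 epoch.
```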


def dataset_finalizer(blt_file, sys_meta):
    """

    """
    old_meta_data = blt_file.get_metadata()
    if old_meta_data is not None:
        old_meta = msgspec.convert(old_meta_data, data_models.SysMeta)
        if old_meta != sys_meta:
            blt_file.set_metadata(msgspec.to_builtins(sys_meta))
    else:
        blt_file.set_metadata(msgspec.to_builtins(sys_meta))

    blt_file.close()


def attrs_finalizer(blt_file, attrs, var_name, writeable):
    """

    """
    if attrs and writeable:
        key = attrs_key_str.format(var_name=var_name)
        old_attrs = blt_file.get(key)
        if old_attrs is not None:
            old_attrs = msgspec.json.decode(old_attrs)
            if old_attrs != attrs:
                blt_file.set(key, msgspec.json.encode(attrs))
        else:
            blt_file.set(key, msgspec.json.encode(attrs))


def compute_scale_and_offset(min_value: Union[int, float, np.number], max_value: Union[int, float, np.number], dtype: Union[np.dtype, str]):
    """
    Computes the scale (slope) and offset for a dataset using a min value, max value, and the required np.dtype. It leaves one value at the lower extreme to use for the nan fillvalue.
    These are the min values set asside for the fillvalue (up to 64 bits).
    int8: -128
    int16: -32768
    int32: -2147483648
    int64: -9223372036854775808

    Unsigned integers are allowed and a value of 0 is set asside for the fillvalue.

    Parameters
    ----------
    min_value : int or float
        The min value of the dataset.
    max_value : int or float
        The max value of the dataset.
    dtype : np.dtype
        The data type that you want to shrink the data down to.

    Returns
    -------
    scale, offset as floats
    """
    if isinstance(dtype, str):
        dtype = np.dtype(dtype)
    bits = dtype.itemsize * 8
    data_range = max_value - min_value
    target_range = 2**bits - 2
    slope = data_range / target_range

    if bits < 64:
        target_min = -(2**(bits - 1) - 1)
    else:
        target_min = -(2**(bits - 1) - 1000)

    # if bits < 64:
    #     target_range = 2**bits - 2
    #     target_min = -(2**(bits - 1) - 1)
    #     slope = data_range / target_range
    # else:
    #     data_power = int(math.log10(data_range))
    #     target_range = 2**bits
    #     target_power = int(math.log10(target_range))
    #     target_min = -10**(target_power - 1)
    #     slope = 10**-(target_power - data_power)

    # Correction if the dtype is unsigned
    if dtype.kind == 'u':
        target_min = 1

    offset = min_value - (slope*target_min)

    return slope, offset
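
A worked example (editor's sketch, not part of the package): packing a 0–100 float range into int16 while keeping -32768 free for the fillvalue.

```python
scale, offset = compute_scale_and_offset(0.0, 100.0, 'int16')
# bits = 16, target_range = 2**16 - 2 = 65534, target_min = -32767
# scale == 100 / 65534 (~0.0015259), offset == 50.0
# decoding: -32767 * scale + offset == 0.0 and 32767 * scale + offset == 100.0
```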


def check_var_name(var_name):
    """
    Function to test if the user-supplied var name is allowed.
    """
    if isinstance(var_name, str):
        if len(var_name) <= 256:
            if var_name_pattern.match(var_name):
                return True
    return False
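
Hypothetical checks against `var_name_regex` defined near the top of the module (editor's sketch):

```python
check_var_name('air_temp')   # True: starts with a letter, only [a-zA-Z0-9_]
check_var_name('2m_temp')    # False: leading digit
check_var_name('air temp')   # False: whitespace is not allowed
```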


def coord_data_step_check(data: np.ndarray, dtype_decoded: np.dtype, step: int | float | bool = False):
    """

    """
    # diff = np.diff(data)
    if isinstance(step, bool):
        diff = np.diff(data)
        if dtype_decoded == 'f':
            step = float(np.round(diff[0], 5))
            if not np.allclose(step, diff):
                raise ValueError('step is set to True, but the data does not seem to be regular.')
            # data = np.linspace(data[0], data[-1], len(diff) + 1, dtype=dtype_decoded)
        else:
            step = int(diff[0])

            if not np.all(np.equal(step, diff)):
                raise ValueError('step is set to True, but the data does not seem to be regular.')
    elif isinstance(step, (float, np.floating)):
        if step <= 0:
            raise ValueError('step must be greater than 0.')
        # if not np.allclose(step, diff):
        #     raise ValueError('step does not seem to be the interval of the data.')
        step = float(round(step, 5))
        # num = round((data[-1] - data[0])/step, 5)
        # if not num.is_integer():
        #     raise ValueError('The step is not a multiple of the difference between the first and last values of the data.')

        # data = np.linspace(data[0], data[-1], int(num) + 1, dtype=dtype_decoded)
    elif isinstance(step, (int, np.integer)):
        if step <= 0:
            raise ValueError('step must be greater than 0.')
        # if not np.all(np.equal(step, diff)):
        #     raise ValueError('step is set to True, but the data does not seem to be regular.')
        step = int(step)

        # data = np.linspace(data[0], data[-1], int(num) + 1, dtype=dtype_decoded)
    else:
        raise TypeError('step must be a bool, int, or float. The int or float must be greater than 0.')

    num = round((data[-1] - data[0])/step, 5)
    if not num.is_integer():
        raise ValueError('The step is not a multiple of the difference between the first and last values of the data.')

    return step, int(num)


def init_coord_data_checks(data: np.ndarray, step: int | float | bool, dtype_decoded, shape):
    """

    """
    # dtype_decoded = data.dtype
    # shape = data.shape

    if len(shape) > 1:
        raise ValueError('Coordinates must be 1D.')

    if len(np.unique(data)) < shape[0]:
        raise ValueError('The data for coords must be unique.')

    if dtype_decoded.kind in ('f', 'u', 'i', 'M'):
        data.sort()
        if step:
            step, num = coord_data_step_check(data, dtype_decoded, step)
            # data = np.linspace(data[0], data[-1], num + 1, dtype=dtype_decoded)
        else:
            step = None
    else:
        step = None

    return step


def append_coord_data_checks(new_data: np.ndarray, source_data: np.ndarray, source_dtype_decoded: np.dtype = None, source_step: int | float | None = None):
    """

    """
    # new_shape = new_data.shape
    # new_dtype_decoded = new_data.dtype
    new_data = np.asarray(new_data, dtype=source_dtype_decoded)

    # if source_dtype_decoded != new_dtype_decoded:
    #     raise TypeError('The data dtype does not match the originally assigned dtype.')

    # print(source_data)

    if source_data.size > 0:
        if source_dtype_decoded.kind != 'U':
            last = source_data[-1]

            if not np.all(last < new_data):
                raise ValueError('Appending requires that all values are greater than the existing values.')

            new_data.sort()
            if source_step:
                new_step, new_num = coord_data_step_check(new_data, source_dtype_decoded, source_step)

                new_data = np.linspace(source_data[0], new_data[-1], len(source_data) + new_num + 1, dtype=source_dtype_decoded)
            else:
                new_data = np.append(source_data, new_data)

        else:
            s1 = set(source_data)
            s1.update(set(new_data))
            if len(s1) != (len(source_data) + len(new_data)):
                raise ValueError('The data for coords must be unique.')

            new_data = np.append(source_data, new_data)

    else:
        _ = init_coord_data_checks(new_data, source_step, source_dtype_decoded, new_data.shape)

    return new_data


def prepend_coord_data_checks(new_data: np.ndarray, source_data: np.ndarray, source_dtype_decoded: np.dtype = None, source_step: int | float | None = None):
    """

    """
    # new_shape = new_data.shape
    # new_dtype_decoded = new_data.dtype
    new_data = np.asarray(new_data, dtype=source_dtype_decoded)

    # if source_dtype_decoded != new_dtype_decoded:
    #     raise TypeError('The data dtype does not match the originally assigned dtype.')

    if source_data.size > 0:
        if source_dtype_decoded.kind != 'U':
            first = source_data[0]

            if not np.all(first > new_data):
                raise ValueError('Prepending requires that all values are less than the existing values.')

            new_data.sort()
            if source_step:
                new_step, new_num = coord_data_step_check(new_data, source_dtype_decoded, source_step)

                new_data = np.linspace(new_step[0], source_data[-1], len(source_data) + new_num + 1, dtype=source_dtype_decoded)
            else:
                new_data = np.append(new_data, source_data)
        else:
            s1 = set(source_data)
            s1.update(set(new_data))
            if len(s1) != (len(source_data) + len(new_data)):
                raise ValueError('The data for coords must be unique.')

            new_data = np.append(new_data, source_data)

    else:
        new_data, new_step = init_coord_data_checks(new_data, source_step, source_dtype_decoded, new_data.shape)

    return new_data


# def coord_data_checks(data: np.ndarray, step: int | float | bool = False, source_data: np.ndarray = None, source_dtype_decoded: np.dtype = None, source_step: int | float | None = None):
# """

# """
# shape = data.shape
# dtype_decoded = data.dtype

# if dtype_decoded is not None:
# if source_dtype_decoded != dtype_decoded:
# raise TypeError('The data dtype does not match the originally assigned dtype.')

# if source_data:
# s1 = set(source_data)
# s1.update(set(data))
# if len(s1) != (len(source_data) + len(data)):
# raise ValueError('The data for coords must be unique.')
# elif len(np.unique(data)) < shape[0]:
# raise ValueError('The data for coords must be unique.')

# if dtype_decoded.kind in ('f', 'u', 'i', 'M'):
# data.sort()
# if step:
# diff = np.diff(data)
# if isinstance(step, bool):
# if dtype_decoded == 'f':
# step = float(np.round(diff[0], 5))
# if not np.allclose(step, diff):
# raise ValueError('step is set to True, but the data does not seem to be regular.')
# # data = np.linspace(data[0], data[-1], len(diff) + 1, dtype=dtype_decoded)
# else:
# step = int(diff[0])

# if not np.all(np.equal(step, diff)):
# raise ValueError('step is set to True, but the data does not seem to be regular.')
# elif isinstance(step, (float, np.floating)):
# if step <= 0:
# raise ValueError('step must be greater than 0.')
# # if not np.allclose(step, diff):
# # raise ValueError('step does not seem to be the interval of the data.')
# step = float(round(step, 5))
# num = round((data[-1] - data[0])/step, 5)
# if not num.is_integer():
# raise ValueError('The step is not a multiple of the dirrefernce between the first and last values of the data.')

# data = np.linspace(data[0], data[-1], int(num) + 1, dtype=dtype_decoded)
# elif isinstance(step, (int, np.integer)):
# if step <= 0:
# raise ValueError('step must be greater than 0.')
# # if not np.all(np.equal(step, diff)):
# # raise ValueError('step is set to True, but the data does not seem to be regular.')
# step = int(step)
# num = round((data[-1] - data[0])/step, 5)
# if not num.is_integer():
# raise ValueError('The step is not a multiple of the dirrefernce between the first and last values of the data.')

# data = np.linspace(data[0], data[-1], int(num) + 1, dtype=dtype_decoded)
# else:
# raise TypeError('step must be a bool, int, or float. The int or float must be greater than 0.')
# else:
# step = None
# else:
# step = None

# return data, step


def parse_dtypes(dtype_decoded, dtype_encoded):
    """

    """
    dtype_decoded = np.dtype(dtype_decoded)

    # if dtype_decoded.kind == 'M':
    #     dtype_encoded = np.dtype('int64')

    if isinstance(dtype_encoded, str):
        dtype_encoded = np.dtype(dtype_encoded)

    elif not isinstance(dtype_encoded, np.dtype):
        dtype_encoded = dtype_decoded

    return dtype_decoded, dtype_encoded


def parse_dtype_names(dtype_decoded, dtype_encoded):
    """

    """
    if dtype_encoded.kind == 'U':
        dtype_decoded_name = dtype_decoded.str
        dtype_encoded_name = dtype_encoded.str
    else:
        dtype_decoded_name = dtype_decoded.name
        dtype_encoded_name = dtype_encoded.name

    return dtype_decoded_name, dtype_encoded_name


def parse_fillvalue(fillvalue, dtype_encoded):
    """

    """
    ## Fillvalue
    kind = dtype_encoded.kind
    if fillvalue is not None:
        fillvalue_dtype = np.dtype(type(fillvalue))

        if kind == 'u' and fillvalue_dtype.kind == 'i':
            if fillvalue < 0:
                raise ValueError('The dtype_encoded is an unsigned integer, but the fillvalue is < 0.')
        elif fillvalue_dtype.kind != kind:
            raise ValueError('The fillvalue dtype is not the same as the dtype_encoded dtype.')
    else:
        if kind == 'u':
            fillvalue = 0
        elif kind == 'f':
            fillvalue = None
        elif kind == 'U':
            fillvalue = ''
        elif kind == 'i':
            fillvalue = fillvalue_dict[dtype_encoded.name]
        elif kind == 'M':
            fillvalue = None
        else:
            raise TypeError('Unknown/unsupported data type.')

    return fillvalue


def parse_scale_offset(scale_factor, add_offset, dtype_decoded):
    """

    """
    ## Scale and offset
    if scale_factor is None and isinstance(add_offset, (int, float, np.number)):
        scale_factor = 1
    # if isinstance(scale_factor, (int, float, np.number)) and add_offset is None:
    #     add_offset = 0

    if isinstance(scale_factor, (int, float, np.number)) and dtype_decoded.kind != 'f':
        raise ValueError('scale_factor and add_offset only apply to floats.')

    return scale_factor, add_offset


def parse_coord_inputs(name: str, data: np.ndarray | None = None, chunk_shape: Tuple[int] | None = None, dtype_decoded: str | np.dtype | None = None, dtype_encoded: str | np.dtype | None = None, fillvalue: Union[int, float, str] = None, scale_factor: Union[float, int, None] = None, add_offset: Union[float, int, None] = None, step: int | float | bool = False):
    """

    """
    ## Check var name
    if not check_var_name(name):
        raise ValueError(f'{name} is not a valid variable name.')

    ## Check data, shape, dtype, and step
    if isinstance(data, np.ndarray):
        dtype_decoded = data.dtype

        step = init_coord_data_checks(data, step, dtype_decoded, data.shape)

        # if dtype_decoded.kind == 'M':
        #     dtype_encoded = dtype_decoded

        ## dtype encoding
        dtype_decoded, dtype_encoded = parse_dtypes(dtype_decoded, dtype_encoded)

    else:
        ## dtype encoding
        dtype_decoded, dtype_encoded = parse_dtypes(dtype_decoded, dtype_encoded)

        if dtype_decoded.kind in ('u', 'i') and isinstance(step, (float, np.floating)):
            if not step.is_integer():
                raise ValueError('If the dtype_decoded is an integer, then step must be an integer.')
            else:
                step = int(step)
        elif isinstance(step, bool):
            if step:
                raise TypeError('If data is not passed, then step cannot be set to True')
            else:
                step = None
        elif isinstance(step, np.floating):
            step = float(round(step, 5))
        else:
            raise TypeError('step must be a bool, int, or float. The int or float must be greater than 0.')

    ## Guess the chunk_shape from the dtype
    if isinstance(chunk_shape, tuple):
        if not all([isinstance(c, int) for c in chunk_shape]):
            raise TypeError('chunk_shape must be a tuple of ints.')
    elif chunk_shape is None:
        chunk_shape = rechunkit.guess_chunk_shape((1000000,), dtype_encoded, 2**20)
    else:
        raise TypeError('chunk_shape must be either a tuple of ints or None.')

    ## fillvalue
    fillvalue = parse_fillvalue(fillvalue, dtype_encoded)

    ## Scale and offset
    scale_factor, add_offset = parse_scale_offset(scale_factor, add_offset, dtype_decoded)

    ## Save metadata
    dtype_decoded_name, dtype_encoded_name = parse_dtype_names(dtype_decoded, dtype_encoded)

    # enc = data_models.Encoding(dtype_encoded=dtype_encoded_name, dtype_decoded=dtype_decoded_name, fillvalue=fillvalue, scale_factor=scale_factor, add_offset=add_offset)

    var = data_models.CoordinateVariable(shape=(0,), chunk_shape=chunk_shape, origin=0, step=step, dtype_encoded=dtype_encoded_name, dtype_decoded=dtype_decoded_name, fillvalue=fillvalue, scale_factor=scale_factor, add_offset=add_offset)

    return name, var


def parse_var_inputs(sys_meta: data_models.SysMeta, name: str, coords: Tuple[str,...], dtype_decoded: str | np.dtype, dtype_encoded: str | np.dtype | None = None, chunk_shape: Tuple[int] | None = None, fillvalue: Union[int, float, str] = None, scale_factor: Union[float, int, None] = None, add_offset: Union[float, int, None] = None):
    """
    Function to process the inputs to a variable creation function.
    """
    ## Check var name
    if not check_var_name(name):
        raise ValueError(f'{name} is not a valid variable name.')

    if name in sys_meta.variables:
        raise ValueError(f"Dataset already contains the variable {name}.")

    ## Check shape and dtype
    if len(coords) == 0:
        raise ValueError('coords must have at least one value.')

    shape = []
    for coord_name in coords:
        if not isinstance(coord_name, str):
            raise TypeError('coords must contain strings of the coordinate names.')
        if coord_name not in sys_meta.variables:
            raise ValueError(f'{coord_name} not in the list of coordinates.')
        else:
            coord = sys_meta.variables[coord_name]
            shape.append(coord.shape[0])

    ## dtypes
    dtype_decoded, dtype_encoded = parse_dtypes(dtype_decoded, dtype_encoded)

    ## Guess the chunk_shape from the dtype
    if isinstance(chunk_shape, tuple):
        if not all([isinstance(c, int) for c in chunk_shape]):
            raise TypeError('chunk_shape must be a tuple of ints.')
    elif chunk_shape is None:
        chunk_shape = rechunkit.guess_chunk_shape(shape, dtype_encoded, 2**21)
    else:
        raise TypeError('chunk_shape must be either a tuple of ints or None.')

    ## fillvalue
    fillvalue = parse_fillvalue(fillvalue, dtype_encoded)

    ## Scale and offset
    scale_factor, add_offset = parse_scale_offset(scale_factor, add_offset, dtype_decoded)

    ## Save metadata
    dtype_decoded_name, dtype_encoded_name = parse_dtype_names(dtype_decoded, dtype_encoded)

    # enc = data_models.Encoding(dtype_encoded=dtype_encoded_name, dtype_decoded=dtype_decoded_name, fillvalue=fillvalue, scale_factor=scale_factor, add_offset=add_offset)

    var = data_models.DataVariable(coords=tuple(coords), chunk_shape=chunk_shape, dtype_encoded=dtype_encoded_name, dtype_decoded=dtype_decoded_name, fillvalue=fillvalue, scale_factor=scale_factor, add_offset=add_offset)

    return name, var


# def encode_datetime(data, units=None, calendar='gregorian'):
# """

# """
# if units is None:
# output = data.astype('datetime64[s]').astype('int64')
# else:
# if '1970-01-01' in units:
# time_unit = units.split()[0]
# output = data.astype(time_str_conversion[time_unit]).astype('int64')
# else:
# output = cftime.date2num(data.astype('datetime64[s]').tolist(), units, calendar)

# return output


def decode_datetime(data, units=None, calendar='gregorian'):
    """

    """
    if units is None:
        output = data.astype('datetime64[s]')
    else:
        if '1970-01-01' in units:
            time_unit = units.split()[0]
            output = data.astype(time_str_conversion[time_unit])
        else:
            output = cftime.num2pydate(data, units, calendar).astype('datetime64[s]')

    return output
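
An illustrative call (editor's sketch): integer day counts with an epoch-anchored units string go through the fast numpy path rather than cftime.

```python
ints = np.array([0, 1, 18262], dtype='int64')
decode_datetime(ints, units='days since 1970-01-01 00:00:00')
# -> array(['1970-01-01', '1970-01-02', '2020-01-01'], dtype='datetime64[D]')
```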


# def encode_data(data, dtype_encoded, fillvalue, add_offset, scale_factor, compressor) -> bytes:
# """

# """
# if 'datetime64' in data.dtype.name:
# data = data.astype('int64')

# elif isinstance(scale_factor, (int, float, np.number)):
# # precision = int(np.abs(np.log10(val['scale_factor'])))
# data = np.round((data - add_offset)/scale_factor)

# if isinstance(fillvalue, (int, np.number)):
# data[np.isnan(data)] = fillvalue

# # if (data.dtype.name != dtype_encoded) or (data.dtype.name == 'object'):
# # data = data.astype(dtype_encoded)
# # print(data)
# data = data.astype(dtype_encoded)

# return compressor.compress(data.tobytes())


# def decode_data(data: bytes, dtype_encoded, dtype_decoded, missing_value, add_offset=0, scale_factor=None, units=None, calendar=None, **kwargs) -> np.ndarray:
# """

# """
# data = np.frombuffer(data, dtype=dtype_encoded)

# if isinstance(calendar, str):
# data = decode_datetime(data, units, calendar)

# elif isinstance(scale_factor, (int, float, np.number)):
# data = data.astype(dtype_decoded)

# if isinstance(missing_value, (int, np.number)):
# if isinstance(data, np.number):
# if data == missing_value:
# data = np.nan
# else:
# data[data == missing_value] = np.nan

# data = (data * scale_factor) + add_offset

# elif (data.dtype.name != dtype_decoded) or (data.dtype.name == 'object'):
# data = data.astype(dtype_decoded)

# return data


# def get_encoding_data_from_attrs(attrs):
# """

# """
# encoding = {}
# for f, v in attrs.items():
# if f in enc_fields:
# if isinstance(v, bytes):
# encoding[f] = v.decode()
# elif isinstance(v, np.ndarray):
# if len(v) == 1:
# encoding[f] = v[0]
# else:
# raise ValueError('encoding is an ndarray with len > 1.')
# else:
# encoding[f] = v

# return encoding


# def get_encoding_data_from_xr(data):
# """

# """
# attrs = {f: v for f, v in data.attrs.items() if (f in enc_fields) and (f not in ignore_attrs)}
# encoding = {f: v for f, v in data.encoding.items() if (f in enc_fields) and (f not in ignore_attrs)}

# attrs.update(encoding)

# return attrs


# def process_encoding(encoding, dtype):
# """

# """
# if (dtype.name == 'object') or ('str' in dtype.name):
# # encoding['dtype'] = h5py.string_dtype()
# encoding['dtype'] = 'object'
# elif ('datetime64' in dtype.name): # which means it's an xr.DataArray
# encoding['dtype'] = 'int64'
# encoding['calendar'] = 'gregorian'
# encoding['units'] = 'seconds since 1970-01-01 00:00:00'
# encoding['missing_value'] = missing_value_dict['int64']
# encoding['_FillValue'] = encoding['missing_value']

# elif 'calendar' in encoding: # Which means it's not an xr.DataArray
# encoding['dtype'] = 'int64'
# if 'units' not in encoding:
# encoding['units'] = 'seconds since 1970-01-01 00:00:00'
# encoding['missing_value'] = missing_value_dict['int64']
# encoding['_FillValue'] = encoding['missing_value']

# if 'dtype' not in encoding:
# if np.issubdtype(dtype, np.floating):
# # scale, offset = compute_scale_and_offset(min_value, max_value, n)
# raise ValueError('float dtypes must have encoding data to encode to int.')
# encoding['dtype'] = dtype.name
# elif not isinstance(encoding['dtype'], str):
# encoding['dtype'] = encoding['dtype'].name

# if 'scale_factor' in encoding:
# if not isinstance(encoding['scale_factor'], (int, float, np.number)):
# raise TypeError('scale_factor must be an int or float.')

# if not 'int' in encoding['dtype']:
# raise ValueError('If scale_factor is assigned, then the dtype must be an integer.')
# if 'add_offset' not in encoding:
# encoding['add_offset'] = 0
# elif not isinstance(encoding['add_offset'], (int, float, np.number)):
# raise ValueError('add_offset must be a number.')

# if 'int' in encoding['dtype']:
# if ('_FillValue' in encoding) and ('missing_value' not in encoding):
# encoding['missing_value'] = encoding['_FillValue']
# if ('_FillValue' not in encoding) and ('missing_value' in encoding):
# encoding['_FillValue'] = encoding['missing_value']

# # if 'missing_value' not in encoding:
# # encoding['missing_value'] = missing_value_dict[encoding['dtype'].name]
# # encoding['_FillValue'] = encoding['missing_value']

# return encoding


# def assign_dtype_decoded(encoding):
# """

# """
# if encoding['dtype'] == 'object':
# encoding['dtype_decoded'] = encoding['dtype']
# elif ('calendar' in encoding) and ('units' in encoding):
# encoding['dtype_decoded'] = 'datetime64[s]'

# if 'scale_factor' in encoding:

# # if isinstance(encoding['scale_factor'], (int, np.integer)):
# # encoding['dtype_decoded'] = np.dtype('float32')
# if np.dtype(encoding['dtype']).itemsize > 2:
# encoding['dtype_decoded'] = 'float64'
# else:
# encoding['dtype_decoded'] = 'float32'

# if 'dtype_decoded' not in encoding:
# encoding['dtype_decoded'] = encoding['dtype']

# return encoding


def make_var_chunk_key(var_name, chunk_start):
    """

    """
    dims = '.'.join(map(str, chunk_start))
    var_chunk_key = var_chunk_key_str.format(var_name=var_name, dims=dims)

    return var_chunk_key
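
The key layout follows `var_chunk_key_str` defined near the top of the module (editor's sketch):

```python
make_var_chunk_key('precip', (0, 100, 2000))
# -> 'precip!0.100.2000'  (variable name, '!', then dot-joined chunk start indices)
```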


# def write_chunk(blt_file, var_name, chunk_start_pos, data_chunk_bytes):
# """

# """
# dims = '.'.join(map(str, chunk_start_pos))
# var_chunk_key = var_chunk_key_str.format(var_name=var_name, dims=dims)

# # var_name, dims = var_chunk_key.split('!')
# # chunk_start_pos = tuple(map(int, dims.split('.')))

# blt_file[var_chunk_key] = data_chunk_bytes


# def write_init_data(blt_file, var_name, var_meta, data, compressor):
# """

# """
# dtype_decoded = np.dtype(var_meta.encoding.dtype_decoded)
# fillvalue = dtype_decoded.type(var_meta.encoding.fillvalue)
# dtype_encoded = np.dtype(var_meta.encoding.dtype_encoded)
# add_offset = var_meta.encoding.add_offset
# scale_factor = var_meta.encoding.scale_factor

# mem_arr1 = np.full(var_meta.chunk_shape, fill_value=fillvalue, dtype=dtype_encoded)

# chunk_iter = rechunker.chunk_range(var_meta.origin, var_meta.shape, var_meta.chunk_shape, clip_ends=True)
# for chunk in chunk_iter:
# # print(chunk)
# mem_arr2 = mem_arr1.copy()
# mem_chunk = tuple(slice(0, s.stop - s.start) for s in chunk)
# mem_arr2[mem_chunk] = data[chunk]

# chunk_start_pos = tuple(s.start for s in chunk)
# # print(mem_arr2)
# data_chunk_bytes = encode_data(mem_arr2, dtype_encoded, fillvalue_encoded, add_offset, scale_factor, compressor)

# write_chunk(blt_file, var_name, chunk_start_pos, data_chunk_bytes)


# def coord_init(name, shape, chunk_shape, enc, sys_meta, step):
# """

# """
# ## Update sys_meta
# if name in sys_meta.variables:
# raise ValueError(f'Dataset already contains the variable {name}.')

# var = data_models.Variable(shape=shape, chunk_shape=chunk_shape, origin=(0,), coords=(name,), is_coord=True, encoding=enc, step=step)

# sys_meta.variables[name] = var

# # if data is not None:
# # write_init_data(blt_file, name, var, data, compressor)


def check_coords(coords, shape, sys_meta):
    """

    """
    # exist_coords = set(sys_meta.variables.keys())
    # new_coords = set(coords)
    # diff_coords = new_coords.difference(exist_coords)

    # if diff_coords:
    #     raise ValueError(f'{diff_coords} does not exist. Create the coord(s) before creating the data variable.')

    if len(coords) != len(shape):
        raise ValueError(f'The coords length ({len(coords)}) != the shape length ({len(shape)})')

    for coord, size in zip(coords, shape):
        if coord not in sys_meta.variables:
            raise ValueError(f'{coord} does not exist. Create the coord before creating the data variable.')

        exist_coord = sys_meta.variables[coord]

        if not exist_coord.is_coord:
            raise TypeError(f'{coord} must be a coord. This is a data variable.')

        if size != exist_coord.shape[0]:
            raise ValueError(f'The {coord} shape length ({size}) != existing coord length ({exist_coord.shape[0]})')


# def init_file(file_path: Union[str, pathlib.Path], flag: str = "r", compression='zstd', compression_level=1, **kwargs):
# """

# """
# if 'n_buckets' not in kwargs:
# kwargs['n_buckets'] = default_n_buckets

# fp = pathlib.Path(file_path)
# fp_exists = fp.exists()
# blt = booklet.open(file_path, flag, key_serializer='str', **kwargs)
# writable = blt.writable
# file_path = fp

# ## Set/Get system metadata
# if not fp_exists or flag in ('n', 'c'):
# # Checks
# if compression.lower() not in compression_options:
# raise ValueError(f'compression must be one of {compression_options}.')

# sys_meta = data_models.SysMeta(object_type='Dataset', compression=data_models.Compressor(compression), compression_level=compression_level, variables={})
# blt.set_metadata(msgspec.to_builtins(sys_meta))

# else:
# sys_meta = msgspec.convert(blt.get_metadata(), data_models.SysMeta)

# compression = sys_meta.compression.value
# compression_level = sys_meta.compression_level
# compressor = sc.Compressor(compression, compression_level)

# finalizers = [weakref.finalize(self, utils.dataset_finalizer, self._blt, self._sys_meta))]


# def data_var_init(name, coords, shape, chunk_shape, enc, sys_meta):
# """

# """
# ## Update sys_meta
# if name in sys_meta.variables:
# raise ValueError(f'Dataset already contains the variable {name}.')

# var = data_models.Variable(shape=shape, chunk_shape=chunk_shape, origin=0, coords=coords, is_coord=False, encoding=enc)

# sys_meta.variables[name] = var


# def extend_coords(files, encodings, group):
# """

# """
# coords_dict = {}

# for file1 in files:
# with open_file(file1, group) as file:
# if isinstance(file, xr.Dataset):
# ds_list = list(file.coords)
# else:
# ds_list = [ds_name for ds_name in file.keys() if is_scale(file[ds_name])]

# for ds_name in ds_list:
# ds = file[ds_name]

# if isinstance(file, xr.Dataset):
# data = encode_data(ds.values, **encodings[ds_name])
# else:
# if ds.dtype.name == 'object':
# data = ds[:].astype(str).astype(h5py.string_dtype())
# else:
# data = ds[:]

# # Check for nan values in numeric types
# dtype = data.dtype
# if np.issubdtype(dtype, np.integer):
# nan_value = missing_value_dict[dtype.name]
# if nan_value in data:
# raise ValueError(f'{ds_name} has nan values. Floats and integers coordinates cannot have nan values. Check the encoding values if the original values are floats.')

# if ds_name in coords_dict:
# coords_dict[ds_name] = np.union1d(coords_dict[ds_name], data)
# else:
# coords_dict[ds_name] = data

# return coords_dict


# def index_variables(files, coords_dict, encodings, group):
# """

# """
# vars_dict = {}
# is_regular_dict = {}

# for i, file1 in enumerate(files):
# with open_file(file1, group) as file:
# # if i == 77:
# # break

# if isinstance(file, xr.Dataset):
# ds_list = list(file.data_vars)
# else:
# ds_list = [ds_name for ds_name in file.keys() if not is_scale(file[ds_name])]

# _ = [is_regular_dict.update({ds_name: True}) for ds_name in ds_list if ds_name not in is_regular_dict]

# for ds_name in ds_list:
# ds = file[ds_name]

# var_enc = encodings[ds_name]

# dims = []
# global_index = {}
# local_index = {}
# remove_ds = False

# for dim in ds.dims:
# if isinstance(ds, xr.DataArray):
# dim_name = dim
# dim_data = encode_data(ds[dim_name].values, **encodings[dim_name])
# else:
# dim_name = dim[0].name.split('/')[-1]
# if dim[0].dtype.name == 'object':
# dim_data = dim[0][:].astype(str).astype(h5py.string_dtype())
# else:
# dim_data = dim[0][:]

# dims.append(dim_name)

# # global_arr_index = np.searchsorted(coords_dict[dim_name], dim_data)
# # local_arr_index = np.isin(dim_data, coords_dict[dim_name], assume_unique=True).nonzero()[0]
# values, global_arr_index, local_arr_index = np.intersect1d(coords_dict[dim_name], dim_data, assume_unique=True, return_indices=True)

# if len(global_arr_index) > 0:

# global_index[dim_name] = global_arr_index
# local_index[dim_name] = local_arr_index

# if is_regular_dict[ds_name]:
# if (not is_regular_index(global_arr_index)) or (not is_regular_index(local_arr_index)):
# is_regular_dict[ds_name] = False
# else:
# remove_ds = True
# break

# if remove_ds:
# if ds_name in vars_dict:
# if i in vars_dict[ds_name]['data']:
# del vars_dict[ds_name]['data'][i]

# else:
# dict1 = {'dims_order': tuple(i for i in range(len(dims))), 'global_index': global_index, 'local_index': local_index}

# if ds_name in vars_dict:
# if not np.in1d(vars_dict[ds_name]['dims'], dims).all():
# raise ValueError('dims are not consistant between the same named dataset: ' + ds_name)
# # if vars_dict[ds_name]['dtype'] != ds.dtype:
# # raise ValueError('dtypes are not consistant between the same named dataset: ' + ds_name)

# dims_order = [vars_dict[ds_name]['dims'].index(dim) for dim in dims]
# dict1['dims_order'] = tuple(dims_order)

# vars_dict[ds_name]['data'][i] = dict1
# else:
# shape = tuple([coords_dict[dim_name].shape[0] for dim_name in dims])

# if 'missing_value' in var_enc:
# fillvalue = var_enc['missing_value']
# else:
# fillvalue = None

# vars_dict[ds_name] = {'data': {i: dict1}, 'dims': tuple(dims), 'shape': shape, 'dtype': var_enc['dtype'], 'fillvalue': fillvalue, 'dtype_decoded': var_enc['dtype_decoded']}

# return vars_dict, is_regular_dict


# def filter_coords(coords_dict, selection, encodings):
# """

# """
# for coord, sel in selection.items():
# if coord not in coords_dict:
# raise ValueError(coord + ' one of the coordinates.')

# coord_data = decode_data(coords_dict[coord], **encodings[coord])

# if isinstance(sel, slice):
# if 'datetime64' in coord_data.dtype.name:
# # if not isinstance(sel.start, (str, np.datetime64)):
# # raise TypeError('Input for datetime selection should be either a datetime string or np.datetime64.')

# if sel.start is not None:
# start = np.datetime64(sel.start, 's')
# else:
# start = np.datetime64(coord_data[0] - 1, 's')

# if sel.stop is not None:
# end = np.datetime64(sel.stop, 's')
# else:
# end = np.datetime64(coord_data[-1] + 1, 's')

# bool_index = (start <= coord_data) & (coord_data < end)
# else:
# bool_index = (sel.start <= coord_data) & (coord_data < sel.stop)

# else:
# if isinstance(sel, (int, float)):
# sel = [sel]

# try:
# sel1 = np.array(sel)
# except:
# raise TypeError('selection input could not be coerced to an ndarray.')

# if sel1.dtype.name == 'bool':
# if sel1.shape[0] != coord_data.shape[0]:
# raise ValueError('The boolean array does not have the same length as the coord array.')
# bool_index = sel1
# else:
# bool_index = np.in1d(coord_data, sel1)

# new_coord_data = encode_data(coord_data[bool_index], **encodings[coord])

# coords_dict[coord] = new_coord_data


# def guess_chunk_shape(shape: Tuple[int, ...], dtype: np.dtype, target_chunk_size: int = 2**21) -> Tuple[int, ...]:
# """
# Guess an appropriate chunk layout for a dataset, given its shape and
# the size of each element in bytes. Will allocate chunks only as large
# as target_chunk_size. Chunks are generally close to some power-of-2 fraction of
# each axis, slightly favoring bigger values for the last index.
# """
# ndims = len(shape)

# if ndims > 0:

# if not all(isinstance(v, int) for v in shape):
# raise TypeError('All values in the shape must be ints.')

# chunks = np.array(shape, dtype='=f8')
# if not np.all(np.isfinite(chunks)):
# raise ValueError("Illegal value in chunk tuple")

# dtype = np.dtype(dtype)
# typesize = dtype.itemsize

# idx = 0
# while True:
# chunk_bytes = math.prod(chunks)*typesize

# if (chunk_bytes < target_chunk_size or \
# abs(chunk_bytes - target_chunk_size)/target_chunk_size < 0.5):
# break

# if math.prod(chunks) == 1:
# break

# chunks[idx%ndims] = math.ceil(chunks[idx%ndims] / 2.0)
# idx += 1

# return tuple(int(x) for x in chunks)
# else:
# return None


# def guess_chunk_hdf5(shape, maxshape, dtype, chunk_max=2**21):
# """ Guess an appropriate chunk layout for a dataset, given its shape and
# the size of each element in bytes. Will allocate chunks only as large
# as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of
# each axis, slightly favoring bigger values for the last index.
# Undocumented and subject to change without warning.
# """
# ndims = len(shape)

# if ndims > 0:

# # For unlimited dimensions we have to guess 1024
# shape1 = []
# for i, x in enumerate(maxshape):
# if x is None:
# if shape[i] > 1024:
# shape1.append(shape[i])
# else:
# shape1.append(1024)
# else:
# shape1.append(x)

# shape = tuple(shape1)

# # ndims = len(shape)
# # if ndims == 0:
# # raise ValueError("Chunks not allowed for scalar datasets.")

# chunks = np.array(shape, dtype='=f8')
# if not np.all(np.isfinite(chunks)):
# raise ValueError("Illegal value in chunk tuple")

# # Determine the optimal chunk size in bytes using a PyTables expression.
# # This is kept as a float.
# typesize = np.dtype(dtype).itemsize
# # dset_size = np.prod(chunks)*typesize
# # target_size = CHUNK_BASE * (2**np.log10(dset_size/(1024.*1024)))

# # if target_size > CHUNK_MAX:
# # target_size = CHUNK_MAX
# # elif target_size < CHUNK_MIN:
# # target_size = CHUNK_MIN

# target_size = chunk_max

# idx = 0
# while True:
# # Repeatedly loop over the axes, dividing them by 2. Stop when:
# # 1a. We're smaller than the target chunk size, OR
# # 1b. We're within 50% of the target chunk size, AND
# # 2. The chunk is smaller than the maximum chunk size

# chunk_bytes = math.prod(chunks)*typesize

# if (chunk_bytes < target_size or \
# abs(chunk_bytes - target_size)/target_size < 0.5):
# break

# if math.prod(chunks) == 1:
# break

# chunks[idx%ndims] = math.ceil(chunks[idx%ndims] / 2.0)
# idx += 1

# return tuple(int(x) for x in chunks)
# else:
# return None


# def guess_chunk_time(shape, maxshape, dtype, time_index, chunk_max=3*2**20):
# """ Guess an appropriate chunk layout for a dataset, given its shape and
# the size of each element in bytes. Will allocate chunks only as large
# as MAX_SIZE. Chunks are generally close to some power-of-2 fraction of
# each axis, slightly favoring bigger values for the last index.
# Undocumented and subject to change without warning.
# """
# ndims = len(shape)

# if ndims > 0:

# # For unlimited dimensions we have to guess 1024
# shape1 = []
# for i, x in enumerate(maxshape):
# if x is None:
# if shape[i] > 1024:
# shape1.append(shape[i])
# else:
# shape1.append(1024)
# else:
# shape1.append(x)

# shape = tuple(shape1)

# chunks = np.array(shape, dtype='=f8')
# if not np.all(np.isfinite(chunks)):
# raise ValueError("Illegal value in chunk tuple")

# # Determine the optimal chunk size in bytes using a PyTables expression.
# # This is kept as a float.
# typesize = np.dtype(dtype).itemsize

# target_size = chunk_max

# while True:
# # Repeatedly loop over the axes, dividing them by 2. Stop when:
# # 1a. We're smaller than the target chunk size, OR
# # 1b. We're within 50% of the target chunk size, AND
# # 2. The chunk is smaller than the maximum chunk size

# chunk_bytes = math.prod(chunks)*typesize

# if (chunk_bytes < target_size or \
# abs(chunk_bytes - target_size)/target_size < 0.5):
# break

# if chunks[time_index] == 1:
# break

# chunks[time_index] = np.ceil(chunks[time_index] / 2.0)

# return tuple(int(x) for x in chunks)
# else:
1446
|
+
# return None
|
1447
|
+
|
1448
|
+
|
1449
|
+
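# Illustrative sketch (not part of the original helpers) of how the halving
# heuristic above behaves: axes are halved round-robin until the chunk byte size
# drops below, or within 50% of, the target. For example, if guess_chunk_shape
# were re-enabled and called as
# guess_chunk_shape((365, 720, 1440), 'float32', target_chunk_size=2**21),
# the axes would shrink 365->183->92->46, 720->360->180->90, 1440->720->360->180
# and the loop would stop at (46, 90, 180): 745,200 elements * 4 bytes ~= 2.98 MB,
# which is within 50% of the 2 MiB target.
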
def cartesian(arrays, out=None):
    """
    Generate a cartesian product of input arrays.

    Parameters
    ----------
    arrays : list of array-like
        1-D arrays to form the cartesian product of.
    out : ndarray
        Array to place the cartesian product in.

    Returns
    -------
    out : ndarray
        2-D array of shape (M, len(arrays)) containing cartesian products
        formed of input arrays.

    Examples
    --------
    >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))
    array([[1, 4, 6],
           [1, 4, 7],
           [1, 5, 6],
           [1, 5, 7],
           [2, 4, 6],
           [2, 4, 7],
           [2, 5, 6],
           [2, 5, 7],
           [3, 4, 6],
           [3, 4, 7],
           [3, 5, 6],
           [3, 5, 7]])

    """
    arrays = [np.asarray(x) for x in arrays]
    dtype = arrays[0].dtype

    n = np.prod([x.size for x in arrays])
    if out is None:
        out = np.zeros([n, len(arrays)], dtype=dtype)

    m = int(n / arrays[0].size)
    out[:,0] = np.repeat(arrays[0], m)
    if arrays[1:]:
        cartesian(arrays[1:], out=out[0:m, 1:])
        for j in range(1, arrays[0].size):
            out[j*m:(j+1)*m, 1:] = out[0:m, 1:]

    return out

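# Note on cartesian(): the first column is filled by repeating each element of
# arrays[0] m times (m = number of rows per leading element); the product of the
# remaining arrays is built recursively into the first m rows and then copied into
# each subsequent block of m rows. The output dtype follows arrays[0], so
# mixed-type inputs are coerced to that dtype.
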
# def get_compressor(name: str = None):
#     """

#     """
#     if name is None:
#         compressor = {}
#     elif name.lower() == 'none':
#         compressor = {}
#     elif name.lower() == 'gzip':
#         compressor = {'compression': name}
#     elif name.lower() == 'lzf':
#         compressor = {'compression': name}
#     elif name.lower() == 'zstd':
#         compressor = hdf5plugin.Zstd(1)
#     elif name.lower() == 'lz4':
#         compressor = hdf5plugin.LZ4()
#     else:
#         raise ValueError('name must be one of gzip, lzf, zstd, lz4, or None.')

#     return compressor


# def fill_ds_by_chunks(ds, files, ds_vars, var_name, group, encodings):
#     """

#     """
#     dims = ds_vars['dims']
#     if ds_vars['fillvalue'] is None:
#         fillvalue = -99
#     else:
#         fillvalue = ds_vars['fillvalue']

#     for chunk in ds.iter_chunks():
#         chunk_size1 = tuple(c.stop - c.start for c in chunk)
#         chunk_arr = np.full(chunk_size1, fill_value=fillvalue, dtype=ds_vars['dtype'], order='C')
#         for i_file, data in ds_vars['data'].items():
#             # if i_file == 9:
#             #     break
#             g_bool_index = [(chunk[i].start <= data['global_index'][dim]) & (data['global_index'][dim] < chunk[i].stop) for i, dim in enumerate(dims)]
#             bool1 = all([a.any() for a in g_bool_index])
#             if bool1:
#                 l_slices = {}
#                 for i, dim in enumerate(dims):
#                     w = g_bool_index[i]
#                     l_index = data['local_index'][dim][w]
#                     if is_regular_index(l_index):
#                         l_slices[dim] = slice(l_index[0], l_index[-1] + 1, None)
#                     else:
#                         l_slices[dim] = l_index

#                 if tuple(range(len(dims))) == data['dims_order']:
#                     transpose_order = None
#                 else:
#                     transpose_order = tuple(data['dims_order'].index(i) for i in range(len(data['dims_order'])))

#                 with open_file(files[i_file], group) as f:
#                     if isinstance(f, xr.Dataset):
#                         l_data = encode_data(f[var_name][tuple(l_slices.values())].values, **encodings[var_name])
#                     else:
#                         l_data = f[var_name][tuple(l_slices.values())]

#                     if transpose_order is not None:
#                         l_data = l_data.transpose(transpose_order)

#                     g_chunk_index = []
#                     for i, dim in enumerate(dims):
#                         s1 = data['global_index'][dim][g_bool_index[i]] - chunk[i].start
#                         if is_regular_index(s1):
#                             s1 = slice(s1[0], s1[-1] + 1, None)
#                         g_chunk_index.append(s1)
#                     chunk_arr[tuple(g_chunk_index)] = l_data

#         ## Save chunk to new dataset
#         ds[chunk] = chunk_arr


# def fill_ds_by_files(ds, files, ds_vars, var_name, group, encodings):
#     """
#     Currently the implementation is simple. It loads one entire input file into the ds. It would be nice to chunk the file before loading to handle very large input files.
#     """
#     dims = ds_vars['dims']
#     dtype = ds_vars['dtype']

#     for i_file, data in ds_vars['data'].items():
#         dims_order = data['dims_order']
#         g_index_start = tuple(data['global_index'][dim][0] for dim in dims)

#         if tuple(range(len(dims))) == data['dims_order']:
#             transpose_order = None
#         else:
#             transpose_order = tuple(dims_order.index(i) for i in range(len(dims_order)))
#             g_index_start = tuple(g_index_start[i] for i in dims_order)

#         file_shape = tuple(len(arr) for dim, arr in data['local_index'].items())
#         chunk_size = guess_chunk(file_shape, file_shape, dtype, 2**27)
#         chunk_iter = ChunkIterator(chunk_size, file_shape)

#         with open_file(files[i_file], group) as f:
#             for chunk in chunk_iter:
#                 # g_chunk_slices = []
#                 # l_slices = []
#                 # for dim in dims:
#                 #     g_index = data['global_index'][dim]
#                 #     g_chunk_slices.append(slice(g_index[0], g_index[-1] + 1, None))

#                 #     l_index = data['local_index'][dim]
#                 #     l_slices.append(slice(l_index[0], l_index[-1] + 1, None))

#                 if isinstance(f, xr.Dataset):
#                     l_data = encode_data(f[var_name][chunk].values, **encodings[var_name])
#                 else:
#                     l_data = f[var_name][chunk]

#                 if transpose_order is not None:
#                     l_data = l_data.transpose(transpose_order)

#                 g_chunk_slices = tuple(slice(g_index_start[i] + s.start, g_index_start[i] + s.stop, 1) for i, s in enumerate(chunk))

#                 ds[g_chunk_slices] = l_data


# def get_dtype_shape(data=None, dtype=None, shape=None):
#     """

#     """
#     if data is None:
#         if (shape is None) or (dtype is None):
#             raise ValueError('shape and dtype must be passed or data must be passed.')
#         if not isinstance(dtype, str):
#             dtype = dtype.name
#     else:
#         shape = data.shape
#         dtype = data.dtype.name

#     return dtype, shape


# def is_var_name(name):
#     """

#     """
#     res = var_name_pattern.search(name)
#     if res:
#         return True
#     else:
#         return False

def format_value(value):
    """
    Format a scalar value for display in the summary reprs.
    """
    if isinstance(value, (int, np.integer)):
        return str(value)
    elif isinstance(value, (float, np.floating)):
        return f'{value:.2f}'
    else:
        return value

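# Example (illustrative): format_value(2) -> '2', format_value(2.71828) -> '2.72';
# anything else (e.g. a numpy datetime64 or a string) is returned unchanged.
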
def append_summary(summary, summ_dict):
    """
    Append key/value lines to a summary string, aligning the values at value_indent.
    """
    for key, value in summ_dict.items():
        spacing = value_indent - len(key)
        if spacing < 1:
            spacing = 1

        summary += f"""\n{key}""" + """ """ * spacing + value

    return summary

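# Example (illustrative): with value_indent = 20, appending {'file name': 'test.cfdb'}
# adds a new line containing 'file name', then 20 - len('file name') = 11 spaces,
# then 'test.cfdb', so the values of successive keys line up in one column.
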
def data_variable_summary(ds):
    """
    Build the summary repr for a data variable.
    """
    type1 = type(ds)

    if ds:
        summ_dict = {'name': ds.name, 'dims order': '(' + ', '.join(ds.coord_names) + ')', 'shape': str(ds.shape), 'chunk size': str(ds.chunk_shape)}

        summary = f"""<cfdb.{type1.__name__}>"""

        summary = append_summary(summary, summ_dict)

        summary += """\nCoordinates:"""

        for coord in ds.coords:
            coord_name = coord.name
            dtype_name = coord.dtype_decoded
            dim_len = coord.shape[0]
            first_value = format_value(coord.data[0])
            spacing = value_indent - name_indent - len(coord_name)
            if spacing < 1:
                spacing = 1
            dim_str = f"""\n    {coord_name}""" + """ """ * spacing
            dim_str += f"""({dim_len}) {dtype_name} {first_value} ..."""
            summary += dim_str

        attrs_summary = make_attrs_repr(ds.attrs, name_indent, value_indent, 'Attributes')
        summary += """\n""" + attrs_summary

    else:
        summary = f"""<cfdb.{type1.__name__} is closed>"""

    return summary

def coordinate_summary(ds):
    """
    Build the summary repr for a coordinate.
    """
    type1 = type(ds)

    if ds:
        name = ds.name
        # dim_len = ds.ndims
        # dtype_name = ds.dtype.name
        # dtype_decoded = ds.encoding['dtype_decoded']
        data = ds.data
        if len(data) > 0:
            first_value = format_value(ds.data[0])
            last_value = format_value(ds.data[-1])
        else:
            first_value = ''
            last_value = ''

        # summ_dict = {'name': name, 'dtype encoded': dtype_name, 'dtype decoded': dtype_decoded, 'chunk size': str(ds.chunks), 'dim length': str(dim_len), 'values': f"""{first_value} ... {last_value}"""}
        summ_dict = {'name': name, 'shape': str(ds.shape), 'chunk shape': str(ds.chunk_shape), 'values': f"""{first_value} ... {last_value}"""}

        summary = f"""<cfdb.{type1.__name__}>"""

        summary = append_summary(summary, summ_dict)

        attrs_summary = make_attrs_repr(ds.attrs, name_indent, value_indent, 'Attributes')
        summary += """\n""" + attrs_summary
    else:
        summary = f"""<cfdb.{type1.__name__} is closed>"""

    return summary

def make_attrs_repr(attrs, name_indent, value_indent, header):
    summary = f"""{header}:"""
    for key, value in attrs.items():
        spacing = value_indent - name_indent - len(key)
        if spacing < 1:
            spacing = 1
        line_str = f"""\n    {key}""" + """ """ * spacing + f"""{value}"""
        summary += line_str

    return summary

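# Example (illustrative): make_attrs_repr({'units': 'degC'}, 4, 20, 'Attributes')
# returns 'Attributes:' followed by a new line with a 4-space indent, the key,
# 20 - 4 - len('units') = 11 spaces of padding, and then the value.
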
# def create_h5py_data_variable(file, name: str, dims: (str, tuple, list), shape: (tuple, list), encoding: dict, data=None, **kwargs):
#     """

#     """
#     dtype = encoding['dtype']

#     ## Check if dims already exist and if the dim lengths match
#     if isinstance(dims, str):
#         dims = [dims]

#     for i, dim in enumerate(dims):
#         if dim not in file:
#             raise ValueError(f'{dim} not in File')

#         dim_len = file._file[dim].shape[0]
#         if dim_len != shape[i]:
#             raise ValueError(f'{dim} does not have the same length as the input data/shape dim.')

#     ## Make chunks
#     if 'chunks' not in kwargs:
#         if 'maxshape' in kwargs:
#             maxshape = kwargs['maxshape']
#         else:
#             maxshape = shape
#         kwargs.setdefault('chunks', utils.guess_chunk(shape, maxshape, dtype))

#     ## Create variable
#     if data is None:
#         ds = file._file.create_dataset(name, shape, dtype=dtype, track_order=True, **kwargs)
#     else:
#         ## Encode data before creating variable
#         data = utils.encode_data(data, **encoding)

#         ds = file._file.create_dataset(name, dtype=dtype, data=data, track_order=True, **kwargs)

#     for i, dim in enumerate(dims):
#         ds.dims[i].attach_scale(file._file[dim])
#         ds.dims[i].label = dim

#     return ds


# def create_h5py_coordinate(file, name: str, data, shape: (tuple, list), encoding: dict, **kwargs):
#     """

#     """
#     if len(shape) != 1:
#         raise ValueError('The shape of a coordinate must be 1-D.')

#     dtype = encoding['dtype']

#     ## Make chunks
#     if 'chunks' not in kwargs:
#         if 'maxshape' in kwargs:
#             maxshape = kwargs['maxshape']
#         else:
#             maxshape = shape
#         kwargs.setdefault('chunks', utils.guess_chunk(shape, maxshape, dtype))

#     ## Encode data before creating variable/coordinate
#     # print(encoding)
#     data = utils.encode_data(data, **encoding)

#     # print(data)
#     # print(dtype)

#     ## Make Variable
#     ds = file._file.create_dataset(name, dtype=dtype, data=data, track_order=True, **kwargs)

#     ds.make_scale(name)
#     ds.dims[0].label = name

#     return ds


# def copy_data_variable(to_file, from_variable, name, include_data=True, include_attrs=True, **kwargs):
#     """

#     """
#     other1 = from_variable._dataset
#     for k in ('chunks', 'compression',
#               'compression_opts', 'scaleoffset', 'shuffle', 'fletcher32',
#               'fillvalue'):
#         kwargs.setdefault(k, getattr(other1, k))

#     if 'compression' in other1.attrs:
#         compression = other1.attrs['compression']
#         kwargs.update(**utils.get_compressor(compression))
#     else:
#         compression = kwargs['compression']

#     # TODO: more elegant way to pass these (dcpl to create_variable?)
#     dcpl = other1.id.get_create_plist()
#     kwargs.setdefault('track_times', dcpl.get_obj_track_times())
#     # kwargs.setdefault('track_order', dcpl.get_attr_creation_order() > 0)

#     # Special case: the maxshape property always exists, but if we pass it
#     # to create_variable, the new variable will automatically get chunked
#     # layout. So we copy it only if it is different from shape.
#     if other1.maxshape != other1.shape:
#         kwargs.setdefault('maxshape', other1.maxshape)

#     encoding = from_variable.encoding._encoding.copy()
#     shape = from_variable.shape

#     ds0 = create_h5py_data_variable(to_file, name, tuple(dim.label for dim in other1.dims), shape, encoding, **kwargs)

#     if include_data:
#         # Directly copy chunks using write_direct_chunk
#         for chunk in ds0.iter_chunks():
#             chunk_starts = tuple(c.start for c in chunk)
#             filter_mask, data = other1.id.read_direct_chunk(chunk_starts)
#             ds0.id.write_direct_chunk(chunk_starts, data, filter_mask)

#     ds = DataVariable(ds0, to_file, encoding)
#     if include_attrs:
#         ds.attrs.update(from_variable.attrs)

#     return ds


# def copy_coordinate(to_file, from_coordinate, name, include_attrs=True, **kwargs):
#     """

#     """
#     other1 = from_coordinate._dataset
#     for k in ('chunks', 'compression',
#               'compression_opts', 'scaleoffset', 'shuffle', 'fletcher32',
#               'fillvalue'):
#         kwargs.setdefault(k, getattr(other1, k))

#     if 'compression' in other1.attrs:
#         compression = other1.attrs['compression']
#         kwargs.update(**utils.get_compressor(compression))
#     else:
#         compression = kwargs['compression']

#     # TODO: more elegant way to pass these (dcpl to create_variable?)
#     dcpl = other1.id.get_create_plist()
#     kwargs.setdefault('track_times', dcpl.get_obj_track_times())
#     # kwargs.setdefault('track_order', dcpl.get_attr_creation_order() > 0)

#     # Special case: the maxshape property always exists, but if we pass it
#     # to create_variable, the new variable will automatically get chunked
#     # layout. So we copy it only if it is different from shape.
#     if other1.maxshape != other1.shape:
#         kwargs.setdefault('maxshape', other1.maxshape)

#     encoding = from_coordinate.encoding._encoding.copy()
#     shape = from_coordinate.shape

#     ds0 = create_h5py_coordinate(to_file, name, from_coordinate.data, shape, encoding, **kwargs)

#     ds = Coordinate(ds0, to_file, encoding)
#     if include_attrs:
#         ds.attrs.update(from_coordinate.attrs)

#     return ds


# def prepare_encodings_for_variables(dtype_encoded, dtype_decoded, scale_factor, add_offset, fillvalue, units, calendar):
#     """

#     """
#     encoding = {'dtype': dtype_encoded, 'dtype_encoded': dtype_encoded, 'missing_value': fillvalue, '_FillValue': fillvalue, 'add_offset': add_offset, 'scale_factor': scale_factor, 'units': units, 'calendar': calendar}
#     for key, value in copy.deepcopy(encoding).items():
#         if value is None:
#             del encoding[key]

#     if 'datetime64' in dtype_decoded:
#         if 'units' not in encoding:
#             encoding['units'] = 'seconds since 1970-01-01'
#         if 'calendar' not in encoding:
#             encoding['calendar'] = 'gregorian'
#         encoding['dtype'] = 'int64'

#     return encoding

def file_summary(ds):
    """
    Build the summary repr for a dataset file.
    """
    type1 = type(ds)

    if ds:
        file_path = ds.file_path
        if file_path.exists() and file_path.is_file():
            file_size = file_path.stat().st_size*0.000001
            file_size_str = """{file_size:.1f} MB""".format(file_size=file_size)
        else:
            file_size_str = """NA"""

        summ_dict = {'file name': file_path.name, 'file size': file_size_str, 'writable': str(ds.writable)}

        summary = f"""<cfdb.{type1.__name__}>"""

        summary = append_summary(summary, summ_dict)

        summary += """\nCoordinates:"""

        for var in ds.coords:
            dim_name = var.name
            dtype_name = var.dtype_decoded
            dim_len = var.shape[0]
            first_value = format_value(var.data[0])
            last_value = format_value(var.data[-1])
            spacing = value_indent - name_indent - len(dim_name)
            if spacing < 1:
                spacing = 1
            dim_str = f"""\n    {dim_name}""" + """ """ * spacing
            dim_str += f"""({dim_len}) {dtype_name} {first_value} ... {last_value}"""
            summary += dim_str

        summary += """\nData Variables:"""

        for dv in ds.data_vars:
            dv_name = dv.name
            dtype_name = dv.dtype_decoded
            # shape = dv.shape
            dims = ', '.join(dv.coord_names)
            # first_value = format_value(dv[tuple(0 for i in range(len(shape)))])
            spacing = value_indent - name_indent - len(dv_name)
            if spacing < 1:
                spacing = 1
            ds_str = f"""\n    {dv_name}""" + """ """ * spacing
            ds_str += f"""({dims}) {dtype_name}"""
            summary += ds_str

        attrs_summary = make_attrs_repr(ds.attrs, name_indent, value_indent, 'Attributes')
        summary += """\n""" + attrs_summary
    else:
        summary = f"""<cfdb.{type1.__name__} is closed>"""

    return summary

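# Roughly what file_summary() produces for a small file (hypothetical names and
# values, spacing approximate):
#
# <cfdb.Dataset>
# file name           test.cfdb
# file size           0.1 MB
# writable            False
# Coordinates:
#     time            (24) datetime64[s] 2020-01-01T00:00:00 ...
# Data Variables:
#     temperature     (time) float32
# Attributes:
#     title           example dataset
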
def get_var_params(name, kwargs):
    """
    Merge user kwargs over the default parameters for a known variable name.
    """
    params = deepcopy(default_params[name])
    params.update(kwargs)

    name = params.pop('name')

    return name, params

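# Example (illustrative): get_var_params('lat', {'scale_factor': 0.001}) copies
# default_params['lat'], overlays the kwargs, pops 'name', and returns
# ('latitude', {'dtype_encoded': 'int32', 'fillvalue': -2147483648,
#               'scale_factor': 0.001, 'dtype_decoded': 'float32'}).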