cfdb 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1187 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Created on Wed Feb 19 14:05:23 2025
+
+ @author: mike
+ """
+ import numpy as np
+ import weakref
+ import msgspec
+ import lz4.frame
+ import zstandard as zstd
+ import math
+ from typing import Set, Optional, Dict, Tuple, List, Union, Any, Iterable
+ from copy import deepcopy
+ import rechunkit
+
+ from . import utils, indexers
+ # import utils, indexers
+
+ ###################################################
+ ### Parameters
+
+ attrs_key = '_{var_name}.attrs'
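+ # For example, attrs_key.format(var_name='temp') yields '_temp.attrs',
+ # the key under which the 'temp' variable's attributes are stored.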
+
+ ###################################################
+ ### Classes
+
+
+ class Categorical:
+     """
+     This class and dtype should be similar to the pandas categorical dtype. Preferably, all string arrays should be cat dtypes. In the CF conventions, this is equivalent to `flags <https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html#flags>`_. The CF conventions of assigning the attrs flag_values and flag_meanings should be used for compatibility.
+     As in the CF conventions, two Python lists can be used: one of ints in increasing order from 0 as the index, and the other of the string values. The string values have no sorted order; they are assigned the int index in the order they are added.
+     This class should replace the fixed-length numpy unicode class for data variables.
+     At the moment, I don't want to implement this until I've got the rest of the package implemented.
+     """
+     # TODO
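+     # Sketch of the CF flags convention this class would follow (the
+     # values here are hypothetical):
+     #   flag_values = [0, 1, 2]
+     #   flag_meanings = ['clear', 'cloudy', 'missing']
+     # A string array ['cloudy', 'clear'] would then be stored as the ints [1, 0].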
+
+
+ class Rechunker:
+     """
+     Helper for rechunking a Variable into a different chunk shape.
+     """
+     def __init__(self, var):
+         """
+
+         """
+         self._var = var
+
+
+     def guess_chunk_shape(self, target_chunk_size: int):
+         """
+         Guess an appropriate chunk layout for a dataset, given its shape and
+         the size of each element in bytes. Will allocate chunks only as large
+         as target_chunk_size. Chunks will be assigned the highest composite number within the target_chunk_size. Using composite numbers will benefit the rechunking process, as there is a very high likelihood that the least common multiple of two composite numbers will be significantly lower than the product of those two numbers.
+
+         Parameters
+         ----------
+         target_chunk_size: int
+             The maximum size per chunk in bytes.
+
+         Returns
+         -------
+         tuple of ints
+             shape of the chunk
+         """
+         chunk_shape = rechunkit.guess_chunk_shape(self._var.shape, self._var.dtype_encoded, target_chunk_size)
+         return chunk_shape
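+
+     # Example sketch (hypothetical numbers): for a variable with shape
+     # (3650, 1200) and a 4-byte encoded dtype, something like
+     #
+     #     chunk_shape = var.rechunker().guess_chunk_shape(2**20)  # ~1 MiB chunks
+     #
+     # would return a tuple such that
+     # math.prod(chunk_shape) * itemsize <= 2**20.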
+
+     def calc_ideal_read_chunk_shape(self, target_chunk_shape: Tuple[int, ...]):
+         """
+         Calculates the minimum ideal read chunk shape between a source and target.
+         """
+         return rechunkit.calc_ideal_read_chunk_shape(self._var.chunk_shape, target_chunk_shape)
+
+     def calc_ideal_read_chunk_mem(self, target_chunk_shape: Tuple[int, ...]):
+         """
+         Calculates the minimum ideal read chunk memory between a source and target.
+         """
+         ideal_read_chunk_shape = rechunkit.calc_ideal_read_chunk_shape(self._var.chunk_shape, target_chunk_shape)
+         return rechunkit.calc_ideal_read_chunk_mem(ideal_read_chunk_shape, self._var.dtype_encoded.itemsize)
+
+     def calc_source_read_chunk_shape(self, target_chunk_shape: Tuple[int, ...], max_mem: int):
+         """
+         Calculates the optimal read chunk shape given a maximum amount of available memory.
+
+         Parameters
+         ----------
+         target_chunk_shape: tuple of int
+             The target chunk shape.
+         max_mem: int
+             The max allocated memory to perform the chunking operation in bytes.
+
+         Returns
+         -------
+         optimal chunk shape: tuple of ints
+         """
+         return rechunkit.calc_source_read_chunk_shape(self._var.chunk_shape, target_chunk_shape, self._var.dtype_encoded.itemsize, max_mem)
+
+     def calc_n_chunks(self):
+         """
+         Calculate the total number of chunks in the existing variable.
+         """
+         return rechunkit.calc_n_chunks(self._var.shape, self._var.chunk_shape)
+
+     def calc_n_reads_rechunker(self, target_chunk_shape: Tuple[int, ...], max_mem: int=2**27):
+         """
+         Calculate the total number of reads and writes using the rechunker.
+
+         Parameters
+         ----------
+         target_chunk_shape: tuple of ints
+             The chunk_shape of the target.
+         max_mem: int
+             The max allocated memory to perform the chunking operation in bytes. Only as much memory as necessary for an optimally sized chunk will actually be used.
+
+         Returns
+         -------
+         tuple
+             of n_reads, n_writes
+         """
+         return rechunkit.calc_n_reads_rechunker(self._var.shape, self._var.dtype_encoded, self._var.chunk_shape, target_chunk_shape, max_mem, self._var._sel)
+
+
+     def rechunk(self, target_chunk_shape, max_mem: int=2**27, decoded=True):
+         """
+         Takes a target chunk_shape and max memory size and returns a generator that converts the data to the new target chunk shape. It optimises the rechunking by using an in-memory numpy ndarray with a size defined by max_mem.
+
+         Parameters
+         ----------
+         target_chunk_shape: tuple of ints
+             The chunk_shape of the target.
+         max_mem: int
+             The max allocated memory to perform the chunking operation in bytes. Only as much memory as necessary for an optimally sized chunk will actually be used.
+
+         Returns
+         -------
+         Generator
+             tuple of the target slices to the np.ndarray of data
+         """
+         self._var.load()
+
+         func = lambda sel: self._var.get_chunk(sel, decoded=False)
+
+         rechunkit1 = rechunkit.rechunker(func, self._var.shape, self._var.dtype_encoded, self._var.chunk_shape, target_chunk_shape, max_mem, self._var._sel)
+
+         if decoded:
+             for slices, encoded_data in rechunkit1:
+                 yield slices, self._var._encoder.decode(encoded_data)
+         else:
+             for slices, encoded_data in rechunkit1:
+                 yield slices, encoded_data
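+
+     # Usage sketch (assuming an open dataset ``ds`` with a 2-D variable
+     # 'temp'; all names and shapes here are hypothetical):
+     #
+     #     rechunker = ds['temp'].rechunker()
+     #     n_reads, n_writes = rechunker.calc_n_reads_rechunker((1, 365))
+     #     for target_slices, arr in rechunker.rechunk((1, 365)):
+     #         dest[target_slices] = arr  # write into some destination array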
+
+
+ class Attributes:
+     """
+     Dict-like accessor for a variable's attributes. All values must be json serializable.
+     """
+     def __init__(self, blt_file, var_name, writable, finalizers):
+         """
+
+         """
+         key = attrs_key.format(var_name=var_name)
+         data = blt_file.get(key)
+         if data is None:
+             self._data = {}
+         else:
+             self._data = msgspec.json.decode(data)
+
+         self._blt = blt_file
+         # self._var_name = var_name
+         finalizers.append(weakref.finalize(self, utils.attrs_finalizer, self._blt, self._data, var_name, writable))
+         self.writable = writable
+
+     @property
+     def data(self):
+         """
+         A deep copy of the attributes dict.
+         """
+         return deepcopy(self._data)
+
+     def set(self, key, value):
+         """
+         Set a single attribute. The value must be json serializable.
+         """
+         if self.writable:
+             try:
+                 msgspec.json.encode(value)
+             except Exception:
+                 raise ValueError('The value passed is not json serializable.')
+             self._data[key] = value
+         else:
+             raise ValueError('Dataset is not writable.')
+
+     def __setitem__(self, key, value):
+         """
+
+         """
+         self.set(key, value)
+
+     def get(self, key):
+         """
+         Get a single attribute value (deep-copied).
+         """
+         value = deepcopy(self._data.get(key))
+
+         return value
+
+     def __getitem__(self, key):
+         """
+
+         """
+         value = self.get(key)
+
+         return value
+
+     def clear(self):
+         if self.writable:
+             self._data.clear()
+         else:
+             raise ValueError('Dataset is not writable.')
+
+     def keys(self):
+         return self.data.keys()
+
+     def values(self):
+         return self.data.values()
+
+     def items(self):
+         return self.data.items()
+
+     def pop(self, key, default=None):
+         if self.writable:
+             return self._data.pop(key, default)
+         else:
+             raise ValueError('Dataset is not writable.')
+
+     def update(self, other=()):
+         if self.writable:
+             try:
+                 msgspec.json.encode(other)
+             except Exception:
+                 raise ValueError('The values passed are not json serializable.')
+             self._data.update(other)
+         else:
+             raise ValueError('Dataset is not writable.')
+
+     def __delitem__(self, key):
+         if self.writable:
+             del self._data[key]
+         else:
+             raise ValueError('Dataset is not writable.')
+
+     def __contains__(self, key):
+         return key in self._data
+
+     def __iter__(self):
+         # dict.keys() returns a view, not an iterator; wrap it so that
+         # iter(attrs) is valid
+         return iter(self.data)
+
+     # def sync(self):
+     #     utils.attrs_finalizer(self._blt, self.data, self._var_name)
+
+     # def close(self):
+     #     self._finalizer()
+
+     def __repr__(self):
+         return self._data.__repr__()
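+
+ # Usage sketch (hypothetical names; ``var`` is a variable in a writable dataset):
+ #
+ #     var.attrs['units'] = 'degC'
+ #     var.attrs.update({'long_name': 'air temperature'})
+ #     var.attrs.get('units')  # -> 'degC'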
+
+
+ class Compressor:
+     """
+     Provides a common compress/decompress interface over lz4 and zstd.
+     """
+     def __init__(self, compression, compression_level):
+         """
+
+         """
+         self.compression = compression
+         self.compression_level = compression_level
+
+         if compression == 'lz4':
+             self.compress = self._lz4_compress
+             self.decompress = self._lz4_decompress
+         elif compression == 'zstd':
+             self._cctx = zstd.ZstdCompressor(level=self.compression_level)
+             self._dctx = zstd.ZstdDecompressor()
+             self.compress = self._zstd_compress
+             self.decompress = self._zstd_decompress
+         else:
+             raise ValueError('compression must be either lz4 or zstd')
+
+     def _lz4_compress(self, data: bytes):
+         """
+
+         """
+         return lz4.frame.compress(data, compression_level=self.compression_level)
+
+     def _lz4_decompress(self, data: bytes):
+         """
+
+         """
+         return lz4.frame.decompress(data)
+
+     def _zstd_compress(self, data: bytes):
+         """
+
+         """
+         return self._cctx.compress(data)
+
+     def _zstd_decompress(self, data: bytes):
+         """
+
+         """
+         return self._dctx.decompress(data)
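+
+ # Round-trip sketch (level 1 chosen arbitrarily):
+ #
+ #     comp = Compressor('zstd', 1)
+ #     blob = comp.compress(b'some chunk bytes')
+ #     assert comp.decompress(blob) == b'some chunk bytes'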
+
+
+ class Encoding:
+     """
+     Handles the encoding/decoding of chunk data: scaling, offsetting, fill values, dtype conversion, and compression.
+     """
+     def __init__(self, chunk_shape, dtype_decoded, dtype_encoded, fillvalue, scale_factor, add_offset, compressor):
+         # self._encoding = msgspec.to_builtins(var_encoding)
+         # self._encoding = var_encoding
+         self.compressor = compressor
+         self.chunk_shape = chunk_shape
+         self.dtype_decoded = dtype_decoded
+         self.dtype_encoded = dtype_encoded
+         self.fillvalue = fillvalue
+         self.scale_factor = scale_factor
+         self.add_offset = add_offset
+         # for key, val in self._encoding.items():
+         #     setattr(self, key, val)
+
+     # def get(self, key, default=None):
+     #     return self._encoding.get(key, default)
+
+     # def __getitem__(self, key):
+     #     return self._encoding[key]
+
+     # def __setitem__(self, key, value):
+     #     if key in utils.enc_fields:
+     #         self._encoding[key] = value
+     #         if self._writable:
+     #             self._attrs[key] = value
+     #     else:
+     #         raise ValueError(f'key must be one of {utils.enc_fields}.')
+
+     # def clear(self):
+     #     keys = list(self._encoding.keys())
+     #     self._encoding.clear()
+     #     if self._writable:
+     #         for key in keys:
+     #             del self._attrs[key]
+
+     # def keys(self):
+     #     return self._encoding.keys()
+
+     # def values(self):
+     #     return self._encoding.values()
+
+     # def items(self):
+     #     return self._encoding.items()
+
+     # def pop(self, key, default=None):
+     #     if self._writable:
+     #         if key in self._attrs:
+     #             del self._attrs[key]
+     #     return self._encoding.pop(key, default)
+
+     # def update(self, other=()):
+     #     key_values = {**other}
+     #     for key, value in key_values.items():
+     #         if key in utils.enc_fields:
+     #             self._encoding[key] = value
+     #             if self._writable:
+     #                 self._attrs[key] = value
+
+     # def __delitem__(self, key):
+     #     del self._encoding[key]
+     #     if self._writable:
+     #         del self._attrs[key]
+
+     # def __contains__(self, key):
+     #     return key in self._encoding
+
+     # def __iter__(self):
+     #     return self._encoding.__iter__()
+
+     # def __repr__(self):
+     #     return make_attrs_repr(self, name_indent, value_indent, 'Encodings')
+
+
+     def to_bytes(self, encoded_array: np.ndarray) -> bytes:
+         """
+         From encoded array to compressed bytes.
+         """
+         return self.compressor.compress(encoded_array.tobytes())
+
+     def from_bytes(self, data: bytes, count=-1, offset=0) -> np.ndarray:
+         """
+         From compressed bytes to encoded array. The count and offset parameters are passed through to np.frombuffer.
+         """
+         b1 = bytearray(self.compressor.decompress(data))
+         encoded_array = np.frombuffer(b1, dtype=self.dtype_encoded, count=count, offset=offset).reshape(self.chunk_shape)
+
+         return encoded_array
+
+
+     def encode(self, array: np.ndarray):
+         """
+         Decoded array to encoded array.
+         """
+         if array.dtype != self.dtype_decoded:
+             raise TypeError('The data dtype does not match the assigned dtype_decoded.')
+
+         if self.dtype_encoded != self.dtype_decoded:
+
+             # if data.dtype.kind == 'M':
+             #     data = data.astype(self.dtype_encoded)
+
+             if isinstance(self.add_offset, (int, float)):
+                 array = array - self.add_offset
+
+             if isinstance(self.scale_factor, (int, float)):
+                 # precision = int(np.abs(np.log10(self.scale_factor)))
+                 array = np.round(array/self.scale_factor)
+
+             if isinstance(self.fillvalue, int) and (self.dtype_decoded.kind == 'f'):
+                 array[np.isnan(array)] = self.fillvalue
+
+             array = array.astype(self.dtype_encoded)
+
+         return array
+
+
+     def decode(self, array: np.ndarray):
+         """
+         Encoded array to decoded array.
+         """
+         if self.dtype_encoded != self.dtype_decoded:
+             array = array.astype(self.dtype_decoded)
+
+             if isinstance(self.fillvalue, int) and (self.dtype_decoded.kind == 'f'):
+                 array[np.isclose(array, self.fillvalue)] = np.nan
+
+             if isinstance(self.scale_factor, (int, float)):
+                 array = array * self.scale_factor
+
+             if isinstance(self.add_offset, (int, float)):
+                 array = array + self.add_offset
+
+         return array
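+
+     # Worked example (hypothetical parameters): with scale_factor=0.1,
+     # add_offset=0.0, dtype_decoded=float64, dtype_encoded=int16:
+     #
+     #     encode: 13.7 -> round((13.7 - 0.0) / 0.1) = 137 (stored as int16)
+     #     decode: 137  -> 137 * 0.1 + 0.0 = 13.7 (approximately, as float64)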
+
+
+ class Variable:
+     """
+     Base class for coordinate and data variables.
+     """
+     def __init__(self, var_name, dataset, sel=None):
+         """
+
+         """
+         self._dataset = dataset
+         self._sys_meta = dataset._sys_meta
+         self._var_meta = dataset._sys_meta.variables[var_name]
+         self._blt = dataset._blt
+         self._has_load_items = dataset._has_load_items
+         self.name = var_name
+         self.attrs = Attributes(self._blt, var_name, dataset.writable, dataset._finalizers)
+         # self.encoding = msgspec.to_builtins(self._sys_meta.variables[self.name].encoding)
+         self.chunk_shape = self._var_meta.chunk_shape
+         # self.origin = self._var_meta.origin
+         self.dtype_decoded = np.dtype(self._var_meta.dtype_decoded)
+         self.dtype_encoded = np.dtype(self._var_meta.dtype_encoded)
+         self.fillvalue = self._var_meta.fillvalue
+         self.scale_factor = self._var_meta.scale_factor
+         self.add_offset = self._var_meta.add_offset
+         if hasattr(self._var_meta, 'coords'):
+             self.coord_names = self._var_meta.coords
+             self.ndims = len(self.coord_names)
+         else:
+             self.coord_names = (var_name,)
+             self.ndims = 1
+
+         # if sel is None:
+         #     self._sel = tuple(slice(None, None) for i in range(self.ndims))
+         # else:
+         #     self._sel = sel
+
+         self._sel = sel
+
+         self._encoder = Encoding(self.chunk_shape, self.dtype_decoded, self.dtype_encoded, self.fillvalue, self.scale_factor, self.add_offset, dataset._compressor)
+         self.loc = indexers.LocationIndexer(self)
+         self._finalizers = dataset._finalizers
+         self.writable = dataset.writable
+
+         ## Assign all the encodings - should I do this?
+         # for name, val in self._encoding_dict.items():
+         #     setattr(self, name, val)
+
+     @property
+     def is_open(self):
+         return self._dataset.is_open
+
+     def __bool__(self):
+         return self.is_open
+
+
+     def _make_blank_sel_array(self, sel, coord_origins, decoded=True):
+         """
+         Create an array for the selection filled with the fillvalue (np.nan for decoded floats).
+         """
+         new_shape = indexers.determine_final_array_shape(sel, coord_origins, self.shape)
+
+         if self.dtype_decoded.kind == 'f' and decoded:
+             fillvalue = np.nan
+         else:
+             fillvalue = self.fillvalue
+
+         if decoded:
+             return np.full(new_shape, fillvalue, self.dtype_decoded)
+         else:
+             return np.full(new_shape, fillvalue, self.dtype_encoded)
+
+
+     def _make_blank_chunk_array(self, decoded=True):
+         """
+         Create a chunk-shaped array filled with the fillvalue (np.nan for decoded floats).
+         """
+         if self.dtype_decoded.kind == 'f' and decoded:
+             fillvalue = np.nan
+         else:
+             fillvalue = self.fillvalue
+
+         if decoded:
+             return np.full(self.chunk_shape, fillvalue, self.dtype_decoded)
+         else:
+             return np.full(self.chunk_shape, fillvalue, self.dtype_encoded)
+
+
+     def rechunker(self):
+         """
+         Initialize a Rechunker class to assist in rechunking the variable.
+         """
+         return Rechunker(self)
+
+
+     def __getitem__(self, sel):
+         return self.get(sel)
+
+
+     # def __delitem__(self, sel):
+     #     """
+     #     Should I implement this as a way to "delete" data? It wouldn't actually delete; rather, it would set those values to the fillvalue/nan. I should probably delete chunks if all their values become nan.
+     #     """
+     # TODO
+
+
+     def iter_chunks(self, decoded=True):
+         """
+         Iterate through the chunks of the variable and return numpy arrays associated with the index slices. This should be the main way for users to get large amounts of data from a variable. The "ends" of the data will be clipped to the shape of the variable (i.e. not all chunks will be the chunk_shape).
+
+         Parameters
+         ----------
+         decoded: bool
+             Should the data be decoded?
+
+         Returns
+         -------
+         Generator
+             tuple of slices of the indexes, numpy array of the data
+         """
+         self.load()
+
+         coord_origins = self.get_coord_origins()
+
+         blank = self._make_blank_chunk_array(decoded)
+
+         slices = indexers.index_combo_all(self._sel, coord_origins, self.shape)
+         for target_chunk, source_chunk, blt_key in indexers.slices_to_chunks_keys(slices, self.name, self.chunk_shape):
+             # print(target_chunk, source_chunk, blt_key)
+             b1 = self._blt.get(blt_key)
+             if b1 is None:
+                 blank_slices = tuple(slice(0, sc.stop - sc.start) for sc in source_chunk)
+                 yield target_chunk, blank[blank_slices]
+             else:
+                 if decoded:
+                     data = self._encoder.decode(self._encoder.from_bytes(b1))
+                 else:
+                     data = self._encoder.from_bytes(b1)
+
+                 yield target_chunk, data[source_chunk]
+
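+     # Usage sketch (``var`` is any open variable):
+     #
+     #     total = 0.0
+     #     for target_slices, arr in var.iter_chunks():
+     #         total += np.nansum(arr)  # process one chunk at a time
+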
+     def __iter__(self):
+         return self.iter_chunks()
+
+
+     def get_chunk(self, sel=None, decoded=True, missing_none=False):
+         """
+         Get the data from one chunk. The method will return the first chunk parsed from sel.
+
+         Parameters
+         ----------
+         sel: tuple of slices, ints
+             The selection based on index positions.
+         decoded: bool
+             Should the data be decoded?
+         missing_none: bool
+             If the chunk is missing, should the method return None or a blank array (filled with the fillvalue)?
+
+         Returns
+         -------
+         np.ndarray
+         """
+         if sel is None:
+             sel = self._sel
+         coord_origins = self.get_coord_origins()
+         slices = indexers.index_combo_all(sel, coord_origins, self.shape)
+         starts_chunk = tuple((pc.start//cs) * cs for cs, pc in zip(self.chunk_shape, slices))
+         blt_key = utils.make_var_chunk_key(self.name, starts_chunk)
+         b1 = self._blt.get(blt_key)
+         if missing_none and b1 is None:
+             return None
+         elif b1 is None:
+             return self._make_blank_chunk_array(decoded)
+         else:
+             encoded_data = self._encoder.from_bytes(b1)
+             if decoded:
+                 return self._encoder.decode(encoded_data)
+             else:
+                 return encoded_data
+
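+
+     # Usage sketch: fetch the chunk containing index position (0, 0) of a
+     # hypothetical 2-D variable. Note that the returned array is always a
+     # full (unclipped) chunk:
+     #
+     #     arr = var.get_chunk((slice(0, 1), slice(0, 1)))
+     #     arr.shape == var.chunk_shape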
+
+     def get_coord_origins(self):
+         """
+         Get the coordinate origins for the variable.
+         """
+         if hasattr(self._var_meta, 'coords'):
+             # data variable: take the origins from its coordinates
+             coord_origins = tuple(self._sys_meta.variables[coord].origin for coord in self.coord_names)
+         else:
+             # coordinate variable: it has its own origin
+             coord_origins = (self.origin,)
+
+         return coord_origins
+
+
+     @property
+     def coords(self):
+         if self._sel is None:
+             return tuple(self._dataset[coord_name] for coord_name in self.coord_names)
+         else:
+             return tuple(self._dataset[coord_name][self._sel[i]] for i, coord_name in enumerate(self.coord_names))
+
+
+     def __len__(self):
+         return math.prod(self.shape)
+
+     def load(self):
+         """
+         This method only applies if the dataset has been opened as an EDataset.
+         Load the chunks from the remote into the local file based on the selection. If no selection has been made, then it will load all the chunks.
+         """
+         if self._has_load_items:
+             coord_origins = self.get_coord_origins()
+             slices = indexers.index_combo_all(self._sel, coord_origins, self.shape)
+             # keys = list(indexers.slices_to_keys(slices, self.name, self.chunk_shape))
+             # print(keys)
+             # failures = self._blt.load_items(keys)
+             failures = self._blt.load_items(indexers.slices_to_keys(slices, self.name, self.chunk_shape))
+             # self._blt.sync()
+             if failures:
+                 raise Exception(failures)
+
+
+ class CoordinateView(Variable):
+     """
+     A (possibly sliced) view of a coordinate.
+     """
+     @property
+     def data(self):
+         if not hasattr(self, '_data'):
+             coord_origins = self.get_coord_origins()
+
+             target = self._make_blank_sel_array(self._sel, coord_origins)
+
+             for target_chunk, data in self.iter_chunks():
+                 target[target_chunk] = data
+
+             self._data = target
+
+         return self._data
+
+
+     def get(self, sel):
+         """
+         Get a CoordinateView based on the index position(s).
+         The parameter sel can be an int, slice, or some combo within a tuple. For example, a tuple of slices (of the index positions).
+
+         Parameters
+         ----------
+         sel: int, slice, tuple of ints or slices
+             It can be an int, slice, or a tuple of ints or slices. Numpy advanced indexing is not implemented.
+
+         Returns
+         -------
+         cfdb.CoordinateView
+         """
+         coord_origins = self.get_coord_origins()
+
+         slices = indexers.index_combo_all(sel, coord_origins, self.shape)
+
+         if self._sel is not None:
+             slices = tuple(slice(s.start, s.stop) if ss.start is None else slice(ss.start + s.start, ss.start + s.stop) for ss, s in zip(self._sel, slices))
+
+         return CoordinateView(self.name, self._dataset, slices)
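+
+     # Selection sketch (hypothetical 1-D coordinate ``time``):
+     #
+     #     view = time[10:20]   # a CoordinateView over index positions 10..19
+     #     view.shape           # -> (10,)
+     #     view.data            # -> np.ndarray of the selected values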
+
+
+     # def resize(self, start=None, end=None):
+     #     """
+     #     Resize a coordinate. If step is an int or float, then resizing can add or truncate the length. If step is None, then the coordinate can only have the length truncated.
+     #     If the coordinate length is reduced, then all data variables associated with the coordinate will have their data truncated.
+     #     """
+     #     if end is not None:
+     #         idx = indexers.loc_index_combo_one(end, self.data)
+     #         if self.step is not None:
+     #             pass
+     #         else:
+     #             updated_data =
+
+
+     @property
+     def step(self):
+         return getattr(self._var_meta, 'step')
+
+     @property
+     def auto_increment(self):
+         return getattr(self._var_meta, 'auto_increment')
+
+     @property
+     def origin(self):
+         return getattr(self._var_meta, 'origin')
+
+     @property
+     def shape(self):
+         return tuple(s.stop - s.start for s in self._sel)
+
+
+
+
+     # def copy(self, to_file=None, name: str=None, include_attrs=True, **kwargs):
+     #     """
+     #     Copy a Coordinate object.
+     #     """
+     #     if (to_file is None) and (name is None):
+     #         raise ValueError('If to_file is None, then a name must be passed and it must be different from the original.')
+
+     #     if to_file is None:
+     #         to_file = self.file
+
+     #     if name is None:
+     #         name = self.name
+
+     #     ds = copy_coordinate(to_file, self, name, include_attrs=include_attrs, **kwargs)
+
+     #     return ds
+
+     def __repr__(self):
+         """
+
+         """
+         return utils.coordinate_summary(self)
+
+
+     # def to_pandas(self):
+     #     """
+
+     #     """
+     #     if not import_pandas:
+     #         raise ImportError('pandas could not be imported.')
+
+     #     return pd.Index(self.data, name=self.name)
+
+
+     # def to_xarray(self):
+     #     """
+
+     #     """
+
+
+ class Coordinate(CoordinateView):
+     """
+     A full (unsliced) coordinate.
+     """
+     @property
+     def shape(self):
+         return getattr(self._var_meta, 'shape')
+
+
+     def _add_updated_data(self, chunk_start, chunk_stop, new_origin, updated_data):
+         """
+         Encode and write updated coordinate data chunk by chunk, then cache the data.
+         """
+         chunk_len = self.chunk_shape[0]
+
+         mem_arr1 = np.full(self.chunk_shape, fill_value=self.fillvalue, dtype=self.dtype_encoded)
+
+         # print(chunk_start)
+
+         chunk_iter = rechunkit.chunk_range(chunk_start, chunk_stop, self.chunk_shape, clip_ends=True)
+         for chunk in chunk_iter:
+             chunk = chunk[0] # Because coords are always 1D
+             # print(chunk)
+
+             chunk_start_pos = chunk.start
+             chunk_stop_pos = chunk.stop
+
+             chunk_origin = (chunk_start_pos//chunk_len) * chunk_len
+             mem_chunk_start_pos = chunk_start_pos - chunk_origin
+             mem_chunk_stop_pos = chunk_stop_pos - chunk_origin
+             mem_chunk_slice = slice(mem_chunk_start_pos, mem_chunk_stop_pos)
+
+             coord_start_pos = chunk_start_pos - new_origin
+             coord_stop_pos = chunk_stop_pos - new_origin
+             coord_chunk_slice = slice(coord_start_pos, coord_stop_pos)
+
+             # print(updated_data[coord_chunk_slice])
+
+             mem_arr2 = mem_arr1.copy()
+             mem_arr2[mem_chunk_slice] = self._encoder.encode(updated_data[coord_chunk_slice])
+
+             key = utils.make_var_chunk_key(self.name, (chunk_origin,))
+             # print(key)
+
+             self._blt.set(key, self._encoder.to_bytes(mem_arr2))
+
+         self._data = updated_data
+
+
+     def prepend(self, data):
+         """
+         Prepend data to the start of the coordinate. The extra length will be added to the associated data variables with the fillvalue.
+         """
+         if not self.writable:
+             raise ValueError('Dataset is not writable.')
+
+         updated_data = utils.prepend_coord_data_checks(data, self.data, self.dtype_decoded, self.step)
+
+         data_diff = updated_data.size - self.data.size
+
+         new_origin = self.origin - data_diff
+         chunk_stop = (updated_data.size + new_origin,)
+
+         chunk_start = (new_origin,)
+
+         self._add_updated_data(chunk_start, chunk_stop, new_origin, updated_data)
+
+         self._var_meta.origin = new_origin
+         self._var_meta.shape = updated_data.shape
+
+
+     def append(self, data):
+         """
+         Append data to the end of the coordinate. The extra length will be added to the associated data variables with the fillvalue.
+         """
+         if not self.writable:
+             raise ValueError('Dataset is not writable.')
+
+         updated_data = utils.append_coord_data_checks(data, self.data, self.dtype_decoded, self.step)
+
+         shape = (updated_data.size,)
+
+         chunk_start = (self.origin,)
+         chunk_stop = shape
+
+         self._add_updated_data(chunk_start, chunk_stop, self.origin, updated_data)
+
+         self._var_meta.shape = shape
+
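+
+     # Usage sketch (hypothetical daily time coordinate named ``time``):
+     #
+     #     time.append(np.arange('2025-02-01', '2025-03-01', dtype='datetime64[D]'))
+     #     time.prepend(np.arange('2024-12-01', '2025-01-01', dtype='datetime64[D]'))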
+
+
+ class DataVariableView(Variable):
+     """
+     A (possibly sliced) view of a data variable.
+     """
+     @property
+     def data(self):
+         coord_origins = self.get_coord_origins()
+
+         target = self._make_blank_sel_array(self._sel, coord_origins)
+
+         for target_chunk, data in self.iter_chunks():
+             target[target_chunk] = data
+
+         return target
+
+
+     def get(self, sel):
+         """
+         Get a DataVariableView based on the index position(s).
+         The parameter sel can be an int, slice, or some combo within a tuple. For example, a tuple of slices (of the index positions).
+
+         Parameters
+         ----------
+         sel: int, slice, tuple of ints or slices
+             It can be an int, slice, or a tuple of ints or slices. Numpy advanced indexing is not implemented.
+
+         Returns
+         -------
+         cfdb.DataVariableView
+         """
+         coord_origins = self.get_coord_origins()
+
+         slices = indexers.index_combo_all(sel, coord_origins, self.shape)
+
+         if self._sel is not None:
+             slices = tuple(slice(s.start, s.stop) if ss.start is None else slice(ss.start + s.start, ss.start + s.stop) for ss, s in zip(self._sel, slices))
+
+         return DataVariableView(self.name, self._dataset, slices)
+
+
+     def set(self, sel, data, encode=True):
+         """
+         Set data based on index positions.
+         """
+         if not self.writable:
+             raise ValueError('Dataset is not writable.')
+
+         coord_origins = self.get_coord_origins()
+
+         chunk_blank = self._make_blank_chunk_array(False)
+
+         slices = indexers.check_sel_input_data(sel, data, coord_origins, self.shape)
+
+         if self._sel is not None:
+             slices = tuple(slice(s.start, s.stop) if ss.start is None else slice(ss.start + s.start, ss.start + s.stop) for ss, s in zip(self._sel, slices))
+
+         for target_chunk, source_chunk, blt_key in indexers.slices_to_chunks_keys(slices, self.name, self.chunk_shape):
+             b1 = self._blt.get(blt_key)
+             if b1 is None:
+                 new_data = chunk_blank.copy()
+             else:
+                 new_data = self._encoder.from_bytes(b1)
+
+             if encode:
+                 new_data[source_chunk] = self._encoder.encode(data[target_chunk])
+             else:
+                 new_data[source_chunk] = data[target_chunk]
+             self._blt.set(blt_key, self._encoder.to_bytes(new_data))
+
+
+     def __setitem__(self, sel, data):
+         """
+
+         """
+         self.set(sel, data)
+
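+
+     # Usage sketch (hypothetical 2-D variable ``temp`` over (time, station)):
+     #
+     #     temp[0:10, 0:5] = np.ones((10, 5), dtype='float64')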
+
+     def groupby(self, coord_names: Iterable, max_mem: int=2**27, decoded=True):
+         """
+         This method takes one or more coord names to group by and returns a generator. This generator will return chunks of data according to these groupings with the associated tuple of slices. The more max_mem provided, the more efficient the chunking.
+         This is effectively the rechunking method where each coord name supplied is set to 1 and all other coords are set to their full length.
+
+         Parameters
+         ----------
+         coord_names: Iterable
+             The coord names to group by.
+         max_mem: int
+             The max allocated memory to perform the chunking operation in bytes. Only as much memory as necessary for an optimally sized chunk will actually be used.
+
+         Returns
+         -------
+         Generator
+             tuple of the target slices to the np.ndarray of data
+         """
+         self.load()
+
+         var_coord_names = self.coord_names
+         if isinstance(coord_names, str):
+             coord_names = (coord_names,)
+         else:
+             coord_names = tuple(coord_names)
+
+         # checks
+         for coord_name in coord_names:
+             if coord_name not in var_coord_names:
+                 raise ValueError(f'{coord_name} is not a coord of this variable.')
+
+         # Build target chunk shape
+         target_chunk_shape = []
+         for coord in self.coords:
+             coord_name = coord.name
+             if coord_name in coord_names:
+                 target_chunk_shape.append(1)
+             else:
+                 target_chunk_shape.append(coord.shape[0])
+
+         # Do the chunking
+         func = lambda sel: self.get_chunk(sel, decoded=False)
+
+         rechunkit1 = rechunkit.rechunker(func, self.shape, self.dtype_encoded, self.chunk_shape, tuple(target_chunk_shape), max_mem, self._sel)
+
+         if decoded:
+             for slices, encoded_data in rechunkit1:
+                 yield slices, self._encoder.decode(encoded_data)
+         else:
+             for slices, encoded_data in rechunkit1:
+                 yield slices, encoded_data
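+
+     # Usage sketch (hypothetical variable ``temp`` over (time, station)):
+     #
+     #     for slices, arr in temp.groupby('time'):
+     #         step_mean = np.nanmean(arr)  # one time step per iteration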
+
+
+     # def to_pandas(self):
+     #     """
+
+     #     """
+     #     if not import_pandas:
+     #         raise ImportError('pandas could not be imported.')
+
+     #     indexes = []
+     #     for dim in self.coords:
+     #         coord = self.file[dim]
+     #         indexes.append(coord.data)
+
+     #     pd_index = pd.MultiIndex.from_product(indexes, names=self.coords)
+
+     #     series = pd.Series(self[()].flatten(), index=pd_index)
+     #     series.name = self.name
+
+     #     return series
+
+
+     # def to_xarray(self, **kwargs):
+     #     """
+
+     #     """
+     #     if not import_xarray:
+     #         raise ImportError('xarray could not be imported.')
+
+     #     da = xr.DataArray(data=self[()], coords=[self.file[dim].data for dim in self.coords], dims=self.coords, name=self.name, attrs=self.attrs)
+
+     #     return da
+
+
+     # def copy(self, to_file=None, name: str=None, include_data=True, include_attrs=True, **kwargs):
+     #     """
+     #     Copy a DataVariable object.
+     #     """
+     #     if (to_file is None) and (name is None):
+     #         raise ValueError('If to_file is None, then a name must be passed and it must be different from the original.')
+
+     #     if to_file is None:
+     #         to_file = self.file
+
+     #     if name is None:
+     #         name = self.name
+
+     #     ds = copy_data_variable(to_file, self, name, include_data=include_data, include_attrs=include_attrs, **kwargs)
+
+     #     return ds
+
+
+     def __repr__(self):
+         """
+
+         """
+         return utils.data_variable_summary(self)
+
+
+     # @property
+     # def coords(self):
+     #     return getattr(self._var_meta, 'coords')
+
+     @property
+     def shape(self):
+         return tuple(s.stop - s.start for s in self._sel)
+
+     # @property
+     # def coords(self):
+     #     return tuple(self._dataset[coord_name][self._sel[i]] for i, coord_name in enumerate(self.coord_names))
+
+
+
+ class DataVariable(DataVariableView):
+     """
+     A full (unsliced) data variable.
+     """
+     @property
+     def shape(self):
+         return tuple(self._sys_meta.variables[coord_name].shape[0] for coord_name in self.coord_names)