cfdb-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cfdb/main.py ADDED
@@ -0,0 +1,857 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Created on Tue Jan 7 11:25:06 2025
+
+ @author: mike
+ """
+ import booklet
+ from typing import Union
+ import pathlib
+ import msgspec
+ import weakref
+ from copy import deepcopy
+
+ try:
+     import h5netcdf
+     import_h5netcdf = True
+ except ImportError:
+     import_h5netcdf = False
+
+ try:
+     import ebooklet
+     import_ebooklet = True
+ except ImportError:
+     import_ebooklet = False
+
+ from . import utils, indexers, data_models, creation, support_classes as sc
+ # import utils, indexers, data_models, creation, support_classes as sc
+
+
+ ############################################
+ ### Parameters
+
+
+
+
+ ############################################
+ ### Functions
+
+
+
+
+ ############################################
+ ### Classes
+
+
+ class DatasetBase:
+
+     # def __bool__(self):
+     #     """
+
+     #     """
+     #     return self._file.__bool__()
+
+     def __iter__(self):
+         for key in self.var_names:
+             yield key
+
+     def __len__(self):
+         return len(self.var_names)
+
+     def __contains__(self, key):
+         return key in self.var_names
+
+     # def get(self, var_name):
+     #     """
+
+     #     """
+     #     if not isinstance(var_name, str):
+     #         raise TypeError('var_name must be a string.')
+
+     #     if var_name not in self:
+     #         raise ValueError(f'The Variable {var_name} does not exist.')
+
+     #     if self._sel is not None:
+     #         if var_name not in self._sel:
+     #             raise ValueError(f'The Variable {var_name} does not exist in view.')
+
+     #     if var_name not in self._var_cache:
+     #         var_meta = self._sys_meta.variables[var_name]
+     #         if isinstance(var_meta, data_models.DataVariable):
+     #             var = sc.DataVariable(var_name, self)
+     #         else:
+     #             var = sc.Coordinate(var_name, self)
+     #         self._var_cache[var_name] = var
+
+     #     if self._sel is None:
+     #         return self._var_cache[var_name]
+     #     else:
+     #         return self._var_cache[var_name][self._sel[var_name]]
+
+     #     var_meta = self._sys_meta.variables[var_name]
+     #     if isinstance(var_meta, data_models.DataVariable):
+     #         var = sc.DataVariable(var_name, self)
+     #     else:
+     #         var = sc.Coordinate(var_name, self)
+
+     #     return var
+
+
+     def __getitem__(self, key):
+         return self.get(key)
+
+     # def __setitem__(self, key, value):
+     #     if isinstance(value, sc.Variable):
+     #         setattr(self, key, value)
+     #     else:
+     #         raise TypeError('Assigned value must be a Variable or Coordinate object.')
+
+     def __delitem__(self, key):
+         if key not in self:
+             raise KeyError(key)
+
+         if not self.writable:
+             raise ValueError('Dataset is not writable.')
+
+         # Check if the object to delete is a coordinate
+         # And if it is, check that no variables are attached to it
+         if isinstance(self[key], sc.Coordinate):
+             for var_name, var in self._sys_meta.variables.items():
+                 if isinstance(var, data_models.DataVariable):
+                     if key in var.coords:
+                         raise ValueError(f'{key} is a coordinate of {var_name}. You must delete all variables associated with a coordinate before you can delete the coordinate.')
+
+         # Delete all chunks from file
+         var = self[key]
+         coord_origins = var.get_coord_origins()
+
+         slices = indexers.index_combo_all(None, coord_origins, var.shape)
+         for target_chunk, source_chunk, blt_key in indexers.slices_to_chunks_keys(slices, var.name, var.chunk_shape):
+             try:
+                 del self._blt[blt_key]
+             except KeyError:
+                 pass
+
+         # Delete the attrs key
+         try:
+             del self._blt[sc.attrs_key.format(var_name=key)]
+         except KeyError:
+             pass
+
+         # Delete in cache
+         try:
+             del self._var_cache[key]
+         except KeyError:
+             pass
+
+         # Delete the instance in the sys meta
+         del self._sys_meta.variables[key]
+
+
+     # def sync(self):
+     #     """
+
+     #     """
+     #     old_meta = msgspec.convert(self._blt.get_metadata(), data_models.SysMeta)
+     #     if old_meta != self._meta:
+     #         self._blt.set_metadata(msgspec.to_builtins(self._meta))
+     #     self._blt.sync()
+
+     def __bool__(self):
+         return self.is_open
+
+
+     def __repr__(self):
+         """
+         Return a summary of the dataset.
+         """
+         return utils.file_summary(self)
+
+
+     def sel(self, sel: dict):
+         """
+         Filter the dataset variables by a selection of the coordinate positions.
+         """
+         ## Checks on input
+         coord_names = self.coord_names
+         for key in sel:
+             if key not in coord_names:
+                 raise KeyError(f'The coordinate {key} does not exist in the dataset.')
+
+         ## Create selections per coord
+         _sel = {}
+         for coord_name in coord_names:
+             coord = self[coord_name]
+             if coord_name in sel:
+                 slices = indexers.index_combo_all(sel[coord_name], coord.get_coord_origins(), coord.shape)
+             else:
+                 slices = indexers.index_combo_all(None, coord.get_coord_origins(), coord.shape)
+             _sel[coord_name] = slices
+
+         ## Create selections for data vars
+         data_var_names = self.data_var_names
+         for data_var_name in data_var_names:
+             data_var = self[data_var_name]
+             data_var_sel = tuple(_sel[coord_name][0] for coord_name in data_var.coord_names)
+             _sel[data_var_name] = data_var_sel
+
+         ## Init DatasetView
+         return DatasetView(self, _sel)
+
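+     # A minimal positional-selection sketch (the coordinate name and slice
+     # bounds below are illustrative, not part of this module):
+     #
+     #     view = ds.sel({'time': slice(0, 10)})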
+
+     def sel_loc(self, sel: dict):
+         """
+         Filter the dataset variables by a selection of the coordinate locations.
+         """
+         ## Checks on input
+         coord_names = self.coord_names
+         for key in sel:
+             if key not in coord_names:
+                 raise KeyError(f'The coordinate {key} does not exist in the dataset.')
+
+         ## Create selections per coord
+         _sel = {}
+         for coord_name in coord_names:
+             coord = self[coord_name]
+             if coord_name in sel:
+                 slices = indexers.index_combo_all(indexers.loc_index_combo_all(sel[coord_name], (coord,)), coord.get_coord_origins(), coord.shape)
+             else:
+                 slices = indexers.index_combo_all(None, coord.get_coord_origins(), coord.shape)
+             _sel[coord_name] = slices
+
+         ## Create selections for data vars
+         data_var_names = self.data_var_names
+         for data_var_name in data_var_names:
+             data_var = self[data_var_name]
+             data_var_sel = tuple(_sel[coord_name][0] for coord_name in data_var.coord_names)
+             _sel[data_var_name] = data_var_sel
+
+         ## Init DatasetView
+         return DatasetView(self, _sel)
+
+
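+     # A label-based selection sketch (the coordinate name and datetime
+     # bounds are illustrative):
+     #
+     #     view = ds.sel_loc({'time': slice('2025-01-01', '2025-02-01')})
+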
+     # def to_pandas(self):
+     #     """
+     #     Convert the entire file into a pandas DataFrame.
+     #     """
+     #     if not import_pandas:
+     #         raise ImportError('pandas could not be imported.')
+
+     #     # TODO: This feels wrong...but it works...
+     #     result = None
+     #     for var_name in self.data_vars:
+     #         if result is None:
+     #             result = self[var_name].to_pandas().to_frame()
+     #         else:
+     #             result = result.join(self[var_name].to_pandas().to_frame(), how='outer')
+
+     #     self.close()
+
+     #     return result
+
+
+     # def to_xarray(self, **kwargs):
+     #     """
+     #     Closes the file and opens it in xarray.
+
+     #     Parameters
+     #     ----------
+     #     kwargs
+     #         Any kwargs that can be passed to xr.open_dataset.
+
+     #     Returns
+     #     -------
+     #     xr.Dataset
+     #     """
+     #     if not import_xarray:
+     #         raise ImportError('xarray could not be imported.')
+
+     #     filename = pathlib.Path(self.filename)
+
+     #     if filename.is_file():
+     #         self.close()
+     #     else:
+     #         temp_file = tempfile.NamedTemporaryFile()
+     #         filename = temp_file.name
+     #         self.to_file(filename)
+     #         self.close()
+
+     #     x1 = xr.open_dataset(filename, **kwargs)
+
+     #     return x1
+
+
+     def copy(self, file_path):
+         """
+         Copy the dataset to a new file. Returns the newly created Dataset.
+         """
+         kwargs = dict(n_buckets=self._blt._n_buckets, buffer_size=self._blt._write_buffer_size)
+
+         new_ds = open_dataset(file_path, 'n', compression=self.compression, compression_level=self.compression_level, **kwargs)
+
+         for coord in self.coords:
+             new_coord = new_ds.create.coord.like(coord.name, coord, True)
+             new_coord.attrs.update(coord.attrs.data)
+
+         for data_var in self.data_vars:
+             new_data_var = new_ds.create.data_var.like(data_var.name, data_var)
+             new_data_var.attrs.update(data_var.attrs.data)
+             for write_chunk, data in data_var.iter_chunks(False):
+                 new_data_var.set(write_chunk, data, False)
+
+         new_ds.attrs.update(self.attrs.data)
+
+         return new_ds
+
+
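+     # A minimal copy sketch (the destination path is illustrative):
+     #
+     #     new_ds = ds.copy('copy.blt')
+     #     new_ds.close()
+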
+     def to_netcdf4(self, file_path: Union[str, pathlib.Path], compression: str='gzip', **file_kwargs):
+         """
+         Save a dataset to a netcdf4 file using h5netcdf.
+         """
+         if not import_h5netcdf:
+             raise ImportError('h5netcdf must be installed to save files to netcdf4.')
+
+         h5 = h5netcdf.File(file_path, 'w', **file_kwargs)
+
+         # dims/coords
+         for coord in self.coords:
+             name = coord.name
+             h5.dimensions[name] = coord.shape[0]
+             coord_len = coord.shape[0]
+             chunk_len = coord.chunk_shape[0]
+             if chunk_len > coord_len:
+                 chunk_shape = (coord_len,)
+             else:
+                 chunk_shape = (chunk_len,)
+
+             h5_coord = h5.create_variable(name, (name,), coord.dtype_encoded, compression=compression, chunks=chunk_shape, fillvalue=coord.fillvalue)
+             attrs = deepcopy(coord.attrs.data)
+             dtype_decoded, dtype_encoded = utils.parse_dtype_names(coord.dtype_decoded, coord.dtype_encoded)
+             if coord.step is not None:
+                 attrs['step'] = coord.step
+             if coord.scale_factor is not None:
+                 attrs['scale_factor'] = coord.scale_factor
+             elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
+                 attrs['scale_factor'] = 1
+             if coord.add_offset is not None:
+                 attrs['add_offset'] = coord.add_offset
+             elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
+                 attrs['add_offset'] = 0
+             if coord.dtype_decoded.kind == 'M':
+                 units = utils.parse_cf_time_units(coord.dtype_decoded)
+                 calendar = "proleptic_gregorian"
+                 attrs['units'] = units
+                 attrs['calendar'] = calendar
+                 attrs['standard_name'] = 'time'
+
+             attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded, '_FillValue': coord.fillvalue})
+             h5_coord.attrs.update(attrs)
+
+             for write_chunk, data in coord.iter_chunks(decoded=False):
+                 h5_coord[write_chunk] = data
+
+         # Data vars
+         for data_var in self.data_vars:
+             name = data_var.name
+             chunk_shape = []
+             for s, cs in zip(data_var.shape, data_var.chunk_shape):
+                 if cs > s:
+                     chunk_shape.append(s)
+                 else:
+                     chunk_shape.append(cs)
+
+             h5_data_var = h5.create_variable(name, data_var.coord_names, data_var.dtype_encoded, compression=compression, chunks=tuple(chunk_shape), fillvalue=data_var.fillvalue)
+             attrs = deepcopy(data_var.attrs.data)
+             dtype_decoded, dtype_encoded = utils.parse_dtype_names(data_var.dtype_decoded, data_var.dtype_encoded)
+             if data_var.scale_factor is not None:
+                 attrs['scale_factor'] = data_var.scale_factor
+             elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
+                 attrs['scale_factor'] = 1
+             if data_var.add_offset is not None:
+                 attrs['add_offset'] = data_var.add_offset
+             elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
+                 attrs['add_offset'] = 0
+             if data_var.dtype_decoded.kind == 'M':
+                 units = utils.parse_cf_time_units(data_var.dtype_decoded)
+                 calendar = "proleptic_gregorian"
+                 attrs['units'] = units
+                 attrs['calendar'] = calendar
+                 attrs['standard_name'] = 'time'
+
+             attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded, '_FillValue': data_var.fillvalue})
+             h5_data_var.attrs.update(attrs)
+
+             for write_chunk, data in data_var.iter_chunks(decoded=False):
+                 h5_data_var[write_chunk] = data
+
+         # Add global attrs
+         h5.attrs.update(self.attrs.data)
+
+         h5.close()
+
+
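+     # A minimal export sketch (the output path is illustrative):
+     #
+     #     ds.to_netcdf4('output.nc')
+
+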
+ class Dataset(DatasetBase):
+     """
+     A cfdb dataset stored in a local booklet file.
+     """
+     def __init__(self, file_path, open_blt, create, compression, compression_level):
+         """
+         Compression can be either zstd, lz4, or None, though there is little reason to use None.
+         """
+         self._blt = open_blt
+         self.writable = self._blt.writable
+         self.file_path = file_path
+         self.is_open = True
+
+         if hasattr(self._blt, 'load_items'):
+             self._has_load_items = True
+         else:
+             self._has_load_items = False
+
+         ## Set/Get system metadata
+         if create:
+             # Checks
+             compression = compression.lower()
+             if compression not in utils.compression_options:
+                 raise ValueError(f'compression must be one of {utils.compression_options}.')
+             if compression_level is None:
+                 compression_level = utils.default_compression_levels[compression]
+             elif not isinstance(compression_level, int):
+                 raise ValueError('compression_level must be either None or an int.')
+
+             self._sys_meta = data_models.SysMeta(object_type='Dataset', compression=data_models.Compressor(compression), compression_level=compression_level, variables={})
+             self._blt.set_metadata(msgspec.to_builtins(self._sys_meta))
+
+         else:
+             self._sys_meta = msgspec.convert(self._blt.get_metadata(), data_models.SysMeta)
+
+         self.compression = self._sys_meta.compression.value
+         self.compression_level = self._sys_meta.compression_level
+         self._compressor = sc.Compressor(self.compression, self.compression_level)
+
+         self._finalizers = [weakref.finalize(self, utils.dataset_finalizer, self._blt, self._sys_meta)]
+
+         self.attrs = sc.Attributes(self._blt, '_', self.writable, self._finalizers)
+
+         self._var_cache = weakref.WeakValueDictionary()
+
+         if self.writable:
+             self.create = creation.Creator(self)
+
+
+     def get(self, var_name):
+         """
+         Get a variable contained within the dataset.
+         """
+         if not isinstance(var_name, str):
+             raise TypeError('var_name must be a string.')
+
+         if var_name not in self:
+             raise ValueError(f'The Variable {var_name} does not exist.')
+
+         # if self._sel is not None:
+         #     if var_name not in self._sel:
+         #         raise ValueError(f'The Variable {var_name} does not exist in view.')
+
+         if var_name not in self._var_cache:
+             var_meta = self._sys_meta.variables[var_name]
+             if isinstance(var_meta, data_models.DataVariable):
+                 var = sc.DataVariable(var_name, self)
+             else:
+                 var = sc.Coordinate(var_name, self)
+             self._var_cache[var_name] = var
+
+         return self._var_cache[var_name]
+
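+     # A minimal access sketch (the variable name is illustrative):
+     #
+     #     var = ds.get('temperature')   # equivalent: ds['temperature']
+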
+     def __enter__(self):
+         return self
+
+     def __exit__(self, *args):
+         self.close()
+
+     def close(self):
+         """
+         Close the database.
+         """
+         # self.sync()
+         for finalizer in reversed(self._finalizers):
+             finalizer()
+         self.is_open = False
+
+
+     @property
+     def var_names(self):
+         """
+         Return a tuple of all the variable names (coord and data variables).
+         """
+         return tuple(self._sys_meta.variables.keys())
+
+     @property
+     def coord_names(self):
+         """
+         Return a tuple of all the coordinate names.
+         """
+         return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.CoordinateVariable))
+
+
+     @property
+     def data_var_names(self):
+         """
+         Return a tuple of all the data variable names.
+         """
+         return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.DataVariable))
+
+
+     @property
+     def coords(self):
+         """
+         Return a tuple of coordinates.
+         """
+         return tuple(self[coord_name] for coord_name in self.coord_names)
+
+     @property
+     def data_vars(self):
+         """
+         Return a tuple of data variables.
+         """
+         return tuple(self[var_name] for var_name in self.data_var_names)
+
+     @property
+     def variables(self):
+         """
+         Return a tuple of variables.
+         """
+         return tuple(self[var_name] for var_name in self.var_names)
+
+     def prune(self, timestamp=None, reindex=False):
+         """
+         Prune deleted data from the file and return the number of removed items. If a timestamp is passed, keys/values older than the timestamp are also removed. The booklet file can also be reindexed: False does no reindexing, True increases n_buckets to a preassigned value, and an int sets n_buckets explicitly. True can only be used if the default n_buckets was used at the original initialisation.
+         """
+         return self._blt.prune(timestamp, reindex)
+
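+     # A minimal pruning sketch:
+     #
+     #     n_removed = ds.prune()
+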
+     # def sync(self):
+     #     """
+
+     #     """
+     #     self._blt.sync()
+
+
+
+
+ class DatasetView(DatasetBase):
+     """
+     A read-only view of a Dataset filtered by a selection.
+     """
+     def __init__(self, dataset, sel):
+         """
+         Should only be created via Dataset.sel or Dataset.sel_loc.
+         """
+         self._dataset = dataset
+         self._sel = sel
+         self._blt = dataset._blt
+         self._has_load_items = dataset._has_load_items
+         self.writable = False
+         self.file_path = dataset.file_path
+         self._sys_meta = dataset._sys_meta
+         self._compressor = dataset._compressor
+         self.compression = dataset.compression
+         self.compression_level = dataset.compression_level
+         self.attrs = dataset.attrs
+         self._var_cache = dataset._var_cache
+
+
+     def get(self, var_name):
+         """
+         Get a variable contained within the dataset.
+         """
+         if self._sel is not None:
+             if var_name not in self._sel:
+                 raise ValueError(f'The Variable {var_name} does not exist in view.')
+
+         return self._dataset.get(var_name)[self._sel[var_name]]
+
+
+     @property
+     def is_open(self):
+         return self._dataset.is_open
+
+     @property
+     def var_names(self):
+         """
+         Return a tuple of all the variable names (coord and data variables).
+         """
+         return tuple(self._sel.keys())
+
+     @property
+     def coord_names(self):
+         """
+         Return a tuple of all the coordinate names.
+         """
+         return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.CoordinateVariable) if k in self._sel)
+
+     @property
+     def data_var_names(self):
+         """
+         Return a tuple of all the data variable names.
+         """
+         return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.DataVariable) if k in self._sel)
+
+     @property
+     def coords(self):
+         return tuple(self[coord_name][self._sel[coord_name]] for coord_name in self.coord_names if coord_name in self._sel)
+
+     @property
+     def data_vars(self):
+         return tuple(self[var_name][self._sel[var_name]] for var_name in self.data_var_names if var_name in self._sel)
+
+     @property
+     def variables(self):
+         return tuple(self[var_name][self._sel[var_name]] for var_name in self.var_names if var_name in self._sel)
+
+
+
+ class EDataset(Dataset):
+     """
+     A Dataset that is linked with a remote S3 database via ebooklet.
+     """
+     def changes(self):
+         """
+         Return a Change object of the changes that have occurred during this session.
+         """
+         return self._blt.changes()
+
+     def delete_remote(self):
+         """
+         Completely delete the remote dataset, but keep the local dataset.
+         """
+         self._blt.delete_remote()
+
+     def copy_remote(self, remote_conn: "ebooklet.S3Connection"):  # quoted: ebooklet may not be installed
+         """
+         Copy the entire remote dataset to another remote location. The new location must be empty.
+         """
+         self._blt.copy_remote(remote_conn)
+
+
+
+ #######################################################
+ ### Open functions
+
+
+ def open_dataset(file_path: Union[str, pathlib.Path], flag: str = "r", compression: str='zstd', compression_level: int=None, **kwargs):
+     """
+     Open a cfdb dataset. This uses the Python package booklet for managing data in a single file.
+
+     Parameters
+     ----------
+     file_path: str or pathlib.Path
+         It must be a path to a local file location. If you want to use a tempfile, then use the name from an initialized NamedTemporaryFile instance.
+     flag: str
+         Flag associated with how the file is opened according to the dbm style. See below for details.
+     compression: str
+         The compression algorithm used for compressing all data. Must be either zstd or lz4. The option zstd has a very good combination of compression ratio and speed, while lz4 has a stronger emphasis on speed (and is lightning fast). Default is zstd.
+     compression_level: int or None
+         The compression level used by the compression algorithm. Setting this to None will use the default, which is 1 for both compression options.
+     kwargs
+         Any kwargs that can be passed to booklet.open.
+
+     Returns
+     -------
+     cfdb.Dataset
+
+     The optional *flag* argument can be:
+
+     +---------+-------------------------------------------+
+     | Value   | Meaning                                   |
+     +=========+===========================================+
+     | ``'r'`` | Open existing database for reading only   |
+     |         | (default)                                 |
+     +---------+-------------------------------------------+
+     | ``'w'`` | Open existing database for reading and    |
+     |         | writing                                   |
+     +---------+-------------------------------------------+
+     | ``'c'`` | Open database for reading and writing,    |
+     |         | creating it if it doesn't exist           |
+     +---------+-------------------------------------------+
+     | ``'n'`` | Always create a new, empty database, open |
+     |         | for reading and writing                   |
+     +---------+-------------------------------------------+
+     """
+     if 'n_buckets' not in kwargs:
+         kwargs['n_buckets'] = utils.default_n_buckets
+
+     fp = pathlib.Path(file_path)
+     fp_exists = fp.exists()
+     open_blt = booklet.open(file_path, flag, key_serializer='str', **kwargs)
+
+     if not fp_exists or flag == 'n':
+         create = True
+     else:
+         create = False
+
+     return Dataset(file_path, open_blt, create, compression, compression_level)
+
+
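+ # A minimal open/create sketch (the file path and attribute are illustrative):
+ #
+ #     with open_dataset('test.blt', flag='n') as ds:
+ #         ds.attrs.update({'title': 'example'})
+ #
+ #     ds = open_dataset('test.blt')  # read-only
+ #     ds.close()
+
+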
+ # The ebooklet annotation is quoted so the module still imports when ebooklet
+ # is not installed (the ImportError is raised inside the function instead).
+ def open_edataset(remote_conn: Union["ebooklet.S3Connection", str, dict],
+                   file_path: Union[str, pathlib.Path],
+                   flag: str = "r",
+                   compression: str='zstd',
+                   compression_level: int=1,
+                   **kwargs):
+     """
+     Open a cfdb dataset that is linked with a remote S3 database.
+
+     Parameters
+     ----------
+     remote_conn: S3Connection, str, or dict
+         The object to connect to a remote. It can be an S3Connection object, an http url string, or a dict with the parameters for initializing an S3Connection object.
+     file_path: str or pathlib.Path
+         It must be a path to a local file location. If you want to use a tempfile, then use the name from an initialized NamedTemporaryFile instance.
+     flag: str
+         Flag associated with how the file is opened according to the dbm style. See below for details.
+     compression: str
+         The compression algorithm used for compressing all data. Must be either zstd or lz4. The option zstd has a very good combination of compression ratio and speed, while lz4 has a stronger emphasis on speed (and is lightning fast). Default is zstd.
+     compression_level: int or None
+         The compression level used by the compression algorithm. Setting this to None will use the default, which is 1 for both compression options.
+     kwargs
+         Any kwargs that can be passed to ebooklet.open.
+
+     Returns
+     -------
+     cfdb.EDataset
+
+     The optional *flag* argument can be:
+
+     +---------+-------------------------------------------+
+     | Value   | Meaning                                   |
+     +=========+===========================================+
+     | ``'r'`` | Open existing database for reading only   |
+     |         | (default)                                 |
+     +---------+-------------------------------------------+
+     | ``'w'`` | Open existing database for reading and    |
+     |         | writing                                   |
+     +---------+-------------------------------------------+
+     | ``'c'`` | Open database for reading and writing,    |
+     |         | creating it if it doesn't exist           |
+     +---------+-------------------------------------------+
+     | ``'n'`` | Always create a new, empty database, open |
+     |         | for reading and writing                   |
+     +---------+-------------------------------------------+
+     """
+     if not import_ebooklet:
+         raise ImportError('ebooklet must be installed to open ebooklets.')
+
+     if 'n_buckets' not in kwargs:
+         kwargs['n_buckets'] = utils.default_n_buckets
+
+     fp = pathlib.Path(file_path)
+     fp_exists = fp.exists()
+     open_blt = ebooklet.open(remote_conn, file_path, flag, **kwargs)
+
+     if (not fp_exists or flag == 'n') and open_blt.writable:
+         create = True
+     else:
+         create = False
+
+     return EDataset(file_path, open_blt, create, compression, compression_level)
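+
+
+ # A minimal remote-linked sketch (the connection URL and file path are
+ # illustrative):
+ #
+ #     ds = open_edataset('https://example-bucket.s3.example.com/cfdb-file', 'test.blt')
+ #     changes = ds.changes()
+ #     ds.close()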