cfdb 0.1.0-py3-none-any.whl → 0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cfdb/__init__.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """CF conventions multi-dimensional array database on top of Booklet"""
2
2
  from cfdb.main import open_dataset, open_edataset
3
3
  from cfdb.utils import compute_scale_and_offset
4
+ from cfdb.tools import netcdf4_to_cfdb, cfdb_to_netcdf4
4
5
  from rechunkit import guess_chunk_shape
5
6
 
6
- __version__ = '0.1.0'
7
+ __version__ = '0.1.1'
cfdb/indexers.py CHANGED
@@ -287,11 +287,8 @@ def slices_to_chunks_keys(slices, var_name, var_chunk_shape, clip_ends=True):
287
287
  """
288
288
  starts = tuple(s.start for s in slices)
289
289
  stops = tuple(s.stop for s in slices)
290
- # chunk_iter1 = rechunkit.chunk_range(starts, stops, var_chunk_shape, clip_ends=False)
291
290
  chunk_iter2 = rechunkit.chunk_range(starts, stops, var_chunk_shape, clip_ends=clip_ends)
292
- # for full_chunk, partial_chunk in zip(chunk_iter1, chunk_iter2):
293
291
  for partial_chunk in chunk_iter2:
294
- # starts_chunk = tuple(s.start for s in full_chunk)
295
292
  starts_chunk = tuple((pc.start//cs) * cs for cs, pc in zip(var_chunk_shape, partial_chunk))
296
293
  new_key = utils.make_var_chunk_key(var_name, starts_chunk)
297
294
 
cfdb/main.py CHANGED
@@ -6,7 +6,7 @@ Created on Tue Jan 7 11:25:06 2025
6
6
  @author: mike
7
7
  """
8
8
  import booklet
9
- from typing import Union
9
+ from typing import Union, List
10
10
  import pathlib
11
11
  import msgspec
12
12
  import weakref
@@ -62,50 +62,10 @@ class DatasetBase:
62
62
  def __contains__(self, key):
63
63
  return key in self.var_names
64
64
 
65
- # def get(self, var_name):
66
- # """
67
-
68
- # """
69
- # if not isinstance(var_name, str):
70
- # raise TypeError('var_name must be a string.')
71
-
72
- # if var_name not in self:
73
- # raise ValueError(f'The Variable {var_name} does not exist.')
74
-
75
- # if self._sel is not None:
76
- # if var_name not in self._sel:
77
- # raise ValueError(f'The Variable {var_name} does not exist in view.')
78
-
79
- # if var_name not in self._var_cache:
80
- # var_meta = self._sys_meta.variables[var_name]
81
- # if isinstance(var_meta, data_models.DataVariable):
82
- # var = sc.DataVariable(var_name, self)
83
- # else:
84
- # var = sc.Coordinate(var_name, self)
85
- # self._var_cache[var_name] = var
86
-
87
- # if self._sel is None:
88
- # return self._var_cache[var_name]
89
- # else:
90
- # return self._var_cache[var_name][self._sel[var_name]]
91
-
92
- # var_meta = self._sys_meta.variables[var_name]
93
- # if isinstance(var_meta, data_models.DataVariable):
94
- # var = sc.DataVariable(var_name, self)
95
- # else:
96
- # var = sc.Coordinate(var_name, self)
97
-
98
- # return var
99
-
100
65
 
101
66
  def __getitem__(self, key):
102
67
  return self.get(key)
103
68
 
104
- # def __setitem__(self, key, value):
105
- # if isinstance(value, sc.Variable):
106
- # setattr(self, key, value)
107
- # else:
108
- # raise TypeError('Assigned value must be a Variable or Coordinate object.')
109
69
 
110
70
  def __delitem__(self, key):
111
71
  if key not in self:
@@ -168,8 +128,29 @@ class DatasetBase:
168
128
  """
169
129
  return utils.file_summary(self)
170
130
 
131
+ @property
132
+ def coords(self):
133
+ """
134
+ Return a tuple of coords.
135
+ """
136
+ return tuple(self[coord_name] for coord_name in self.coord_names)
171
137
 
172
- def sel(self, sel: dict):
138
+ @property
139
+ def data_vars(self):
140
+ """
141
+ Return a tuple of data variables.
142
+ """
143
+ return tuple(self[var_name] for var_name in self.data_var_names)
144
+
145
+ @property
146
+ def variables(self):
147
+ """
148
+ Return a tuple of variables.
149
+ """
150
+ return tuple(self[var_name] for var_name in self.var_names)
151
+
152
+
153
+ def select(self, sel: dict):
173
154
  """
174
155
  Filter the dataset variables by a selection of the coordinate positions.
175
156
  """
@@ -200,9 +181,9 @@ class DatasetBase:
200
181
  return DatasetView(self, _sel)
201
182
 
202
183
 
203
- def sel_loc(self, sel: dict):
184
+ def select_loc(self, sel: dict):
204
185
  """
205
- Filter the dataset variables by a selection of the coordinate locations.
186
+ Filter the dataset variables by a selection of the coordinate locations/values.
206
187
  """
207
188
  ## Checks on input
208
189
  coord_names = self.coord_names
@@ -282,7 +263,7 @@ class DatasetBase:
282
263
  # return x1
283
264
 
284
265
 
285
- def copy(self, file_path):
266
+ def copy(self, file_path: Union[str, pathlib.Path], include_data_vars: List[str]=None, exclude_data_vars: List[str]=None):
286
267
  """
287
268
 
288
269
  """
@@ -290,14 +271,18 @@ class DatasetBase:
290
271
 
291
272
  new_ds = open_dataset(file_path, 'n', compression=self.compression, compression_level=self.compression_level, **kwargs)
292
273
 
293
- for coord in self.coords:
294
- new_coord = new_ds.create.coord.like(coord.name, coord, True)
274
+ data_var_names, coord_names = utils.filter_var_names(self, include_data_vars, exclude_data_vars)
275
+
276
+ for coord_name in coord_names:
277
+ coord = self[coord_name]
278
+ new_coord = new_ds.create.coord.like(coord_name, coord, True)
295
279
  new_coord.attrs.update(coord.attrs.data)
296
280
 
297
- for data_var in self.data_vars:
298
- new_data_var = new_ds.create.data_var.like(data_var.name, data_var)
281
+ for data_var_name in data_var_names:
282
+ data_var = self[data_var_name]
283
+ new_data_var = new_ds.create.data_var.like(data_var_name, data_var)
299
284
  new_data_var.attrs.update(data_var.attrs.data)
300
- for write_chunk, data in data_var.iter_chunks(False):
285
+ for write_chunk, data in data_var.iter_chunks(decoded=False):
301
286
  new_data_var.set(write_chunk, data, False)
302
287
 
303
288
  new_ds.attrs.update(self.attrs.data)
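The new include_data_vars/exclude_data_vars filtering on copy, together with the sel → select rename above, can be combined along these lines (a hedged sketch; the file paths and the 'temperature' variable name are placeholders, not names from the package):

```python
import cfdb

with cfdb.open_dataset('/path/to/source.cfdb') as ds:
    # Positional selection; sel() is renamed to select() in 0.1.1
    view = ds.select({'time': slice(0, 100)})
    print(view)

    # Copy one data variable (plus the coords it uses) to a new file
    new_ds = ds.copy('/path/to/subset.cfdb', include_data_vars=['temperature'])
    new_ds.close()

    # Export everything except that variable to netcdf4 (requires h5netcdf)
    ds.to_netcdf4('/path/to/subset.nc', exclude_data_vars=['temperature'])
```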
@@ -305,90 +290,99 @@ class DatasetBase:
305
290
  return new_ds
306
291
 
307
292
 
308
- def to_netcdf4(self, file_path: Union[str, pathlib.Path], compression: str='gzip', **file_kwargs):
293
+ def to_netcdf4(self, file_path: Union[str, pathlib.Path], compression: str='gzip', include_data_vars: List[str]=None, exclude_data_vars: List[str]=None, **file_kwargs):
309
294
  """
310
295
  Save a dataset to a netcdf4 file using h5netcdf.
311
296
  """
312
297
  if not import_h5netcdf:
313
298
  raise ImportError('h5netcdf must be installed to save files to netcdf4.')
314
299
 
315
- h5 = h5netcdf.File(file_path, 'w', **file_kwargs)
316
-
317
- # dims/coords
318
- for coord in self.coords:
319
- name = coord.name
320
- h5.dimensions[name] = coord.shape[0]
321
- coord_len = coord.shape[0]
322
- chunk_len = coord.chunk_shape[0]
323
- if chunk_len > coord_len:
324
- chunk_shape = (coord_len,)
325
- else:
326
- chunk_shape = (chunk_len,)
327
-
328
- h5_coord = h5.create_variable(name, (name,), coord.dtype_encoded, compression=compression, chunks=chunk_shape, fillvalue=coord.fillvalue)
329
- attrs = deepcopy(coord.attrs.data)
330
- dtype_decoded, dtype_encoded = utils.parse_dtype_names(coord.dtype_decoded, coord.dtype_encoded)
331
- if coord.step is not None:
332
- attrs['step'] = coord.step
333
- if coord.scale_factor is not None:
334
- attrs['scale_factor'] = coord.scale_factor
335
- elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
336
- attrs['scale_factor'] = 1
337
- if coord.add_offset is not None:
338
- attrs['add_offset'] = coord.add_offset
339
- elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
340
- attrs['add_offset'] = 0
341
- if coord.dtype_decoded.kind == 'M':
342
- units = utils.parse_cf_time_units(coord.dtype_decoded)
343
- calendar = "proleptic_gregorian"
344
- attrs['units'] = units
345
- attrs['calendar'] = calendar
346
- attrs['standard_name'] = 'time'
347
-
348
- attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded, '_FillValue': coord.fillvalue})
349
- h5_coord.attrs.update(attrs)
350
-
351
- for write_chunk, data in coord.iter_chunks(decoded=False):
352
- h5_coord[write_chunk] = data
353
-
354
- # Data vars
355
- for data_var in self.data_vars:
356
- name = data_var.name
357
- chunk_shape = []
358
- for s, cs in zip(data_var.shape, data_var.chunk_shape):
359
- if cs > s:
360
- chunk_shape.append(s)
300
+ data_var_names, coord_names = utils.filter_var_names(self, include_data_vars, exclude_data_vars)
301
+
302
+ with h5netcdf.File(file_path, 'w', **file_kwargs) as h5:
303
+ # dims/coords
304
+ for coord_name in coord_names:
305
+ coord = self[coord_name]
306
+ h5.dimensions[coord_name] = coord.shape[0]
307
+ coord_len = coord.shape[0]
308
+ chunk_len = coord.chunk_shape[0]
309
+ if chunk_len > coord_len:
310
+ chunk_shape = (coord_len,)
361
311
  else:
362
- chunk_shape.append(cs)
363
-
364
- h5_data_var = h5.create_variable(name, data_var.coord_names, data_var.dtype_encoded, compression=compression, chunks=tuple(chunk_shape), fillvalue=data_var.fillvalue)
365
- attrs = deepcopy(data_var.attrs.data)
366
- dtype_decoded, dtype_encoded = utils.parse_dtype_names(data_var.dtype_decoded, data_var.dtype_encoded)
367
- if data_var.scale_factor is not None:
368
- attrs['scale_factor'] = data_var.scale_factor
369
- elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
370
- attrs['scale_factor'] = 1
371
- if data_var.add_offset is not None:
372
- attrs['add_offset'] = data_var.add_offset
373
- elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
374
- attrs['add_offset'] = 0
375
- if data_var.dtype_decoded.kind == 'M':
376
- units = utils.parse_cf_time_units(data_var.dtype_decoded)
377
- calendar = "proleptic_gregorian"
378
- attrs['units'] = units
379
- attrs['calendar'] = calendar
380
- attrs['standard_name'] = 'time'
381
-
382
- attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded, '_FillValue': data_var.fillvalue})
383
- h5_data_var.attrs.update(attrs)
384
-
385
- for write_chunk, data in data_var.iter_chunks(decoded=False):
386
- h5_data_var[write_chunk] = data
387
-
388
- # Add global attrs
389
- h5.attrs.update(self.attrs.data)
390
-
391
- h5.close()
312
+ chunk_shape = (chunk_len,)
313
+
314
+ h5_coord = h5.create_variable(coord_name, (coord_name,), coord.dtype_encoded, compression=compression, chunks=chunk_shape, fillvalue=coord.fillvalue)
315
+ attrs = deepcopy(coord.attrs.data)
316
+ dtype_decoded, dtype_encoded = utils.parse_dtype_names(coord.dtype_decoded, coord.dtype_encoded)
317
+ if coord.step is not None:
318
+ attrs['step'] = coord.step
319
+ if coord.scale_factor is not None:
320
+ attrs['scale_factor'] = coord.scale_factor
321
+ elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
322
+ attrs['scale_factor'] = 1
323
+ if coord.add_offset is not None:
324
+ attrs['add_offset'] = coord.add_offset
325
+ elif coord.dtype_decoded.kind == 'f' and coord.dtype_encoded.kind in ('u', 'i'):
326
+ attrs['add_offset'] = 0
327
+ if coord.dtype_decoded.kind == 'M':
328
+ units = utils.parse_cf_time_units(coord.dtype_decoded)
329
+ calendar = "proleptic_gregorian"
330
+ attrs['units'] = units
331
+ attrs['calendar'] = calendar
332
+ attrs['standard_name'] = 'time'
333
+
334
+ if coord.fillvalue is not None:
335
+ attrs['_FillValue'] = coord.fillvalue
336
+
337
+ attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded})
338
+ try:
339
+ h5_coord.attrs.update(attrs)
340
+ except Exception as err:
341
+ print(attrs)
342
+ raise err
343
+
344
+ for write_chunk, data in coord.iter_chunks(decoded=False):
345
+ h5_coord[write_chunk] = data
346
+
347
+ # Data vars
348
+ for data_var_name in data_var_names:
349
+ data_var = self[data_var_name]
350
+ chunk_shape = []
351
+ for s, cs in zip(data_var.shape, data_var.chunk_shape):
352
+ if cs > s:
353
+ chunk_shape.append(s)
354
+ else:
355
+ chunk_shape.append(cs)
356
+
357
+ h5_data_var = h5.create_variable(data_var_name, data_var.coord_names, data_var.dtype_encoded, compression=compression, chunks=tuple(chunk_shape), fillvalue=data_var.fillvalue)
358
+ attrs = deepcopy(data_var.attrs.data)
359
+ dtype_decoded, dtype_encoded = utils.parse_dtype_names(data_var.dtype_decoded, data_var.dtype_encoded)
360
+ if data_var.scale_factor is not None:
361
+ attrs['scale_factor'] = data_var.scale_factor
362
+ elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
363
+ attrs['scale_factor'] = 1
364
+ if data_var.add_offset is not None:
365
+ attrs['add_offset'] = data_var.add_offset
366
+ elif data_var.dtype_decoded.kind == 'f' and data_var.dtype_encoded.kind in ('u', 'i'):
367
+ attrs['add_offset'] = 0
368
+ if data_var.dtype_decoded.kind == 'M':
369
+ units = utils.parse_cf_time_units(data_var.dtype_decoded)
370
+ calendar = "proleptic_gregorian"
371
+ attrs['units'] = units
372
+ attrs['calendar'] = calendar
373
+ attrs['standard_name'] = 'time'
374
+
375
+ if coord.fillvalue is not None:
376
+ attrs['_FillValue'] = data_var.fillvalue
377
+
378
+ attrs.update({'dtype_decoded': dtype_decoded, 'dtype_encoded': dtype_encoded, 'dtype': dtype_encoded})
379
+ h5_data_var.attrs.update(attrs)
380
+
381
+ for write_chunk, data in data_var.iter_chunks(decoded=False):
382
+ h5_data_var[write_chunk] = data
383
+
384
+ # Add global attrs
385
+ h5.attrs.update(self.attrs.data)
392
386
 
393
387
 
394
388
  class Dataset(DatasetBase):
@@ -401,7 +395,7 @@ class Dataset(DatasetBase):
401
395
  """
402
396
  self._blt = open_blt
403
397
  self.writable = self._blt.writable
404
- self.file_path = file_path
398
+ self.file_path = pathlib.Path(file_path)
405
399
  self.is_open = True
406
400
 
407
401
  if hasattr(self._blt, 'load_items'):
@@ -503,27 +497,6 @@ class Dataset(DatasetBase):
503
497
  return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.DataVariable))
504
498
 
505
499
 
506
- @property
507
- def coords(self):
508
- """
509
- Return a tuple of coords.
510
- """
511
- return tuple(self[coord_name] for coord_name in self.coord_names)
512
-
513
- @property
514
- def data_vars(self):
515
- """
516
- Return a tuple of data variables.
517
- """
518
- return tuple(self[var_name] for var_name in self.data_var_names)
519
-
520
- @property
521
- def variables(self):
522
- """
523
- Return a tuple of variables.
524
- """
525
- return tuple(self[var_name] for var_name in self.var_names)
526
-
527
500
  def prune(self, timestamp=None, reindex=False):
528
501
  """
529
502
  Prunes deleted data from the file. Returns the number of removed items. The method can also remove keys/values older than the timestamp. The user can also reindex the booklet file. False does no reindexing, True increases the n_buckets to a preassigned value, and an int sets n_buckets directly. True can only be used if the default n_buckets were used at original initialisation.
@@ -597,17 +570,17 @@ class DatasetView(DatasetBase):
597
570
  """
598
571
  return tuple(k for k, v in self._sys_meta.variables.items() if isinstance(v, data_models.DataVariable) if k in self._sel)
599
572
 
600
- @property
601
- def coords(self):
602
- return tuple(self[coord_name][self._sel[coord_name]] for coord_name in self.coord_names if coord_name in self._sel)
573
+ # @property
574
+ # def coords(self):
575
+ # return tuple(self[coord_name][self._sel[coord_name]] for coord_name in self.coord_names if coord_name in self._sel)
603
576
 
604
- @property
605
- def data_vars(self):
606
- return tuple(self[var_name][self._sel[var_name]] for var_name in self.data_var_names if var_name in self._sel)
577
+ # @property
578
+ # def data_vars(self):
579
+ # return tuple(self[var_name][self._sel[var_name]] for var_name in self.data_var_names if var_name in self._sel)
607
580
 
608
- @property
609
- def variables(self):
610
- return tuple(self[var_name][self._sel[var_name]] for var_name in self.var_names if var_name in self._sel)
581
+ # @property
582
+ # def variables(self):
583
+ # return tuple(self[var_name][self._sel[var_name]] for var_name in self.var_names if var_name in self._sel)
611
584
 
612
585
 
613
586
 
@@ -689,7 +662,7 @@ def open_dataset(file_path: Union[str, pathlib.Path], flag: str = "r", compressi
689
662
  else:
690
663
  create = False
691
664
 
692
- return Dataset(file_path, open_blt, create, compression, compression_level)
665
+ return Dataset(fp, open_blt, create, compression, compression_level)
693
666
 
694
667
 
695
668
  def open_edataset(remote_conn: Union[ebooklet.S3Connection, str, dict],
@@ -699,7 +672,7 @@ def open_edataset(remote_conn: Union[ebooklet.S3Connection, str, dict],
699
672
  compression_level: int=1,
700
673
  **kwargs):
701
674
  """
702
- Open a cfdb that is linked with a remote S3 database.
675
+ Open a cfdb that is linked with a remote S3 database.
703
676
 
704
677
  Parameters
705
678
  -----------
@@ -754,7 +727,7 @@ def open_edataset(remote_conn: Union[ebooklet.S3Connection, str, dict],
754
727
  else:
755
728
  create = False
756
729
 
757
- return EDataset(file_path, open_blt, create, compression, compression_level)
730
+ return EDataset(fp, open_blt, create, compression, compression_level)
758
731
 
759
732
 
760
733
 
cfdb/support_classes.py CHANGED
@@ -65,6 +65,7 @@ class Rechunker:
65
65
  shape of the chunk
66
66
  """
67
67
  chunk_shape = rechunkit.guess_chunk_shape(self._var.shape, self._var.dtype_encoded, target_chunk_size)
68
+
68
69
  return chunk_shape
69
70
 
70
71
  def calc_ideal_read_chunk_shape(self, target_chunk_shape: Tuple[int, ...]):
@@ -556,14 +557,16 @@ class Variable:
556
557
  # TODO
557
558
 
558
559
 
559
- def iter_chunks(self, decoded=True):
560
+ def iter_chunks(self, include_data=True, decoded=True):
560
561
  """
561
- Iterate through the chunks of the variable and return numpy arrays associated with the index slices. This should be the main way for users to get large amounts of data from a variable. The "ends" of the data will be clipped to the shape of the variable (i.e. not all chunks will be the chunk_shape).
562
+ Iterate through the chunks of the variable and optionally return numpy arrays associated with the index slices. This should be the main way for users to get large amounts of data from a variable. The "ends" of the data will be clipped to the shape of the variable (i.e. not all chunks will be the chunk_shape).
562
563
 
563
564
  Parameters
564
565
  ----------
565
566
  decoded: bool
566
567
  Should the data be decoded?
568
+ include_data: bool
569
+ Should the data be included in the output?
567
570
 
568
571
  Returns
569
572
  -------
@@ -577,19 +580,29 @@ class Variable:
577
580
  blank = self._make_blank_chunk_array(decoded)
578
581
 
579
582
  slices = indexers.index_combo_all(self._sel, coord_origins, self.shape)
580
- for target_chunk, source_chunk, blt_key in indexers.slices_to_chunks_keys(slices, self.name, self.chunk_shape):
581
- # print(target_chunk, source_chunk, blt_key)
582
- b1 = self._blt.get(blt_key)
583
- if b1 is None:
584
- blank_slices = tuple(slice(0, sc.stop - sc.start) for sc in source_chunk)
585
- yield target_chunk, blank[blank_slices]
586
- else:
587
- if decoded:
588
- data = self._encoder.decode(self._encoder.from_bytes(b1))
583
+
584
+ if include_data:
585
+ for target_chunk, source_chunk, blt_key in indexers.slices_to_chunks_keys(slices, self.name, self.chunk_shape):
586
+ # print(target_chunk, source_chunk, blt_key)
587
+ b1 = self._blt.get(blt_key)
588
+ if b1 is None:
589
+ blank_slices = tuple(slice(0, sc.stop - sc.start) for sc in source_chunk)
590
+ yield target_chunk, blank[blank_slices]
589
591
  else:
590
- data = self._encoder.from_bytes(b1)
592
+ if decoded:
593
+ data = self._encoder.decode(self._encoder.from_bytes(b1))
594
+ else:
595
+ data = self._encoder.from_bytes(b1)
591
596
 
592
- yield target_chunk, data[source_chunk]
597
+ yield target_chunk, data[source_chunk]
598
+ else:
599
+ starts = tuple(s.start for s in slices)
600
+ stops = tuple(s.stop for s in slices)
601
+ chunk_iter2 = rechunkit.chunk_range(starts, stops, self.chunk_shape)
602
+ for partial_chunk in chunk_iter2:
603
+ target_chunk = tuple(slice(s.start - start, s.stop - start) for start, s in zip(starts, partial_chunk))
604
+
605
+ yield target_chunk
593
606
 
594
607
  def __iter__(self):
595
608
  return self.iter_chunks()
@@ -859,7 +872,7 @@ class Coordinate(CoordinateView):
859
872
 
860
873
  def append(self, data):
861
874
  """
862
- Append data to the end of the coordinate. The extra length will be added to the associated data variables with the fillvalue.
875
+ Append data to the end of the coordinate. The extra length will be added to the associated data variables with the fillvalue.
863
876
  """
864
877
  if not self.writable:
865
878
  raise ValueError('Dataset is not writable.')
@@ -947,6 +960,29 @@ class DataVariableView(Variable):
947
960
  self._blt.set(blt_key, self._encoder.to_bytes(new_data))
948
961
 
949
962
 
963
+ # def set_chunk(self, sel, data, encode=True):
964
+ # """
965
+ # Set the first chunk associated with the selection.
966
+ # """
967
+ # if not self.writable:
968
+ # raise ValueError('Dataset is not writable.')
969
+
970
+ # if sel is None:
971
+ # sel = self._sel
972
+ # coord_origins = self.get_coord_origins()
973
+ # slices = indexers.index_combo_all(sel, coord_origins, self.shape)
974
+ # starts_chunk = tuple((pc.start//cs) * cs for cs, pc in zip(self.chunk_shape, slices))
975
+ # chunk_stop = tuple(min(cs, s - sc) for cs, sc, s in zip(self.chunk_shape, starts_chunk, self.shape))
976
+ # if data.shape != chunk_stop:
977
+ # raise ValueError(f'The shape of this chunk should be {chunk_stop}, but the data passed is {data.shape}')
978
+
979
+ # blt_key = utils.make_var_chunk_key(self.name, starts_chunk)
980
+ # if encode:
981
+ # self._blt.set(blt_key, self._encoder.to_bytes(self._encoder.encode(data)))
982
+ # else:
983
+ # self._blt.set(blt_key, self._encoder.to_bytes(data))
984
+
985
+
950
986
  def __setitem__(self, sel, data):
951
987
  """
952
988
 
@@ -954,14 +990,14 @@ class DataVariableView(Variable):
954
990
  self.set(sel, data)
955
991
 
956
992
 
957
- def groupby(self, coord_names: Iterable, max_mem: int=2**27, decoded=True):
993
+ def groupby(self, coord_names: Union[str, Iterable], max_mem: int=2**27, decoded=True):
958
994
  """
959
995
  This method takes one or more coord names to group by and returns a generator. This generator will return chunks of data according to these groupings with the associated tuple of slices. The more max_mem provided, the more efficient the chunking.
960
996
  This is effectively the rechunking method where each coord name supplied is set to 1 and all other coords are set to their full length.
961
997
 
962
998
  Parameters
963
999
  ----------
964
- coord_names: Iterable
1000
+ coord_names: str or Iterable
965
1001
  The coord names to group by.
966
1002
  max_mem: int
967
1003
  The max allocated memory to perform the chunking operation in bytes. This will only be as large as necessary for an optimum size chunk for the rechunking.
cfdb/tools.py ADDED
@@ -0,0 +1,427 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Created on Thu Jul 17 09:04:49 2025
5
+
6
+ @author: mike
7
+ """
8
+ import numpy as np
9
+ import rechunkit
10
+ import copy
11
+ from typing import List, Union
12
+ import pathlib
13
+
14
+ try:
15
+ import h5netcdf
16
+ import_h5netcdf = True
17
+ except ImportError:
18
+ import_h5netcdf = False
19
+
20
+ from . import utils, main, indexers, support_classes as sc
21
+ # import utils, main, indexers, support_classes as sc
22
+
23
+ ##########################################
24
+ ### Parameters
25
+
26
+ inv_time_units_dict = {value: key for key, value in utils.time_units_dict.items()}
27
+
28
+
29
+
30
+ #########################################
31
+ ### Functions
32
+
33
+
34
+ class H5DataVarReader:
35
+ """
36
+
37
+ """
38
+ def __init__(self, h5_data_var, inverted_coords, shape):
39
+ """
40
+
41
+ """
42
+ self.is_inverted = any(inverted_coords)
43
+ self.data_var = h5_data_var
44
+ self.inverted_coords = inverted_coords
45
+ self.shape = shape
46
+
47
+ def get(self, slices):
48
+ """
49
+
50
+ """
51
+ if self.is_inverted:
52
+ source_slices = tuple(slice(s - cs.stop, s - cs.start) if inverted else cs for inverted, cs, s in zip(self.inverted_coords, slices, self.shape))
53
+ data = np.flip(self.data_var[source_slices], np.nonzero(self.inverted_coords)[0])
54
+ else:
55
+ data = self.data_var[slices]
56
+
57
+ return data
58
+
59
+
60
+ def filter_var_names_h5(h5, include_data_vars, exclude_data_vars):
61
+ """
62
+
63
+ """
64
+ coord_names_all = set(h5.dims)
65
+ data_var_names_all = set(h5.variables).difference(coord_names_all)
66
+
67
+ if include_data_vars is not None:
68
+ if isinstance(include_data_vars, str):
69
+ include_data_vars = [include_data_vars]
70
+ data_var_names = set(include_data_vars)
71
+ if not data_var_names.issubset(data_var_names_all):
72
+ raise ValueError(f'{data_var_names} is not a subset of {data_var_names_all}')
73
+ elif exclude_data_vars is not None:
74
+ if isinstance(exclude_data_vars, str):
75
+ exclude_data_vars = [exclude_data_vars]
76
+ data_var_names = data_var_names_all.difference(set(exclude_data_vars))
77
+ else:
78
+ data_var_names = data_var_names_all
79
+
80
+ coord_names = set()
81
+ for data_var_name in data_var_names:
82
+ data_var = h5[data_var_name]
83
+ coord_names.update(data_var.dimensions)
84
+
85
+ return data_var_names, coord_names
86
+
87
+
88
+ def parse_attrs(attrs):
89
+ """
90
+
91
+ """
92
+ input_params = {}
93
+ for attr, value in copy.deepcopy(attrs).items():
94
+ if attr == 'scale_factor':
95
+ input_params['scale_factor'] = float(attrs.pop(attr))
96
+ elif attr == 'add_offset':
97
+ input_params['add_offset'] = float(attrs.pop(attr))
98
+ elif attr == '_FillValue':
99
+ if value is not None:
100
+ input_params['fillvalue'] = int(attrs.pop(attr))
101
+ elif attr == 'missing_value':
102
+ del attrs['missing_value']
103
+ elif isinstance(value, np.bytes_):
104
+ attrs[attr] = str(value.astype(str))
105
+ elif isinstance(value, np.floating):
106
+ attrs[attr] = float(value)
107
+ elif isinstance(value, np.integer):
108
+ attrs[attr] = int(value)
109
+ elif isinstance(value, np.str_):
110
+ attrs[attr] = str(value)
111
+
112
+ return attrs, input_params
113
+
114
+
115
+ def parse_cf_dates(units, dtype_encoded):
116
+ """
117
+
118
+ """
119
+ if ' since ' in units:
120
+ freq, start_date = units.split(' since ')
121
+ freq_code = inv_time_units_dict[freq]
122
+ origin_date = np.datetime64(start_date, freq_code)
123
+ unix_date = np.datetime64('1970-01-01', freq_code)
124
+ # origin_diff = (unix_date - origin_date).astype(dtype_encoded)
125
+ units = f'{freq} since {str(unix_date)}'
126
+ if freq_code not in ('M', 'D', 'h', 'm'):
127
+ dtype_encoded = np.dtype('int64')
128
+ dtype_decoded = origin_date.dtype
129
+ else:
130
+ dtype_decoded = dtype_encoded
131
+ origin_date = None
132
+
133
+ return units, dtype_decoded, dtype_encoded, origin_date
134
+
135
+
136
+ def netcdf4_to_cfdb(nc_path: Union[str, pathlib.Path], cfdb_path: Union[str, pathlib.Path], sel: dict=None, sel_loc: dict=None, include_data_vars: List[str]=None, exclude_data_vars: List[str]=None, max_mem: int=2**27, **kwargs):
137
+ """
138
+ Simple function to convert a netcdf4 to a cfdb. Selection options are also available. The h5netcdf package must be installed to read netcdf4 files.
139
+
140
+ Parameters
141
+ ----------
142
+ nc_path: str or pathlib.Path
143
+ The source netcdf4 file to be converted.
144
+ cfdb_path: str or pathlib.Path
145
+ The target path for the cfdb.
146
+ sel: dict
147
+ Selection by coordinate indexes.
148
+ sel_loc: dict
149
+ Selection by coordinate values.
150
+ max_mem: int
151
+ The max memory in bytes if required when coordinates are in descending order (and must be resorted in ascending order).
152
+ kwargs
153
+ Any kwargs that can be passed to the cfdb.open_dataset function.
154
+
155
+ Returns
156
+ -------
157
+ None
158
+ """
159
+ if not import_h5netcdf:
160
+ raise ImportError('h5netcdf must be installed to save files to netcdf4.')
161
+
162
+ if (sel is not None) and (sel_loc is not None):
163
+ raise ValueError('Only one of sel or sel_loc can be passed, not both.')
164
+
165
+ ## Get the coordinates data
166
+ inverted_coords = []
167
+ # coords_data = {}
168
+ sel_dict = {}
169
+ with main.open_dataset(cfdb_path, 'n', **kwargs) as ds:
170
+ with h5netcdf.File(nc_path, 'r') as h5:
171
+ dims = tuple(h5.dims)
172
+
173
+ ## Check the selection inputs
174
+ if isinstance(sel, dict):
175
+ for key in sel:
176
+ if key not in dims:
177
+ raise ValueError(f'{key} is not a dimension in the dataset.')
178
+ elif isinstance(sel_loc, dict):
179
+ for key in sel_loc:
180
+ if key not in dims:
181
+ raise ValueError(f'{key} is not a dimension in the dataset.')
182
+
183
+ data_var_names, coord_names = filter_var_names_h5(h5, include_data_vars, exclude_data_vars)
184
+
185
+ for dim in coord_names:
186
+ h5_coord = h5[dim]
187
+ dtype_encoded = h5_coord.dtype
188
+ attrs = dict(h5_coord.attrs)
189
+ attrs, input_params = parse_attrs(attrs)
190
+
191
+ if 'scale_factor' in input_params:
192
+ dtype_decoded = np.dtype('float64')
193
+ elif 'units' in attrs:
194
+ units, dtype_decoded, dtype_encoded, origin_date = parse_cf_dates(attrs['units'], dtype_encoded)
195
+ attrs['units'] = units
196
+ else:
197
+ dtype_decoded = dtype_encoded
198
+
199
+ input_params['dtype_decoded'] = dtype_decoded
200
+ input_params['dtype_encoded'] = dtype_encoded
201
+
202
+ # chunk_start = (0,)
203
+ shape = h5_coord.shape
204
+ chunk_shape = h5_coord.chunks
205
+ if chunk_shape is None:
206
+ chunk_shape = rechunkit.guess_chunk_shape(shape, dtype_encoded)
207
+
208
+ input_params['chunk_shape'] = chunk_shape
209
+
210
+ data = h5_coord[()]
211
+ h5_coord_diff = np.diff(data)
212
+ if h5_coord_diff[0] > 0:
213
+ order_check = np.all(h5_coord_diff > 0)
214
+ inverted = False
215
+ else:
216
+ order_check = np.all(h5_coord_diff < 0)
217
+ inverted = True
218
+
219
+ inverted_coords.append(inverted)
220
+
221
+ if not order_check:
222
+ raise ValueError('Either the coordinate values are not increasing/decreasing or they are not unique.')
223
+
224
+ data = h5_coord[()]
225
+
226
+ if inverted:
227
+ data.sort()
228
+
229
+ ## Decode data if necessary
230
+ if dtype_decoded.kind == 'M':
231
+ data = data + origin_date
232
+ elif 'scale_factor' in input_params:
233
+ if 'add_offset' in input_params:
234
+ add_offset = input_params['add_offset']
235
+ else:
236
+ add_offset = None
237
+ if 'fillvalue' in input_params:
238
+ fillvalue = input_params['fillvalue']
239
+ else:
240
+ fillvalue = None
241
+ encoding = sc.Encoding(chunk_shape, dtype_decoded, dtype_encoded, fillvalue, input_params['scale_factor'], add_offset, None)
242
+
243
+ data = encoding.decode(data)
244
+
245
+ ## Selection
246
+ if isinstance(sel, dict):
247
+ if dim in sel:
248
+ slices = indexers.index_combo_one(sel[dim], (0,), shape, 0)
249
+ data = data[slices]
250
+ else:
251
+ slices = indexers.slice_none((0,), shape, 0)
252
+
253
+ elif isinstance(sel_loc, dict):
254
+ if dim in sel_loc:
255
+ idx = indexers.loc_index_combo_one(sel_loc[dim], data)
256
+ slices = indexers.index_combo_one(idx, (0,), shape, 0)
257
+ data = data[slices]
258
+ else:
259
+ slices = indexers.slice_none((0,), shape, 0)
260
+ else:
261
+ slices = indexers.slice_none((0,), shape, 0)
262
+
263
+ sel_dict[dim] = slices
264
+
265
+ ## Create coord
266
+ coord = ds.create.coord.generic(dim, data=data, **input_params)
267
+ coord.attrs.update(attrs)
268
+
269
+ # coords_data[dim] = {'data': data, 'attrs': attrs, 'input_params': input_params}
270
+
271
+ ## Data Vars
272
+ inverted_coords = tuple(inverted_coords)
273
+ # is_inverted = any(inverted_coords)
274
+
275
+ for var_name in data_var_names:
276
+ h5_var = h5[var_name]
277
+ dtype_encoded = h5_var.dtype
278
+ attrs = dict(h5_var.attrs)
279
+ attrs, input_params = parse_attrs(attrs)
280
+
281
+ if 'scale_factor' in input_params:
282
+ dtype_decoded = np.dtype('float64')
283
+ elif 'units' in attrs:
284
+ units, dtype_decoded, dtype_encoded, origin_date = parse_cf_dates(attrs['units'], dtype_encoded)
285
+ attrs['units'] = units
286
+ else:
287
+ dtype_decoded = dtype_encoded
288
+
289
+ var_sel = tuple(sel_dict[dim] for dim in h5_var.dimensions)
290
+
291
+ # chunk_start = tuple(s.start for s in var_sel)
292
+ # shape = tuple(s.stop - s.start for s in var_sel)
293
+ # chunk_start = tuple(0 for i in range(len(h5_var.shape)))
294
+ shape = h5_var.shape
295
+ chunk_shape = h5_var.chunks
296
+ if chunk_shape is None:
297
+ chunk_shape = rechunkit.guess_chunk_shape(shape, dtype_encoded)
298
+
299
+ data_var = ds.create.data_var.generic(var_name, h5_var.dimensions, dtype_decoded=dtype_decoded, dtype_encoded=dtype_encoded, chunk_shape=chunk_shape, **input_params)
300
+ data_var.attrs.update(attrs)
301
+
302
+ h5_reader = H5DataVarReader(h5_var, inverted_coords, shape)
303
+
304
+ chunks_iter = rechunkit.rechunker(h5_reader.get, shape, dtype_encoded, chunk_shape, chunk_shape, max_mem, var_sel)
305
+ for chunk_slices, encoded_data in chunks_iter:
306
+ if not np.all(encoded_data == data_var.fillvalue):
307
+ data_var.set(chunk_slices, encoded_data, False)
308
+
309
+
310
+ # chunks_iter = rechunkit.chunk_range(chunk_start, shape, chunk_shape)
311
+ # for chunk_slices in chunks_iter:
312
+ # if is_inverted:
313
+ # source_slices = tuple(slice(s - cs.stop, s - cs.start) if inverted else cs for inverted, cs, s in zip(inverted_coords, chunk_slices, shape))
314
+ # data = np.flip(h5_var[source_slices], np.nonzero(inverted_coords)[0])
315
+ # else:
316
+ # data = h5_var[chunk_slices]
317
+ # if not np.all(data == data_var.fillvalue):
318
+ # # data_var.set_chunk(chunk_slices, data, False)
319
+ # data_var.set(chunk_slices, data, False)
320
+
321
+ ds.attrs.update(dict(h5.attrs))
322
+
323
+
324
+ def cfdb_to_netcdf4(cfdb_path: Union[str, pathlib.Path], nc_path: Union[str, pathlib.Path], compression: str='gzip', sel: dict=None, sel_loc: dict=None, include_data_vars: List[str]=None, exclude_data_vars: List[str]=None, **kwargs):
325
+ """
326
+ Simple function to convert a cfdb to a netcdf4. Selection options are also available. The h5netcdf package must be installed to write netcdf4 files.
327
+
328
+ Parameters
329
+ ----------
330
+ cfdb_path: str or pathlib.Path
331
+ The source path of the cfdb to be converted.
332
+ nc_path: str or pathlib.Path
333
+ The target path for the netcdf4 file.
334
+ sel: dict
335
+ Selection by coordinate indexes.
336
+ sel_loc: dict
337
+ Selection by coordinate values.
338
+ max_mem: int
339
+ The max memory in bytes if required when coordinates are in descending order (and must be resorted in ascending order).
340
+ kwargs
341
+ Any kwargs that can be passed to the h5netcdf.File function.
342
+
343
+ Returns
344
+ -------
345
+ None
346
+ """
347
+ if not import_h5netcdf:
348
+ raise ImportError('h5netcdf must be installed to save files to netcdf4.')
349
+
350
+ if (sel is not None) and (sel_loc is not None):
351
+ raise ValueError('Only one of sel or sel_loc can be passed, not both.')
352
+
353
+ with main.open_dataset(cfdb_path) as ds:
354
+ if isinstance(sel, dict):
355
+ ds_view = ds.select(sel)
356
+ elif isinstance(sel_loc, dict):
357
+ ds_view = ds.select_loc(sel_loc)
358
+ else:
359
+ ds_view = ds
360
+
361
+ ds_view.to_netcdf4(nc_path, compression=compression, include_data_vars=include_data_vars, exclude_data_vars=exclude_data_vars, **kwargs)
362
+
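A hedged usage sketch of the two converters defined in this new module (the file paths and the 'precip' variable name are placeholders):

```python
from cfdb import netcdf4_to_cfdb, cfdb_to_netcdf4

# netcdf4 -> cfdb, keeping only the first 24 positions along the time dimension
netcdf4_to_cfdb('input.nc', 'converted.cfdb', sel={'time': slice(0, 24)})

# cfdb -> netcdf4, dropping one data variable
cfdb_to_netcdf4('converted.cfdb', 'roundtrip.nc', exclude_data_vars=['precip'])
```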
cfdb/utils.py CHANGED
@@ -233,6 +233,30 @@ default_attrs = dict(
233
233
  ### Functions
234
234
 
235
235
 
236
+ def filter_var_names(ds, include_data_vars, exclude_data_vars):
237
+ """
238
+
239
+ """
240
+ if include_data_vars is not None:
241
+ if isinstance(include_data_vars, str):
242
+ include_data_vars = [include_data_vars]
243
+ data_var_names = set(include_data_vars)
244
+ elif exclude_data_vars is not None:
245
+ if isinstance(exclude_data_vars, str):
246
+ exclude_data_vars = [exclude_data_vars]
247
+ data_var_names_all = set(ds.data_var_names)
248
+ data_var_names = data_var_names_all.difference(set(exclude_data_vars))
249
+ else:
250
+ data_var_names = set(ds.data_var_names)
251
+
252
+ coord_names = set()
253
+ for data_var_name in data_var_names:
254
+ data_var = ds[data_var_name]
255
+ coord_names.update(data_var.coord_names)
256
+
257
+ return data_var_names, coord_names
258
+
259
+
236
260
  def parse_cf_time_units(dtype_decoded):
237
261
  """
238
262
 
@@ -1959,6 +1983,7 @@ def file_summary(ds):
1959
1983
  dim_name = var.name
1960
1984
  dtype_name = var.dtype_decoded
1961
1985
  dim_len = var.shape[0]
1986
+ # print(var.data)
1962
1987
  first_value = format_value(var.data[0])
1963
1988
  last_value = format_value(var.data[-1])
1964
1989
  spacing = value_indent - name_indent - len(dim_name)
@@ -0,0 +1,204 @@
1
+ Metadata-Version: 2.4
2
+ Name: cfdb
3
+ Version: 0.1.1
4
+ Summary: CF conventions multi-dimensional array storage on top of Booklet
5
+ Project-URL: Documentation, https://mullenkamp.github.io/cfdb/
6
+ Project-URL: Source, https://github.com/mullenkamp/cfdb
7
+ Author-email: mullenkamp <mullenkamp1@gmail.com>
8
+ License-File: LICENSE
9
+ Classifier: Programming Language :: Python :: 3 :: Only
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: booklet>=0.9.2
12
+ Requires-Dist: cftime
13
+ Requires-Dist: lz4
14
+ Requires-Dist: msgspec
15
+ Requires-Dist: numpy
16
+ Requires-Dist: rechunkit>=0.1.0
17
+ Requires-Dist: zstandard
18
+ Provides-Extra: ebooklet
19
+ Requires-Dist: ebooklet>=0.5.10; extra == 'ebooklet'
20
+ Provides-Extra: netcdf4
21
+ Requires-Dist: h5netcdf; extra == 'netcdf4'
22
+ Description-Content-Type: text/markdown
23
+
24
+ # cfdb
25
+
26
+ <p align="center">
27
+ <em>CF conventions multi-dimensional array storage on top of Booklet</em>
28
+ </p>
29
+
30
+ [![build](https://github.com/mullenkamp/cfdb/workflows/Build/badge.svg)](https://github.com/mullenkamp/cfdb/actions)
31
+ [![codecov](https://codecov.io/gh/mullenkamp/cfdb/branch/master/graph/badge.svg)](https://codecov.io/gh/mullenkamp/cfdb)
32
+ [![PyPI version](https://badge.fury.io/py/cfdb.svg)](https://badge.fury.io/py/cfdb)
33
+
34
+ ---
35
+
36
+ **Source Code**: <a href="https://github.com/mullenkamp/cfdb" target="_blank">https://github.com/mullenkamp/cfdb</a>
37
+
38
+ ---
39
+ ## Introduction
40
+ cfdb is a pure python database for managing labeled multi-dimensional arrays that mostly follows the [CF conventions](https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html). It is an alternative to netcdf4 and [xarray](https://docs.xarray.dev/). It builds upon [Booklet](https://github.com/mullenkamp/booklet) for the underlying local file storage and [EBooklet](https://github.com/mullenkamp/ebooklet) to sync and share on any S3 system. It has been designed to follow the programming style of opening a file, iteratively reading data, iteratively writing data, then closing the file.
41
+ It is thread-safe on reads and writes (using thread locks) and multiprocessing-safe (using file locks) including on the S3 remote (using object locking).
42
+
43
+ When an error occurs, cfdb will try to properly close the file and remove the file (object) locks. This will not sync any changes, so the user will lose any changes that were not synced. There may be circumstances in which the file is not properly closed, so care still needs to be taken.
44
+
45
+
46
+ ## Installation
47
+
48
+ Install via pip:
49
+
50
+ ```
51
+ pip install cfdb
52
+ ```
53
+
54
+ I'll probably put it on conda-forge once I feel appropriately motivated...
55
+
56
+ ## Usage
57
+ ### Opening a file/dataset
58
+ Usage starts off by opening the file (and closing the file when done):
59
+ ```python
60
+ import cfdb
61
+ import numpy as np
62
+
63
+ file_path = '/path/to/file.cfdb'
64
+
65
+ ds = cfdb.open_dataset(file_path, flag='n')
66
+ # Do fancy stuff
67
+ ds.close()
68
+ ```
69
+
70
+ By default, files are opened read-only, so we need to specify that we want to write (in this case, 'n' opens the file for writing and replaces any existing file with a new one). There are also some compression options, which are described in the doc strings. Other kwargs from [Booklet](https://github.com/mullenkamp/booklet?tab=readme-ov-file#usage) can be passed to open_dataset.
71
+
72
+ The dataset can also be opened with the context manager like so:
73
+ ```python
74
+ with cfdb.open_dataset(file_path, flag='n') as ds:
75
+ print(ds)
76
+ ```
77
+ This is generally encouraged as this will ensure that the file is closed properly and file locks are removed.
78
+
79
+ ### Variables
80
+ In the [CF conventions](https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html#dimensions), variables are the objects that store data. They can be one-dimensional or multi-dimensional. Dimensions are labeled by 1-D variables (like latitude or time); these 1-D variables are called coordinate variables (or coordinates) and have the same name as their associated dimension. All variables that use these coordinates as their dimension labels are called data variables. The combination of multiple data variables with their coordinates in a single file is called a dataset.
81
+
82
+ #### Coordinates
83
+ Since all data variables must have coordinates, the coordinates must be created before data variables are created.
84
+
85
+ Coordinates in cfdb are closer to the definition in the earlier [COARDS conventions](https://ferret.pmel.noaa.gov/Ferret/documentation/coards-netcdf-conventions) than to the later CF conventions. Coordinate values must be unique, sorted in ascending order (partly a consequence of using np.sort), and cannot have null (or np.nan) values. The CF conventions do not have those limitations, but these limitations are good! Coordinates must also be only 1-D.
86
+
87
+ Coordinates can be created using the generic creation method, or templates can be used for some of the more common dimensions (like latitude, longitude, and time):
88
+ ```python
89
+ lat_data = np.linspace(0, 19.9, 200, dtype='float32')
90
+
91
+ with cfdb.open_dataset(file_path, flag='n') as ds:
92
+ lat_coord = ds.create.coord.latitude(data=lat_data, chunk_shape=(20,))
93
+ print(lat_coord)
94
+ ```
95
+ When creating coordinates, the user can pass a np.ndarray as data and cfdb will figure out the rest (especially when using a creation template). Otherwise, a coordinate can be created without any data input and the data can be appended later:
96
+ ```python
97
+ with cfdb.open_dataset(file_path, flag='n') as ds:
98
+ lat_coord = ds.create.coord.latitude(chunk_shape=(20,))
99
+ lat_coord.append(lat_data)
100
+ print(lat_coord.data)
101
+ ```
102
+ Coordinate data can either be appended or prepended, but keep in mind the limitations described above! And once assigned, coordinate values cannot be changed. At some point, I'll implement the ability to shrink the size of coordinates, but for now they can only be expanded. As seen in the above example, the .data property returns the entire variable data as a single np.ndarray. Coordinates always hold their entire data in memory, while data variables never do. On disk, all data are stored as chunks, whether they belong to coordinates or data variables.
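+
+ A minimal sketch of prepending, assuming a prepend method that mirrors append as described above (the prepended values must keep the coordinate sorted and unique):
+
+ ```python
+ with cfdb.open_dataset(file_path, flag='w') as ds:
+     lat_coord = ds['latitude']
+     lat_coord.prepend(np.linspace(-5, -0.1, 50, dtype='float32'))
+     print(lat_coord.data)
+ ```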
103
+
104
+ Let's add another coordinate for fun:
105
+ ```python
106
+ time_data = np.linspace(0, 199, 200, dtype='datetime64[D]')
107
+
108
+ with cfdb.open_dataset(file_path, flag='w') as ds:
109
+ time_coord = ds.create.coord.time(data=time_data, dtype_decoded=time_data.dtype, dtype_encoded='int32')
110
+ print(time_coord)
111
+ ```
112
+ A time variable works similarly to other numpy dtypes, but you can assign the precision of the datetime object within the brackets (shown as [D] for days). Look at the [numpy datetime reference page](https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units) for all of the frequency codes. Do not use a frequency code finer than "ns". Encoding a datetime64 dtype to an int32 is possible down to the "m" (minute) resolution (with a max year of 6053), but all higher frequency codes should use int64.
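+
+ For example, a second-precision time coordinate should be encoded as int64 (a hedged sketch using the same creation call as above; the file path is a placeholder):
+
+ ```python
+ time_s = np.arange('2020-01-01', '2020-01-02', dtype='datetime64[s]')
+
+ with cfdb.open_dataset('/path/to/file_s.cfdb', flag='n') as ds:
+     time_coord = ds.create.coord.time(data=time_s, dtype_decoded=time_s.dtype, dtype_encoded='int64')
+     print(time_coord)
+ ```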
113
+
114
+ #### Data Variables
115
+ Data variables are created in a similar way as coordinates except that you cannot pass data on creation and you must pass a tuple of the coordinate names to link the coordinates to the data variable:
116
+ ```python
117
+ data_var_data = np.linspace(0, 3999.9, 40000, dtype='float64').reshape(200, 200)
118
+ name = 'data_var'
119
+ coords = ('latitude', 'time')
120
+ dtype_encoded = 'int32'
121
+ scale_factor = 0.1
122
+
123
+ with cfdb.open_dataset(file_path, flag='w') as ds:
124
+ data_var = ds.create.data_var.generic(name, coords, data_var_data.dtype, dtype_encoded, scale_factor=scale_factor)
125
+ data_var[:] = data_var_data
126
+ data_var.attrs['test'] = ['test attributes']
127
+ print(data_var)
128
+ ```
129
+ Since there are no data variable templates (yet), we need to use the generic creation method. If no fillvalue or chunk_shape is passed, then cfdb figures them out for you.
130
+
131
+ Assigning data to data variables is different to coordinates. Data variables can only be expanded via the coordinates themselves. Assignment and selection is performed by the [basic numpy indexing](https://numpy.org/doc/stable/user/basics.indexing.html#basic-indexing), but not the [advanced indexing](https://numpy.org/doc/stable/user/basics.indexing.html#advanced-indexing).
132
+
133
+ The example shown above is the simplest way of assigning data to a data variable, but it's not a preferred method when datasets are very large. The recommended way to write (and read) data is to iterate over the chunks:
134
+
135
+ ```python
136
+ with cfdb.open_dataset(file_path, flag='w') as ds:
137
+ data_var = ds[name]
138
+ for chunk_slices in data_var.iter_chunks(include_data=False):
139
+ data_var[chunk_slices] = data_var_data[chunk_slices]
140
+ ```
141
+
142
+ This is a bit of a contrived example given that data_var_data is a single in-memory numpy array, but in many cases your data source will be much larger or in many pieces. chunk_slices is a tuple of index slices that the data chunk covers; it is the same indexing that can be passed to a numpy ndarray.
143
+
144
+ Reading data uses the same "iter_chunks" method. This ensures that memory usage is kept to a minimum:
145
+
146
+ ```python
147
+ with cfdb.open_dataset(file_path, flag='r') as ds:
148
+ data_var = ds[name]
149
+ for chunk_slices, data in data_var.iter_chunks():
150
+ print(chunk_slices)
151
+ print(data.shape)
152
+ ```
153
+
154
+ There's a groupby method that works similarly to the iter_chunks method except that it requires one or more coordinate names (like pandas or xarray):
155
+
156
+ ```python
157
+ with cfdb.open_dataset(file_path, flag='r') as ds:
158
+ data_var = ds[name]
159
+ for slices, data in data_var.groupby('latitude'):
160
+ print(slices)
161
+ print(data.shape)
162
+ ```
163
+
164
+ #### Rechunking
165
+ All data for variables are stored as chunks of data. For example, the shape of your data may be 2000 x 2000, but the data are stored in 100 x 100 chunks. This is done for a variety of reasons including the ability to compress data. When a variable is created, either the user can define their own chunk shape or cfdb will determine the chunk shape automatically.
166
+
167
+ The chunk shape defined on the variable might be good for some use cases but not others. The user might want a different chunking for a specific operation; for example, the groupby operation in the last example, where the user wanted to iterate over each latitude while keeping the full length of the other coordinates (in this case time). A groupby operation is a common rechunking example, but the user might need chunks in many different shapes.
168
+
169
+ The [rechunkit package](https://github.com/mullenkamp/rechunkit) is used under the hood to rechunk the data in cfdb. It is exposed in cfdb via the "rechunker" method on a variable. The Rechunker class has several methods to help the user decide on a chunk shape.
170
+
171
+ ```python
172
+ new_chunk_shape = (41, 41)
173
+
174
+ with cfdb.open_dataset(file_path) as ds:
175
+ data_var = ds[name]
176
+ rechunker = data_var.rechunker()
177
+ alt_chunk_shape = rechunker.guess_chunk_shape(2**8)
178
+ n_chunks = rechunker.calc_n_chunks()
179
+ print(n_chunks)
180
+ n_reads, n_writes = rechunker.calc_n_reads_rechunker(new_chunk_shape)
181
+ print(n_reads, n_writes)
182
+ rechunk = rechunker.rechunk(new_chunk_shape)
183
+
184
+ for slices, data in rechunk:
185
+ print(slices)
186
+ print(data.shape)
187
+ ```
188
+
189
+ #### Serializers
190
+ Datasets can be serialized to netcdf4 via the to_netcdf4 method; you must have the [h5netcdf package](https://h5netcdf.org/) installed for netcdf4 support. A dataset can also be copied to another cfdb file via the copy method.
191
+
192
+ ```python
193
+ with cfdb.open_dataset(file_path) as ds:
194
+ new_ds = ds.copy(new_file_path)
195
+ print(new_ds)
196
+ new_ds.close()
197
+ ds.to_netcdf4(nc_file_path)
198
+ ```
199
+
200
+
201
+
202
+ ## License
203
+
204
+ This project is licensed under the terms of the Apache Software License 2.0.
@@ -0,0 +1,14 @@
1
+ cfdb/__init__.py,sha256=jkHqBmh0aBkjWX3demwH4eh-P9YypPEnFH5ztXXInnc,289
2
+ cfdb/combine.py,sha256=B1CHZ0NOW4O5j_5NYxAHB76X1A5O3HcZwjNGNx_gfEA,19084
3
+ cfdb/core.py,sha256=IMFGhed5pa2zoYlm7reu1TeCQ6nt3sMmy5cE0LcAb2A,37337
4
+ cfdb/creation.py,sha256=hoR0MVEhbcxKT1JnZ2rK1fUAofxOQT0okKmLYh0PBAY,10686
5
+ cfdb/data_models.py,sha256=AtwtH2Uyo84GucW52aX0AzpG3Sbge41F5lrPuRxSLoY,2166
6
+ cfdb/indexers.py,sha256=BvkQLpdm2EM64ZbSjW9ByXfeUoBZ1V-YKNVVvtAy1HY,10462
7
+ cfdb/main.py,sha256=3HoJr8ZZFD3KIPfSUrQTXdJ9xo9I1vcjfQUWvEbmkv8,26020
8
+ cfdb/support_classes.py,sha256=di0pnspL4O4YL5eKJnGhIOFWdk7D3WWH2ltPziqORtM,36456
9
+ cfdb/tools.py,sha256=1hE8Qja-JdFpi_XTGSBuANRujELd2s4uYbSUCAl3Big,13725
10
+ cfdb/utils.py,sha256=sm7oeCxyrtByRlxc8NV52kBMehHwRJMOIhwpeiAmCYY,74114
11
+ cfdb-0.1.1.dist-info/METADATA,sha256=80jGlWL4ONgx8jZ88MV9YsqgbL_EW504flAG2kRkSsg,11513
12
+ cfdb-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
13
+ cfdb-0.1.1.dist-info/licenses/LICENSE,sha256=hNqpp2O-F2qp4ozzNN86q1sxnAeFDLNoylHyJK_aiYI,586
14
+ cfdb-0.1.1.dist-info/RECORD,,
@@ -1,57 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: cfdb
3
- Version: 0.1.0
4
- Summary: CF conventions multi-dimensional array storage on top of Booklet
5
- Project-URL: Documentation, https://mullenkamp.github.io/cfdb/
6
- Project-URL: Source, https://github.com/mullenkamp/cfdb
7
- Author-email: mullenkamp <mullenkamp1@gmail.com>
8
- License-File: LICENSE
9
- Classifier: Programming Language :: Python :: 3 :: Only
10
- Requires-Python: >=3.10
11
- Requires-Dist: booklet>=0.9.2
12
- Requires-Dist: cftime
13
- Requires-Dist: lz4
14
- Requires-Dist: msgspec
15
- Requires-Dist: numpy
16
- Requires-Dist: rechunkit>=0.1.0
17
- Requires-Dist: zstandard
18
- Provides-Extra: ebooklet
19
- Requires-Dist: ebooklet>=0.5.10; extra == 'ebooklet'
20
- Provides-Extra: netcdf4
21
- Requires-Dist: h5netcdf; extra == 'netcdf4'
22
- Description-Content-Type: text/markdown
23
-
24
- # cfdb
25
-
26
- <p align="center">
27
- <em>CF conventions multi-dimensional array storage on top of Booklet</em>
28
- </p>
29
-
30
- [![build](https://github.com/mullenkamp/cfdb/workflows/Build/badge.svg)](https://github.com/mullenkamp/cfdb/actions)
31
- [![codecov](https://codecov.io/gh/mullenkamp/cfdb/branch/master/graph/badge.svg)](https://codecov.io/gh/mullenkamp/cfdb)
32
- [![PyPI version](https://badge.fury.io/py/cfdb.svg)](https://badge.fury.io/py/cfdb)
33
-
34
- ---
35
-
36
- **Documentation**: <a href="https://mullenkamp.github.io/cfdb/" target="_blank">https://mullenkamp.github.io/cfdb/</a>
37
-
38
- **Source Code**: <a href="https://github.com/mullenkamp/cfdb" target="_blank">https://github.com/mullenkamp/cfbdb</a>
39
-
40
- ---
41
-
42
- ## Development
43
-
44
- ### Coordinate variables
45
- Must be 1D.
46
- They should have an "ordered" parameter (bool) that defined whether the coord should always be ordered. Int, float, and datetime should default to True. Only string and category dtypes should default to False.
47
- There should be a "regular" parameter (bool) with an associated "step" parameter (int or float). It should work similarly to np.arange. Only ints, floats, and datetimes can use this.
48
- ~~Should I add a "unique" parameter (bool)? Maybe I should just enforce this normally?~~ It should enforce uniqueness in the coords.
49
- There can be a groupby method datasets that would use the rechunker. The rechunker would have the groupby dims set to 1 and the other dims set to the full length.
50
-
51
- #### Multi-dimensional coords
52
- It is possible to create a composite index from multiple 1D coords. But it seems best to implement this type of thing on top of sqlite (or something equivalent).
53
- Keeping each coord 1D makes implementations quite a bit simpler.
54
-
55
- ## License
56
-
57
- This project is licensed under the terms of the Apache Software License 2.0.
@@ -1,13 +0,0 @@
1
- cfdb/__init__.py,sha256=r2CzHI87AZOW0HsVhl0HpN0-Mjh34eB9WG2sCUK4kiA,233
2
- cfdb/combine.py,sha256=B1CHZ0NOW4O5j_5NYxAHB76X1A5O3HcZwjNGNx_gfEA,19084
3
- cfdb/core.py,sha256=IMFGhed5pa2zoYlm7reu1TeCQ6nt3sMmy5cE0LcAb2A,37337
4
- cfdb/creation.py,sha256=hoR0MVEhbcxKT1JnZ2rK1fUAofxOQT0okKmLYh0PBAY,10686
5
- cfdb/data_models.py,sha256=AtwtH2Uyo84GucW52aX0AzpG3Sbge41F5lrPuRxSLoY,2166
6
- cfdb/indexers.py,sha256=Vl0PS44mV4_6IUvPGZIIsd0qQniM3iAtntwe8bhqDrk,10683
7
- cfdb/main.py,sha256=L23zO_glrsOg8e5Vx2Guef3UOKNOw9KFW0Ray0uGqrQ,26372
8
- cfdb/support_classes.py,sha256=qoSVC7eX8I_A8xHA8jLnjLD9211bc3Va9HXvo_uct0A,34806
9
- cfdb/utils.py,sha256=ZEMmvUTa1h-FBCdfBx-oL5xVH7fDbXAObwqYjqeKQGk,73296
10
- cfdb-0.1.0.dist-info/METADATA,sha256=n8_UtgGhkHZgC3MxxaNcFfr2-682Fxc3RP1FDQ43fik,2528
11
- cfdb-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
12
- cfdb-0.1.0.dist-info/licenses/LICENSE,sha256=hNqpp2O-F2qp4ozzNN86q1sxnAeFDLNoylHyJK_aiYI,586
13
- cfdb-0.1.0.dist-info/RECORD,,
File without changes