nuthatch 0.1.0__py3-none-any.whl



@@ -0,0 +1,199 @@
+ from nuthatch.backend import DatabaseBackend, FileBackend, register_backend
+ import shutil
+ from pathlib import Path
+ import terracotta as tc
+ import sqlalchemy
+ import xarray as xr
+ import rioxarray  # noqa: F401  registers the .rio accessor used below
+ import numpy as np
+ from rasterio.io import MemoryFile
+ from rasterio.enums import Resampling
+
+ def base360_to_base180(lons):
+     """Converts a list of longitudes from base 360 to base 180.
+
+     Args:
+         lons (list, float): A list of longitudes, or a single longitude
+     """
+     if not isinstance(lons, np.ndarray) and not isinstance(lons, list):
+         lons = [lons]
+     val = [x - 360.0 if x >= 180.0 else x for x in lons]
+     if len(val) == 1:
+         return val[0]
+     return np.array(val)
+
+
+ def base180_to_base360(lons):
+     """Converts a list of longitudes from base 180 to base 360.
+
+     Args:
+         lons (list, float): A list of longitudes, or a single longitude
+     """
+     if not isinstance(lons, np.ndarray) and not isinstance(lons, list):
+         lons = [lons]
+     val = [x + 360.0 if x < 0.0 else x for x in lons]
+     if len(val) == 1:
+         return val[0]
+     return np.array(val)
+
+
+
+ def is_wrapped(lons):
+     """Check if the longitudes are wrapped.
+
+     Works for both base180 and base360 longitudes. Requires that
+     longitudes are in increasing order, outside of a wrap point.
+     """
+     wraps = (np.diff(lons) < 0.0).sum()
+     if wraps > 1:
+         raise ValueError("Only one wrapping discontinuity allowed.")
+     elif wraps == 1:
+         return True
+     return False
+
+
+ def lon_base_change(ds, to_base="base180", lon_dim='lon'):
+     """Change the base of the dataset from base 360 to base 180 or vice versa.
+
+     Args:
+         ds (xr.Dataset): Dataset to change.
+         to_base (str): The base to change to. One of:
+             - base180
+             - base360
+         lon_dim (str): The longitude column name.
+     """
+     if to_base == "base180":
+         if (ds[lon_dim] < 0.0).any():
+             print("Longitude already in base 180 format.")
+             return ds
+         lons = base360_to_base180(ds[lon_dim].values)
+     elif to_base == "base360":
+         if (ds[lon_dim] > 180.0).any():
+             print("Longitude already in base 360 format.")
+             return ds
+         lons = base180_to_base360(ds[lon_dim].values)
+     else:
+         raise ValueError(f"Invalid base {to_base}.")
+
+     # Check if original data is wrapped
+     wrapped = is_wrapped(ds[lon_dim].values)
+
+     # Then assign new coordinates
+     ds = ds.assign_coords({lon_dim: lons})
+
+     # Sort the lons after conversion, unless the slice
+     # you're considering wraps around the meridian
+     # in the resultant base.
+     if not wrapped:
+         ds = ds.sortby(lon_dim)
+     return ds
+
+
+ @register_backend
+ class TerracottaBackend(DatabaseBackend, FileBackend):
+     """
+     Terracotta backend for caching geospatial data in a terracotta database.
+
+     This backend supports xarray datasets.
+     """
+
+     backend_name = 'terracotta'
+     config_parameters = DatabaseBackend.config_parameters + FileBackend.config_parameters + ['override_path']
+
+     def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
+         # Explicitly initialize both parent backends
+         DatabaseBackend.__init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs)
+         FileBackend.__init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs, 'tif')
+
+         tc.update_settings(SQL_USER=self.config['write_username'], SQL_PASSWORD=self.config['write_password'])
+         self.driver = tc.get_driver(self.write_uri)
+
+         try:
+             self.driver.get_keys()
+         except sqlalchemy.exc.DatabaseError:
+             # Create a metastore
+             print("Creating new terracotta metastore")
+             self.driver.create(['key'])
+
+         if 'override_path' in backend_kwargs:
+             base_path = Path(backend_kwargs['override_path'])
+
+             if namespace:
+                 self.raw_override_path = base_path.joinpath(namespace, cache_key)
+             else:
+                 self.raw_override_path = base_path.joinpath(cache_key)
+
+             self.override_path = str(self.raw_override_path) + '.tif'
+
+     def write(self, ds, upsert=False, primary_keys=None):
+
+         if not isinstance(ds, xr.Dataset):
+             raise NotImplementedError("Terracotta backend only supports xarray datasets")
+
+         # Check to make sure this is geospatial data
+         lats = ['lat', 'y', 'latitude']
+         lons = ['lon', 'x', 'longitude']
+         if len(ds.dims) != 2:
+             if len(ds.dims) != 3 or 'time' not in ds.dims:
+                 raise RuntimeError("Can only store two dimensional (or three dimensional with time) geospatial data to terracotta")
+
+         foundx = False
+         foundy = False
+         for y in lats:
+             if y in ds.dims:
+                 ds = ds.rename({y: 'y'})
+                 foundy = True
+         for x in lons:
+             if x in ds.dims:
+                 ds = ds.rename({x: 'x'})
+                 foundx = True
+
+         if not foundx or not foundy:
+             raise RuntimeError("Can only store two or three dimensional (with time) geospatial data to terracotta")
+
+         # Adjust coordinates
+         if (ds['x'] > 180.0).any():
+             ds = lon_base_change(ds, lon_dim='x')
+             ds = ds.sortby(['x'])
+
+         # Adapt the CRS
+         ds.rio.write_crs("epsg:4326", inplace=True)
+         ds = ds.rio.reproject('EPSG:3857', resampling=Resampling.nearest, nodata=np.nan)
+         ds.rio.write_crs("epsg:3857", inplace=True)
+
+         # Insert the parameters.
+         with self.driver.connect():
+             if 'time' in ds.dims:
+                 for t in ds.time:
+                     # Select just this time and squeeze the dimension
+                     sub_ds = ds.sel(time=t)
+                     sub_ds = sub_ds.reset_coords('time', drop=True)
+
+                     # add the time to the cache_key
+                     sub_cache_key = self.cache_key + '_' + str(t.values)
+                     sub_path = str(self.raw_cache_path) + '_' + str(t.values) + '.tif'
+                     sub_override_path = str(self.raw_override_path) + '_' + str(t.values) + '.tif'
+
+                     self.write_individual_raster(self.driver, sub_ds, sub_path, sub_cache_key, sub_override_path)
+             else:
+                 self.write_individual_raster(self.driver, ds, self.path, self.cache_key, self.override_path)
+
+
+     def write_individual_raster(self, driver, ds, path, cache_key, override_path):
+         # Write the raster to an in-memory COG, copy it to the cache filesystem, and register it in terracotta
+         with MemoryFile() as mem_dst:
+             ds.rio.to_raster(mem_dst.name, driver="COG")
+
+             with self.fs.open(path, 'wb') as f_out:
+                 shutil.copyfileobj(mem_dst, f_out)
+
+             driver.insert({'key': cache_key.replace('/', '_')}, mem_dst,
+                           override_path=override_path, skip_metadata=False)
+
+             print(f"Inserted {cache_key.replace('/', '_')} into the terracotta database.")
+
+     def read(self, engine):
+         raise NotImplementedError("Cannot read from the terracotta backend.")
+
+     def delete(self):
+         raise NotImplementedError("Cannot delete from the terracotta backend.")
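The longitude helpers at the top of this file are self-contained, so a short usage sketch may help. The import path below is hypothetical (the wheel's file layout is not shown in this diff), and the coordinate values are purely illustrative:

import numpy as np
import xarray as xr
from nuthatch.terracotta import base360_to_base180, base180_to_base360, lon_base_change  # hypothetical path

# Toy base-360 dataset: longitudes 0..330 in 30-degree steps.
ds = xr.Dataset(
    {"t2m": (("lat", "lon"), np.zeros((3, 12)))},
    coords={"lat": [-10.0, 0.0, 10.0], "lon": np.arange(0.0, 360.0, 30.0)},
)

print(base360_to_base180(350.0))          # -10.0 (a single value comes back as a scalar)
print(base180_to_base360([-10.0, 20.0]))  # [350.  20.]

ds180 = lon_base_change(ds, to_base="base180")
print(ds180.lon.values)  # sorted into [-180, 180), since this toy slice is not wrapped

TerracottaBackend.write() builds on the same helpers: it renames the spatial dimensions to x/y, shifts any base-360 longitudes, reprojects to EPSG:3857, and then inserts one COG per time step into the terracotta metastore.
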
@@ -0,0 +1,207 @@
+ from nuthatch.backend import FileBackend, register_backend
+ import xarray as xr
+ import numpy as np
+
+ CHUNK_SIZE_UPPER_LIMIT_MB = 300
+ CHUNK_SIZE_LOWER_LIMIT_MB = 30
+
+
+ def get_chunk_size(ds, size_in='MB'):
+     """Get the approximate chunk size of a dataset and its per-dimension chunk lengths.
+
+     Args:
+         ds (xr.Dataset): The dataset to get the chunk size of.
+         size_in (str): The unit to return the chunk size in. One of:
+             'KB', 'MB', 'GB', 'TB' for kilo, mega, giga, and terabytes respectively.
+     """
+     chunk_groups = [(dim, np.median(chunks)) for dim, chunks in ds.chunks.items()]
+     div = {'KB': 10**3, 'MB': 10**6, 'GB': 10**9, 'TB': 10**12}[size_in]
+     chunk_sizes = [x[1] for x in chunk_groups]
+     return np.prod(chunk_sizes) * 4 / div, chunk_groups  # assumes 4-byte (float32) values
+
+
+ def merge_chunk_by_arg(chunking, chunk_by_arg, kwargs):
+     """Merge chunking and chunking modifiers into a single chunking dict.
+
+     Args:
+         chunking (dict): The chunking to merge.
+         chunk_by_arg (dict): The chunking modifiers to merge.
+         kwargs (dict): The kwargs to check for chunking modifiers.
+     """
+     if chunk_by_arg is None:
+         return chunking
+
+     for k in chunk_by_arg:
+         if k not in kwargs:
+             raise ValueError(f"Chunking modifier {k} not found in kwargs.")
+
+         if kwargs[k] in chunk_by_arg[k]:
+             # If argument value in chunk_by_arg then merge the chunking
+             chunk_dict = chunk_by_arg[k][kwargs[k]]
+             chunking.update(chunk_dict)
+
+     return chunking
+
+
+ # TODO: Why was this the way it was in sheerwater?
+ def prune_chunking_dimensions(ds, chunking):
+     """Prune the chunking dimensions to only those that exist in the dataset.
+
+     Args:
+         ds (xr.Dataset): The dataset to check for chunking dimensions.
+         chunking (dict): The chunking dimensions to prune.
+     """
+     # Drop any dimensions that don't exist in the dataset
+     for dim in list(chunking):  # iterate over a copy so keys can be deleted safely
+         if dim not in ds.dims:
+             del chunking[dim]
+
+     return chunking
+
+
+ def chunking_compare(ds, chunking):
+     """Compare the chunking of a dataset to a specified chunking.
+
+     Args:
+         ds (xr.Dataset): The dataset to check the chunking of.
+         chunking (dict): The chunking to compare to.
+     """
+     # Get the chunks for the dataset
+     ds_chunks = {dim: ds.chunks[dim][0] for dim in ds.chunks}
+     chunking = prune_chunking_dimensions(ds, chunking)
+     return ds_chunks == chunking
+
+
+ def drop_encoded_chunks(ds):
+     """Drop the encoded chunks from a dataset."""
+     for var in ds.data_vars:
+         if 'chunks' in ds[var].encoding:
+             del ds[var].encoding['chunks']
+         if 'preferred_chunks' in ds[var].encoding:
+             del ds[var].encoding['preferred_chunks']
+
+     for coord in ds.coords:
+         if 'chunks' in ds[coord].encoding:
+             del ds[coord].encoding['chunks']
+         if 'preferred_chunks' in ds[coord].encoding:
+             del ds[coord].encoding['preferred_chunks']
+
+     return ds
+
+
+ @register_backend
+ class ZarrBackend(FileBackend):
+     """
+     Zarr backend for caching data in a zarr store.
+
+     This backend supports xarray datasets.
+
+     Possible backend_kwargs:
+         chunking (dict): Chunk sizes to apply for each coordinate that exists in the
+             dataset. Chunking for coordinates that do not exist is dropped.
+         chunk_by_arg (dict): Chunking modifiers keyed by cached argument,
+             e.g. grid resolution. For example:
+             chunk_by_arg={
+                 'grid': {
+                     'global0_25': {"lat": 721, "lon": 1440, 'time': 30},
+                     'global1_5': {"lat": 121, "lon": 240, 'time': 1000},
+                 }
+             }
+             will modify the chunking dict values for lat, lon, and time, depending
+             on the value of the 'grid' argument. If multiple cache arguments specify
+             modifiers for the same chunking dimension, the last one specified will prevail.
+         auto_rechunk (bool): If True, aggressively rechunk the cache on load.
+     """
+
+     backend_name = 'zarr'
+     default_for_type = xr.Dataset
+
+     def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
+         super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs, 'zarr')
+
+         if backend_kwargs and 'chunking' in backend_kwargs and 'chunk_by_arg' in backend_kwargs:
+             self.chunking = merge_chunk_by_arg(backend_kwargs['chunking'], backend_kwargs['chunk_by_arg'], args)
+         elif backend_kwargs and 'chunking' in backend_kwargs:
+             self.chunking = backend_kwargs['chunking']
+         else:
+             self.chunking = 'auto'
+
+         if backend_kwargs and 'auto_rechunk' in backend_kwargs and backend_kwargs['auto_rechunk']:
+             self.auto_rechunk = True
+         else:
+             self.auto_rechunk = False
+
+     def write(self, data, upsert=False, primary_keys=None):
+         if upsert:
+             raise NotImplementedError("Zarr backend does not support upsert.")
+
+         if isinstance(data, xr.Dataset):
+             self.chunk_to_zarr(data, self.path)
+         else:
+             raise NotImplementedError("Zarr backend only supports caching of xarray datasets.")
+
+     def read(self, engine):
+         if engine == 'xarray' or engine == xr.Dataset or engine is None:
+             # Open with chunks={} so xarray uses the underlying zarr chunking where possible.
+             # Setting chunks=True triggers what appears to be an xarray/zarr engine bug where
+             # every chunk is only 4B!
+             if self.auto_rechunk:
+                 # If auto_rechunk is set, check whether the cached chunking
+                 # matches the requested chunking. If not, rechunk.
+                 ds_remote = xr.open_dataset(self.path, engine='zarr', chunks={}, decode_timedelta=True)
+                 if not isinstance(self.chunking, dict):
+                     raise ValueError("If auto_rechunk is True, a chunking dict must be supplied.")
+
+                 # Compare the cached chunking to the requested chunking
+                 if not chunking_compare(ds_remote, self.chunking):
+                     print("Rechunk was passed and cached chunks do not match rechunk request. "
+                           "Performing rechunking.")
+
+                     # Write to a temp cache map.
+                     # Writing to a temp cache is necessary because overwriting
+                     # the original cache map would start writing it before the
+                     # data has been read, leading to corruption.
+                     self.chunk_to_zarr(ds_remote, self.temp_path)
+
+                     # Remove the old cache and verify files
+                     if self.fs.exists(self.path):
+                         self.fs.rm(self.path, recursive=True)
+
+                     self.fs.mv(self.temp_path, self.path, recursive=True)
+
+                     # Reopen the dataset - will use the appropriate global or local cache
+                     return xr.open_dataset(self.path, engine='zarr',
+                                            chunks={}, decode_timedelta=True)
+                 else:
+                     # Cached chunks already match the rechunk request.
+                     return xr.open_dataset(self.path, engine='zarr',
+                                            chunks={}, decode_timedelta=True)
+             else:
+                 return xr.open_dataset(self.path, engine='zarr', chunks={}, decode_timedelta=True)
+         else:
+             raise NotImplementedError(f"Zarr backend does not support reading zarrs to {engine} engine")
+
+
+     def chunk_to_zarr(self, ds, path):
+         """Write a dataset to a zarr cache map and check the chunking."""
+         ds = drop_encoded_chunks(ds)
+
+         chunking = self.chunking
+         if isinstance(self.chunking, dict):
+             # No need to prune if chunking is None or 'auto'
+             chunking = prune_chunking_dimensions(ds, self.chunking)
+
+         ds = ds.chunk(chunks=chunking)
+
+         try:
+             chunk_size, chunk_with_labels = get_chunk_size(ds)
+
+             if chunk_size > CHUNK_SIZE_UPPER_LIMIT_MB or chunk_size < CHUNK_SIZE_LOWER_LIMIT_MB:
+                 print(f"WARNING: Chunk size is {chunk_size}MB. Target approx 100MB.")
+                 print(chunk_with_labels)
+         except ValueError:
+             print("Failed to get chunk size! Continuing with unknown chunking...")
+
+         ds.to_zarr(store=path, mode='w')
+
+
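The chunking helpers above compose in a simple way; a minimal sketch follows (the import path is hypothetical and the grid names mirror the docstring example):

import numpy as np
import xarray as xr
from nuthatch.zarr import merge_chunk_by_arg, chunking_compare, get_chunk_size  # hypothetical path

chunking = {"lat": 121, "lon": 240, "time": 1000}
chunk_by_arg = {"grid": {"global0_25": {"lat": 721, "lon": 1440, "time": 30}}}

# The value of the 'grid' argument selects a modifier set that overrides the base chunking.
merged = merge_chunk_by_arg(chunking, chunk_by_arg, {"grid": "global0_25"})
print(merged)  # {'lat': 721, 'lon': 1440, 'time': 30}

# chunking_compare prunes dimensions the dataset lacks, then compares chunk sizes.
ds = xr.Dataset({"v": (("lat", "lon"), np.zeros((10, 20)))}).chunk({"lat": 5, "lon": 10})
print(chunking_compare(ds, {"lat": 5, "lon": 10, "time": 30}))  # True: 'time' is pruned

size_mb, per_dim = get_chunk_size(ds)
print(size_mb, per_dim)  # ~0.0002 MB (the helper assumes 4-byte values per element)

Note that merge_chunk_by_arg and prune_chunking_dimensions modify the chunking dict they are given in place rather than returning a copy, which is worth keeping in mind if the same dict is reused across ZarrBackend instances.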