nuthatch 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nuthatch might be problematic. Click here for more details.
- nuthatch/__init__.py +14 -0
- nuthatch/backend.py +301 -0
- nuthatch/backends/__init__.py +8 -0
- nuthatch/backends/basic.py +28 -0
- nuthatch/backends/delta.py +46 -0
- nuthatch/backends/parquet.py +130 -0
- nuthatch/backends/sql.py +147 -0
- nuthatch/backends/terracotta.py +199 -0
- nuthatch/backends/zarr.py +207 -0
- nuthatch/cache.py +529 -0
- nuthatch/cli.py +174 -0
- nuthatch/config.py +94 -0
- nuthatch/memoizer.py +67 -0
- nuthatch/nuthatch.py +498 -0
- nuthatch/processor.py +89 -0
- nuthatch/processors/__init__.py +6 -0
- nuthatch/processors/timeseries.py +157 -0
- nuthatch-0.1.0.dist-info/METADATA +38 -0
- nuthatch-0.1.0.dist-info/RECORD +21 -0
- nuthatch-0.1.0.dist-info/WHEEL +4 -0
- nuthatch-0.1.0.dist-info/entry_points.txt +2 -0
nuthatch/cli.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Command-line interface for nuthatch.
|
|
4
|
+
|
|
5
|
+
This module provides a CLI for interacting with nuthatch caching functionality,
|
|
6
|
+
including cache management, backend operations, and configuration.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import importlib
|
|
10
|
+
import click
|
|
11
|
+
import shutil
|
|
12
|
+
from .config import get_config
|
|
13
|
+
from .backend import get_backend_by_name, registered_backends
|
|
14
|
+
from .cache import Cache
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@click.group()
@click.version_option(version="0.1.0", prog_name="nuthatch")
def cli():
    """Nuthatch - Intelligent caching system for data science workflows.

    This CLI provides tools for managing cache entries, inspecting backends,
    and configuring the nuthatch system.
    """
    # NOTE: the docstring above doubles as the --help text printed by click.

    # If the root configuration names a 'dynamic_config_path', import that
    # module before any sub-command runs. The warning text below suggests the
    # module is where dynamic (secret) parameters get registered.
    config = get_config(location='root', requested_parameters=['dynamic_config_path'], backend_name=None)
    if 'dynamic_config_path' in config:
        try:
            importlib.import_module(config['dynamic_config_path'])
        except Exception as e:
            # Deliberately best-effort: a broken dynamic config module must not
            # make the whole CLI unusable, so report the failure and carry on.
            click.echo(f"WARNING: Failed to import {config['dynamic_config_path']} with '{e}'. You may be missing dynamic secret resolution.")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@cli.command('import')
@click.argument('cache_key')
@click.option('--namespace', help='Namespace for the cache')
@click.option('--backend', help='Backend to use', required=True)
@click.option('--location', help='Location to search', default='root')
def import_data(cache_key, namespace, backend, location):
    """Import data from a glob pattern."""

    # Build a backend instance for the requested backend name; the backend's
    # path is what gets globbed to discover the cache keys to import.
    backend_name = backend
    backend_cls = get_backend_by_name(backend)
    backend_config = get_config(location=location, requested_parameters=backend_cls.config_parameters, backend_name=backend_cls.backend_name)
    store = backend_cls(backend_config, cache_key, namespace, None, {})

    # Only backends that expose a filesystem handle can be globbed.
    cache_keys = []
    if getattr(store, 'fs', None) is not None:
        paths = store.fs.glob(store.path)
        cache_keys = [store.get_cache_key(path) for path in paths]

    if not cache_keys:
        print("No caches found for import.")
    else:
        click.confirm(f"Are you sure you want to import {len(paths)} cache entries?", abort=True)

    importing_null = backend_name == 'null'
    for key in cache_keys:
        print(f"Importing {key}.")
        cache_config = get_config(location=location, requested_parameters=Cache.config_parameters)

        if importing_null:
            # Null imports never overwrite an existing entry of either kind.
            cache = Cache(cache_config, key, namespace, None, location, None, {})
            if cache.is_null():
                print(f"{key} already in cache as null!")
            elif cache.exists():
                print(f"Cache {key} already exists and is valid. Skipping entry. Delete this cache key if you would like to reimport it as null.")
            else:
                cache.set_null()
                print(f"Set {key} successfully to null.")
            continue

        # Regular import: write metadata only when the key is not known yet.
        cache = Cache(cache_config, key, namespace, None, location, backend_name, {})
        if cache.exists():
            print(f"{key} already in cache!")
        else:
            cache._commit_metadata()
            print(f"Imported {key} successfully.")
|
|
82
|
+
|
|
83
|
+
def list_helper(cache_key, namespace, backend, location):
    """Return the cache entries matching ``cache_key`` at ``location``.

    Args:
        cache_key (str or None): Pattern handed to ``Cache.list``; ``None``
            is replaced by ``'*'``.
        namespace (str or None): Namespace passed to the ``Cache``.
        backend (str or None): Backend name passed to the ``Cache``.
        location (str): Location used for the config lookup and the ``Cache``.

    Returns:
        Whatever ``Cache.list`` returns for the pattern.
    """
    settings = get_config(location=location, requested_parameters=Cache.config_parameters)
    lister = Cache(settings, None, namespace, None, location, backend, {})

    pattern = '*' if cache_key is None else cache_key
    return lister.list(pattern)
|
|
94
|
+
|
|
95
|
+
@cli.command('list')
@click.argument('cache_key', required=False)
@click.option('--namespace', help='Namespace for the cache')
@click.option('--backend', help='Backend filter')
@click.option('--location', help='Location to search', default='root')
@click.option('--long', '-l', is_flag=True, help='List all information about the cache')
def list_caches(cache_key, namespace, backend, location, long):
    """List cache entries matching CACHE_KEY (all entries when omitted).

    Args:
        cache_key (str or None): Pattern of cache keys to list.
        namespace (str or None): Namespace for the cache.
        backend (str or None): Backend filter.
        location (str): Location to search.
        long (bool): When set, print a table with all metadata columns
            instead of just the cache keys.
    """
    caches = list_helper(cache_key, namespace, backend, location)

    # One entry is printed per line, so page only when the listing is taller
    # than the terminal. get_terminal_size() is (columns, lines); the count
    # has to be compared with the number of lines, not the width.
    pager = len(caches) > shutil.get_terminal_size().lines

    if not long:
        output = '\n'.join(cache['cache_key'] for cache in caches)
    elif len(caches) == 0:
        # An empty DataFrame has no 'last_modified' column to convert, so
        # there is nothing to tabulate.
        output = ''
    else:
        table = pd.DataFrame(caches)
        # 'last_modified' is converted from microseconds and truncated to seconds.
        table['last_modified'] = pd.to_datetime(table['last_modified'], unit='us').dt.floor('s')
        table = table[['cache_key', 'namespace', 'backend', 'state', 'last_modified', 'user', 'commit_hash', 'path']]
        output = table.to_string()

    if pager:
        click.echo_via_pager(output)
    else:
        click.echo(output)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _entry_field(entry, name):
    """Read ``name`` from a listed cache entry, by attribute or by key."""
    # list_caches reads listed entries with entry['cache_key'] while this
    # command historically used entry.cache_key; accept both shapes.
    try:
        return getattr(entry, name)
    except AttributeError:
        return entry[name]


@cli.command('delete')
@click.argument('cache_key')
@click.option('--namespace', help='Namespace for the cache')
@click.option('--backend', help='Backend to use')
@click.option('--location', help='Location to search', default='root')
@click.option('--force', '-f', is_flag=True, help='Force deletion without confirmation')
@click.option('--metadata-only', '-m', is_flag=True, help='Only delete the metadata for the cache, not the underlying data.')
def delete_cache(cache_key, namespace, backend, location, force, metadata_only):
    """Clear cache entries."""
    # Args:
    #   cache_key: pattern of cache keys to delete.
    #   namespace / backend / location: filters forwarded to list_helper.
    #   force: skip the interactive confirmation.
    #   metadata_only: remove only the metadata record, keep the stored data.
    # (Kept as comments because the docstring is the user-facing help text.)
    caches = list_helper(cache_key, namespace, backend, location)
    config = get_config(location=location, requested_parameters=Cache.config_parameters)

    # --force exists precisely to skip this prompt; without the check the
    # flag had no effect and non-interactive use always aborted.
    if not force:
        click.confirm(f"Are you sure you want to delete {len(caches)} cache entries?", abort=True)

    for entry in caches:
        cache = Cache(
            config,
            _entry_field(entry, 'cache_key'),
            _entry_field(entry, 'namespace'),
            None,
            location,
            _entry_field(entry, 'backend'),
            {},
        )
        click.echo(f"Deleting {cache.cache_key} from {cache.location} with backend {cache.backend_name}.")
        if metadata_only:
            cache._delete_metadata()
        else:
            cache.delete()
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
@cli.command('print-config')
@click.option('--location', help='Location to search', default='root')
@click.option('--backend', help='Backend to use')
@click.option('--show-secrets', '-s', is_flag=True, help='Print secret values in plain text instead of masking them.')
def get_config_value(location, backend, show_secrets):
    """Print the resolved configuration for a location.

    With --backend only that backend's configuration is printed; otherwise
    the cache configuration and every registered backend are printed.
    """
    # Args:
    #   location: location whose configuration is resolved.
    #   backend: optional backend name restricting the output.
    #   show_secrets: when False, get_config is asked to mask secret values.
    if backend:
        backend_classes = [get_backend_by_name(backend)]
    else:
        backend_classes = [Cache] + list(registered_backends.values())

    for backend_class in backend_classes:
        config = get_config(location=location, requested_parameters=backend_class.config_parameters,
                            backend_name=backend_class.backend_name, mask_secrets=not show_secrets)

        # One titled section per class, one tab-indented line per parameter.
        click.echo(backend_class.backend_name.title())
        for key, value in config.items():
            click.echo(f"\t{key}: {value}")
        click.echo()
|
|
168
|
+
|
|
169
|
+
def main():
    """Run the nuthatch click command group and return its result."""
    result = cli()
    return result


# Allow the module to be executed directly as a script.
if __name__ == '__main__':
    main()
|
nuthatch/config.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""The config module is used to get the config for a given location and backend."""
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import os
|
|
4
|
+
import tomllib
|
|
5
|
+
|
|
6
|
+
# Registry of dynamically computed parameters, filled by @config_parameter:
#   dynamic_parameters[location][parameter_name]          -> (function, secret)
#   dynamic_parameters[location][backend][parameter_name] -> (function, secret)
dynamic_parameters = {}


def config_parameter(parameter_name, location='root', backend=None, secret=False):
    """A decorator to register a function as a dynamic parameter.

    The decorated function is stored in ``dynamic_parameters`` and is called
    by ``get_config`` whenever ``parameter_name`` is requested for the given
    location (and backend, if one is given).

    Args:
        parameter_name (str): The name of the parameter.
        location (str, optional): The location to register the parameter.
        backend (str, optional): The backend to register the parameter.
        secret (bool, optional): Whether the value is a secret; stored next
            to the function so ``get_config`` can mask it on request.

    Returns:
        A decorator that registers the function and returns it unchanged.
    """
    def decorator(function):
        location_registry = dynamic_parameters.setdefault(location, {})
        if backend:
            location_registry.setdefault(backend, {})[parameter_name] = (function, secret)
        else:
            location_registry[parameter_name] = (function, secret)
        # Hand the function back; otherwise the decorated name is rebound to
        # None and can no longer be called directly.
        return function
    return decorator
|
|
27
|
+
|
|
28
|
+
def _is_fs_root(p):
|
|
29
|
+
"""Check if a path is the root of a filesystem."""
|
|
30
|
+
return os.path.splitdrive(str(p))[1] == os.sep
|
|
31
|
+
|
|
32
|
+
def get_config(location='root', requested_parameters=None, backend_name=None, mask_secrets=False):
    """Get the config for a given location and backend.

    Static values are read from the ``[tool.nuthatch]`` table of a
    ``pyproject.toml`` found by walking up from the current working
    directory. Values registered in ``dynamic_parameters`` are then computed
    and override static values of the same name.

    Args:
        location (str, optional): The location to get the config for. For
            ``'root'`` the keys directly under ``[tool.nuthatch]`` are used
            as well as those under ``[tool.nuthatch.root]``.
        requested_parameters (list, optional): The parameters to get the
            config for. Only these names appear in the result.
        backend_name (str, optional): The backend to get the config for.
            Backend-specific values take precedence over location-wide ones.
        mask_secrets (bool, optional): Replace dynamic parameters registered
            as secret with asterisks of the same length.

    Returns:
        dict: The requested parameters that could be resolved.

    Raises:
        FileNotFoundError: If no ``pyproject.toml`` is found.
    """
    if requested_parameters is None:
        requested_parameters = []

    # Find pyproject.toml. Every ancestor below the filesystem root is
    # checked and the last match is kept, so the file closest to the root
    # wins; the root directory itself is never inspected.
    # NOTE(review): confirm that preferring the outermost file is intended.
    current_directory = Path.cwd()
    config_file = None
    while not _is_fs_root(current_directory):
        candidate = current_directory.joinpath('pyproject.toml')
        if candidate.exists():
            config_file = candidate
        current_directory = current_directory.parent

    #TODO: enable ini and environment variable configuration

    if config_file is None:
        # Fail with a clear message instead of the TypeError open(None) gives.
        raise FileNotFoundError(
            "nuthatch could not find a pyproject.toml in the current working "
            "directory or any of its parent directories."
        )

    with open(config_file, "rb") as f:
        config = tomllib.load(f)

    # A pyproject.toml without a [tool.nuthatch] table is treated as empty.
    nuthatch_config = config.get('tool', {}).get('nuthatch', {})
    location_section = nuthatch_config.get(location, {})
    if not isinstance(location_section, dict):
        location_section = {}

    # If it's root allow the base parameters to be used and root to be set
    if location == 'root':
        location_params = {**nuthatch_config, **location_section}
    else:
        location_params = dict(location_section)

    # Look the backend table up in the merged location parameters, so a
    # backend table declared directly under [tool.nuthatch] also works for
    # the root location.
    backend_specific_params = {}
    if backend_name:
        backend_section = location_params.get(backend_name)
        if isinstance(backend_section, dict):
            backend_specific_params = backend_section

    # Merge the two together; the right-hand operand wins, so the more
    # specific backend values override location-wide ones (the same
    # precedence the dynamic parameters below use).
    merged_config = location_params | backend_specific_params

    filtered_config = {k: v for k, v in merged_config.items() if k in requested_parameters}

    # Now call all the relevant config registrations and add them
    location_registry = dynamic_parameters.get(location, {})
    backend_registry = location_registry.get(backend_name) if backend_name else None
    if not isinstance(backend_registry, dict):
        backend_registry = {}

    for p in requested_parameters:
        if p in backend_registry:
            function, secret = backend_registry[p]
        elif isinstance(location_registry.get(p), tuple):
            function, secret = location_registry[p]
        else:
            continue

        param = function()
        if secret and mask_secrets:
            # str() so non-string secrets can be masked as well.
            filtered_config[p] = '*' * len(str(param))
        else:
            filtered_config[p] = param

    return filtered_config
|
nuthatch/memoizer.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""The memoizer is a module that memoizes data in memory."""
|
|
2
|
+
import sys
|
|
3
|
+
import xarray as xr
|
|
4
|
+
import dask.dataframe as dd
|
|
5
|
+
from .config import get_config
|
|
6
|
+
|
|
7
|
+
memoized_objects = {}
|
|
8
|
+
cache_key_lru = []
|
|
9
|
+
|
|
10
|
+
def save_to_memory(cache_key, data):
    """Save data to memory.

    Implements special handling for xarray and dask dataframes, which are
    persisted before being stored.
    Evicts the least recently used objects when the memory limit is reached.
    The limit is the root config value 'maximum_memory_usage' (default
    100 * 10**6). Data larger than the limit is not memoized.

    Args:
        cache_key (str): The key to save the data to.
        data (any): The data to save to memory.
    """
    if isinstance(data, (xr.Dataset, dd.DataFrame)):
        data = data.persist()

    config = get_config(location='root', requested_parameters=['maximum_memory_usage'])
    # ``**`` is exponentiation; ``10^6`` would be a bitwise XOR.
    max_size = config.get('maximum_memory_usage', 100 * 10**6)

    # NOTE(review): sys.getsizeof is shallow, so container-like objects may be
    # under-counted — consider a type-aware size estimate.
    data_size = sys.getsizeof(data)
    if data_size > max_size:
        # Nothing can be evicted to make room for this object, so skip it.
        print("WARNING: Data too large to memoize.")
        return

    # Drop any previous value under this key so it is not counted twice.
    if cache_key in memoized_objects:
        del memoized_objects[cache_key]
    if cache_key in cache_key_lru:
        cache_key_lru.remove(cache_key)

    # Evict from the front of the LRU list (least recently used first) until
    # the new object fits or nothing is left to evict.
    stored_size = sum(sys.getsizeof(obj) for obj in memoized_objects.values())
    while cache_key_lru and stored_size + data_size > max_size:
        oldest_key = cache_key_lru.pop(0)
        if oldest_key in memoized_objects:
            stored_size -= sys.getsizeof(memoized_objects.pop(oldest_key))

    memoized_objects[cache_key] = data
    cache_key_lru.append(cache_key)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def recall_from_memory(cache_key):
    """Look up a memoized object by key.

    A hit moves the key to the most-recently-used end of the LRU list.

    Args:
        cache_key (str): The key to recall the data from.

    Returns:
        The memoized object, or None when the key has not been memoized.
    """
    if cache_key not in memoized_objects:
        return None

    # Re-append the key so it becomes the last one to be evicted.
    cache_key_lru.remove(cache_key)
    cache_key_lru.append(cache_key)

    return memoized_objects[cache_key]
|