nuthatch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nuthatch might be problematic. Click here for more details.

nuthatch/cli.py ADDED
@@ -0,0 +1,174 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Command-line interface for nuthatch.
4
+
5
+ This module provides a CLI for interacting with nuthatch caching functionality,
6
+ including cache management, backend operations, and configuration.
7
+ """
8
+
9
+ import importlib
10
+ import click
11
+ import shutil
12
+ from .config import get_config
13
+ from .backend import get_backend_by_name, registered_backends
14
+ from .cache import Cache
15
+ import pandas as pd
16
+
17
+
18
@click.group()
@click.version_option(version="0.1.0", prog_name="nuthatch")
def cli():
    """Nuthatch - Intelligent caching system for data science workflows.

    This CLI provides tools for managing cache entries, inspecting backends,
    and configuring the nuthatch system.
    """
    # This group callback runs before every subcommand. It looks up the
    # optional 'dynamic_config_path' setting and, when present, imports the
    # module it names. NOTE(review): importing that module is presumably what
    # registers dynamic parameter resolvers -- the warning text below implies
    # so; confirm against the module the setting points to.
    config = get_config(location='root', requested_parameters=['dynamic_config_path'], backend_name=None)
    if 'dynamic_config_path' in config:
        try:
            importlib.import_module(config['dynamic_config_path'])
        except Exception as e:
            # Deliberately best-effort: a broken dynamic config module must not
            # make the whole CLI unusable. The warning goes to stderr so that
            # piped command output (e.g. `nuthatch list`) stays clean.
            click.echo(
                f"WARNING: Failed to import {config['dynamic_config_path']} with '{e}'. "
                "You may be missing dynamic secret resolution.",
                err=True,
            )
33
+
34
+
35
+
36
@cli.command('import')
@click.argument('cache_key')
@click.option('--namespace', help='Namespace for the cache')
@click.option('--backend', help='Backend to use', required=True)
@click.option('--location', help='Location to search', default='root')
def import_data(cache_key, namespace, backend, location):
    """Import data from a glob pattern."""
    # Build a throwaway backend instance for the requested backend; it is only
    # used to glob for existing files and map each path back to a cache key.
    backend_name = backend
    backend_class = get_backend_by_name(backend_name)
    backend_config = get_config(
        location=location,
        requested_parameters=backend_class.config_parameters,
        backend_name=backend_class.backend_name,
    )
    probe = backend_class(backend_config, cache_key, namespace, None, {})

    # Backends without a usable filesystem handle contribute no paths.
    matched_paths = []
    if getattr(probe, 'fs', None) is not None:
        matched_paths = probe.fs.glob(probe.path)
    cache_keys = [probe.get_cache_key(path) for path in matched_paths]

    if not cache_keys:
        print("No caches found for import.")
        return

    click.confirm(f"Are you sure you want to import {len(matched_paths)} cache entries?", abort=True)

    for key in cache_keys:
        print(f"Importing {key}.")
        cache_config = get_config(location=location, requested_parameters=Cache.config_parameters)

        if backend_name != 'null':
            # Regular backend: register metadata unless the entry already exists.
            cache = Cache(cache_config, key, namespace, None, location, backend_name, {})
            if cache.exists():
                print(f"{key} already in cache!")
            else:
                cache._commit_metadata()
                print(f"Imported {key} successfully.")
            continue

        # 'null' backend: the Cache is built without a backend name and the
        # entry is marked null unless it is already null or already valid.
        cache = Cache(cache_config, key, namespace, None, location, None, {})
        if cache.is_null():
            print(f"{key} already in cache as null!")
        elif cache.exists():
            print(f"Cache {key} already exists and is valid. Skipping entry. Delete this cache key if you would like to reimport it as null.")
        else:
            cache.set_null()
            print(f"Set {key} successfully to null.")
82
+
83
def list_helper(cache_key, namespace, backend, location):
    """Return the cache entries matching ``cache_key`` at ``location``.

    Args:
        cache_key: Key pattern handed to ``Cache.list``; ``None`` means ``'*'``.
        namespace: Namespace passed through to the ``Cache`` constructor.
        backend: Backend name passed through to the ``Cache`` constructor.
        location: Location used both for the config lookup and the ``Cache``.

    Returns:
        Whatever ``Cache.list`` returns for the pattern.
    """
    pattern = '*' if cache_key is None else cache_key
    cache_config = get_config(location=location, requested_parameters=Cache.config_parameters)
    lister = Cache(cache_config, None, namespace, None, location, backend, {})
    return lister.list(pattern)
94
+
95
@cli.command('list')
@click.argument('cache_key', required=False)
@click.option('--namespace', help='Namespace for the cache')
@click.option('--backend', help='Backend filter')
@click.option('--location', help='Location to search', default='root')
@click.option('--long', '-l', is_flag=True, help='List all information about the cache')
def list_caches(cache_key, namespace, backend, location, long):
    """List cache entries, optionally with their full metadata."""
    caches = list_helper(cache_key, namespace, backend, location)

    if not long:
        # Short form: one cache key per line.
        rendered = '\n'.join(cache['cache_key'] for cache in caches)
    elif len(caches) == 0:
        # An empty DataFrame has no 'last_modified' column, so building the
        # table would raise KeyError; print nothing instead.
        rendered = ''
    else:
        table = pd.DataFrame(caches)
        # 'last_modified' is interpreted as microseconds since the epoch and
        # truncated to whole seconds for display.
        table['last_modified'] = pd.to_datetime(table['last_modified'], unit='us').dt.floor('s')
        table = table[['cache_key', 'namespace', 'backend', 'state', 'last_modified', 'user', 'commit_hash', 'path']]
        rendered = table.to_string()

    # Page only when the rendered output has more lines than the terminal has
    # rows. get_terminal_size() is (columns, lines); index 0 is the width, so
    # the height has to be read from `.lines`.
    line_count = rendered.count('\n') + 1
    if line_count > shutil.get_terminal_size().lines:
        click.echo_via_pager(rendered)
    else:
        click.echo(rendered)
120
+
121
+
122
@cli.command('delete')
@click.argument('cache_key')
@click.option('--namespace', help='Namespace for the cache')
@click.option('--backend', help='Backend to use')
@click.option('--location', help='Location to search', default='root')
@click.option('--force', '-f', is_flag=True, help='Force deletion without confirmation')
@click.option('--metadata-only', '-m', is_flag=True, help='Only delete the metadata for the cache, not the underlying data.')
def delete_cache(cache_key, namespace, backend, location, force, metadata_only):
    """Clear cache entries."""
    caches = list_helper(cache_key, namespace, backend, location)
    config = get_config(location=location, requested_parameters=Cache.config_parameters)

    if len(caches) == 0:
        # Nothing matched; do not ask the user to confirm deleting 0 entries.
        click.echo("No matching cache entries found.")
        return

    # --force skips the interactive confirmation, as its help text promises.
    if not force:
        click.confirm(f"Are you sure you want to delete {len(caches)} cache entries?", abort=True)

    for entry in caches:
        # A separate name for the listing entry keeps it from being shadowed
        # by the Cache object built from it.
        cache = Cache(
            config,
            _entry_field(entry, 'cache_key'),
            _entry_field(entry, 'namespace'),
            None,
            location,
            _entry_field(entry, 'backend'),
            {},
        )
        click.echo(f"Deleting {cache.cache_key} from {cache.location} with backend {cache.backend_name}.")
        if metadata_only:
            cache._delete_metadata()
        else:
            cache.delete()


def _entry_field(entry, name):
    """Read ``name`` from a listing entry via attribute, falling back to key access.

    The 'list' command reads entries with ``entry['cache_key']`` while this
    command originally used ``entry.cache_key``; accepting both keeps deletion
    working whichever shape ``Cache.list`` returns.
    NOTE(review): confirm the actual entry type and simplify accordingly.
    """
    try:
        return getattr(entry, name)
    except AttributeError:
        return entry[name]
143
+
144
+
145
@cli.command('print-config')
@click.option('--location', help='Location to search', default='root')
@click.option('--backend', help='Backend to use')
@click.option('--show-secrets', '-s', is_flag=True, help='Show secret values instead of masking them.')
def get_config_value(location, backend, show_secrets):
    """Print the resolved configuration for the cache and its backends."""
    # With --backend only that backend is shown; otherwise the Cache itself
    # followed by every registered backend.
    if backend:
        backend_classes = [get_backend_by_name(backend)]
    else:
        backend_classes = [Cache, *registered_backends.values()]

    for backend_class in backend_classes:
        # Secrets are masked unless the user explicitly asked to see them.
        config = get_config(
            location=location,
            requested_parameters=backend_class.config_parameters,
            backend_name=backend_class.backend_name,
            mask_secrets=not show_secrets,
        )

        click.echo(backend_class.backend_name.title())
        for key, value in config.items():
            click.echo(f"\t{key}: {value}")
        click.echo()
168
+
169
def main():
    """Console entry point: invoke the click command group and return its result."""
    return cli()


if __name__ == '__main__':
    main()
nuthatch/config.py ADDED
@@ -0,0 +1,94 @@
1
+ """The config module is used to get the config for a given location and backend."""
2
+ from pathlib import Path
3
+ import os
4
+ import tomllib
5
+
6
# Registry of dynamic parameter resolvers, filled in by @config_parameter.
# Layout: {location: {parameter_name: (function, secret)}} for location-wide
# parameters and {location: {backend: {parameter_name: (function, secret)}}}
# for backend-specific ones.
dynamic_parameters = {}


def config_parameter(parameter_name, location='root', backend=None, secret=False):
    """A decorator to register a function as a dynamic parameter.

    Args:
        parameter_name (str): The name of the parameter.
        location (str, optional): The location to register the parameter.
        backend (str, optional): The backend to register the parameter.
        secret (bool, optional): Stored alongside the function; get_config
            masks the resolved value when this is true and masking is requested.

    Returns:
        A decorator that records the function in ``dynamic_parameters`` and
        returns it unchanged.
    """
    def decorator(function):
        location_registry = dynamic_parameters.setdefault(location, {})
        if backend:
            location_registry.setdefault(backend, {})[parameter_name] = (function, secret)
        else:
            location_registry[parameter_name] = (function, secret)
        # Hand the function back; otherwise the decorated name is rebound to
        # None and can no longer be called directly.
        return function
    return decorator
27
+
28
def _is_fs_root(p):
    """Return True when, after stripping any drive prefix, ``p`` is exactly one path separator."""
    _drive, remainder = os.path.splitdrive(str(p))
    return remainder == os.sep
31
+
32
def get_config(location='root', requested_parameters=[], backend_name=None, mask_secrets=False):
    """Get the config for a given location and backend.

    Static values are read from the ``[tool.nuthatch]`` table of a
    pyproject.toml found in the current directory or one of its ancestors;
    values from registered dynamic parameters are then layered on top.
    A missing pyproject.toml or a missing ``[tool.nuthatch]`` table is treated
    as an empty static config, so only dynamic parameters can be returned.

    Args:
        location (str, optional): The location to get the config for.
        requested_parameters (list, optional): The parameters to get the config for.
            The default list is only read, never mutated.
        backend_name (str, optional): The backend to get the config for.
        mask_secrets (bool, optional): Replace the value of dynamic parameters
            registered as secret with asterisks.

    Returns:
        dict: The requested parameters that could be resolved.
    """
    # Find pyproject.toml. The walk does not stop at the first hit, so the
    # match closest to the filesystem root wins; the root directory itself is
    # never inspected.
    # NOTE(review): tools usually prefer the nearest pyproject.toml -- confirm
    # whether "outermost wins" is intended before adding an early exit.
    current_directory = Path.cwd()

    config_file = None
    while not _is_fs_root(current_directory):
        candidate = current_directory.joinpath('pyproject.toml')
        if candidate.exists():
            config_file = candidate

        current_directory = current_directory.parent

    #TODO: enable ini and environment variable configuration

    nuthatch_table = {}
    if config_file is not None:
        with open(config_file, "rb") as f:
            parsed = tomllib.load(f)
        nuthatch_table = parsed.get('tool', {}).get('nuthatch', {})

    # If it's root allow the base parameters to be used and root to be set
    if location == 'root':
        # Copy so the update below does not write into the parsed document.
        location_params = dict(nuthatch_table)
        location_params.update(nuthatch_table.get('root', {}))
    else:
        location_params = nuthatch_table.get(location, {})

    # Read the backend table from the already-resolved location parameters.
    # Indexing nuthatch_table[location] instead would raise KeyError for
    # 'root' whenever the backend table sits at the top level.
    if backend_name and backend_name in location_params:
        backend_specific_params = location_params[backend_name]
    else:
        backend_specific_params = {}

    # Merge the two together. The right-hand operand wins, so location-level
    # values override backend-specific ones.
    # NOTE(review): the dynamic parameters below give the backend-specific
    # entry priority instead -- confirm which precedence is intended.
    merged_config = backend_specific_params | location_params

    filtered_config = {k: v for k, v in merged_config.items() if k in requested_parameters}

    # Now call all the relevant config registrations and add them
    location_registry = dynamic_parameters.get(location, {})
    backend_registry = location_registry.get(backend_name, {})
    for p in requested_parameters:
        if p in backend_registry:
            resolver, secret = backend_registry[p]
        elif p in location_registry:
            resolver, secret = location_registry[p]
        else:
            continue

        value = resolver()
        if secret and mask_secrets:
            # str() keeps masking from failing on values without a length.
            filtered_config[p] = '*' * len(str(value))
        else:
            filtered_config[p] = value

    return filtered_config
nuthatch/memoizer.py ADDED
@@ -0,0 +1,67 @@
1
+ """The memoizer is a module that memoizes data in memory."""
2
+ import sys
3
+ import xarray as xr
4
+ import dask.dataframe as dd
5
+ from .config import get_config
6
+
7
+ memoized_objects = {}
8
+ cache_key_lru = []
9
+
10
def save_to_memory(cache_key, data):
    """Save data to memory.

    Implements special handling for xarray and dask dataframes.
    Evicts the least recently used objects until the new object fits under
    the configured 'maximum_memory_usage' (default 100 * 10**6 bytes).
    Data that is larger than the limit on its own is not memoized.

    Args:
        cache_key (str): The key to save the data to.
        data (any): The data to save to memory.
    """
    if isinstance(data, (xr.Dataset, dd.DataFrame)):
        data = data.persist()

    settings = get_config(location='root', requested_parameters=['maximum_memory_usage'])
    # `**` is exponentiation; `^` is bitwise XOR and would yield 1006 here.
    max_size = settings.get('maximum_memory_usage', 100 * 10**6)

    # Drop any previous entry for this key first, so it neither counts toward
    # current usage nor gets returned stale if the new data is rejected.
    if cache_key in memoized_objects:
        del memoized_objects[cache_key]
    if cache_key in cache_key_lru:
        cache_key_lru.remove(cache_key)

    data_size = _approximate_size(data)
    if data_size > max_size:
        # Evicting cannot make room for this object, so skip it instead of
        # emptying the cache and then indexing into an empty LRU list.
        print("WARNING: Data too large to memoize.")
        return

    # Account for the stored objects themselves; sys.getsizeof on the dict
    # would only measure the container.
    used = sum(_approximate_size(stored) for stored in memoized_objects.values())
    while cache_key_lru and used + data_size > max_size:
        evicted_key = cache_key_lru.pop(0)
        used -= _approximate_size(memoized_objects.pop(evicted_key))

    memoized_objects[cache_key] = data
    cache_key_lru.append(cache_key)


def _approximate_size(obj):
    """Best-effort size in bytes: ``obj.nbytes`` when it is an int, else shallow ``sys.getsizeof``."""
    reported = getattr(obj, 'nbytes', None)
    if isinstance(reported, int) and not isinstance(reported, bool):
        return reported
    return sys.getsizeof(obj)
46
+
47
+
48
def recall_from_memory(cache_key):
    """Look up a memoized object and mark it as most recently used.

    Args:
        cache_key (str): The key the data was saved under.

    Returns:
        The memoized object, or None when nothing is stored for the key.
    """
    if cache_key not in memoized_objects:
        return None

    # Move the key to the tail of the LRU list; the head is evicted first.
    cache_key_lru.remove(cache_key)
    cache_key_lru.append(cache_key)

    return memoized_objects[cache_key]