nuthatch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nuthatch might be problematic. Click here for more details.

nuthatch/nuthatch.py ADDED
@@ -0,0 +1,498 @@
1
+ """The main module for the nuthatch package.
2
+
3
+ This module contains the main decorator for caching functions and the global
4
+ variables for caching configuration.
5
+ """
6
+ import inspect
7
+ from functools import wraps
8
+ from inspect import signature, Parameter
9
+ from .cache import Cache
10
+ from .backend import get_default_backend
11
+ from .config import get_config
12
+ from .memoizer import save_to_memory, recall_from_memory
13
+ import logging
14
+
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
# Global variables for caching configuration.
# They are reset by set_global_cache_variables() each time a top-level
# cacheable function runs, and are consulted by nested cacheable calls
# inside cacheable_wrapper to inherit the top-level settings.
global_recompute = None  # False, '_all', or a list of function names to recompute
global_memoize = None  # False, '_all', or a list of function names to memoize
global_force_overwrite = None  # None = prompt before overwriting; True/False = forced decision
global_retry_null_cache = None  # None = unset; True/False overrides nested calls
23
+
24
+
25
def _normalize_selector(value):
    """Normalize a recompute/memoize request for storage in the globals.

    - ``True`` means "only this top-level call", so nested functions must NOT
      inherit it: the stored global becomes ``False``.
    - A single function name (any string except ``'_all'``) becomes a
      one-element list so downstream membership tests (``name in selector``)
      behave consistently.
    - ``False``, ``'_all'``, a list of names, or ``None`` pass through as-is.
    """
    if value == True:  # noqa: E712
        return False
    if isinstance(value, str) and value != '_all':
        return [value]
    return value


def set_global_cache_variables(recompute=None, memoize=None, force_overwrite=None,
                               retry_null_cache=None):
    """Reset all global variables to defaults and set the new values.

    Called by the top-level cacheable function so that nested cacheable
    functions can inherit its cache settings.

    Args:
        recompute (bool, str, list): recompute request; normalized via
            ``_normalize_selector``.
        memoize (bool, str, list): memoize request; normalized via
            ``_normalize_selector``.
        force_overwrite (bool or None): stored as passed.
        retry_null_cache (bool or None): stored as passed.
    """
    global global_recompute, global_memoize, global_force_overwrite, \
        global_retry_null_cache

    # Simple pass-through flags
    global_force_overwrite = force_overwrite
    global_retry_null_cache = retry_null_cache

    # recompute and memoize share the same normalization rules; the duplicated
    # if/elif/else blocks are factored into one helper.
    global_recompute = _normalize_selector(recompute)
    global_memoize = _normalize_selector(memoize)
52
+
53
+
54
+
55
def check_if_nested_fn():
    """Return True if a cacheable function is active further up the call stack.

    Skips the first two stack frames (this helper and the cacheable function
    that invoked it), then looks for any enclosing frame whose code object is
    named ``cacheable_wrapper`` — the wrapper installed by the cache decorator.
    """
    outer_frames = inspect.stack()[2:]
    return any(info.frame.f_code.co_name == "cacheable_wrapper"
               for info in outer_frames)
68
+
69
+
70
def sync_local_remote(cache, local_cache):
    """Sync a local cache mirror against a remote cache.

    The docstring previously documented parameters ``backend``/``local_backend``
    that do not exist; the actual parameters are documented below.

    Args:
        cache (Cache): The remote/root cache to sync against.
        local_cache (Cache): The local mirror cache; its ``sync`` method is
            delegated the actual work (direction and strategy of the sync are
            defined by ``Cache.sync``).
    """
    local_cache.sync(cache)
78
+
79
def get_cache_args(passed_kwargs, default_cache_kwargs, nonlocals):
    """Split cache-control kwargs out of a call's kwargs and resolve them.

    Pops every key of ``default_cache_kwargs`` out of ``passed_kwargs``
    (mutating it in place) and resolves each option in priority order:
    explicit call-time value, then the decorator-time value captured in
    ``nonlocals``, then the hard default.

    Args:
        passed_kwargs (dict): kwargs passed to the wrapped function; the
            recognized cache options are removed from it as a side effect.
        default_cache_kwargs (dict): mapping of recognized cache options to
            their defaults.
        nonlocals (dict): the decorator's captured configuration (``locals()``
            of the decorator factory).

    Returns:
        dict: the resolved cache configuration.
    """
    cache_args = {}
    for key, default in default_cache_kwargs.items():
        if key in passed_kwargs:
            cache_args[key] = passed_kwargs.pop(key)
        else:
            cache_args[key] = default

    # Backfill unset options from the decorator-time configuration.
    # NOTE(review): storage_backend / storage_backend_kwargs are never
    # backfilled from nonlocals, so decorator-level values for them are
    # effectively ignored here — confirm intended.
    for key in ('cache', 'namespace', 'engine', 'memoize', 'backend'):
        if cache_args[key] is None:
            cache_args[key] = nonlocals[key]

    # BUG FIX: the original assigned the result of dict.update(), which is
    # always None, so the merged backend kwargs were silently discarded (and
    # update(None) raised when no decorator-level kwargs were set). Build the
    # merged dict explicitly; decorator-time values override call-time ones,
    # matching the original update() direction.
    if isinstance(cache_args.get('backend_kwargs'), dict):
        merged = dict(cache_args['backend_kwargs'])
        merged.update(nonlocals['backend_kwargs'] or {})
        cache_args['backend_kwargs'] = merged
    elif 'backend_kwargs' in cache_args and cache_args['backend_kwargs'] is None:
        cache_args['backend_kwargs'] = nonlocals['backend_kwargs']

    if cache_args['cache_local'] is None:
        cache_args['cache_local'] = nonlocals['cache_local']

    return cache_args
109
+
110
+
111
def check_cache_disable_if(cache_disable_if, cache_arg_values):
    """Decide whether caching stays enabled for the given argument values.

    ``cache_disable_if`` is a dict or list of dicts. Each dict maps argument
    names to a value (or list of acceptable values); if every named argument
    that was actually passed matches, the cache is disabled.

    Returns:
        bool: True to keep the cache enabled, False to disable it.

    Raises:
        ValueError: if ``cache_disable_if`` is neither a dict nor a list.
    """
    if not cache_disable_if:
        return True

    if isinstance(cache_disable_if, dict):
        rules = [cache_disable_if]
    elif isinstance(cache_disable_if, list):
        rules = cache_disable_if
    else:
        raise ValueError("cache_disable_if only accepts a dict or list of dicts.")

    for rule in rules:
        if not isinstance(rule, dict):
            raise ValueError("cache_disable_if only accepts a dict or list of dicts.")

        # Only compare on arguments that appear in both the rule and the call.
        # NOTE(review): with zero overlapping keys, all() below is vacuously
        # True and the cache is disabled — confirm this is intended.
        shared = set(cache_arg_values).intersection(rule)
        trimmed = {key: rule[key] for key in shared}

        def _matches(key):
            wanted = trimmed[key]
            actual = cache_arg_values[key]
            # A list in the rule means "any of these values"
            if isinstance(wanted, list):
                return actual in wanted
            return actual == wanted

        if all(_matches(key) for key in shared):
            print(f"Caching disabled for arg values {trimmed}")
            return False

    # No rule matched — keep the cache enabled
    return True
151
+
152
+
153
def extract_cache_arg_values(cache_args, args, params, kwargs):
    """Resolve the value of every cacheable argument for this call.

    Resolution order per argument name: explicit keyword in ``kwargs``, then
    positional value (by the parameter's position in the signature), then the
    parameter's declared default.

    Args:
        cache_args (list): argument names that form the cache key.
        args (tuple): positional arguments of the call.
        params (Mapping): ``signature(func).parameters`` of the wrapped function.
        kwargs (dict): keyword arguments of the call.

    Returns:
        dict: mapping of each cacheable argument name to its resolved value.

    Raises:
        RuntimeError: if a name in ``cache_args`` cannot be resolved from the
            call or the signature defaults.
    """
    resolved = {}
    positional_kinds = (Parameter.VAR_POSITIONAL, Parameter.POSITIONAL_OR_KEYWORD)

    for name in cache_args:
        # Explicit keyword wins outright
        if name in kwargs:
            resolved[name] = kwargs[name]
            continue

        # Otherwise it must be positional or carry a signature default
        for index, (param_name, param) in enumerate(params.items()):
            if param_name != name:
                continue
            if index < len(args) and param.kind in positional_kinds:
                resolved[name] = args[index]
                break
            if param.default != Parameter.empty:
                resolved[name] = param.default
                break
        else:
            raise RuntimeError(f"Specified cacheable argument {name} "
                               "not discovered as passed argument or default argument.")

    return resolved
183
+
184
+
185
def get_cache_key(func, cache_arg_values):
    """Build the cache key from the function name and its key argument values.

    Values are flattened deterministically: argument names are sorted, list
    elements are stringified and sorted, dict items become sorted ``"k-v"``
    strings, and everything else is stringified in place. The key is
    ``"<func name>/<flattened values joined by '_'>"``.
    """
    pieces = []
    for arg_name in sorted(cache_arg_values):
        value = cache_arg_values[arg_name]
        if isinstance(value, list):
            pieces.extend(sorted(str(item) for item in value))
        elif isinstance(value, dict):
            pieces.extend(sorted(f"{k}-{v}" for k, v in value.items()))
        else:
            pieces.append(str(value))

    return func.__name__ + '/' + '_'.join(pieces)
204
+
205
+
206
def instantiate_read_caches(cache_key, namespace, cache_arg_values, requested_backend, backend_kwargs):
    """Return a priority-ordered mapping of caches to read from.

    Priority order is: the local mirror (if configured), then the root cache,
    then any mirror caches. A location without configuration maps to None,
    except 'root', which is mandatory.

    Args:
        cache_key (str): The cache key.
        namespace (str): The namespace.
        cache_arg_values (dict): The cache arguments.
        requested_backend (str): The requested backend.
        backend_kwargs (dict): The backend kwargs.

    Returns:
        dict: location name -> Cache instance (or None if unconfigured),
        in read-priority order.

    Raises:
        ValueError: if no configuration exists for the 'root' location.
    """
    caches = {}
    for location in ('local', 'root', 'mirror'):
        config = get_config(location=location, requested_parameters=Cache.config_parameters)
        if config:
            caches[location] = Cache(config, cache_key, namespace, cache_arg_values,
                                     location, requested_backend, backend_kwargs)
        elif location == 'root':
            # The root metadata store is not optional
            raise ValueError("At least a root filesystem for metadata storage must be configured. No configuration found.")
        else:
            caches[location] = None

    return caches
241
+
242
+
243
def cache(cache=True,
          namespace=None,
          cache_args=[],  # NOTE(review): shared mutable default; only ever read here, never mutated
          cache_disable_if=None,
          engine=None,
          backend=None,
          backend_kwargs=None,
          storage_backend=None,
          storage_backend_kwargs=None,
          cache_local=False,
          memoize=False,
          primary_keys=None):
    """Decorator for caching function results.

    Args:
        cache(bool): Whether to cache the result.
        namespace(str): Namespace under which the cache entry is stored.
        cache_args(list): The arguments to use as the cache key.
        cache_disable_if(dict, list): If the cache arguments match the dict or list of dicts
            then the cache will be disabled. This is useful for disabling caching based on
            certain arguments. Defaults to None.
        engine(str): Engine passed through to the backend's read method.
        backend(str): The name of the backend to use for cache recall/storage. None for
            default, zarr, delta, postgres, terracotta.
        backend_kwargs(dict): A dictionary of backend-specific arguments that will be passed
            to and used by the backend for reading and possibly writing.
        storage_backend(str): The name of the backend to use for cache storage only. None
            to match backend. Useful for pulling from one backend and writing to another.
        storage_backend_kwargs(dict): A dictionary of backend-specific arguments that will
            be passed to and used by the backend for writing.
        cache_local (bool): If True, will mirror the result locally, at the location
            specified by the LOCAL_CACHE_ROOT_DIR variable. Default is False.
        memoize(bool): Whether to memoize the result in memory. Default is False.
        primary_keys (list(str)): Column names of the primary keys to use for upsert.

    The wrapped function additionally accepts the call-time keyword arguments listed
    in ``default_cache_kwargs`` below (they are stripped from the call before the
    wrapped function runs), notably:

        recompute(bool, str, list): Recompute the result instead of reading the cache.
            A function name, list of names, or '_all' also propagates recomputation
            to nested cacheable calls.
        force_overwrite(bool): Whether to overwrite the cache if it already exists
            (if None, will prompt the user before overwriting).
        retry_null_cache(bool): If True, ignore and delete the null caches and attempt
            to recompute the result for null values. If False (default), will return
            None for null caches.
        filepath_only(bool): Return the cache file path instead of the loaded data.
        upsert(bool): Write with upsert semantics using ``primary_keys``.
        fail_if_no_cache(bool): Raise a RuntimeError instead of computing when no
            cache entry exists.
    """
    # Valid configuration kwargs for the cacheable decorator, with the default
    # used when a given option is passed neither at call time nor above.
    default_cache_kwargs = {
        "cache": None,
        "namespace": None,
        "engine": None,
        "backend": None,
        "backend_kwargs": None,
        "cache_local": False,
        "storage_backend": None,
        "storage_backend_kwargs": None,
        "filepath_only": False,
        "recompute": False,
        "memoize": False,
        "force_overwrite": None,
        "retry_null_cache": False,
        "upsert": False,
        "fail_if_no_cache": False,
    }

    # Snapshot the decorator arguments (plus default_cache_kwargs) so the
    # wrapper can fall back to them when a call doesn't override an option.
    nonlocals = locals()

    def create_cacheable(func):

        @wraps(func)
        def cacheable_wrapper(*args, **passed_kwargs):
            # Pop cache-control kwargs off the call and resolve them against
            # the decorator-time configuration (mutates passed_kwargs).
            final_cache_config = get_cache_args(passed_kwargs, default_cache_kwargs, nonlocals)

            # Set all the final cache config variables
            cache = final_cache_config['cache']
            namespace = final_cache_config['namespace']
            engine = final_cache_config['engine']
            backend = final_cache_config['backend']
            backend_kwargs = final_cache_config['backend_kwargs']
            storage_backend = final_cache_config['storage_backend']
            storage_backend_kwargs = final_cache_config['storage_backend_kwargs']
            filepath_only = final_cache_config['filepath_only']
            recompute = final_cache_config['recompute']
            cache_local = final_cache_config['cache_local']
            memoize = final_cache_config['memoize']
            force_overwrite = final_cache_config['force_overwrite']
            retry_null_cache = final_cache_config['retry_null_cache']
            upsert = final_cache_config['upsert']
            fail_if_no_cache = final_cache_config['fail_if_no_cache']
            cache_args = nonlocals['cache_args']
            cache_disable_if = nonlocals['cache_disable_if']
            primary_keys = nonlocals['primary_keys']

            # Check if this is a nested cacheable function
            if not check_if_nested_fn():
                # This is a top level cacheable function, reset global cache variables
                # (a plain True is stored as False so it does NOT propagate downward).
                set_global_cache_variables(recompute=recompute, memoize=memoize,
                                           force_overwrite=force_overwrite,
                                           retry_null_cache=retry_null_cache)
                # A name, list of names, or '_all' also means "recompute/memoize here"
                if isinstance(recompute, list) or isinstance(recompute, str) or recompute == '_all':
                    recompute = True
                if isinstance(memoize, list) or isinstance(memoize, str) or memoize == '_all':
                    memoize = True
            else:
                # Inherit global cache variables set by the top-level call
                global global_recompute, global_memoize, global_force_overwrite, global_retry_null_cache

                # Set all global variables
                if global_force_overwrite is not None:
                    force_overwrite = global_force_overwrite
                if global_retry_null_cache is not None:
                    retry_null_cache = global_retry_null_cache
                if global_recompute:
                    if func.__name__ in global_recompute or global_recompute == '_all':
                        recompute = True
                if global_memoize:
                    if func.__name__ in global_memoize or global_memoize == '_all':
                        memoize = True

            # Get the function parameters and resolve the cache-key argument values
            params = signature(func).parameters
            cache_arg_values = extract_cache_arg_values(cache_args, args, params, passed_kwargs)

            # Disable the cache if it's enabled and the function params/values match the disable statement
            if cache:
                cache = check_cache_disable_if(cache_disable_if, cache_arg_values)

            # Calculate our unique cache key from the params and values
            cache_key = get_cache_key(func, cache_arg_values)

            ds = None  # the result, either recalled from a cache or computed below
            compute_result = True

            read_caches = instantiate_read_caches(cache_key, namespace, cache_arg_values, backend, backend_kwargs)

            # Try to sync local/remote only once on read. All syncing is done lazily
            if cache_local:
                if not read_caches['local']:
                    raise ValueError("Local filesystem must be configured if local caching is requested.")

                sync_local_remote(read_caches['root'], read_caches['local'])
            else:
                # If local isn't set we shouldn't use it even if it's configured
                read_caches['local'] = None

            # Try the memoizer first
            if not recompute and not upsert and cache and memoize:
                ds = recall_from_memory(cache_key)
                if ds:
                    print(f"Found cache for {cache_key} in memory.")
                    compute_result = False

            # Try to read from the cache in priority locations
            used_read_backend = None
            if not recompute and not upsert and cache and not ds:
                for location, read_cache in read_caches.items():
                    # If the metadata is null this backend isn't configured - continue
                    if not read_cache:
                        continue

                    # First check if it's null
                    if read_cache.is_null():
                        print(f"Found null cache for {cache_key} in {location} cache.")
                        if retry_null_cache:
                            print("Retry null cache set. Recomputing.")
                            read_cache.delete_null()
                            break
                        else:
                            # Null cache means "computed before, result was None"
                            return None

                    # If it's not null see if it exists
                    if read_cache.exists():
                        print(f"Found cache for {cache_key} with backend {read_cache.get_backend()} in {location} cache")

                        used_read_backend = read_cache.get_backend()
                        if filepath_only:
                            return read_cache.get_file_path()
                        else:
                            ds = read_cache.read(engine=engine)

                        if memoize:
                            print(f"Memoizing {cache_key}.")
                            save_to_memory(cache_key, ds)

                        compute_result = False
                        break

            # If the cache doesn't exist or we are recomputing, compute the result
            if compute_result:
                if recompute:
                    print(f"Recompute for {cache_key} requested. Not checking for cached result.")
                elif upsert:
                    print(f"Computing {cache_key} to enable data upsert.")
                elif not cache:
                    # The function isn't cacheable, recomputing
                    pass
                else:
                    if fail_if_no_cache:
                        raise RuntimeError(f"""Computation has been disabled by
                        `fail_if_no_cache` and cache doesn't exist for {cache_key}.""")

                    print(f"Cache doesn't exist for {cache_key}. Running function")

                ##### IF NOT EXISTS ######
                ds = func(*args, **passed_kwargs)
                ##########################

                if memoize:
                    print(f"Memoizing {cache_key}.")
                    save_to_memory(cache_key, ds)

            # Store the result (also re-store when a distinct storage backend
            # differs from the backend the result was just read from)
            if cache and (compute_result or (storage_backend and storage_backend != used_read_backend)):
                # Instantiate write backend
                write_cache = None
                write_cache_config = get_config(location='root', requested_parameters=Cache.config_parameters)
                if write_cache_config:
                    # NOTE(review): when both storage_backend and backend are passed,
                    # this elif overwrites storage_backend with backend, defeating the
                    # documented read-from-one/write-to-another use — confirm intended.
                    if not storage_backend and not backend:
                        storage_backend = get_default_backend(type(ds))
                        if not storage_backend_kwargs:
                            storage_backend_kwargs = backend_kwargs
                    elif backend:
                        storage_backend = backend
                        storage_backend_kwargs = backend_kwargs

                    write_cache = Cache(write_cache_config, cache_key, namespace,
                                        cache_arg_values, 'root', storage_backend, storage_backend_kwargs)
                else:
                    raise ValueError("At least a root filesystem for metadata storage must be configured. No configuration found.")

                # A None result is recorded as a null cache marker instead of data
                if ds is None:
                    if not upsert:
                        write_cache.set_null()
                    else:
                        print("Null result not cached in upsert mode.")

                    return None

                write = False  # boolean to determine if we should write to the cache
                if (write_cache.exists() and force_overwrite is None and not upsert):
                    # force_overwrite is None -> interactive confirmation
                    inp = input(f"""A cache already exists at {cache_key} for type {write_cache.get_backend()}
                    Are you sure you want to overwrite it? (y/n)""")
                    if inp == 'y' or inp == 'Y':
                        write = True
                elif force_overwrite is False:
                    pass
                else:
                    write = True

                if write:
                    print(f"Caching result for {cache_key} in {write_cache.get_backend()}.")
                    write_cache.write(ds, upsert=upsert, primary_keys=primary_keys)

            if filepath_only:
                # NOTE(review): if caching was disabled above, write_cache is unbound
                # here and this raises NameError — confirm intended behavior.
                return write_cache.get_file_path()
            else:
                return ds

        return cacheable_wrapper
    return create_cacheable
nuthatch/processor.py ADDED
@@ -0,0 +1,89 @@
1
+ """
2
+ This module contains the NuthatchProcessor class, which is a base class for all Nuthatch processors.
3
+ """
4
+ from abc import ABC, abstractmethod
5
+ from inspect import signature
6
+
7
class NuthatchProcessor(ABC):
    """Base class for all Nuthatch processors.

    A NuthatchProcessor wraps a function and provides hooks to process the
    function's arguments (``process_arguments``), validate the function's
    return value (``validate_data``), and post-process the return value
    (``post_process``). If validation fails, the wrapped function is invoked
    again with ``recompute=True`` — and, after either an explicit
    ``force_overwrite`` kwarg or an interactive confirmation,
    ``force_overwrite=True`` — so a cache-decorated function recomputes and
    overwrites its stored result.
    """
    def __init__(self, func):
        # The wrapped callable (typically a cache-decorated function).
        self.func = func

    def __call__(self, *args, **kwargs):
        """Call the wrapped function with the given arguments.

        Arguments are first passed through ``process_arguments``; the result
        is validated with ``validate_data`` and, if invalid, recomputed (see
        class docstring); finally the result goes through ``post_process``.
        """
        params = signature(self.func).parameters
        args, kwargs = self.process_arguments(params, args, kwargs)

        data = self.func(*args, **kwargs)

        if not self.validate_data(data):
            if 'force_overwrite' in kwargs and kwargs['force_overwrite']:
                # FIX: message previously said "forceoverwrite"; use the real kwarg name.
                print("Data validation failed and force_overwrite set. Overwriting the result.")
                kwargs['recompute'] = True  # TODO - does this mess with recompute?
                data = self.func(*args, **kwargs)
            else:
                inp = input("""Data failed validation. Would you like to overwrite the result (y/n)?""")
                if inp == 'y' or inp == 'Y':
                    kwargs['recompute'] = True  # TODO - does this mess with recompute?
                    kwargs['force_overwrite'] = True  # TODO - does this mess with recompute?
                    data = self.func(*args, **kwargs)

        data = self.post_process(data)

        return data

    @abstractmethod
    def post_process(self, data):
        """
        Post-process the data.

        Args:
            data: The data to post-process.

        Returns:
            The post-processed data.

        Raises:
            ValueError: If the data is of the wrong type.
        """
        return data

    def process_arguments(self, params, args, kwargs):
        """
        Process the arguments.

        Args:
            params: The parameters of the function (``signature(func).parameters``).
            args: The arguments to the function.
            kwargs: The keyword arguments to the function.

        Returns:
            The processed arguments and keyword arguments to be passed to the function.
        """
        return args, kwargs

    @abstractmethod
    def validate_data(self, data):
        """
        Validate the data. Used to trigger recomputation if the data is invalid.

        Args:
            data: The data to validate.

        Returns:
            True if the data is valid, False otherwise.
        """
        return True
84
+
85
+ # Example use of a NuthatchProcessor subclass via a decorator factory:
86
+ #def my_dec_factory(param1, param2):
87
+ # def decorator(func):
88
+ # return MyDecorator(func, param1, param2)
89
+ # return decorator
@@ -0,0 +1,6 @@
1
+ """
2
+ This module contains Nuthatch processors.
3
+ """
4
+ from .timeseries import timeseries
5
+
6
+ __all__ = ["timeseries"]