nuthatch 0.3.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. nuthatch-0.3.8/.github/workflows/publish.yml +36 -0
  2. nuthatch-0.3.8/.github/workflows/test.yml +51 -0
  3. nuthatch-0.3.8/.gitignore +16 -0
  4. nuthatch-0.3.8/.python-version +1 -0
  5. nuthatch-0.3.8/PKG-INFO +262 -0
  6. nuthatch-0.3.8/README.md +237 -0
  7. nuthatch-0.3.8/examples/readme_examples.py +31 -0
  8. nuthatch-0.3.8/pyproject.toml +129 -0
  9. nuthatch-0.3.8/src/nuthatch/__init__.py +12 -0
  10. nuthatch-0.3.8/src/nuthatch/backend.py +301 -0
  11. nuthatch-0.3.8/src/nuthatch/backends/__init__.py +8 -0
  12. nuthatch-0.3.8/src/nuthatch/backends/basic.py +30 -0
  13. nuthatch-0.3.8/src/nuthatch/backends/delta.py +51 -0
  14. nuthatch-0.3.8/src/nuthatch/backends/parquet.py +135 -0
  15. nuthatch-0.3.8/src/nuthatch/backends/sql.py +156 -0
  16. nuthatch-0.3.8/src/nuthatch/backends/terracotta.py +245 -0
  17. nuthatch-0.3.8/src/nuthatch/backends/zarr.py +240 -0
  18. nuthatch-0.3.8/src/nuthatch/cache.py +478 -0
  19. nuthatch-0.3.8/src/nuthatch/cli.py +306 -0
  20. nuthatch-0.3.8/src/nuthatch/config.py +416 -0
  21. nuthatch-0.3.8/src/nuthatch/memoizer.py +108 -0
  22. nuthatch-0.3.8/src/nuthatch/nuthatch.py +877 -0
  23. nuthatch-0.3.8/src/nuthatch/processor.py +151 -0
  24. nuthatch-0.3.8/src/nuthatch/processors/__init__.py +6 -0
  25. nuthatch-0.3.8/src/nuthatch/processors/timeseries.py +142 -0
  26. nuthatch-0.3.8/src/nuthatch/test_secrets.py +17 -0
  27. nuthatch-0.3.8/tests/__init__.py +0 -0
  28. nuthatch-0.3.8/tests/backends/__init__.py +0 -0
  29. nuthatch-0.3.8/tests/backends/tabular_test.py +62 -0
  30. nuthatch-0.3.8/tests/backends/test_basic.py +22 -0
  31. nuthatch-0.3.8/tests/backends/test_delta.py +5 -0
  32. nuthatch-0.3.8/tests/backends/test_parquet.py +5 -0
  33. nuthatch-0.3.8/tests/backends/test_sql.py +26 -0
  34. nuthatch-0.3.8/tests/backends/test_zarr.py +133 -0
  35. nuthatch-0.3.8/tests/processors/test_timeseries.py +211 -0
  36. nuthatch-0.3.8/tests/test_backend_identification.py +10 -0
  37. nuthatch-0.3.8/tests/test_cache_args.py +176 -0
  38. nuthatch-0.3.8/tests/test_config_registration.py +35 -0
  39. nuthatch-0.3.8/tests/test_core.py +94 -0
  40. nuthatch-0.3.8/tests/test_engines.py +1 -0
  41. nuthatch-0.3.8/tests/test_local.py +43 -0
  42. nuthatch-0.3.8/tests/test_memoizer.py +59 -0
  43. nuthatch-0.3.8/tests/test_mirror_storage.py +26 -0
  44. nuthatch-0.3.8/tests/test_namespace.py +27 -0
  45. nuthatch-0.3.8/tests/test_nuthatch_metastore.py +123 -0
  46. nuthatch-0.3.8/uv.lock +2477 -0
@@ -0,0 +1,36 @@
+ name: Publish to PyPI
+
+ on:
+   release:
+     types: [published]
+
+ jobs:
+   publish:
+     runs-on: ubuntu-latest
+     permissions:
+       id-token: write # Required for trusted publishing
+
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           fetch-depth: 0 # Fetch all history for git tags
+
+       - name: Set up Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: '3.12'
+
+       - name: Install uv
+         run: |
+           curl -LsSf https://astral.sh/uv/install.sh | sh
+           echo "$HOME/.cargo/bin" >> $GITHUB_PATH
+
+       - name: Build package
+         run: |
+           uv build
+
+       - name: Publish to PyPI
+         run: |
+           uv publish
+         env:
+           UV_PUBLISH_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
@@ -0,0 +1,51 @@
+ name: Tests
+
+ on:
+   push:
+     branches: [ main, develop ]
+   pull_request:
+     branches: [ main, develop ]
+
+ jobs:
+   test:
+     runs-on: ${{ matrix.os }}
+     strategy:
+       matrix:
+         os: [ubuntu-latest]
+         python-version: ['3.12']
+     permissions:
+       contents: read
+       id-token: write
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Auth to Google Cloud
+         uses: google-github-actions/auth@v2
+         with:
+           credentials_json: ${{ secrets.GCLOUD_CREDENTIALS }}
+
+       - name: Set up Python ${{ matrix.python-version }}
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python-version }}
+
+       - name: Install uv
+         run: |
+           curl -LsSf https://astral.sh/uv/install.sh | sh
+           echo "$HOME/.cargo/bin" >> $GITHUB_PATH
+
+       - name: Install dependencies
+         run: |
+           uv pip install --system -e . --group dev
+
+       - name: Run tests with pytest
+         run: |
+           pytest -v --cov=nuthatch --cov-report=xml --cov-report=term
+
+       - name: Upload coverage to Codecov
+         uses: codecov/codecov-action@v4
+         with:
+           file: ./coverage.xml
+           flags: unittests
+           name: codecov-${{ matrix.os }}-py${{ matrix.python-version }}
+           fail_ci_if_error: false
@@ -0,0 +1,16 @@
+ # python generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # venv
+ .venv
+
+ .cache
+ .cache*
+
+ .coverage
+ coverage.xml
@@ -0,0 +1 @@
+ 3.12.4
@@ -0,0 +1,262 @@
+ Metadata-Version: 2.3
+ Name: nuthatch
+ Version: 0.3.8
+ Summary: Cacheable big data pipelines
+ Author-email: Joshua Adkins <josh@rhizaresearch.org>, Genevieve Flaspohler <geneviee@rhizaresearch.org>
+ License: MIT
+ Requires-Python: >=3.12
+ Requires-Dist: click
+ Requires-Dist: dask-deltatable
+ Requires-Dist: dask[dataframe]
+ Requires-Dist: dateparser
+ Requires-Dist: deltalake==1.1.2
+ Requires-Dist: fsspec
+ Requires-Dist: gitpython
+ Requires-Dist: pandas
+ Requires-Dist: polars
+ Requires-Dist: psycopg2-binary
+ Requires-Dist: pyarrow
+ Requires-Dist: sqlalchemy
+ Requires-Dist: terracotta
+ Requires-Dist: tomli-w
+ Requires-Dist: xarray
+ Requires-Dist: zarr
+ Description-Content-Type: text/markdown
+
+ # Nuthatch
+
+ [![Tests](https://github.com/rhiza-research/nuthatch/actions/workflows/test.yml/badge.svg)](https://github.com/rhiza-research/nuthatch/actions/workflows/test.yml)
+
+ Nuthatch is a tool for building pure-Python big data pipelines. At its core it
+ enables the transparent multi-level caching and recall of results in formats that
+ are efficient for each data type. It supports a variety of
+ common storage backends, data processing frameworks, and their associated
+ data types for caching.
+
+ It also provides a framework for re-using and sharing data-type-specific
+ post-processing, and for these data type
+ processors to pass hints to storage backends for more efficient storage and recall.
+
+ Nuthatch was created to alleviate the common pattern of data processing pipelines manually
+ specifying their output storage locations, and the requirement that pipeline builders
+ use external data orchestration tools to specify the execution of their pipelines. With Nuthatch,
+ simply tag your functions and anyone who has access to your storage backend - you, your
+ team, or the public - can access and build on your most up-to-date data.
+
+ ## Getting started
+
+ The most basic form of Nuthatch simply stores and recalls your data, keyed by its arguments, in efficient formats:
+
+ ```python
+ from nuthatch import cache
+ import xarray as xr
+
+ @cache()
+ def my_first_cache():
+     ds = xr.tutorial.open_dataset("air_temperature")
+
+     # Data will automatically be saved in a zarr store and recalled
+     return ds
+
+ my_first_cache(cache_mode='local')
+ ```
+
+ But it's much more powerful if you configure Nuthatch to be shared across a team:
+
+ ```python
+ from nuthatch import cache, set_parameter
+ import xarray as xr
+
+ set_parameter({'filesystem': "gs://my-datalake"})
+
+ @cache()
+ def my_first_cache():
+     ds = xr.tutorial.open_dataset("air_temperature")
+
+     # Data will automatically be saved in a zarr store and recalled
+     return ds
+
+ my_first_cache()
+ ```
+
+ Commit your code and anyone with access to your datalake has access to a self-documented cache of your data.
+
+ Even more powerful: push your code to PyPI and anyone who imports your code can access the data simply
+ by calling the function (assuming they have read-only access to the storage).
+
+ ## Slightly more advanced use cases
+
+ Nuthatch has many more features:
+ - Caches that are keyed by argument
+ - Processors to enable slicing and data validation
+ - Rerunning of DAGs explicitly
+ - Per-data-type memoization of results (e.g. persisting an xarray and recalling the compute graph from memory)
+ - Caching of data locally for lower-latency access
+ - Namespacing of caches to rerun the same data pipeline for multiple scenarios
+ - Cache versioning (to invalidate stale caches)
+
+ ```python
+ from nuthatch import cache, set_parameter
+ from nuthatch.processors import timeseries
+
+ set_parameter({'filesystem': "gs://my-datalake"})
+
+ @timeseries(timeseries='time')
+ @cache(cache_args=['agg_days'],
+        version="0.1.0")
+ def agg_and_clip(start_time, end_time, agg_days=1):
+     ds = my_first_cache()
+
+     # aggregate based on time
+     ds = ds.rolling({'time': agg_days}).mean()
+
+     return ds
+
+ # Daily aggregate
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=1)
+
+ # Daily aggregate recalled, persisted in memory (or cluster memory if set up), and clipped to 2013-06
+ agg_and_clip("2013-01-01", "2013-06-01", agg_days=1, memoize=True)
+
+ # Daily aggregate recalled from memory and clipped to 2013-06
+ agg_and_clip("2013-01-01", "2013-06-01", agg_days=1, memoize=True)
+
+ # Weekly aggregate computed fresh
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=7)
+
+ # Weekly aggregate recomputed, overwriting the existing cache
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=7, recompute=True, cache_mode='overwrite')
+
+ # Weekly aggregate with both functions recomputed and overwritten
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=7, recompute=['agg_and_clip', 'my_first_cache'], cache_mode='overwrite')
+
+ # Weekly aggregate with both functions recomputed and saved to a local cache for faster recall
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=7, recompute=['agg_and_clip', 'my_first_cache'], cache_mode='local')
+
+ # Weekly aggregate with cache saved to a new namespace
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=7, namespace='experiment2')
+ ```
+
+ ## Nuthatch caching levels
+
+ ### Root
+ The root cache is your main storage location. It's often the remote cloud bucket serving as your project's datalake, but it could also be a shared drive on a local cluster.
+
+ ### Local
+ If you use a local mode, Nuthatch will automatically create a local cache for you and store data there for more efficient recall.
+
+ ### Mirror(s)
+ You can configure any number of read-only mirrors to look for your data in. If you import a project that uses Nuthatch, its root and all of its mirrors will be added as your mirrors so that you can fetch the Nuthatch data/functions it defines.
+
+ ## Nuthatch cache modes
+
+ When calling Nuthatch functions you can operate in several distinct modes which control which of the levels you write to and read from.
+
+ #### cache_mode='write'
+ The default mode when you have a root cache configured. Writes to and reads from the root cache if the function is set to be cached. This mode prompts the user before overwriting an existing cache.
+
+ #### cache_mode='overwrite'
+ Same as above but does not prompt the user before overwriting.
+
+ #### cache_mode='read_only'
+ Reads from all available cache locations but writes to none; it simply returns the results (or computes them if they do not exist). This is the default mode if you do not have a root configured. It still allows you to import external projects and read their data.
+
+ #### cache_mode='local'
+ Reads from all available caches and stores results in your local cache for faster recall. This is useful for lower-latency access to data that lives in a remote source.
+
+ #### cache_mode='offline'
+ Only reads from local caches and doesn't read from the root cache.
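+
+ For example (an illustrative sketch; `cache_mode` is passed at call time, as in the examples above, and these two modes simply are not shown there):
+
+ ```python
+ # Read from any available cache level, but never write
+ my_first_cache(cache_mode='read_only')
+
+ # Only consult local caches; never touch the root cache
+ my_first_cache(cache_mode='offline')
+ ```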
+
+ ## Nuthatch supported backends
+
+ Nuthatch supports multiple backends for writing, and multiple engines (data types) for reading from those backends. The following are currently supported. Backends beyond the defaults can be selected by passing `backend='name'`, and read data types beyond the default by passing `engine=<type>`. For example, a function returning a pandas DataFrame that should be stored as parquet instead of delta could pass `backend='parquet', engine=pandas.DataFrame` (sketched after the table below).
+
+ | Backend | Default for Data Type | Parameters | Supported read engines |
+ |---|:---:|:---:|:---:|
+ | Basic (pickle) | Basic python types, default for unmatched types | filesystem, <br>filesystem_options (optional) | N/A, unpickles to stored type |
+ | Zarr | xarray.Dataset, xarray.DataArray | filesystem, <br>filesystem_options (optional) | xarray.Dataset, xarray.DataArray |
+ | Parquet | dask.dataframe.DataFrame | filesystem, <br>filesystem_options (optional) | pandas.DataFrame, dask.dataframe.DataFrame |
+ | Deltalake | pandas.DataFrame | filesystem, <br>filesystem_options (optional) | pandas.DataFrame, dask.dataframe |
+ | SQL | None | host, port, database, driver, user, password,<br>write_user (optional), write_password (optional) | pandas, dask.dataframe |
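+
+ As a minimal sketch of the override described above - assuming `backend` and `engine` are accepted by the `@cache` decorator (they may equally be accepted at call time; this is illustrative, not a confirmed signature):
+
+ ```python
+ import pandas as pd
+ from nuthatch import cache
+
+ # Store a pandas DataFrame as parquet instead of the default deltalake backend,
+ # and read it back as a pandas.DataFrame
+ @cache(backend='parquet', engine=pd.DataFrame)
+ def station_table():
+     return pd.DataFrame({'station': ['A', 'B'], 'temp_c': [12.3, 14.1]})
+ ```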
+
+ ## Nuthatch configuration
+
+ If you are developing a Nuthatch-based project you should configure its root filestore, and possibly its mirrors
+ and your preferred local caching location. The root store is most likely a remote cloud bucket (like GCS, S3, etc.). Configuration can be done in three places: (1) in your pyproject.toml, (2) in a special nuthatch.toml built into your package,
+ or (3) in your code - useful if you need to access secrets dynamically or configure Nuthatch on distributed workers.
+
+ Nuthatch itself and most storage backends only need access to a filesystem. Some storage backends, like databases,
+ may need additional parameters.
+
+ ### TOML Configuration
+
+ In either pyproject.toml or src/nuthatch.toml:
+
+ ```toml
+ [tool.nuthatch]
+ filesystem = "s3://my-bucket/caches"
+
+ [tool.nuthatch.filesystem_options]
+ key = "your_key_id"
+ secret = "your_secret_key" # Do NOT put your secret in your toml file. Use dynamic secrets below.
+ ```
+
+ pyproject.toml cannot be easily packaged. If you would like your caches to be accessible
+ when your package is installed and imported by others, you must use either a nuthatch.toml
+ file or dynamic configuration. Make sure your nuthatch.toml is packaged with your project!
+
+ ### Dynamic configuration - decorators
+
+ You *should not* save secrets in files. To solve this problem Nuthatch provides a way to fetch
+ secrets dynamically, from a cloud secret store or from another location like an environment variable or
+ file. Just make sure the file containing the decorated function is imported before you run your code:
+
+ ```python
+ import os
+
+ from nuthatch import config_parameter
+
+ @config_parameter('filesystem_options', secret=True)
+ def fetch_key():
+     # Fetch from secret store, environment, etc
+     filesystem_options = {
+         'key': os.environ['S3_KEY'],
+         'secret': os.environ['S3_SECRET']
+     }
+
+     return filesystem_options
+ ```
+
+ ### Dynamic configuration - direct setting
+
+ You can also simply set configuration parameters in code, which is sometimes necessary
+ in distributed environments:
+
+ ```python
+ from nuthatch import set_parameter
+ set_parameter({'filesystem': "gs://my-datalake"})
+ ```
+
+ ### Backend-specific configuration
+
+ Nuthatch backends can be configured individually - for instance, if all of your Zarrs are too big for
+ the datalake and need cheaper storage, you can point the zarr backend at a different filesystem location:
+
+ ```toml
+ [tool.nuthatch.root.zarr]
+ filesystem = "s3://my-zarr-bucket/"
+ ```
+
+ ### Environment variables
+
+ You can configure Nuthatch with environment variables of the form NUTHATCH_<LOCATION>_<PARAMETER_NAME>, NUTHATCH_<LOCATION>_<BACKEND>_<PARAMETER_NAME>, or NUTHATCH_<PARAMETER_NAME> (where the location defaults to root).
+
+ There is a special environment variable 'NUTHATCH_ALLOW_INSTALLED_PACKAGE_CONFIGURATION' that enables dynamic
+ parameters to set the root parameters even when Nuthatch is an installed package. This is useful for running
+ on clusters where your package is installed for execution even though it is the primary project.
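+
+ For example (illustrative only, following the forms above; the variables must be set before Nuthatch loads its configuration):
+
+ ```python
+ import os
+
+ # Location defaults to root
+ os.environ["NUTHATCH_FILESYSTEM"] = "gs://my-datalake"
+
+ # Backend-specific parameter for the zarr backend at the root location
+ os.environ["NUTHATCH_ROOT_ZARR_FILESYSTEM"] = "s3://my-zarr-bucket/"
+ ```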
+
+ ## Nuthatch Limitations
+
+ Current limitations:
+ - Arguments used to key caches must be basic types, not objects
+ - There is currently no mechanism to detect cache "staleness". Automatically tracking and detecting changes is planned for future work.
+ - We have **not** tested Nuthatch on S3 and Azure blob storage, only on Google Cloud, but that testing is ongoing and an update will hopefully be released soon.
@@ -0,0 +1,237 @@
+ # Nuthatch
+
+ [![Tests](https://github.com/rhiza-research/nuthatch/actions/workflows/test.yml/badge.svg)](https://github.com/rhiza-research/nuthatch/actions/workflows/test.yml)
+
+ Nuthatch is a tool for building pure-Python big data pipelines. At its core it
+ enables the transparent multi-level caching and recall of results in formats that
+ are efficient for each data type. It supports a variety of
+ common storage backends, data processing frameworks, and their associated
+ data types for caching.
+
+ It also provides a framework for re-using and sharing data-type-specific
+ post-processing, and for these data type
+ processors to pass hints to storage backends for more efficient storage and recall.
+
+ Nuthatch was created to alleviate the common pattern of data processing pipelines manually
+ specifying their output storage locations, and the requirement that pipeline builders
+ use external data orchestration tools to specify the execution of their pipelines. With Nuthatch,
+ simply tag your functions and anyone who has access to your storage backend - you, your
+ team, or the public - can access and build on your most up-to-date data.
+
+ ## Getting started
+
+ The most basic form of Nuthatch simply stores and recalls your data, keyed by its arguments, in efficient formats:
+
+ ```python
+ from nuthatch import cache
+ import xarray as xr
+
+ @cache()
+ def my_first_cache():
+     ds = xr.tutorial.open_dataset("air_temperature")
+
+     # Data will automatically be saved in a zarr store and recalled
+     return ds
+
+ my_first_cache(cache_mode='local')
+ ```
+
+ But it's much more powerful if you configure Nuthatch to be shared across a team:
+
+ ```python
+ from nuthatch import cache, set_parameter
+ import xarray as xr
+
+ set_parameter({'filesystem': "gs://my-datalake"})
+
+ @cache()
+ def my_first_cache():
+     ds = xr.tutorial.open_dataset("air_temperature")
+
+     # Data will automatically be saved in a zarr store and recalled
+     return ds
+
+ my_first_cache()
+ ```
+
+ Commit your code and anyone with access to your datalake has access to a self-documented cache of your data.
+
+ Even more powerful: push your code to PyPI and anyone who imports your code can access the data simply
+ by calling the function (assuming they have read-only access to the storage).
+
+ ## Slightly more advanced use cases
+
+ Nuthatch has many more features:
+ - Caches that are keyed by argument
+ - Processors to enable slicing and data validation
+ - Rerunning of DAGs explicitly
+ - Per-data-type memoization of results (e.g. persisting an xarray and recalling the compute graph from memory)
+ - Caching of data locally for lower-latency access
+ - Namespacing of caches to rerun the same data pipeline for multiple scenarios
+ - Cache versioning (to invalidate stale caches)
+
+ ```python
+ from nuthatch import cache, set_parameter
+ from nuthatch.processors import timeseries
+
+ set_parameter({'filesystem': "gs://my-datalake"})
+
+ @timeseries(timeseries='time')
+ @cache(cache_args=['agg_days'],
+        version="0.1.0")
+ def agg_and_clip(start_time, end_time, agg_days=1):
+     ds = my_first_cache()
+
+     # aggregate based on time
+     ds = ds.rolling({'time': agg_days}).mean()
+
+     return ds
+
+ # Daily aggregate
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=1)
+
+ # Daily aggregate recalled, persisted in memory (or cluster memory if set up), and clipped to 2013-06
+ agg_and_clip("2013-01-01", "2013-06-01", agg_days=1, memoize=True)
+
+ # Daily aggregate recalled from memory and clipped to 2013-06
+ agg_and_clip("2013-01-01", "2013-06-01", agg_days=1, memoize=True)
+
+ # Weekly aggregate computed fresh
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=7)
+
+ # Weekly aggregate recomputed, overwriting the existing cache
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=7, recompute=True, cache_mode='overwrite')
+
+ # Weekly aggregate with both functions recomputed and overwritten
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=7, recompute=['agg_and_clip', 'my_first_cache'], cache_mode='overwrite')
+
+ # Weekly aggregate with both functions recomputed and saved to a local cache for faster recall
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=7, recompute=['agg_and_clip', 'my_first_cache'], cache_mode='local')
+
+ # Weekly aggregate with cache saved to a new namespace
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=7, namespace='experiment2')
+ ```
+
+ ## Nuthatch caching levels
+
+ ### Root
+ The root cache is your main storage location. It's often the remote cloud bucket serving as your project's datalake, but it could also be a shared drive on a local cluster.
+
+ ### Local
+ If you use a local mode, Nuthatch will automatically create a local cache for you and store data there for more efficient recall.
+
+ ### Mirror(s)
+ You can configure any number of read-only mirrors to look for your data in. If you import a project that uses Nuthatch, its root and all of its mirrors will be added as your mirrors so that you can fetch the Nuthatch data/functions it defines.
+
+ ## Nuthatch cache modes
+
+ When calling Nuthatch functions you can operate in several distinct modes which control which of the levels you write to and read from.
+
+ #### cache_mode='write'
+ The default mode when you have a root cache configured. Writes to and reads from the root cache if the function is set to be cached. This mode prompts the user before overwriting an existing cache.
+
+ #### cache_mode='overwrite'
+ Same as above but does not prompt the user before overwriting.
+
+ #### cache_mode='read_only'
+ Reads from all available cache locations but writes to none; it simply returns the results (or computes them if they do not exist). This is the default mode if you do not have a root configured. It still allows you to import external projects and read their data.
+
+ #### cache_mode='local'
+ Reads from all available caches and stores results in your local cache for faster recall. This is useful for lower-latency access to data that lives in a remote source.
+
+ #### cache_mode='offline'
+ Only reads from local caches and doesn't read from the root cache.
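+
+ For example (an illustrative sketch; `cache_mode` is passed at call time, as in the examples above, and these two modes simply are not shown there):
+
+ ```python
+ # Read from any available cache level, but never write
+ my_first_cache(cache_mode='read_only')
+
+ # Only consult local caches; never touch the root cache
+ my_first_cache(cache_mode='offline')
+ ```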
+
+ ## Nuthatch supported backends
+
+ Nuthatch supports multiple backends for writing, and multiple engines (data types) for reading from those backends. The following are currently supported. Backends beyond the defaults can be selected by passing `backend='name'`, and read data types beyond the default by passing `engine=<type>`. For example, a function returning a pandas DataFrame that should be stored as parquet instead of delta could pass `backend='parquet', engine=pandas.DataFrame` (sketched after the table below).
+
+ | Backend | Default for Data Type | Parameters | Supported read engines |
+ |---|:---:|:---:|:---:|
+ | Basic (pickle) | Basic python types, default for unmatched types | filesystem, <br>filesystem_options (optional) | N/A, unpickles to stored type |
+ | Zarr | xarray.Dataset, xarray.DataArray | filesystem, <br>filesystem_options (optional) | xarray.Dataset, xarray.DataArray |
+ | Parquet | dask.dataframe.DataFrame | filesystem, <br>filesystem_options (optional) | pandas.DataFrame, dask.dataframe.DataFrame |
+ | Deltalake | pandas.DataFrame | filesystem, <br>filesystem_options (optional) | pandas.DataFrame, dask.dataframe |
+ | SQL | None | host, port, database, driver, user, password,<br>write_user (optional), write_password (optional) | pandas, dask.dataframe |
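+
+ As a minimal sketch of the override described above - assuming `backend` and `engine` are accepted by the `@cache` decorator (they may equally be accepted at call time; this is illustrative, not a confirmed signature):
+
+ ```python
+ import pandas as pd
+ from nuthatch import cache
+
+ # Store a pandas DataFrame as parquet instead of the default deltalake backend,
+ # and read it back as a pandas.DataFrame
+ @cache(backend='parquet', engine=pd.DataFrame)
+ def station_table():
+     return pd.DataFrame({'station': ['A', 'B'], 'temp_c': [12.3, 14.1]})
+ ```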
+
+ ## Nuthatch configuration
+
+ If you are developing a Nuthatch-based project you should configure its root filestore, and possibly its mirrors
+ and your preferred local caching location. The root store is most likely a remote cloud bucket (like GCS, S3, etc.). Configuration can be done in three places: (1) in your pyproject.toml, (2) in a special nuthatch.toml built into your package,
+ or (3) in your code - useful if you need to access secrets dynamically or configure Nuthatch on distributed workers.
+
+ Nuthatch itself and most storage backends only need access to a filesystem. Some storage backends, like databases,
+ may need additional parameters.
+
+ ### TOML Configuration
+
+ In either pyproject.toml or src/nuthatch.toml:
+
+ ```toml
+ [tool.nuthatch]
+ filesystem = "s3://my-bucket/caches"
+
+ [tool.nuthatch.filesystem_options]
+ key = "your_key_id"
+ secret = "your_secret_key" # Do NOT put your secret in your toml file. Use dynamic secrets below.
+ ```
+
+ pyproject.toml cannot be easily packaged. If you would like your caches to be accessible
+ when your package is installed and imported by others, you must use either a nuthatch.toml
+ file or dynamic configuration. Make sure your nuthatch.toml is packaged with your project!
+
+ ### Dynamic configuration - decorators
+
+ You *should not* save secrets in files. To solve this problem Nuthatch provides a way to fetch
+ secrets dynamically, from a cloud secret store or from another location like an environment variable or
+ file. Just make sure the file containing the decorated function is imported before you run your code:
+
+ ```python
+ import os
+
+ from nuthatch import config_parameter
+
+ @config_parameter('filesystem_options', secret=True)
+ def fetch_key():
+     # Fetch from secret store, environment, etc
+     filesystem_options = {
+         'key': os.environ['S3_KEY'],
+         'secret': os.environ['S3_SECRET']
+     }
+
+     return filesystem_options
+ ```
+
+ ### Dynamic configuration - direct setting
+
+ You can also simply set configuration parameters in code, which is sometimes necessary
+ in distributed environments:
+
+ ```python
+ from nuthatch import set_parameter
+ set_parameter({'filesystem': "gs://my-datalake"})
+ ```
+
+ ### Backend-specific configuration
+
+ Nuthatch backends can be configured individually - for instance, if all of your Zarrs are too big for
+ the datalake and need cheaper storage, you can point the zarr backend at a different filesystem location:
+
+ ```toml
+ [tool.nuthatch.root.zarr]
+ filesystem = "s3://my-zarr-bucket/"
+ ```
+
+ ### Environment variables
+
+ You can configure Nuthatch with environment variables of the form NUTHATCH_<LOCATION>_<PARAMETER_NAME>, NUTHATCH_<LOCATION>_<BACKEND>_<PARAMETER_NAME>, or NUTHATCH_<PARAMETER_NAME> (where the location defaults to root).
+
+ There is a special environment variable 'NUTHATCH_ALLOW_INSTALLED_PACKAGE_CONFIGURATION' that enables dynamic
+ parameters to set the root parameters even when Nuthatch is an installed package. This is useful for running
+ on clusters where your package is installed for execution even though it is the primary project.
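+
+ For example (illustrative only, following the forms above; the variables must be set before Nuthatch loads its configuration):
+
+ ```python
+ import os
+
+ # Location defaults to root
+ os.environ["NUTHATCH_FILESYSTEM"] = "gs://my-datalake"
+
+ # Backend-specific parameter for the zarr backend at the root location
+ os.environ["NUTHATCH_ROOT_ZARR_FILESYSTEM"] = "s3://my-zarr-bucket/"
+ ```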
+
+ ## Nuthatch Limitations
+
+ Current limitations:
+ - Arguments used to key caches must be basic types, not objects
+ - There is currently no mechanism to detect cache "staleness". Automatically tracking and detecting changes is planned for future work.
+ - We have **not** tested Nuthatch on S3 and Azure blob storage, only on Google Cloud, but that testing is ongoing and an update will hopefully be released soon.
@@ -0,0 +1,31 @@
+ from nuthatch import cache
+ from nuthatch.processors import timeseries
+ import xarray as xr
+
+ @cache()
+ def my_first_cache():
+     ds = xr.tutorial.open_dataset("air_temperature")
+
+     # Data will automatically be saved in a zarr store and recalled
+     return ds
+
+
+ @timeseries(timeseries='time')
+ @cache(cache_args=['agg_days'])
+ def agg_and_clip(start_time, end_time, agg_days=1):
+     ds = my_first_cache()
+
+     # aggregate based on time
+     ds = ds.rolling({'time': agg_days}).mean()
+
+     return ds
+
+ my_first_cache()
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=1)  # Daily aggregate
+ agg_and_clip("2013-01-01", "2013-06-01", agg_days=1, memoize=True)  # Daily aggregate recalled, persisted in memory, and clipped to 2013-06
+ agg_and_clip("2013-01-01", "2013-06-01", agg_days=1, memoize=True)  # Daily aggregate recalled from memory and clipped to 2013-06
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=7)  # Weekly aggregate computed fresh
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=7, recompute=True, force_overwrite=True)  # Weekly aggregate recomputed, overwriting the existing cache
+ agg_and_clip("2013-01-01", "2014-01-01", agg_days=7, recompute=['agg_and_clip', 'my_first_cache'], force_overwrite=True)  # Weekly aggregate with both functions recomputed and overwritten
+