nuthatch 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nuthatch might be problematic. Click here for more details.
- nuthatch/__init__.py +14 -0
- nuthatch/backend.py +301 -0
- nuthatch/backends/__init__.py +8 -0
- nuthatch/backends/basic.py +28 -0
- nuthatch/backends/delta.py +46 -0
- nuthatch/backends/parquet.py +130 -0
- nuthatch/backends/sql.py +147 -0
- nuthatch/backends/terracotta.py +199 -0
- nuthatch/backends/zarr.py +207 -0
- nuthatch/cache.py +529 -0
- nuthatch/cli.py +174 -0
- nuthatch/config.py +94 -0
- nuthatch/memoizer.py +67 -0
- nuthatch/nuthatch.py +498 -0
- nuthatch/processor.py +89 -0
- nuthatch/processors/__init__.py +6 -0
- nuthatch/processors/timeseries.py +157 -0
- nuthatch-0.1.0.dist-info/METADATA +38 -0
- nuthatch-0.1.0.dist-info/RECORD +21 -0
- nuthatch-0.1.0.dist-info/WHEEL +4 -0
- nuthatch-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
from nuthatch.processor import NuthatchProcessor
|
|
2
|
+
import dateparser
|
|
3
|
+
import datetime
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import dask.dataframe as dd
|
|
6
|
+
import xarray as xr
|
|
7
|
+
|
|
8
|
+
class TimeseriesProcessor(NuthatchProcessor):
    """
    Processor for timeseries data.

    This processor is used to slice a timeseries dataset based on the start and end times.

    It also validates the timeseries data to ensure it has data within the start and end times.

    It supports xarray datasets and pandas/dask dataframes.
    """

    def __init__(self, func, timeseries, validate_timeseries):
        """
        Initialize the timeseries processor.

        Args:
            func: The function to wrap.
            timeseries: The name of the timeseries dimension (a string, or a
                list of candidate names checked in order).
            validate_timeseries: Whether to validate the timeseries data.
        """
        self.func = func
        self.validate_timeseries = validate_timeseries
        self.timeseries = timeseries

    def post_process(self, ds):
        """
        Slice ``ds`` to the [start_time, end_time] window captured by
        ``process_arguments``.

        Args:
            ds: An xarray Dataset, a pandas DataFrame, or a dask DataFrame.

        Returns:
            The dataset restricted to the requested time window. ``None``
            bounds are treated as open-ended.

        Raises:
            RuntimeError: If no configured timeseries dimension/column is
                present, or ``ds`` is an unsupported type.
        """
        start_time = self.start_time
        end_time = self.end_time

        if isinstance(ds, xr.Dataset):
            # First configured timeseries name present as a dimension wins.
            match_time = [t for t in self.timeseries if t in ds.dims]
            if len(match_time) == 0:
                raise RuntimeError(
                    f"Timeseries must have a dimension named {self.timeseries} for slicing."
                )

            time_col = match_time[0]
            # xarray label slicing treats None bounds as open-ended.
            ds = ds.sel({time_col: slice(start_time, end_time)})
        elif isinstance(ds, (pd.DataFrame, dd.DataFrame)):
            match_time = [t for t in self.timeseries if t in ds.columns]

            if len(match_time) == 0:
                raise RuntimeError(
                    f"Timeseries must have a dimension named {self.timeseries} for slicing."
                )

            time_col = match_time[0]

            try:
                if start_time is not None:
                    ds = ds[ds[time_col] >= start_time]
                if end_time is not None:
                    ds = ds[ds[time_col] <= end_time]
            except TypeError as e:
                # pandas raises "Invalid comparison" when comparing tz-aware
                # and tz-naive timestamps; align the bounds to the column's
                # timezone and retry. Any other TypeError is re-raised.
                if "Invalid comparison" not in str(e):
                    raise

                time_col_tz = ds[time_col].dt.tz
                # BUGFIX: only the dask accessor returns a lazy result here;
                # plain pandas returns the tz object directly and has no
                # .compute(), so the unconditional call crashed for pandas.
                if hasattr(time_col_tz, "compute"):
                    time_col_tz = time_col_tz.compute()

                if start_time is not None:
                    start_time = pd.Timestamp(start_time)
                    if start_time.tz is None:
                        start_time = start_time.tz_localize(time_col_tz)
                    else:
                        start_time = start_time.tz_convert(time_col_tz)
                    ds = ds[ds[time_col] >= start_time]

                if end_time is not None:
                    end_time = pd.Timestamp(end_time)
                    if end_time.tz is None:
                        end_time = end_time.tz_localize(time_col_tz)
                    else:
                        end_time = end_time.tz_convert(time_col_tz)
                    ds = ds[ds[time_col] <= end_time]
        else:
            raise RuntimeError(f"Cannot filter timeseries for data type {type(ds)}")

        return ds

    def process_arguments(self, params, args, kwargs):
        """
        Capture the time window from the wrapped call's arguments.

        Args:
            params: Ordered parameter names of the wrapped function.
            args: Positional arguments of the call; 'start_time' and
                'end_time' must be supplied here, positionally.
            kwargs: Keyword arguments; a 'validate_timeseries' key is
                consumed (and removed) to override validation per-call.

        Returns:
            The (args, kwargs) pair with processor-specific keywords stripped.

        Raises:
            ValueError: If the wrapped function does not declare
                'start_time'/'end_time', or they were passed as keywords.
        """
        if 'validate_timeseries' in kwargs:
            passed_validate_timeseries = kwargs['validate_timeseries']
            if passed_validate_timeseries:
                # NOTE(review): a falsy override is ignored, so validation can
                # only be switched on (never off) per-call — confirm intended.
                self.validate_timeseries = passed_validate_timeseries
            del kwargs['validate_timeseries']

        # Validate time series params
        self.start_time = None
        self.end_time = None

        # Convert to a list if not
        self.timeseries = self.timeseries if isinstance(self.timeseries, list) else [self.timeseries]

        if 'start_time' not in params or 'end_time' not in params:
            raise ValueError(
                "Time series functions must have the parameters 'start_time' and 'end_time'")
        else:
            keys = [item for item in params]
            try:
                self.start_time = args[keys.index('start_time')]
                self.end_time = args[keys.index('end_time')]
            except IndexError:
                raise ValueError("'start_time' and 'end_time' must be passed as positional arguments, not "
                                 "keyword arguments")

        return args, kwargs

    def validate_data(self, ds):
        """
        Check that a cached result roughly covers the requested time window.

        Returns True when validation is disabled, or when the cached data
        reaches within one year of both the requested start and end times;
        returns False to trigger a recompute.

        Raises:
            RuntimeError: If the dataset lacks a time dimension, or is not
                an xarray Dataset (only xarray is supported here).
        """
        start_time = self.start_time
        end_time = self.end_time
        if self.validate_timeseries:
            if isinstance(ds, xr.Dataset):
                # Check to see if the dataset extends roughly the full time series set
                match_time = [t for t in self.timeseries if t in ds.dims]
                if len(match_time) == 0:
                    raise RuntimeError("Timeseries array functions must return "
                                       "a time dimension for slicing. "
                                       "This could be an invalid cache. "
                                       "Try running with recompute=True to reset the cache.")
                else:
                    time_col = match_time[0]

                    # Assign start and end times if None are passed.
                    # BUGFIX: dateparser.parse only accepts strings; datetime-like
                    # bounds previously crashed it, so route them through
                    # pd.Timestamp instead.
                    if start_time is None:
                        st = pd.Timestamp(ds[time_col].min().values)
                    elif isinstance(start_time, str):
                        st = dateparser.parse(start_time)
                    else:
                        st = pd.Timestamp(start_time)

                    if end_time is None:
                        et = pd.Timestamp(ds[time_col].max().values)
                    elif isinstance(end_time, str):
                        et = dateparser.parse(end_time)
                    else:
                        et = pd.Timestamp(end_time)

                    # Check if within 1 year at least
                    if (pd.Timestamp(ds[time_col].min().values) <
                            st + datetime.timedelta(days=365) and
                            pd.Timestamp(ds[time_col].max().values) >
                            et - datetime.timedelta(days=365)):
                        return True
                    else:
                        print("""WARNING: The cached array does not have data within
                        1 year of your start or end time. Triggering recompute.
                        If you do not want to recompute the result set
                        `validate_cache_timeseries=False`""")
                        return False
            else:
                raise RuntimeError(f"Cannot validate timeseries for data type {type(ds)}")

        return True
|
|
152
|
+
|
|
153
|
+
def timeseries(timeseries='time', validate_timeseries=False):
    """
    Decorator factory that wraps a function in a TimeseriesProcessor.

    Args:
        timeseries: Name (or list of candidate names) of the time dimension.
        validate_timeseries: Whether cached results are validated against the
            requested time window.

    Returns:
        A decorator producing a TimeseriesProcessor around the target function.
    """
    def _wrap(func):
        return TimeseriesProcessor(func, timeseries, validate_timeseries)

    return _wrap
|
|
157
|
+
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: nuthatch
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Cacheable big data pipelines
|
|
5
|
+
Author-email: Joshua Adkins <josh@rhizaresearch.org>, Genevieve Flaspohler <geneviee@rhizaresearch.org>
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Requires-Dist: click
|
|
8
|
+
Requires-Dist: dask-deltatable
|
|
9
|
+
Requires-Dist: dask[dataframe]
|
|
10
|
+
Requires-Dist: deltalake==1.1.2
|
|
11
|
+
Requires-Dist: fsspec
|
|
12
|
+
Requires-Dist: gitpython
|
|
13
|
+
Requires-Dist: pandas
|
|
14
|
+
Requires-Dist: psycopg2
|
|
15
|
+
Requires-Dist: pyarrow
|
|
16
|
+
Requires-Dist: sqlalchemy
|
|
17
|
+
Requires-Dist: terracotta
|
|
18
|
+
Requires-Dist: xarray
|
|
19
|
+
Requires-Dist: zarr
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# Nuthatch
|
|
23
|
+
|
|
24
|
+
Nuthatch is a tool for building pure-python big data pipelines. At its core it
|
|
25
|
+
enables the transparent multi-level caching and recall of results in formats that
|
|
26
|
+
are efficient for each data type. It supports a variety of
|
|
27
|
+
common storage backends, data processing frameworks, and their associated
|
|
28
|
+
data types for caching.
|
|
29
|
+
|
|
30
|
+
It also provides a framework for re-using and sharing data-type specific
|
|
31
|
+
post-processing, and for these data type
|
|
32
|
+
processors to pass hints to storage backends for more efficient storage and recall.
|
|
33
|
+
|
|
34
|
+
Nuthatch was created to alleviate the common pattern of data processing pipelines manually
|
|
35
|
+
specifying their output storage locations, and the requirements of pipeline builders to
|
|
36
|
+
use external data orchestration tools to specify the execution of their pipelines. With Nuthatch
|
|
37
|
+
simply tag your functions and anyone who has access to your storage backend - you, your
|
|
38
|
+
team, or the public - can access and build off of your most up-to-date data.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
nuthatch/__init__.py,sha256=h4jzi33pztmMY3QPHSp6WcOeO3QMzEWjVUDAWICHgjE,384
|
|
2
|
+
nuthatch/backend.py,sha256=1BijlFcQrkyvr7V8ai4PnKoizDsOl3-u1KpxOyDGARg,11018
|
|
3
|
+
nuthatch/cache.py,sha256=QVhwi2x5nRopPJdJBWuudCwUHs3PcTvTmpmwuLqmPNI,22193
|
|
4
|
+
nuthatch/cli.py,sha256=9AU1N_aMsTC_Nm81XgcH0v4CPaeULo1pW74nEV6CZJk,7016
|
|
5
|
+
nuthatch/config.py,sha256=Rwqkjn70IHM4N_M3cCg5hZEqOroPC95rs_e2--z0Hjc,3756
|
|
6
|
+
nuthatch/memoizer.py,sha256=MkEdzodZjq3gZnTGNWhR0cJRFDsRo2EKRJaDrLpWPsM,1798
|
|
7
|
+
nuthatch/nuthatch.py,sha256=mSi2uAhuY06uOqnnXxsavaeKV5RrLKQcNkUVpSkDHnw,20859
|
|
8
|
+
nuthatch/processor.py,sha256=PTEg6jTURJfDeDCZP0TOQim1Q44Vi3S0Z6uIeKdxkrI,2834
|
|
9
|
+
nuthatch/backends/__init__.py,sha256=7vT9xP1QNTSX2Y3YE3Flt5h8mgQi45ROlcD9Wl9IkUk,312
|
|
10
|
+
nuthatch/backends/basic.py,sha256=KPQUTDbDRkFCvkfOp8zmaRxd9IkrxRQBPZx-VfW0e9Y,856
|
|
11
|
+
nuthatch/backends/delta.py,sha256=1HQILMAtJ4oIrvbLFtwIr4JbhlqBZ_X6qsNDYyIVyQs,1846
|
|
12
|
+
nuthatch/backends/parquet.py,sha256=q1ysnpdfkedT64xNW28StvqiWRWd7AXdMGvMOlCqXe0,5319
|
|
13
|
+
nuthatch/backends/sql.py,sha256=BHdxReQAr5GO3Nu99aSIHUZHFrvk-xqDHogxAR-bSCw,6746
|
|
14
|
+
nuthatch/backends/terracotta.py,sha256=My3nPYEYNuuZ2d93Ogsh5PBmJWm9JBQT9h0HbX0Mcss,7049
|
|
15
|
+
nuthatch/backends/zarr.py,sha256=hJ8Cy28cBwDlBNTgzhWEFi5E747fP2PuwIC-jGgmMs0,8356
|
|
16
|
+
nuthatch/processors/__init__.py,sha256=6b46sDY3IoxnK0V8x9vPomCfGXoHWoserGcUtxWKyQg,111
|
|
17
|
+
nuthatch/processors/timeseries.py,sha256=Jumjn0qqsel2Oh-AJWCTUHW4xodi5_oMoaXbDRGoo9Y,6427
|
|
18
|
+
nuthatch-0.1.0.dist-info/METADATA,sha256=o1BJhQxY0KYRRLfhXUccGj8QsLZePbxftwksRrYRoVA,1558
|
|
19
|
+
nuthatch-0.1.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
|
|
20
|
+
nuthatch-0.1.0.dist-info/entry_points.txt,sha256=Sn3SYzKxaDtQyiM1dfAz13g_NiZRzIOagOawnCmfrR4,47
|
|
21
|
+
nuthatch-0.1.0.dist-info/RECORD,,
|