nuthatch-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nuthatch might be problematic.

nuthatch/processors/timeseries.py
@@ -0,0 +1,157 @@
+ from nuthatch.processor import NuthatchProcessor
+ import dateparser
+ import datetime
+ import pandas as pd
+ import dask.dataframe as dd
+ import xarray as xr
+
+ class TimeseriesProcessor(NuthatchProcessor):
+     """
+     Processor for timeseries data.
+
+     This processor is used to slice a timeseries dataset based on the start and end times.
+
+     It also validates the timeseries data to ensure it has data within the start and end times.
+
+     It supports xarray datasets and pandas/dask dataframes.
+     """
+     def __init__(self, func, timeseries, validate_timeseries):
+         """
+         Initialize the timeseries processor.
+
+         Args:
+             func: The function to wrap.
+             timeseries: The name of the timeseries dimension.
+             validate_timeseries: Whether to validate the timeseries data.
+         """
+         self.func = func
+         self.validate_timeseries = validate_timeseries
+         self.timeseries = timeseries
+
+     def post_process(self, ds):
+         start_time = self.start_time
+         end_time = self.end_time
+
+         if isinstance(ds, xr.Dataset):
+             match_time = [t for t in self.timeseries if t in ds.dims]
+             if len(match_time) == 0:
+                 raise RuntimeError(
+                     f"Timeseries must have a dimension named {self.timeseries} for slicing."
+                 )
+
+             time_col = match_time[0]
+             ds = ds.sel({time_col: slice(start_time, end_time)})
+         elif isinstance(ds, pd.DataFrame) or isinstance(ds, dd.DataFrame):
+             match_time = [t for t in self.timeseries if t in ds.columns]
+
+             if len(match_time) == 0:
+                 raise RuntimeError(
+                     f"Timeseries must have a dimension named {self.timeseries} for slicing."
+                 )
+
+             time_col = match_time[0]
+
+             try:
+                 if start_time is not None:
+                     ds = ds[ds[time_col] >= start_time]
+                 if end_time is not None:
+                     ds = ds[ds[time_col] <= end_time]
+
+             except TypeError as e:
+                 if "Invalid comparison" not in str(e):
+                     raise e
+
+                 time_col_tz = ds[time_col].dt.tz.compute()
+
+                 if start_time is not None:
+                     start_time = pd.Timestamp(start_time)
+                     if start_time.tz is None:
+                         start_time = start_time.tz_localize(time_col_tz)
+                     else:
+                         start_time = start_time.tz_convert(time_col_tz)
+                     ds = ds[ds[time_col] >= start_time]
+
+                 if end_time is not None:
+                     end_time = pd.Timestamp(end_time)
+                     if end_time.tz is None:
+                         end_time = end_time.tz_localize(time_col_tz)
+                     else:
+                         end_time = end_time.tz_convert(time_col_tz)
+                     ds = ds[ds[time_col] <= end_time]
+         else:
+             raise RuntimeError(f"Cannot filter timeseries for data type {type(ds)}")
+
+         return ds
+
+     def process_arguments(self, params, args, kwargs):
+         if 'validate_timeseries' in kwargs:
+             passed_validate_timeseries = kwargs['validate_timeseries']
+             if passed_validate_timeseries:
+                 self.validate_timeseries = passed_validate_timeseries
+             del kwargs['validate_timeseries']
+
+         # Validate time series params
+         self.start_time = None
+         self.end_time = None
+
+         # Convert to a list if not
+         self.timeseries = self.timeseries if isinstance(self.timeseries, list) else [self.timeseries]
+
+         if 'start_time' not in params or 'end_time' not in params:
+             raise ValueError(
+                 "Time series functions must have the parameters 'start_time' and 'end_time'")
+         else:
+             keys = [item for item in params]
+             try:
+                 self.start_time = args[keys.index('start_time')]
+                 self.end_time = args[keys.index('end_time')]
+             except IndexError:
+                 raise ValueError("'start_time' and 'end_time' must be passed as positional arguments, not "
+                                  "keyword arguments")
+
+         return args, kwargs
+
+
+     def validate_data(self, ds):
+         start_time = self.start_time
+         end_time = self.end_time
+         if self.validate_timeseries:
+             if isinstance(ds, xr.Dataset):
+                 # Check to see if the dataset extends roughly the full time series set
+                 match_time = [t for t in self.timeseries if t in ds.dims]
+                 if len(match_time) == 0:
+                     raise RuntimeError("Timeseries array functions must return "
+                                        "a time dimension for slicing. "
+                                        "This could be an invalid cache. "
+                                        "Try running with recompute=True to reset the cache.")
+                 else:
+                     time_col = match_time[0]
+
+                     # Assign start and end times if None are passed
+                     st = dateparser.parse(start_time) if start_time is not None \
+                         else pd.Timestamp(ds[time_col].min().values)
+                     et = dateparser.parse(end_time) if end_time is not None \
+                         else pd.Timestamp(ds[time_col].max().values)
+
+                     # Check if within 1 year at least
+                     if (pd.Timestamp(ds[time_col].min().values) <
+                             st + datetime.timedelta(days=365) and
+                             pd.Timestamp(ds[time_col].max().values) >
+                             et - datetime.timedelta(days=365)):
+                         return True
+                     else:
+                         print("""WARNING: The cached array does not have data within
+                         1 year of your start or end time. Triggering recompute.
+                         If you do not want to recompute the result set
+                         `validate_cache_timeseries=False`""")
+                         return False
+             else:
+                 raise RuntimeError(f"Cannot validate timeseries for data type {type(ds)}")
+
+         return True
+
+ def timeseries(timeseries='time', validate_timeseries=False):
+     def decorator(func):
+         return TimeseriesProcessor(func, timeseries, validate_timeseries)
+     return decorator
+
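For orientation, here is a minimal sketch, not part of the wheel, that drives the hooks above by hand. It assumes nuthatch 0.1.0 and its dependencies are importable; the sample frame, dates, and direct instantiation are illustrative only (in normal use nuthatch's caching layer, which is not included in this diff, invokes these hooks).

```python
# Illustrative sketch only: exercises TimeseriesProcessor exactly as defined
# in nuthatch/processors/timeseries.py above; the data and dates are made up.
import pandas as pd
from nuthatch.processors.timeseries import TimeseriesProcessor

proc = TimeseriesProcessor(func=None, timeseries='time', validate_timeseries=False)

# process_arguments expects 'start_time' and 'end_time' to be positional
args, kwargs = proc.process_arguments(
    ['start_time', 'end_time'], ('2020-06-01', '2020-09-01'), {})

df = pd.DataFrame({
    'time': pd.date_range('2020-01-01', '2020-12-31', freq='D'),
    'value': range(366),
})

# post_process keeps only rows between start_time and end_time
sliced = proc.post_process(df)
print(sliced['time'].min(), sliced['time'].max())
```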
nuthatch-0.1.0.dist-info/METADATA
@@ -0,0 +1,38 @@
+ Metadata-Version: 2.3
+ Name: nuthatch
+ Version: 0.1.0
+ Summary: Cacheable big data pipelines
+ Author-email: Joshua Adkins <josh@rhizaresearch.org>, Genevieve Flaspohler <geneviee@rhizaresearch.org>
+ Requires-Python: >=3.12
+ Requires-Dist: click
+ Requires-Dist: dask-deltatable
+ Requires-Dist: dask[dataframe]
+ Requires-Dist: deltalake==1.1.2
+ Requires-Dist: fsspec
+ Requires-Dist: gitpython
+ Requires-Dist: pandas
+ Requires-Dist: psycopg2
+ Requires-Dist: pyarrow
+ Requires-Dist: sqlalchemy
+ Requires-Dist: terracotta
+ Requires-Dist: xarray
+ Requires-Dist: zarr
+ Description-Content-Type: text/markdown
+
+ # Nuthatch
+
+ Nuthatch is a tool for building pure-Python big data pipelines. At its core it
+ enables the transparent, multi-level caching and recall of results in formats that
+ are efficient for each data type. It supports a variety of
+ common storage backends, data processing frameworks, and their associated
+ data types for caching.
+
+ It also provides a framework for re-using and sharing data-type-specific
+ post-processing, and for these data type
+ processors to pass hints to storage backends for more efficient storage and recall.
+
+ Nuthatch was created to alleviate the common pattern of data processing pipelines manually
+ specifying their output storage locations, and the need for pipeline builders to
+ use external data orchestration tools to specify the execution of their pipelines. With Nuthatch,
+ simply tag your functions and anyone who has access to your storage backend - you, your
+ team, or the public - can access and build off of your most up-to-date data.
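To make the README's "tag your functions" description concrete, here is a hedged sketch using the timeseries decorator shipped in this wheel. The function name, body, and parameter values are assumptions for illustration; per the code above, the decorator returns a TimeseriesProcessor object, and the machinery that executes and caches it (nuthatch/nuthatch.py) is not shown in this diff.

```python
# Hypothetical tagging example: only the timeseries decorator from the diff is
# real; the decorated function and its data are placeholders.
import pandas as pd
from nuthatch.processors.timeseries import timeseries

@timeseries(timeseries='time', validate_timeseries=True)
def daily_temperatures(start_time, end_time):
    # A real pipeline would load from one of the supported storage backends;
    # here a year of daily values is fabricated.
    return pd.DataFrame({
        'time': pd.date_range('2020-01-01', '2020-12-31', freq='D'),
        'temp_c': [15.0] * 366,
    })
```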
nuthatch-0.1.0.dist-info/RECORD
@@ -0,0 +1,21 @@
+ nuthatch/__init__.py,sha256=h4jzi33pztmMY3QPHSp6WcOeO3QMzEWjVUDAWICHgjE,384
+ nuthatch/backend.py,sha256=1BijlFcQrkyvr7V8ai4PnKoizDsOl3-u1KpxOyDGARg,11018
+ nuthatch/cache.py,sha256=QVhwi2x5nRopPJdJBWuudCwUHs3PcTvTmpmwuLqmPNI,22193
+ nuthatch/cli.py,sha256=9AU1N_aMsTC_Nm81XgcH0v4CPaeULo1pW74nEV6CZJk,7016
+ nuthatch/config.py,sha256=Rwqkjn70IHM4N_M3cCg5hZEqOroPC95rs_e2--z0Hjc,3756
+ nuthatch/memoizer.py,sha256=MkEdzodZjq3gZnTGNWhR0cJRFDsRo2EKRJaDrLpWPsM,1798
+ nuthatch/nuthatch.py,sha256=mSi2uAhuY06uOqnnXxsavaeKV5RrLKQcNkUVpSkDHnw,20859
+ nuthatch/processor.py,sha256=PTEg6jTURJfDeDCZP0TOQim1Q44Vi3S0Z6uIeKdxkrI,2834
+ nuthatch/backends/__init__.py,sha256=7vT9xP1QNTSX2Y3YE3Flt5h8mgQi45ROlcD9Wl9IkUk,312
+ nuthatch/backends/basic.py,sha256=KPQUTDbDRkFCvkfOp8zmaRxd9IkrxRQBPZx-VfW0e9Y,856
+ nuthatch/backends/delta.py,sha256=1HQILMAtJ4oIrvbLFtwIr4JbhlqBZ_X6qsNDYyIVyQs,1846
+ nuthatch/backends/parquet.py,sha256=q1ysnpdfkedT64xNW28StvqiWRWd7AXdMGvMOlCqXe0,5319
+ nuthatch/backends/sql.py,sha256=BHdxReQAr5GO3Nu99aSIHUZHFrvk-xqDHogxAR-bSCw,6746
+ nuthatch/backends/terracotta.py,sha256=My3nPYEYNuuZ2d93Ogsh5PBmJWm9JBQT9h0HbX0Mcss,7049
+ nuthatch/backends/zarr.py,sha256=hJ8Cy28cBwDlBNTgzhWEFi5E747fP2PuwIC-jGgmMs0,8356
+ nuthatch/processors/__init__.py,sha256=6b46sDY3IoxnK0V8x9vPomCfGXoHWoserGcUtxWKyQg,111
+ nuthatch/processors/timeseries.py,sha256=Jumjn0qqsel2Oh-AJWCTUHW4xodi5_oMoaXbDRGoo9Y,6427
+ nuthatch-0.1.0.dist-info/METADATA,sha256=o1BJhQxY0KYRRLfhXUccGj8QsLZePbxftwksRrYRoVA,1558
+ nuthatch-0.1.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
+ nuthatch-0.1.0.dist-info/entry_points.txt,sha256=Sn3SYzKxaDtQyiM1dfAz13g_NiZRzIOagOawnCmfrR4,47
+ nuthatch-0.1.0.dist-info/RECORD,,
nuthatch-0.1.0.dist-info/WHEEL
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.26.3
+ Root-Is-Purelib: true
+ Tag: py3-none-any
nuthatch-0.1.0.dist-info/entry_points.txt
@@ -0,0 +1,2 @@
+ [console_scripts]
+ nuthatch = nuthatch.cli:main