nuthatch 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nuthatch has been flagged as possibly problematic by the registry.
- nuthatch/__init__.py +14 -0
- nuthatch/backend.py +301 -0
- nuthatch/backends/__init__.py +8 -0
- nuthatch/backends/basic.py +28 -0
- nuthatch/backends/delta.py +46 -0
- nuthatch/backends/parquet.py +130 -0
- nuthatch/backends/sql.py +147 -0
- nuthatch/backends/terracotta.py +199 -0
- nuthatch/backends/zarr.py +207 -0
- nuthatch/cache.py +529 -0
- nuthatch/cli.py +174 -0
- nuthatch/config.py +94 -0
- nuthatch/memoizer.py +67 -0
- nuthatch/nuthatch.py +498 -0
- nuthatch/processor.py +89 -0
- nuthatch/processors/__init__.py +6 -0
- nuthatch/processors/timeseries.py +157 -0
- nuthatch-0.1.0.dist-info/METADATA +38 -0
- nuthatch-0.1.0.dist-info/RECORD +21 -0
- nuthatch-0.1.0.dist-info/WHEEL +4 -0
- nuthatch-0.1.0.dist-info/entry_points.txt +2 -0
nuthatch/__init__.py
ADDED
@@ -0,0 +1,14 @@
"""
Nuthatch is a library for caching data based on the function call and its arguments.

It caches data in a variety of backends optimized for different data types.
"""
from .config import config_parameter
from .nuthatch import cache

# Trigger backend registration
import nuthatch.backends  #noqa

__version__ = "0.1.0"

__all__ = ["config_parameter", "cache", "__version__", "cli"]
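To make the exported surface above concrete, here is a minimal usage sketch. It only assumes what this file shows (that `cache` is importable from the package); the actual decorator signature lives in nuthatch/nuthatch.py, which appears later in this release but is not reproduced here, so treat the bare-decorator form as an assumption.

```python
# Hypothetical usage sketch -- the real `cache` signature is defined in
# nuthatch/nuthatch.py and may require arguments not shown here.
import pandas as pd
from nuthatch import cache  # exported via __all__ above

@cache  # assumption: usable as a decorator keyed on the function and its arguments
def load_observations(station_id: str) -> pd.DataFrame:
    # Expensive work whose result should be cached per (function, arguments)
    return pd.DataFrame({"station": [station_id], "value": [1.0]})

df = load_observations("KSEA")   # computed, then written to a backend
df = load_observations("KSEA")   # served from the cache on the second call
```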
nuthatch/backend.py
ADDED
@@ -0,0 +1,301 @@
"""The backend module is used to register and get backends.

It also contains the abstract base class for all backends.
"""
from abc import ABC, abstractmethod
from os.path import join
import fsspec
import sqlalchemy

registered_backends = {}
default_backends = {}

def register_backend(backendClass):
    """Register a backend class with the nuthatch system.

    This function registers a backend class so it can be used by the nuthatch
    caching system. The backend class must have a `backend_name` attribute.
    Optionally, if the backend class has a `default_for_type` attribute, it
    will be set as the default backend for that data type.

    Args:
        backendClass: The backend class to register. Must have a `backend_name`
            attribute and optionally a `default_for_type` attribute.

    Returns:
        The registered backend class (allows use as a decorator).

    Example:
        @register_backend
        class MyBackend:
            backend_name = "my_backend"
            default_for_type = "my_data_type"
    """
    registered_backends[backendClass.backend_name] = backendClass

    if 'default_for_type' in backendClass.__dict__:
        default_backends[backendClass.default_for_type] = backendClass.backend_name

    return backendClass

def get_backend_by_name(backend_name):
    """Retrieve a registered backend class by its name.

    Args:
        backend_name (str): The name of the backend to retrieve.

    Returns:
        The backend class associated with the given name.

    Raises:
        KeyError: If no backend is registered with the given name.
    """
    return registered_backends[backend_name]

def get_default_backend(data_type):
    """Get the default backend name for a specific data type.

    Args:
        data_type (str): The data type to get the default backend for.

    Returns:
        str: The name of the default backend for the data type, or 'basic'
            if no default is set.
    """
    if data_type in default_backends:
        return default_backends[data_type]
    else:
        return 'basic'

class NuthatchBackend(ABC):
    config_parameters = []

    def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
        """The abstract base class for all cacheable backends.

        This is the base class for all cacheable backends in the nuthatch system.
        Each backend instance manages a specific cache entry with its own
        configuration and storage mechanism.

        Args:
            cacheable_config (dict): Configuration dictionary containing static or
                dynamic parameters. Required parameters should be listed as strings
                in the backend's `config_parameters` class attribute.
            cache_key (str): Unique identifier for the cache entry. This key is
                used to distinguish between different cached items.
            namespace (str, optional): Optional namespace to organize cache entries.
                If provided, cache entries will be stored under this namespace.
            args (dict): Key-value pairs of arguments and values that were passed
                to the function being cached. These are used to generate cache
                keys and may influence backend behavior.
            backend_kwargs (dict): User-configurable keyword arguments specific to
                the backend implementation. For example, the zarr backend uses
                these for per-argument-value chunking configuration.

        Note:
            This is an abstract base class. Subclasses must implement the abstract
            methods: `write()`, `read()`, `delete()`, `exists()`, `get_file_path()`,
            and `sync()`.
        """

        # Store base configuration and cache identity
        self.config = cacheable_config
        self.cache_key = cache_key
        self.namespace = namespace
        self.backend_kwargs = backend_kwargs
        self.args = args

    @abstractmethod
    def write(self, data, upsert=False, primary_keys=None):
        """Write data to the backend.

        This method is responsible for writing data to the backend. It should
        be implemented by subclasses to handle the specific storage mechanism
        of the backend.

        Args:
            data (any): The data to write to the backend.
            upsert (bool, optional): Whether to perform an upsert operation.
                If True, the data will be inserted or updated based on the
                primary keys provided.
            primary_keys (list, optional): List of primary key columns to use
                for upsert operations.
        """
        pass

    @abstractmethod
    def read(self, engine):
        """Read data from the backend.

        This method is responsible for reading data from the backend. It should
        be implemented by subclasses to handle the specific storage mechanism
        of the backend.

        Args:
            engine (str or type): The data processing engine to use for
                reading data from the backend.

        Returns:
            Any: The data read from the backend.
        """
        pass

    @abstractmethod
    def delete(self):
        """Delete data from the backend.

        This method is responsible for deleting data from the backend. It should
        be implemented by subclasses to handle the specific storage mechanism
        of the backend.
        """
        pass

    @abstractmethod
    def exists(self):
        """Check if the data exists in the backend.

        This method is responsible for checking if the data exists in the backend.
        It should be implemented by subclasses to handle the specific storage
        mechanism of the backend.

        Returns:
            bool: True if the data exists in the backend, False otherwise.
        """
        pass

    @abstractmethod
    def get_file_path(self):
        """Get the file path of the data in the backend.

        This method is responsible for returning the file path of the data in the
        backend. It should be implemented by subclasses to handle the specific
        storage mechanism of the backend.

        Returns:
            str: The file path of the data in the backend.
        """
        pass

    @abstractmethod
    def sync(self, from_backend):
        """Sync data from one backend to another.

        This method is responsible for syncing data from one backend to another.
        It should be implemented by subclasses to handle the specific storage
        mechanism of the backend.

        Args:
            from_backend (NuthatchBackend): The backend to sync data from.
        """
        pass


class FileBackend(NuthatchBackend):
    """Base class for all backends that rely on a filesystem."""

    config_parameters = ["filesystem", "filesystem_options"]

    def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs, extension):
        super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs)

        self.base_path = self.config['filesystem']

        if namespace:
            self.raw_cache_path = join(self.base_path, namespace, cache_key)
        else:
            self.raw_cache_path = join(self.base_path, cache_key)

        self.temp_cache_path = join(self.base_path, 'temp', cache_key)
        self.extension = extension
        self.path = self.raw_cache_path + '.' + extension
        if 'filesystem_options' not in self.config:
            self.config['filesystem_options'] = {}

        if fsspec.utils.get_protocol(self.path) == 'file':
            # If the protocol is a local filesystem, we need to create the directory if it doesn't exist
            self.fs = fsspec.core.url_to_fs(self.path, auto_mkdir=True, **self.config['filesystem_options'])[0]
        else:
            self.fs = fsspec.core.url_to_fs(self.path, **self.config['filesystem_options'])[0]

    def exists(self):
        return self.fs.exists(self.path)

    def delete(self):
        self.fs.rm(self.path, recursive=True)

    def get_file_path(self):
        return self.path

    def get_cache_key(self, path):
        if path.endswith('.' + self.extension):
            path = path[:-len('.' + self.extension)]

        if path.startswith(self.base_path):
            path = path[len(self.base_path):]

        stripped = fsspec.core.strip_protocol(self.base_path)
        if path.startswith(stripped):
            path = path[len(stripped):]

        if path.startswith('/'):
            path = path[1:]

        if self.namespace:
            if path.startswith(self.namespace):
                path = path[len(self.namespace):]

            if path.startswith('/'):
                path = path[1:]

        return path

    def sync(self, from_backend):
        from_backend.fs.get(from_backend.path, self.path)

@register_backend
class NullBackend(FileBackend):

    backend_name = 'null'

    def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
        super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs, 'null')

    def read(self, engine=None):
        raise RuntimeError("Null backend used only for import/export path manipulation. Cannot read from null backend.")

    def write(self, engine=None):
        raise RuntimeError("Null backend used only for import/export path manipulation. Cannot write to null backend.")


class DatabaseBackend(NuthatchBackend):
    """Base class for all backends that rely on a database."""

    config_parameters = ["driver", "host", "port", "database", "username", "password", "write_username", "write_password"]

    def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
        super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs)

        database_url = sqlalchemy.URL.create(self.config['driver'],
                                             username=self.config['username'],
                                             password=self.config['password'],
                                             host=self.config['host'],
                                             port=self.config['port'],
                                             database=self.config['database'])
        self.engine = sqlalchemy.create_engine(database_url)

        if 'write_username' in self.config and 'write_password' in self.config:
            write_database_url = sqlalchemy.URL.create(self.config['driver'],
                                                       username=self.config['write_username'],
                                                       password=self.config['write_password'],
                                                       host=self.config['host'],
                                                       port=self.config['port'],
                                                       database=self.config['database'])
            self.write_engine = sqlalchemy.create_engine(write_database_url)
        else:
            self.write_engine = self.engine

    def sync(self, local_backend):
        raise NotImplementedError("Backend syncing not implemented for database-like backends.")
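To make the registration API above concrete, here is a minimal sketch of a custom backend built on `FileBackend` and looked up through the registry. `JSONBackend`, the cache key, and the `"/tmp/nuthatch-cache"` config value are illustrative and not part of the package; only the functions, constructor signature, and the `filesystem` config parameter come from the code shown above.

```python
# Sketch of registering a custom backend against the API defined in nuthatch/backend.py.
import json
from nuthatch.backend import (FileBackend, register_backend,
                              get_backend_by_name, get_default_backend)

@register_backend
class JSONBackend(FileBackend):
    backend_name = "json"
    default_for_type = dict  # dict-shaped results default to this backend

    def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
        super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs, 'json')

    def write(self, data, upsert=False, primary_keys=None):
        if upsert:
            raise ValueError("JSON backend does not support upsert.")
        with self.fs.open(self.path, 'w') as f:
            json.dump(data, f)

    def read(self, engine=None):
        with self.fs.open(self.path, 'r') as f:
            return json.load(f)

# The registry can now resolve the backend by name or by data type.
assert get_backend_by_name("json") is JSONBackend
assert get_default_backend(dict) == "json"
print(get_default_backend(str))  # 'basic', unless another backend claims str

# Instantiation follows the FileBackend constructor: config, key, namespace, args, kwargs.
backend = JSONBackend({"filesystem": "/tmp/nuthatch-cache"}, "my_key", None, {}, {})
backend.write({"answer": 42})
print(backend.read())            # {'answer': 42}
print(backend.get_file_path())   # /tmp/nuthatch-cache/my_key.json
```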
nuthatch/backends/__init__.py
ADDED
@@ -0,0 +1,8 @@
from .sql import SQLBackend
from .delta import DeltaBackend
from .basic import BasicBackend
from .zarr import ZarrBackend
from .terracotta import TerracottaBackend
from .parquet import ParquetBackend

__all__ = ["SQLBackend", "DeltaBackend", "BasicBackend", "ZarrBackend", "TerracottaBackend", "ParquetBackend"]
nuthatch/backends/basic.py
ADDED
@@ -0,0 +1,28 @@
from nuthatch.backend import FileBackend, register_backend
import pickle

@register_backend
class BasicBackend(FileBackend):
    """
    Basic backend for caching data in a pickle file.
    """

    backend_name = "basic"

    def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
        super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs, 'pkl')


    def write(self, data, upsert=False, primary_keys=None):
        if upsert:
            raise ValueError("Basic/pickle backend does not support upsert.")

        with self.fs.open(self.path, 'wb') as f:
            pickle.dump(data, f)


    def read(self, engine=None):
        # Check to make sure the cache file exists before unpickling
        if self.fs.exists(self.path):
            with self.fs.open(self.path, 'rb') as f:
                return pickle.load(f)
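The path layout for `BasicBackend` follows the `FileBackend` construction above. This short sketch only illustrates where the pickle files land; the base path, namespace, and keys are made up.

```python
# Illustrative only: where BasicBackend places its pickle files.
from nuthatch.backends.basic import BasicBackend

config = {"filesystem": "/tmp/nuthatch-cache"}

# Without a namespace: <filesystem>/<cache_key>.pkl
b1 = BasicBackend(dict(config), "stations_v1", None, {}, {})
print(b1.get_file_path())   # /tmp/nuthatch-cache/stations_v1.pkl

# With a namespace: <filesystem>/<namespace>/<cache_key>.pkl
b2 = BasicBackend(dict(config), "stations_v1", "weather", {}, {})
print(b2.get_file_path())   # /tmp/nuthatch-cache/weather/stations_v1.pkl

b2.write({"KSEA": 14.2})    # pickled to the path above
print(b2.exists())          # True
print(b2.read())            # {'KSEA': 14.2}
```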
nuthatch/backends/delta.py
ADDED
@@ -0,0 +1,46 @@
from nuthatch.backend import FileBackend, register_backend
from deltalake import DeltaTable, write_deltalake
import dask.dataframe as dd
import pandas as pd
import dask_deltatable as ddt

@register_backend
class DeltaBackend(FileBackend):
    """
    Delta backend for caching tabular data in a delta table.

    This backend supports dask and pandas dataframes.
    """

    backend_name = "delta"
    default_for_type = pd.DataFrame

    def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
        super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs, 'delta')


    def write(self, data, upsert=False, primary_keys=None):
        """Write a pandas dataframe to a delta table."""
        if upsert:
            raise ValueError("Delta backend does not support upsert.")

        if isinstance(data, dd.DataFrame):
            print("""Warning: Dask dataframe passed to delta backend. Will run `compute()`
                  on the dataframe prior to storage. This will fail if the dataframe
                  does not fit in memory. Use `backend=parquet` to handle parallel writing of dask dataframes.""")
            write_data = data.compute()
        elif isinstance(data, pd.DataFrame):
            write_data = data
        else:
            raise RuntimeError("Delta backend only supports dask and pandas engines.")

        write_deltalake(self.path, write_data, mode='overwrite', schema_mode='overwrite')


    def read(self, engine=None):
        if engine == 'pandas' or engine == pd.DataFrame or engine is None:
            return DeltaTable(self.path).to_pandas()
        elif engine == 'dask' or engine == dd.DataFrame:
            return ddt.read_deltalake(self.path)
        else:
            raise RuntimeError("Delta backend only supports dask and pandas engines.")
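The engine selection in `DeltaBackend.read()` accepts either a string or a dataframe type. This sketch assumes the optional `deltalake` and `dask_deltatable` dependencies are installed; the config values and cache key are placeholders.

```python
# Sketch of DeltaBackend's engine selection on read.
import pandas as pd
import dask.dataframe as dd
from nuthatch.backends.delta import DeltaBackend

backend = DeltaBackend({"filesystem": "/tmp/nuthatch-cache"}, "obs_table", None, {}, {})
backend.write(pd.DataFrame({"station": ["KSEA", "KPDX"], "t_max": [21.0, 24.5]}))

pdf = backend.read()                     # engine=None -> pandas DataFrame
ddf = backend.read(engine=dd.DataFrame)  # or engine='dask' -> dask DataFrame
print(type(pdf), type(ddf))
```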
nuthatch/backends/parquet.py
ADDED
@@ -0,0 +1,130 @@
from nuthatch.backend import FileBackend, register_backend
from pandas.api.types import is_datetime64_any_dtype as is_datetime
import dask.dataframe as dd
import pandas as pd


def write_parquet_helper(df, path, partition_on=None):
    """Helper to write parquets."""
    print(path)
    df.to_parquet(
        path,
        overwrite=True,
        partition_on=partition_on,
        engine="pyarrow",
        write_metadata_file=True,
        write_index=False,
    )

def read_from_parquet(cache_path):
    """Read a cached parquet into a dask dataframe."""
    return dd.read_parquet(cache_path, engine='pyarrow', ignore_metadata_file=True)


@register_backend
class ParquetBackend(FileBackend):
    """
    Parquet backend for caching tabular data in a parquet file.

    This backend supports dask and pandas dataframes.
    """

    backend_name = 'parquet'
    default_for_type = dd.DataFrame

    def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
        super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs, 'parquet')

    def write(self, data, upsert=False, primary_keys=None):
        if isinstance(data, dd.DataFrame):
            self.write_to_parquet(data, self.path, self.temp_cache_path, upsert=upsert, primary_keys=primary_keys)
        elif isinstance(data, pd.DataFrame):
            if upsert:
                raise RuntimeError("Parquet backend does not support upsert for pandas engine.")

            part = None
            if hasattr(data, 'cache_partition'):
                part = data.cache_partition

            data.to_parquet(self.path, partition_cols=part, engine='pyarrow')
        else:
            raise RuntimeError("Parquet backend only supports dask and pandas engines.")

    def read(self, engine):
        if engine == 'pandas' or engine == pd.DataFrame or engine is None:
            return pd.read_parquet(self.path)
        elif engine == 'dask' or engine == dd.DataFrame:
            return dd.read_parquet(self.path, engine='pyarrow', ignore_metadata_file=True)
        else:
            raise RuntimeError("Parquet backend only supports dask and pandas engines.")

    def write_to_parquet(self, df, cache_path, temp_cache_path, upsert=False, primary_keys=None):
        """Write a pandas or dask dataframe to a parquet."""
        part = None
        if hasattr(df, 'cache_partition'):
            part = df.cache_partition

        if upsert and self.fs.exists(cache_path):
            print("Found existing cache for upsert.")
            if primary_keys is None:
                raise ValueError("Upsert may only be performed with primary keys specified")

            if isinstance(df, pd.DataFrame):
                print("Auto converting pandas to dask dataframe.")
                df = dd.from_pandas(df)

            if not isinstance(df, dd.DataFrame):
                raise RuntimeError("Upsert is only supported by dask dataframes for parquet")

            existing_df = read_from_parquet(cache_path)

            # Record starting partitions
            start_parts = df.npartitions
            existing_parts = existing_df.npartitions

            # Coerce dtypes before joining
            for key in primary_keys:
                if is_datetime(existing_df[key].dtype):
                    # The only way I could get this to work was by removing timezones;
                    # many attempts to coerce df to the existing df's timezone failed
                    df[key] = df[key].dt.tz_localize(None)
                    df[key] = dd.to_datetime(df[key], utc=False)
                    existing_df[key] = existing_df[key].dt.tz_localize(None)
                    existing_df[key] = dd.to_datetime(existing_df[key], utc=False)
                elif df[key].dtype != existing_df[key].dtype:
                    df[key] = df[key].astype(existing_df[key].dtype)

            outer_join = existing_df.merge(df, how='outer', on=primary_keys, indicator=True, suffixes=('_drop', ''))
            new_rows = outer_join[(outer_join._merge == 'right_only')].drop('_merge', axis=1)
            cols_to_drop = [x for x in new_rows.columns if x.endswith('_drop')]
            new_rows = new_rows.drop(columns=cols_to_drop)

            # Now concat with existing df
            new_rows = new_rows.astype(existing_df.dtypes)
            new_rows = new_rows[list(existing_df.columns)]
            final_df = dd.concat([existing_df, new_rows])

            if len(new_rows.index) > 0:
                final_df = final_df.repartition(npartitions=start_parts + existing_parts)

                # Coerce dtypes and make the columns the same order

                print("Copying cache for 'consistent' upsert.")
                if self.fs.exists(temp_cache_path):
                    self.fs.rm(temp_cache_path, recursive=True)

                write_parquet_helper(final_df, temp_cache_path, part)
                print("Successfully appended rows to temp parquet. Overwriting existing cache.")

                if self.fs.exists(cache_path):
                    self.fs.rm(cache_path, recursive=True)

                self.fs.cp(temp_cache_path, cache_path, recursive=True)

            else:
                print("No rows to upsert.")
        else:
            write_parquet_helper(df, cache_path, part)
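The upsert path above merges on the supplied primary keys and appends only rows whose keys are not already cached. This sketch illustrates that flow; the base path, cache key, and column names are made up.

```python
# Sketch of the parquet upsert path: only rows with unseen primary keys get appended.
import pandas as pd
import dask.dataframe as dd
from nuthatch.backends.parquet import ParquetBackend

backend = ParquetBackend({"filesystem": "/tmp/nuthatch-cache"}, "daily_obs", None, {}, {})

first = dd.from_pandas(
    pd.DataFrame({"station": ["KSEA", "KPDX"], "t_max": [21.0, 24.5]}), npartitions=1)
backend.write(first)

# The second write has one existing key (KSEA) and one new key (KBOI); with
# upsert=True only the KBOI row is appended to the cached parquet.
second = dd.from_pandas(
    pd.DataFrame({"station": ["KSEA", "KBOI"], "t_max": [21.0, 30.1]}), npartitions=1)
backend.write(second, upsert=True, primary_keys=["station"])

print(backend.read(engine="dask").compute().sort_values("station"))
```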
nuthatch/backends/sql.py
ADDED
@@ -0,0 +1,147 @@
from nuthatch.backend import DatabaseBackend, register_backend
from os.path import join
import hashlib
import sqlalchemy
import uuid
import pandas as pd
import dask.dataframe as dd

def hashed_table_name(table_name):
    """Return a hashed table name suitable for postgres."""
    return hashlib.md5(table_name.encode()).hexdigest()

@register_backend
class SQLBackend(DatabaseBackend):
    """
    SQL backend for caching tabular data in a SQL database.

    This backend supports dask and pandas dataframes.
    """

    backend_name = "sql"

    def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
        super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs)

        if backend_kwargs and 'hash_table_name' in backend_kwargs:
            self.table_name = hashed_table_name(cache_key)
        else:
            self.table_name = cache_key

        if namespace:
            self.table_name = namespace + '.' + self.table_name

            if not self.write_engine.dialect.has_schema(self.write_engine, namespace):
                self.write_engine.execute(sqlalchemy.schema.CreateSchema(namespace))


    def write(self, data, upsert=False, primary_keys=None):
        if upsert and self.exists():
            if primary_keys is None or not isinstance(primary_keys, list):
                raise ValueError("Upsert may only be performed with primary keys specified as a list.")

            if not isinstance(data, dd.DataFrame):
                raise RuntimeError("Upsert is only supported by dask dataframes.")

            with self.engine.begin() as conn:
                print("SQL cache exists for upsert.")
                # If it already exists...

                # Extract the primary key columns for SQL constraint
                for key in primary_keys:
                    if key not in data.columns:
                        raise ValueError("Dataframe MUST contain all primary keys as columns")

                # Write a temporary table
                temp_table_name = f"temp_{uuid.uuid4().hex[:6]}"

                if isinstance(data, pd.DataFrame):
                    data.to_sql(temp_table_name, self.engine, index=False)
                elif isinstance(data, dd.DataFrame):
                    data.to_sql(temp_table_name, uri=self.engine.url.render_as_string(hide_password=False), index=False, parallel=True, chunksize=10000)
                else:
                    raise RuntimeError("Did not return dataframe type.")

                index_sql_txt = ", ".join([f'"{i}"' for i in primary_keys])
                columns = list(data.columns)
                headers = primary_keys + list(set(columns) - set(primary_keys))
                headers_sql_txt = ", ".join(
                    [f'"{i}"' for i in headers]
                )  # index1, index2, ..., column 1, col2, ...

                # col1 = excluded.col1, col2 = excluded.col2
                # Excluded statement updates values of rows where primary keys conflict
                update_column_stmt = ", ".join([f'"{col}" = EXCLUDED."{col}"' for col in columns])

                # For the ON CONFLICT clause, postgres requires that the columns have a unique constraint.
                # To add it "if not exists" we must drop if exists and then add; in a transaction this is consistent.

                # Constraint IDs need to be globally unique to not conflict in the database
                constraint_id = hashlib.md5(self.cache_key.encode()).hexdigest()[:10]
                query_pk = f"""
                ALTER TABLE "{self.table_name}" DROP CONSTRAINT IF EXISTS unique_constraint_for_{constraint_id} CASCADE;
                """

                print("Adding a unique constraint to the table if it doesn't exist.")
                conn.exec_driver_sql(query_pk)

                query_pk = f"""
                ALTER TABLE "{self.table_name}" ADD CONSTRAINT unique_constraint_for_{constraint_id}
                UNIQUE ({index_sql_txt});
                """
                conn.exec_driver_sql(query_pk)

                # Compose and execute upsert query
                query_upsert = f"""INSERT INTO "{self.table_name}" ({headers_sql_txt})
                SELECT {headers_sql_txt} FROM "{temp_table_name}"
                ON CONFLICT ({index_sql_txt}) DO UPDATE
                SET {update_column_stmt};
                """
                print("Upserting.")
                conn.exec_driver_sql(query_upsert)
                conn.exec_driver_sql(f"DROP TABLE {temp_table_name}")
        else:
            try:
                if isinstance(data, pd.DataFrame):
                    data.to_sql(self.table_name, self.write_engine, if_exists='replace', index=False)
                    return data
                elif isinstance(data, dd.DataFrame):
                    data.to_sql(self.table_name, self.engine.url.render_as_string(hide_password=False), if_exists='replace', index=False, parallel=True, chunksize=10000)
                else:
                    raise RuntimeError("Did not return dataframe type.")

                # Also log the table name in the tables table
                pd_name = {'table_name': [self.cache_key], 'table_key': [self.table_name], 'created_at': [pd.Timestamp.now()]}
                pd_name = pd.DataFrame(pd_name)
                pd_name.to_sql('cache_tables', self.write_engine, if_exists='append')
            except sqlalchemy.exc.InterfaceError:
                raise RuntimeError("Error connecting to database.")

    def read(self, engine=None):
        if engine == 'pandas' or engine == pd.DataFrame or engine == 'dask' or engine == dd.DataFrame or engine is None:
            try:
                data = pd.read_sql_query(f'select * from "{self.table_name}"', con=self.engine)
                if engine == 'dask' or engine == dd.DataFrame:
                    return dd.from_pandas(data)
                else:
                    return data
            except sqlalchemy.exc.InterfaceError:
                raise RuntimeError("Error connecting to database.")
        else:
            raise RuntimeError("SQL backend only supports pandas and dask engines.")

    def exists(self):
        try:
            insp = sqlalchemy.inspect(self.engine)
            return insp.has_table(self.table_name)
        except sqlalchemy.exc.InterfaceError:
            raise RuntimeError("Error connecting to database.")

    def get_file_path(self):
        return join(self.engine.url.render_as_string(), self.table_name)

    def delete(self):
        metadata = sqlalchemy.MetaData()
        table = sqlalchemy.Table(self.table_name, metadata)
        table.drop(self.engine, checkfirst=True)
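For context, the SQL backend is configured through the `DatabaseBackend.config_parameters` listed in nuthatch/backend.py above. The sketch below shows an example configuration and the shape of the upsert statement the backend composes; all values, the table name, and the temp-table suffix are placeholders, not output from the package.

```python
# Illustrative config for the SQL backend, keyed by DatabaseBackend.config_parameters.
sql_config = {
    "driver": "postgresql+psycopg2",
    "host": "localhost",
    "port": 5432,
    "database": "nuthatch",
    "username": "reader",
    "password": "reader-password",
    "write_username": "writer",        # optional: without these, writes reuse username/password
    "write_password": "writer-password",
}

# For upserts, SQLBackend stages rows in a temp table and emits a statement of this
# shape (here for primary_keys=["station"] on a cached table named "daily_obs"):
#
#   INSERT INTO "daily_obs" ("station", "t_max")
#   SELECT "station", "t_max" FROM "temp_ab12cd"
#   ON CONFLICT ("station") DO UPDATE
#   SET "station" = EXCLUDED."station", "t_max" = EXCLUDED."t_max";
```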