nuthatch-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nuthatch might be problematic.

nuthatch/__init__.py ADDED
@@ -0,0 +1,14 @@
+ """
+ Nuthatch is a library for caching data based on the function call and its arguments.
+
+ It caches data in a variety of backends optimized for different data types.
+ """
+ from .config import config_parameter
+ from .nuthatch import cache
+
+ # Trigger backend registration
+ import nuthatch.backends  # noqa
+
+ __version__ = "0.1.0"
+
+ __all__ = ["config_parameter", "cache", "__version__", "cli"]
nuthatch/backend.py ADDED
@@ -0,0 +1,301 @@
+ """The backend module is used to register and get backends.
+
+ It also contains the abstract base class for all backends.
+ """
+ from abc import ABC, abstractmethod
+ from os.path import join
+ import fsspec
+ import sqlalchemy
+
+ registered_backends = {}
+ default_backends = {}
+
+ def register_backend(backendClass):
+     """Register a backend class with the nuthatch system.
+
+     This function registers a backend class so it can be used by the nuthatch
+     caching system. The backend class must have a `backend_name` attribute.
+     Optionally, if the backend class has a `default_for_type` attribute, it
+     will be set as the default backend for that data type.
+
+     Args:
+         backendClass: The backend class to register. Must have a `backend_name`
+             attribute and optionally a `default_for_type` attribute.
+
+     Returns:
+         The registered backend class (allows use as a decorator).
+
+     Example:
+         @register_backend
+         class MyBackend:
+             backend_name = "my_backend"
+             default_for_type = "my_data_type"
+     """
+     registered_backends[backendClass.backend_name] = backendClass
+
+     if 'default_for_type' in backendClass.__dict__:
+         default_backends[backendClass.default_for_type] = backendClass.backend_name
+
+     return backendClass
+
+ def get_backend_by_name(backend_name):
+     """Retrieve a registered backend class by its name.
+
+     Args:
+         backend_name (str): The name of the backend to retrieve.
+
+     Returns:
+         The backend class associated with the given name.
+
+     Raises:
+         KeyError: If no backend is registered with the given name.
+     """
+     return registered_backends[backend_name]
+
+ def get_default_backend(data_type):
+     """Get the default backend name for a specific data type.
+
+     Args:
+         data_type (str): The data type to get the default backend for.
+
+     Returns:
+         str: The name of the default backend for the data type, or 'basic'
+             if no default is set.
+     """
+     if data_type in default_backends:
+         return default_backends[data_type]
+     else:
+         return 'basic'
+
+ class NuthatchBackend(ABC):
+     config_parameters = []
+
+     def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
+         """The abstract base class for all cacheable backends.
+
+         This is the base class for all cacheable backends in the nuthatch system.
+         Each backend instance manages a specific cache entry with its own
+         configuration and storage mechanism.
+
+         Args:
+             cacheable_config (dict): Configuration dictionary containing static or
+                 dynamic parameters. Required parameters should be listed as strings
+                 in the backend's `config_parameters` class attribute.
+             cache_key (str): Unique identifier for the cache entry. This key is
+                 used to distinguish between different cached items.
+             namespace (str, optional): Optional namespace to organize cache entries.
+                 If provided, cache entries will be stored under this namespace.
+             args (dict): Key-value pairs of arguments and values that were passed
+                 to the function being cached. These are used to generate cache
+                 keys and may influence backend behavior.
+             backend_kwargs (dict): User-configurable keyword arguments specific to
+                 the backend implementation. For example, the zarr backend uses
+                 these for per-argument-value chunking configuration.
+
+         Note:
+             This is an abstract base class. Subclasses must implement the abstract
+             methods: `write()`, `read()`, `delete()`, `exists()`, `get_file_path()`,
+             and `sync()`.
+         """
+
+         # Store the base configuration and cache metadata on the instance
+         self.config = cacheable_config
+         self.cache_key = cache_key
+         self.namespace = namespace
+         self.backend_kwargs = backend_kwargs
+         self.args = args
+
+
+     @abstractmethod
+     def write(self, data, upsert=False, primary_keys=None):
+         """Write data to the backend.
+
+         This method is responsible for writing data to the backend. It should
+         be implemented by subclasses to handle the specific storage mechanism
+         of the backend.
+
+         Args:
+             data (any): The data to write to the backend.
+             upsert (bool, optional): Whether to perform an upsert operation.
+                 If True, the data will be inserted or updated based on the
+                 primary keys provided.
+             primary_keys (list, optional): List of primary key columns to use
+                 for upsert operations.
+         """
+         pass
+
+     @abstractmethod
+     def read(self, engine):
+         """Read data from the backend.
+
+         This method is responsible for reading data from the backend. It should
+         be implemented by subclasses to handle the specific storage mechanism
+         of the backend.
+
+         Args:
+             engine (str or type): The data processing engine to use for
+                 reading data from the backend.
+
+         Returns:
+             Any: The data read from the backend.
+         """
+         pass
+
+     @abstractmethod
+     def delete(self):
+         """Delete data from the backend.
+
+         This method is responsible for deleting data from the backend. It should
+         be implemented by subclasses to handle the specific storage mechanism
+         of the backend.
+         """
+         pass
+
+     @abstractmethod
+     def exists(self):
+         """Check if the data exists in the backend.
+
+         This method is responsible for checking if the data exists in the backend.
+         It should be implemented by subclasses to handle the specific storage
+         mechanism of the backend.
+
+         Returns:
+             bool: True if the data exists in the backend, False otherwise.
+         """
+         pass
+
+     @abstractmethod
+     def get_file_path(self):
+         """Get the file path of the data in the backend.
+
+         This method is responsible for returning the file path of the data in the
+         backend. It should be implemented by subclasses to handle the specific
+         storage mechanism of the backend.
+
+         Returns:
+             str: The file path of the data in the backend.
+         """
+         pass
+
+     @abstractmethod
+     def sync(self, from_backend):
+         """Sync data from one backend to another.
+
+         This method is responsible for syncing data from one backend to another.
+         It should be implemented by subclasses to handle the specific storage
+         mechanism of the backend.
+
+         Args:
+             from_backend (NuthatchBackend): The backend to sync data from.
+         """
+         pass
+
+
+ class FileBackend(NuthatchBackend):
+     """Base class for all backends that rely on a filesystem."""
+
+     config_parameters = ["filesystem", "filesystem_options"]
+
+     def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs, extension):
+         super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs)
+
+         self.base_path = self.config['filesystem']
+
+         if namespace:
+             self.raw_cache_path = join(self.base_path, namespace, cache_key)
+         else:
+             self.raw_cache_path = join(self.base_path, cache_key)
+
+         self.temp_cache_path = join(self.base_path, 'temp', cache_key)
+         self.extension = extension
+         self.path = self.raw_cache_path + '.' + extension
+         if 'filesystem_options' not in self.config:
+             self.config['filesystem_options'] = {}
+
+         if fsspec.utils.get_protocol(self.path) == 'file':
+             # If the protocol is a local filesystem, we need to create the directory if it doesn't exist
+             self.fs = fsspec.core.url_to_fs(self.path, auto_mkdir=True, **self.config['filesystem_options'])[0]
+         else:
+             self.fs = fsspec.core.url_to_fs(self.path, **self.config['filesystem_options'])[0]
+
+
+     def exists(self):
+         return self.fs.exists(self.path)
+
+     def delete(self):
+         self.fs.rm(self.path, recursive=True)
+
+     def get_file_path(self):
+         return self.path
+
+     def get_cache_key(self, path):
+         if path.endswith('.' + self.extension):
+             path = path[:-len('.' + self.extension)]
+
+         if path.startswith(self.base_path):
+             path = path[len(self.base_path):]
+
+         stripped = fsspec.core.strip_protocol(self.base_path)
+         if path.startswith(stripped):
+             path = path[len(stripped):]
+
+         if path.startswith('/'):
+             path = path[1:]
+
+         if self.namespace:
+             if path.startswith(self.namespace):
+                 path = path[len(self.namespace):]
+
+             if path.startswith('/'):
+                 path = path[1:]
+
+         return path
+
+
+     def sync(self, from_backend):
+         from_backend.fs.get(from_backend.path, self.path)
+
+ @register_backend
+ class NullBackend(FileBackend):
+
+     backend_name = 'null'
+
+     def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
+         super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs, 'null')
+
+     def read(self, engine=None):
+         raise RuntimeError("Null backend used only for import/export path manipulation. Cannot read from null backend.")
+
+     def write(self, data, upsert=False, primary_keys=None):
+         raise RuntimeError("Null backend used only for import/export path manipulation. Cannot write to null backend.")
+
+
+ class DatabaseBackend(NuthatchBackend):
+     """Base class for all backends that rely on a database."""
+
+     config_parameters = ["driver", "host", "port", "database", "username", "password", "write_username", "write_password"]
+
+     def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
+         super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs)
+
+         database_url = sqlalchemy.URL.create(self.config['driver'],
+                                              username=self.config['username'],
+                                              password=self.config['password'],
+                                              host=self.config['host'],
+                                              port=self.config['port'],
+                                              database=self.config['database'])
+         self.engine = sqlalchemy.create_engine(database_url)
+
+         if 'write_username' in self.config and 'write_password' in self.config:
+             write_database_url = sqlalchemy.URL.create(self.config['driver'],
+                                                        username=self.config['write_username'],
+                                                        password=self.config['write_password'],
+                                                        host=self.config['host'],
+                                                        port=self.config['port'],
+                                                        database=self.config['database'])
+             self.write_engine = sqlalchemy.create_engine(write_database_url)
+         else:
+             self.write_engine = self.engine
+
+     def sync(self, local_backend):
+         raise NotImplementedError("Backend syncing not implemented for database-like backends.")
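Editor's note: as a worked illustration of the registration pattern above (not part of the package), a custom backend can subclass FileBackend, declare a backend_name, and implement read and write; the @register_backend decorator then adds it to registered_backends. The JSONBackend name and the 'json' extension below are hypothetical, modeled directly on the BasicBackend shown later in this diff.

# Hypothetical sketch of a user-defined backend (illustrative only).
import json

from nuthatch.backend import FileBackend, register_backend


@register_backend
class JSONBackend(FileBackend):
    """Cache JSON-serializable objects as .json files."""

    backend_name = "json"  # key used by get_backend_by_name("json")

    def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
        # 'json' becomes the file extension appended by FileBackend
        super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs, 'json')

    def write(self, data, upsert=False, primary_keys=None):
        if upsert:
            raise ValueError("JSON backend does not support upsert.")
        with self.fs.open(self.path, 'w') as f:
            json.dump(data, f)

    def read(self, engine=None):
        if self.fs.exists(self.path):
            with self.fs.open(self.path, 'r') as f:
                return json.load(f)

The inherited exists(), delete(), get_file_path(), and sync() from FileBackend need no changes for a simple file format like this.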
nuthatch/backends/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from .sql import SQLBackend
+ from .delta import DeltaBackend
+ from .basic import BasicBackend
+ from .zarr import ZarrBackend
+ from .terracotta import TerracottaBackend
+ from .parquet import ParquetBackend
+
+ __all__ = ["SQLBackend", "DeltaBackend", "BasicBackend", "ZarrBackend", "TerracottaBackend", "ParquetBackend"]
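Editor's note: importing nuthatch.backends (as the top-level __init__.py does) runs the @register_backend decorators, which populate registered_backends and default_backends. A small, hedged illustration of how lookups then resolve, based only on the defaults declared in the modules shown in this diff:

import pandas as pd
import dask.dataframe as dd

import nuthatch.backends  # noqa: F401 - triggers backend registration
from nuthatch.backend import get_backend_by_name, get_default_backend

# DeltaBackend declares default_for_type = pd.DataFrame,
# ParquetBackend declares default_for_type = dd.DataFrame.
assert get_default_backend(pd.DataFrame) == 'delta'
assert get_default_backend(dd.DataFrame) == 'parquet'
assert get_default_backend(dict) == 'basic'       # unknown types fall back to 'basic'

BackendCls = get_backend_by_name('sql')           # -> the SQLBackend class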
nuthatch/backends/basic.py ADDED
@@ -0,0 +1,28 @@
+ from nuthatch.backend import FileBackend, register_backend
+ import pickle
+
+ @register_backend
+ class BasicBackend(FileBackend):
+     """
+     Basic backend for caching data in a pickle file.
+     """
+
+     backend_name = "basic"
+
+     def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
+         super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs, 'pkl')
+
+
+     def write(self, data, upsert=False, primary_keys=None):
+         if upsert:
+             raise ValueError("Basic/pickle backend does not support upsert.")
+
+         with self.fs.open(self.path, 'wb') as f:
+             pickle.dump(data, f)
+
+
+     def read(self, engine=None):
+         # Check to make sure the cached file exists before reading
+         if self.fs.exists(self.path):
+             with self.fs.open(self.path, 'rb') as f:
+                 return pickle.load(f)
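Editor's note: a hedged sketch of driving a file-based backend directly, using only the constructor arguments and config keys visible above. The /tmp path, cache key, and data values are invented for illustration; in normal use these objects are presumably constructed by the cache decorator rather than by hand.

from nuthatch.backends.basic import BasicBackend

# 'filesystem' is the base path; 'filesystem_options' is optional (defaults to {}).
config = {"filesystem": "/tmp/nuthatch-cache"}

backend = BasicBackend(
    cacheable_config=config,
    cache_key="my_function_abc123",      # hypothetical cache key
    namespace="experiments",             # optional namespace
    args={"x": 1},                       # arguments of the cached call
    backend_kwargs={},
)

backend.write({"result": 42})
print(backend.exists())                  # True
print(backend.get_file_path())           # .../experiments/my_function_abc123.pkl
print(backend.read())                    # {'result': 42}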
nuthatch/backends/delta.py ADDED
@@ -0,0 +1,46 @@
+ from nuthatch.backend import FileBackend, register_backend
+ from deltalake import DeltaTable, write_deltalake
+ import dask.dataframe as dd
+ import pandas as pd
+ import dask_deltatable as ddt
+
+ @register_backend
+ class DeltaBackend(FileBackend):
+     """
+     Delta backend for caching tabular data in a delta table.
+
+     This backend supports dask and pandas dataframes.
+     """
+
+     backend_name = "delta"
+     default_for_type = pd.DataFrame
+
+     def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
+         super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs, 'delta')
+
+
+     def write(self, data, upsert=False, primary_keys=None):
+         """Write a pandas dataframe to a delta table."""
+         if upsert:
+             raise ValueError("Delta backend does not support upsert.")
+
+         if isinstance(data, dd.DataFrame):
+             print("""Warning: Dask dataframe passed to delta backend. Will run `compute()`
+                      on the dataframe prior to storage. This will fail if the dataframe
+                      does not fit in memory. Use `backend=parquet` to handle parallel writing of dask dataframes.""")
+             write_data = data.compute()
+         elif isinstance(data, pd.DataFrame):
+             write_data = data
+         else:
+             raise RuntimeError("Delta backend only supports dask and pandas engines.")
+
+         write_deltalake(self.path, write_data, mode='overwrite', schema_mode='overwrite')
+
+
+     def read(self, engine=None):
+         if engine == 'pandas' or engine == pd.DataFrame or engine is None:
+             return DeltaTable(self.path).to_pandas()
+         elif engine == 'dask' or engine == dd.DataFrame:
+             return ddt.read_deltalake(self.path)
+         else:
+             raise RuntimeError("Delta backend only supports dask and pandas engines.")
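Editor's note: to make the engine dispatch in DeltaBackend.read concrete, a hedged usage sketch; the path, cache key, and data are hypothetical, and construction mirrors the BasicBackend example above.

import pandas as pd
import dask.dataframe as dd
from nuthatch.backends.delta import DeltaBackend

backend = DeltaBackend({"filesystem": "/tmp/nuthatch-cache"},
                       "prices_daily", None, {}, {})

backend.write(pd.DataFrame({"ticker": ["A", "B"], "price": [1.0, 2.0]}))

df_pd = backend.read()                 # engine=None -> pandas DataFrame
df_dd = backend.read(engine='dask')    # dask dataframe via dask_deltatable
assert isinstance(df_pd, pd.DataFrame)
assert isinstance(df_dd, dd.DataFrame)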
nuthatch/backends/parquet.py ADDED
@@ -0,0 +1,130 @@
+ from nuthatch.backend import FileBackend, register_backend
+ from pandas.api.types import is_datetime64_any_dtype as is_datetime
+ import dask.dataframe as dd
+ import pandas as pd
+
+
+ def write_parquet_helper(df, path, partition_on=None):
+     """Helper to write parquets."""
+     print(path)
+     df.to_parquet(
+         path,
+         overwrite=True,
+         partition_on=partition_on,
+         engine="pyarrow",
+         write_metadata_file=True,
+         write_index=False,
+     )
+
+ def read_from_parquet(cache_path):
+     """Read from a parquet cache into a dask dataframe."""
+     return dd.read_parquet(cache_path, engine='pyarrow', ignore_metadata_file=True)
+
+
+
+ @register_backend
+ class ParquetBackend(FileBackend):
+     """
+     Parquet backend for caching tabular data in a parquet file.
+
+     This backend supports dask and pandas dataframes.
+     """
+
+     backend_name = 'parquet'
+     default_for_type = dd.DataFrame
+
+     def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
+         super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs, 'parquet')
+
+     def write(self, data, upsert=False, primary_keys=None):
+         if isinstance(data, dd.DataFrame):
+             self.write_to_parquet(data, self.path, self.temp_cache_path, upsert=upsert, primary_keys=primary_keys)
+         elif isinstance(data, pd.DataFrame):
+             if upsert:
+                 raise RuntimeError("Parquet backend does not support upsert for pandas engine.")
+
+             part = None
+             if hasattr(data, 'cache_partition'):
+                 part = data.cache_partition
+
+             data.to_parquet(self.path, partition_cols=part, engine='pyarrow')
+         else:
+             raise RuntimeError("Parquet backend only supports dask and pandas engines.")
+
+     def read(self, engine):
+         if engine == 'pandas' or engine == pd.DataFrame or engine is None:
+             return pd.read_parquet(self.path)
+         elif engine == 'dask' or engine == dd.DataFrame:
+             return dd.read_parquet(self.path, engine='pyarrow', ignore_metadata_file=True)
+         else:
+             raise RuntimeError("Parquet backend only supports dask and pandas engines.")
+
+     def write_to_parquet(self, df, cache_path, temp_cache_path, upsert=False, primary_keys=None):
+         """Write a pandas or dask dataframe to a parquet."""
+         part = None
+         if hasattr(df, 'cache_partition'):
+             part = df.cache_partition
+
+         if upsert and self.fs.exists(cache_path):
+             print("Found existing cache for upsert.")
+             if primary_keys is None:
+                 raise ValueError("Upsert may only be performed with primary keys specified")
+
+             if isinstance(df, pd.DataFrame):
+                 print("Auto converting pandas to dask dataframe.")
+                 df = dd.from_pandas(df)
+
+             if not isinstance(df, dd.DataFrame):
+                 raise RuntimeError("Upsert is only supported by dask dataframes for parquet")
+
+             existing_df = read_from_parquet(cache_path)
+
+             # Record starting partitions
+             start_parts = df.npartitions
+             existing_parts = existing_df.npartitions
+
+             # Coerce dtypes before joining
+             for key in primary_keys:
+                 if is_datetime(existing_df[key].dtype):
+                     # The only way I could get this to work was by removing timezones;
+                     # many attempts to coerce df to the existing df's timezone failed.
+                     df[key] = df[key].dt.tz_localize(None)
+                     df[key] = dd.to_datetime(df[key], utc=False)
+                     existing_df[key] = existing_df[key].dt.tz_localize(None)
+                     existing_df[key] = dd.to_datetime(existing_df[key], utc=False)
+                 elif df[key].dtype != existing_df[key].dtype:
+                     df[key] = df[key].astype(existing_df[key].dtype)
+
+
+             outer_join = existing_df.merge(df, how='outer', on=primary_keys, indicator=True, suffixes=('_drop', ''))
+             new_rows = outer_join[(outer_join._merge == 'right_only')].drop('_merge', axis=1)
+             cols_to_drop = [x for x in new_rows.columns if x.endswith('_drop')]
+             new_rows = new_rows.drop(columns=cols_to_drop)
+
+             # Now concat with existing df
+             new_rows = new_rows.astype(existing_df.dtypes)
+             new_rows = new_rows[list(existing_df.columns)]
+             final_df = dd.concat([existing_df, new_rows])
+
+             if len(new_rows.index) > 0:
+                 final_df = final_df.repartition(npartitions=start_parts + existing_parts)
+
+                 # Coerce dtypes and make the columns the same order
+
+                 print("Copying cache for 'consistent' upsert.")
+                 if self.fs.exists(temp_cache_path):
+                     self.fs.rm(temp_cache_path, recursive=True)
+
+                 write_parquet_helper(final_df, temp_cache_path, part)
+                 print("Successfully appended rows to temp parquet. Overwriting existing cache.")
+
+                 if self.fs.exists(cache_path):
+                     self.fs.rm(cache_path, recursive=True)
+
+                 self.fs.cp(temp_cache_path, cache_path, recursive=True)
+
+             else:
+                 print("No rows to upsert.")
+         else:
+             write_parquet_helper(df, cache_path, part)
+
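Editor's note: a hedged sketch of the dask upsert path implemented in write_to_parquet above. The config, cache key, and frames are invented for illustration; because the outer merge is keyed on primary_keys, only rows whose keys are not already cached are appended.

import pandas as pd
import dask.dataframe as dd
from nuthatch.backends.parquet import ParquetBackend

backend = ParquetBackend({"filesystem": "/tmp/nuthatch-cache"},
                         "observations", None, {}, {})

first = dd.from_pandas(pd.DataFrame({"id": [1, 2], "value": [10.0, 20.0]}), npartitions=1)
backend.write(first)                                     # initial write

update = dd.from_pandas(pd.DataFrame({"id": [2, 3], "value": [20.0, 30.0]}), npartitions=1)
backend.write(update, upsert=True, primary_keys=["id"])  # only id=3 is appended

print(backend.read(engine='dask').compute().sort_values("id"))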
nuthatch/backends/sql.py ADDED
@@ -0,0 +1,147 @@
+ from nuthatch.backend import DatabaseBackend, register_backend
+ from os.path import join
+ import hashlib
+ import sqlalchemy
+ import uuid
+ import pandas as pd
+ import dask.dataframe as dd
+
+ def hashed_table_name(table_name):
+     """Return a hashed (md5) table name."""
+     return hashlib.md5(table_name.encode()).hexdigest()
+
+ @register_backend
+ class SQLBackend(DatabaseBackend):
+     """
+     SQL backend for caching tabular data in a SQL database.
+
+     This backend supports dask and pandas dataframes.
+     """
+
+     backend_name = "sql"
+
+     def __init__(self, cacheable_config, cache_key, namespace, args, backend_kwargs):
+         super().__init__(cacheable_config, cache_key, namespace, args, backend_kwargs)
+
+         if backend_kwargs and 'hash_table_name' in backend_kwargs:
+             self.table_name = hashed_table_name(cache_key)
+         else:
+             self.table_name = cache_key
+
+         if namespace:
+             self.table_name = namespace + '.' + self.table_name
+
+             if not self.write_engine.dialect.has_schema(self.write_engine, namespace):
+                 self.write_engine.execute(sqlalchemy.schema.CreateSchema(namespace))
+
+
+
+     def write(self, data, upsert=False, primary_keys=None):
+         if upsert and self.exists():
+             if primary_keys is None or not isinstance(primary_keys, list):
+                 raise ValueError("Upsert may only be performed with primary keys specified as a list.")
+
+             if not isinstance(data, dd.DataFrame):
+                 raise RuntimeError("Upsert is only supported by dask dataframes for the SQL backend.")
+
+             with self.engine.begin() as conn:
+                 print("SQL cache exists for upsert.")
+                 # If it already exists...
+
+                 # Extract the primary key columns for SQL constraint
+                 for key in primary_keys:
+                     if key not in data.columns:
+                         raise ValueError("Dataframe MUST contain all primary keys as columns")
+
+                 # Write a temporary table
+                 temp_table_name = f"temp_{uuid.uuid4().hex[:6]}"
+
+                 if isinstance(data, pd.DataFrame):
+                     data.to_sql(temp_table_name, self.engine, index=False)
+                 elif isinstance(data, dd.DataFrame):
+                     data.to_sql(temp_table_name, uri=self.engine.url.render_as_string(hide_password=False), index=False, parallel=True, chunksize=10000)
+                 else:
+                     raise RuntimeError("Data is not a pandas or dask dataframe.")
+
+                 index_sql_txt = ", ".join([f'"{i}"' for i in primary_keys])
+                 columns = list(data.columns)
+                 headers = primary_keys + list(set(columns) - set(primary_keys))
+                 headers_sql_txt = ", ".join(
+                     [f'"{i}"' for i in headers]
+                 )  # index1, index2, ..., column 1, col2, ...
+
+                 # col1 = EXCLUDED.col1, col2 = EXCLUDED.col2
+                 # The EXCLUDED clause updates values of rows whose primary keys conflict
+                 update_column_stmt = ", ".join([f'"{col}" = EXCLUDED."{col}"' for col in columns])
+
+                 # For the ON CONFLICT clause, postgres requires that the columns have a unique constraint.
+                 # To add it if it does not exist, we must drop it if it exists and then add it. In a transaction this is consistent.
+
+                 # Constraint IDs need to be globally unique to not conflict in the database
+                 constraint_id = hashlib.md5(self.cache_key.encode()).hexdigest()[:10]
+                 query_pk = f"""
+                 ALTER TABLE "{self.table_name}" DROP CONSTRAINT IF EXISTS unique_constraint_for_{constraint_id} CASCADE;
+                 """
+
+                 print("Adding a unique constraint to the table if it doesn't exist.")
+                 conn.exec_driver_sql(query_pk)
+
+                 query_pk = f"""
+                 ALTER TABLE "{self.table_name}" ADD CONSTRAINT unique_constraint_for_{constraint_id}
+                 UNIQUE ({index_sql_txt});
+                 """
+                 conn.exec_driver_sql(query_pk)
+
+                 # Compose and execute upsert query
+                 query_upsert = f"""INSERT INTO "{self.table_name}" ({headers_sql_txt})
+                 SELECT {headers_sql_txt} FROM "{temp_table_name}"
+                 ON CONFLICT ({index_sql_txt}) DO UPDATE
+                 SET {update_column_stmt};
+                 """
+                 print("Upserting.")
+                 conn.exec_driver_sql(query_upsert)
+                 conn.exec_driver_sql(f"DROP TABLE {temp_table_name}")
+         else:
+             try:
+                 if isinstance(data, pd.DataFrame):
+                     data.to_sql(self.table_name, self.write_engine, if_exists='replace', index=False)
+                     return data
+                 elif isinstance(data, dd.DataFrame):
+                     data.to_sql(self.table_name, self.engine.url.render_as_string(hide_password=False), if_exists='replace', index=False, parallel=True, chunksize=10000)
+                 else:
+                     raise RuntimeError("Data is not a pandas or dask dataframe.")
+
+                 # Also log the table name in the tables table
+                 pd_name = {'table_name': [self.cache_key], 'table_key': [self.table_name], 'created_at': [pd.Timestamp.now()]}
+                 pd_name = pd.DataFrame(pd_name)
+                 pd_name.to_sql('cache_tables', self.write_engine, if_exists='append')
+             except sqlalchemy.exc.InterfaceError:
+                 raise RuntimeError("Error connecting to database.")
+
+     def read(self, engine=None):
+         if engine == 'pandas' or engine == pd.DataFrame or engine == 'dask' or engine == dd.DataFrame or engine is None:
+             try:
+                 data = pd.read_sql_query(f'select * from "{self.table_name}"', con=self.engine)
+                 if engine == 'dask' or engine == dd.DataFrame:
+                     return dd.from_pandas(data)
+                 else:
+                     return data
+             except sqlalchemy.exc.InterfaceError:
+                 raise RuntimeError("Error connecting to database.")
+         else:
+             raise RuntimeError("SQL backend only supports pandas and dask engines.")
+
+     def exists(self):
+         try:
+             insp = sqlalchemy.inspect(self.engine)
+             return insp.has_table(self.table_name)
+         except sqlalchemy.exc.InterfaceError:
+             raise RuntimeError("Error connecting to database.")
+
+     def get_file_path(self):
+         return join(self.engine.url.render_as_string(), self.table_name)
+
+     def delete(self):
+         metadata = sqlalchemy.MetaData()
+         table = sqlalchemy.Table(self.table_name, metadata)
+         table.drop(self.engine, checkfirst=True)
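Editor's note: to tie the DatabaseBackend configuration keys to SQLBackend, a hedged sketch; every connection value below is a placeholder, and passing 'hash_table_name' in backend_kwargs switches the table name to the md5 digest produced by hashed_table_name.

import pandas as pd
from nuthatch.backends.sql import SQLBackend

# Keys follow DatabaseBackend.config_parameters; values here are placeholders.
config = {
    "driver": "postgresql+psycopg2",
    "host": "localhost",
    "port": 5432,
    "database": "nuthatch",
    "username": "reader",
    "password": "secret",
    # "write_username" / "write_password" are optional; without them the
    # read engine is reused for writes.
}

backend = SQLBackend(config, "daily_metrics", None, {},
                     backend_kwargs={"hash_table_name": True})

backend.write(pd.DataFrame({"day": ["2024-01-01"], "value": [1.0]}))
print(backend.exists())          # True once the table has been created
print(backend.read())            # pandas DataFrame read back via read_sql_query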