nuthatch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nuthatch might be problematic. Click here for more details.

nuthatch/cache.py ADDED
@@ -0,0 +1,529 @@
1
+ from deltalake import DeltaTable, write_deltalake, QueryBuilder
2
+ from os.path import join
3
+ import copy
4
+ import git
5
+ import getpass
6
+ import datetime
7
+ from .backend import get_backend_by_name
8
+ from .config import get_config
9
+ import pyarrow as pa
10
+ import pandas as pd
11
+ import sqlalchemy
12
+
13
class Cache():
    """Manage the metadata and backend for a single cache entry.

    It is responsible for:
    - Instantiating the correct backend
    - Managing the metadata in the metadata database or delta table
    - Writing and reading data to the backend

    Metadata lives in one of two stores, recorded in ``self.store``:
    ``'delta'`` (a delta table under the configured filesystem) or
    ``'database'`` (a SQL table reached through SQLAlchemy).
    """
    database_parameters = ["driver", "host", "port", "database", "username", "password"]
    config_parameters = ['filesystem', 'filesystem_options', 'metadata_location'] + database_parameters
    backend_name = "cache_metadata"
    # Class-level caches shared by every instance so that a delta table is
    # opened only once per backend location / config pair.
    delta_tables = {}
    delta_table_configs = {}

    def __init__(self, config, cache_key, namespace, args, backend_location, requested_backend, backend_kwargs):
        """Set up the metadata store and, when possible, the data backend.

        Args:
            config (dict): Metadata store configuration (see ``config_parameters``).
            cache_key (str): Key identifying the cache entry; may be falsy.
            namespace (str): Namespace the cache entry belongs to.
            args: Passed through unchanged to the backend constructor.
            backend_location: Location used to look up backend configuration.
            requested_backend (str): Name of the backend to use; when falsy the
                backend recorded in the metadata (if any) is used.
            backend_kwargs (dict): Extra keyword arguments for the backend; a
                deep copy is handed to the backend.
        """
        self.cache_key = cache_key
        self.config = config
        self.namespace = namespace
        self.location = backend_location
        self.args = args
        self.backend_kwargs = backend_kwargs
        # True only once list() has installed a LIKE pattern as the cache key.
        self._cache_key_is_pattern = False

        # Either instantiate a delta table or a database table
        if self._uses_delta_store(config):
            self._init_delta_store(config, backend_location)
        else:
            self._init_database_store()

        self.backend = None
        self.backend_name = None

        backend_class = None
        if requested_backend:
            backend_class = get_backend_by_name(requested_backend)
            self.backend_name = requested_backend
        elif self.cache_key:
            stored_backend = self._get_backend_from_metadata()
            if stored_backend:
                backend_class = get_backend_by_name(stored_backend)
                self.backend_name = stored_backend

        if backend_class and self.cache_key:
            backend_config = get_config(location=backend_location,
                                        requested_parameters=backend_class.config_parameters,
                                        backend_name=backend_class.backend_name)
            if backend_config:
                self.backend = backend_class(backend_config, cache_key, namespace, args,
                                             copy.deepcopy(backend_kwargs))

    @classmethod
    def _uses_delta_store(cls, config):
        """Return True when the config selects the delta/filesystem metadata store.

        The delta store is chosen when ``metadata_location`` is ``'filesystem'``,
        when ``metadata_location`` is absent but ``filesystem`` is present, or
        when any database connection parameter is missing.
        """
        if 'metadata_location' in config:
            if config['metadata_location'] == 'filesystem':
                return True
        elif 'filesystem' in config:
            return True
        return any(param not in config for param in cls.database_parameters)

    def _init_delta_store(self, config, backend_location):
        """Open (creating if needed) the delta metadata table and cache it on the class."""
        table_path = join(self.config['filesystem'], 'nuthatch_metadata.delta')

        options = None
        if 'filesystem_options' in self.config:
            # Build a new dict: stringifying in place would mutate the caller's config.
            options = {key: str(value) for key, value in self.config['filesystem_options'].items()}

        self.store = 'delta'
        cls = self.__class__
        if backend_location in cls.delta_tables and cls.delta_table_configs[backend_location] == config:
            self.dt = cls.delta_tables[backend_location]
            return

        # Instantiate the metadata store here so that _get_backend_from_metadata() works
        if not DeltaTable.is_deltatable(table_path, storage_options=options):
            print("Instantiating empty delta table.")
            DeltaTable.create(table_path,
                              schema=pa.schema(
                                  [pa.field("cache_key", pa.string()), pa.field("backend", pa.string()),
                                   pa.field("namespace", pa.string()), pa.field("state", pa.string()),
                                   pa.field("last_modified", pa.int64()), pa.field("commit_hash", pa.string()),
                                   pa.field("user", pa.string()), pa.field("path", pa.string())]
                              ),
                              storage_options=options,
                              partition_by="cache_key")

        self.dt = DeltaTable(table_path, storage_options=options)
        cls.delta_tables[backend_location] = self.dt
        cls.delta_table_configs[backend_location] = config

    def _init_database_store(self):
        """Create the SQLAlchemy engine and make sure the metadata table exists."""
        database_url = sqlalchemy.URL.create(self.config['driver'],
                                             username=self.config['username'],
                                             password=self.config['password'],
                                             host=self.config['host'],
                                             port=self.config['port'],
                                             database=self.config['database'])
        self.engine = sqlalchemy.create_engine(database_url)
        metadata = sqlalchemy.MetaData()

        self.db_table = sqlalchemy.Table(
            'nuthatch_metadata', metadata,
            sqlalchemy.Column('cache_key', sqlalchemy.String),
            sqlalchemy.Column('backend', sqlalchemy.String),
            sqlalchemy.Column('namespace', sqlalchemy.String),
            sqlalchemy.Column('state', sqlalchemy.String),
            sqlalchemy.Column('last_modified', sqlalchemy.BigInteger),
            sqlalchemy.Column('commit_hash', sqlalchemy.String),
            sqlalchemy.Column('user', sqlalchemy.String),
            sqlalchemy.Column('path', sqlalchemy.String)
        )

        metadata.create_all(self.engine, checkfirst=True)
        self.store = 'database'

    @staticmethod
    def _sql_literal(value):
        """Render ``value`` as a single-quoted SQL string literal.

        Embedded single quotes are doubled so the value cannot terminate the
        literal early. ``None`` renders as ``'None'``, which matches how the
        namespace is stringified when rows are appended to the delta table.
        """
        return "'" + str(value).replace("'", "''") + "'"

    @staticmethod
    def _glob_to_like(pattern):
        """Convert a glob (``*``, ``?``) into a SQL LIKE pattern (``%``, ``_``).

        NOTE(review): literal ``%`` and ``_`` in the glob are not escaped and
        therefore still act as LIKE wildcards.
        """
        return pattern.replace('*', '%').replace('?', '_')

    @staticmethod
    def _now_micros():
        """Return the current UTC time as integer microseconds since the epoch."""
        # The last_modified column is int64 / BigInteger, so store an int, not a float.
        return int(datetime.datetime.now(datetime.timezone.utc).timestamp() * 1000000)

    @staticmethod
    def _current_commit_sha():
        """Return the HEAD commit hash of the enclosing git repo, or ``'no_git_repo'``."""
        try:
            return git.Repo(search_parent_directories=True).head.object.hexsha
        except (git.exc.InvalidGitRepositoryError, git.exc.NoSuchPathError, ValueError):
            # git.Repo raises rather than returning a falsy value outside a
            # repository; ValueError covers a repository without any commit.
            return 'no_git_repo'

    def _delta_predicate(self, state=None, include_backend=False, pattern=False):
        """Build the WHERE clause identifying this entry's rows in the delta table.

        Args:
            state (str, optional): Restrict to rows in this state.
            include_backend (bool, optional): Restrict to rows of this backend.
            pattern (bool, optional): Match the cache key with LIKE instead of ``=``.

        Returns:
            str: A SQL predicate with every value quoted and escaped.
        """
        key_operator = 'LIKE' if pattern else '='
        clauses = [f"cache_key {key_operator} {self._sql_literal(self.cache_key)}",
                   f"namespace = {self._sql_literal(self.namespace)}"]
        if include_backend:
            if self.backend_name is None:
                # A missing backend is stored as NULL, which "= 'None'" never matches.
                clauses.append("backend IS NULL")
            else:
                clauses.append(f"backend = {self._sql_literal(self.backend_name)}")
        if state:
            clauses.append(f"state = {self._sql_literal(state)}")
        return " AND ".join(clauses)

    def _delta_check_exists(self, state=None, include_backend=False):
        """Check if the metadata exists in the delta table.

        Args:
            state (str, optional): The state of the metadata to check for.
            include_backend (bool, optional): Whether to include the backend in the check.

        Returns:
            bool: True if the metadata exists in the delta table, False otherwise.
        """
        query = "select * from metadata where " + self._delta_predicate(state, include_backend)
        rows = QueryBuilder().register('metadata', self.dt).execute(query).read_all()
        return len(rows) > 0

    def _sql_check_exists(self, state=None, include_backend=False):
        """Check if the metadata exists in the database.

        Args:
            state (str, optional): The state of the metadata to check for.
            include_backend (bool, optional): Whether to include the backend in the check.

        Returns:
            bool: True if the metadata exists in the database, False otherwise.
        """
        table = self.db_table
        statement = (sqlalchemy.select(sqlalchemy.func.count(table.c.cache_key))
                     .where(table.c.cache_key == self.cache_key)
                     .where(table.c.namespace == self.namespace))
        if state:
            statement = statement.where(table.c.state == state)
        if include_backend:
            statement = statement.where(table.c.backend == self.backend_name)

        with self.engine.connect() as conn:
            return conn.execute(statement).scalar() > 0

    def _check_row_exists(self, state=None, include_backend=False):
        """Check if the metadata exists in the database or delta table.

        Args:
            state (str, optional): The state of the metadata to check for.
            include_backend (bool, optional): Whether to include the backend in the check.

        Returns:
            bool: True if the metadata exists in the database or delta table, False otherwise.
        """
        if self.store == 'delta':
            return self._delta_check_exists(state, include_backend)
        return self._sql_check_exists(state, include_backend)

    def _sql_get_row(self, select, include_backend=False):
        """Get rows from the database.

        Args:
            select (str or list): The column(s) to select.
            include_backend (bool, optional): Whether to include the backend in the check.

        Returns:
            list: One dict per matching row, keyed by column name.
        """
        columns = [select] if isinstance(select, str) else list(select)
        table = self.db_table

        # Only a key installed by list() is a pattern; an ordinary key is matched
        # exactly so that '_' and '%' inside it are not treated as wildcards.
        if getattr(self, '_cache_key_is_pattern', False):
            key_filter = table.c.cache_key.like(self.cache_key)
        else:
            key_filter = table.c.cache_key == self.cache_key

        statement = (sqlalchemy.select(*[table.c[name] for name in columns])
                     .where(table.c.namespace == self.namespace)
                     .where(key_filter))
        if include_backend:
            statement = statement.where(table.c.backend == self.backend_name)

        with self.engine.connect() as conn:
            return [dict(row) for row in conn.execute(statement).mappings().all()]

    def _delta_get_row(self, select, include_backend=False):
        """Get rows from the delta table.

        Args:
            select (str or list): The column(s) to select.
            include_backend (bool, optional): Whether to include the backend in the check.

        Returns:
            list: One dict per matching row, keyed by column name.
        """
        columns = [select] if isinstance(select, str) else list(select)
        column_sql = ', '.join(f'"{name}"' for name in columns)
        predicate = self._delta_predicate(include_backend=include_backend,
                                          pattern=getattr(self, '_cache_key_is_pattern', False))

        query = f"select {column_sql} from metadata where {predicate}"
        rows = QueryBuilder().register('metadata', self.dt).execute(query).read_all()
        return rows.to_struct_array().to_pylist()

    def _get_row(self, select, include_backend=False):
        """Get rows from the database or delta table.

        Args:
            select (str or list): The column(s) to select.
            include_backend (bool, optional): Whether to include the backend in the check.

        Returns:
            list: The matching rows from the database or delta table.
        """
        if self.store == 'delta':
            return self._delta_get_row(select, include_backend)
        return self._sql_get_row(select, include_backend)

    def list(self, cache_key):
        """List the metadata rows whose cache key matches a glob.

        The glob is only applied when this cache was created without a cache
        key (or a previous call to list() installed a pattern); otherwise the
        rows for the instance's own cache key are returned.

        Args:
            cache_key (str): Glob pattern, where ``*`` matches any run of
                characters and ``?`` matches a single character.

        Returns:
            list: One dict per matching metadata row.
        """
        # Convert the cache_key glob to valid SQL pattern matching. A pattern
        # left behind by an earlier list() call is replaced, not reused.
        if not self.cache_key or getattr(self, '_cache_key_is_pattern', False):
            self.cache_key = self._glob_to_like(cache_key)
            self._cache_key_is_pattern = True

        return self._get_row(['cache_key',
                              'namespace',
                              'backend',
                              'state',
                              'last_modified',
                              'user',
                              'commit_hash',
                              'path'], include_backend=bool(self.backend_name))

    def is_null(self):
        """Check if the metadata is null.

        Returns:
            bool: True if the metadata is null, False otherwise.
        """
        return self._check_row_exists(state='null', include_backend=False)

    def set_null(self):
        """Set the metadata to null."""
        self._update_metadata_state(state='null')

    def delete_null(self):
        """Delete the metadata that is null."""
        # Deleting a null is really just deleting a metadata
        self._delete_metadata(null=True)

    def _delete_metadata(self, null=False):
        """Delete the metadata from the database or delta table.

        Args:
            null (bool, optional): Whether to delete the metadata that is null.

        Raises:
            RuntimeError: If non-null metadata is deleted without a backend name.
        """
        # Enforced for both stores: without a backend name the database branch
        # would otherwise silently match rows whose backend is NULL.
        if not null and not self.backend_name:
            raise RuntimeError("Can only delete non-null metadata with a valid backend")

        if self.store == 'delta':
            if null:
                predicate = self._delta_predicate(state='null')
            else:
                predicate = self._delta_predicate(include_backend=True)
            self.dt.delete(predicate=predicate)
            return

        table = self.db_table
        statement = (sqlalchemy.delete(table)
                     .where(table.c.cache_key == self.cache_key)
                     .where(table.c.namespace == self.namespace))
        if null:
            statement = statement.where(table.c.state == 'null')
        else:
            statement = statement.where(table.c.backend == self.backend_name)
        with self.engine.connect() as conn:
            conn.execute(statement)
            conn.commit()

    def _metadata_confirmed(self):
        """Check if the metadata is confirmed.

        Returns:
            bool: True if the metadata is confirmed, False otherwise
                (including when there is no backend).
        """
        if not self.backend:
            return False

        return self._check_row_exists(state='confirmed', include_backend=True)

    def _metadata_exists(self):
        """Check if the metadata exists.

        Returns:
            bool: True if the metadata exists, False otherwise.
        """
        return self._check_row_exists(state=None, include_backend=True)

    def _get_backend_from_metadata(self):
        """Get the backend from the metadata.

        Returns:
            The backend of the first matching metadata row, or None if there is no row.
        """
        rows = self._get_row('backend', include_backend=False)
        if not rows:
            return None
        return rows[0]['backend']

    def last_modified(self):
        """Get the last modified time of the metadata.

        Returns:
            The last modified value of the first matching row, or None if there is no row.
        """
        rows = self._get_row('last_modified', include_backend=True)
        if not rows:
            return None
        return rows[0]['last_modified']

    def _set_metadata_pending(self):
        """Set the metadata to pending."""
        self._update_metadata_state(state='pending')

    def _commit_metadata(self):
        """Commit the metadata."""
        self._update_metadata_state(state='confirmed')

    def _update_metadata_state(self, state=None):
        """Update the state of the metadata.

        If metadata doesn't exist, it will be created.

        Args:
            state (str, optional): The state to update the metadata to.
        """
        sha = self._current_commit_sha()
        path = self.backend.get_file_path() if self.backend else 'None'
        row_exists = self._metadata_exists()
        user = getpass.getuser()
        modified = self._now_micros()

        if self.store == 'delta':
            if row_exists:
                values = {'state': state, 'last_modified': modified,
                          'commit_hash': sha, 'user': user, 'path': path}
                # Setting a null state touches every backend's row for this key.
                predicate = self._delta_predicate(include_backend=(state != 'null'))
                self.dt.update(predicate=predicate, new_values=values)
            else:
                df = pd.DataFrame({'cache_key': [self.cache_key],
                                   'namespace': [str(self.namespace)],
                                   'backend': [self.backend_name],
                                   'commit_hash': [sha],
                                   'user': [user],
                                   'path': [path],
                                   'state': [state],
                                   'last_modified': [modified]})
                write_deltalake(self.dt, df, mode='append')
            return

        table = self.db_table
        if row_exists:
            statement = (sqlalchemy.update(table)
                         .where(table.c.cache_key == self.cache_key)
                         .where(table.c.namespace == self.namespace))
            if state != 'null':
                statement = statement.where(table.c.backend == self.backend_name)
            statement = statement.values(state=state, last_modified=modified,
                                         commit_hash=sha, user=user, path=path)
        else:
            statement = sqlalchemy.insert(table).values(state=state,
                                                        last_modified=modified,
                                                        commit_hash=sha, user=user, path=path,
                                                        backend=self.backend_name,
                                                        cache_key=self.cache_key,
                                                        namespace=self.namespace)

        with self.engine.connect() as conn:
            conn.execute(statement)
            conn.commit()

    def get_backend(self):
        """Get the backend name.

        Returns:
            The backend name.

        Raises:
            RuntimeError: If no backend has been initialized.
        """
        if not self.backend:
            raise RuntimeError("Cannot get the backend name of an uninitialized backend")
        return self.backend.backend_name

    def exists(self):
        """Check if the metadata exists, is confirmed, and the data exists in the backend.

        Confirmed metadata whose data is missing from the backend is deleted.

        Returns:
            bool: True if both the confirmed metadata and the data exist, False otherwise.
        """
        if not self.backend:
            # We just aren't initialized yet
            return False
        if not self._metadata_confirmed():
            return False
        if self.backend.exists():
            # Both exist!
            return True

        # The data doesn't exist in the backend, so delete the metadata
        self._delete_metadata()
        return False

    def write(self, ds, upsert=False, primary_keys=None):
        """Write data to the backend.

        First we set the metadata to pending, then we write the data to the backend,
        and then we commit the metadata.

        Args:
            ds (any): The data to write to the backend.
            upsert (bool, optional): Whether to upsert the data.
            primary_keys (list, optional): The primary keys to use for upsert.

        Raises:
            RuntimeError: If no backend has been initialized.
        """
        if not self.backend:
            raise RuntimeError("Cannot write to an uninitialized backend")

        self._set_metadata_pending()
        self.backend.write(ds, upsert, primary_keys)
        self._commit_metadata()

    def read(self, engine=None):
        """Read data from the backend.

        Args:
            engine (str or type, optional): The data processing engine to use for
                reading data from the backend.

        Returns:
            The data from the backend.

        Raises:
            RuntimeError: If no backend has been initialized.
        """
        if not self.backend:
            raise RuntimeError("Cannot read from an uninitialized backend")
        return self.backend.read(engine)

    def delete(self):
        """Delete the metadata and the data from the backend."""
        if self.backend and self.backend.exists():
            self.backend.delete()

        self._delete_metadata()

    def get_file_path(self):
        """Get the file path of the data in the backend.

        Returns:
            The file path of the data in the backend.

        Raises:
            RuntimeError: If no backend has been initialized.
        """
        if not self.backend:
            raise RuntimeError("Cannot get file path for an uninitialized backend")
        return self.backend.get_file_path()

    def sync(self, from_cache):
        """Sync the data from one cache to another.

        Nothing happens when the source cache has no data, or when this cache
        already holds data that is at least as recent as the source.

        Args:
            from_cache (Cache): The cache to sync data from.

        Raises:
            RuntimeError: If no backend config can be found for this location.
        """
        # Both checks always run, this cache first: exists() may clean up stale metadata.
        have_local = self.exists()
        if not from_cache.exists():
            # If it's not in the from cache either don't sync
            return

        if have_local:
            # If we both exist, copy only if our last modified is before the other cache's
            if not self.last_modified() < from_cache.last_modified():
                return
        else:
            # We don't exist at all. Setup backend and write
            backend_class = get_backend_by_name(from_cache.backend_name)
            backend_config = get_config(location=self.location,
                                        requested_parameters=backend_class.config_parameters,
                                        backend_name=backend_class.backend_name)
            if not backend_config:
                raise RuntimeError("Error finding backend config for syncing.")

            self.backend = backend_class(backend_config, self.cache_key, self.namespace, self.args,
                                         copy.deepcopy(self.backend_kwargs))
            self.backend_name = backend_class.backend_name

        self._set_metadata_pending()
        self.backend.sync(from_cache.backend)
        self._commit_metadata()