pixeltable 0.3.14__py3-none-any.whl → 0.4.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +292 -105
- pixeltable/catalog/column.py +10 -8
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/insertable_table.py +25 -20
- pixeltable/catalog/schema_object.py +3 -6
- pixeltable/catalog/table.py +245 -189
- pixeltable/catalog/table_version.py +319 -201
- pixeltable/catalog/table_version_handle.py +15 -2
- pixeltable/catalog/table_version_path.py +60 -21
- pixeltable/catalog/view.py +14 -5
- pixeltable/dataframe.py +11 -9
- pixeltable/env.py +2 -4
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/sql_node.py +20 -11
- pixeltable/exprs/column_property_ref.py +15 -6
- pixeltable/exprs/column_ref.py +32 -11
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/row_builder.py +4 -6
- pixeltable/exprs/rowid_ref.py +8 -0
- pixeltable/exprs/similarity_expr.py +1 -0
- pixeltable/func/query_template_function.py +1 -1
- pixeltable/functions/gemini.py +166 -33
- pixeltable/functions/math.py +63 -0
- pixeltable/functions/string.py +212 -58
- pixeltable/globals.py +7 -4
- pixeltable/index/base.py +5 -0
- pixeltable/index/btree.py +5 -0
- pixeltable/index/embedding_index.py +5 -0
- pixeltable/io/external_store.py +8 -29
- pixeltable/io/label_studio.py +1 -1
- pixeltable/io/parquet.py +4 -4
- pixeltable/io/table_data_conduit.py +0 -31
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_30.py +6 -11
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/util.py +3 -9
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +5 -1
- pixeltable/plan.py +4 -4
- pixeltable/share/packager.py +207 -15
- pixeltable/share/publish.py +2 -2
- pixeltable/store.py +31 -13
- pixeltable/utils/dbms.py +1 -1
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0rc1.dist-info}/METADATA +1 -1
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0rc1.dist-info}/RECORD +50 -49
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0rc1.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0rc1.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0rc1.dist-info}/entry_points.txt +0 -0
pixeltable/catalog/catalog.py
CHANGED
|
@@ -3,8 +3,10 @@ from __future__ import annotations
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
import functools
|
|
5
5
|
import logging
|
|
6
|
+
import random
|
|
6
7
|
import time
|
|
7
|
-
from
|
|
8
|
+
from contextlib import contextmanager
|
|
9
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
|
|
8
10
|
from uuid import UUID
|
|
9
11
|
|
|
10
12
|
import psycopg
|
|
@@ -56,40 +58,60 @@ def _unpack_row(
|
|
|
56
58
|
return result
|
|
57
59
|
|
|
58
60
|
|
|
59
|
-
|
|
61
|
+
# for now, we don't limit the number of retries, because we haven't seen situations where the actual number of retries
|
|
62
|
+
# grows uncontrollably
|
|
63
|
+
_MAX_RETRIES = 0
|
|
64
|
+
|
|
60
65
|
T = TypeVar('T')
|
|
61
66
|
|
|
62
67
|
|
|
63
|
-
def _retry_loop(
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
68
|
+
def _retry_loop(*, for_write: bool) -> Callable[[Callable[..., T]], Callable[..., T]]:
|
|
69
|
+
def decorator(op: Callable[..., T]) -> Callable[..., T]:
|
|
70
|
+
@functools.wraps(op)
|
|
71
|
+
def loop(*args: Any, **kwargs: Any) -> T:
|
|
72
|
+
num_remaining_retries = _MAX_RETRIES
|
|
73
|
+
while True:
|
|
74
|
+
try:
|
|
75
|
+
# in order for retry to work, we need to make sure that there aren't any prior db updates
|
|
76
|
+
# that are part of an ongoing transaction
|
|
77
|
+
assert not Env.get().in_xact
|
|
78
|
+
with Catalog.get().begin_xact(for_write=for_write):
|
|
79
|
+
return op(*args, **kwargs)
|
|
80
|
+
except sql.exc.DBAPIError as e:
|
|
81
|
+
# TODO: what other exceptions should we be looking for?
|
|
82
|
+
if isinstance(e.orig, psycopg.errors.SerializationFailure):
|
|
83
|
+
if num_remaining_retries > 0:
|
|
84
|
+
num_remaining_retries -= 1
|
|
85
|
+
_logger.debug(f'Serialization failure, retrying ({num_remaining_retries} retries left)')
|
|
86
|
+
time.sleep(random.uniform(0.1, 0.5))
|
|
87
|
+
else:
|
|
88
|
+
raise excs.Error(f'Serialization retry limit ({_MAX_RETRIES}) exceeded') from e
|
|
81
89
|
else:
|
|
82
|
-
raise
|
|
83
|
-
|
|
84
|
-
|
|
90
|
+
raise
|
|
91
|
+
|
|
92
|
+
return loop
|
|
85
93
|
|
|
86
|
-
return
|
|
94
|
+
return decorator
|
|
87
95
|
|
|
88
96
|
|
|
89
97
|
class Catalog:
|
|
90
98
|
"""The functional interface to getting access to catalog objects
|
|
91
99
|
|
|
92
|
-
All interface functions must be called in the context of a transaction, started with
|
|
100
|
+
All interface functions must be called in the context of a transaction, started with Catalog.begin_xact().
|
|
101
|
+
|
|
102
|
+
Caching and invalidation of metadata:
|
|
103
|
+
- Catalog caches TableVersion instances in order to avoid excessive metadata loading
|
|
104
|
+
- for any specific table version (ie, combination of id and effective version) there can be only a single
|
|
105
|
+
TableVersion instance in circulation; the reason is that each TV instance has its own store_tbl.sa_tbl, and
|
|
106
|
+
mixing multiple instances of sqlalchemy Table objects in the same query (for the same underlying table) leads to
|
|
107
|
+
duplicate references to that table in the From clause (ie, incorrect Cartesian products)
|
|
108
|
+
- in order to allow multiple concurrent Python processes to perform updates (data and/or schema) against a shared
|
|
109
|
+
Pixeltable instance, Catalog needs to reload metadata from the store when there are changes
|
|
110
|
+
- concurrent changes are detected by comparing TableVersion.version with the stored current version
|
|
111
|
+
(TableMd.current_version)
|
|
112
|
+
- cached live TableVersion instances (those with effective_version == None) are validated against the stored
|
|
113
|
+
metadata on transaction boundaries; this is recorded in TableVersion.is_validated
|
|
114
|
+
- metadata validation is only needed for live TableVersion instances (snapshot instances are immutable)
|
|
93
115
|
"""
|
|
94
116
|
|
|
95
117
|
_instance: Optional[Catalog] = None
|
|
@@ -99,6 +121,8 @@ class Catalog:
|
|
|
99
121
|
# - snapshot versions: records the version of the snapshot
|
|
100
122
|
_tbl_versions: dict[tuple[UUID, Optional[int]], TableVersion]
|
|
101
123
|
_tbls: dict[UUID, Table]
|
|
124
|
+
_in_write_xact: bool # True if we're in a write transaction
|
|
125
|
+
_x_locked_tbl_id: Optional[UUID] # set if begin_xact() was asked to write-lock a table
|
|
102
126
|
|
|
103
127
|
@classmethod
|
|
104
128
|
def get(cls) -> Catalog:
|
|
@@ -109,22 +133,127 @@ class Catalog:
|
|
|
109
133
|
@classmethod
|
|
110
134
|
def clear(cls) -> None:
|
|
111
135
|
"""Remove the instance. Used for testing."""
|
|
136
|
+
# invalidate all existing instances to force reloading of metadata
|
|
137
|
+
for tbl_version in cls._instance._tbl_versions.values():
|
|
138
|
+
# _logger.debug(
|
|
139
|
+
# f'Invalidating table version {tbl_version.id}:{tbl_version.effective_version} ({id(tbl_version):x})'
|
|
140
|
+
# )
|
|
141
|
+
tbl_version.is_validated = False
|
|
112
142
|
cls._instance = None
|
|
113
143
|
|
|
114
144
|
def __init__(self) -> None:
|
|
115
145
|
self._tbl_versions = {}
|
|
116
146
|
self._tbls = {} # don't use a defaultdict here, it doesn't cooperate with the debugger
|
|
147
|
+
self._in_write_xact = False
|
|
148
|
+
self._x_locked_tbl_id = None
|
|
117
149
|
self._init_store()
|
|
118
150
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
151
|
+
def validate(self) -> None:
|
|
152
|
+
"""Validate structural consistency of cached metadata"""
|
|
153
|
+
for (tbl_id, effective_version), tbl_version in self._tbl_versions.items():
|
|
154
|
+
assert tbl_id == tbl_version.id, f'{tbl_id} != {tbl_version.id}'
|
|
155
|
+
assert tbl_version.effective_version == tbl_version.version or tbl_version.effective_version is None, (
|
|
156
|
+
f'{tbl_version.effective_version} != {tbl_version.version} for id {tbl_id}'
|
|
157
|
+
)
|
|
158
|
+
assert effective_version == tbl_version.effective_version, (
|
|
159
|
+
f'{effective_version} != {tbl_version.effective_version} for id {tbl_id}'
|
|
160
|
+
)
|
|
161
|
+
assert len(tbl_version.mutable_views) == 0 or tbl_version.is_mutable, (
|
|
162
|
+
f'snapshot_id={tbl_version.id} mutable_views={tbl_version.mutable_views}'
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
if tbl_version.is_view and tbl_version.is_mutable:
|
|
166
|
+
# make sure this mutable view is recorded in a mutable base
|
|
167
|
+
base = tbl_version.base
|
|
168
|
+
assert base is not None
|
|
169
|
+
if base.effective_version is None:
|
|
170
|
+
assert (base.id, None) in self._tbl_versions
|
|
171
|
+
assert TableVersionHandle.create(tbl_version) in self._tbl_versions[base.id, None].mutable_views
|
|
172
|
+
|
|
173
|
+
if len(tbl_version.mutable_views) > 0:
|
|
174
|
+
# make sure we also loaded mutable view metadata, which is needed to detect column dependencies
|
|
175
|
+
for v in tbl_version.mutable_views:
|
|
176
|
+
assert v.effective_version is None, f'{v.id}:{v.effective_version}'
|
|
177
|
+
|
|
178
|
+
@contextmanager
|
|
179
|
+
def begin_xact(self, *, tbl_id: Optional[UUID] = None, for_write: bool = False) -> Iterator[sql.Connection]:
|
|
180
|
+
"""
|
|
181
|
+
Return a context manager that yields a connection to the database. Idempotent.
|
|
182
|
+
|
|
183
|
+
It is mandatory to call this method, not Env.begin_xact(), if the transaction accesses any table data
|
|
184
|
+
or metadata.
|
|
185
|
+
|
|
186
|
+
Lock acquisition:
|
|
187
|
+
- x-locks Table records by updating Table.lock_dummy
|
|
188
|
+
- this needs to be done in a retry loop, because Postgres can decide to abort the transaction
|
|
189
|
+
(SerializationFailure, LockNotAvailable)
|
|
190
|
+
- for that reason, we do all lock acquisition prior to doing any real work (eg, compute column values),
|
|
191
|
+
to minimize (maybe avoid altogether) losing that work
|
|
192
|
+
"""
|
|
193
|
+
if Env.get().in_xact:
|
|
194
|
+
if tbl_id is not None and for_write:
|
|
195
|
+
# make sure that we requested the required table lock at the beginning of the transaction
|
|
196
|
+
assert tbl_id == self._x_locked_tbl_id, f'{tbl_id} != {self._x_locked_tbl_id}'
|
|
197
|
+
yield Env.get().conn
|
|
198
|
+
return
|
|
199
|
+
|
|
200
|
+
# tv_msg = '\n'.join(
|
|
201
|
+
# [
|
|
202
|
+
# f'{tv.id}:{tv.effective_version} : tv={id(tv):x} sa_tbl={id(tv.store_tbl.sa_tbl):x}'
|
|
203
|
+
# for tv in self._tbl_versions.values()
|
|
204
|
+
# ]
|
|
205
|
+
# )
|
|
206
|
+
# _logger.debug(f'begin_xact(): {tv_msg}')
|
|
207
|
+
num_retries = 0
|
|
208
|
+
while True:
|
|
209
|
+
try:
|
|
210
|
+
with Env.get().begin_xact() as conn:
|
|
211
|
+
if tbl_id is not None and for_write:
|
|
212
|
+
# X-lock Table record
|
|
213
|
+
conn.execute(
|
|
214
|
+
sql.select(schema.Table).where(schema.Table.id == tbl_id).with_for_update(nowait=True)
|
|
215
|
+
)
|
|
216
|
+
conn.execute(sql.update(schema.Table).values(lock_dummy=1).where(schema.Table.id == tbl_id))
|
|
217
|
+
self._x_locked_tbl_id = tbl_id
|
|
218
|
+
|
|
219
|
+
self._in_write_xact = for_write
|
|
220
|
+
yield conn
|
|
221
|
+
return
|
|
222
|
+
except sql.exc.DBAPIError as e:
|
|
223
|
+
if isinstance(e.orig, (psycopg.errors.SerializationFailure, psycopg.errors.LockNotAvailable)) and (
|
|
224
|
+
num_retries < _MAX_RETRIES or _MAX_RETRIES == 0
|
|
225
|
+
):
|
|
226
|
+
num_retries += 1
|
|
227
|
+
_logger.debug(f'Retrying ({num_retries}) after {type(e.orig)}')
|
|
228
|
+
time.sleep(random.uniform(0.1, 0.5))
|
|
229
|
+
else:
|
|
230
|
+
raise
|
|
231
|
+
finally:
|
|
232
|
+
self._in_write_xact = False
|
|
233
|
+
self._x_locked_tbl_id = None
|
|
234
|
+
|
|
235
|
+
# invalidate cached current TableVersion instances
|
|
236
|
+
for tv in self._tbl_versions.values():
|
|
237
|
+
if tv.effective_version is None:
|
|
238
|
+
_logger.debug(f'invalidating table version {tv.id}:None (tv={id(tv):x})')
|
|
239
|
+
tv.is_validated = False
|
|
240
|
+
|
|
241
|
+
if _logger.isEnabledFor(logging.DEBUG):
|
|
242
|
+
self.validate()
|
|
243
|
+
|
|
244
|
+
@property
|
|
245
|
+
def in_write_xact(self) -> bool:
|
|
246
|
+
return self._in_write_xact
|
|
247
|
+
|
|
248
|
+
def _acquire_dir_xlock(self, parent_id: Optional[UUID], dir_id: Optional[UUID], dir_name: Optional[str]) -> None:
|
|
249
|
+
"""Force acquisition of an X-lock on a Dir record via a blind update.
|
|
250
|
+
|
|
122
251
|
If dir_id is present, then all other conditions are ignored.
|
|
123
252
|
Note that (parent_id==None) is a valid where condition.
|
|
124
253
|
If dir_id is not specified, the user from the environment is added to the directory filters.
|
|
125
254
|
"""
|
|
126
255
|
user = Env.get().user
|
|
127
|
-
|
|
256
|
+
assert self._in_write_xact
|
|
128
257
|
q = sql.update(schema.Dir).values(lock_dummy=1)
|
|
129
258
|
if dir_id is not None:
|
|
130
259
|
q = q.where(schema.Dir.id == dir_id)
|
|
@@ -134,7 +263,7 @@ class Catalog:
|
|
|
134
263
|
q = q.where(schema.Dir.md['name'].astext == dir_name)
|
|
135
264
|
if user is not None:
|
|
136
265
|
q = q.where(schema.Dir.md['user'].astext == user)
|
|
137
|
-
conn.execute(q)
|
|
266
|
+
Env.get().conn.execute(q)
|
|
138
267
|
|
|
139
268
|
def get_dir_path(self, dir_id: UUID) -> Path:
|
|
140
269
|
"""Return path for directory with given id"""
|
|
@@ -156,7 +285,7 @@ class Catalog:
|
|
|
156
285
|
dir_entries: dict[str, Catalog.DirEntry]
|
|
157
286
|
table: Optional[schema.Table]
|
|
158
287
|
|
|
159
|
-
@_retry_loop
|
|
288
|
+
@_retry_loop(for_write=False)
|
|
160
289
|
def get_dir_contents(self, dir_path: Path, recursive: bool = False) -> dict[str, DirEntry]:
|
|
161
290
|
dir = self._get_schema_object(dir_path, expected=Dir, raise_if_not_exists=True)
|
|
162
291
|
return self._get_dir_contents(dir._id, recursive=recursive)
|
|
@@ -183,7 +312,7 @@ class Catalog:
|
|
|
183
312
|
|
|
184
313
|
return result
|
|
185
314
|
|
|
186
|
-
@_retry_loop
|
|
315
|
+
@_retry_loop(for_write=True)
|
|
187
316
|
def move(self, path: Path, new_path: Path) -> None:
|
|
188
317
|
self._move(path, new_path)
|
|
189
318
|
|
|
@@ -272,7 +401,7 @@ class Catalog:
|
|
|
272
401
|
|
|
273
402
|
# check for subdirectory
|
|
274
403
|
if for_update:
|
|
275
|
-
self.
|
|
404
|
+
self._acquire_dir_xlock(dir_id, None, name)
|
|
276
405
|
q = sql.select(schema.Dir).where(
|
|
277
406
|
schema.Dir.parent_id == dir_id, schema.Dir.md['name'].astext == name, schema.Dir.md['user'].astext == user
|
|
278
407
|
)
|
|
@@ -296,7 +425,7 @@ class Catalog:
|
|
|
296
425
|
tbl_id = conn.execute(q).scalar_one_or_none()
|
|
297
426
|
if tbl_id is not None:
|
|
298
427
|
if tbl_id not in self._tbls:
|
|
299
|
-
|
|
428
|
+
_ = self._load_tbl(tbl_id)
|
|
300
429
|
return self._tbls[tbl_id]
|
|
301
430
|
|
|
302
431
|
return None
|
|
@@ -349,10 +478,15 @@ class Catalog:
|
|
|
349
478
|
tbl = self._load_tbl(tbl_id)
|
|
350
479
|
if tbl is None:
|
|
351
480
|
return None
|
|
352
|
-
|
|
481
|
+
# if this is a mutable table, we also need to have its mutable views loaded, in order to track column
|
|
482
|
+
# dependencies
|
|
483
|
+
tbl_version = tbl._tbl_version.get()
|
|
484
|
+
if tbl_version.is_mutable:
|
|
485
|
+
for v in tbl_version.mutable_views:
|
|
486
|
+
_ = self.get_table_by_id(v.id)
|
|
353
487
|
return self._tbls[tbl_id]
|
|
354
488
|
|
|
355
|
-
@_retry_loop
|
|
489
|
+
@_retry_loop(for_write=True)
|
|
356
490
|
def create_table(
|
|
357
491
|
self,
|
|
358
492
|
path: Path,
|
|
@@ -385,7 +519,7 @@ class Catalog:
|
|
|
385
519
|
self._tbls[tbl._id] = tbl
|
|
386
520
|
return tbl
|
|
387
521
|
|
|
388
|
-
@_retry_loop
|
|
522
|
+
@_retry_loop(for_write=True)
|
|
389
523
|
def create_view(
|
|
390
524
|
self,
|
|
391
525
|
path: Path,
|
|
@@ -431,14 +565,17 @@ class Catalog:
|
|
|
431
565
|
self._tbls[view._id] = view
|
|
432
566
|
return view
|
|
433
567
|
|
|
434
|
-
@_retry_loop
|
|
568
|
+
@_retry_loop(for_write=True)
|
|
435
569
|
def create_replica(
|
|
436
570
|
self, path: Path, md: list[schema.FullTableMd], if_exists: IfExistsParam = IfExistsParam.ERROR
|
|
437
|
-
) ->
|
|
571
|
+
) -> None:
|
|
438
572
|
"""
|
|
439
573
|
Creates table, table_version, and table_schema_version records for a replica with the given metadata.
|
|
440
574
|
The metadata should be presented in standard "ancestor order", with the table being replicated at
|
|
441
575
|
list position 0 and the (root) base table at list position -1.
|
|
576
|
+
|
|
577
|
+
TODO: create_replica() also needs to create the store tables and populate them in order to make
|
|
578
|
+
replica creation atomic.
|
|
442
579
|
"""
|
|
443
580
|
tbl_id = UUID(md[0].tbl_md.tbl_id)
|
|
444
581
|
|
|
@@ -451,20 +588,19 @@ class Catalog:
|
|
|
451
588
|
'but a different table already exists at that location.'
|
|
452
589
|
)
|
|
453
590
|
assert isinstance(existing, View)
|
|
454
|
-
return
|
|
591
|
+
return
|
|
455
592
|
|
|
456
593
|
# Ensure that the system directory exists.
|
|
457
594
|
self._create_dir(Path('_system', allow_system_paths=True), if_exists=IfExistsParam.IGNORE, parents=False)
|
|
458
595
|
|
|
459
596
|
# Now check to see if this table already exists in the catalog.
|
|
460
|
-
# TODO: Handle concurrency in create_replica()
|
|
461
597
|
existing = Catalog.get().get_table_by_id(tbl_id)
|
|
462
598
|
if existing is not None:
|
|
463
|
-
existing_path = Path(existing._path, allow_system_paths=True)
|
|
599
|
+
existing_path = Path(existing._path(), allow_system_paths=True)
|
|
464
600
|
# It does exist. If it's a non-system table, that's an error: it's already been replicated.
|
|
465
601
|
if not existing_path.is_system_path:
|
|
466
602
|
raise excs.Error(
|
|
467
|
-
f'That table has already been replicated as {existing._path!r}. \n'
|
|
603
|
+
f'That table has already been replicated as {existing._path()!r}. \n'
|
|
468
604
|
f'Drop the existing replica if you wish to re-create it.'
|
|
469
605
|
)
|
|
470
606
|
# If it's a system table, then this means it was created at some point as the ancestor of some other
|
|
@@ -489,22 +625,20 @@ class Catalog:
|
|
|
489
625
|
# The table already exists in the catalog. The existing path might be a system path (if the table
|
|
490
626
|
# was created as an anonymous base table of some other table), or it might not (if it's a snapshot
|
|
491
627
|
# that was directly replicated by the user at some point). In either case, use the existing path.
|
|
492
|
-
replica_path = Path(replica._path, allow_system_paths=True)
|
|
628
|
+
replica_path = Path(replica._path(), allow_system_paths=True)
|
|
493
629
|
|
|
494
630
|
# Store the metadata; it could be a new version (in which case a new record will be created) or a
|
|
495
631
|
# known version (in which case the newly received metadata will be validated as identical).
|
|
496
632
|
self.__store_replica_md(replica_path, ancestor_md)
|
|
497
633
|
|
|
498
|
-
#
|
|
499
|
-
#
|
|
500
|
-
self._tbls[tbl_id] = self._load_tbl(tbl_id)
|
|
501
|
-
return self._tbls[tbl_id]
|
|
634
|
+
# don't create TableVersion instances at this point, they would be superseded by calls to TV.create_replica()
|
|
635
|
+
# in TableRestorer.restore()
|
|
502
636
|
|
|
503
637
|
def __store_replica_md(self, path: Path, md: schema.FullTableMd) -> None:
|
|
504
638
|
_logger.info(f'Creating replica table at {path!r} with ID: {md.tbl_md.tbl_id}')
|
|
505
|
-
# TODO: Handle concurrency
|
|
506
639
|
dir = self._get_schema_object(path.parent, expected=Dir, raise_if_not_exists=True)
|
|
507
640
|
assert dir is not None
|
|
641
|
+
assert self._in_write_xact
|
|
508
642
|
|
|
509
643
|
conn = Env.get().conn
|
|
510
644
|
tbl_id = md.tbl_md.tbl_id
|
|
@@ -582,14 +716,24 @@ class Catalog:
|
|
|
582
716
|
|
|
583
717
|
self.store_tbl_md(UUID(tbl_id), new_tbl_md, new_version_md, new_schema_version_md)
|
|
584
718
|
|
|
585
|
-
@_retry_loop
|
|
719
|
+
@_retry_loop(for_write=False)
|
|
586
720
|
def get_table(self, path: Path) -> Table:
|
|
721
|
+
obj = self._get_table(path)
|
|
722
|
+
return obj
|
|
723
|
+
|
|
724
|
+
def _get_table(self, path: Path) -> Table:
|
|
587
725
|
obj = Catalog.get()._get_schema_object(path, expected=Table, raise_if_not_exists=True)
|
|
588
726
|
assert isinstance(obj, Table)
|
|
589
|
-
obj._tbl_version.get()
|
|
727
|
+
tbl_version = obj._tbl_version.get()
|
|
728
|
+
# TODO: instead of calling this here, move the logic into TableVersion.init(), which is called after
|
|
729
|
+
# registering the instance in _tbl_versions
|
|
730
|
+
tbl_version.ensure_md_loaded()
|
|
731
|
+
# if this table has mutable views, we need to load those as well, in order to record column dependencies
|
|
732
|
+
for v in tbl_version.mutable_views:
|
|
733
|
+
self.get_table_by_id(v.id)
|
|
590
734
|
return obj
|
|
591
735
|
|
|
592
|
-
@_retry_loop
|
|
736
|
+
@_retry_loop(for_write=True)
|
|
593
737
|
def drop_table(self, path: Path, if_not_exists: IfNotExistsParam, force: bool) -> None:
|
|
594
738
|
_, _, src_obj = self._prepare_dir_op(
|
|
595
739
|
drop_dir_path=path.parent,
|
|
@@ -621,11 +765,11 @@ class Catalog:
|
|
|
621
765
|
msg: str
|
|
622
766
|
if is_replace:
|
|
623
767
|
msg = (
|
|
624
|
-
f'{obj_type_str} {tbl._path} already exists and has dependents. '
|
|
768
|
+
f'{obj_type_str} {tbl._path()} already exists and has dependents. '
|
|
625
769
|
"Use `if_exists='replace_force'` to replace it."
|
|
626
770
|
)
|
|
627
771
|
else:
|
|
628
|
-
msg = f'{obj_type_str} {tbl._path} has dependents.'
|
|
772
|
+
msg = f'{obj_type_str} {tbl._path()} has dependents.'
|
|
629
773
|
raise excs.Error(msg)
|
|
630
774
|
|
|
631
775
|
for view_id in view_ids:
|
|
@@ -636,9 +780,9 @@ class Catalog:
|
|
|
636
780
|
tbl._drop()
|
|
637
781
|
assert tbl._id in self._tbls
|
|
638
782
|
del self._tbls[tbl._id]
|
|
639
|
-
_logger.info(f'Dropped table `{tbl._path}`.')
|
|
783
|
+
_logger.info(f'Dropped table `{tbl._path()}`.')
|
|
640
784
|
|
|
641
|
-
@_retry_loop
|
|
785
|
+
@_retry_loop(for_write=True)
|
|
642
786
|
def create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:
|
|
643
787
|
return self._create_dir(path, if_exists, parents)
|
|
644
788
|
|
|
@@ -673,7 +817,7 @@ class Catalog:
|
|
|
673
817
|
Env.get().console_logger.info(f'Created directory {str(path)!r}.')
|
|
674
818
|
return dir
|
|
675
819
|
|
|
676
|
-
@_retry_loop
|
|
820
|
+
@_retry_loop(for_write=True)
|
|
677
821
|
def drop_dir(self, path: Path, if_not_exists: IfNotExistsParam, force: bool) -> None:
|
|
678
822
|
_, _, schema_obj = self._prepare_dir_op(
|
|
679
823
|
drop_dir_path=path.parent,
|
|
@@ -698,7 +842,7 @@ class Catalog:
|
|
|
698
842
|
raise excs.Error(f'Directory {str(dir_path)!r} is not empty.')
|
|
699
843
|
|
|
700
844
|
# drop existing subdirs
|
|
701
|
-
self.
|
|
845
|
+
self._acquire_dir_xlock(dir_id, None, None)
|
|
702
846
|
dir_q = sql.select(schema.Dir).where(schema.Dir.parent_id == dir_id)
|
|
703
847
|
for row in conn.execute(dir_q).all():
|
|
704
848
|
self._drop_dir(row.id, dir_path.append(row.md['name']), force=True)
|
|
@@ -725,17 +869,37 @@ class Catalog:
|
|
|
725
869
|
return result
|
|
726
870
|
|
|
727
871
|
def get_tbl_version(self, tbl_id: UUID, effective_version: Optional[int]) -> Optional[TableVersion]:
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
872
|
+
# we need a transaction here, if we're not already in one; if this starts a new transaction,
|
|
873
|
+
# the returned TableVersion instance will not be validated
|
|
874
|
+
with self.begin_xact(tbl_id=tbl_id, for_write=False) as conn:
|
|
875
|
+
tv = self._tbl_versions.get((tbl_id, effective_version))
|
|
876
|
+
if tv is None:
|
|
877
|
+
tv = self._load_tbl_version(tbl_id, effective_version)
|
|
878
|
+
elif not tv.is_validated:
|
|
879
|
+
# only live instances are invalidated
|
|
880
|
+
assert effective_version is None
|
|
881
|
+
# we validate live instances by comparing our cached version number to the stored current version
|
|
882
|
+
# _logger.debug(f'validating metadata for table {tbl_id}:{tv.version} ({id(tv):x})')
|
|
883
|
+
q = sql.select(schema.Table.md).where(schema.Table.id == tbl_id)
|
|
884
|
+
row = conn.execute(q).one()
|
|
885
|
+
current_version = row.md['current_version']
|
|
886
|
+
|
|
887
|
+
# the stored version can be behind TableVersion.version, because we don't roll back the in-memory
|
|
888
|
+
# metadata changes after a failed update operation
|
|
889
|
+
if current_version != tv.version:
|
|
890
|
+
# the cached metadata is invalid
|
|
891
|
+
_logger.debug(
|
|
892
|
+
f'reloading metadata for table {tbl_id} '
|
|
893
|
+
f'(cached version: {tv.version}, current version: {current_version}'
|
|
894
|
+
# f', id: {id(tv):x})'
|
|
895
|
+
)
|
|
896
|
+
tv = self._load_tbl_version(tbl_id, None)
|
|
897
|
+
else:
|
|
898
|
+
# the cached metadata is valid
|
|
899
|
+
tv.is_validated = True
|
|
900
|
+
|
|
901
|
+
assert tv.is_validated
|
|
902
|
+
return tv
|
|
739
903
|
|
|
740
904
|
def remove_tbl_version(self, tbl_version: TableVersion) -> None:
|
|
741
905
|
assert (tbl_version.id, tbl_version.effective_version) in self._tbl_versions
|
|
@@ -745,7 +909,7 @@ class Catalog:
|
|
|
745
909
|
"""Return the Dir with the given id, or None if it doesn't exist"""
|
|
746
910
|
conn = Env.get().conn
|
|
747
911
|
if for_update:
|
|
748
|
-
self.
|
|
912
|
+
self._acquire_dir_xlock(None, dir_id, None)
|
|
749
913
|
q = sql.select(schema.Dir).where(schema.Dir.id == dir_id)
|
|
750
914
|
row = conn.execute(q).one_or_none()
|
|
751
915
|
if row is None:
|
|
@@ -761,7 +925,7 @@ class Catalog:
|
|
|
761
925
|
conn = Env.get().conn
|
|
762
926
|
if path.is_root:
|
|
763
927
|
if for_update:
|
|
764
|
-
self.
|
|
928
|
+
self._acquire_dir_xlock(parent_id=None, dir_id=None, dir_name='')
|
|
765
929
|
q = sql.select(schema.Dir).where(schema.Dir.parent_id.is_(None), schema.Dir.md['user'].astext == user)
|
|
766
930
|
row = conn.execute(q).one_or_none()
|
|
767
931
|
return schema.Dir(**row._mapping) if row is not None else None
|
|
@@ -770,7 +934,7 @@ class Catalog:
|
|
|
770
934
|
if parent_dir is None:
|
|
771
935
|
return None
|
|
772
936
|
if for_update:
|
|
773
|
-
self.
|
|
937
|
+
self._acquire_dir_xlock(parent_id=parent_dir.id, dir_id=None, dir_name=path.name)
|
|
774
938
|
q = sql.select(schema.Dir).where(
|
|
775
939
|
schema.Dir.parent_id == parent_dir.id,
|
|
776
940
|
schema.Dir.md['name'].astext == path.name,
|
|
@@ -780,6 +944,7 @@ class Catalog:
|
|
|
780
944
|
return schema.Dir(**row._mapping) if row is not None else None
|
|
781
945
|
|
|
782
946
|
def _load_tbl(self, tbl_id: UUID) -> Optional[Table]:
|
|
947
|
+
"""Loads metadata for the table with the given id and caches it."""
|
|
783
948
|
_logger.info(f'Loading table {tbl_id}')
|
|
784
949
|
from .insertable_table import InsertableTable
|
|
785
950
|
from .view import View
|
|
@@ -808,8 +973,9 @@ class Catalog:
|
|
|
808
973
|
if view_md is None:
|
|
809
974
|
# this is a base table
|
|
810
975
|
if (tbl_id, None) not in self._tbl_versions:
|
|
811
|
-
|
|
976
|
+
_ = self._load_tbl_version(tbl_id, None)
|
|
812
977
|
tbl = InsertableTable(tbl_record.dir_id, TableVersionHandle(tbl_id, None))
|
|
978
|
+
self._tbls[tbl_id] = tbl
|
|
813
979
|
return tbl
|
|
814
980
|
|
|
815
981
|
# this is a view; determine the sequence of TableVersions to load
|
|
@@ -829,18 +995,18 @@ class Catalog:
|
|
|
829
995
|
view_path: Optional[TableVersionPath] = None
|
|
830
996
|
for id, effective_version in tbl_version_path[::-1]:
|
|
831
997
|
if (id, effective_version) not in self._tbl_versions:
|
|
832
|
-
|
|
998
|
+
_ = self._load_tbl_version(id, effective_version)
|
|
833
999
|
view_path = TableVersionPath(TableVersionHandle(id, effective_version), base=base_path)
|
|
834
1000
|
base_path = view_path
|
|
835
1001
|
view = View(tbl_id, tbl_record.dir_id, tbl_md.name, view_path, snapshot_only=pure_snapshot)
|
|
836
|
-
|
|
1002
|
+
self._tbls[tbl_id] = view
|
|
837
1003
|
return view
|
|
838
1004
|
|
|
839
1005
|
def load_tbl_md(self, tbl_id: UUID, effective_version: Optional[int]) -> schema.FullTableMd:
|
|
840
1006
|
"""
|
|
841
1007
|
Loads metadata from the store for a given table UUID and version.
|
|
842
1008
|
"""
|
|
843
|
-
_logger.info(f'Loading metadata for table version: {tbl_id}:{effective_version}')
|
|
1009
|
+
# _logger.info(f'Loading metadata for table version: {tbl_id}:{effective_version}')
|
|
844
1010
|
conn = Env.get().conn
|
|
845
1011
|
|
|
846
1012
|
q = (
|
|
@@ -915,8 +1081,15 @@ class Catalog:
|
|
|
915
1081
|
If inserting `version_md` or `schema_version_md` would be a primary key violation, an exception will be raised.
|
|
916
1082
|
"""
|
|
917
1083
|
conn = Env.get().conn
|
|
1084
|
+
assert self._in_write_xact
|
|
918
1085
|
|
|
919
1086
|
if tbl_md is not None:
|
|
1087
|
+
assert tbl_md.tbl_id == str(tbl_id)
|
|
1088
|
+
if version_md is not None:
|
|
1089
|
+
assert tbl_md.current_version == version_md.version
|
|
1090
|
+
assert tbl_md.current_schema_version == version_md.schema_version
|
|
1091
|
+
if schema_version_md is not None:
|
|
1092
|
+
assert tbl_md.current_schema_version == schema_version_md.schema_version
|
|
920
1093
|
result = conn.execute(
|
|
921
1094
|
sql.update(schema.Table.__table__)
|
|
922
1095
|
.values({schema.Table.md: dataclasses.asdict(tbl_md)})
|
|
@@ -925,6 +1098,9 @@ class Catalog:
|
|
|
925
1098
|
assert result.rowcount == 1, result.rowcount
|
|
926
1099
|
|
|
927
1100
|
if version_md is not None:
|
|
1101
|
+
assert version_md.tbl_id == str(tbl_id)
|
|
1102
|
+
if schema_version_md is not None:
|
|
1103
|
+
assert version_md.schema_version == schema_version_md.schema_version
|
|
928
1104
|
conn.execute(
|
|
929
1105
|
sql.insert(schema.TableVersion.__table__).values(
|
|
930
1106
|
tbl_id=tbl_id, version=version_md.version, md=dataclasses.asdict(version_md)
|
|
@@ -932,6 +1108,7 @@ class Catalog:
|
|
|
932
1108
|
)
|
|
933
1109
|
|
|
934
1110
|
if schema_version_md is not None:
|
|
1111
|
+
assert schema_version_md.tbl_id == str(tbl_id)
|
|
935
1112
|
conn.execute(
|
|
936
1113
|
sql.insert(schema.TableSchemaVersion.__table__).values(
|
|
937
1114
|
tbl_id=tbl_id,
|
|
@@ -978,50 +1155,60 @@ class Catalog:
|
|
|
978
1155
|
return md
|
|
979
1156
|
|
|
980
1157
|
def _load_tbl_version(self, tbl_id: UUID, effective_version: Optional[int]) -> Optional[TableVersion]:
|
|
1158
|
+
"""Creates TableVersion instance from stored metadata and registers it in _tbl_versions."""
|
|
981
1159
|
tbl_md, _, schema_version_md = self.load_tbl_md(tbl_id, effective_version)
|
|
982
1160
|
view_md = tbl_md.view_md
|
|
983
1161
|
|
|
984
|
-
_logger.info(f'Loading table version: {tbl_id}:{effective_version}')
|
|
985
1162
|
conn = Env.get().conn
|
|
986
1163
|
|
|
987
|
-
# load mutable view ids
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
1164
|
+
# load mutable view ids for mutable TableVersions
|
|
1165
|
+
mutable_view_ids: list[UUID] = []
|
|
1166
|
+
# If this is a replica, effective_version should not be None. We see this today, because
|
|
1167
|
+
# the replica's TV instance's Column instances contain value_expr_dicts that reference the live version.
|
|
1168
|
+
# This is presumably a source of bugs, because it ignores schema version changes (eg, column renames).
|
|
1169
|
+
# TODO: retarget the value_expr_dict when instantiating Columns for a particular TV instance.
|
|
1170
|
+
if effective_version is None and not tbl_md.is_replica:
|
|
1171
|
+
q = sql.select(schema.Table.id).where(
|
|
1172
|
+
sql.text(
|
|
1173
|
+
f"md->'view_md'->'base_versions'->0->>0 = {tbl_id.hex!r} "
|
|
1174
|
+
"AND md->'view_md'->'base_versions'->0->>1 IS NULL"
|
|
1175
|
+
)
|
|
992
1176
|
)
|
|
993
|
-
|
|
994
|
-
mutable_view_ids = [r[0] for r in conn.execute(q).all()]
|
|
1177
|
+
mutable_view_ids = [r[0] for r in conn.execute(q).all()]
|
|
995
1178
|
mutable_views = [TableVersionHandle(id, None) for id in mutable_view_ids]
|
|
996
1179
|
|
|
1180
|
+
tbl_version: TableVersion
|
|
997
1181
|
if view_md is None:
|
|
998
1182
|
# this is a base table
|
|
999
1183
|
tbl_version = TableVersion(
|
|
1000
1184
|
tbl_id, tbl_md, effective_version, schema_version_md, mutable_views=mutable_views
|
|
1001
1185
|
)
|
|
1002
|
-
|
|
1186
|
+
else:
|
|
1187
|
+
assert len(view_md.base_versions) > 0 # a view needs to have a base
|
|
1188
|
+
pure_snapshot = view_md.is_snapshot and view_md.predicate is None and len(schema_version_md.columns) == 0
|
|
1189
|
+
assert not pure_snapshot # a pure snapshot doesn't have a physical table backing it, no point in loading it
|
|
1190
|
+
|
|
1191
|
+
base: TableVersionHandle
|
|
1192
|
+
base_path: Optional[TableVersionPath] = None # needed for live view
|
|
1193
|
+
if view_md.is_snapshot:
|
|
1194
|
+
base = TableVersionHandle(UUID(view_md.base_versions[0][0]), view_md.base_versions[0][1])
|
|
1195
|
+
else:
|
|
1196
|
+
base_path = TableVersionPath.from_md(tbl_md.view_md.base_versions)
|
|
1197
|
+
base = base_path.tbl_version
|
|
1003
1198
|
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1199
|
+
tbl_version = TableVersion(
|
|
1200
|
+
tbl_id,
|
|
1201
|
+
tbl_md,
|
|
1202
|
+
effective_version,
|
|
1203
|
+
schema_version_md,
|
|
1204
|
+
base_path=base_path,
|
|
1205
|
+
base=base,
|
|
1206
|
+
mutable_views=mutable_views,
|
|
1207
|
+
)
|
|
1208
|
+
|
|
1209
|
+
self._tbl_versions[tbl_id, effective_version] = tbl_version
|
|
1210
|
+
tbl_version.init()
|
|
1007
1211
|
|
|
1008
|
-
base: TableVersionHandle
|
|
1009
|
-
base_path: Optional[TableVersionPath] = None # needed for live view
|
|
1010
|
-
if view_md.is_snapshot:
|
|
1011
|
-
base = TableVersionHandle(UUID(view_md.base_versions[0][0]), view_md.base_versions[0][1])
|
|
1012
|
-
else:
|
|
1013
|
-
base_path = TableVersionPath.from_md(tbl_md.view_md.base_versions)
|
|
1014
|
-
base = base_path.tbl_version
|
|
1015
|
-
|
|
1016
|
-
tbl_version = TableVersion(
|
|
1017
|
-
tbl_id,
|
|
1018
|
-
tbl_md,
|
|
1019
|
-
effective_version,
|
|
1020
|
-
schema_version_md,
|
|
1021
|
-
base_path=base_path,
|
|
1022
|
-
base=base,
|
|
1023
|
-
mutable_views=mutable_views,
|
|
1024
|
-
)
|
|
1025
1212
|
return tbl_version
|
|
1026
1213
|
|
|
1027
1214
|
def _init_store(self) -> None:
|