pixeltable 0.3.15__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +296 -105
- pixeltable/catalog/column.py +10 -8
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/insertable_table.py +25 -20
- pixeltable/catalog/schema_object.py +3 -6
- pixeltable/catalog/table.py +261 -189
- pixeltable/catalog/table_version.py +333 -202
- pixeltable/catalog/table_version_handle.py +15 -2
- pixeltable/catalog/table_version_path.py +60 -14
- pixeltable/catalog/view.py +38 -6
- pixeltable/dataframe.py +196 -18
- pixeltable/env.py +4 -4
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/expr_eval/evaluators.py +4 -1
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/sql_node.py +171 -22
- pixeltable/exprs/column_property_ref.py +15 -6
- pixeltable/exprs/column_ref.py +32 -11
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/data_row.py +5 -3
- pixeltable/exprs/expr.py +7 -0
- pixeltable/exprs/literal.py +2 -0
- pixeltable/exprs/row_builder.py +4 -6
- pixeltable/exprs/rowid_ref.py +8 -0
- pixeltable/exprs/similarity_expr.py +1 -0
- pixeltable/func/query_template_function.py +1 -1
- pixeltable/func/tools.py +1 -1
- pixeltable/functions/gemini.py +0 -1
- pixeltable/functions/string.py +212 -58
- pixeltable/globals.py +12 -4
- pixeltable/index/base.py +5 -0
- pixeltable/index/btree.py +5 -0
- pixeltable/index/embedding_index.py +5 -0
- pixeltable/io/external_store.py +8 -29
- pixeltable/io/label_studio.py +1 -1
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +0 -31
- pixeltable/metadata/__init__.py +11 -2
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_30.py +6 -11
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/util.py +3 -9
- pixeltable/metadata/notes.py +2 -0
- pixeltable/metadata/schema.py +8 -1
- pixeltable/plan.py +221 -14
- pixeltable/share/packager.py +137 -13
- pixeltable/share/publish.py +2 -2
- pixeltable/store.py +19 -13
- pixeltable/utils/dbms.py +1 -1
- pixeltable/utils/formatter.py +64 -42
- pixeltable/utils/sample.py +25 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA +2 -1
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/RECORD +58 -55
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/entry_points.txt +0 -0
pixeltable/catalog/catalog.py
CHANGED
|
@@ -3,8 +3,10 @@ from __future__ import annotations
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
import functools
|
|
5
5
|
import logging
|
|
6
|
+
import random
|
|
6
7
|
import time
|
|
7
|
-
from
|
|
8
|
+
from contextlib import contextmanager
|
|
9
|
+
from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
|
|
8
10
|
from uuid import UUID
|
|
9
11
|
|
|
10
12
|
import psycopg
|
|
@@ -15,6 +17,8 @@ from pixeltable.env import Env
|
|
|
15
17
|
from pixeltable.iterators import ComponentIterator
|
|
16
18
|
from pixeltable.metadata import schema
|
|
17
19
|
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from pixeltable.plan import SampleClause
|
|
18
22
|
from .dir import Dir
|
|
19
23
|
from .globals import IfExistsParam, IfNotExistsParam, MediaValidation
|
|
20
24
|
from .insertable_table import InsertableTable
|
|
@@ -56,40 +60,60 @@ def _unpack_row(
|
|
|
56
60
|
return result
|
|
57
61
|
|
|
58
62
|
|
|
59
|
-
|
|
63
|
+
# for now, we don't limit the number of retries, because we haven't seen situations where the actual number of retries
|
|
64
|
+
# grows uncontrollably
|
|
65
|
+
_MAX_RETRIES = 0
|
|
66
|
+
|
|
60
67
|
T = TypeVar('T')
|
|
61
68
|
|
|
62
69
|
|
|
63
|
-
def _retry_loop(
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
70
|
+
def _retry_loop(*, for_write: bool) -> Callable[[Callable[..., T]], Callable[..., T]]:
|
|
71
|
+
def decorator(op: Callable[..., T]) -> Callable[..., T]:
|
|
72
|
+
@functools.wraps(op)
|
|
73
|
+
def loop(*args: Any, **kwargs: Any) -> T:
|
|
74
|
+
num_remaining_retries = _MAX_RETRIES
|
|
75
|
+
while True:
|
|
76
|
+
try:
|
|
77
|
+
# in order for retry to work, we need to make sure that there aren't any prior db updates
|
|
78
|
+
# that are part of an ongoing transaction
|
|
79
|
+
assert not Env.get().in_xact
|
|
80
|
+
with Catalog.get().begin_xact(for_write=for_write):
|
|
81
|
+
return op(*args, **kwargs)
|
|
82
|
+
except sql.exc.DBAPIError as e:
|
|
83
|
+
# TODO: what other exceptions should we be looking for?
|
|
84
|
+
if isinstance(e.orig, psycopg.errors.SerializationFailure):
|
|
85
|
+
if num_remaining_retries > 0:
|
|
86
|
+
num_remaining_retries -= 1
|
|
87
|
+
_logger.debug(f'Serialization failure, retrying ({num_remaining_retries} retries left)')
|
|
88
|
+
time.sleep(random.uniform(0.1, 0.5))
|
|
89
|
+
else:
|
|
90
|
+
raise excs.Error(f'Serialization retry limit ({_MAX_RETRIES}) exceeded') from e
|
|
81
91
|
else:
|
|
82
|
-
raise
|
|
83
|
-
else:
|
|
84
|
-
raise
|
|
92
|
+
raise
|
|
85
93
|
|
|
86
|
-
|
|
94
|
+
return loop
|
|
95
|
+
|
|
96
|
+
return decorator
|
|
87
97
|
|
|
88
98
|
|
|
89
99
|
class Catalog:
|
|
90
100
|
"""The functional interface to getting access to catalog objects
|
|
91
101
|
|
|
92
|
-
All interface functions must be called in the context of a transaction, started with
|
|
102
|
+
All interface functions must be called in the context of a transaction, started with Catalog.begin_xact().
|
|
103
|
+
|
|
104
|
+
Caching and invalidation of metadata:
|
|
105
|
+
- Catalog caches TableVersion instances in order to avoid excessive metadata loading
|
|
106
|
+
- for any specific table version (ie, combination of id and effective version) there can be only a single
|
|
107
|
+
TableVersion instance in circulation; the reason is that each TV instance has its own store_tbl.sa_tbl, and
|
|
108
|
+
mixing multiple instances of sqlalchemy Table objects in the same query (for the same underlying table) leads to
|
|
109
|
+
duplicate references to that table in the From clause (ie, incorrect Cartesian products)
|
|
110
|
+
- in order to allow multiple concurrent Python processes to perform updates (data and/or schema) against a shared
|
|
111
|
+
Pixeltable instance, Catalog needs to reload metadata from the store when there are changes
|
|
112
|
+
- concurrent changes are detected by comparing TableVersion.version with the stored current version
|
|
113
|
+
(TableMd.current_version)
|
|
114
|
+
- cached live TableVersion instances (those with effective_version == None) are validated against the stored
|
|
115
|
+
metadata on transaction boundaries; this is recorded in TableVersion.is_validated
|
|
116
|
+
- metadata validation is only needed for live TableVersion instances (snapshot instances are immutable)
|
|
93
117
|
"""
|
|
94
118
|
|
|
95
119
|
_instance: Optional[Catalog] = None
|
|
@@ -99,6 +123,8 @@ class Catalog:
|
|
|
99
123
|
# - snapshot versions: records the version of the snapshot
|
|
100
124
|
_tbl_versions: dict[tuple[UUID, Optional[int]], TableVersion]
|
|
101
125
|
_tbls: dict[UUID, Table]
|
|
126
|
+
_in_write_xact: bool # True if we're in a write transaction
|
|
127
|
+
_x_locked_tbl_id: Optional[UUID] # set if begin_xact() was asked to write-lock a table
|
|
102
128
|
|
|
103
129
|
@classmethod
|
|
104
130
|
def get(cls) -> Catalog:
|
|
@@ -109,22 +135,127 @@ class Catalog:
|
|
|
109
135
|
@classmethod
|
|
110
136
|
def clear(cls) -> None:
|
|
111
137
|
"""Remove the instance. Used for testing."""
|
|
138
|
+
# invalidate all existing instances to force reloading of metadata
|
|
139
|
+
for tbl_version in cls._instance._tbl_versions.values():
|
|
140
|
+
# _logger.debug(
|
|
141
|
+
# f'Invalidating table version {tbl_version.id}:{tbl_version.effective_version} ({id(tbl_version):x})'
|
|
142
|
+
# )
|
|
143
|
+
tbl_version.is_validated = False
|
|
112
144
|
cls._instance = None
|
|
113
145
|
|
|
114
146
|
def __init__(self) -> None:
|
|
115
147
|
self._tbl_versions = {}
|
|
116
148
|
self._tbls = {} # don't use a defaultdict here, it doesn't cooperate with the debugger
|
|
149
|
+
self._in_write_xact = False
|
|
150
|
+
self._x_locked_tbl_id = None
|
|
117
151
|
self._init_store()
|
|
118
152
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
153
|
+
def validate(self) -> None:
|
|
154
|
+
"""Validate structural consistency of cached metadata"""
|
|
155
|
+
for (tbl_id, effective_version), tbl_version in self._tbl_versions.items():
|
|
156
|
+
assert tbl_id == tbl_version.id, f'{tbl_id} != {tbl_version.id}'
|
|
157
|
+
assert tbl_version.effective_version == tbl_version.version or tbl_version.effective_version is None, (
|
|
158
|
+
f'{tbl_version.effective_version} != {tbl_version.version} for id {tbl_id}'
|
|
159
|
+
)
|
|
160
|
+
assert effective_version == tbl_version.effective_version, (
|
|
161
|
+
f'{effective_version} != {tbl_version.effective_version} for id {tbl_id}'
|
|
162
|
+
)
|
|
163
|
+
assert len(tbl_version.mutable_views) == 0 or tbl_version.is_mutable, (
|
|
164
|
+
f'snapshot_id={tbl_version.id} mutable_views={tbl_version.mutable_views}'
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
if tbl_version.is_view and tbl_version.is_mutable:
|
|
168
|
+
# make sure this mutable view is recorded in a mutable base
|
|
169
|
+
base = tbl_version.base
|
|
170
|
+
assert base is not None
|
|
171
|
+
if base.effective_version is None:
|
|
172
|
+
assert (base.id, None) in self._tbl_versions
|
|
173
|
+
assert TableVersionHandle.create(tbl_version) in self._tbl_versions[base.id, None].mutable_views
|
|
174
|
+
|
|
175
|
+
if len(tbl_version.mutable_views) > 0:
|
|
176
|
+
# make sure we also loaded mutable view metadata, which is needed to detect column dependencies
|
|
177
|
+
for v in tbl_version.mutable_views:
|
|
178
|
+
assert v.effective_version is None, f'{v.id}:{v.effective_version}'
|
|
179
|
+
|
|
180
|
+
@contextmanager
|
|
181
|
+
def begin_xact(self, *, tbl_id: Optional[UUID] = None, for_write: bool = False) -> Iterator[sql.Connection]:
|
|
182
|
+
"""
|
|
183
|
+
Return a context manager that yields a connection to the database. Idempotent.
|
|
184
|
+
|
|
185
|
+
It is mandatory to call this method, not Env.begin_xact(), if the transaction accesses any table data
|
|
186
|
+
or metadata.
|
|
187
|
+
|
|
188
|
+
Lock acquisition:
|
|
189
|
+
- x-locks Table records by updating Table.lock_dummy
|
|
190
|
+
- this needs to be done in a retry loop, because Postgres can decide to abort the transaction
|
|
191
|
+
(SerializationFailure, LockNotAvailable)
|
|
192
|
+
- for that reason, we do all lock acquisition prior to doing any real work (eg, compute column values),
|
|
193
|
+
to minimize (maybe avoid altogether) losing that work
|
|
194
|
+
"""
|
|
195
|
+
if Env.get().in_xact:
|
|
196
|
+
if tbl_id is not None and for_write:
|
|
197
|
+
# make sure that we requested the required table lock at the beginning of the transaction
|
|
198
|
+
assert tbl_id == self._x_locked_tbl_id, f'{tbl_id} != {self._x_locked_tbl_id}'
|
|
199
|
+
yield Env.get().conn
|
|
200
|
+
return
|
|
201
|
+
|
|
202
|
+
# tv_msg = '\n'.join(
|
|
203
|
+
# [
|
|
204
|
+
# f'{tv.id}:{tv.effective_version} : tv={id(tv):x} sa_tbl={id(tv.store_tbl.sa_tbl):x}'
|
|
205
|
+
# for tv in self._tbl_versions.values()
|
|
206
|
+
# ]
|
|
207
|
+
# )
|
|
208
|
+
# _logger.debug(f'begin_xact(): {tv_msg}')
|
|
209
|
+
num_retries = 0
|
|
210
|
+
while True:
|
|
211
|
+
try:
|
|
212
|
+
with Env.get().begin_xact() as conn:
|
|
213
|
+
if tbl_id is not None and for_write:
|
|
214
|
+
# X-lock Table record
|
|
215
|
+
conn.execute(
|
|
216
|
+
sql.select(schema.Table).where(schema.Table.id == tbl_id).with_for_update(nowait=True)
|
|
217
|
+
)
|
|
218
|
+
conn.execute(sql.update(schema.Table).values(lock_dummy=1).where(schema.Table.id == tbl_id))
|
|
219
|
+
self._x_locked_tbl_id = tbl_id
|
|
220
|
+
|
|
221
|
+
self._in_write_xact = for_write
|
|
222
|
+
yield conn
|
|
223
|
+
return
|
|
224
|
+
except sql.exc.DBAPIError as e:
|
|
225
|
+
if isinstance(e.orig, (psycopg.errors.SerializationFailure, psycopg.errors.LockNotAvailable)) and (
|
|
226
|
+
num_retries < _MAX_RETRIES or _MAX_RETRIES == 0
|
|
227
|
+
):
|
|
228
|
+
num_retries += 1
|
|
229
|
+
_logger.debug(f'Retrying ({num_retries}) after {type(e.orig)}')
|
|
230
|
+
time.sleep(random.uniform(0.1, 0.5))
|
|
231
|
+
else:
|
|
232
|
+
raise
|
|
233
|
+
finally:
|
|
234
|
+
self._in_write_xact = False
|
|
235
|
+
self._x_locked_tbl_id = None
|
|
236
|
+
|
|
237
|
+
# invalidate cached current TableVersion instances
|
|
238
|
+
for tv in self._tbl_versions.values():
|
|
239
|
+
if tv.effective_version is None:
|
|
240
|
+
_logger.debug(f'invalidating table version {tv.id}:None (tv={id(tv):x})')
|
|
241
|
+
tv.is_validated = False
|
|
242
|
+
|
|
243
|
+
if _logger.isEnabledFor(logging.DEBUG):
|
|
244
|
+
self.validate()
|
|
245
|
+
|
|
246
|
+
@property
|
|
247
|
+
def in_write_xact(self) -> bool:
|
|
248
|
+
return self._in_write_xact
|
|
249
|
+
|
|
250
|
+
def _acquire_dir_xlock(self, parent_id: Optional[UUID], dir_id: Optional[UUID], dir_name: Optional[str]) -> None:
|
|
251
|
+
"""Force acquisition of an X-lock on a Dir record via a blind update.
|
|
252
|
+
|
|
122
253
|
If dir_id is present, then all other conditions are ignored.
|
|
123
254
|
Note that (parent_id==None) is a valid where condition.
|
|
124
255
|
If dir_id is not specified, the user from the environment is added to the directory filters.
|
|
125
256
|
"""
|
|
126
257
|
user = Env.get().user
|
|
127
|
-
|
|
258
|
+
assert self._in_write_xact
|
|
128
259
|
q = sql.update(schema.Dir).values(lock_dummy=1)
|
|
129
260
|
if dir_id is not None:
|
|
130
261
|
q = q.where(schema.Dir.id == dir_id)
|
|
@@ -134,7 +265,7 @@ class Catalog:
|
|
|
134
265
|
q = q.where(schema.Dir.md['name'].astext == dir_name)
|
|
135
266
|
if user is not None:
|
|
136
267
|
q = q.where(schema.Dir.md['user'].astext == user)
|
|
137
|
-
conn.execute(q)
|
|
268
|
+
Env.get().conn.execute(q)
|
|
138
269
|
|
|
139
270
|
def get_dir_path(self, dir_id: UUID) -> Path:
|
|
140
271
|
"""Return path for directory with given id"""
|
|
@@ -156,7 +287,7 @@ class Catalog:
|
|
|
156
287
|
dir_entries: dict[str, Catalog.DirEntry]
|
|
157
288
|
table: Optional[schema.Table]
|
|
158
289
|
|
|
159
|
-
@_retry_loop
|
|
290
|
+
@_retry_loop(for_write=False)
|
|
160
291
|
def get_dir_contents(self, dir_path: Path, recursive: bool = False) -> dict[str, DirEntry]:
|
|
161
292
|
dir = self._get_schema_object(dir_path, expected=Dir, raise_if_not_exists=True)
|
|
162
293
|
return self._get_dir_contents(dir._id, recursive=recursive)
|
|
@@ -183,7 +314,7 @@ class Catalog:
|
|
|
183
314
|
|
|
184
315
|
return result
|
|
185
316
|
|
|
186
|
-
@_retry_loop
|
|
317
|
+
@_retry_loop(for_write=True)
|
|
187
318
|
def move(self, path: Path, new_path: Path) -> None:
|
|
188
319
|
self._move(path, new_path)
|
|
189
320
|
|
|
@@ -272,7 +403,7 @@ class Catalog:
|
|
|
272
403
|
|
|
273
404
|
# check for subdirectory
|
|
274
405
|
if for_update:
|
|
275
|
-
self.
|
|
406
|
+
self._acquire_dir_xlock(dir_id, None, name)
|
|
276
407
|
q = sql.select(schema.Dir).where(
|
|
277
408
|
schema.Dir.parent_id == dir_id, schema.Dir.md['name'].astext == name, schema.Dir.md['user'].astext == user
|
|
278
409
|
)
|
|
@@ -296,7 +427,7 @@ class Catalog:
|
|
|
296
427
|
tbl_id = conn.execute(q).scalar_one_or_none()
|
|
297
428
|
if tbl_id is not None:
|
|
298
429
|
if tbl_id not in self._tbls:
|
|
299
|
-
|
|
430
|
+
_ = self._load_tbl(tbl_id)
|
|
300
431
|
return self._tbls[tbl_id]
|
|
301
432
|
|
|
302
433
|
return None
|
|
@@ -349,10 +480,15 @@ class Catalog:
|
|
|
349
480
|
tbl = self._load_tbl(tbl_id)
|
|
350
481
|
if tbl is None:
|
|
351
482
|
return None
|
|
352
|
-
|
|
483
|
+
# if this is a mutable table, we also need to have its mutable views loaded, in order to track column
|
|
484
|
+
# dependencies
|
|
485
|
+
tbl_version = tbl._tbl_version.get()
|
|
486
|
+
if tbl_version.is_mutable:
|
|
487
|
+
for v in tbl_version.mutable_views:
|
|
488
|
+
_ = self.get_table_by_id(v.id)
|
|
353
489
|
return self._tbls[tbl_id]
|
|
354
490
|
|
|
355
|
-
@_retry_loop
|
|
491
|
+
@_retry_loop(for_write=True)
|
|
356
492
|
def create_table(
|
|
357
493
|
self,
|
|
358
494
|
path: Path,
|
|
@@ -385,13 +521,14 @@ class Catalog:
|
|
|
385
521
|
self._tbls[tbl._id] = tbl
|
|
386
522
|
return tbl
|
|
387
523
|
|
|
388
|
-
@_retry_loop
|
|
524
|
+
@_retry_loop(for_write=True)
|
|
389
525
|
def create_view(
|
|
390
526
|
self,
|
|
391
527
|
path: Path,
|
|
392
528
|
base: TableVersionPath,
|
|
393
529
|
select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
|
|
394
530
|
where: Optional[exprs.Expr],
|
|
531
|
+
sample_clause: Optional['SampleClause'],
|
|
395
532
|
additional_columns: Optional[dict[str, Any]],
|
|
396
533
|
is_snapshot: bool,
|
|
397
534
|
iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]],
|
|
@@ -420,6 +557,7 @@ class Catalog:
|
|
|
420
557
|
select_list=select_list,
|
|
421
558
|
additional_columns=additional_columns,
|
|
422
559
|
predicate=where,
|
|
560
|
+
sample_clause=sample_clause,
|
|
423
561
|
is_snapshot=is_snapshot,
|
|
424
562
|
iterator_cls=iterator_class,
|
|
425
563
|
iterator_args=iterator_args,
|
|
@@ -431,14 +569,17 @@ class Catalog:
|
|
|
431
569
|
self._tbls[view._id] = view
|
|
432
570
|
return view
|
|
433
571
|
|
|
434
|
-
@_retry_loop
|
|
572
|
+
@_retry_loop(for_write=True)
|
|
435
573
|
def create_replica(
|
|
436
574
|
self, path: Path, md: list[schema.FullTableMd], if_exists: IfExistsParam = IfExistsParam.ERROR
|
|
437
|
-
) ->
|
|
575
|
+
) -> None:
|
|
438
576
|
"""
|
|
439
577
|
Creates table, table_version, and table_schema_version records for a replica with the given metadata.
|
|
440
578
|
The metadata should be presented in standard "ancestor order", with the table being replicated at
|
|
441
579
|
list position 0 and the (root) base table at list position -1.
|
|
580
|
+
|
|
581
|
+
TODO: create_replica() also needs to create the store tables and populate them in order to make
|
|
582
|
+
replica creation atomic.
|
|
442
583
|
"""
|
|
443
584
|
tbl_id = UUID(md[0].tbl_md.tbl_id)
|
|
444
585
|
|
|
@@ -451,20 +592,19 @@ class Catalog:
|
|
|
451
592
|
'but a different table already exists at that location.'
|
|
452
593
|
)
|
|
453
594
|
assert isinstance(existing, View)
|
|
454
|
-
return
|
|
595
|
+
return
|
|
455
596
|
|
|
456
597
|
# Ensure that the system directory exists.
|
|
457
598
|
self._create_dir(Path('_system', allow_system_paths=True), if_exists=IfExistsParam.IGNORE, parents=False)
|
|
458
599
|
|
|
459
600
|
# Now check to see if this table already exists in the catalog.
|
|
460
|
-
# TODO: Handle concurrency in create_replica()
|
|
461
601
|
existing = Catalog.get().get_table_by_id(tbl_id)
|
|
462
602
|
if existing is not None:
|
|
463
|
-
existing_path = Path(existing._path, allow_system_paths=True)
|
|
603
|
+
existing_path = Path(existing._path(), allow_system_paths=True)
|
|
464
604
|
# It does exist. If it's a non-system table, that's an error: it's already been replicated.
|
|
465
605
|
if not existing_path.is_system_path:
|
|
466
606
|
raise excs.Error(
|
|
467
|
-
f'That table has already been replicated as {existing._path!r}. \n'
|
|
607
|
+
f'That table has already been replicated as {existing._path()!r}. \n'
|
|
468
608
|
f'Drop the existing replica if you wish to re-create it.'
|
|
469
609
|
)
|
|
470
610
|
# If it's a system table, then this means it was created at some point as the ancestor of some other
|
|
@@ -489,22 +629,20 @@ class Catalog:
|
|
|
489
629
|
# The table already exists in the catalog. The existing path might be a system path (if the table
|
|
490
630
|
# was created as an anonymous base table of some other table), or it might not (if it's a snapshot
|
|
491
631
|
# that was directly replicated by the user at some point). In either case, use the existing path.
|
|
492
|
-
replica_path = Path(replica._path, allow_system_paths=True)
|
|
632
|
+
replica_path = Path(replica._path(), allow_system_paths=True)
|
|
493
633
|
|
|
494
634
|
# Store the metadata; it could be a new version (in which case a new record will be created) or a
|
|
495
635
|
# known version (in which case the newly received metadata will be validated as identical).
|
|
496
636
|
self.__store_replica_md(replica_path, ancestor_md)
|
|
497
637
|
|
|
498
|
-
#
|
|
499
|
-
#
|
|
500
|
-
self._tbls[tbl_id] = self._load_tbl(tbl_id)
|
|
501
|
-
return self._tbls[tbl_id]
|
|
638
|
+
# don't create TableVersion instances at this point, they would be superseded by calls to TV.create_replica()
|
|
639
|
+
# in TableRestorer.restore()
|
|
502
640
|
|
|
503
641
|
def __store_replica_md(self, path: Path, md: schema.FullTableMd) -> None:
|
|
504
642
|
_logger.info(f'Creating replica table at {path!r} with ID: {md.tbl_md.tbl_id}')
|
|
505
|
-
# TODO: Handle concurrency
|
|
506
643
|
dir = self._get_schema_object(path.parent, expected=Dir, raise_if_not_exists=True)
|
|
507
644
|
assert dir is not None
|
|
645
|
+
assert self._in_write_xact
|
|
508
646
|
|
|
509
647
|
conn = Env.get().conn
|
|
510
648
|
tbl_id = md.tbl_md.tbl_id
|
|
@@ -582,14 +720,24 @@ class Catalog:
|
|
|
582
720
|
|
|
583
721
|
self.store_tbl_md(UUID(tbl_id), new_tbl_md, new_version_md, new_schema_version_md)
|
|
584
722
|
|
|
585
|
-
@_retry_loop
|
|
723
|
+
@_retry_loop(for_write=False)
|
|
586
724
|
def get_table(self, path: Path) -> Table:
|
|
725
|
+
obj = self._get_table(path)
|
|
726
|
+
return obj
|
|
727
|
+
|
|
728
|
+
def _get_table(self, path: Path) -> Table:
|
|
587
729
|
obj = Catalog.get()._get_schema_object(path, expected=Table, raise_if_not_exists=True)
|
|
588
730
|
assert isinstance(obj, Table)
|
|
589
|
-
obj._tbl_version.get()
|
|
731
|
+
tbl_version = obj._tbl_version.get()
|
|
732
|
+
# TODO: instead of calling this here, move the logic into TableVersion.init(), which is called after
|
|
733
|
+
# registering the instance in _tbl_versions
|
|
734
|
+
tbl_version.ensure_md_loaded()
|
|
735
|
+
# if this table has mutable views, we need to load those as well, in order to record column dependencies
|
|
736
|
+
for v in tbl_version.mutable_views:
|
|
737
|
+
self.get_table_by_id(v.id)
|
|
590
738
|
return obj
|
|
591
739
|
|
|
592
|
-
@_retry_loop
|
|
740
|
+
@_retry_loop(for_write=True)
|
|
593
741
|
def drop_table(self, path: Path, if_not_exists: IfNotExistsParam, force: bool) -> None:
|
|
594
742
|
_, _, src_obj = self._prepare_dir_op(
|
|
595
743
|
drop_dir_path=path.parent,
|
|
@@ -621,11 +769,11 @@ class Catalog:
|
|
|
621
769
|
msg: str
|
|
622
770
|
if is_replace:
|
|
623
771
|
msg = (
|
|
624
|
-
f'{obj_type_str} {tbl._path} already exists and has dependents. '
|
|
772
|
+
f'{obj_type_str} {tbl._path()} already exists and has dependents. '
|
|
625
773
|
"Use `if_exists='replace_force'` to replace it."
|
|
626
774
|
)
|
|
627
775
|
else:
|
|
628
|
-
msg = f'{obj_type_str} {tbl._path} has dependents.'
|
|
776
|
+
msg = f'{obj_type_str} {tbl._path()} has dependents.'
|
|
629
777
|
raise excs.Error(msg)
|
|
630
778
|
|
|
631
779
|
for view_id in view_ids:
|
|
@@ -636,9 +784,9 @@ class Catalog:
|
|
|
636
784
|
tbl._drop()
|
|
637
785
|
assert tbl._id in self._tbls
|
|
638
786
|
del self._tbls[tbl._id]
|
|
639
|
-
_logger.info(f'Dropped table `{tbl._path}`.')
|
|
787
|
+
_logger.info(f'Dropped table `{tbl._path()}`.')
|
|
640
788
|
|
|
641
|
-
@_retry_loop
|
|
789
|
+
@_retry_loop(for_write=True)
|
|
642
790
|
def create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:
|
|
643
791
|
return self._create_dir(path, if_exists, parents)
|
|
644
792
|
|
|
@@ -673,7 +821,7 @@ class Catalog:
|
|
|
673
821
|
Env.get().console_logger.info(f'Created directory {str(path)!r}.')
|
|
674
822
|
return dir
|
|
675
823
|
|
|
676
|
-
@_retry_loop
|
|
824
|
+
@_retry_loop(for_write=True)
|
|
677
825
|
def drop_dir(self, path: Path, if_not_exists: IfNotExistsParam, force: bool) -> None:
|
|
678
826
|
_, _, schema_obj = self._prepare_dir_op(
|
|
679
827
|
drop_dir_path=path.parent,
|
|
@@ -698,7 +846,7 @@ class Catalog:
|
|
|
698
846
|
raise excs.Error(f'Directory {str(dir_path)!r} is not empty.')
|
|
699
847
|
|
|
700
848
|
# drop existing subdirs
|
|
701
|
-
self.
|
|
849
|
+
self._acquire_dir_xlock(dir_id, None, None)
|
|
702
850
|
dir_q = sql.select(schema.Dir).where(schema.Dir.parent_id == dir_id)
|
|
703
851
|
for row in conn.execute(dir_q).all():
|
|
704
852
|
self._drop_dir(row.id, dir_path.append(row.md['name']), force=True)
|
|
@@ -725,17 +873,37 @@ class Catalog:
|
|
|
725
873
|
return result
|
|
726
874
|
|
|
727
875
|
def get_tbl_version(self, tbl_id: UUID, effective_version: Optional[int]) -> Optional[TableVersion]:
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
876
|
+
# we need a transaction here, if we're not already in one; if this starts a new transaction,
|
|
877
|
+
# the returned TableVersion instance will not be validated
|
|
878
|
+
with self.begin_xact(tbl_id=tbl_id, for_write=False) as conn:
|
|
879
|
+
tv = self._tbl_versions.get((tbl_id, effective_version))
|
|
880
|
+
if tv is None:
|
|
881
|
+
tv = self._load_tbl_version(tbl_id, effective_version)
|
|
882
|
+
elif not tv.is_validated:
|
|
883
|
+
# only live instances are invalidated
|
|
884
|
+
assert effective_version is None
|
|
885
|
+
# we validate live instances by comparing our cached version number to the stored current version
|
|
886
|
+
# _logger.debug(f'validating metadata for table {tbl_id}:{tv.version} ({id(tv):x})')
|
|
887
|
+
q = sql.select(schema.Table.md).where(schema.Table.id == tbl_id)
|
|
888
|
+
row = conn.execute(q).one()
|
|
889
|
+
current_version = row.md['current_version']
|
|
890
|
+
|
|
891
|
+
# the stored version can be behind TableVersion.version, because we don't roll back the in-memory
|
|
892
|
+
# metadata changes after a failed update operation
|
|
893
|
+
if current_version != tv.version:
|
|
894
|
+
# the cached metadata is invalid
|
|
895
|
+
_logger.debug(
|
|
896
|
+
f'reloading metadata for table {tbl_id} '
|
|
897
|
+
f'(cached version: {tv.version}, current version: {current_version}'
|
|
898
|
+
# f', id: {id(tv):x})'
|
|
899
|
+
)
|
|
900
|
+
tv = self._load_tbl_version(tbl_id, None)
|
|
901
|
+
else:
|
|
902
|
+
# the cached metadata is valid
|
|
903
|
+
tv.is_validated = True
|
|
904
|
+
|
|
905
|
+
assert tv.is_validated
|
|
906
|
+
return tv
|
|
739
907
|
|
|
740
908
|
def remove_tbl_version(self, tbl_version: TableVersion) -> None:
|
|
741
909
|
assert (tbl_version.id, tbl_version.effective_version) in self._tbl_versions
|
|
@@ -745,7 +913,7 @@ class Catalog:
|
|
|
745
913
|
"""Return the Dir with the given id, or None if it doesn't exist"""
|
|
746
914
|
conn = Env.get().conn
|
|
747
915
|
if for_update:
|
|
748
|
-
self.
|
|
916
|
+
self._acquire_dir_xlock(None, dir_id, None)
|
|
749
917
|
q = sql.select(schema.Dir).where(schema.Dir.id == dir_id)
|
|
750
918
|
row = conn.execute(q).one_or_none()
|
|
751
919
|
if row is None:
|
|
@@ -761,7 +929,7 @@ class Catalog:
|
|
|
761
929
|
conn = Env.get().conn
|
|
762
930
|
if path.is_root:
|
|
763
931
|
if for_update:
|
|
764
|
-
self.
|
|
932
|
+
self._acquire_dir_xlock(parent_id=None, dir_id=None, dir_name='')
|
|
765
933
|
q = sql.select(schema.Dir).where(schema.Dir.parent_id.is_(None), schema.Dir.md['user'].astext == user)
|
|
766
934
|
row = conn.execute(q).one_or_none()
|
|
767
935
|
return schema.Dir(**row._mapping) if row is not None else None
|
|
@@ -770,7 +938,7 @@ class Catalog:
|
|
|
770
938
|
if parent_dir is None:
|
|
771
939
|
return None
|
|
772
940
|
if for_update:
|
|
773
|
-
self.
|
|
941
|
+
self._acquire_dir_xlock(parent_id=parent_dir.id, dir_id=None, dir_name=path.name)
|
|
774
942
|
q = sql.select(schema.Dir).where(
|
|
775
943
|
schema.Dir.parent_id == parent_dir.id,
|
|
776
944
|
schema.Dir.md['name'].astext == path.name,
|
|
@@ -780,6 +948,7 @@ class Catalog:
|
|
|
780
948
|
return schema.Dir(**row._mapping) if row is not None else None
|
|
781
949
|
|
|
782
950
|
def _load_tbl(self, tbl_id: UUID) -> Optional[Table]:
|
|
951
|
+
"""Loads metadata for the table with the given id and caches it."""
|
|
783
952
|
_logger.info(f'Loading table {tbl_id}')
|
|
784
953
|
from .insertable_table import InsertableTable
|
|
785
954
|
from .view import View
|
|
@@ -808,8 +977,9 @@ class Catalog:
|
|
|
808
977
|
if view_md is None:
|
|
809
978
|
# this is a base table
|
|
810
979
|
if (tbl_id, None) not in self._tbl_versions:
|
|
811
|
-
|
|
980
|
+
_ = self._load_tbl_version(tbl_id, None)
|
|
812
981
|
tbl = InsertableTable(tbl_record.dir_id, TableVersionHandle(tbl_id, None))
|
|
982
|
+
self._tbls[tbl_id] = tbl
|
|
813
983
|
return tbl
|
|
814
984
|
|
|
815
985
|
# this is a view; determine the sequence of TableVersions to load
|
|
@@ -829,18 +999,18 @@ class Catalog:
|
|
|
829
999
|
view_path: Optional[TableVersionPath] = None
|
|
830
1000
|
for id, effective_version in tbl_version_path[::-1]:
|
|
831
1001
|
if (id, effective_version) not in self._tbl_versions:
|
|
832
|
-
|
|
1002
|
+
_ = self._load_tbl_version(id, effective_version)
|
|
833
1003
|
view_path = TableVersionPath(TableVersionHandle(id, effective_version), base=base_path)
|
|
834
1004
|
base_path = view_path
|
|
835
1005
|
view = View(tbl_id, tbl_record.dir_id, tbl_md.name, view_path, snapshot_only=pure_snapshot)
|
|
836
|
-
|
|
1006
|
+
self._tbls[tbl_id] = view
|
|
837
1007
|
return view
|
|
838
1008
|
|
|
839
1009
|
def load_tbl_md(self, tbl_id: UUID, effective_version: Optional[int]) -> schema.FullTableMd:
|
|
840
1010
|
"""
|
|
841
1011
|
Loads metadata from the store for a given table UUID and version.
|
|
842
1012
|
"""
|
|
843
|
-
_logger.info(f'Loading metadata for table version: {tbl_id}:{effective_version}')
|
|
1013
|
+
# _logger.info(f'Loading metadata for table version: {tbl_id}:{effective_version}')
|
|
844
1014
|
conn = Env.get().conn
|
|
845
1015
|
|
|
846
1016
|
q = (
|
|
@@ -915,8 +1085,15 @@ class Catalog:
|
|
|
915
1085
|
If inserting `version_md` or `schema_version_md` would be a primary key violation, an exception will be raised.
|
|
916
1086
|
"""
|
|
917
1087
|
conn = Env.get().conn
|
|
1088
|
+
assert self._in_write_xact
|
|
918
1089
|
|
|
919
1090
|
if tbl_md is not None:
|
|
1091
|
+
assert tbl_md.tbl_id == str(tbl_id)
|
|
1092
|
+
if version_md is not None:
|
|
1093
|
+
assert tbl_md.current_version == version_md.version
|
|
1094
|
+
assert tbl_md.current_schema_version == version_md.schema_version
|
|
1095
|
+
if schema_version_md is not None:
|
|
1096
|
+
assert tbl_md.current_schema_version == schema_version_md.schema_version
|
|
920
1097
|
result = conn.execute(
|
|
921
1098
|
sql.update(schema.Table.__table__)
|
|
922
1099
|
.values({schema.Table.md: dataclasses.asdict(tbl_md)})
|
|
@@ -925,6 +1102,9 @@ class Catalog:
|
|
|
925
1102
|
assert result.rowcount == 1, result.rowcount
|
|
926
1103
|
|
|
927
1104
|
if version_md is not None:
|
|
1105
|
+
assert version_md.tbl_id == str(tbl_id)
|
|
1106
|
+
if schema_version_md is not None:
|
|
1107
|
+
assert version_md.schema_version == schema_version_md.schema_version
|
|
928
1108
|
conn.execute(
|
|
929
1109
|
sql.insert(schema.TableVersion.__table__).values(
|
|
930
1110
|
tbl_id=tbl_id, version=version_md.version, md=dataclasses.asdict(version_md)
|
|
@@ -932,6 +1112,7 @@ class Catalog:
|
|
|
932
1112
|
)
|
|
933
1113
|
|
|
934
1114
|
if schema_version_md is not None:
|
|
1115
|
+
assert schema_version_md.tbl_id == str(tbl_id)
|
|
935
1116
|
conn.execute(
|
|
936
1117
|
sql.insert(schema.TableSchemaVersion.__table__).values(
|
|
937
1118
|
tbl_id=tbl_id,
|
|
@@ -978,50 +1159,60 @@ class Catalog:
|
|
|
978
1159
|
return md
|
|
979
1160
|
|
|
980
1161
|
def _load_tbl_version(self, tbl_id: UUID, effective_version: Optional[int]) -> Optional[TableVersion]:
|
|
1162
|
+
"""Creates TableVersion instance from stored metadata and registers it in _tbl_versions."""
|
|
981
1163
|
tbl_md, _, schema_version_md = self.load_tbl_md(tbl_id, effective_version)
|
|
982
1164
|
view_md = tbl_md.view_md
|
|
983
1165
|
|
|
984
|
-
_logger.info(f'Loading table version: {tbl_id}:{effective_version}')
|
|
985
1166
|
conn = Env.get().conn
|
|
986
1167
|
|
|
987
|
-
# load mutable view ids
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
1168
|
+
# load mutable view ids for mutable TableVersions
|
|
1169
|
+
mutable_view_ids: list[UUID] = []
|
|
1170
|
+
# If this is a replica, effective_version should not be None. We see this today, because
|
|
1171
|
+
# the replica's TV instance's Column instances contain value_expr_dicts that reference the live version.
|
|
1172
|
+
# This is presumably a source of bugs, because it ignores schema version changes (eg, column renames).
|
|
1173
|
+
# TODO: retarget the value_expr_dict when instantiating Columns for a particular TV instance.
|
|
1174
|
+
if effective_version is None and not tbl_md.is_replica:
|
|
1175
|
+
q = sql.select(schema.Table.id).where(
|
|
1176
|
+
sql.text(
|
|
1177
|
+
f"md->'view_md'->'base_versions'->0->>0 = {tbl_id.hex!r} "
|
|
1178
|
+
"AND md->'view_md'->'base_versions'->0->>1 IS NULL"
|
|
1179
|
+
)
|
|
992
1180
|
)
|
|
993
|
-
|
|
994
|
-
mutable_view_ids = [r[0] for r in conn.execute(q).all()]
|
|
1181
|
+
mutable_view_ids = [r[0] for r in conn.execute(q).all()]
|
|
995
1182
|
mutable_views = [TableVersionHandle(id, None) for id in mutable_view_ids]
|
|
996
1183
|
|
|
1184
|
+
tbl_version: TableVersion
|
|
997
1185
|
if view_md is None:
|
|
998
1186
|
# this is a base table
|
|
999
1187
|
tbl_version = TableVersion(
|
|
1000
1188
|
tbl_id, tbl_md, effective_version, schema_version_md, mutable_views=mutable_views
|
|
1001
1189
|
)
|
|
1002
|
-
|
|
1190
|
+
else:
|
|
1191
|
+
assert len(view_md.base_versions) > 0 # a view needs to have a base
|
|
1192
|
+
pure_snapshot = view_md.is_snapshot and view_md.predicate is None and len(schema_version_md.columns) == 0
|
|
1193
|
+
assert not pure_snapshot # a pure snapshot doesn't have a physical table backing it, no point in loading it
|
|
1194
|
+
|
|
1195
|
+
base: TableVersionHandle
|
|
1196
|
+
base_path: Optional[TableVersionPath] = None # needed for live view
|
|
1197
|
+
if view_md.is_snapshot:
|
|
1198
|
+
base = TableVersionHandle(UUID(view_md.base_versions[0][0]), view_md.base_versions[0][1])
|
|
1199
|
+
else:
|
|
1200
|
+
base_path = TableVersionPath.from_md(tbl_md.view_md.base_versions)
|
|
1201
|
+
base = base_path.tbl_version
|
|
1202
|
+
|
|
1203
|
+
tbl_version = TableVersion(
|
|
1204
|
+
tbl_id,
|
|
1205
|
+
tbl_md,
|
|
1206
|
+
effective_version,
|
|
1207
|
+
schema_version_md,
|
|
1208
|
+
base_path=base_path,
|
|
1209
|
+
base=base,
|
|
1210
|
+
mutable_views=mutable_views,
|
|
1211
|
+
)
|
|
1003
1212
|
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
assert not pure_snapshot # a pure snapshot doesn't have a physical table backing it, no point in loading it
|
|
1213
|
+
self._tbl_versions[tbl_id, effective_version] = tbl_version
|
|
1214
|
+
tbl_version.init()
|
|
1007
1215
|
|
|
1008
|
-
base: TableVersionHandle
|
|
1009
|
-
base_path: Optional[TableVersionPath] = None # needed for live view
|
|
1010
|
-
if view_md.is_snapshot:
|
|
1011
|
-
base = TableVersionHandle(UUID(view_md.base_versions[0][0]), view_md.base_versions[0][1])
|
|
1012
|
-
else:
|
|
1013
|
-
base_path = TableVersionPath.from_md(tbl_md.view_md.base_versions)
|
|
1014
|
-
base = base_path.tbl_version
|
|
1015
|
-
|
|
1016
|
-
tbl_version = TableVersion(
|
|
1017
|
-
tbl_id,
|
|
1018
|
-
tbl_md,
|
|
1019
|
-
effective_version,
|
|
1020
|
-
schema_version_md,
|
|
1021
|
-
base_path=base_path,
|
|
1022
|
-
base=base,
|
|
1023
|
-
mutable_views=mutable_views,
|
|
1024
|
-
)
|
|
1025
1216
|
return tbl_version
|
|
1026
1217
|
|
|
1027
1218
|
def _init_store(self) -> None:
|