pixeltable 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (78) hide show
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +9 -1
  4. pixeltable/catalog/catalog.py +559 -134
  5. pixeltable/catalog/column.py +36 -32
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +12 -0
  8. pixeltable/catalog/insertable_table.py +30 -25
  9. pixeltable/catalog/schema_object.py +9 -6
  10. pixeltable/catalog/table.py +334 -267
  11. pixeltable/catalog/table_version.py +358 -241
  12. pixeltable/catalog/table_version_handle.py +18 -2
  13. pixeltable/catalog/table_version_path.py +86 -16
  14. pixeltable/catalog/view.py +47 -23
  15. pixeltable/dataframe.py +198 -19
  16. pixeltable/env.py +6 -4
  17. pixeltable/exceptions.py +6 -0
  18. pixeltable/exec/__init__.py +1 -1
  19. pixeltable/exec/exec_node.py +2 -0
  20. pixeltable/exec/expr_eval/evaluators.py +4 -1
  21. pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
  22. pixeltable/exec/in_memory_data_node.py +1 -1
  23. pixeltable/exec/sql_node.py +188 -22
  24. pixeltable/exprs/column_property_ref.py +16 -6
  25. pixeltable/exprs/column_ref.py +33 -11
  26. pixeltable/exprs/comparison.py +1 -1
  27. pixeltable/exprs/data_row.py +5 -3
  28. pixeltable/exprs/expr.py +11 -4
  29. pixeltable/exprs/literal.py +2 -0
  30. pixeltable/exprs/row_builder.py +4 -6
  31. pixeltable/exprs/rowid_ref.py +8 -0
  32. pixeltable/exprs/similarity_expr.py +1 -0
  33. pixeltable/func/__init__.py +1 -0
  34. pixeltable/func/mcp.py +74 -0
  35. pixeltable/func/query_template_function.py +5 -3
  36. pixeltable/func/tools.py +12 -2
  37. pixeltable/func/udf.py +2 -2
  38. pixeltable/functions/__init__.py +1 -0
  39. pixeltable/functions/anthropic.py +19 -45
  40. pixeltable/functions/deepseek.py +19 -38
  41. pixeltable/functions/fireworks.py +9 -18
  42. pixeltable/functions/gemini.py +2 -3
  43. pixeltable/functions/groq.py +108 -0
  44. pixeltable/functions/llama_cpp.py +6 -6
  45. pixeltable/functions/mistralai.py +16 -53
  46. pixeltable/functions/ollama.py +1 -1
  47. pixeltable/functions/openai.py +82 -165
  48. pixeltable/functions/string.py +212 -58
  49. pixeltable/functions/together.py +22 -80
  50. pixeltable/globals.py +10 -4
  51. pixeltable/index/base.py +5 -0
  52. pixeltable/index/btree.py +5 -0
  53. pixeltable/index/embedding_index.py +5 -0
  54. pixeltable/io/external_store.py +10 -31
  55. pixeltable/io/label_studio.py +5 -5
  56. pixeltable/io/parquet.py +2 -2
  57. pixeltable/io/table_data_conduit.py +1 -32
  58. pixeltable/metadata/__init__.py +11 -2
  59. pixeltable/metadata/converters/convert_13.py +2 -2
  60. pixeltable/metadata/converters/convert_30.py +6 -11
  61. pixeltable/metadata/converters/convert_35.py +9 -0
  62. pixeltable/metadata/converters/convert_36.py +38 -0
  63. pixeltable/metadata/converters/convert_37.py +15 -0
  64. pixeltable/metadata/converters/util.py +3 -9
  65. pixeltable/metadata/notes.py +3 -0
  66. pixeltable/metadata/schema.py +13 -1
  67. pixeltable/plan.py +135 -12
  68. pixeltable/share/packager.py +138 -14
  69. pixeltable/share/publish.py +2 -2
  70. pixeltable/store.py +19 -13
  71. pixeltable/type_system.py +30 -0
  72. pixeltable/utils/dbms.py +1 -1
  73. pixeltable/utils/formatter.py +64 -42
  74. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/METADATA +2 -1
  75. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/RECORD +78 -73
  76. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/LICENSE +0 -0
  77. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/WHEEL +0 -0
  78. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -3,8 +3,11 @@ from __future__ import annotations
3
3
  import dataclasses
4
4
  import functools
5
5
  import logging
6
+ import random
6
7
  import time
7
- from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar
8
+ from collections import defaultdict
9
+ from contextlib import contextmanager
10
+ from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
8
11
  from uuid import UUID
9
12
 
10
13
  import psycopg
@@ -15,8 +18,9 @@ from pixeltable.env import Env
15
18
  from pixeltable.iterators import ComponentIterator
16
19
  from pixeltable.metadata import schema
17
20
 
21
+ from .column import Column
18
22
  from .dir import Dir
19
- from .globals import IfExistsParam, IfNotExistsParam, MediaValidation
23
+ from .globals import IfExistsParam, IfNotExistsParam, MediaValidation, QColumnId
20
24
  from .insertable_table import InsertableTable
21
25
  from .path import Path
22
26
  from .schema_object import SchemaObject
@@ -27,6 +31,8 @@ from .table_version_path import TableVersionPath
27
31
  from .view import View
28
32
 
29
33
  if TYPE_CHECKING:
34
+ from pixeltable.plan import SampleClause
35
+
30
36
  from .. import DataFrame, exprs
31
37
 
32
38
 
@@ -56,49 +62,85 @@ def _unpack_row(
56
62
  return result
57
63
 
58
64
 
59
- _MAX_RETRIES = 3
65
+ # -1: unlimited
66
+ # for now, we don't limit the number of retries, because we haven't seen situations where the actual number of retries
67
+ # grows uncontrollably
68
+ _MAX_RETRIES = -1
69
+
60
70
  T = TypeVar('T')
61
71
 
62
72
 
63
- def _retry_loop(op: Callable[..., T]) -> Callable[..., T]:
64
- @functools.wraps(op)
65
- def loop(*args: Any, **kwargs: Any) -> T:
66
- num_remaining_retries = _MAX_RETRIES
67
- while True:
68
- try:
69
- # in order for retry to work, we need to make sure that there aren't any prior db updates
70
- # that are part of an ongoing transaction
71
- assert not Env.get().in_xact()
72
- with Env.get().begin_xact():
73
- return op(*args, **kwargs)
74
- except sql.exc.DBAPIError as e:
75
- if isinstance(e.orig, psycopg.errors.SerializationFailure):
76
- if num_remaining_retries > 0:
77
- num_remaining_retries -= 1
78
- # print(f'serialization failure:\n{e}')
79
- # print('retrying ************************************************************')
80
- time.sleep(1)
73
+ def _retry_loop(*, for_write: bool) -> Callable[[Callable[..., T]], Callable[..., T]]:
74
+ def decorator(op: Callable[..., T]) -> Callable[..., T]:
75
+ @functools.wraps(op)
76
+ def loop(*args: Any, **kwargs: Any) -> T:
77
+ num_retries = 0
78
+ while True:
79
+ try:
80
+ # in order for retry to work, we need to make sure that there aren't any prior db updates
81
+ # that are part of an ongoing transaction
82
+ assert not Env.get().in_xact
83
+ with Catalog.get().begin_xact(for_write=for_write, convert_db_excs=False):
84
+ return op(*args, **kwargs)
85
+ except sql.exc.DBAPIError as e:
86
+ # TODO: what other exceptions should we be looking for?
87
+ if isinstance(e.orig, (psycopg.errors.SerializationFailure, psycopg.errors.LockNotAvailable)):
88
+ if num_retries < _MAX_RETRIES or _MAX_RETRIES == -1:
89
+ num_retries += 1
90
+ _logger.debug(f'Retrying ({num_retries}) after {type(e.orig)}')
91
+ time.sleep(random.uniform(0.1, 0.5))
92
+ else:
93
+ raise excs.Error(f'Serialization retry limit ({_MAX_RETRIES}) exceeded') from e
81
94
  else:
82
- raise excs.Error(f'Serialization retry limit ({_MAX_RETRIES}) exceeded') from e
83
- else:
95
+ raise
96
+ except Exception as e:
97
+ # for informational/debugging purposes
98
+ _logger.debug(f'retry_loop(): passing along {e}')
84
99
  raise
85
100
 
86
- return loop
101
+ return loop
102
+
103
+ return decorator
87
104
 
88
105
 
89
106
  class Catalog:
90
107
  """The functional interface to getting access to catalog objects
91
108
 
92
- All interface functions must be called in the context of a transaction, started with Env.begin().
109
+ All interface functions must be called in the context of a transaction, started with Catalog.begin_xact().
110
+
111
+ Caching and invalidation of metadata:
112
+ - Catalog caches TableVersion instances in order to avoid excessive metadata loading
113
+ - for any specific table version (ie, combination of id and effective version) there can be only a single
114
+ TableVersion instance in circulation; the reason is that each TV instance has its own store_tbl.sa_tbl, and
115
+ mixing multiple instances of sqlalchemy Table objects in the same query (for the same underlying table) leads to
116
+ duplicate references to that table in the From clause (ie, incorrect Cartesian products)
117
+ - in order to allow multiple concurrent Python processes to perform updates (data and/or schema) against a shared
118
+ Pixeltable instance, Catalog needs to reload metadata from the store when there are changes
119
+ - concurrent changes are detected by comparing TableVersion.version/view_sn with the stored current version
120
+ (TableMd.current_version/view_sn)
121
+ - cached live TableVersion instances (those with effective_version == None) are validated against the stored
122
+ metadata on transaction boundaries; this is recorded in TableVersion.is_validated
123
+ - metadata validation is only needed for live TableVersion instances (snapshot instances are immutable)
93
124
  """
94
125
 
95
126
  _instance: Optional[Catalog] = None
96
127
 
97
- # key: [id, version]
128
+ # cached TableVersion instances; key: [id, version]
98
129
  # - mutable version of a table: version == None (even though TableVersion.version is set correctly)
99
130
  # - snapshot versions: records the version of the snapshot
100
131
  _tbl_versions: dict[tuple[UUID, Optional[int]], TableVersion]
101
132
  _tbls: dict[UUID, Table]
133
+ _in_write_xact: bool # True if we're in a write transaction
134
+ _x_locked_tbl_ids: set[UUID] # non-empty for write transactions
135
+
136
+ # cached column dependencies
137
+ # - key: table id, value: mapping from column id to its dependencies
138
+ # - only maintained for dependencies between non-snapshot table versions
139
+ # - can contain stale entries (stemming from invalidated TV instances)
140
+ _column_dependencies: dict[UUID, dict[QColumnId, set[QColumnId]]]
141
+
142
+ # column dependents are recomputed at the beginning of every write transaction and only reflect the locked tree
143
+ _column_dependents: Optional[dict[QColumnId, set[QColumnId]]]
102
144
 
103
145
  @classmethod
104
146
  def get(cls) -> Catalog:
@@ -109,22 +151,291 @@ class Catalog:
109
151
  @classmethod
110
152
  def clear(cls) -> None:
111
153
  """Remove the instance. Used for testing."""
154
+ # invalidate all existing instances to force reloading of metadata
155
+ for tbl_version in cls._instance._tbl_versions.values():
156
+ # _logger.debug(
157
+ # f'Invalidating table version {tbl_version.id}:{tbl_version.effective_version} ({id(tbl_version):x})'
158
+ # )
159
+ tbl_version.is_validated = False
112
160
  cls._instance = None
113
161
 
114
162
  def __init__(self) -> None:
115
163
  self._tbl_versions = {}
116
164
  self._tbls = {} # don't use a defaultdict here, it doesn't cooperate with the debugger
165
+ self._in_write_xact = False
166
+ self._x_locked_tbl_ids = set()
167
+ self._column_dependencies = {}
168
+ self._column_dependents = None
117
169
  self._init_store()
118
170
 
119
- @classmethod
120
- def _lock_dir(cls, parent_id: Optional[UUID], dir_id: Optional[UUID], dir_name: Optional[str]) -> None:
121
- """Update directory record(s) to sequentialize thread access. Lock is released when transaction commits.
171
+ def _dropped_tbl_error_msg(self, tbl_id: UUID) -> str:
172
+ return f'Table was dropped (no record found for {tbl_id})'
173
+
174
+ def validate(self) -> None:
175
+ """Validate structural consistency of cached metadata"""
176
+ for (tbl_id, effective_version), tbl_version in self._tbl_versions.items():
177
+ assert tbl_id == tbl_version.id, f'{tbl_id} != {tbl_version.id}'
178
+ assert tbl_version.effective_version == tbl_version.version or tbl_version.effective_version is None, (
179
+ f'{tbl_version.effective_version} != {tbl_version.version} for id {tbl_id}'
180
+ )
181
+ assert effective_version == tbl_version.effective_version, (
182
+ f'{effective_version} != {tbl_version.effective_version} for id {tbl_id}'
183
+ )
184
+ assert len(tbl_version.mutable_views) == 0 or tbl_version.is_mutable, (
185
+ f'snapshot_id={tbl_version.id} mutable_views={tbl_version.mutable_views}'
186
+ )
187
+
188
+ if tbl_version.is_view and tbl_version.is_mutable and tbl_version.is_validated:
189
+ # make sure this mutable view is recorded in a mutable base
190
+ base = tbl_version.base
191
+ assert base is not None
192
+ if base.effective_version is None:
193
+ assert (base.id, None) in self._tbl_versions
194
+ base_tv = self._tbl_versions[base.id, None]
195
+ if not base_tv.is_validated:
196
+ continue
197
+ mutable_view_ids = ', '.join(str(tv.id) for tv in self._tbl_versions[base.id, None].mutable_views)
198
+ mutable_view_names = ', '.join(
199
+ tv._tbl_version.name
200
+ for tv in self._tbl_versions[base.id, None].mutable_views
201
+ if tv._tbl_version is not None
202
+ )
203
+ assert TableVersionHandle.create(tbl_version) in self._tbl_versions[base.id, None].mutable_views, (
204
+ f'{tbl_version.name} ({tbl_version.id}) missing in {mutable_view_ids} ({mutable_view_names})'
205
+ )
206
+
207
+ if len(tbl_version.mutable_views) > 0:
208
+ # make sure we also loaded mutable view metadata, which is needed to detect column dependencies
209
+ for v in tbl_version.mutable_views:
210
+ assert v.effective_version is None, f'{v.id}:{v.effective_version}'
211
+
212
+ @contextmanager
213
+ def begin_xact(
214
+ self,
215
+ *,
216
+ tbl: Optional[TableVersionPath] = None,
217
+ for_write: bool = False,
218
+ lock_mutable_tree: bool = False,
219
+ convert_db_excs: bool = True,
220
+ ) -> Iterator[sql.Connection]:
221
+ """
222
+ Return a context manager that yields a connection to the database. Idempotent.
223
+
224
+ It is mandatory to call this method, not Env.begin_xact(), if the transaction accesses any table data
225
+ or metadata.
226
+
227
+ If tbl != None, follows this locking protocol:
228
+ - validates/reloads the TableVersion instances of tbl's ancestors (in the hope that this reduces potential
229
+ SerializationErrors later on)
230
+ - if for_write == True, x-locks Table record (by updating Table.lock_dummy; see _acquire_tbl_xlock())
231
+ - if for_write == False, validates TableVersion instance
232
+ - if lock_mutable_tree == True, also x-locks all mutable views of the table
233
+ - this needs to be done in a retry loop, because Postgres can decide to abort the transaction
234
+ (SerializationFailure, LockNotAvailable)
235
+ - for that reason, we do all lock acquisition prior to doing any real work (eg, compute column values),
236
+ to minimize the probability of losing that work due to a forced abort
237
+
238
+ If convert_db_excs == True, converts DBAPIErrors into excs.Errors.
239
+ """
240
+ if Env.get().in_xact:
241
+ if tbl is not None and for_write:
242
+ # make sure that we requested the required table lock at the beginning of the transaction
243
+ assert tbl.tbl_id in self._x_locked_tbl_ids, f'{tbl.tbl_id} not in {self._x_locked_tbl_ids}'
244
+ yield Env.get().conn
245
+ return
246
+
247
+ # tv_msg = '\n'.join(
248
+ # [
249
+ # f'{tv.id}:{tv.effective_version} : tv={id(tv):x} sa_tbl={id(tv.store_tbl.sa_tbl):x}'
250
+ # for tv in self._tbl_versions.values()
251
+ # ]
252
+ # )
253
+ # _logger.debug(f'begin_xact(): {tv_msg}')
254
+ num_retries = 0
255
+ while True:
256
+ try:
257
+ self._in_write_xact = False
258
+ self._x_locked_tbl_ids = set()
259
+ self._column_dependents = None
260
+
261
+ with Env.get().begin_xact() as conn:
262
+ if tbl is not None:
263
+ try:
264
+ if not self._acquire_path_locks(
265
+ tbl=tbl, for_write=for_write, lock_mutable_tree=lock_mutable_tree
266
+ ):
267
+ # this is a snapshot
268
+ yield conn
269
+ return
270
+
271
+ if for_write:
272
+ if lock_mutable_tree:
273
+ self._x_locked_tbl_ids = self._get_mutable_tree(tbl.tbl_id)
274
+ self._compute_column_dependents(self._x_locked_tbl_ids)
275
+ else:
276
+ self._x_locked_tbl_ids = {tbl.tbl_id}
277
+ if _logger.isEnabledFor(logging.DEBUG):
278
+ # validate only when we don't see errors
279
+ self.validate()
280
+
281
+ except sql.exc.DBAPIError as e:
282
+ if isinstance(
283
+ e.orig, (psycopg.errors.SerializationFailure, psycopg.errors.LockNotAvailable)
284
+ ) and (num_retries < _MAX_RETRIES or _MAX_RETRIES == -1):
285
+ num_retries += 1
286
+ _logger.debug(f'Retrying ({num_retries}) after {type(e.orig)}')
287
+ time.sleep(random.uniform(0.1, 0.5))
288
+ continue
289
+ else:
290
+ raise
291
+
292
+ self._in_write_xact = for_write
293
+ yield conn
294
+ return
295
+
296
+ except sql.exc.DBAPIError as e:
297
+ # we got some db error during the actual operation (not just while trying to get locks on the metadata
298
+ # records): we convert these into Errors, if asked to do so, and abort
299
+ # TODO: what other concurrency-related exceptions should we expect?
300
+
301
+ # we always convert UndefinedTable exceptions (they can't be retried)
302
+ if isinstance(e.orig, psycopg.errors.UndefinedTable):
303
+ # the table got dropped in the middle of the table operation
304
+ _logger.debug(f'Exception: undefined table ({tbl.tbl_name()}): Caught {type(e.orig)}: {e!r}')
305
+ assert tbl is not None
306
+ raise excs.Error(f'Table was dropped: {tbl.tbl_name()}') from None
307
+ elif isinstance(e.orig, psycopg.errors.SerializationFailure) and convert_db_excs:
308
+ # we still got a serialization error, despite getting x-locks at the beginning
309
+ msg = f'{tbl.tbl_name()} ({tbl.tbl_id})' if tbl is not None else ''
310
+ _logger.debug(f'Exception: serialization failure: {msg} ({e})')
311
+ raise excs.Error('Serialization failure. Please re-run the operation.') from None
312
+ else:
313
+ raise
314
+
315
+ finally:
316
+ self._in_write_xact = False
317
+ self._x_locked_tbl_ids = set()
318
+ self._column_dependents = None
319
+
320
+ # invalidate cached current TableVersion instances
321
+ for tv in self._tbl_versions.values():
322
+ if tv.effective_version is None:
323
+ _logger.debug(f'invalidating table version {tv.id}:None (tv={id(tv):x})')
324
+ tv.is_validated = False
325
+
326
+ @property
327
+ def in_write_xact(self) -> bool:
328
+ return self._in_write_xact
329
+
330
+ def _acquire_path_locks(
331
+ self, *, tbl: TableVersionPath, for_write: bool = False, lock_mutable_tree: bool = False
332
+ ) -> bool:
333
+ """
334
+ Path locking protocol:
335
+ - refresh cached TableVersions of ancestors (we need those even during inserts, for computed columns that
336
+ reference the base tables)
337
+ - refresh cached TableVersion of tbl or get X-lock, depending on for_write
338
+ - if lock_mutable_tree, also X-lock all mutable views of tbl
339
+
340
+ Returns False if trying to lock a pure snapshot with for_write == True
341
+ Raises Error if tbl doesn't exist.
342
+ """
343
+ start_idx = 1 if for_write else 0
344
+ for handle in tbl.get_tbl_versions()[start_idx::-1]:
345
+ _ = self.get_tbl_version(handle.id, handle.effective_version)
346
+ if not for_write:
347
+ return True # nothing left to lock
348
+ return self._acquire_tbl_xlock(tbl_id=tbl.tbl_id, lock_mutable_tree=lock_mutable_tree, raise_if_not_exists=True)
349
+
350
+ def _acquire_tbl_xlock(
351
+ self,
352
+ *,
353
+ tbl_id: Optional[UUID] = None,
354
+ dir_id: Optional[UUID] = None,
355
+ tbl_name: Optional[str] = None,
356
+ lock_mutable_tree: bool = False,
357
+ raise_if_not_exists: bool = False,
358
+ ) -> bool:
359
+ """Force acquisition of an X-lock on a Table record via a blind update
360
+
361
+ Either tbl_id or dir_id/tbl_name needs to be specified.
362
+ Returns True if the table was locked, False if it was a snapshot or not found.
363
+ If lock_mutable_tree, recursively locks all mutable views of the table.
364
+
365
+ Returns False if the table is a snapshot or not found and !raise_if_not_exists.
366
+ """
367
+ where_clause: sql.ColumnElement
368
+ if tbl_id is not None:
369
+ where_clause = schema.Table.id == tbl_id
370
+ else:
371
+ where_clause = sql.and_(schema.Table.dir_id == dir_id, schema.Table.md['name'].astext == tbl_name)
372
+ user = Env.get().user
373
+ if user is not None:
374
+ where_clause = sql.and_(where_clause, schema.Table.md['user'].astext == Env.get().user)
375
+
376
+ conn = Env.get().conn
377
+ row = conn.execute(sql.select(schema.Table).where(where_clause).with_for_update(nowait=True)).one_or_none()
378
+ if row is None:
379
+ if raise_if_not_exists:
380
+ raise excs.Error(self._dropped_tbl_error_msg(tbl_id))
381
+ return False # nothing to lock
382
+ if row.md['view_md'] is not None and row.md['view_md']['is_snapshot']:
383
+ return False # nothing to lock
384
+ conn.execute(sql.update(schema.Table).values(lock_dummy=1).where(where_clause))
385
+
386
+ if not lock_mutable_tree:
387
+ return True
388
+ # also lock mutable views
389
+ tv = self.get_tbl_version(tbl_id, None)
390
+ for view in tv.mutable_views:
391
+ self._acquire_tbl_xlock(tbl_id=view.id, lock_mutable_tree=True, raise_if_not_exists=raise_if_not_exists)
392
+ return True
393
+
394
+ def _get_mutable_tree(self, tbl_id: UUID) -> set[UUID]:
395
+ """Returns ids of all tables that form the tree of mutable views starting at tbl_id; includes the root."""
396
+ tv = self.get_tbl_version(tbl_id, None)
397
+ result: set[UUID] = {tv.id}
398
+ for view in tv.mutable_views:
399
+ result.update(self._get_mutable_tree(view.id))
400
+ return result
401
+
402
+ def _compute_column_dependents(self, mutable_tree: set[UUID]) -> None:
403
+ """Populate self._column_dependents for all tables in mutable_tree"""
404
+ assert self._column_dependents is None
405
+ self._column_dependents = defaultdict(set)
406
+ for tbl_id in mutable_tree:
407
+ assert tbl_id in self._column_dependencies
408
+ for col, dependencies in self._column_dependencies[tbl_id].items():
409
+ for dependency in dependencies:
410
+ if dependency.tbl_id not in mutable_tree:
411
+ continue
412
+ dependents = self._column_dependents[dependency]
413
+ dependents.add(col)
414
+
415
+ def get_column_dependents(self, tbl_id: UUID, col_id: int) -> set[Column]:
416
+ """Return all Columns that transitively depend on the given column."""
417
+ assert self._column_dependents is not None
418
+ dependents = self._column_dependents[QColumnId(tbl_id, col_id)]
419
+ result: set[Column] = set()
420
+ for dependent in dependents:
421
+ tv = self.get_tbl_version(dependent.tbl_id, None)
422
+ col = tv.cols_by_id[dependent.col_id]
423
+ result.add(col)
424
+ return result
425
+
426
+ def _acquire_dir_xlock(
427
+ self, *, parent_id: Optional[UUID] = None, dir_id: Optional[UUID] = None, dir_name: Optional[str] = None
428
+ ) -> None:
429
+ """Force acquisition of an X-lock on a Dir record via a blind update.
430
+
122
431
  If dir_id is present, then all other conditions are ignored.
123
432
  Note that (parent_id==None) is a valid where condition.
124
433
  If dir_id is not specified, the user from the environment is added to the directory filters.
125
434
  """
435
+ assert (dir_name is None) != (dir_id is None)
436
+ assert not (parent_id is not None and dir_name is None)
126
437
  user = Env.get().user
127
- conn = Env.get().conn
438
+ assert self._in_write_xact
128
439
  q = sql.update(schema.Dir).values(lock_dummy=1)
129
440
  if dir_id is not None:
130
441
  q = q.where(schema.Dir.id == dir_id)
@@ -134,7 +445,7 @@ class Catalog:
134
445
  q = q.where(schema.Dir.md['name'].astext == dir_name)
135
446
  if user is not None:
136
447
  q = q.where(schema.Dir.md['user'].astext == user)
137
- conn.execute(q)
448
+ Env.get().conn.execute(q)
138
449
 
139
450
  def get_dir_path(self, dir_id: UUID) -> Path:
140
451
  """Return path for directory with given id"""
@@ -156,7 +467,7 @@ class Catalog:
156
467
  dir_entries: dict[str, Catalog.DirEntry]
157
468
  table: Optional[schema.Table]
158
469
 
159
- @_retry_loop
470
+ @_retry_loop(for_write=False)
160
471
  def get_dir_contents(self, dir_path: Path, recursive: bool = False) -> dict[str, DirEntry]:
161
472
  dir = self._get_schema_object(dir_path, expected=Dir, raise_if_not_exists=True)
162
473
  return self._get_dir_contents(dir._id, recursive=recursive)
@@ -183,7 +494,7 @@ class Catalog:
183
494
 
184
495
  return result
185
496
 
186
- @_retry_loop
497
+ @_retry_loop(for_write=True)
187
498
  def move(self, path: Path, new_path: Path) -> None:
188
499
  self._move(path, new_path)
189
500
 
@@ -236,7 +547,7 @@ class Catalog:
236
547
  add_dir: Optional[schema.Dir] = None
237
548
  drop_dir: Optional[schema.Dir] = None
238
549
  for p in sorted(dir_paths):
239
- dir = self._get_dir(p, for_update=True)
550
+ dir = self._get_dir(p, lock_dir=True)
240
551
  if dir is None:
241
552
  raise excs.Error(f'Directory {str(p)!r} does not exist.')
242
553
  if p == add_dir_path:
@@ -246,7 +557,7 @@ class Catalog:
246
557
 
247
558
  add_obj: Optional[SchemaObject] = None
248
559
  if add_dir is not None:
249
- add_obj = self._get_dir_entry(add_dir.id, add_name, for_update=True)
560
+ add_obj = self._get_dir_entry(add_dir.id, add_name, lock_entry=True)
250
561
  if add_obj is not None and raise_if_exists:
251
562
  add_path = add_dir_path.append(add_name)
252
563
  raise excs.Error(f'Path {str(add_path)!r} already exists.')
@@ -254,7 +565,7 @@ class Catalog:
254
565
  drop_obj: Optional[SchemaObject] = None
255
566
  if drop_dir is not None:
256
567
  drop_path = drop_dir_path.append(drop_name)
257
- drop_obj = self._get_dir_entry(drop_dir.id, drop_name, for_update=True)
568
+ drop_obj = self._get_dir_entry(drop_dir.id, drop_name, lock_entry=True)
258
569
  if drop_obj is None and raise_if_not_exists:
259
570
  raise excs.Error(f'Path {str(drop_path)!r} does not exist.')
260
571
  if drop_obj is not None and drop_expected is not None and not isinstance(drop_obj, drop_expected):
@@ -266,13 +577,13 @@ class Catalog:
266
577
  add_dir_obj = Dir(add_dir.id, add_dir.parent_id, add_dir.md['name']) if add_dir is not None else None
267
578
  return add_obj, add_dir_obj, drop_obj
268
579
 
269
- def _get_dir_entry(self, dir_id: UUID, name: str, for_update: bool = False) -> Optional[SchemaObject]:
580
+ def _get_dir_entry(self, dir_id: UUID, name: str, lock_entry: bool = False) -> Optional[SchemaObject]:
270
581
  user = Env.get().user
271
582
  conn = Env.get().conn
272
583
 
273
584
  # check for subdirectory
274
- if for_update:
275
- self._lock_dir(dir_id, None, name)
585
+ if lock_entry:
586
+ self._acquire_dir_xlock(parent_id=dir_id, dir_id=None, dir_name=name)
276
587
  q = sql.select(schema.Dir).where(
277
588
  schema.Dir.parent_id == dir_id, schema.Dir.md['name'].astext == name, schema.Dir.md['user'].astext == user
278
589
  )
@@ -286,17 +597,17 @@ class Catalog:
286
597
  return Dir(dir_record.id, dir_record.parent_id, name)
287
598
 
288
599
  # check for table
600
+ if lock_entry:
601
+ self._acquire_tbl_xlock(dir_id=dir_id, tbl_name=name)
289
602
  q = sql.select(schema.Table.id).where(
290
603
  schema.Table.dir_id == dir_id,
291
604
  schema.Table.md['name'].astext == name,
292
605
  schema.Table.md['user'].astext == user,
293
606
  )
294
- if for_update:
295
- q = q.with_for_update()
296
607
  tbl_id = conn.execute(q).scalar_one_or_none()
297
608
  if tbl_id is not None:
298
609
  if tbl_id not in self._tbls:
299
- self._tbls[tbl_id] = self._load_tbl(tbl_id)
610
+ _ = self._load_tbl(tbl_id)
300
611
  return self._tbls[tbl_id]
301
612
 
302
613
  return None
@@ -307,7 +618,8 @@ class Catalog:
307
618
  expected: Optional[type[SchemaObject]] = None,
308
619
  raise_if_exists: bool = False,
309
620
  raise_if_not_exists: bool = False,
310
- for_update: bool = False,
621
+ lock_parent: bool = False,
622
+ lock_obj: bool = False,
311
623
  ) -> Optional[SchemaObject]:
312
624
  """Return the schema object at the given path, or None if it doesn't exist.
313
625
 
@@ -323,16 +635,16 @@ class Catalog:
323
635
  raise excs.Error(
324
636
  f'{str(path)!r} needs to be a {expected._display_name()} but is a {Dir._display_name()}'
325
637
  )
326
- dir = self._get_dir(path, for_update=for_update)
638
+ dir = self._get_dir(path, lock_dir=lock_obj)
327
639
  if dir is None:
328
640
  raise excs.Error(f'Unknown user: {Env.get().user}')
329
641
  return Dir(dir.id, dir.parent_id, dir.md['name'])
330
642
 
331
643
  parent_path = path.parent
332
- parent_dir = self._get_dir(parent_path, for_update=False)
644
+ parent_dir = self._get_dir(parent_path, lock_dir=lock_parent)
333
645
  if parent_dir is None:
334
646
  raise excs.Error(f'Directory {str(parent_path)!r} does not exist.')
335
- obj = self._get_dir_entry(parent_dir.id, path.name, for_update=for_update)
647
+ obj = self._get_dir_entry(parent_dir.id, path.name, lock_entry=lock_obj)
336
648
 
337
649
  if obj is None and raise_if_not_exists:
338
650
  raise excs.Error(f'Path {str(path)!r} does not exist.')
@@ -349,10 +661,15 @@ class Catalog:
349
661
  tbl = self._load_tbl(tbl_id)
350
662
  if tbl is None:
351
663
  return None
352
- self._tbls[tbl_id] = tbl
664
+ # # if this is a mutable table, we also need to have its mutable views loaded, in order to track column
665
+ # # dependencies
666
+ # tbl_version = tbl._tbl_version.get()
667
+ # if tbl_version.is_mutable:
668
+ # for v in tbl_version.mutable_views:
669
+ # _ = self.get_table_by_id(v.id)
353
670
  return self._tbls[tbl_id]
354
671
 
355
- @_retry_loop
672
+ @_retry_loop(for_write=True)
356
673
  def create_table(
357
674
  self,
358
675
  path: Path,
@@ -385,13 +702,14 @@ class Catalog:
385
702
  self._tbls[tbl._id] = tbl
386
703
  return tbl
387
704
 
388
- @_retry_loop
705
+ @_retry_loop(for_write=True)
389
706
  def create_view(
390
707
  self,
391
708
  path: Path,
392
709
  base: TableVersionPath,
393
710
  select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
394
711
  where: Optional[exprs.Expr],
712
+ sample_clause: Optional['SampleClause'],
395
713
  additional_columns: Optional[dict[str, Any]],
396
714
  is_snapshot: bool,
397
715
  iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]],
@@ -402,6 +720,18 @@ class Catalog:
402
720
  ) -> Table:
403
721
  from pixeltable.utils.filecache import FileCache
404
722
 
723
+ if not is_snapshot and not base.is_snapshot():
724
+ # this is a mutable view of a mutable base; X-lock the base and advance its view_sn before adding the view
725
+ self._acquire_tbl_xlock(tbl_id=base.tbl_id)
726
+ base_tv = self.get_tbl_version(base.tbl_id, None)
727
+ base_tv.tbl_md.view_sn += 1
728
+ result = Env.get().conn.execute(
729
+ sql.update(schema.Table)
730
+ .values({schema.Table.md: dataclasses.asdict(base_tv.tbl_md)})
731
+ .where(schema.Table.id == base.tbl_id)
732
+ )
733
+ assert result.rowcount == 1, result.rowcount
734
+
405
735
  existing = self._handle_path_collision(path, View, is_snapshot, if_exists)
406
736
  if existing is not None:
407
737
  assert isinstance(existing, View)
@@ -420,6 +750,7 @@ class Catalog:
420
750
  select_list=select_list,
421
751
  additional_columns=additional_columns,
422
752
  predicate=where,
753
+ sample_clause=sample_clause,
423
754
  is_snapshot=is_snapshot,
424
755
  iterator_cls=iterator_class,
425
756
  iterator_args=iterator_args,
@@ -431,14 +762,17 @@ class Catalog:
431
762
  self._tbls[view._id] = view
432
763
  return view
433
764
 
434
- @_retry_loop
765
+ @_retry_loop(for_write=True)
435
766
  def create_replica(
436
767
  self, path: Path, md: list[schema.FullTableMd], if_exists: IfExistsParam = IfExistsParam.ERROR
437
- ) -> Table:
768
+ ) -> None:
438
769
  """
439
770
  Creates table, table_version, and table_schema_version records for a replica with the given metadata.
440
771
  The metadata should be presented in standard "ancestor order", with the table being replicated at
441
772
  list position 0 and the (root) base table at list position -1.
773
+
774
+ TODO: create_replica() also needs to create the store tables and populate them in order to make
775
+ replica creation atomic.
442
776
  """
443
777
  tbl_id = UUID(md[0].tbl_md.tbl_id)
444
778
 
@@ -451,20 +785,19 @@ class Catalog:
451
785
  'but a different table already exists at that location.'
452
786
  )
453
787
  assert isinstance(existing, View)
454
- return existing
788
+ return
455
789
 
456
790
  # Ensure that the system directory exists.
457
791
  self._create_dir(Path('_system', allow_system_paths=True), if_exists=IfExistsParam.IGNORE, parents=False)
458
792
 
459
793
  # Now check to see if this table already exists in the catalog.
460
- # TODO: Handle concurrency in create_replica()
461
794
  existing = Catalog.get().get_table_by_id(tbl_id)
462
795
  if existing is not None:
463
- existing_path = Path(existing._path, allow_system_paths=True)
796
+ existing_path = Path(existing._path(), allow_system_paths=True)
464
797
  # It does exist. If it's a non-system table, that's an error: it's already been replicated.
465
798
  if not existing_path.is_system_path:
466
799
  raise excs.Error(
467
- f'That table has already been replicated as {existing._path!r}. \n'
800
+ f'That table has already been replicated as {existing._path()!r}. \n'
468
801
  f'Drop the existing replica if you wish to re-create it.'
469
802
  )
470
803
  # If it's a system table, then this means it was created at some point as the ancestor of some other
@@ -489,22 +822,20 @@ class Catalog:
489
822
  # The table already exists in the catalog. The existing path might be a system path (if the table
490
823
  # was created as an anonymous base table of some other table), or it might not (if it's a snapshot
491
824
  # that was directly replicated by the user at some point). In either case, use the existing path.
492
- replica_path = Path(replica._path, allow_system_paths=True)
825
+ replica_path = Path(replica._path(), allow_system_paths=True)
493
826
 
494
827
  # Store the metadata; it could be a new version (in which case a new record will be created) or a
495
828
  # known version (in which case the newly received metadata will be validated as identical).
496
829
  self.__store_replica_md(replica_path, ancestor_md)
497
830
 
498
- # Update the catalog (as a final step, after all DB operations completed successfully).
499
- # Only the table being replicated is actually made visible in the catalog.
500
- self._tbls[tbl_id] = self._load_tbl(tbl_id)
501
- return self._tbls[tbl_id]
831
+ # don't create TableVersion instances at this point, they would be superseded by calls to TV.create_replica()
832
+ # in TableRestorer.restore()
502
833
 
503
834
  def __store_replica_md(self, path: Path, md: schema.FullTableMd) -> None:
504
835
  _logger.info(f'Creating replica table at {path!r} with ID: {md.tbl_md.tbl_id}')
505
- # TODO: Handle concurrency
506
836
  dir = self._get_schema_object(path.parent, expected=Dir, raise_if_not_exists=True)
507
837
  assert dir is not None
838
+ assert self._in_write_xact
508
839
 
509
840
  conn = Env.get().conn
510
841
  tbl_id = md.tbl_md.tbl_id
@@ -582,26 +913,33 @@ class Catalog:
582
913
 
583
914
  self.store_tbl_md(UUID(tbl_id), new_tbl_md, new_version_md, new_schema_version_md)
584
915
 
585
- @_retry_loop
916
+ @_retry_loop(for_write=False)
586
917
  def get_table(self, path: Path) -> Table:
587
918
  obj = Catalog.get()._get_schema_object(path, expected=Table, raise_if_not_exists=True)
588
919
  assert isinstance(obj, Table)
589
- obj._tbl_version.get().ensure_md_loaded()
590
920
  return obj
591
921
 
592
- @_retry_loop
922
+ @_retry_loop(for_write=True)
593
923
  def drop_table(self, path: Path, if_not_exists: IfNotExistsParam, force: bool) -> None:
594
- _, _, src_obj = self._prepare_dir_op(
595
- drop_dir_path=path.parent,
596
- drop_name=path.name,
597
- drop_expected=Table,
924
+ tbl = self._get_schema_object(
925
+ path,
926
+ expected=Table,
598
927
  raise_if_not_exists=if_not_exists == IfNotExistsParam.ERROR and not force,
928
+ lock_parent=True,
929
+ lock_obj=False,
599
930
  )
600
- if src_obj is None:
931
+ if tbl is None:
601
932
  _logger.info(f'Skipped table {str(path)!r} (does not exist).')
602
933
  return
603
- assert isinstance(src_obj, Table)
604
- self._drop_tbl(src_obj, force=force, is_replace=False)
934
+ assert isinstance(tbl, Table)
935
+
936
+ if isinstance(tbl, View) and tbl._tbl_version_path.is_mutable() and tbl._tbl_version_path.base.is_mutable():
937
+ # this is a mutable view of a mutable base;
938
+ # lock the base before the view, in order to avoid deadlocks with concurrent inserts/updates
939
+ base_id = tbl._tbl_version_path.base.tbl_id
940
+ self._acquire_tbl_xlock(tbl_id=base_id, lock_mutable_tree=False)
941
+
942
+ self._drop_tbl(tbl, force=force, is_replace=False)
605
943
 
606
944
  def _drop_tbl(self, tbl: Table, force: bool, is_replace: bool) -> None:
607
945
  """
@@ -611,8 +949,11 @@ class Catalog:
611
949
  - X-lock base before X-locking any view
612
950
  - deadlock-free wrt to TableVersion.insert() (insert propagation also proceeds top-down)
613
951
  - X-locks parent dir prior to calling TableVersion.drop(): prevent concurrent creation of another SchemaObject
614
- in the same directory with the same name (which could lead to duplicate names if we get rolled back)
952
+ in the same directory with the same name (which could lead to duplicate names if we get aborted)
615
953
  """
954
+ self._acquire_dir_xlock(dir_id=tbl._dir_id)
955
+ self._acquire_tbl_xlock(tbl_id=tbl._id, lock_mutable_tree=False)
956
+
616
957
  view_ids = self.get_view_ids(tbl._id, for_update=True)
617
958
  if len(view_ids) > 0:
618
959
  if not force:
@@ -621,24 +962,46 @@ class Catalog:
621
962
  msg: str
622
963
  if is_replace:
623
964
  msg = (
624
- f'{obj_type_str} {tbl._path} already exists and has dependents. '
965
+ f'{obj_type_str} {tbl._path()} already exists and has dependents. '
625
966
  "Use `if_exists='replace_force'` to replace it."
626
967
  )
627
968
  else:
628
- msg = f'{obj_type_str} {tbl._path} has dependents.'
969
+ msg = f'{obj_type_str} {tbl._path()} has dependents.'
629
970
  raise excs.Error(msg)
630
971
 
631
972
  for view_id in view_ids:
632
973
  view = self.get_table_by_id(view_id)
633
974
  self._drop_tbl(view, force=force, is_replace=is_replace)
634
975
 
635
- _ = self.get_dir(tbl._dir_id, for_update=True) # X-lock the parent directory
636
- tbl._drop()
976
+ # if this is a mutable view of a mutable base, advance the base's view_sn
977
+ if isinstance(tbl, View) and tbl._tbl_version_path.is_mutable() and tbl._tbl_version_path.base.is_mutable():
978
+ base_id = tbl._tbl_version_path.base.tbl_id
979
+ base_tv = self.get_tbl_version(base_id, None)
980
+ base_tv.tbl_md.view_sn += 1
981
+ result = Env.get().conn.execute(
982
+ sql.update(schema.Table.__table__)
983
+ .values({schema.Table.md: dataclasses.asdict(base_tv.tbl_md)})
984
+ .where(schema.Table.id == base_id)
985
+ )
986
+ assert result.rowcount == 1, result.rowcount
987
+
988
+ tv = tbl._tbl_version.get() if tbl._tbl_version is not None else None
989
+ if tv is not None:
990
+ tv = tbl._tbl_version.get()
991
+ # invalidate the TableVersion instance so that existing references to it can find out it has been dropped
992
+ tv.is_validated = False
993
+
994
+ self.delete_tbl_md(tbl._id)
637
995
  assert tbl._id in self._tbls
638
996
  del self._tbls[tbl._id]
639
- _logger.info(f'Dropped table `{tbl._path}`.')
997
+ _logger.info(f'Dropped table `{tbl._path()}`.')
998
+
999
+ if tv is not None:
1000
+ tv.drop()
1001
+ assert (tv.id, tv.effective_version) in self._tbl_versions
1002
+ del self._tbl_versions[tv.id, tv.effective_version]
640
1003
 
641
- @_retry_loop
1004
+ @_retry_loop(for_write=True)
642
1005
  def create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:
643
1006
  return self._create_dir(path, if_exists, parents)
644
1007
 
@@ -673,7 +1036,7 @@ class Catalog:
673
1036
  Env.get().console_logger.info(f'Created directory {str(path)!r}.')
674
1037
  return dir
675
1038
 
676
- @_retry_loop
1039
+ @_retry_loop(for_write=True)
677
1040
  def drop_dir(self, path: Path, if_not_exists: IfNotExistsParam, force: bool) -> None:
678
1041
  _, _, schema_obj = self._prepare_dir_op(
679
1042
  drop_dir_path=path.parent,
@@ -698,7 +1061,7 @@ class Catalog:
698
1061
  raise excs.Error(f'Directory {str(dir_path)!r} is not empty.')
699
1062
 
700
1063
  # drop existing subdirs
701
- self._lock_dir(dir_id, None, None)
1064
+ self._acquire_dir_xlock(dir_id=dir_id)
702
1065
  dir_q = sql.select(schema.Dir).where(schema.Dir.parent_id == dir_id)
703
1066
  for row in conn.execute(dir_q).all():
704
1067
  self._drop_dir(row.id, dir_path.append(row.md['name']), force=True)
@@ -718,6 +1081,11 @@ class Catalog:
718
1081
  def get_view_ids(self, tbl_id: UUID, for_update: bool = False) -> list[UUID]:
719
1082
  """Return the ids of views that directly reference the given table"""
720
1083
  conn = Env.get().conn
1084
+ # check whether this table still exists
1085
+ q = sql.select(sql.func.count()).select_from(schema.Table).where(schema.Table.id == tbl_id)
1086
+ tbl_count = conn.execute(q).scalar()
1087
+ if tbl_count == 0:
1088
+ raise excs.Error(self._dropped_tbl_error_msg(tbl_id))
721
1089
  q = sql.select(schema.Table.id).where(sql.text(f"md->'view_md'->'base_versions'->0->>0 = {tbl_id.hex!r}"))
722
1090
  if for_update:
723
1091
  q = q.with_for_update()
@@ -725,17 +1093,39 @@ class Catalog:
725
1093
  return result
726
1094
 
727
1095
  def get_tbl_version(self, tbl_id: UUID, effective_version: Optional[int]) -> Optional[TableVersion]:
728
- if (tbl_id, effective_version) not in self._tbl_versions:
729
- self._tbl_versions[tbl_id, effective_version] = self._load_tbl_version(tbl_id, effective_version)
730
- return self._tbl_versions[tbl_id, effective_version]
731
-
732
- def add_tbl_version(self, tbl_version: TableVersion) -> None:
733
- """Explicitly add a TableVersion"""
734
- self._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
735
- # if this is a mutable view, also record it in the base
736
- if tbl_version.is_view and tbl_version.effective_version is None:
737
- base = tbl_version.base.get()
738
- base.mutable_views.append(TableVersionHandle(tbl_version.id, tbl_version.effective_version))
1096
+ # we need a transaction here, if we're not already in one; if this starts a new transaction,
1097
+ # the returned TableVersion instance will not be validated
1098
+ with self.begin_xact(for_write=False) as conn:
1099
+ tv = self._tbl_versions.get((tbl_id, effective_version))
1100
+ if tv is None:
1101
+ tv = self._load_tbl_version(tbl_id, effective_version)
1102
+ elif not tv.is_validated:
1103
+ # only live instances are invalidated
1104
+ assert effective_version is None
1105
+ # we validate live instances by comparing our cached TableMd.current_version/view_sn to what's stored
1106
+ # _logger.debug(f'validating metadata for table {tbl_id}:{tv.version} ({id(tv):x})')
1107
+ q = sql.select(schema.Table.md).where(schema.Table.id == tbl_id)
1108
+ row = conn.execute(q).one_or_none()
1109
+ if row is None:
1110
+ raise excs.Error(self._dropped_tbl_error_msg(tbl_id))
1111
+ current_version, view_sn = row.md['current_version'], row.md['view_sn']
1112
+
1113
+ # the stored version can be behind TableVersion.version, because we don't roll back the in-memory
1114
+ # metadata changes after a failed update operation
1115
+ if current_version != tv.version or view_sn != tv.tbl_md.view_sn:
1116
+ # the cached metadata is invalid
1117
+ _logger.debug(
1118
+ f'reloading metadata for table {tbl_id} '
1119
+ f'(cached/current version: {tv.version}/{current_version}, '
1120
+ f'cached/current view_sn: {tv.tbl_md.view_sn}/{view_sn})'
1121
+ )
1122
+ tv = self._load_tbl_version(tbl_id, None)
1123
+ else:
1124
+ # the cached metadata is valid
1125
+ tv.is_validated = True
1126
+
1127
+ assert tv.is_validated
1128
+ return tv
739
1129
 
740
1130
  def remove_tbl_version(self, tbl_version: TableVersion) -> None:
741
1131
  assert (tbl_version.id, tbl_version.effective_version) in self._tbl_versions
@@ -745,7 +1135,7 @@ class Catalog:
745
1135
  """Return the Dir with the given id, or None if it doesn't exist"""
746
1136
  conn = Env.get().conn
747
1137
  if for_update:
748
- self._lock_dir(None, dir_id, None)
1138
+ self._acquire_dir_xlock(dir_id=dir_id)
749
1139
  q = sql.select(schema.Dir).where(schema.Dir.id == dir_id)
750
1140
  row = conn.execute(q).one_or_none()
751
1141
  if row is None:
@@ -753,24 +1143,24 @@ class Catalog:
753
1143
  dir_record = schema.Dir(**row._mapping)
754
1144
  return Dir(dir_record.id, dir_record.parent_id, dir_record.md['name'])
755
1145
 
756
- def _get_dir(self, path: Path, for_update: bool = False) -> Optional[schema.Dir]:
1146
+ def _get_dir(self, path: Path, lock_dir: bool = False) -> Optional[schema.Dir]:
757
1147
  """
758
- Locking protocol: X locks on all ancestors
1148
+ lock_dir: if True, X-locks target (but not the ancestors)
759
1149
  """
760
1150
  user = Env.get().user
761
1151
  conn = Env.get().conn
762
1152
  if path.is_root:
763
- if for_update:
764
- self._lock_dir(parent_id=None, dir_id=None, dir_name='')
1153
+ if lock_dir:
1154
+ self._acquire_dir_xlock(dir_name='')
765
1155
  q = sql.select(schema.Dir).where(schema.Dir.parent_id.is_(None), schema.Dir.md['user'].astext == user)
766
1156
  row = conn.execute(q).one_or_none()
767
1157
  return schema.Dir(**row._mapping) if row is not None else None
768
1158
  else:
769
- parent_dir = self._get_dir(path.parent, for_update=False)
1159
+ parent_dir = self._get_dir(path.parent, lock_dir=False)
770
1160
  if parent_dir is None:
771
1161
  return None
772
- if for_update:
773
- self._lock_dir(parent_id=parent_dir.id, dir_id=None, dir_name=path.name)
1162
+ if lock_dir:
1163
+ self._acquire_dir_xlock(parent_id=parent_dir.id, dir_name=path.name)
774
1164
  q = sql.select(schema.Dir).where(
775
1165
  schema.Dir.parent_id == parent_dir.id,
776
1166
  schema.Dir.md['name'].astext == path.name,
@@ -780,6 +1170,7 @@ class Catalog:
780
1170
  return schema.Dir(**row._mapping) if row is not None else None
781
1171
 
782
1172
  def _load_tbl(self, tbl_id: UUID) -> Optional[Table]:
1173
+ """Loads metadata for the table with the given id and caches it."""
783
1174
  _logger.info(f'Loading table {tbl_id}')
784
1175
  from .insertable_table import InsertableTable
785
1176
  from .view import View
@@ -808,8 +1199,9 @@ class Catalog:
808
1199
  if view_md is None:
809
1200
  # this is a base table
810
1201
  if (tbl_id, None) not in self._tbl_versions:
811
- self._tbl_versions[tbl_id, None] = self._load_tbl_version(tbl_id, None)
1202
+ _ = self._load_tbl_version(tbl_id, None)
812
1203
  tbl = InsertableTable(tbl_record.dir_id, TableVersionHandle(tbl_id, None))
1204
+ self._tbls[tbl_id] = tbl
813
1205
  return tbl
814
1206
 
815
1207
  # this is a view; determine the sequence of TableVersions to load
@@ -829,18 +1221,18 @@ class Catalog:
829
1221
  view_path: Optional[TableVersionPath] = None
830
1222
  for id, effective_version in tbl_version_path[::-1]:
831
1223
  if (id, effective_version) not in self._tbl_versions:
832
- self._tbl_versions[id, effective_version] = self._load_tbl_version(id, effective_version)
1224
+ _ = self._load_tbl_version(id, effective_version)
833
1225
  view_path = TableVersionPath(TableVersionHandle(id, effective_version), base=base_path)
834
1226
  base_path = view_path
835
1227
  view = View(tbl_id, tbl_record.dir_id, tbl_md.name, view_path, snapshot_only=pure_snapshot)
836
- # TODO: also load mutable views
1228
+ self._tbls[tbl_id] = view
837
1229
  return view
838
1230
 
839
1231
  def load_tbl_md(self, tbl_id: UUID, effective_version: Optional[int]) -> schema.FullTableMd:
840
1232
  """
841
1233
  Loads metadata from the store for a given table UUID and version.
842
1234
  """
843
- _logger.info(f'Loading metadata for table version: {tbl_id}:{effective_version}')
1235
+ # _logger.info(f'Loading metadata for table version: {tbl_id}:{effective_version}')
844
1236
  conn = Env.get().conn
845
1237
 
846
1238
  q = (
@@ -890,7 +1282,8 @@ class Catalog:
890
1282
  )
891
1283
 
892
1284
  row = conn.execute(q).one_or_none()
893
- assert row is not None, f'Table record not found: {tbl_id}:{effective_version}'
1285
+ if row is None:
1286
+ raise excs.Error(self._dropped_tbl_error_msg(tbl_id))
894
1287
  tbl_record, version_record, schema_version_record = _unpack_row(
895
1288
  row, [schema.Table, schema.TableVersion, schema.TableSchemaVersion]
896
1289
  )
@@ -915,8 +1308,15 @@ class Catalog:
915
1308
  If inserting `version_md` or `schema_version_md` would be a primary key violation, an exception will be raised.
916
1309
  """
917
1310
  conn = Env.get().conn
1311
+ assert self._in_write_xact
918
1312
 
919
1313
  if tbl_md is not None:
1314
+ assert tbl_md.tbl_id == str(tbl_id)
1315
+ if version_md is not None:
1316
+ assert tbl_md.current_version == version_md.version
1317
+ assert tbl_md.current_schema_version == version_md.schema_version
1318
+ if schema_version_md is not None:
1319
+ assert tbl_md.current_schema_version == schema_version_md.schema_version
920
1320
  result = conn.execute(
921
1321
  sql.update(schema.Table.__table__)
922
1322
  .values({schema.Table.md: dataclasses.asdict(tbl_md)})
@@ -925,6 +1325,9 @@ class Catalog:
925
1325
  assert result.rowcount == 1, result.rowcount
926
1326
 
927
1327
  if version_md is not None:
1328
+ assert version_md.tbl_id == str(tbl_id)
1329
+ if schema_version_md is not None:
1330
+ assert version_md.schema_version == schema_version_md.schema_version
928
1331
  conn.execute(
929
1332
  sql.insert(schema.TableVersion.__table__).values(
930
1333
  tbl_id=tbl_id, version=version_md.version, md=dataclasses.asdict(version_md)
@@ -932,6 +1335,7 @@ class Catalog:
932
1335
  )
933
1336
 
934
1337
  if schema_version_md is not None:
1338
+ assert schema_version_md.tbl_id == str(tbl_id)
935
1339
  conn.execute(
936
1340
  sql.insert(schema.TableSchemaVersion.__table__).values(
937
1341
  tbl_id=tbl_id,
@@ -962,7 +1366,7 @@ class Catalog:
962
1366
 
963
1367
  # If `tbl` is a named pure snapshot, we're not quite done, since the snapshot metadata won't appear in the
964
1368
  # TableVersionPath. We need to prepend it separately.
965
- if tbl._id != tbl._tbl_version.id:
1369
+ if isinstance(tbl, View) and tbl._snapshot_only:
966
1370
  snapshot_md = self.load_tbl_md(tbl._id, 0)
967
1371
  md = [snapshot_md, *md]
968
1372
 
@@ -978,52 +1382,73 @@ class Catalog:
978
1382
  return md
979
1383
 
980
1384
  def _load_tbl_version(self, tbl_id: UUID, effective_version: Optional[int]) -> Optional[TableVersion]:
1385
+ """Creates TableVersion instance from stored metadata and registers it in _tbl_versions."""
981
1386
  tbl_md, _, schema_version_md = self.load_tbl_md(tbl_id, effective_version)
982
1387
  view_md = tbl_md.view_md
983
1388
 
984
- _logger.info(f'Loading table version: {tbl_id}:{effective_version}')
985
1389
  conn = Env.get().conn
986
1390
 
987
- # load mutable view ids
988
- q = sql.select(schema.Table.id).where(
989
- sql.text(
990
- f"md->'view_md'->'base_versions'->0->>0 = {tbl_id.hex!r} "
991
- "AND md->'view_md'->'base_versions'->0->1 IS NULL"
1391
+ # load mutable view ids for mutable TableVersions
1392
+ mutable_view_ids: list[UUID] = []
1393
+ # If this is a replica, effective_version should not be None. We see this today, because
1394
+ # the replica's TV instance's Column instances contain value_expr_dicts that reference the live version.
1395
+ # This is presumably a source of bugs, because it ignores schema version changes (eg, column renames).
1396
+ # TODO: retarget the value_expr_dict when instantiating Columns for a particular TV instance.
1397
+ if effective_version is None and not tbl_md.is_replica:
1398
+ q = sql.select(schema.Table.id).where(
1399
+ sql.text(
1400
+ f"md->'view_md'->'base_versions'->0->>0 = {tbl_id.hex!r} "
1401
+ "AND md->'view_md'->'base_versions'->0->>1 IS NULL"
1402
+ )
992
1403
  )
993
- )
994
- mutable_view_ids = [r[0] for r in conn.execute(q).all()]
1404
+ mutable_view_ids = [r[0] for r in conn.execute(q).all()]
995
1405
  mutable_views = [TableVersionHandle(id, None) for id in mutable_view_ids]
996
1406
 
1407
+ tbl_version: TableVersion
997
1408
  if view_md is None:
998
1409
  # this is a base table
999
1410
  tbl_version = TableVersion(
1000
1411
  tbl_id, tbl_md, effective_version, schema_version_md, mutable_views=mutable_views
1001
1412
  )
1002
- return tbl_version
1413
+ else:
1414
+ assert len(view_md.base_versions) > 0 # a view needs to have a base
1415
+ pure_snapshot = view_md.is_snapshot and view_md.predicate is None and len(schema_version_md.columns) == 0
1416
+ assert not pure_snapshot # a pure snapshot doesn't have a physical table backing it, no point in loading it
1417
+
1418
+ base: TableVersionHandle
1419
+ base_path: Optional[TableVersionPath] = None # needed for live view
1420
+ if view_md.is_snapshot:
1421
+ base = TableVersionHandle(UUID(view_md.base_versions[0][0]), view_md.base_versions[0][1])
1422
+ else:
1423
+ base_path = TableVersionPath.from_md(tbl_md.view_md.base_versions)
1424
+ base = base_path.tbl_version
1003
1425
 
1004
- assert len(view_md.base_versions) > 0 # a view needs to have a base
1005
- pure_snapshot = view_md.is_snapshot and view_md.predicate is None and len(schema_version_md.columns) == 0
1006
- assert not pure_snapshot # a pure snapshot doesn't have a physical table backing it, no point in loading it
1426
+ tbl_version = TableVersion(
1427
+ tbl_id,
1428
+ tbl_md,
1429
+ effective_version,
1430
+ schema_version_md,
1431
+ base_path=base_path,
1432
+ base=base,
1433
+ mutable_views=mutable_views,
1434
+ )
1007
1435
 
1008
- base: TableVersionHandle
1009
- base_path: Optional[TableVersionPath] = None # needed for live view
1010
- if view_md.is_snapshot:
1011
- base = TableVersionHandle(UUID(view_md.base_versions[0][0]), view_md.base_versions[0][1])
1012
- else:
1013
- base_path = TableVersionPath.from_md(tbl_md.view_md.base_versions)
1014
- base = base_path.tbl_version
1015
-
1016
- tbl_version = TableVersion(
1017
- tbl_id,
1018
- tbl_md,
1019
- effective_version,
1020
- schema_version_md,
1021
- base_path=base_path,
1022
- base=base,
1023
- mutable_views=mutable_views,
1024
- )
1436
+ self._tbl_versions[tbl_id, effective_version] = tbl_version
1437
+ tbl_version.init()
1025
1438
  return tbl_version
1026
1439
 
1440
+ def record_column_dependencies(self, tbl_version: TableVersion) -> None:
1441
+ """Update self._column_dependencies. Only valid for non-snapshot versions."""
1442
+ from pixeltable.exprs import Expr
1443
+
1444
+ assert not tbl_version.is_snapshot
1445
+ dependencies: dict[QColumnId, set[QColumnId]] = {}
1446
+ for col in tbl_version.cols_by_id.values():
1447
+ if col.value_expr_dict is None:
1448
+ continue
1449
+ dependencies[QColumnId(tbl_version.id, col.id)] = Expr.get_refd_column_ids(col.value_expr_dict)
1450
+ self._column_dependencies[tbl_version.id] = dependencies
1451
+
1027
1452
  def _init_store(self) -> None:
1028
1453
  """One-time initialization of the stored catalog. Idempotent."""
1029
1454
  self.create_user(None)