pixeltable 0.3.15__py3-none-any.whl → 0.4.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (48) hide show
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/catalog.py +292 -105
  3. pixeltable/catalog/column.py +10 -8
  4. pixeltable/catalog/dir.py +1 -2
  5. pixeltable/catalog/insertable_table.py +25 -20
  6. pixeltable/catalog/schema_object.py +3 -6
  7. pixeltable/catalog/table.py +245 -189
  8. pixeltable/catalog/table_version.py +317 -201
  9. pixeltable/catalog/table_version_handle.py +15 -2
  10. pixeltable/catalog/table_version_path.py +60 -14
  11. pixeltable/catalog/view.py +14 -5
  12. pixeltable/dataframe.py +11 -9
  13. pixeltable/env.py +2 -4
  14. pixeltable/exec/in_memory_data_node.py +1 -1
  15. pixeltable/exec/sql_node.py +20 -11
  16. pixeltable/exprs/column_property_ref.py +15 -6
  17. pixeltable/exprs/column_ref.py +32 -11
  18. pixeltable/exprs/comparison.py +1 -1
  19. pixeltable/exprs/row_builder.py +4 -6
  20. pixeltable/exprs/rowid_ref.py +8 -0
  21. pixeltable/exprs/similarity_expr.py +1 -0
  22. pixeltable/func/query_template_function.py +1 -1
  23. pixeltable/functions/string.py +212 -58
  24. pixeltable/globals.py +7 -4
  25. pixeltable/index/base.py +5 -0
  26. pixeltable/index/btree.py +5 -0
  27. pixeltable/index/embedding_index.py +5 -0
  28. pixeltable/io/external_store.py +8 -29
  29. pixeltable/io/label_studio.py +1 -1
  30. pixeltable/io/parquet.py +2 -2
  31. pixeltable/io/table_data_conduit.py +0 -31
  32. pixeltable/metadata/__init__.py +1 -1
  33. pixeltable/metadata/converters/convert_13.py +2 -2
  34. pixeltable/metadata/converters/convert_30.py +6 -11
  35. pixeltable/metadata/converters/convert_35.py +9 -0
  36. pixeltable/metadata/converters/util.py +3 -9
  37. pixeltable/metadata/notes.py +1 -0
  38. pixeltable/metadata/schema.py +5 -1
  39. pixeltable/plan.py +4 -4
  40. pixeltable/share/packager.py +24 -9
  41. pixeltable/share/publish.py +2 -2
  42. pixeltable/store.py +19 -13
  43. pixeltable/utils/dbms.py +1 -1
  44. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc1.dist-info}/METADATA +1 -1
  45. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc1.dist-info}/RECORD +48 -47
  46. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc1.dist-info}/LICENSE +0 -0
  47. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc1.dist-info}/WHEEL +0 -0
  48. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc1.dist-info}/entry_points.txt +0 -0
@@ -3,8 +3,10 @@ from __future__ import annotations
3
3
  import dataclasses
4
4
  import functools
5
5
  import logging
6
+ import random
6
7
  import time
7
- from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar
8
+ from contextlib import contextmanager
9
+ from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
8
10
  from uuid import UUID
9
11
 
10
12
  import psycopg
@@ -56,40 +58,60 @@ def _unpack_row(
56
58
  return result
57
59
 
58
60
 
59
- _MAX_RETRIES = 3
61
+ # for now, we don't limit the number of retries, because we haven't seen situations where the actual number of retries
62
+ # grows uncontrollably
63
+ _MAX_RETRIES = 0
64
+
60
65
  T = TypeVar('T')
61
66
 
62
67
 
63
- def _retry_loop(op: Callable[..., T]) -> Callable[..., T]:
64
- @functools.wraps(op)
65
- def loop(*args: Any, **kwargs: Any) -> T:
66
- num_remaining_retries = _MAX_RETRIES
67
- while True:
68
- try:
69
- # in order for retry to work, we need to make sure that there aren't any prior db updates
70
- # that are part of an ongoing transaction
71
- assert not Env.get().in_xact()
72
- with Env.get().begin_xact():
73
- return op(*args, **kwargs)
74
- except sql.exc.DBAPIError as e:
75
- if isinstance(e.orig, psycopg.errors.SerializationFailure):
76
- if num_remaining_retries > 0:
77
- num_remaining_retries -= 1
78
- # print(f'serialization failure:\n{e}')
79
- # print('retrying ************************************************************')
80
- time.sleep(1)
68
+ def _retry_loop(*, for_write: bool) -> Callable[[Callable[..., T]], Callable[..., T]]:
69
+ def decorator(op: Callable[..., T]) -> Callable[..., T]:
70
+ @functools.wraps(op)
71
+ def loop(*args: Any, **kwargs: Any) -> T:
72
+ num_remaining_retries = _MAX_RETRIES
73
+ while True:
74
+ try:
75
+ # in order for retry to work, we need to make sure that there aren't any prior db updates
76
+ # that are part of an ongoing transaction
77
+ assert not Env.get().in_xact
78
+ with Catalog.get().begin_xact(for_write=for_write):
79
+ return op(*args, **kwargs)
80
+ except sql.exc.DBAPIError as e:
81
+ # TODO: what other exceptions should we be looking for?
82
+ if isinstance(e.orig, psycopg.errors.SerializationFailure):
83
+ if num_remaining_retries > 0:
84
+ num_remaining_retries -= 1
85
+ _logger.debug(f'Serialization failure, retrying ({num_remaining_retries} retries left)')
86
+ time.sleep(random.uniform(0.1, 0.5))
87
+ else:
88
+ raise excs.Error(f'Serialization retry limit ({_MAX_RETRIES}) exceeded') from e
81
89
  else:
82
- raise excs.Error(f'Serialization retry limit ({_MAX_RETRIES}) exceeded') from e
83
- else:
84
- raise
90
+ raise
91
+
92
+ return loop
85
93
 
86
- return loop
94
+ return decorator
87
95
 
88
96
 
89
97
  class Catalog:
90
98
  """The functional interface to getting access to catalog objects
91
99
 
92
- All interface functions must be called in the context of a transaction, started with Env.begin().
100
+ All interface functions must be called in the context of a transaction, started with Catalog.begin_xact().
101
+
102
+ Caching and invalidation of metadata:
103
+ - Catalog caches TableVersion instances in order to avoid excessive metadata loading
104
+ - for any specific table version (ie, combination of id and effective version) there can be only a single
105
+ TableVersion instance in circulation; the reason is that each TV instance has its own store_tbl.sa_tbl, and
106
+ mixing multiple instances of sqlalchemy Table objects in the same query (for the same underlying table) leads to
107
+ duplicate references to that table in the From clause (ie, incorrect Cartesian products)
108
+ - in order to allow multiple concurrent Python processes to perform updates (data and/or schema) against a shared
109
+ Pixeltable instance, Catalog needs to reload metadata from the store when there are changes
110
+ - concurrent changes are detected by comparing TableVersion.version with the stored current version
111
+ (TableMd.current_version)
112
+ - cached live TableVersion instances (those with effective_version == None) are validated against the stored
113
+ metadata on transaction boundaries; this is recorded in TableVersion.is_validated
114
+ - metadata validation is only needed for live TableVersion instances (snapshot instances are immutable)
93
115
  """
94
116
 
95
117
  _instance: Optional[Catalog] = None
@@ -99,6 +121,8 @@ class Catalog:
99
121
  # - snapshot versions: records the version of the snapshot
100
122
  _tbl_versions: dict[tuple[UUID, Optional[int]], TableVersion]
101
123
  _tbls: dict[UUID, Table]
124
+ _in_write_xact: bool # True if we're in a write transaction
125
+ _x_locked_tbl_id: Optional[UUID] # set if begin_xact() was asked to write-lock a table
102
126
 
103
127
  @classmethod
104
128
  def get(cls) -> Catalog:
@@ -109,22 +133,127 @@ class Catalog:
109
133
  @classmethod
110
134
  def clear(cls) -> None:
111
135
  """Remove the instance. Used for testing."""
136
+ # invalidate all existing instances to force reloading of metadata
137
+ for tbl_version in cls._instance._tbl_versions.values():
138
+ # _logger.debug(
139
+ # f'Invalidating table version {tbl_version.id}:{tbl_version.effective_version} ({id(tbl_version):x})'
140
+ # )
141
+ tbl_version.is_validated = False
112
142
  cls._instance = None
113
143
 
114
144
  def __init__(self) -> None:
115
145
  self._tbl_versions = {}
116
146
  self._tbls = {} # don't use a defaultdict here, it doesn't cooperate with the debugger
147
+ self._in_write_xact = False
148
+ self._x_locked_tbl_id = None
117
149
  self._init_store()
118
150
 
119
- @classmethod
120
- def _lock_dir(cls, parent_id: Optional[UUID], dir_id: Optional[UUID], dir_name: Optional[str]) -> None:
121
- """Update directory record(s) to sequentialize thread access. Lock is released when transaction commits.
151
+ def validate(self) -> None:
152
+ """Validate structural consistency of cached metadata"""
153
+ for (tbl_id, effective_version), tbl_version in self._tbl_versions.items():
154
+ assert tbl_id == tbl_version.id, f'{tbl_id} != {tbl_version.id}'
155
+ assert tbl_version.effective_version == tbl_version.version or tbl_version.effective_version is None, (
156
+ f'{tbl_version.effective_version} != {tbl_version.version} for id {tbl_id}'
157
+ )
158
+ assert effective_version == tbl_version.effective_version, (
159
+ f'{effective_version} != {tbl_version.effective_version} for id {tbl_id}'
160
+ )
161
+ assert len(tbl_version.mutable_views) == 0 or tbl_version.is_mutable, (
162
+ f'snapshot_id={tbl_version.id} mutable_views={tbl_version.mutable_views}'
163
+ )
164
+
165
+ if tbl_version.is_view and tbl_version.is_mutable:
166
+ # make sure this mutable view is recorded in a mutable base
167
+ base = tbl_version.base
168
+ assert base is not None
169
+ if base.effective_version is None:
170
+ assert (base.id, None) in self._tbl_versions
171
+ assert TableVersionHandle.create(tbl_version) in self._tbl_versions[base.id, None].mutable_views
172
+
173
+ if len(tbl_version.mutable_views) > 0:
174
+ # make sure we also loaded mutable view metadata, which is needed to detect column dependencies
175
+ for v in tbl_version.mutable_views:
176
+ assert v.effective_version is None, f'{v.id}:{v.effective_version}'
177
+
178
+ @contextmanager
179
+ def begin_xact(self, *, tbl_id: Optional[UUID] = None, for_write: bool = False) -> Iterator[sql.Connection]:
180
+ """
181
+ Return a context manager that yields a connection to the database. Idempotent.
182
+
183
+ It is mandatory to call this method, not Env.begin_xact(), if the transaction accesses any table data
184
+ or metadata.
185
+
186
+ Lock acquisition:
187
+ - x-locks Table records by updating Table.lock_dummy
188
+ - this needs to be done in a retry loop, because Postgres can decide to abort the transaction
189
+ (SerializationFailure, LockNotAvailable)
190
+ - for that reason, we do all lock acquisition prior to doing any real work (eg, compute column values),
191
+ to minimize (maybe avoid altogether) losing that work
192
+ """
193
+ if Env.get().in_xact:
194
+ if tbl_id is not None and for_write:
195
+ # make sure that we requested the required table lock at the beginning of the transaction
196
+ assert tbl_id == self._x_locked_tbl_id, f'{tbl_id} != {self._x_locked_tbl_id}'
197
+ yield Env.get().conn
198
+ return
199
+
200
+ # tv_msg = '\n'.join(
201
+ # [
202
+ # f'{tv.id}:{tv.effective_version} : tv={id(tv):x} sa_tbl={id(tv.store_tbl.sa_tbl):x}'
203
+ # for tv in self._tbl_versions.values()
204
+ # ]
205
+ # )
206
+ # _logger.debug(f'begin_xact(): {tv_msg}')
207
+ num_retries = 0
208
+ while True:
209
+ try:
210
+ with Env.get().begin_xact() as conn:
211
+ if tbl_id is not None and for_write:
212
+ # X-lock Table record
213
+ conn.execute(
214
+ sql.select(schema.Table).where(schema.Table.id == tbl_id).with_for_update(nowait=True)
215
+ )
216
+ conn.execute(sql.update(schema.Table).values(lock_dummy=1).where(schema.Table.id == tbl_id))
217
+ self._x_locked_tbl_id = tbl_id
218
+
219
+ self._in_write_xact = for_write
220
+ yield conn
221
+ return
222
+ except sql.exc.DBAPIError as e:
223
+ if isinstance(e.orig, (psycopg.errors.SerializationFailure, psycopg.errors.LockNotAvailable)) and (
224
+ num_retries < _MAX_RETRIES or _MAX_RETRIES == 0
225
+ ):
226
+ num_retries += 1
227
+ _logger.debug(f'Retrying ({num_retries}) after {type(e.orig)}')
228
+ time.sleep(random.uniform(0.1, 0.5))
229
+ else:
230
+ raise
231
+ finally:
232
+ self._in_write_xact = False
233
+ self._x_locked_tbl_id = None
234
+
235
+ # invalidate cached current TableVersion instances
236
+ for tv in self._tbl_versions.values():
237
+ if tv.effective_version is None:
238
+ _logger.debug(f'invalidating table version {tv.id}:None (tv={id(tv):x})')
239
+ tv.is_validated = False
240
+
241
+ if _logger.isEnabledFor(logging.DEBUG):
242
+ self.validate()
243
+
244
+ @property
245
+ def in_write_xact(self) -> bool:
246
+ return self._in_write_xact
247
+
248
+ def _acquire_dir_xlock(self, parent_id: Optional[UUID], dir_id: Optional[UUID], dir_name: Optional[str]) -> None:
249
+ """Force acquisition of an X-lock on a Dir record via a blind update.
250
+
122
251
  If dir_id is present, then all other conditions are ignored.
123
252
  Note that (parent_id==None) is a valid where condition.
124
253
  If dir_id is not specified, the user from the environment is added to the directory filters.
125
254
  """
126
255
  user = Env.get().user
127
- conn = Env.get().conn
256
+ assert self._in_write_xact
128
257
  q = sql.update(schema.Dir).values(lock_dummy=1)
129
258
  if dir_id is not None:
130
259
  q = q.where(schema.Dir.id == dir_id)
@@ -134,7 +263,7 @@ class Catalog:
134
263
  q = q.where(schema.Dir.md['name'].astext == dir_name)
135
264
  if user is not None:
136
265
  q = q.where(schema.Dir.md['user'].astext == user)
137
- conn.execute(q)
266
+ Env.get().conn.execute(q)
138
267
 
139
268
  def get_dir_path(self, dir_id: UUID) -> Path:
140
269
  """Return path for directory with given id"""
@@ -156,7 +285,7 @@ class Catalog:
156
285
  dir_entries: dict[str, Catalog.DirEntry]
157
286
  table: Optional[schema.Table]
158
287
 
159
- @_retry_loop
288
+ @_retry_loop(for_write=False)
160
289
  def get_dir_contents(self, dir_path: Path, recursive: bool = False) -> dict[str, DirEntry]:
161
290
  dir = self._get_schema_object(dir_path, expected=Dir, raise_if_not_exists=True)
162
291
  return self._get_dir_contents(dir._id, recursive=recursive)
@@ -183,7 +312,7 @@ class Catalog:
183
312
 
184
313
  return result
185
314
 
186
- @_retry_loop
315
+ @_retry_loop(for_write=True)
187
316
  def move(self, path: Path, new_path: Path) -> None:
188
317
  self._move(path, new_path)
189
318
 
@@ -272,7 +401,7 @@ class Catalog:
272
401
 
273
402
  # check for subdirectory
274
403
  if for_update:
275
- self._lock_dir(dir_id, None, name)
404
+ self._acquire_dir_xlock(dir_id, None, name)
276
405
  q = sql.select(schema.Dir).where(
277
406
  schema.Dir.parent_id == dir_id, schema.Dir.md['name'].astext == name, schema.Dir.md['user'].astext == user
278
407
  )
@@ -296,7 +425,7 @@ class Catalog:
296
425
  tbl_id = conn.execute(q).scalar_one_or_none()
297
426
  if tbl_id is not None:
298
427
  if tbl_id not in self._tbls:
299
- self._tbls[tbl_id] = self._load_tbl(tbl_id)
428
+ _ = self._load_tbl(tbl_id)
300
429
  return self._tbls[tbl_id]
301
430
 
302
431
  return None
@@ -349,10 +478,15 @@ class Catalog:
349
478
  tbl = self._load_tbl(tbl_id)
350
479
  if tbl is None:
351
480
  return None
352
- self._tbls[tbl_id] = tbl
481
+ # if this is a mutable table, we also need to have its mutable views loaded, in order to track column
482
+ # dependencies
483
+ tbl_version = tbl._tbl_version.get()
484
+ if tbl_version.is_mutable:
485
+ for v in tbl_version.mutable_views:
486
+ _ = self.get_table_by_id(v.id)
353
487
  return self._tbls[tbl_id]
354
488
 
355
- @_retry_loop
489
+ @_retry_loop(for_write=True)
356
490
  def create_table(
357
491
  self,
358
492
  path: Path,
@@ -385,7 +519,7 @@ class Catalog:
385
519
  self._tbls[tbl._id] = tbl
386
520
  return tbl
387
521
 
388
- @_retry_loop
522
+ @_retry_loop(for_write=True)
389
523
  def create_view(
390
524
  self,
391
525
  path: Path,
@@ -431,14 +565,17 @@ class Catalog:
431
565
  self._tbls[view._id] = view
432
566
  return view
433
567
 
434
- @_retry_loop
568
+ @_retry_loop(for_write=True)
435
569
  def create_replica(
436
570
  self, path: Path, md: list[schema.FullTableMd], if_exists: IfExistsParam = IfExistsParam.ERROR
437
- ) -> Table:
571
+ ) -> None:
438
572
  """
439
573
  Creates table, table_version, and table_schema_version records for a replica with the given metadata.
440
574
  The metadata should be presented in standard "ancestor order", with the table being replicated at
441
575
  list position 0 and the (root) base table at list position -1.
576
+
577
+ TODO: create_replica() also needs to create the store tables and populate them in order to make
578
+ replica creation atomic.
442
579
  """
443
580
  tbl_id = UUID(md[0].tbl_md.tbl_id)
444
581
 
@@ -451,20 +588,19 @@ class Catalog:
451
588
  'but a different table already exists at that location.'
452
589
  )
453
590
  assert isinstance(existing, View)
454
- return existing
591
+ return
455
592
 
456
593
  # Ensure that the system directory exists.
457
594
  self._create_dir(Path('_system', allow_system_paths=True), if_exists=IfExistsParam.IGNORE, parents=False)
458
595
 
459
596
  # Now check to see if this table already exists in the catalog.
460
- # TODO: Handle concurrency in create_replica()
461
597
  existing = Catalog.get().get_table_by_id(tbl_id)
462
598
  if existing is not None:
463
- existing_path = Path(existing._path, allow_system_paths=True)
599
+ existing_path = Path(existing._path(), allow_system_paths=True)
464
600
  # It does exist. If it's a non-system table, that's an error: it's already been replicated.
465
601
  if not existing_path.is_system_path:
466
602
  raise excs.Error(
467
- f'That table has already been replicated as {existing._path!r}. \n'
603
+ f'That table has already been replicated as {existing._path()!r}. \n'
468
604
  f'Drop the existing replica if you wish to re-create it.'
469
605
  )
470
606
  # If it's a system table, then this means it was created at some point as the ancestor of some other
@@ -489,22 +625,20 @@ class Catalog:
489
625
  # The table already exists in the catalog. The existing path might be a system path (if the table
490
626
  # was created as an anonymous base table of some other table), or it might not (if it's a snapshot
491
627
  # that was directly replicated by the user at some point). In either case, use the existing path.
492
- replica_path = Path(replica._path, allow_system_paths=True)
628
+ replica_path = Path(replica._path(), allow_system_paths=True)
493
629
 
494
630
  # Store the metadata; it could be a new version (in which case a new record will be created) or a
495
631
  # known version (in which case the newly received metadata will be validated as identical).
496
632
  self.__store_replica_md(replica_path, ancestor_md)
497
633
 
498
- # Update the catalog (as a final step, after all DB operations completed successfully).
499
- # Only the table being replicated is actually made visible in the catalog.
500
- self._tbls[tbl_id] = self._load_tbl(tbl_id)
501
- return self._tbls[tbl_id]
634
+ # don't create TableVersion instances at this point, they would be superseded by calls to TV.create_replica()
635
+ # in TableRestorer.restore()
502
636
 
503
637
  def __store_replica_md(self, path: Path, md: schema.FullTableMd) -> None:
504
638
  _logger.info(f'Creating replica table at {path!r} with ID: {md.tbl_md.tbl_id}')
505
- # TODO: Handle concurrency
506
639
  dir = self._get_schema_object(path.parent, expected=Dir, raise_if_not_exists=True)
507
640
  assert dir is not None
641
+ assert self._in_write_xact
508
642
 
509
643
  conn = Env.get().conn
510
644
  tbl_id = md.tbl_md.tbl_id
@@ -582,14 +716,24 @@ class Catalog:
582
716
 
583
717
  self.store_tbl_md(UUID(tbl_id), new_tbl_md, new_version_md, new_schema_version_md)
584
718
 
585
- @_retry_loop
719
+ @_retry_loop(for_write=False)
586
720
  def get_table(self, path: Path) -> Table:
721
+ obj = self._get_table(path)
722
+ return obj
723
+
724
+ def _get_table(self, path: Path) -> Table:
587
725
  obj = Catalog.get()._get_schema_object(path, expected=Table, raise_if_not_exists=True)
588
726
  assert isinstance(obj, Table)
589
- obj._tbl_version.get().ensure_md_loaded()
727
+ tbl_version = obj._tbl_version.get()
728
+ # TODO: instead of calling this here, move the logic into TableVersion.init(), which is called after
729
+ # registering the instance in _tbl_versions
730
+ tbl_version.ensure_md_loaded()
731
+ # if this table has mutable views, we need to load those as well, in order to record column dependencies
732
+ for v in tbl_version.mutable_views:
733
+ self.get_table_by_id(v.id)
590
734
  return obj
591
735
 
592
- @_retry_loop
736
+ @_retry_loop(for_write=True)
593
737
  def drop_table(self, path: Path, if_not_exists: IfNotExistsParam, force: bool) -> None:
594
738
  _, _, src_obj = self._prepare_dir_op(
595
739
  drop_dir_path=path.parent,
@@ -621,11 +765,11 @@ class Catalog:
621
765
  msg: str
622
766
  if is_replace:
623
767
  msg = (
624
- f'{obj_type_str} {tbl._path} already exists and has dependents. '
768
+ f'{obj_type_str} {tbl._path()} already exists and has dependents. '
625
769
  "Use `if_exists='replace_force'` to replace it."
626
770
  )
627
771
  else:
628
- msg = f'{obj_type_str} {tbl._path} has dependents.'
772
+ msg = f'{obj_type_str} {tbl._path()} has dependents.'
629
773
  raise excs.Error(msg)
630
774
 
631
775
  for view_id in view_ids:
@@ -636,9 +780,9 @@ class Catalog:
636
780
  tbl._drop()
637
781
  assert tbl._id in self._tbls
638
782
  del self._tbls[tbl._id]
639
- _logger.info(f'Dropped table `{tbl._path}`.')
783
+ _logger.info(f'Dropped table `{tbl._path()}`.')
640
784
 
641
- @_retry_loop
785
+ @_retry_loop(for_write=True)
642
786
  def create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:
643
787
  return self._create_dir(path, if_exists, parents)
644
788
 
@@ -673,7 +817,7 @@ class Catalog:
673
817
  Env.get().console_logger.info(f'Created directory {str(path)!r}.')
674
818
  return dir
675
819
 
676
- @_retry_loop
820
+ @_retry_loop(for_write=True)
677
821
  def drop_dir(self, path: Path, if_not_exists: IfNotExistsParam, force: bool) -> None:
678
822
  _, _, schema_obj = self._prepare_dir_op(
679
823
  drop_dir_path=path.parent,
@@ -698,7 +842,7 @@ class Catalog:
698
842
  raise excs.Error(f'Directory {str(dir_path)!r} is not empty.')
699
843
 
700
844
  # drop existing subdirs
701
- self._lock_dir(dir_id, None, None)
845
+ self._acquire_dir_xlock(dir_id, None, None)
702
846
  dir_q = sql.select(schema.Dir).where(schema.Dir.parent_id == dir_id)
703
847
  for row in conn.execute(dir_q).all():
704
848
  self._drop_dir(row.id, dir_path.append(row.md['name']), force=True)
@@ -725,17 +869,37 @@ class Catalog:
725
869
  return result
726
870
 
727
871
  def get_tbl_version(self, tbl_id: UUID, effective_version: Optional[int]) -> Optional[TableVersion]:
728
- if (tbl_id, effective_version) not in self._tbl_versions:
729
- self._tbl_versions[tbl_id, effective_version] = self._load_tbl_version(tbl_id, effective_version)
730
- return self._tbl_versions[tbl_id, effective_version]
731
-
732
- def add_tbl_version(self, tbl_version: TableVersion) -> None:
733
- """Explicitly add a TableVersion"""
734
- self._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
735
- # if this is a mutable view, also record it in the base
736
- if tbl_version.is_view and tbl_version.effective_version is None:
737
- base = tbl_version.base.get()
738
- base.mutable_views.append(TableVersionHandle(tbl_version.id, tbl_version.effective_version))
872
+ # we need a transaction here, if we're not already in one; if this starts a new transaction,
873
+ # the returned TableVersion instance will not be validated
874
+ with self.begin_xact(tbl_id=tbl_id, for_write=False) as conn:
875
+ tv = self._tbl_versions.get((tbl_id, effective_version))
876
+ if tv is None:
877
+ tv = self._load_tbl_version(tbl_id, effective_version)
878
+ elif not tv.is_validated:
879
+ # only live instances are invalidated
880
+ assert effective_version is None
881
+ # we validate live instances by comparing our cached version number to the stored current version
882
+ # _logger.debug(f'validating metadata for table {tbl_id}:{tv.version} ({id(tv):x})')
883
+ q = sql.select(schema.Table.md).where(schema.Table.id == tbl_id)
884
+ row = conn.execute(q).one()
885
+ current_version = row.md['current_version']
886
+
887
+ # the stored version can be behind TableVersion.version, because we don't roll back the in-memory
888
+ # metadata changes after a failed update operation
889
+ if current_version != tv.version:
890
+ # the cached metadata is invalid
891
+ _logger.debug(
892
+ f'reloading metadata for table {tbl_id} '
893
+ f'(cached version: {tv.version}, current version: {current_version}'
894
+ # f', id: {id(tv):x})'
895
+ )
896
+ tv = self._load_tbl_version(tbl_id, None)
897
+ else:
898
+ # the cached metadata is valid
899
+ tv.is_validated = True
900
+
901
+ assert tv.is_validated
902
+ return tv
739
903
 
740
904
  def remove_tbl_version(self, tbl_version: TableVersion) -> None:
741
905
  assert (tbl_version.id, tbl_version.effective_version) in self._tbl_versions
@@ -745,7 +909,7 @@ class Catalog:
745
909
  """Return the Dir with the given id, or None if it doesn't exist"""
746
910
  conn = Env.get().conn
747
911
  if for_update:
748
- self._lock_dir(None, dir_id, None)
912
+ self._acquire_dir_xlock(None, dir_id, None)
749
913
  q = sql.select(schema.Dir).where(schema.Dir.id == dir_id)
750
914
  row = conn.execute(q).one_or_none()
751
915
  if row is None:
@@ -761,7 +925,7 @@ class Catalog:
761
925
  conn = Env.get().conn
762
926
  if path.is_root:
763
927
  if for_update:
764
- self._lock_dir(parent_id=None, dir_id=None, dir_name='')
928
+ self._acquire_dir_xlock(parent_id=None, dir_id=None, dir_name='')
765
929
  q = sql.select(schema.Dir).where(schema.Dir.parent_id.is_(None), schema.Dir.md['user'].astext == user)
766
930
  row = conn.execute(q).one_or_none()
767
931
  return schema.Dir(**row._mapping) if row is not None else None
@@ -770,7 +934,7 @@ class Catalog:
770
934
  if parent_dir is None:
771
935
  return None
772
936
  if for_update:
773
- self._lock_dir(parent_id=parent_dir.id, dir_id=None, dir_name=path.name)
937
+ self._acquire_dir_xlock(parent_id=parent_dir.id, dir_id=None, dir_name=path.name)
774
938
  q = sql.select(schema.Dir).where(
775
939
  schema.Dir.parent_id == parent_dir.id,
776
940
  schema.Dir.md['name'].astext == path.name,
@@ -780,6 +944,7 @@ class Catalog:
780
944
  return schema.Dir(**row._mapping) if row is not None else None
781
945
 
782
946
  def _load_tbl(self, tbl_id: UUID) -> Optional[Table]:
947
+ """Loads metadata for the table with the given id and caches it."""
783
948
  _logger.info(f'Loading table {tbl_id}')
784
949
  from .insertable_table import InsertableTable
785
950
  from .view import View
@@ -808,8 +973,9 @@ class Catalog:
808
973
  if view_md is None:
809
974
  # this is a base table
810
975
  if (tbl_id, None) not in self._tbl_versions:
811
- self._tbl_versions[tbl_id, None] = self._load_tbl_version(tbl_id, None)
976
+ _ = self._load_tbl_version(tbl_id, None)
812
977
  tbl = InsertableTable(tbl_record.dir_id, TableVersionHandle(tbl_id, None))
978
+ self._tbls[tbl_id] = tbl
813
979
  return tbl
814
980
 
815
981
  # this is a view; determine the sequence of TableVersions to load
@@ -829,18 +995,18 @@ class Catalog:
829
995
  view_path: Optional[TableVersionPath] = None
830
996
  for id, effective_version in tbl_version_path[::-1]:
831
997
  if (id, effective_version) not in self._tbl_versions:
832
- self._tbl_versions[id, effective_version] = self._load_tbl_version(id, effective_version)
998
+ _ = self._load_tbl_version(id, effective_version)
833
999
  view_path = TableVersionPath(TableVersionHandle(id, effective_version), base=base_path)
834
1000
  base_path = view_path
835
1001
  view = View(tbl_id, tbl_record.dir_id, tbl_md.name, view_path, snapshot_only=pure_snapshot)
836
- # TODO: also load mutable views
1002
+ self._tbls[tbl_id] = view
837
1003
  return view
838
1004
 
839
1005
  def load_tbl_md(self, tbl_id: UUID, effective_version: Optional[int]) -> schema.FullTableMd:
840
1006
  """
841
1007
  Loads metadata from the store for a given table UUID and version.
842
1008
  """
843
- _logger.info(f'Loading metadata for table version: {tbl_id}:{effective_version}')
1009
+ # _logger.info(f'Loading metadata for table version: {tbl_id}:{effective_version}')
844
1010
  conn = Env.get().conn
845
1011
 
846
1012
  q = (
@@ -915,8 +1081,15 @@ class Catalog:
915
1081
  If inserting `version_md` or `schema_version_md` would be a primary key violation, an exception will be raised.
916
1082
  """
917
1083
  conn = Env.get().conn
1084
+ assert self._in_write_xact
918
1085
 
919
1086
  if tbl_md is not None:
1087
+ assert tbl_md.tbl_id == str(tbl_id)
1088
+ if version_md is not None:
1089
+ assert tbl_md.current_version == version_md.version
1090
+ assert tbl_md.current_schema_version == version_md.schema_version
1091
+ if schema_version_md is not None:
1092
+ assert tbl_md.current_schema_version == schema_version_md.schema_version
920
1093
  result = conn.execute(
921
1094
  sql.update(schema.Table.__table__)
922
1095
  .values({schema.Table.md: dataclasses.asdict(tbl_md)})
@@ -925,6 +1098,9 @@ class Catalog:
925
1098
  assert result.rowcount == 1, result.rowcount
926
1099
 
927
1100
  if version_md is not None:
1101
+ assert version_md.tbl_id == str(tbl_id)
1102
+ if schema_version_md is not None:
1103
+ assert version_md.schema_version == schema_version_md.schema_version
928
1104
  conn.execute(
929
1105
  sql.insert(schema.TableVersion.__table__).values(
930
1106
  tbl_id=tbl_id, version=version_md.version, md=dataclasses.asdict(version_md)
@@ -932,6 +1108,7 @@ class Catalog:
932
1108
  )
933
1109
 
934
1110
  if schema_version_md is not None:
1111
+ assert schema_version_md.tbl_id == str(tbl_id)
935
1112
  conn.execute(
936
1113
  sql.insert(schema.TableSchemaVersion.__table__).values(
937
1114
  tbl_id=tbl_id,
@@ -978,50 +1155,60 @@ class Catalog:
978
1155
  return md
979
1156
 
980
1157
  def _load_tbl_version(self, tbl_id: UUID, effective_version: Optional[int]) -> Optional[TableVersion]:
1158
+ """Creates TableVersion instance from stored metadata and registers it in _tbl_versions."""
981
1159
  tbl_md, _, schema_version_md = self.load_tbl_md(tbl_id, effective_version)
982
1160
  view_md = tbl_md.view_md
983
1161
 
984
- _logger.info(f'Loading table version: {tbl_id}:{effective_version}')
985
1162
  conn = Env.get().conn
986
1163
 
987
- # load mutable view ids
988
- q = sql.select(schema.Table.id).where(
989
- sql.text(
990
- f"md->'view_md'->'base_versions'->0->>0 = {tbl_id.hex!r} "
991
- "AND md->'view_md'->'base_versions'->0->1 IS NULL"
1164
+ # load mutable view ids for mutable TableVersions
1165
+ mutable_view_ids: list[UUID] = []
1166
+ # If this is a replica, effective_version should not be None. We see this today, because
1167
+ # the replica's TV instance's Column instances contain value_expr_dicts that reference the live version.
1168
+ # This is presumably a source of bugs, because it ignores schema version changes (eg, column renames).
1169
+ # TODO: retarget the value_expr_dict when instantiating Columns for a particular TV instance.
1170
+ if effective_version is None and not tbl_md.is_replica:
1171
+ q = sql.select(schema.Table.id).where(
1172
+ sql.text(
1173
+ f"md->'view_md'->'base_versions'->0->>0 = {tbl_id.hex!r} "
1174
+ "AND md->'view_md'->'base_versions'->0->>1 IS NULL"
1175
+ )
992
1176
  )
993
- )
994
- mutable_view_ids = [r[0] for r in conn.execute(q).all()]
1177
+ mutable_view_ids = [r[0] for r in conn.execute(q).all()]
995
1178
  mutable_views = [TableVersionHandle(id, None) for id in mutable_view_ids]
996
1179
 
1180
+ tbl_version: TableVersion
997
1181
  if view_md is None:
998
1182
  # this is a base table
999
1183
  tbl_version = TableVersion(
1000
1184
  tbl_id, tbl_md, effective_version, schema_version_md, mutable_views=mutable_views
1001
1185
  )
1002
- return tbl_version
1186
+ else:
1187
+ assert len(view_md.base_versions) > 0 # a view needs to have a base
1188
+ pure_snapshot = view_md.is_snapshot and view_md.predicate is None and len(schema_version_md.columns) == 0
1189
+ assert not pure_snapshot # a pure snapshot doesn't have a physical table backing it, no point in loading it
1190
+
1191
+ base: TableVersionHandle
1192
+ base_path: Optional[TableVersionPath] = None # needed for live view
1193
+ if view_md.is_snapshot:
1194
+ base = TableVersionHandle(UUID(view_md.base_versions[0][0]), view_md.base_versions[0][1])
1195
+ else:
1196
+ base_path = TableVersionPath.from_md(tbl_md.view_md.base_versions)
1197
+ base = base_path.tbl_version
1003
1198
 
1004
- assert len(view_md.base_versions) > 0 # a view needs to have a base
1005
- pure_snapshot = view_md.is_snapshot and view_md.predicate is None and len(schema_version_md.columns) == 0
1006
- assert not pure_snapshot # a pure snapshot doesn't have a physical table backing it, no point in loading it
1199
+ tbl_version = TableVersion(
1200
+ tbl_id,
1201
+ tbl_md,
1202
+ effective_version,
1203
+ schema_version_md,
1204
+ base_path=base_path,
1205
+ base=base,
1206
+ mutable_views=mutable_views,
1207
+ )
1208
+
1209
+ self._tbl_versions[tbl_id, effective_version] = tbl_version
1210
+ tbl_version.init()
1007
1211
 
1008
- base: TableVersionHandle
1009
- base_path: Optional[TableVersionPath] = None # needed for live view
1010
- if view_md.is_snapshot:
1011
- base = TableVersionHandle(UUID(view_md.base_versions[0][0]), view_md.base_versions[0][1])
1012
- else:
1013
- base_path = TableVersionPath.from_md(tbl_md.view_md.base_versions)
1014
- base = base_path.tbl_version
1015
-
1016
- tbl_version = TableVersion(
1017
- tbl_id,
1018
- tbl_md,
1019
- effective_version,
1020
- schema_version_md,
1021
- base_path=base_path,
1022
- base=base,
1023
- mutable_views=mutable_views,
1024
- )
1025
1212
  return tbl_version
1026
1213
 
1027
1214
  def _init_store(self) -> None: