pixeltable 0.3.15__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (58)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/catalog.py +296 -105
  3. pixeltable/catalog/column.py +10 -8
  4. pixeltable/catalog/dir.py +1 -2
  5. pixeltable/catalog/insertable_table.py +25 -20
  6. pixeltable/catalog/schema_object.py +3 -6
  7. pixeltable/catalog/table.py +261 -189
  8. pixeltable/catalog/table_version.py +333 -202
  9. pixeltable/catalog/table_version_handle.py +15 -2
  10. pixeltable/catalog/table_version_path.py +60 -14
  11. pixeltable/catalog/view.py +38 -6
  12. pixeltable/dataframe.py +196 -18
  13. pixeltable/env.py +4 -4
  14. pixeltable/exec/__init__.py +1 -1
  15. pixeltable/exec/expr_eval/evaluators.py +4 -1
  16. pixeltable/exec/in_memory_data_node.py +1 -1
  17. pixeltable/exec/sql_node.py +171 -22
  18. pixeltable/exprs/column_property_ref.py +15 -6
  19. pixeltable/exprs/column_ref.py +32 -11
  20. pixeltable/exprs/comparison.py +1 -1
  21. pixeltable/exprs/data_row.py +5 -3
  22. pixeltable/exprs/expr.py +7 -0
  23. pixeltable/exprs/literal.py +2 -0
  24. pixeltable/exprs/row_builder.py +4 -6
  25. pixeltable/exprs/rowid_ref.py +8 -0
  26. pixeltable/exprs/similarity_expr.py +1 -0
  27. pixeltable/func/query_template_function.py +1 -1
  28. pixeltable/func/tools.py +1 -1
  29. pixeltable/functions/gemini.py +0 -1
  30. pixeltable/functions/string.py +212 -58
  31. pixeltable/globals.py +12 -4
  32. pixeltable/index/base.py +5 -0
  33. pixeltable/index/btree.py +5 -0
  34. pixeltable/index/embedding_index.py +5 -0
  35. pixeltable/io/external_store.py +8 -29
  36. pixeltable/io/label_studio.py +1 -1
  37. pixeltable/io/parquet.py +2 -2
  38. pixeltable/io/table_data_conduit.py +0 -31
  39. pixeltable/metadata/__init__.py +11 -2
  40. pixeltable/metadata/converters/convert_13.py +2 -2
  41. pixeltable/metadata/converters/convert_30.py +6 -11
  42. pixeltable/metadata/converters/convert_35.py +9 -0
  43. pixeltable/metadata/converters/convert_36.py +38 -0
  44. pixeltable/metadata/converters/util.py +3 -9
  45. pixeltable/metadata/notes.py +2 -0
  46. pixeltable/metadata/schema.py +8 -1
  47. pixeltable/plan.py +221 -14
  48. pixeltable/share/packager.py +137 -13
  49. pixeltable/share/publish.py +2 -2
  50. pixeltable/store.py +19 -13
  51. pixeltable/utils/dbms.py +1 -1
  52. pixeltable/utils/formatter.py +64 -42
  53. pixeltable/utils/sample.py +25 -0
  54. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA +2 -1
  55. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/RECORD +58 -55
  56. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/LICENSE +0 -0
  57. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/WHEEL +0 -0
  58. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/entry_points.txt +0 -0
pixeltable/catalog/catalog.py:

@@ -3,8 +3,10 @@ from __future__ import annotations
  import dataclasses
  import functools
  import logging
+ import random
  import time
- from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar
+ from contextlib import contextmanager
+ from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
  from uuid import UUID

  import psycopg
@@ -15,6 +17,8 @@ from pixeltable.env import Env
  from pixeltable.iterators import ComponentIterator
  from pixeltable.metadata import schema

+ if TYPE_CHECKING:
+     from pixeltable.plan import SampleClause
  from .dir import Dir
  from .globals import IfExistsParam, IfNotExistsParam, MediaValidation
  from .insertable_table import InsertableTable
@@ -56,40 +60,60 @@ def _unpack_row(
      return result


- _MAX_RETRIES = 3
+ # for now, we don't limit the number of retries, because we haven't seen situations where the actual number of retries
+ # grows uncontrollably
+ _MAX_RETRIES = 0
+
  T = TypeVar('T')


- def _retry_loop(op: Callable[..., T]) -> Callable[..., T]:
-     @functools.wraps(op)
-     def loop(*args: Any, **kwargs: Any) -> T:
-         num_remaining_retries = _MAX_RETRIES
-         while True:
-             try:
-                 # in order for retry to work, we need to make sure that there aren't any prior db updates
-                 # that are part of an ongoing transaction
-                 assert not Env.get().in_xact()
-                 with Env.get().begin_xact():
-                     return op(*args, **kwargs)
-             except sql.exc.DBAPIError as e:
-                 if isinstance(e.orig, psycopg.errors.SerializationFailure):
-                     if num_remaining_retries > 0:
-                         num_remaining_retries -= 1
-                         # print(f'serialization failure:\n{e}')
-                         # print('retrying ************************************************************')
-                         time.sleep(1)
+ def _retry_loop(*, for_write: bool) -> Callable[[Callable[..., T]], Callable[..., T]]:
+     def decorator(op: Callable[..., T]) -> Callable[..., T]:
+         @functools.wraps(op)
+         def loop(*args: Any, **kwargs: Any) -> T:
+             num_remaining_retries = _MAX_RETRIES
+             while True:
+                 try:
+                     # in order for retry to work, we need to make sure that there aren't any prior db updates
+                     # that are part of an ongoing transaction
+                     assert not Env.get().in_xact
+                     with Catalog.get().begin_xact(for_write=for_write):
+                         return op(*args, **kwargs)
+                 except sql.exc.DBAPIError as e:
+                     # TODO: what other exceptions should we be looking for?
+                     if isinstance(e.orig, psycopg.errors.SerializationFailure):
+                         if num_remaining_retries > 0:
+                             num_remaining_retries -= 1
+                             _logger.debug(f'Serialization failure, retrying ({num_remaining_retries} retries left)')
+                             time.sleep(random.uniform(0.1, 0.5))
+                         else:
+                             raise excs.Error(f'Serialization retry limit ({_MAX_RETRIES}) exceeded') from e
                      else:
-                         raise excs.Error(f'Serialization retry limit ({_MAX_RETRIES}) exceeded') from e
-                 else:
-                     raise
+                         raise

-     return loop
+         return loop
+
+     return decorator


  class Catalog:
      """The functional interface to getting access to catalog objects

-     All interface functions must be called in the context of a transaction, started with Env.begin().
+     All interface functions must be called in the context of a transaction, started with Catalog.begin_xact().
+
+     Caching and invalidation of metadata:
+     - Catalog caches TableVersion instances in order to avoid excessive metadata loading
+     - for any specific table version (ie, combination of id and effective version) there can be only a single
+       Tableversion instance in circulation; the reason is that each TV instance has its own store_tbl.sa_tbl, and
+       mixing multiple instances of sqlalchemy Table objects in the same query (for the same underlying table) leads to
+       duplicate references to that table in the From clause (ie, incorrect Cartesian products)
+     - in order to allow multiple concurrent Python processes to perform updates (data and/or schema) against a shared
+       Pixeltable instance, Catalog needs to reload metadata from the store when there are changes
+     - concurrent changes are detected by comparing TableVersion.version with the stored current version
+       (TableMd.current_version)
+     - cached live TableVersion instances (those with effective_version == None) are validated against the stored
+       metadata on transaction boundaries; this is recorded in TableVersion.is_validated
+     - metadata validation is only needed for live TableVersion instances (snapshot instances are immutable)
      """

      _instance: Optional[Catalog] = None
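
Note: `_retry_loop` is now a decorator factory rather than a plain decorator, so call sites pass `for_write=...` and the transaction is opened through `Catalog.begin_xact()`. A minimal standalone sketch of the same decorator-factory pattern, for orientation only (the names `retry_loop` and `do_commit_work`, and the use of `RuntimeError` as the retryable error, are illustrative and not part of the diff):

    import functools
    import random
    import time
    from typing import Any, Callable, TypeVar

    T = TypeVar('T')
    _MAX_RETRIES = 0  # 0 means "retry indefinitely", mirroring the new semantics above


    def retry_loop(*, for_write: bool) -> Callable[[Callable[..., T]], Callable[..., T]]:
        # decorator factory: each call returns a decorator configured for read or write use
        def decorator(op: Callable[..., T]) -> Callable[..., T]:
            @functools.wraps(op)
            def loop(*args: Any, **kwargs: Any) -> T:
                attempts = 0
                while True:
                    try:
                        # a real implementation would open a (read or write) transaction here
                        return op(*args, **kwargs)
                    except RuntimeError:  # stand-in for a serialization failure
                        attempts += 1
                        if _MAX_RETRIES != 0 and attempts > _MAX_RETRIES:
                            raise
                        time.sleep(random.uniform(0.1, 0.5))  # jittered backoff, as in the diff
            return loop
        return decorator


    @retry_loop(for_write=True)
    def do_commit_work() -> None:
        ...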
@@ -99,6 +123,8 @@ class Catalog:
      # - snapshot versions: records the version of the snapshot
      _tbl_versions: dict[tuple[UUID, Optional[int]], TableVersion]
      _tbls: dict[UUID, Table]
+     _in_write_xact: bool  # True if we're in a write transaction
+     _x_locked_tbl_id: Optional[UUID]  # set if begin_xact() was asked to write-lock a table

      @classmethod
      def get(cls) -> Catalog:
@@ -109,22 +135,127 @@ class Catalog:
      @classmethod
      def clear(cls) -> None:
          """Remove the instance. Used for testing."""
+         # invalidate all existing instances to force reloading of metadata
+         for tbl_version in cls._instance._tbl_versions.values():
+             # _logger.debug(
+             #     f'Invalidating table version {tbl_version.id}:{tbl_version.effective_version} ({id(tbl_version):x})'
+             # )
+             tbl_version.is_validated = False
          cls._instance = None

      def __init__(self) -> None:
          self._tbl_versions = {}
          self._tbls = {}  # don't use a defaultdict here, it doesn't cooperate with the debugger
+         self._in_write_xact = False
+         self._x_locked_tbl_id = None
          self._init_store()

-     @classmethod
-     def _lock_dir(cls, parent_id: Optional[UUID], dir_id: Optional[UUID], dir_name: Optional[str]) -> None:
-         """Update directory record(s) to sequentialize thread access. Lock is released when transaction commits.
+     def validate(self) -> None:
+         """Validate structural consistency of cached metadata"""
+         for (tbl_id, effective_version), tbl_version in self._tbl_versions.items():
+             assert tbl_id == tbl_version.id, f'{tbl_id} != {tbl_version.id}'
+             assert tbl_version.effective_version == tbl_version.version or tbl_version.effective_version is None, (
+                 f'{tbl_version.effective_version} != {tbl_version.version} for id {tbl_id}'
+             )
+             assert effective_version == tbl_version.effective_version, (
+                 f'{effective_version} != {tbl_version.effective_version} for id {tbl_id}'
+             )
+             assert len(tbl_version.mutable_views) == 0 or tbl_version.is_mutable, (
+                 f'snapshot_id={tbl_version.id} mutable_views={tbl_version.mutable_views}'
+             )
+
+             if tbl_version.is_view and tbl_version.is_mutable:
+                 # make sure this mutable view is recorded in a mutable base
+                 base = tbl_version.base
+                 assert base is not None
+                 if base.effective_version is None:
+                     assert (base.id, None) in self._tbl_versions
+                     assert TableVersionHandle.create(tbl_version) in self._tbl_versions[base.id, None].mutable_views
+
+             if len(tbl_version.mutable_views) > 0:
+                 # make sure we also loaded mutable view metadata, which is needed to detect column dependencies
+                 for v in tbl_version.mutable_views:
+                     assert v.effective_version is None, f'{v.id}:{v.effective_version}'
+
+     @contextmanager
+     def begin_xact(self, *, tbl_id: Optional[UUID] = None, for_write: bool = False) -> Iterator[sql.Connection]:
+         """
+         Return a context manager that yields a connection to the database. Idempotent.
+
+         It is mandatory to call this method, not Env.begin_xact(), if the transaction accesses any table data
+         or metadata.
+
+         Lock acquisition:
+         - x-locks Table records by updating Table.lock_dummy
+         - this needs to be done in a retry loop, because Postgres can decide to abort the transaction
+           (SerializationFailure, LockNotAvailable)
+         - for that reason, we do all lock acquisition prior to doing any real work (eg, compute column values),
+           to minimize (maybe avoid altogether) loosing that work
+         """
+         if Env.get().in_xact:
+             if tbl_id is not None and for_write:
+                 # make sure that we requested the required table lock at the beginning of the transaction
+                 assert tbl_id == self._x_locked_tbl_id, f'{tbl_id} != {self._x_locked_tbl_id}'
+             yield Env.get().conn
+             return
+
+         # tv_msg = '\n'.join(
+         #     [
+         #         f'{tv.id}:{tv.effective_version} : tv={id(tv):x} sa_tbl={id(tv.store_tbl.sa_tbl):x}'
+         #         for tv in self._tbl_versions.values()
+         #     ]
+         # )
+         # _logger.debug(f'begin_xact(): {tv_msg}')
+         num_retries = 0
+         while True:
+             try:
+                 with Env.get().begin_xact() as conn:
+                     if tbl_id is not None and for_write:
+                         # X-lock Table record
+                         conn.execute(
+                             sql.select(schema.Table).where(schema.Table.id == tbl_id).with_for_update(nowait=True)
+                         )
+                         conn.execute(sql.update(schema.Table).values(lock_dummy=1).where(schema.Table.id == tbl_id))
+                         self._x_locked_tbl_id = tbl_id
+
+                     self._in_write_xact = for_write
+                     yield conn
+                     return
+             except sql.exc.DBAPIError as e:
+                 if isinstance(e.orig, (psycopg.errors.SerializationFailure, psycopg.errors.LockNotAvailable)) and (
+                     num_retries < _MAX_RETRIES or _MAX_RETRIES == 0
+                 ):
+                     num_retries += 1
+                     _logger.debug(f'Retrying ({num_retries}) after {type(e.orig)}')
+                     time.sleep(random.uniform(0.1, 0.5))
+                 else:
+                     raise
+             finally:
+                 self._in_write_xact = False
+                 self._x_locked_tbl_id = None
+
+                 # invalidate cached current TableVersion instances
+                 for tv in self._tbl_versions.values():
+                     if tv.effective_version is None:
+                         _logger.debug(f'invalidating table version {tv.id}:None (tv={id(tv):x})')
+                         tv.is_validated = False
+
+                 if _logger.isEnabledFor(logging.DEBUG):
+                     self.validate()
+
+     @property
+     def in_write_xact(self) -> bool:
+         return self._in_write_xact
+
+     def _acquire_dir_xlock(self, parent_id: Optional[UUID], dir_id: Optional[UUID], dir_name: Optional[str]) -> None:
+         """Force acquisition of an X-lock on a Dir record via a blind update.
+
          If dir_id is present, then all other conditions are ignored.
          Note that (parent_id==None) is a valid where condition.
          If dir_id is not specified, the user from the environment is added to the directory filters.
          """
          user = Env.get().user
-         conn = Env.get().conn
+         assert self._in_write_xact
          q = sql.update(schema.Dir).values(lock_dummy=1)
          if dir_id is not None:
              q = q.where(schema.Dir.id == dir_id)
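
Note: the locking idiom in `begin_xact()` above relies on `SELECT ... FOR UPDATE NOWAIT`, which raises `LockNotAvailable` immediately instead of blocking when another transaction holds the row lock; that is why the jittered retry loop is needed. A hedged, self-contained SQLAlchemy sketch of the same idiom (the engine URL, table, and column names are placeholders, not Pixeltable's actual schema):

    import random
    import time

    import psycopg
    import sqlalchemy as sa

    engine = sa.create_engine('postgresql+psycopg://localhost/somedb')  # placeholder DSN
    tbl = sa.table('tables', sa.column('id'), sa.column('lock_dummy'))  # placeholder table clause


    def xlock_row(row_id) -> None:
        while True:
            try:
                with engine.begin() as conn:
                    # NOWAIT: fail immediately instead of blocking if the row is already locked
                    conn.execute(sa.select(tbl).where(tbl.c.id == row_id).with_for_update(nowait=True))
                    # blind update forces an exclusive row lock that is held until commit
                    conn.execute(sa.update(tbl).values(lock_dummy=1).where(tbl.c.id == row_id))
                    return
            except sa.exc.DBAPIError as e:
                retryable = (psycopg.errors.SerializationFailure, psycopg.errors.LockNotAvailable)
                if isinstance(e.orig, retryable):
                    time.sleep(random.uniform(0.1, 0.5))  # jittered backoff before retrying
                else:
                    raise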
@@ -134,7 +265,7 @@ class Catalog:
              q = q.where(schema.Dir.md['name'].astext == dir_name)
          if user is not None:
              q = q.where(schema.Dir.md['user'].astext == user)
-         conn.execute(q)
+         Env.get().conn.execute(q)

      def get_dir_path(self, dir_id: UUID) -> Path:
          """Return path for directory with given id"""
@@ -156,7 +287,7 @@ class Catalog:
          dir_entries: dict[str, Catalog.DirEntry]
          table: Optional[schema.Table]

-     @_retry_loop
+     @_retry_loop(for_write=False)
      def get_dir_contents(self, dir_path: Path, recursive: bool = False) -> dict[str, DirEntry]:
          dir = self._get_schema_object(dir_path, expected=Dir, raise_if_not_exists=True)
          return self._get_dir_contents(dir._id, recursive=recursive)
@@ -183,7 +314,7 @@ class Catalog:

          return result

-     @_retry_loop
+     @_retry_loop(for_write=True)
      def move(self, path: Path, new_path: Path) -> None:
          self._move(path, new_path)

@@ -272,7 +403,7 @@ class Catalog:

          # check for subdirectory
          if for_update:
-             self._lock_dir(dir_id, None, name)
+             self._acquire_dir_xlock(dir_id, None, name)
          q = sql.select(schema.Dir).where(
              schema.Dir.parent_id == dir_id, schema.Dir.md['name'].astext == name, schema.Dir.md['user'].astext == user
          )
@@ -296,7 +427,7 @@ class Catalog:
          tbl_id = conn.execute(q).scalar_one_or_none()
          if tbl_id is not None:
              if tbl_id not in self._tbls:
-                 self._tbls[tbl_id] = self._load_tbl(tbl_id)
+                 _ = self._load_tbl(tbl_id)
              return self._tbls[tbl_id]

          return None
@@ -349,10 +480,15 @@ class Catalog:
          tbl = self._load_tbl(tbl_id)
          if tbl is None:
              return None
-         self._tbls[tbl_id] = tbl
+         # if this is a mutable table, we also need to have its mutable views loaded, in order to track column
+         # dependencies
+         tbl_version = tbl._tbl_version.get()
+         if tbl_version.is_mutable:
+             for v in tbl_version.mutable_views:
+                 _ = self.get_table_by_id(v.id)
          return self._tbls[tbl_id]

-     @_retry_loop
+     @_retry_loop(for_write=True)
      def create_table(
          self,
          path: Path,
@@ -385,13 +521,14 @@ class Catalog:
          self._tbls[tbl._id] = tbl
          return tbl

-     @_retry_loop
+     @_retry_loop(for_write=True)
      def create_view(
          self,
          path: Path,
          base: TableVersionPath,
          select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
          where: Optional[exprs.Expr],
+         sample_clause: Optional['SampleClause'],
          additional_columns: Optional[dict[str, Any]],
          is_snapshot: bool,
          iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]],
@@ -420,6 +557,7 @@ class Catalog:
              select_list=select_list,
              additional_columns=additional_columns,
              predicate=where,
+             sample_clause=sample_clause,
              is_snapshot=is_snapshot,
              iterator_cls=iterator_class,
              iterator_args=iterator_args,
@@ -431,14 +569,17 @@ class Catalog:
          self._tbls[view._id] = view
          return view

-     @_retry_loop
+     @_retry_loop(for_write=True)
      def create_replica(
          self, path: Path, md: list[schema.FullTableMd], if_exists: IfExistsParam = IfExistsParam.ERROR
-     ) -> Table:
+     ) -> None:
          """
          Creates table, table_version, and table_schema_version records for a replica with the given metadata.
          The metadata should be presented in standard "ancestor order", with the table being replicated at
          list position 0 and the (root) base table at list position -1.
+
+         TODO: create_replica() also needs to create the store tables and populate them in order to make
+         replica creation atomic.
          """
          tbl_id = UUID(md[0].tbl_md.tbl_id)

@@ -451,20 +592,19 @@ class Catalog:
                  'but a different table already exists at that location.'
              )
              assert isinstance(existing, View)
-             return existing
+             return

          # Ensure that the system directory exists.
          self._create_dir(Path('_system', allow_system_paths=True), if_exists=IfExistsParam.IGNORE, parents=False)

          # Now check to see if this table already exists in the catalog.
-         # TODO: Handle concurrency in create_replica()
          existing = Catalog.get().get_table_by_id(tbl_id)
          if existing is not None:
-             existing_path = Path(existing._path, allow_system_paths=True)
+             existing_path = Path(existing._path(), allow_system_paths=True)
              # It does exist. If it's a non-system table, that's an error: it's already been replicated.
              if not existing_path.is_system_path:
                  raise excs.Error(
-                     f'That table has already been replicated as {existing._path!r}. \n'
+                     f'That table has already been replicated as {existing._path()!r}. \n'
                      f'Drop the existing replica if you wish to re-create it.'
                  )
              # If it's a system table, then this means it was created at some point as the ancestor of some other
@@ -489,22 +629,20 @@ class Catalog:
              # The table already exists in the catalog. The existing path might be a system path (if the table
              # was created as an anonymous base table of some other table), or it might not (if it's a snapshot
              # that was directly replicated by the user at some point). In either case, use the existing path.
-             replica_path = Path(replica._path, allow_system_paths=True)
+             replica_path = Path(replica._path(), allow_system_paths=True)

          # Store the metadata; it could be a new version (in which case a new record will be created) or a
          # known version (in which case the newly received metadata will be validated as identical).
          self.__store_replica_md(replica_path, ancestor_md)

-         # Update the catalog (as a final step, after all DB operations completed successfully).
-         # Only the table being replicated is actually made visible in the catalog.
-         self._tbls[tbl_id] = self._load_tbl(tbl_id)
-         return self._tbls[tbl_id]
+         # don't create TableVersion instances at this point, they would be superseded by calls to TV.create_replica()
+         # in TableRestorer.restore()

      def __store_replica_md(self, path: Path, md: schema.FullTableMd) -> None:
          _logger.info(f'Creating replica table at {path!r} with ID: {md.tbl_md.tbl_id}')
-         # TODO: Handle concurrency
          dir = self._get_schema_object(path.parent, expected=Dir, raise_if_not_exists=True)
          assert dir is not None
+         assert self._in_write_xact

          conn = Env.get().conn
          tbl_id = md.tbl_md.tbl_id
@@ -582,14 +720,24 @@ class Catalog:

          self.store_tbl_md(UUID(tbl_id), new_tbl_md, new_version_md, new_schema_version_md)

-     @_retry_loop
+     @_retry_loop(for_write=False)
      def get_table(self, path: Path) -> Table:
+         obj = self._get_table(path)
+         return obj
+
+     def _get_table(self, path: Path) -> Table:
          obj = Catalog.get()._get_schema_object(path, expected=Table, raise_if_not_exists=True)
          assert isinstance(obj, Table)
-         obj._tbl_version.get().ensure_md_loaded()
+         tbl_version = obj._tbl_version.get()
+         # TODO: instead of calling this here, move the logic into TableVersion.init(), which is called after
+         # registering the instance in _tbl_versions
+         tbl_version.ensure_md_loaded()
+         # if this table has mutable views, we need to load those as well, in order to record column dependencies
+         for v in tbl_version.mutable_views:
+             self.get_table_by_id(v.id)
          return obj

-     @_retry_loop
+     @_retry_loop(for_write=True)
      def drop_table(self, path: Path, if_not_exists: IfNotExistsParam, force: bool) -> None:
          _, _, src_obj = self._prepare_dir_op(
              drop_dir_path=path.parent,
@@ -621,11 +769,11 @@ class Catalog:
          msg: str
          if is_replace:
              msg = (
-                 f'{obj_type_str} {tbl._path} already exists and has dependents. '
+                 f'{obj_type_str} {tbl._path()} already exists and has dependents. '
                  "Use `if_exists='replace_force'` to replace it."
              )
          else:
-             msg = f'{obj_type_str} {tbl._path} has dependents.'
+             msg = f'{obj_type_str} {tbl._path()} has dependents.'
          raise excs.Error(msg)

          for view_id in view_ids:
@@ -636,9 +784,9 @@ class Catalog:
          tbl._drop()
          assert tbl._id in self._tbls
          del self._tbls[tbl._id]
-         _logger.info(f'Dropped table `{tbl._path}`.')
+         _logger.info(f'Dropped table `{tbl._path()}`.')

-     @_retry_loop
+     @_retry_loop(for_write=True)
      def create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:
          return self._create_dir(path, if_exists, parents)

@@ -673,7 +821,7 @@ class Catalog:
          Env.get().console_logger.info(f'Created directory {str(path)!r}.')
          return dir

-     @_retry_loop
+     @_retry_loop(for_write=True)
      def drop_dir(self, path: Path, if_not_exists: IfNotExistsParam, force: bool) -> None:
          _, _, schema_obj = self._prepare_dir_op(
              drop_dir_path=path.parent,
@@ -698,7 +846,7 @@ class Catalog:
              raise excs.Error(f'Directory {str(dir_path)!r} is not empty.')

          # drop existing subdirs
-         self._lock_dir(dir_id, None, None)
+         self._acquire_dir_xlock(dir_id, None, None)
          dir_q = sql.select(schema.Dir).where(schema.Dir.parent_id == dir_id)
          for row in conn.execute(dir_q).all():
              self._drop_dir(row.id, dir_path.append(row.md['name']), force=True)
@@ -725,17 +873,37 @@ class Catalog:
          return result

      def get_tbl_version(self, tbl_id: UUID, effective_version: Optional[int]) -> Optional[TableVersion]:
-         if (tbl_id, effective_version) not in self._tbl_versions:
-             self._tbl_versions[tbl_id, effective_version] = self._load_tbl_version(tbl_id, effective_version)
-         return self._tbl_versions[tbl_id, effective_version]
-
-     def add_tbl_version(self, tbl_version: TableVersion) -> None:
-         """Explicitly add a TableVersion"""
-         self._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
-         # if this is a mutable view, also record it in the base
-         if tbl_version.is_view and tbl_version.effective_version is None:
-             base = tbl_version.base.get()
-             base.mutable_views.append(TableVersionHandle(tbl_version.id, tbl_version.effective_version))
+         # we need a transaction here, if we're not already in one; if this starts a new transaction,
+         # the returned TableVersion instance will not be validated
+         with self.begin_xact(tbl_id=tbl_id, for_write=False) as conn:
+             tv = self._tbl_versions.get((tbl_id, effective_version))
+             if tv is None:
+                 tv = self._load_tbl_version(tbl_id, effective_version)
+             elif not tv.is_validated:
+                 # only live instances are invalidated
+                 assert effective_version is None
+                 # we validate live instances by comparing our cached version number to the stored current version
+                 # _logger.debug(f'validating metadata for table {tbl_id}:{tv.version} ({id(tv):x})')
+                 q = sql.select(schema.Table.md).where(schema.Table.id == tbl_id)
+                 row = conn.execute(q).one()
+                 current_version = row.md['current_version']
+
+                 # the stored version can be behind TableVersion.version, because we don't roll back the in-memory
+                 # metadata changes after a failed update operation
+                 if current_version != tv.version:
+                     # the cached metadata is invalid
+                     _logger.debug(
+                         f'reloading metadata for table {tbl_id} '
+                         f'(cached version: {tv.version}, current version: {current_version}'
+                         # f', id: {id(tv):x})'
+                     )
+                     tv = self._load_tbl_version(tbl_id, None)
+                 else:
+                     # the cached metadata is valid
+                     tv.is_validated = True
+
+             assert tv.is_validated
+             return tv

      def remove_tbl_version(self, tbl_version: TableVersion) -> None:
          assert (tbl_version.id, tbl_version.effective_version) in self._tbl_versions
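
Note: the rewritten `get_tbl_version()` is essentially version-stamp cache validation: a cached live `TableVersion` is trusted only while its version matches the `current_version` stored in the `Table` record, otherwise it is reloaded. A generic sketch of that scheme, with hypothetical `load_from_store` / `stored_version` helpers standing in for the metadata queries:

    from typing import Callable


    class VersionStampCache:
        # each cache entry remembers the version it was loaded at; a mismatch with the
        # version currently recorded in the backing store forces a reload
        def __init__(
            self,
            load_from_store: Callable[[str], tuple[int, object]],
            stored_version: Callable[[str], int],
        ) -> None:
            self._load_from_store = load_from_store  # returns (version, value)
            self._stored_version = stored_version  # returns the store's current version
            self._entries: dict[str, tuple[int, object]] = {}

        def get(self, key: str) -> object:
            entry = self._entries.get(key)
            if entry is not None and entry[0] == self._stored_version(key):
                return entry[1]  # cached copy is still current
            version, value = self._load_from_store(key)  # missing or stale: reload
            self._entries[key] = (version, value)
            return value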
@@ -745,7 +913,7 @@ class Catalog:
          """Return the Dir with the given id, or None if it doesn't exist"""
          conn = Env.get().conn
          if for_update:
-             self._lock_dir(None, dir_id, None)
+             self._acquire_dir_xlock(None, dir_id, None)
          q = sql.select(schema.Dir).where(schema.Dir.id == dir_id)
          row = conn.execute(q).one_or_none()
          if row is None:
@@ -761,7 +929,7 @@ class Catalog:
          conn = Env.get().conn
          if path.is_root:
              if for_update:
-                 self._lock_dir(parent_id=None, dir_id=None, dir_name='')
+                 self._acquire_dir_xlock(parent_id=None, dir_id=None, dir_name='')
              q = sql.select(schema.Dir).where(schema.Dir.parent_id.is_(None), schema.Dir.md['user'].astext == user)
              row = conn.execute(q).one_or_none()
              return schema.Dir(**row._mapping) if row is not None else None
@@ -770,7 +938,7 @@ class Catalog:
          if parent_dir is None:
              return None
          if for_update:
-             self._lock_dir(parent_id=parent_dir.id, dir_id=None, dir_name=path.name)
+             self._acquire_dir_xlock(parent_id=parent_dir.id, dir_id=None, dir_name=path.name)
          q = sql.select(schema.Dir).where(
              schema.Dir.parent_id == parent_dir.id,
              schema.Dir.md['name'].astext == path.name,
@@ -780,6 +948,7 @@ class Catalog:
          return schema.Dir(**row._mapping) if row is not None else None

      def _load_tbl(self, tbl_id: UUID) -> Optional[Table]:
+         """Loads metadata for the table with the given id and caches it."""
          _logger.info(f'Loading table {tbl_id}')
          from .insertable_table import InsertableTable
          from .view import View
@@ -808,8 +977,9 @@ class Catalog:
          if view_md is None:
              # this is a base table
              if (tbl_id, None) not in self._tbl_versions:
-                 self._tbl_versions[tbl_id, None] = self._load_tbl_version(tbl_id, None)
+                 _ = self._load_tbl_version(tbl_id, None)
              tbl = InsertableTable(tbl_record.dir_id, TableVersionHandle(tbl_id, None))
+             self._tbls[tbl_id] = tbl
              return tbl

          # this is a view; determine the sequence of TableVersions to load
@@ -829,18 +999,18 @@ class Catalog:
          view_path: Optional[TableVersionPath] = None
          for id, effective_version in tbl_version_path[::-1]:
              if (id, effective_version) not in self._tbl_versions:
-                 self._tbl_versions[id, effective_version] = self._load_tbl_version(id, effective_version)
+                 _ = self._load_tbl_version(id, effective_version)
              view_path = TableVersionPath(TableVersionHandle(id, effective_version), base=base_path)
              base_path = view_path
          view = View(tbl_id, tbl_record.dir_id, tbl_md.name, view_path, snapshot_only=pure_snapshot)
-         # TODO: also load mutable views
+         self._tbls[tbl_id] = view
          return view

      def load_tbl_md(self, tbl_id: UUID, effective_version: Optional[int]) -> schema.FullTableMd:
          """
          Loads metadata from the store for a given table UUID and version.
          """
-         _logger.info(f'Loading metadata for table version: {tbl_id}:{effective_version}')
+         # _logger.info(f'Loading metadata for table version: {tbl_id}:{effective_version}')
          conn = Env.get().conn

          q = (
@@ -915,8 +1085,15 @@ class Catalog:
          If inserting `version_md` or `schema_version_md` would be a primary key violation, an exception will be raised.
          """
          conn = Env.get().conn
+         assert self._in_write_xact

          if tbl_md is not None:
+             assert tbl_md.tbl_id == str(tbl_id)
+             if version_md is not None:
+                 assert tbl_md.current_version == version_md.version
+                 assert tbl_md.current_schema_version == version_md.schema_version
+             if schema_version_md is not None:
+                 assert tbl_md.current_schema_version == schema_version_md.schema_version
              result = conn.execute(
                  sql.update(schema.Table.__table__)
                  .values({schema.Table.md: dataclasses.asdict(tbl_md)})
@@ -925,6 +1102,9 @@ class Catalog:
              assert result.rowcount == 1, result.rowcount

          if version_md is not None:
+             assert version_md.tbl_id == str(tbl_id)
+             if schema_version_md is not None:
+                 assert version_md.schema_version == schema_version_md.schema_version
              conn.execute(
                  sql.insert(schema.TableVersion.__table__).values(
                      tbl_id=tbl_id, version=version_md.version, md=dataclasses.asdict(version_md)
@@ -932,6 +1112,7 @@ class Catalog:
              )

          if schema_version_md is not None:
+             assert schema_version_md.tbl_id == str(tbl_id)
              conn.execute(
                  sql.insert(schema.TableSchemaVersion.__table__).values(
                      tbl_id=tbl_id,
@@ -978,50 +1159,60 @@ class Catalog:
          return md

      def _load_tbl_version(self, tbl_id: UUID, effective_version: Optional[int]) -> Optional[TableVersion]:
+         """Creates TableVersion instance from stored metadata and registers it in _tbl_versions."""
          tbl_md, _, schema_version_md = self.load_tbl_md(tbl_id, effective_version)
          view_md = tbl_md.view_md

-         _logger.info(f'Loading table version: {tbl_id}:{effective_version}')
          conn = Env.get().conn

-         # load mutable view ids
-         q = sql.select(schema.Table.id).where(
-             sql.text(
-                 f"md->'view_md'->'base_versions'->0->>0 = {tbl_id.hex!r} "
-                 "AND md->'view_md'->'base_versions'->0->1 IS NULL"
+         # load mutable view ids for mutable TableVersions
+         mutable_view_ids: list[UUID] = []
+         # If this is a replica, effective_version should not be None. We see this today, because
+         # the replica's TV instance's Column instances contain value_expr_dicts that reference the live version.
+         # This is presumably a source of bugs, because it ignores schema version changes (eg, column renames).
+         # TODO: retarget the value_expr_dict when instantiating Columns for a particular TV instance.
+         if effective_version is None and not tbl_md.is_replica:
+             q = sql.select(schema.Table.id).where(
+                 sql.text(
+                     f"md->'view_md'->'base_versions'->0->>0 = {tbl_id.hex!r} "
+                     "AND md->'view_md'->'base_versions'->0->>1 IS NULL"
+                 )
              )
          )
-         )
-         mutable_view_ids = [r[0] for r in conn.execute(q).all()]
+             mutable_view_ids = [r[0] for r in conn.execute(q).all()]
          mutable_views = [TableVersionHandle(id, None) for id in mutable_view_ids]

+         tbl_version: TableVersion
          if view_md is None:
              # this is a base table
              tbl_version = TableVersion(
                  tbl_id, tbl_md, effective_version, schema_version_md, mutable_views=mutable_views
              )
-             return tbl_version
+         else:
+             assert len(view_md.base_versions) > 0  # a view needs to have a base
+             pure_snapshot = view_md.is_snapshot and view_md.predicate is None and len(schema_version_md.columns) == 0
+             assert not pure_snapshot  # a pure snapshot doesn't have a physical table backing it, no point in loading it
+
+             base: TableVersionHandle
+             base_path: Optional[TableVersionPath] = None  # needed for live view
+             if view_md.is_snapshot:
+                 base = TableVersionHandle(UUID(view_md.base_versions[0][0]), view_md.base_versions[0][1])
+             else:
+                 base_path = TableVersionPath.from_md(tbl_md.view_md.base_versions)
+                 base = base_path.tbl_version
+
+             tbl_version = TableVersion(
+                 tbl_id,
+                 tbl_md,
+                 effective_version,
+                 schema_version_md,
+                 base_path=base_path,
+                 base=base,
+                 mutable_views=mutable_views,
+             )

-         assert len(view_md.base_versions) > 0  # a view needs to have a base
-         pure_snapshot = view_md.is_snapshot and view_md.predicate is None and len(schema_version_md.columns) == 0
-         assert not pure_snapshot  # a pure snapshot doesn't have a physical table backing it, no point in loading it
+         self._tbl_versions[tbl_id, effective_version] = tbl_version
+         tbl_version.init()

-         base: TableVersionHandle
-         base_path: Optional[TableVersionPath] = None  # needed for live view
-         if view_md.is_snapshot:
-             base = TableVersionHandle(UUID(view_md.base_versions[0][0]), view_md.base_versions[0][1])
-         else:
-             base_path = TableVersionPath.from_md(tbl_md.view_md.base_versions)
-             base = base_path.tbl_version
-
-         tbl_version = TableVersion(
-             tbl_id,
-             tbl_md,
-             effective_version,
-             schema_version_md,
-             base_path=base_path,
-             base=base,
-             mutable_views=mutable_views,
-         )
          return tbl_version

      def _init_store(self) -> None:
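
Note: a subtle fix in this last hunk is the change from `...->0->1 IS NULL` to `...->0->>1 IS NULL` in the mutable-view query: `->` yields a jsonb value, so a stored JSON null does not satisfy `IS NULL`, whereas `->>` extracts text and maps JSON null (or a missing element) to SQL NULL. A hedged sketch of the same predicate written with SQLAlchemy's JSONB operators instead of a raw `sql.text()` fragment (the table definition is a placeholder mirroring `schema.Table`'s `id` and `md` columns; Pixeltable's actual code is the `sql.text()` version shown above):

    import sqlalchemy as sa
    from sqlalchemy.dialects.postgresql import JSONB

    metadata = sa.MetaData()
    # placeholder mirroring schema.Table's id/md columns
    tables = sa.Table('tables', metadata, sa.Column('id', sa.Uuid), sa.Column('md', JSONB))


    def mutable_view_query(base_tbl_id_hex: str) -> sa.Select:
        base_ref = tables.c.md['view_md']['base_versions'][0]
        return sa.select(tables.c.id).where(
            base_ref[0].astext == base_tbl_id_hex,  # renders as ...->'base_versions'->0->>0 = :param
            base_ref[1].astext.is_(None),  # ->>1 IS NULL: JSON null becomes SQL NULL, unlike ->1
        )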