pixeltable 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (119)
  1. pixeltable/__init__.py +2 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +2 -1
  4. pixeltable/catalog/catalog.py +370 -93
  5. pixeltable/catalog/column.py +6 -4
  6. pixeltable/catalog/dir.py +5 -5
  7. pixeltable/catalog/globals.py +14 -16
  8. pixeltable/catalog/insertable_table.py +6 -8
  9. pixeltable/catalog/path.py +14 -7
  10. pixeltable/catalog/table.py +72 -62
  11. pixeltable/catalog/table_version.py +137 -107
  12. pixeltable/catalog/table_version_handle.py +3 -0
  13. pixeltable/catalog/table_version_path.py +1 -1
  14. pixeltable/catalog/view.py +10 -14
  15. pixeltable/dataframe.py +5 -3
  16. pixeltable/env.py +108 -42
  17. pixeltable/exec/__init__.py +2 -0
  18. pixeltable/exec/aggregation_node.py +6 -8
  19. pixeltable/exec/cache_prefetch_node.py +4 -7
  20. pixeltable/exec/component_iteration_node.py +1 -3
  21. pixeltable/exec/data_row_batch.py +1 -2
  22. pixeltable/exec/exec_context.py +1 -1
  23. pixeltable/exec/exec_node.py +1 -2
  24. pixeltable/exec/expr_eval/__init__.py +2 -0
  25. pixeltable/exec/expr_eval/evaluators.py +137 -20
  26. pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
  27. pixeltable/exec/expr_eval/globals.py +68 -7
  28. pixeltable/exec/expr_eval/schedulers.py +25 -23
  29. pixeltable/exec/in_memory_data_node.py +8 -6
  30. pixeltable/exec/row_update_node.py +3 -4
  31. pixeltable/exec/sql_node.py +16 -18
  32. pixeltable/exprs/__init__.py +1 -1
  33. pixeltable/exprs/column_property_ref.py +1 -1
  34. pixeltable/exprs/column_ref.py +3 -3
  35. pixeltable/exprs/compound_predicate.py +1 -1
  36. pixeltable/exprs/data_row.py +17 -1
  37. pixeltable/exprs/expr.py +12 -12
  38. pixeltable/exprs/function_call.py +34 -2
  39. pixeltable/exprs/json_mapper.py +95 -48
  40. pixeltable/exprs/json_path.py +4 -9
  41. pixeltable/exprs/method_ref.py +2 -2
  42. pixeltable/exprs/object_ref.py +2 -2
  43. pixeltable/exprs/row_builder.py +33 -6
  44. pixeltable/exprs/similarity_expr.py +1 -1
  45. pixeltable/exprs/sql_element_cache.py +1 -1
  46. pixeltable/exprs/string_op.py +2 -2
  47. pixeltable/ext/__init__.py +1 -1
  48. pixeltable/ext/functions/__init__.py +1 -1
  49. pixeltable/ext/functions/whisperx.py +1 -1
  50. pixeltable/ext/functions/yolox.py +1 -1
  51. pixeltable/func/__init__.py +1 -1
  52. pixeltable/func/aggregate_function.py +2 -2
  53. pixeltable/func/callable_function.py +3 -6
  54. pixeltable/func/expr_template_function.py +24 -4
  55. pixeltable/func/function.py +7 -9
  56. pixeltable/func/function_registry.py +1 -1
  57. pixeltable/func/query_template_function.py +87 -4
  58. pixeltable/func/signature.py +1 -1
  59. pixeltable/func/tools.py +1 -1
  60. pixeltable/func/udf.py +2 -2
  61. pixeltable/functions/__init__.py +1 -1
  62. pixeltable/functions/anthropic.py +2 -2
  63. pixeltable/functions/audio.py +1 -1
  64. pixeltable/functions/deepseek.py +1 -1
  65. pixeltable/functions/fireworks.py +1 -1
  66. pixeltable/functions/globals.py +6 -6
  67. pixeltable/functions/huggingface.py +1 -1
  68. pixeltable/functions/image.py +1 -1
  69. pixeltable/functions/json.py +1 -1
  70. pixeltable/functions/llama_cpp.py +1 -1
  71. pixeltable/functions/math.py +1 -1
  72. pixeltable/functions/mistralai.py +1 -1
  73. pixeltable/functions/ollama.py +1 -1
  74. pixeltable/functions/openai.py +2 -2
  75. pixeltable/functions/replicate.py +1 -1
  76. pixeltable/functions/string.py +1 -1
  77. pixeltable/functions/timestamp.py +1 -1
  78. pixeltable/functions/together.py +1 -1
  79. pixeltable/functions/util.py +1 -1
  80. pixeltable/functions/video.py +2 -2
  81. pixeltable/functions/vision.py +2 -2
  82. pixeltable/globals.py +7 -2
  83. pixeltable/index/embedding_index.py +12 -1
  84. pixeltable/io/__init__.py +5 -3
  85. pixeltable/io/fiftyone.py +6 -7
  86. pixeltable/io/label_studio.py +21 -20
  87. pixeltable/io/pandas.py +6 -5
  88. pixeltable/iterators/__init__.py +1 -1
  89. pixeltable/metadata/__init__.py +6 -4
  90. pixeltable/metadata/converters/convert_24.py +3 -3
  91. pixeltable/metadata/converters/convert_25.py +1 -1
  92. pixeltable/metadata/converters/convert_29.py +1 -1
  93. pixeltable/metadata/converters/convert_31.py +11 -0
  94. pixeltable/metadata/converters/convert_32.py +15 -0
  95. pixeltable/metadata/converters/convert_33.py +17 -0
  96. pixeltable/metadata/notes.py +3 -0
  97. pixeltable/metadata/schema.py +26 -1
  98. pixeltable/plan.py +2 -3
  99. pixeltable/share/packager.py +8 -24
  100. pixeltable/share/publish.py +20 -9
  101. pixeltable/store.py +9 -6
  102. pixeltable/type_system.py +19 -7
  103. pixeltable/utils/console_output.py +3 -2
  104. pixeltable/utils/coroutine.py +3 -3
  105. pixeltable/utils/dbms.py +66 -0
  106. pixeltable/utils/documents.py +61 -67
  107. pixeltable/utils/exception_handler.py +59 -0
  108. pixeltable/utils/filecache.py +1 -1
  109. pixeltable/utils/http_server.py +3 -2
  110. pixeltable/utils/pytorch.py +1 -1
  111. pixeltable/utils/sql.py +1 -1
  112. pixeltable-0.3.12.dist-info/METADATA +436 -0
  113. pixeltable-0.3.12.dist-info/RECORD +183 -0
  114. pixeltable/catalog/path_dict.py +0 -169
  115. pixeltable-0.3.10.dist-info/METADATA +0 -382
  116. pixeltable-0.3.10.dist-info/RECORD +0 -179
  117. {pixeltable-0.3.10.dist-info → pixeltable-0.3.12.dist-info}/LICENSE +0 -0
  118. {pixeltable-0.3.10.dist-info → pixeltable-0.3.12.dist-info}/WHEEL +0 -0
  119. {pixeltable-0.3.10.dist-info → pixeltable-0.3.12.dist-info}/entry_points.txt +0 -0
@@ -10,10 +10,10 @@ from uuid import UUID
10
10
  import psycopg
11
11
  import sqlalchemy as sql
12
12
 
13
- import pixeltable.exceptions as excs
14
- import pixeltable.metadata.schema as schema
13
+ from pixeltable import exceptions as excs
15
14
  from pixeltable.env import Env
16
15
  from pixeltable.iterators import ComponentIterator
16
+ from pixeltable.metadata import schema
17
17
 
18
18
  from .dir import Dir
19
19
  from .globals import IfExistsParam, IfNotExistsParam, MediaValidation
@@ -33,16 +33,6 @@ if TYPE_CHECKING:
33
33
  _logger = logging.getLogger('pixeltable')
34
34
 
35
35
 
36
- def _lock_str(for_update: bool) -> str:
37
- return 'X' if for_update else 'S'
38
-
39
-
40
- # TODO: remove once the concurrent update behavior has been debugged
41
- # def _debug_print(for_update: bool, msg: str) -> None:
42
- # return
43
- # print(f'{datetime.datetime.now()}: {_lock_str(for_update)}: {msg}')
44
-
45
-
46
36
  def _unpack_row(
47
37
  row: Optional[sql.engine.Row], entities: list[type[sql.orm.decl_api.DeclarativeBase]]
48
38
  ) -> Optional[list[Any]]:
@@ -79,14 +69,17 @@ def _retry_loop(op: Callable[..., T]) -> Callable[..., T]:
79
69
  # in order for retry to work, we need to make sure that there aren't any prior db updates
80
70
  # that are part of an ongoing transaction
81
71
  assert not Env.get().in_xact()
82
- with Env.get().begin_xact() as conn:
72
+ with Env.get().begin_xact():
83
73
  return op(*args, **kwargs)
84
74
  except sql.exc.DBAPIError as e:
85
- if isinstance(e.orig, psycopg.errors.SerializationFailure) and num_remaining_retries > 0:
86
- num_remaining_retries -= 1
87
- print(f'serialization failure:\n{e}')
88
- print('retrying ************************************************************')
89
- time.sleep(1)
75
+ if isinstance(e.orig, psycopg.errors.SerializationFailure):
76
+ if num_remaining_retries > 0:
77
+ num_remaining_retries -= 1
78
+ # print(f'serialization failure:\n{e}')
79
+ # print('retrying ************************************************************')
80
+ time.sleep(1)
81
+ else:
82
+ raise excs.Error(f'Serialization retry limit ({_MAX_RETRIES}) exceeded') from e
90
83
  else:
91
84
  raise
92
85
 
@@ -123,20 +116,39 @@ class Catalog:
123
116
  self._tbls = {} # don't use a defaultdict here, it doesn't cooperate with the debugger
124
117
  self._init_store()
125
118
 
119
+ @classmethod
120
+ def _lock_dir(cls, parent_id: Optional[UUID], dir_id: Optional[UUID], dir_name: Optional[str]) -> None:
121
+ """Update directory record(s) to sequentialize thread access. Lock is released when transaction commits.
122
+ If dir_id is present, then all other conditions are ignored.
123
+ Note that (parent_id==None) is a valid where condition.
124
+ If dir_id is not specified, the user from the environment is added to the directory filters.
125
+ """
126
+ user = Env.get().user
127
+ conn = Env.get().conn
128
+ q = sql.update(schema.Dir).values(lock_dummy=1)
129
+ if dir_id is not None:
130
+ q = q.where(schema.Dir.id == dir_id)
131
+ else:
132
+ q = q.where(schema.Dir.parent_id == parent_id)
133
+ if dir_name is not None:
134
+ q = q.where(schema.Dir.md['name'].astext == dir_name)
135
+ if user is not None:
136
+ q = q.where(schema.Dir.md['user'].astext == user)
137
+ conn.execute(q)
138
+
126
139
  def get_dir_path(self, dir_id: UUID) -> Path:
127
140
  """Return path for directory with given id"""
128
141
  conn = Env.get().conn
129
142
  names: list[str] = []
130
143
  while True:
131
144
  q = sql.select(schema.Dir).where(schema.Dir.id == dir_id)
132
- # _debug_print(for_update=False, msg=f'dir id={dir_id}')
133
145
  row = conn.execute(q).one()
134
146
  dir = schema.Dir(**row._mapping)
135
147
  if dir.md['name'] == '':
136
148
  break
137
149
  names.insert(0, dir.md['name'])
138
150
  dir_id = dir.parent_id
139
- return Path('.'.join(names), empty_is_valid=True)
151
+ return Path('.'.join(names), empty_is_valid=True, allow_system_paths=True)
140
152
 
141
153
  @dataclasses.dataclass
142
154
  class DirEntry:
@@ -155,7 +167,6 @@ class Catalog:
155
167
  result: dict[str, Catalog.DirEntry] = {}
156
168
 
157
169
  q = sql.select(schema.Dir).where(schema.Dir.parent_id == dir_id)
158
- # _debug_print(for_update=False, msg=f'dirs parent_id={dir_id}')
159
170
  rows = conn.execute(q).all()
160
171
  for row in rows:
161
172
  dir = schema.Dir(**row._mapping)
@@ -165,7 +176,6 @@ class Catalog:
165
176
  result[dir.md['name']] = self.DirEntry(dir=dir, dir_entries=dir_contents, table=None)
166
177
 
167
178
  q = sql.select(schema.Table).where(schema.Table.dir_id == dir_id)
168
- # _debug_print(for_update=False, msg=f'tbls parent_id={dir_id}')
169
179
  rows = conn.execute(q).all()
170
180
  for row in rows:
171
181
  tbl = schema.Table(**row._mapping)
@@ -175,6 +185,9 @@ class Catalog:
175
185
 
176
186
  @_retry_loop
177
187
  def move(self, path: Path, new_path: Path) -> None:
188
+ self._move(path, new_path)
189
+
190
+ def _move(self, path: Path, new_path: Path) -> None:
178
191
  _, dest_dir, src_obj = self._prepare_dir_op(
179
192
  add_dir_path=new_path.parent,
180
193
  add_name=new_path.name,
@@ -222,10 +235,10 @@ class Catalog:
222
235
 
223
236
  add_dir: Optional[schema.Dir] = None
224
237
  drop_dir: Optional[schema.Dir] = None
225
- for p in sorted(list(dir_paths)):
238
+ for p in sorted(dir_paths):
226
239
  dir = self._get_dir(p, for_update=True)
227
240
  if dir is None:
228
- raise excs.Error(f'Directory {str(p)!r} does not exist')
241
+ raise excs.Error(f'Directory {str(p)!r} does not exist.')
229
242
  if p == add_dir_path:
230
243
  add_dir = dir
231
244
  if p == drop_dir_path:
@@ -236,14 +249,14 @@ class Catalog:
236
249
  add_obj = self._get_dir_entry(add_dir.id, add_name, for_update=True)
237
250
  if add_obj is not None and raise_if_exists:
238
251
  add_path = add_dir_path.append(add_name)
239
- raise excs.Error(f'Path {str(add_path)!r} already exists')
252
+ raise excs.Error(f'Path {str(add_path)!r} already exists.')
240
253
 
241
254
  drop_obj: Optional[SchemaObject] = None
242
255
  if drop_dir is not None:
243
256
  drop_path = drop_dir_path.append(drop_name)
244
257
  drop_obj = self._get_dir_entry(drop_dir.id, drop_name, for_update=True)
245
258
  if drop_obj is None and raise_if_not_exists:
246
- raise excs.Error(f'Path {str(drop_path)!r} does not exist')
259
+ raise excs.Error(f'Path {str(drop_path)!r} does not exist.')
247
260
  if drop_obj is not None and drop_expected is not None and not isinstance(drop_obj, drop_expected):
248
261
  raise excs.Error(
249
262
  f'{str(drop_path)!r} needs to be a {drop_expected._display_name()} '
@@ -254,32 +267,35 @@ class Catalog:
254
267
  return add_obj, add_dir_obj, drop_obj
255
268
 
256
269
  def _get_dir_entry(self, dir_id: UUID, name: str, for_update: bool = False) -> Optional[SchemaObject]:
270
+ user = Env.get().user
257
271
  conn = Env.get().conn
258
272
 
259
273
  # check for subdirectory
260
- q = sql.select(schema.Dir).where(schema.Dir.parent_id == dir_id, schema.Dir.md['name'].astext == name)
261
274
  if for_update:
262
- q = q.with_for_update()
263
- # _debug_print(for_update, f'dir name={name!r} parent={dir_id}')
264
- # row = conn.execute(q).one_or_none()
265
- # if row is not None:
266
- # dir_record = schema.Dir(**row._mapping)
267
- # return Dir(dir_record.id, dir_record.parent_id, name)
275
+ self._lock_dir(dir_id, None, name)
276
+ q = sql.select(schema.Dir).where(
277
+ schema.Dir.parent_id == dir_id, schema.Dir.md['name'].astext == name, schema.Dir.md['user'].astext == user
278
+ )
268
279
  rows = conn.execute(q).all()
280
+ # The condition below can occur if there is a synchronization failure across multiple processes
281
+ # It indicates database inconsistency.
269
282
  if len(rows) > 1:
270
- assert False, rows
283
+ raise AssertionError(rows)
271
284
  if len(rows) == 1:
272
285
  dir_record = schema.Dir(**rows[0]._mapping)
273
286
  return Dir(dir_record.id, dir_record.parent_id, name)
274
287
 
275
288
  # check for table
276
- q = sql.select(schema.Table.id).where(schema.Table.dir_id == dir_id, schema.Table.md['name'].astext == name)
289
+ q = sql.select(schema.Table.id).where(
290
+ schema.Table.dir_id == dir_id,
291
+ schema.Table.md['name'].astext == name,
292
+ schema.Table.md['user'].astext == user,
293
+ )
277
294
  if for_update:
278
295
  q = q.with_for_update()
279
- # _debug_print(for_update, f'table name={name!r} parent={dir_id}')
280
296
  tbl_id = conn.execute(q).scalar_one_or_none()
281
297
  if tbl_id is not None:
282
- if not tbl_id in self._tbls:
298
+ if tbl_id not in self._tbls:
283
299
  self._tbls[tbl_id] = self._load_tbl(tbl_id)
284
300
  return self._tbls[tbl_id]
285
301
 
@@ -304,26 +320,32 @@ class Catalog:
304
320
  if path.is_root:
305
321
  # the root dir
306
322
  if expected is not None and expected is not Dir:
307
- raise excs.Error(f'{path!r} needs to be a {expected._display_name()} but is a {Dir._display_name()}')
323
+ raise excs.Error(
324
+ f'{str(path)!r} needs to be a {expected._display_name()} but is a {Dir._display_name()}'
325
+ )
308
326
  dir = self._get_dir(path, for_update=for_update)
327
+ if dir is None:
328
+ raise excs.Error(f'Unknown user: {Env.get().user}')
309
329
  return Dir(dir.id, dir.parent_id, dir.md['name'])
310
330
 
311
331
  parent_path = path.parent
312
332
  parent_dir = self._get_dir(parent_path, for_update=False)
313
333
  if parent_dir is None:
314
- raise excs.Error(f'Directory {parent_path!r} does not exist')
334
+ raise excs.Error(f'Directory {str(parent_path)!r} does not exist.')
315
335
  obj = self._get_dir_entry(parent_dir.id, path.name, for_update=for_update)
316
336
 
317
337
  if obj is None and raise_if_not_exists:
318
- raise excs.Error(f'Path {path!r} does not exist')
338
+ raise excs.Error(f'Path {str(path)!r} does not exist.')
319
339
  elif obj is not None and raise_if_exists:
320
- raise excs.Error(f'Path {path!r} is an existing {type(obj)._display_name()}')
340
+ raise excs.Error(f'Path {str(path)!r} is an existing {type(obj)._display_name()}.')
321
341
  elif obj is not None and expected is not None and not isinstance(obj, expected):
322
- raise excs.Error(f'{path!r} needs to be a {expected._display_name()} but is a {type(obj)._display_name()}')
342
+ raise excs.Error(
343
+ f'{str(path)!r} needs to be a {expected._display_name()} but is a {type(obj)._display_name()}.'
344
+ )
323
345
  return obj
324
346
 
325
347
  def get_table_by_id(self, tbl_id: UUID) -> Optional[Table]:
326
- if not tbl_id in self._tbls:
348
+ if tbl_id not in self._tbls:
327
349
  tbl = self._load_tbl(tbl_id)
328
350
  if tbl is None:
329
351
  return None
@@ -409,6 +431,155 @@ class Catalog:
409
431
  self._tbls[view._id] = view
410
432
  return view
411
433
 
434
+ @_retry_loop
435
+ def create_replica(self, path: Path, md: list[schema.FullTableMd], if_exists: IfExistsParam) -> Table:
436
+ """
437
+ Creates table, table_version, and table_schema_version records for a replica with the given metadata.
438
+ The metadata should be presented in standard "ancestor order", with the table being replicated at
439
+ list position 0 and the (root) base table at list position -1.
440
+ """
441
+ tbl_id = UUID(md[0].tbl_md.tbl_id)
442
+
443
+ # First handle path collisions (if_exists='ignore' or 'replace' or etc).
444
+ existing = self._handle_path_collision(path, View, False, if_exists)
445
+ if existing is not None:
446
+ if existing._id != tbl_id:
447
+ raise excs.Error(
448
+ f"An attempt was made to create a replica table at {path!r} with if_exists='ignore', "
449
+ 'but a different table already exists at that location.'
450
+ )
451
+ assert isinstance(existing, View)
452
+ return existing
453
+
454
+ # Ensure that the system directory exists.
455
+ self._create_dir(Path('_system', allow_system_paths=True), if_exists=IfExistsParam.IGNORE, parents=False)
456
+
457
+ # Now check to see if this table already exists in the catalog.
458
+ # TODO: Handle concurrency in create_replica()
459
+ existing = Catalog.get().get_table_by_id(tbl_id)
460
+ if existing is not None:
461
+ existing_path = Path(existing._path(), allow_system_paths=True)
462
+ # It does exist. If it's a non-system table, that's an error: it's already been replicated.
463
+ if not existing_path.is_system_path:
464
+ raise excs.Error(
465
+ f'That table has already been replicated as {existing._path()!r}. \n'
466
+ f'Drop the existing replica if you wish to re-create it.'
467
+ )
468
+ # If it's a system table, then this means it was created at some point as the ancestor of some other
469
+ # table (a snapshot-over-snapshot scenario). In that case, we simply move it to the new (named) location.
470
+ self._move(existing_path, path)
471
+
472
+ # Now store the metadata for this replica. In the case where the table already exists (and was just moved
473
+ # into a named location), this will be a no-op, but it still serves to validate that the newly received
474
+ # metadata is identical to what's in the catalog.
475
+ self.__store_replica_md(path, md[0])
476
+
477
+ # Now store the metadata for all of this table's proper ancestors. If one or more proper ancestors
478
+ # do not yet exist in the store, they will be created as anonymous system tables.
479
+ for ancestor_md in md[1:]:
480
+ ancestor_id = UUID(ancestor_md.tbl_md.tbl_id)
481
+ replica = Catalog.get().get_table_by_id(ancestor_id)
482
+ replica_path: Path
483
+ if replica is None:
484
+ # We've never seen this table before. Create a new anonymous system table for it.
485
+ replica_path = Path(f'_system.replica_{ancestor_id.hex}', allow_system_paths=True)
486
+ else:
487
+ # The table already exists in the catalog. The existing path might be a system path (if the table
488
+ # was created as an anonymous base table of some other table), or it might not (if it's a snapshot
489
+ # that was directly replicated by the user at some point). In either case, use the existing path.
490
+ replica_path = Path(replica._path(), allow_system_paths=True)
491
+
492
+ # Store the metadata; it could be a new version (in which case a new record will be created) or a
493
+ # known version (in which case the newly received metadata will be validated as identical).
494
+ self.__store_replica_md(replica_path, ancestor_md)
495
+
496
+ # Update the catalog (as a final step, after all DB operations completed successfully).
497
+ # Only the table being replicated is actually made visible in the catalog.
498
+ self._tbls[tbl_id] = self._load_tbl(tbl_id)
499
+ return self._tbls[tbl_id]
500
+
501
+ def __store_replica_md(self, path: Path, md: schema.FullTableMd) -> None:
502
+ _logger.info(f'Creating replica table at {path!r} with ID: {md.tbl_md.tbl_id}')
503
+ # TODO: Handle concurrency
504
+ dir = self._get_schema_object(path.parent, expected=Dir, raise_if_not_exists=True)
505
+ assert dir is not None
506
+
507
+ conn = Env.get().conn
508
+ tbl_id = md.tbl_md.tbl_id
509
+
510
+ new_tbl_md: Optional[schema.TableMd] = None
511
+ new_version_md: Optional[schema.TableVersionMd] = None
512
+ new_schema_version_md: Optional[schema.TableSchemaVersionMd] = None
513
+
514
+ # We need to ensure that the table metadata in the catalog always reflects the latest observed version of
515
+ # this table. (In particular, if this is a base table, then its table metadata need to be consistent
516
+ # with the latest version of this table having a replicated view somewhere in the catalog.)
517
+ q: sql.Executable = sql.select(schema.Table.md).where(schema.Table.id == tbl_id)
518
+ existing_md_row = conn.execute(q).one_or_none()
519
+
520
+ if existing_md_row is None:
521
+ # No existing table, so create a new record.
522
+ q = sql.insert(schema.Table.__table__).values(
523
+ id=tbl_id,
524
+ dir_id=dir._id,
525
+ md=dataclasses.asdict(
526
+ dataclasses.replace(md.tbl_md, name=path.name, user=Env.get().user, is_replica=True)
527
+ ),
528
+ )
529
+ conn.execute(q)
530
+ else:
531
+ assert existing_md_row.md['is_replica']
532
+ if md.tbl_md.current_version > existing_md_row.md['current_version']:
533
+ # New metadata is more recent than the metadata currently stored in the DB; we'll update the record
534
+ # in place in the DB.
535
+ new_tbl_md = dataclasses.replace(md.tbl_md, name=path.name, user=Env.get().user, is_replica=True)
536
+
537
+ # Now see if a TableVersion record already exists in the DB for this table version. If not, insert it. If
538
+ # it already exists, check that the existing record is identical to the new one.
539
+ q = (
540
+ sql.select(schema.TableVersion.md)
541
+ .where(schema.TableVersion.tbl_id == tbl_id)
542
+ .where(sql.text(f"({schema.TableVersion.__table__}.md->>'version')::int = {md.version_md.version}"))
543
+ )
544
+ existing_version_md_row = conn.execute(q).one_or_none()
545
+ if existing_version_md_row is None:
546
+ new_version_md = md.version_md
547
+ else:
548
+ existing_version_md = schema.md_from_dict(schema.TableVersionMd, existing_version_md_row.md)
549
+ if existing_version_md != md.version_md:
550
+ raise excs.Error(
551
+ f'The version metadata for the replica {path!r}:{md.version_md.version} is inconsistent with '
552
+ 'the metadata recorded from a prior replica.\n'
553
+ 'This is likely due to data corruption in the replicated table.'
554
+ )
555
+
556
+ # Do the same thing for TableSchemaVersion.
557
+ q = (
558
+ sql.select(schema.TableSchemaVersion.md)
559
+ .where(schema.TableSchemaVersion.tbl_id == tbl_id)
560
+ .where(
561
+ sql.text(
562
+ f"({schema.TableSchemaVersion.__table__}.md->>'schema_version')::int = "
563
+ f'{md.schema_version_md.schema_version}'
564
+ )
565
+ )
566
+ )
567
+ existing_schema_version_md_row = conn.execute(q).one_or_none()
568
+ if existing_schema_version_md_row is None:
569
+ new_schema_version_md = md.schema_version_md
570
+ else:
571
+ existing_schema_version_md = schema.md_from_dict(
572
+ schema.TableSchemaVersionMd, existing_schema_version_md_row.md
573
+ )
574
+ if existing_schema_version_md != md.schema_version_md:
575
+ raise excs.Error(
576
+ f'The schema version metadata for the replica {path!r}:{md.schema_version_md.schema_version} '
577
+ 'is inconsistent with the metadata recorded from a prior replica.\n'
578
+ 'This is likely due to data corruption in the replicated table.'
579
+ )
580
+
581
+ self.store_tbl_md(UUID(tbl_id), new_tbl_md, new_version_md, new_schema_version_md)
582
+
412
583
  @_retry_loop
413
584
  def get_table(self, path: Path) -> Table:
414
585
  obj = Catalog.get()._get_schema_object(path, expected=Table, raise_if_not_exists=True)
@@ -467,6 +638,9 @@ class Catalog:
467
638
 
468
639
  @_retry_loop
469
640
  def create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:
641
+ return self._create_dir(path, if_exists, parents)
642
+
643
+ def _create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:
470
644
  # existing = self._handle_path_collision(path, Dir, False, if_exists)
471
645
  # if existing is not None:
472
646
  # assert isinstance(existing, Dir)
@@ -475,7 +649,7 @@ class Catalog:
475
649
  # parent = self._get_schema_object(path.parent)
476
650
  # assert parent is not None
477
651
  # dir = Dir._create(parent._id, path.name)
478
- # Env.get().console_logger.info(f'Created directory {path!r}.')
652
+ # Env.get().console_logger.info(f'Created directory {str(path)!r}.')
479
653
  # return dir
480
654
 
481
655
  if parents:
@@ -519,10 +693,11 @@ class Catalog:
519
693
  q = sql.select(sql.func.count()).select_from(schema.Table).where(schema.Table.dir_id == dir_id)
520
694
  num_tbls = conn.execute(q).scalar()
521
695
  if num_subdirs + num_tbls > 0:
522
- raise excs.Error(f'Directory {dir_path!r} is not empty.')
696
+ raise excs.Error(f'Directory {str(dir_path)!r} is not empty.')
523
697
 
524
698
  # drop existing subdirs
525
- dir_q = sql.select(schema.Dir).where(schema.Dir.parent_id == dir_id).with_for_update()
699
+ self._lock_dir(dir_id, None, None)
700
+ dir_q = sql.select(schema.Dir).where(schema.Dir.parent_id == dir_id)
526
701
  for row in conn.execute(dir_q).all():
527
702
  self._drop_dir(row.id, dir_path.append(row.md['name']), force=True)
528
703
 
@@ -535,7 +710,6 @@ class Catalog:
535
710
  self._drop_tbl(tbl, force=True, is_replace=False)
536
711
 
537
712
  # self.drop_dir(dir_id)
538
- # _debug_print(for_update=True, msg=f'drop dir id={dir_id}')
539
713
  conn.execute(sql.delete(schema.Dir).where(schema.Dir.id == dir_id))
540
714
  _logger.info(f'Removed directory {str(dir_path)!r}.')
541
715
 
@@ -545,18 +719,17 @@ class Catalog:
545
719
  q = sql.select(schema.Table.id).where(sql.text(f"md->'view_md'->'base_versions'->0->>0 = {tbl_id.hex!r}"))
546
720
  if for_update:
547
721
  q = q.with_for_update()
548
- # _debug_print(for_update=False, msg=f'views of tbl id={tbl_id}')
549
722
  result = [r[0] for r in conn.execute(q).all()]
550
723
  return result
551
724
 
552
725
  def get_tbl_version(self, tbl_id: UUID, effective_version: Optional[int]) -> Optional[TableVersion]:
553
726
  if (tbl_id, effective_version) not in self._tbl_versions:
554
- self._tbl_versions[(tbl_id, effective_version)] = self._load_tbl_version(tbl_id, effective_version)
555
- return self._tbl_versions[(tbl_id, effective_version)]
727
+ self._tbl_versions[tbl_id, effective_version] = self._load_tbl_version(tbl_id, effective_version)
728
+ return self._tbl_versions[tbl_id, effective_version]
556
729
 
557
730
  def add_tbl_version(self, tbl_version: TableVersion) -> None:
558
731
  """Explicitly add a TableVersion"""
559
- self._tbl_versions[(tbl_version.id, tbl_version.effective_version)] = tbl_version
732
+ self._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
560
733
  # if this is a mutable view, also record it in the base
561
734
  if tbl_version.is_view and tbl_version.effective_version is None:
562
735
  base = tbl_version.base.get()
@@ -564,15 +737,14 @@ class Catalog:
564
737
 
565
738
  def remove_tbl_version(self, tbl_version: TableVersion) -> None:
566
739
  assert (tbl_version.id, tbl_version.effective_version) in self._tbl_versions
567
- del self._tbl_versions[(tbl_version.id, tbl_version.effective_version)]
740
+ del self._tbl_versions[tbl_version.id, tbl_version.effective_version]
568
741
 
569
742
  def get_dir(self, dir_id: UUID, for_update: bool = False) -> Optional[Dir]:
570
743
  """Return the Dir with the given id, or None if it doesn't exist"""
571
744
  conn = Env.get().conn
572
- q = sql.select(schema.Dir).where(schema.Dir.id == dir_id)
573
745
  if for_update:
574
- q = q.with_for_update()
575
- # _debug_print(for_update=False, msg=f'dir id={dir_id!r}')
746
+ self._lock_dir(None, dir_id, None)
747
+ q = sql.select(schema.Dir).where(schema.Dir.id == dir_id)
576
748
  row = conn.execute(q).one_or_none()
577
749
  if row is None:
578
750
  return None
@@ -581,28 +753,27 @@ class Catalog:
581
753
 
582
754
  def _get_dir(self, path: Path, for_update: bool = False) -> Optional[schema.Dir]:
583
755
  """
584
- Locking protocol:
585
- - S locks on all ancestors
586
- - X lock on dir if for_update == True, otherwise also an S lock
756
+ Locking protocol: X locks on all ancestors
587
757
  """
758
+ user = Env.get().user
588
759
  conn = Env.get().conn
589
760
  if path.is_root:
590
- q = sql.select(schema.Dir).where(schema.Dir.parent_id.is_(None))
591
761
  if for_update:
592
- q = q.with_for_update()
593
- # _debug_print(for_update, 'root dir')
594
- row = conn.execute(q).one()
595
- return schema.Dir(**row._mapping)
762
+ self._lock_dir(parent_id=None, dir_id=None, dir_name='')
763
+ q = sql.select(schema.Dir).where(schema.Dir.parent_id.is_(None), schema.Dir.md['user'].astext == user)
764
+ row = conn.execute(q).one_or_none()
765
+ return schema.Dir(**row._mapping) if row is not None else None
596
766
  else:
597
767
  parent_dir = self._get_dir(path.parent, for_update=False)
598
768
  if parent_dir is None:
599
769
  return None
770
+ if for_update:
771
+ self._lock_dir(parent_id=parent_dir.id, dir_id=None, dir_name=path.name)
600
772
  q = sql.select(schema.Dir).where(
601
- schema.Dir.parent_id == parent_dir.id, schema.Dir.md['name'].astext == path.name
773
+ schema.Dir.parent_id == parent_dir.id,
774
+ schema.Dir.md['name'].astext == path.name,
775
+ schema.Dir.md['user'].astext == user,
602
776
  )
603
- if for_update:
604
- q = q.with_for_update()
605
- # _debug_print(for_update, f'dir {str(path)}')
606
777
  row = conn.execute(q).one_or_none()
607
778
  return schema.Dir(**row._mapping) if row is not None else None
608
779
 
@@ -625,7 +796,6 @@ class Catalog:
625
796
  )
626
797
  .where(schema.Table.id == tbl_id)
627
798
  )
628
- # _debug_print(for_update=False, msg=f'load table id={tbl_id!r}')
629
799
  row = conn.execute(q).one_or_none()
630
800
  if row is None:
631
801
  return None
@@ -636,7 +806,7 @@ class Catalog:
636
806
  if view_md is None:
637
807
  # this is a base table
638
808
  if (tbl_id, None) not in self._tbl_versions:
639
- self._tbl_versions[(tbl_id, None)] = self._load_tbl_version(tbl_id, None)
809
+ self._tbl_versions[tbl_id, None] = self._load_tbl_version(tbl_id, None)
640
810
  tbl = InsertableTable(tbl_record.dir_id, TableVersionHandle(tbl_id, None))
641
811
  return tbl
642
812
 
@@ -657,20 +827,26 @@ class Catalog:
657
827
  view_path: Optional[TableVersionPath] = None
658
828
  for id, effective_version in tbl_version_path[::-1]:
659
829
  if (id, effective_version) not in self._tbl_versions:
660
- self._tbl_versions[(id, effective_version)] = self._load_tbl_version(id, effective_version)
830
+ self._tbl_versions[id, effective_version] = self._load_tbl_version(id, effective_version)
661
831
  view_path = TableVersionPath(TableVersionHandle(id, effective_version), base=base_path)
662
832
  base_path = view_path
663
833
  view = View(tbl_id, tbl_record.dir_id, tbl_md.name, view_path, snapshot_only=pure_snapshot)
664
834
  # TODO: also load mutable views
665
835
  return view
666
836
 
667
- def _load_tbl_version(self, tbl_id: UUID, effective_version: Optional[int]) -> Optional[TableVersion]:
668
- _logger.info(f'Loading table version: {tbl_id}:{effective_version}')
837
+ def load_tbl_md(self, tbl_id: UUID, effective_version: Optional[int]) -> schema.FullTableMd:
838
+ """
839
+ Loads metadata from the store for a given table UUID and version.
840
+ """
841
+ _logger.info(f'Loading metadata for table version: {tbl_id}:{effective_version}')
669
842
  conn = Env.get().conn
843
+
670
844
  q = (
671
- sql.select(schema.Table, schema.TableSchemaVersion)
845
+ sql.select(schema.Table, schema.TableVersion, schema.TableSchemaVersion)
672
846
  .select_from(schema.Table)
673
847
  .where(schema.Table.id == tbl_id)
848
+ .join(schema.TableVersion)
849
+ .where(schema.TableVersion.tbl_id == tbl_id)
674
850
  .join(schema.TableSchemaVersion)
675
851
  .where(schema.TableSchemaVersion.tbl_id == tbl_id)
676
852
  )
@@ -682,16 +858,11 @@ class Catalog:
682
858
  # JOIN TableVersion tv ON (tv.tbl_id = tbl_id AND tv.version = effective_version)
683
859
  # JOIN TableSchemaVersion tsv ON (tsv.tbl_id = tbl_id AND tv.md.schema_version = tsv.schema_version)
684
860
  # WHERE t.id = tbl_id
685
- q = (
686
- q.join(schema.TableVersion)
687
- .where(schema.TableVersion.tbl_id == tbl_id)
688
- .where(sql.text(f"({schema.TableVersion.__table__}.md->>'version')::int = {effective_version}"))
689
- .where(
690
- sql.text(
691
- (
692
- f"({schema.TableVersion.__table__}.md->>'schema_version')::int = "
693
- f'{schema.TableSchemaVersion.__table__}.{schema.TableSchemaVersion.schema_version.name}'
694
- )
861
+ q = q.where(sql.text(f"({schema.TableVersion.__table__}.md->>'version')::int = {effective_version}")).where(
862
+ sql.text(
863
+ (
864
+ f"({schema.TableVersion.__table__}.md->>'schema_version')::int = "
865
+ f'{schema.TableSchemaVersion.__table__}.{schema.TableSchemaVersion.schema_version.name}'
695
866
  )
696
867
  )
697
868
  )
@@ -699,9 +870,15 @@ class Catalog:
699
870
  # we are loading the current version
700
871
  # SELECT *
701
872
  # FROM Table t
873
+ # JOIN TableVersion tv ON (tv.tbl_id = tbl_id AND t.current_version = tv.version)
702
874
  # JOIN TableSchemaVersion tsv ON (tsv.tbl_id = tbl_id AND t.current_schema_version = tsv.schema_version)
703
875
  # WHERE t.id = tbl_id
704
876
  q = q.where(
877
+ sql.text(
878
+ f"({schema.Table.__table__}.md->>'current_version')::int = "
879
+ f'{schema.TableVersion.__table__}.{schema.TableVersion.version.name}'
880
+ )
881
+ ).where(
705
882
  sql.text(
706
883
  (
707
884
  f"({schema.Table.__table__}.md->>'current_schema_version')::int = "
@@ -711,11 +888,100 @@ class Catalog:
711
888
  )
712
889
 
713
890
  row = conn.execute(q).one_or_none()
714
- tbl_record, schema_version_record = _unpack_row(row, [schema.Table, schema.TableSchemaVersion])
891
+ assert row is not None, f'Table record not found: {tbl_id}:{effective_version}'
892
+ tbl_record, version_record, schema_version_record = _unpack_row(
893
+ row, [schema.Table, schema.TableVersion, schema.TableSchemaVersion]
894
+ )
895
+ assert tbl_record.id == tbl_id
715
896
  tbl_md = schema.md_from_dict(schema.TableMd, tbl_record.md)
897
+ version_md = schema.md_from_dict(schema.TableVersionMd, version_record.md)
716
898
  schema_version_md = schema.md_from_dict(schema.TableSchemaVersionMd, schema_version_record.md)
899
+
900
+ return schema.FullTableMd(tbl_md, version_md, schema_version_md)
901
+
902
+ def store_tbl_md(
903
+ self,
904
+ tbl_id: UUID,
905
+ tbl_md: Optional[schema.TableMd],
906
+ version_md: Optional[schema.TableVersionMd],
907
+ schema_version_md: Optional[schema.TableSchemaVersionMd],
908
+ ) -> None:
909
+ """
910
+ Stores metadata to the DB. If specified, `tbl_md` will be updated in place (only one such record can exist
911
+ per UUID); `version_md` and `schema_version_md` will be inserted as new records.
912
+
913
+ If inserting `version_md` or `schema_version_md` would be a primary key violation, an exception will be raised.
914
+ """
915
+ conn = Env.get().conn
916
+
917
+ if tbl_md is not None:
918
+ result = conn.execute(
919
+ sql.update(schema.Table.__table__)
920
+ .values({schema.Table.md: dataclasses.asdict(tbl_md)})
921
+ .where(schema.Table.id == tbl_id)
922
+ )
923
+ assert result.rowcount == 1, result.rowcount
924
+
925
+ if version_md is not None:
926
+ conn.execute(
927
+ sql.insert(schema.TableVersion.__table__).values(
928
+ tbl_id=tbl_id, version=version_md.version, md=dataclasses.asdict(version_md)
929
+ )
930
+ )
931
+
932
+ if schema_version_md is not None:
933
+ conn.execute(
934
+ sql.insert(schema.TableSchemaVersion.__table__).values(
935
+ tbl_id=tbl_id,
936
+ schema_version=schema_version_md.schema_version,
937
+ md=dataclasses.asdict(schema_version_md),
938
+ )
939
+ )
940
+
941
+ def delete_tbl_md(self, tbl_id: UUID) -> None:
942
+ """
943
+ Deletes all table metadata from the store for the given table UUID.
944
+ """
945
+ conn = Env.get().conn
946
+ conn.execute(sql.delete(schema.TableSchemaVersion.__table__).where(schema.TableSchemaVersion.tbl_id == tbl_id))
947
+ conn.execute(sql.delete(schema.TableVersion.__table__).where(schema.TableVersion.tbl_id == tbl_id))
948
+ conn.execute(sql.delete(schema.Table.__table__).where(schema.Table.id == tbl_id))
949
+
950
+ def load_replica_md(self, tbl: Table) -> list[schema.FullTableMd]:
951
+ """
952
+ Load metadata for the given table along with all its ancestors. The values of TableMd.current_version and
953
+ TableMd.current_schema_version will be adjusted to ensure that the metadata represent a valid (internally
954
+ consistent) table state.
955
+ """
956
+ # TODO: First acquire X-locks for all relevant metadata entries
957
+
958
+ # Load metadata for every table in the TableVersionPath for `tbl`.
959
+ md = [self.load_tbl_md(tv.id, tv.effective_version) for tv in tbl._tbl_version_path.get_tbl_versions()]
960
+
961
+ # If `tbl` is a named pure snapshot, we're not quite done, since the snapshot metadata won't appear in the
962
+ # TableVersionPath. We need to prepend it separately.
963
+ if tbl._id != tbl._tbl_version.id:
964
+ snapshot_md = self.load_tbl_md(tbl._id, 0)
965
+ md = [snapshot_md, *md]
966
+
967
+ for ancestor_md in md[1:]:
968
+ # For replica metadata, we guarantee that the current_version and current_schema_version of TableMd
969
+ # match the corresponding values in TableVersionMd and TableSchemaVersionMd. This is to ensure that,
970
+ # when the metadata is later stored in the catalog of a different Pixeltable instance, the values of
971
+ # current_version and current_schema_version will always point to versions that are known to the
972
+ # destination catalog.
973
+ ancestor_md.tbl_md.current_version = ancestor_md.version_md.version
974
+ ancestor_md.tbl_md.current_schema_version = ancestor_md.schema_version_md.schema_version
975
+
976
+ return md
977
+
978
+ def _load_tbl_version(self, tbl_id: UUID, effective_version: Optional[int]) -> Optional[TableVersion]:
979
+ tbl_md, _, schema_version_md = self.load_tbl_md(tbl_id, effective_version)
717
980
  view_md = tbl_md.view_md
718
981
 
982
+ _logger.info(f'Loading table version: {tbl_id}:{effective_version}')
983
+ conn = Env.get().conn
984
+
719
985
  # load mutable view ids
720
986
  q = sql.select(schema.Table.id).where(
721
987
  sql.text(
@@ -729,7 +995,7 @@ class Catalog:
729
995
  if view_md is None:
730
996
  # this is a base table
731
997
  tbl_version = TableVersion(
732
- tbl_record.id, tbl_md, effective_version, schema_version_md, mutable_views=mutable_views
998
+ tbl_id, tbl_md, effective_version, schema_version_md, mutable_views=mutable_views
733
999
  )
734
1000
  return tbl_version
735
1001
 
@@ -746,7 +1012,7 @@ class Catalog:
746
1012
  base = base_path.tbl_version
747
1013
 
748
1014
  tbl_version = TableVersion(
749
- tbl_record.id,
1015
+ tbl_id,
750
1016
  tbl_md,
751
1017
  effective_version,
752
1018
  schema_version_md,
@@ -758,16 +1024,25 @@ class Catalog:
758
1024
 
759
1025
  def _init_store(self) -> None:
760
1026
  """One-time initialization of the stored catalog. Idempotent."""
1027
+ self.create_user(None)
1028
+ _logger.info('Initialized catalog.')
1029
+
1030
+ def create_user(self, user: Optional[str]) -> None:
1031
+ """
1032
+ Creates a catalog record (root directory) for the specified user, if one does not already exist.
1033
+ """
761
1034
  with Env.get().begin_xact():
762
1035
  session = Env.get().session
763
- if session.query(sql.func.count(schema.Dir.id)).scalar() > 0:
1036
+ # See if there are any directories in the catalog matching the specified user.
1037
+ if session.query(schema.Dir).where(schema.Dir.md['user'].astext == user).count() > 0:
1038
+ # At least one such directory exists; no need to create a new one.
764
1039
  return
765
- # create a top-level directory, so that every schema object has a directory
766
- dir_md = schema.DirMd(name='', user=None, additional_md={})
1040
+
1041
+ dir_md = schema.DirMd(name='', user=user, additional_md={})
767
1042
  dir_record = schema.Dir(parent_id=None, md=dataclasses.asdict(dir_md))
768
1043
  session.add(dir_record)
769
1044
  session.flush()
770
- _logger.info(f'Initialized catalog')
1045
+ _logger.info(f'Added root directory record for user: {user!r}')
771
1046
 
772
1047
  def _handle_path_collision(
773
1048
  self, path: Path, expected_obj_type: type[SchemaObject], expected_snapshot: bool, if_exists: IfExistsParam
@@ -775,13 +1050,14 @@ class Catalog:
775
1050
  obj, _, _ = self._prepare_dir_op(add_dir_path=path.parent, add_name=path.name)
776
1051
 
777
1052
  if if_exists == IfExistsParam.ERROR and obj is not None:
778
- raise excs.Error(f'Path {path!r} is an existing {type(obj)._display_name()}')
1053
+ raise excs.Error(f'Path {str(path)!r} is an existing {type(obj)._display_name()}')
779
1054
  else:
780
1055
  is_snapshot = isinstance(obj, View) and obj._tbl_version_path.is_snapshot()
781
1056
  if obj is not None and (not isinstance(obj, expected_obj_type) or (expected_snapshot and not is_snapshot)):
782
1057
  obj_type_str = 'snapshot' if expected_snapshot else expected_obj_type._display_name()
783
1058
  raise excs.Error(
784
- f'Path {path!r} already exists but is not a {obj_type_str}. Cannot {if_exists.name.lower()} it.'
1059
+ f'Path {str(path)!r} already exists but is not a {obj_type_str}. '
1060
+ f'Cannot {if_exists.name.lower()} it.'
785
1061
  )
786
1062
 
787
1063
  if obj is None:
@@ -794,7 +1070,8 @@ class Catalog:
794
1070
  dir_contents = self._get_dir_contents(obj._id)
795
1071
  if len(dir_contents) > 0 and if_exists == IfExistsParam.REPLACE:
796
1072
  raise excs.Error(
797
- f'Directory {path!r} already exists and is not empty. Use `if_exists="replace_force"` to replace it.'
1073
+ f'Directory {str(path)!r} already exists and is not empty. '
1074
+ 'Use `if_exists="replace_force"` to replace it.'
798
1075
  )
799
1076
  self._drop_dir(obj._id, path, force=True)
800
1077
  else: