pixeltable 0.3.15__py3-none-any.whl → 0.4.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (48)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/catalog.py +292 -105
  3. pixeltable/catalog/column.py +10 -8
  4. pixeltable/catalog/dir.py +1 -2
  5. pixeltable/catalog/insertable_table.py +25 -20
  6. pixeltable/catalog/schema_object.py +3 -6
  7. pixeltable/catalog/table.py +245 -189
  8. pixeltable/catalog/table_version.py +317 -201
  9. pixeltable/catalog/table_version_handle.py +15 -2
  10. pixeltable/catalog/table_version_path.py +60 -14
  11. pixeltable/catalog/view.py +14 -5
  12. pixeltable/dataframe.py +11 -9
  13. pixeltable/env.py +2 -4
  14. pixeltable/exec/in_memory_data_node.py +1 -1
  15. pixeltable/exec/sql_node.py +20 -11
  16. pixeltable/exprs/column_property_ref.py +15 -6
  17. pixeltable/exprs/column_ref.py +32 -11
  18. pixeltable/exprs/comparison.py +1 -1
  19. pixeltable/exprs/row_builder.py +4 -6
  20. pixeltable/exprs/rowid_ref.py +8 -0
  21. pixeltable/exprs/similarity_expr.py +1 -0
  22. pixeltable/func/query_template_function.py +1 -1
  23. pixeltable/functions/string.py +212 -58
  24. pixeltable/globals.py +7 -4
  25. pixeltable/index/base.py +5 -0
  26. pixeltable/index/btree.py +5 -0
  27. pixeltable/index/embedding_index.py +5 -0
  28. pixeltable/io/external_store.py +8 -29
  29. pixeltable/io/label_studio.py +1 -1
  30. pixeltable/io/parquet.py +2 -2
  31. pixeltable/io/table_data_conduit.py +0 -31
  32. pixeltable/metadata/__init__.py +1 -1
  33. pixeltable/metadata/converters/convert_13.py +2 -2
  34. pixeltable/metadata/converters/convert_30.py +6 -11
  35. pixeltable/metadata/converters/convert_35.py +9 -0
  36. pixeltable/metadata/converters/util.py +3 -9
  37. pixeltable/metadata/notes.py +1 -0
  38. pixeltable/metadata/schema.py +5 -1
  39. pixeltable/plan.py +4 -4
  40. pixeltable/share/packager.py +24 -9
  41. pixeltable/share/publish.py +2 -2
  42. pixeltable/store.py +19 -13
  43. pixeltable/utils/dbms.py +1 -1
  44. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc1.dist-info}/METADATA +1 -1
  45. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc1.dist-info}/RECORD +48 -47
  46. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc1.dist-info}/LICENSE +0 -0
  47. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc1.dist-info}/WHEEL +0 -0
  48. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc1.dist-info}/entry_points.txt +0 -0
pixeltable/catalog/table_version_handle.py CHANGED
@@ -41,8 +41,21 @@ class TableVersionHandle:
41
41
  def get(self) -> TableVersion:
42
42
  from .catalog import Catalog
43
43
 
44
- if self._tbl_version is None:
45
- self._tbl_version = Catalog.get().get_tbl_version(self.id, self.effective_version)
44
+ cat = Catalog.get()
45
+ if self._tbl_version is None or not self._tbl_version.is_validated:
46
+ if self.effective_version is not None and self._tbl_version is not None:
47
+ # this is a snapshot version; we need to make sure we refer to the instance cached
48
+ # in Catalog, in order to avoid mixing sa_tbl instances in the same transaction
49
+ # (which will lead to duplicates in the From clause generated in SqlNode.create_from_clause())
50
+ assert (self.id, self.effective_version) in cat._tbl_versions
51
+ self._tbl_version = cat._tbl_versions[self.id, self.effective_version]
52
+ self._tbl_version.is_validated = True
53
+ else:
54
+ self._tbl_version = Catalog.get().get_tbl_version(self.id, self.effective_version)
55
+ if self.effective_version is None:
56
+ # make sure we don't see a discarded instance of a live TableVersion
57
+ tvs = list(Catalog.get()._tbl_versions.values())
58
+ assert self._tbl_version in tvs
46
59
  return self._tbl_version
47
60
 
48
61
  def as_dict(self) -> dict:
pixeltable/catalog/table_version_path.py CHANGED
@@ -4,9 +4,11 @@ import logging
4
4
  from typing import Optional
5
5
  from uuid import UUID
6
6
 
7
+ from pixeltable.env import Env
7
8
  from pixeltable.metadata import schema
8
9
 
9
10
  from .column import Column
11
+ from .table_version import TableVersion
10
12
  from .table_version_handle import TableVersionHandle
11
13
 
12
14
  _logger = logging.getLogger('pixeltable')
@@ -22,15 +24,28 @@ class TableVersionPath:
22
24
 
23
25
  TableVersionPath contains all metadata needed to execute queries and updates against a particular version of a
24
26
  table/view.
27
+
28
+ TableVersionPath supplies metadata needed for query construction (eg, column names), for which it uses a
29
+ cached TableVersion instance.
30
+ - when running inside a transaction, this instance is guaranteed to be validated
31
+ - when running outside a transaction, we use an unvalidated instance in order to avoid repeated validation
32
+ on every metadata-related method call (the instance won't stay validated, because TableVersionHandle.get()
33
+ runs a local transaction, at the end of which the instance is again invalidated)
34
+ - supplying metadata from an unvalidated instance is okay, because it needs to get revalidated anyway when a
35
+ query actually runs (at which point there is a transaction context) - there is no guarantee that in between
36
+ constructing a DataFrame and executing it, the underlying table schema hasn't changed (eg, a concurrent process
37
+ could have dropped a column referenced in the query).
25
38
  """
26
39
 
27
40
  tbl_version: TableVersionHandle
28
41
  base: Optional[TableVersionPath]
42
+ _cached_tbl_version: Optional[TableVersion]
29
43
 
30
44
  def __init__(self, tbl_version: TableVersionHandle, base: Optional[TableVersionPath] = None):
31
45
  assert tbl_version is not None
32
46
  self.tbl_version = tbl_version
33
47
  self.base = base
48
+ self._cached_tbl_version = None
34
49
 
35
50
  @classmethod
36
51
  def from_md(cls, path: schema.TableVersionPath) -> TableVersionPath:
@@ -47,17 +62,40 @@ class TableVersionPath:
47
62
  result.extend(self.base.as_md())
48
63
  return result
49
64
 
65
+ def refresh_cached_md(self) -> None:
66
+ from pixeltable.catalog import Catalog
67
+
68
+ if Env.get().in_xact:
69
+ # when we're running inside a transaction, we need to make sure to supply current metadata;
70
+ # mixing stale metadata with current metadata leads to query construction failures
71
+ # (multiple sqlalchemy Table instances for the same underlying table create corrupted From clauses)
72
+ if self._cached_tbl_version is not None and self._cached_tbl_version.is_validated:
73
+ # nothing to refresh
74
+ return
75
+ elif self._cached_tbl_version is not None:
76
+ return
77
+
78
+ with Catalog.get().begin_xact(for_write=False):
79
+ self._cached_tbl_version = self.tbl_version.get()
80
+
81
+ def clear_cached_md(self) -> None:
82
+ self._cached_tbl_version = None
83
+ if self.base is not None:
84
+ self.base.clear_cached_md()
85
+
50
86
  def tbl_id(self) -> UUID:
51
87
  """Return the id of the table/view that this path represents"""
52
88
  return self.tbl_version.id
53
89
 
54
90
  def version(self) -> int:
55
91
  """Return the version of the table/view that this path represents"""
56
- return self.tbl_version.get().version
92
+ self.refresh_cached_md()
93
+ return self._cached_tbl_version.version
57
94
 
58
95
  def tbl_name(self) -> str:
59
96
  """Return the name of the table/view that this path represents"""
60
- return self.tbl_version.get().name
97
+ self.refresh_cached_md()
98
+ return self._cached_tbl_version.name
61
99
 
62
100
  def path_len(self) -> int:
63
101
  """Return the length of the path"""
@@ -65,18 +103,22 @@ class TableVersionPath:
65
103
 
66
104
  def is_snapshot(self) -> bool:
67
105
  """Return True if this is a path of snapshot versions"""
68
- if not self.tbl_version.get().is_snapshot:
106
+ self.refresh_cached_md()
107
+ if not self._cached_tbl_version.is_snapshot:
69
108
  return False
70
109
  return self.base.is_snapshot() if self.base is not None else True
71
110
 
72
111
  def is_view(self) -> bool:
73
- return self.tbl_version.get().is_view
112
+ self.refresh_cached_md()
113
+ return self._cached_tbl_version.is_view
74
114
 
75
115
  def is_component_view(self) -> bool:
76
- return self.tbl_version.get().is_component_view
116
+ self.refresh_cached_md()
117
+ return self._cached_tbl_version.is_component_view
77
118
 
78
119
  def is_insertable(self) -> bool:
79
- return self.tbl_version.get().is_insertable()
120
+ self.refresh_cached_md()
121
+ return self._cached_tbl_version.is_insertable
80
122
 
81
123
  def get_tbl_versions(self) -> list[TableVersionHandle]:
82
124
  """Return all tbl versions"""
@@ -100,11 +142,12 @@ class TableVersionPath:
100
142
 
101
143
  def columns(self) -> list[Column]:
102
144
  """Return all user columns visible in this tbl version path, including columns from bases"""
103
- result = list(self.tbl_version.get().cols_by_name.values())
104
- if self.base is not None and self.tbl_version.get().include_base_columns:
145
+ self.refresh_cached_md()
146
+ result = list(self._cached_tbl_version.cols_by_name.values())
147
+ if self.base is not None and self._cached_tbl_version.include_base_columns:
105
148
  base_cols = self.base.columns()
106
149
  # we only include base columns that don't conflict with one of our column names
107
- result.extend(c for c in base_cols if c.name not in self.tbl_version.get().cols_by_name)
150
+ result.extend(c for c in base_cols if c.name not in self._cached_tbl_version.cols_by_name)
108
151
  return result
109
152
 
110
153
  def cols_by_name(self) -> dict[str, Column]:
@@ -119,19 +162,21 @@ class TableVersionPath:
119
162
 
120
163
  def get_column(self, name: str, include_bases: Optional[bool] = None) -> Optional[Column]:
121
164
  """Return the column with the given name, or None if not found"""
122
- col = self.tbl_version.get().cols_by_name.get(name)
165
+ self.refresh_cached_md()
166
+ col = self._cached_tbl_version.cols_by_name.get(name)
123
167
  if col is not None:
124
168
  return col
125
- elif self.base is not None and (include_bases or self.tbl_version.get().include_base_columns):
169
+ elif self.base is not None and (include_bases or self._cached_tbl_version.include_base_columns):
126
170
  return self.base.get_column(name)
127
171
  else:
128
172
  return None
129
173
 
130
174
  def get_column_by_id(self, tbl_id: UUID, col_id: int) -> Optional[Column]:
131
175
  """Return the column for the given tbl/col id"""
176
+ self.refresh_cached_md()
132
177
  if self.tbl_version.id == tbl_id:
133
- assert col_id in self.tbl_version.get().cols_by_id
134
- return self.tbl_version.get().cols_by_id[col_id]
178
+ assert col_id in self._cached_tbl_version.cols_by_id
179
+ return self._cached_tbl_version.cols_by_id[col_id]
135
180
  elif self.base is not None:
136
181
  return self.base.get_column_by_id(tbl_id, col_id)
137
182
  else:
@@ -139,11 +184,12 @@ class TableVersionPath:
139
184
 
140
185
  def has_column(self, col: Column, include_bases: bool = True) -> bool:
141
186
  """Return True if this table has the given column."""
187
+ self.refresh_cached_md()
142
188
  assert col.tbl is not None
143
189
  if (
144
190
  col.tbl.id == self.tbl_version.id
145
191
  and col.tbl.effective_version == self.tbl_version.effective_version
146
- and col.id in self.tbl_version.get().cols_by_id
192
+ and col.id in self._cached_tbl_version.cols_by_id
147
193
  ):
148
194
  # the column is visible in this table version
149
195
  return True
pixeltable/catalog/view.py CHANGED
@@ -204,8 +204,17 @@ class View(Table):
204
204
 
205
205
  from pixeltable.plan import Planner
206
206
 
207
- plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
208
- num_rows, num_excs, _ = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
207
+ try:
208
+ plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
209
+ num_rows, num_excs, _ = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
210
+ except:
211
+ # we need to remove the orphaned TableVersion instance
212
+ del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
213
+ base_tbl_version = base.tbl_version.get()
214
+ if tbl_version.effective_version is None and not base_tbl_version.is_snapshot:
215
+ # also remove tbl_version from the base
216
+ base_tbl_version.mutable_views.remove(TableVersionHandle.create(tbl_version))
217
+ raise
209
218
  Env.get().console_logger.info(f'Created view `{name}` with {num_rows} rows, {num_excs} exceptions.')
210
219
 
211
220
  session.commit()
@@ -285,13 +294,13 @@ class View(Table):
285
294
 
286
295
  def _table_descriptor(self) -> str:
287
296
  display_name = 'Snapshot' if self._snapshot_only else 'View'
288
- result = [f'{display_name} {self._path!r}']
297
+ result = [f'{display_name} {self._path()!r}']
289
298
  bases_descrs: list[str] = []
290
299
  for base, effective_version in zip(self._base_tables, self._effective_base_versions):
291
300
  if effective_version is None:
292
- bases_descrs.append(f'{base._path!r}')
301
+ bases_descrs.append(f'{base._path()!r}')
293
302
  else:
294
- base_descr = f'{base._path}:{effective_version}'
303
+ base_descr = f'{base._path()}:{effective_version}'
295
304
  bases_descrs.append(f'{base_descr!r}')
296
305
  result.append(f' (of {", ".join(bases_descrs)})')
297
306
 
pixeltable/dataframe.py CHANGED
@@ -14,7 +14,7 @@ import pandas as pd
14
14
  import sqlalchemy as sql
15
15
 
16
16
  from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
17
- from pixeltable.catalog import is_valid_identifier
17
+ from pixeltable.catalog import Catalog, is_valid_identifier
18
18
  from pixeltable.catalog.globals import UpdateStatus
19
19
  from pixeltable.env import Env
20
20
  from pixeltable.type_system import ColumnType
@@ -431,7 +431,7 @@ class DataFrame:
431
431
  raise excs.Error(msg) from e
432
432
 
433
433
  def _output_row_iterator(self) -> Iterator[list]:
434
- with Env.get().begin_xact():
434
+ with Catalog.get().begin_xact(for_write=False):
435
435
  try:
436
436
  for data_row in self._exec():
437
437
  yield [data_row[e.slot_idx] for e in self._select_list_exprs]
@@ -463,8 +463,8 @@ class DataFrame:
463
463
 
464
464
  from pixeltable.plan import Planner
465
465
 
466
- stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
467
- with Env.get().begin_xact() as conn:
466
+ with Catalog.get().begin_xact(for_write=False) as conn:
467
+ stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
468
468
  result: int = conn.execute(stmt).scalar_one()
469
469
  assert isinstance(result, int)
470
470
  return result
@@ -988,7 +988,8 @@ class DataFrame:
988
988
  >>> df = person.where(t.year == 2014).update({'age': 30})
989
989
  """
990
990
  self._validate_mutable('update', False)
991
- with Env.get().begin_xact():
991
+ tbl_id = self._first_tbl.tbl_id()
992
+ with Catalog.get().begin_xact(tbl_id=tbl_id, for_write=True):
992
993
  return self._first_tbl.tbl_version.get().update(value_spec, where=self.where_clause, cascade=cascade)
993
994
 
994
995
  def delete(self) -> UpdateStatus:
@@ -1011,7 +1012,8 @@ class DataFrame:
1011
1012
  self._validate_mutable('delete', False)
1012
1013
  if not self._first_tbl.is_insertable():
1013
1014
  raise excs.Error('Cannot delete from view')
1014
- with Env.get().begin_xact():
1015
+ tbl_id = self._first_tbl.tbl_id()
1016
+ with Catalog.get().begin_xact(tbl_id=tbl_id, for_write=True):
1015
1017
  return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
1016
1018
 
1017
1019
  def _validate_mutable(self, op_name: str, allow_select: bool) -> None:
@@ -1059,7 +1061,7 @@ class DataFrame:
1059
1061
  @classmethod
1060
1062
  def from_dict(cls, d: dict[str, Any]) -> 'DataFrame':
1061
1063
  # we need to wrap the construction with a transaction, because it might need to load metadata
1062
- with Env.get().begin_xact():
1064
+ with Catalog.get().begin_xact(for_write=False):
1063
1065
  tbls = [catalog.TableVersionPath.from_dict(tbl_dict) for tbl_dict in d['from_clause']['tbls']]
1064
1066
  join_clauses = [plan.JoinClause(**clause_dict) for clause_dict in d['from_clause']['join_clauses']]
1065
1067
  from_clause = plan.FromClause(tbls=tbls, join_clauses=join_clauses)
@@ -1129,7 +1131,7 @@ class DataFrame:
1129
1131
  assert data_file_path.is_file()
1130
1132
  return data_file_path
1131
1133
  else:
1132
- with Env.get().begin_xact():
1134
+ with Catalog.get().begin_xact(for_write=False):
1133
1135
  return write_coco_dataset(self, dest_path)
1134
1136
 
1135
1137
  def to_pytorch_dataset(self, image_format: str = 'pt') -> 'torch.utils.data.IterableDataset':
@@ -1174,7 +1176,7 @@ class DataFrame:
1174
1176
  if dest_path.exists(): # fast path: use cache
1175
1177
  assert dest_path.is_dir()
1176
1178
  else:
1177
- with Env.get().begin_xact():
1179
+ with Catalog.get().begin_xact(for_write=False):
1178
1180
  export_parquet(self, dest_path, inline_images=True)
1179
1181
 
1180
1182
  return PixeltablePytorchDataset(path=dest_path, image_format=image_format)
pixeltable/env.py CHANGED
@@ -191,6 +191,7 @@ class Env:
191
191
  assert self._dbms is not None
192
192
  return self._dbms
193
193
 
194
+ @property
194
195
  def in_xact(self) -> bool:
195
196
  return self._current_conn is not None
196
197
 
@@ -201,20 +202,17 @@ class Env:
201
202
 
202
203
  @contextmanager
203
204
  def begin_xact(self) -> Iterator[sql.Connection]:
204
- """Return a context manager that yields a connection to the database. Idempotent."""
205
+ """Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly."""
205
206
  if self._current_conn is None:
206
207
  assert self._current_session is None
207
208
  try:
208
209
  with self.engine.begin() as conn, sql.orm.Session(conn) as session:
209
- # TODO: remove print() once we're done with debugging the concurrent update behavior
210
- # print(f'{datetime.datetime.now()}: start xact')
211
210
  self._current_conn = conn
212
211
  self._current_session = session
213
212
  yield conn
214
213
  finally:
215
214
  self._current_session = None
216
215
  self._current_conn = None
217
- # print(f'{datetime.datetime.now()}: end xact')
218
216
  else:
219
217
  assert self._current_session is not None
220
218
  yield self._current_conn
pixeltable/exec/in_memory_data_node.py CHANGED
@@ -38,7 +38,7 @@ class InMemoryDataNode(ExecNode):
38
38
  # we materialize the input slots
39
39
  output_exprs = list(row_builder.input_exprs)
40
40
  super().__init__(row_builder, output_exprs, [], None)
41
- assert tbl.get().is_insertable()
41
+ assert tbl.get().is_insertable
42
42
  self.tbl = tbl
43
43
  self.input_rows = rows
44
44
  self.start_row_id = start_row_id
pixeltable/exec/sql_node.py CHANGED
@@ -134,6 +134,11 @@ class SqlNode(ExecNode):
134
134
  self.where_clause_element = None
135
135
  self.order_by_clause = []
136
136
 
137
+ if self.tbl is not None:
138
+ tv = self.tbl.tbl_version._tbl_version
139
+ if tv is not None:
140
+ assert tv.is_validated
141
+
137
142
  def _create_stmt(self) -> sql.Select:
138
143
  """Create Select from local state"""
139
144
 
@@ -141,6 +146,7 @@ class SqlNode(ExecNode):
141
146
  sql_select_list = [self.sql_elements.get(e) for e in self.select_list]
142
147
  if self.set_pk:
143
148
  assert self.tbl is not None
149
+ assert self.tbl.tbl_version.get().is_validated
144
150
  sql_select_list += self.tbl.tbl_version.get().store_tbl.pk_columns()
145
151
  stmt = sql.select(*sql_select_list)
146
152
 
@@ -220,26 +226,29 @@ class SqlNode(ExecNode):
220
226
  joined_tbls.append(t)
221
227
 
222
228
  first = True
223
- prev_tbl: Optional[catalog.TableVersionHandle] = None
229
+ prev_tv: Optional[catalog.TableVersion] = None
224
230
  for t in joined_tbls[::-1]:
231
+ tv = t.get()
232
+ # _logger.debug(f'create_from_clause: tbl_id={tv.id} {id(tv.store_tbl.sa_tbl)}')
225
233
  if first:
226
- stmt = stmt.select_from(t.get().store_tbl.sa_tbl)
234
+ stmt = stmt.select_from(tv.store_tbl.sa_tbl)
227
235
  first = False
228
236
  else:
229
- # join tbl to prev_tbl on prev_tbl's rowid cols
230
- prev_tbl_rowid_cols = prev_tbl.get().store_tbl.rowid_columns()
231
- tbl_rowid_cols = t.get().store_tbl.rowid_columns()
237
+ # join tv to prev_tv on prev_tv's rowid cols
238
+ prev_tbl_rowid_cols = prev_tv.store_tbl.rowid_columns()
239
+ tbl_rowid_cols = tv.store_tbl.rowid_columns()
232
240
  rowid_clauses = [
233
241
  c1 == c2 for c1, c2 in zip(prev_tbl_rowid_cols, tbl_rowid_cols[: len(prev_tbl_rowid_cols)])
234
242
  ]
235
- stmt = stmt.join(t.get().store_tbl.sa_tbl, sql.and_(*rowid_clauses))
243
+ stmt = stmt.join(tv.store_tbl.sa_tbl, sql.and_(*rowid_clauses))
244
+
236
245
  if t.id in exact_version_only:
237
- stmt = stmt.where(t.get().store_tbl.v_min_col == t.get().version)
246
+ stmt = stmt.where(tv.store_tbl.v_min_col == tv.version)
238
247
  else:
239
- stmt = stmt.where(t.get().store_tbl.v_min_col <= t.get().version).where(
240
- t.get().store_tbl.v_max_col > t.get().version
241
- )
242
- prev_tbl = t
248
+ stmt = stmt.where(tv.store_tbl.sa_tbl.c.v_min <= tv.version)
249
+ stmt = stmt.where(tv.store_tbl.sa_tbl.c.v_max > tv.version)
250
+ prev_tv = tv
251
+
243
252
  return stmt
244
253
 
245
254
  def set_where(self, where_clause: exprs.Expr) -> None:
pixeltable/exprs/column_property_ref.py CHANGED
@@ -58,20 +58,29 @@ class ColumnPropertyRef(Expr):
58
58
  if not self._col_ref.col.is_stored:
59
59
  return None
60
60
 
61
+ # we need to reestablish that we have the correct Column instance, there could have been a metadata
62
+ # reload since init()
63
+ # TODO: add an explicit prepare phase (ie, Expr.prepare()) that gives every subclass instance a chance to
64
+ # perform runtime checks and update state
65
+ tv = self._col_ref.tbl_version.get()
66
+ assert tv.is_validated
67
+ col = tv.cols_by_id[self._col_ref.col_id]
68
+ # TODO: check for column being dropped
69
+
61
70
  # the errortype/-msg properties of a read-validated media column need to be extracted from the DataRow
62
71
  if (
63
- self._col_ref.col.col_type.is_media_type()
64
- and self._col_ref.col.media_validation == catalog.MediaValidation.ON_READ
72
+ col.col_type.is_media_type()
73
+ and col.media_validation == catalog.MediaValidation.ON_READ
65
74
  and self.is_error_prop()
66
75
  ):
67
76
  return None
68
77
 
69
78
  if self.prop == self.Property.ERRORTYPE:
70
- assert self._col_ref.col.sa_errortype_col is not None
71
- return self._col_ref.col.sa_errortype_col
79
+ assert col.sa_errortype_col is not None
80
+ return col.sa_errortype_col
72
81
  if self.prop == self.Property.ERRORMSG:
73
- assert self._col_ref.col.sa_errormsg_col is not None
74
- return self._col_ref.col.sa_errormsg_col
82
+ assert col.sa_errormsg_col is not None
83
+ return col.sa_errormsg_col
75
84
  if self.prop == self.Property.FILEURL:
76
85
  # the file url is stored as the column value
77
86
  return sql_elements.get(self._col_ref)
pixeltable/exprs/column_ref.py CHANGED
@@ -52,6 +52,10 @@ class ColumnRef(Expr):
52
52
  id: int
53
53
  perform_validation: bool # if True, performs media validation
54
54
 
55
+ # needed by sql_expr() to re-resolve Column instance after a metadata reload
56
+ tbl_version: catalog.TableVersionHandle
57
+ col_id: int
58
+
55
59
  def __init__(
56
60
  self,
57
61
  col: catalog.Column,
@@ -62,16 +66,17 @@ class ColumnRef(Expr):
62
66
  assert col.tbl is not None
63
67
  self.col = col
64
68
  self.reference_tbl = reference_tbl
65
- self.is_unstored_iter_col = (
66
- col.tbl.get().is_component_view and col.tbl.get().is_iterator_column(col) and not col.is_stored
67
- )
69
+ self.tbl_version = catalog.TableVersionHandle(col.tbl.id, col.tbl.effective_version)
70
+ self.col_id = col.id
71
+
72
+ self.is_unstored_iter_col = col.tbl.is_component_view and col.tbl.is_iterator_column(col) and not col.is_stored
68
73
  self.iter_arg_ctx = None
69
74
  # number of rowid columns in the base table
70
- self.base_rowid_len = col.tbl.get().base.get().num_rowid_columns() if self.is_unstored_iter_col else 0
75
+ self.base_rowid_len = col.tbl.base.get().num_rowid_columns() if self.is_unstored_iter_col else 0
71
76
  self.base_rowid = [None] * self.base_rowid_len
72
77
  self.iterator = None
73
78
  # index of the position column in the view's primary key; don't try to reference tbl.store_tbl here
74
- self.pos_idx = col.tbl.get().num_rowid_columns() - 1 if self.is_unstored_iter_col else None
79
+ self.pos_idx = col.tbl.num_rowid_columns() - 1 if self.is_unstored_iter_col else None
75
80
 
76
81
  self.perform_validation = False
77
82
  if col.col_type.is_media_type():
@@ -175,7 +180,7 @@ class ColumnRef(Expr):
175
180
  assert len(idx_info) == 1
176
181
  col = copy.copy(next(iter(idx_info.values())).val_col)
177
182
  col.name = f'{self.col.name}_embedding_{idx if idx is not None else ""}'
178
- col.create_sa_cols()
183
+ # col.create_sa_cols()
179
184
  return ColumnRef(col)
180
185
 
181
186
  def default_column_name(self) -> Optional[str]:
@@ -226,7 +231,7 @@ class ColumnRef(Expr):
226
231
  def _descriptors(self) -> DescriptionHelper:
227
232
  tbl = catalog.Catalog.get().get_table_by_id(self.col.tbl.id)
228
233
  helper = DescriptionHelper()
229
- helper.append(f'Column\n{self.col.name!r}\n(of table {tbl._path!r})')
234
+ helper.append(f'Column\n{self.col.name!r}\n(of table {tbl._path()!r})')
230
235
  helper.append(tbl._col_descriptor([self.col.name]))
231
236
  idxs = tbl._index_descriptor([self.col.name])
232
237
  if len(idxs) > 0:
@@ -234,7 +239,23 @@ class ColumnRef(Expr):
234
239
  return helper
235
240
 
236
241
  def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
237
- return None if self.perform_validation else self.col.sa_col
242
+ # return None if self.perform_validation else self.col.sa_col
243
+ if self.perform_validation:
244
+ return None
245
+ # we need to reestablish that we have the correct Column instance, there could have been a metadata
246
+ # reload since init()
247
+ # TODO: add an explicit prepare phase (ie, Expr.prepare()) that gives every subclass instance a chance to
248
+ # perform runtime checks and update state
249
+ tv = self.tbl_version.get()
250
+ assert tv.is_validated
251
+ self.col = tv.cols_by_id[self.col_id]
252
+ assert self.col.tbl is tv
253
+ # TODO: check for column being dropped
254
+ # print(
255
+ # f'ColumnRef.sql_expr: tbl={tv.id}:{tv.effective_version} sa_tbl={id(self.col.tbl.store_tbl.sa_tbl):x} '
256
+ # f'tv={id(tv):x}'
257
+ # )
258
+ return self.col.sa_col
238
259
 
239
260
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
240
261
  if self.perform_validation:
@@ -275,7 +296,7 @@ class ColumnRef(Expr):
275
296
  if self.base_rowid != data_row.pk[: self.base_rowid_len]:
276
297
  row_builder.eval(data_row, self.iter_arg_ctx)
277
298
  iterator_args = data_row[self.iter_arg_ctx.target_slot_idxs[0]]
278
- self.iterator = self.col.tbl.get().iterator_cls(**iterator_args)
299
+ self.iterator = self.col.tbl.iterator_cls(**iterator_args)
279
300
  self.base_rowid = data_row.pk[: self.base_rowid_len]
280
301
  self.iterator.set_pos(data_row.pk[self.pos_idx])
281
302
  res = next(self.iterator)
@@ -283,12 +304,12 @@ class ColumnRef(Expr):
283
304
 
284
305
  def _as_dict(self) -> dict:
285
306
  tbl = self.col.tbl
286
- tbl_version = tbl.get().version if tbl.get().is_snapshot else None
307
+ version = tbl.version if tbl.is_snapshot else None
287
308
  # we omit self.components, even if this is a validating ColumnRef, because init() will recreate the
288
309
  # non-validating component ColumnRef
289
310
  return {
290
311
  'tbl_id': str(tbl.id),
291
- 'tbl_version': tbl_version,
312
+ 'tbl_version': version,
292
313
  'col_id': self.col.id,
293
314
  'reference_tbl': self.reference_tbl.as_dict() if self.reference_tbl is not None else None,
294
315
  'perform_validation': self.perform_validation,
pixeltable/exprs/comparison.py CHANGED
@@ -81,7 +81,7 @@ class Comparison(Expr):
81
81
  if self.is_search_arg_comparison:
82
82
  # reference the index value column if there is an index and this is not a snapshot
83
83
  # (indices don't apply to snapshots)
84
- tbl = self._op1.col.tbl.get()
84
+ tbl = self._op1.col.tbl
85
85
  idx_info = [
86
86
  info for info in self._op1.col.get_idx_info().values() if isinstance(info.idx, index.BtreeIndex)
87
87
  ]
pixeltable/exprs/row_builder.py CHANGED
@@ -172,13 +172,11 @@ class RowBuilder:
172
172
 
173
173
  def refs_unstored_iter_col(col_ref: ColumnRef) -> bool:
174
174
  tbl = col_ref.col.tbl
175
- return (
176
- tbl.get().is_component_view and tbl.get().is_iterator_column(col_ref.col) and not col_ref.col.is_stored
177
- )
175
+ return tbl.is_component_view and tbl.is_iterator_column(col_ref.col) and not col_ref.col.is_stored
178
176
 
179
177
  unstored_iter_col_refs = [col_ref for col_ref in col_refs if refs_unstored_iter_col(col_ref)]
180
178
  component_views = [col_ref.col.tbl for col_ref in unstored_iter_col_refs]
181
- unstored_iter_args = {view.id: view.get().iterator_args.copy() for view in component_views}
179
+ unstored_iter_args = {view.id: view.iterator_args.copy() for view in component_views}
182
180
  self.unstored_iter_args = {
183
181
  id: self._record_unique_expr(arg, recursive=True) for id, arg in unstored_iter_args.items()
184
182
  }
@@ -450,9 +448,9 @@ class RowBuilder:
450
448
  else:
451
449
  if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
452
450
  # we have yet to store this image
453
- filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.get().version))
451
+ filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
454
452
  data_row.flush_img(slot_idx, filepath)
455
- val = data_row.get_stored_val(slot_idx, col.sa_col.type)
453
+ val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
456
454
  table_row[col.store_name()] = val
457
455
  # we unfortunately need to set these, even if there are no errors
458
456
  table_row[col.errortype_store_name()] = None
pixeltable/exprs/rowid_ref.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  from typing import Any, Optional, cast
4
5
  from uuid import UUID
5
6
 
@@ -12,6 +13,8 @@ from .expr import Expr
12
13
  from .row_builder import RowBuilder
13
14
  from .sql_element_cache import SqlElementCache
14
15
 
16
+ _logger = logging.getLogger('pixeltable')
17
+
15
18
 
16
19
  class RowidRef(Expr):
17
20
  """A reference to a part of a table rowid
@@ -97,10 +100,15 @@ class RowidRef(Expr):
97
100
 
98
101
  def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
99
102
  tbl = self.tbl.get() if self.tbl is not None else catalog.Catalog.get().get_tbl_version(self.tbl_id, None)
103
+ assert tbl.is_validated
100
104
  rowid_cols = tbl.store_tbl.rowid_columns()
101
105
  assert self.rowid_component_idx <= len(rowid_cols), (
102
106
  f'{self.rowid_component_idx} not consistent with {rowid_cols}'
103
107
  )
108
+ # _logger.debug(
109
+ # f'RowidRef.sql_expr: tbl={tbl.id}{tbl.effective_version} sa_tbl={id(tbl.store_tbl.sa_tbl):x} '
110
+ # f'tv={id(tbl):x}'
111
+ # )
104
112
  return rowid_cols[self.rowid_component_idx]
105
113
 
106
114
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
pixeltable/exprs/similarity_expr.py CHANGED
@@ -54,6 +54,7 @@ class SimilarityExpr(Expr):
54
54
  return 'similarity'
55
55
 
56
56
  def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
57
+ # TODO: validate that the index still exists
57
58
  if not isinstance(self.components[1], Literal):
58
59
  raise excs.Error('similarity(): requires a string or a PIL.Image.Image object, not an expression')
59
60
  item = self.components[1].val
pixeltable/func/query_template_function.py CHANGED
@@ -162,7 +162,7 @@ def retrieval_udf(
162
162
  else:
163
163
  for param in parameters:
164
164
  if isinstance(param, str) and param not in table.columns:
165
- raise excs.Error(f'The specified parameter {param!r} is not a column of the table {table._path!r}')
165
+ raise excs.Error(f'The specified parameter {param!r} is not a column of the table {table._path()!r}')
166
166
  col_refs = [table[param] if isinstance(param, str) else param for param in parameters]
167
167
 
168
168
  if len(col_refs) == 0: