pixeltable 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of pixeltable might be problematic.

Files changed (56)
  1. pixeltable/catalog/column.py +25 -48
  2. pixeltable/catalog/insertable_table.py +7 -4
  3. pixeltable/catalog/table.py +163 -57
  4. pixeltable/catalog/table_version.py +416 -140
  5. pixeltable/catalog/table_version_path.py +2 -2
  6. pixeltable/client.py +0 -4
  7. pixeltable/dataframe.py +65 -21
  8. pixeltable/env.py +16 -1
  9. pixeltable/exec/cache_prefetch_node.py +1 -1
  10. pixeltable/exec/in_memory_data_node.py +11 -7
  11. pixeltable/exprs/comparison.py +3 -3
  12. pixeltable/exprs/data_row.py +5 -1
  13. pixeltable/exprs/literal.py +16 -4
  14. pixeltable/exprs/row_builder.py +8 -40
  15. pixeltable/ext/__init__.py +5 -0
  16. pixeltable/ext/functions/yolox.py +92 -0
  17. pixeltable/func/aggregate_function.py +15 -15
  18. pixeltable/func/expr_template_function.py +9 -1
  19. pixeltable/func/globals.py +24 -14
  20. pixeltable/func/signature.py +18 -12
  21. pixeltable/func/udf.py +7 -2
  22. pixeltable/functions/__init__.py +8 -8
  23. pixeltable/functions/eval.py +7 -8
  24. pixeltable/functions/huggingface.py +47 -19
  25. pixeltable/functions/openai.py +2 -2
  26. pixeltable/functions/util.py +11 -0
  27. pixeltable/index/__init__.py +2 -0
  28. pixeltable/index/base.py +49 -0
  29. pixeltable/index/embedding_index.py +95 -0
  30. pixeltable/metadata/schema.py +45 -22
  31. pixeltable/plan.py +15 -34
  32. pixeltable/store.py +38 -41
  33. pixeltable/tests/conftest.py +5 -11
  34. pixeltable/tests/ext/test_yolox.py +21 -0
  35. pixeltable/tests/functions/test_fireworks.py +1 -0
  36. pixeltable/tests/functions/test_huggingface.py +2 -2
  37. pixeltable/tests/functions/test_openai.py +15 -5
  38. pixeltable/tests/functions/test_together.py +1 -0
  39. pixeltable/tests/test_component_view.py +14 -5
  40. pixeltable/tests/test_dataframe.py +19 -18
  41. pixeltable/tests/test_exprs.py +99 -102
  42. pixeltable/tests/test_function.py +51 -43
  43. pixeltable/tests/test_index.py +138 -0
  44. pixeltable/tests/test_migration.py +2 -1
  45. pixeltable/tests/test_snapshot.py +24 -1
  46. pixeltable/tests/test_table.py +101 -25
  47. pixeltable/tests/test_types.py +30 -0
  48. pixeltable/tests/test_video.py +16 -16
  49. pixeltable/tests/test_view.py +5 -0
  50. pixeltable/tests/utils.py +43 -9
  51. pixeltable/tool/create_test_db_dump.py +16 -0
  52. pixeltable/type_system.py +37 -45
  53. {pixeltable-0.2.4.dist-info → pixeltable-0.2.5.dist-info}/METADATA +5 -4
  54. {pixeltable-0.2.4.dist-info → pixeltable-0.2.5.dist-info}/RECORD +56 -49
  55. {pixeltable-0.2.4.dist-info → pixeltable-0.2.5.dist-info}/LICENSE +0 -0
  56. {pixeltable-0.2.4.dist-info → pixeltable-0.2.5.dist-info}/WHEEL +0 -0
pixeltable/catalog/table_version.py

@@ -13,7 +13,9 @@ import sqlalchemy.orm as orm
 
  import pixeltable
  import pixeltable.func as func
- from pixeltable import exceptions as excs
+ import pixeltable.type_system as ts
+ import pixeltable.exceptions as excs
+ import pixeltable.index as index
  from pixeltable.env import Env
  from pixeltable.iterators import ComponentIterator
  from pixeltable.metadata import schema
@@ -26,7 +28,8 @@ _logger = logging.getLogger('pixeltable')
 
  class TableVersion:
  """
- TableVersion represents a particular version of a table/view along with its store table:
+ TableVersion represents a particular version of a table/view along with its physical representation:
+ - the physical representation is a store table with indices
  - the version can be mutable or a snapshot
  - tables and their recursive views form a tree, and a mutable TableVersion also records its own
  mutable views in order to propagate updates
@@ -37,6 +40,14 @@ class TableVersion:
  have TableVersions reference those
  - mutable TableVersions record their TableVersionPath, which is needed for expr evaluation in updates
  """
+ @dataclasses.dataclass
+ class IndexInfo:
+ id: int
+ idx: index.IndexBase
+ col: Column
+ val_col: Column
+ undo_col: Column
+
 
  def __init__(
  self, id: UUID, tbl_md: schema.TableMd, version: int, schema_version_md: schema.TableSchemaVersionMd,
@@ -67,12 +78,13 @@ class TableVersion:
  self.base = base_path.tbl_version if base_path is not None else base
  if self.is_snapshot:
  self.next_col_id = -1
+ self.next_idx_id = -1 # TODO: can snapshots have separate indices?
  self.next_rowid = -1
  else:
  assert tbl_md.current_version == self.version
  self.next_col_id = tbl_md.next_col_id
+ self.next_idx_id = tbl_md.next_idx_id
  self.next_rowid = tbl_md.next_row_id
- self.column_history = tbl_md.column_history
 
  # view-specific initialization
  from pixeltable import exprs
@@ -101,8 +113,13 @@ class TableVersion:
  cat = catalog.Catalog.get()
  cat.tbl_versions[(self.id, self.effective_version)] = self
 
- # do this after we determined whether we're a component view, and before we create the store table
- self._init_schema(schema_version_md)
+ # init schema after we determined whether we're a component view, and before we create the store table
+ self.cols: List[Column] = [] # contains complete history of columns, incl dropped ones
+ self.cols_by_name: dict[str, Column] = {} # contains only user-facing (named) columns visible in this version
+ self.cols_by_id: dict[int, Column] = {} # contains only columns visible in this version
+ self.idx_md = tbl_md.index_md # needed for _create_tbl_md()
+ self.idxs_by_name: dict[str, TableVersion.IndexInfo] = {} # contains only actively maintained indices
+ self._init_schema(tbl_md, schema_version_md)
 
  def __hash__(self) -> int:
  return hash(self.id)
@@ -111,19 +128,21 @@ class TableVersion:
  """Create a snapshot copy of this TableVersion"""
  assert not self.is_snapshot
  return TableVersion(
- self.id, self._create_md(), self.version,
+ self.id, self._create_tbl_md(), self.version,
  self._create_schema_version_md(preceding_schema_version=0), # preceding_schema_version: dummy value
  is_snapshot=True, base=self.base)
 
  @classmethod
  def create(
- cls, session: orm.Session, dir_id: UUID, name: str, cols: List[Column], num_retained_versions: int, comment: str,
- base_path: Optional['pixeltable.catalog.TableVersionPath'] = None, view_md: Optional[schema.ViewMd] = None
+ cls, session: orm.Session, dir_id: UUID, name: str, cols: List[Column], num_retained_versions: int,
+ comment: str, base_path: Optional['pixeltable.catalog.TableVersionPath'] = None,
+ view_md: Optional[schema.ViewMd] = None
  ) -> Tuple[UUID, Optional[TableVersion]]:
  # assign ids
  cols_by_name: Dict[str, Column] = {}
  for pos, col in enumerate(cols):
  col.id = pos
+ col.schema_version_add = 0
  cols_by_name[col.name] = col
  if col.value_expr is None and col.compute_func is not None:
  cls._create_value_expr(col, base_path)
@@ -132,14 +151,11 @@ class TableVersion:
 
  ts = time.time()
  # create schema.Table
- column_history = {
- col.id: schema.ColumnHistory(col_id=col.id, schema_version_add=0, schema_version_drop=None)
- for col in cols
- }
+ # Column.dependent_cols for existing cols is wrong at this point, but init() will set it correctly
+ column_md = cls._create_column_md(cols)
  table_md = schema.TableMd(
  name=name, current_version=0, current_schema_version=0,
- next_col_id=len(cols), next_row_id=0, column_history=column_history,
- view_md=view_md)
+ next_col_id=len(cols), next_idx_id=0, next_row_id=0, column_md=column_md, index_md={}, view_md=view_md)
  tbl_record = schema.Table(dir_id=dir_id, md=dataclasses.asdict(table_md))
  session.add(tbl_record)
  session.flush() # sets tbl_record.id
@@ -152,16 +168,10 @@ class TableVersion:
  session.add(tbl_version_record)
 
  # create schema.TableSchemaVersion
- column_md: Dict[int, schema.SchemaColumn] = {}
- for pos, col in enumerate(cols):
- # Column.dependent_cols for existing cols is wrong at this point, but init() will set it correctly
- value_expr_dict = col.value_expr.as_dict() if col.value_expr is not None else None
- column_md[col.id] = schema.SchemaColumn(
- pos=pos, name=col.name, col_type=col.col_type.as_dict(),
- is_pk=col.primary_key, value_expr=value_expr_dict, stored=col.stored, is_indexed=col.is_indexed)
+ schema_col_md = {col.id: schema.SchemaColumn(pos=pos, name=col.name) for pos, col in enumerate(cols)}
 
  schema_version_md = schema.TableSchemaVersionMd(
- schema_version=0, preceding_schema_version=None, columns=column_md,
+ schema_version=0, preceding_schema_version=None, columns=schema_col_md,
  num_retained_versions=num_retained_versions, comment=comment)
  schema_version_record = schema.TableSchemaVersion(
  tbl_id=tbl_record.id, schema_version=0, md=dataclasses.asdict(schema_version_md))
@@ -202,21 +212,70 @@ class TableVersion:
  del cat.tbl_versions[(self.id, self.effective_version)]
  # TODO: remove from tbl_dependents
 
- def _init_schema(self, schema_version_md: schema.TableSchemaVersionMd) -> None:
- """Initialize self.cols as well as self.store_tbl"""
- self.cols = [Column.from_md(col_id, col_md, self) for col_id, col_md in schema_version_md.columns.items()]
- self.cols_by_name = {col.name: col for col in self.cols}
- self.cols_by_id = {col.id: col for col in self.cols}
-
- # make sure to traverse columns ordered by position = order in which cols were created;
- # this guarantees that references always point backwards
- from pixeltable import exprs
- for col, col_md in zip(self.cols, schema_version_md.columns.values()):
+ def _init_schema(self, tbl_md: schema.TableMd, schema_version_md: schema.TableSchemaVersionMd) -> None:
+ # create columns first, so the indices can reference them
+ self._init_cols(tbl_md, schema_version_md)
+ self._init_idxs(tbl_md)
+ # create the sa schema only after creating the columns and indices
+ self._init_sa_schema()
+
+ def _init_cols(self, tbl_md: schema.TableMd, schema_version_md: schema.TableSchemaVersionMd) -> None:
+ """Initialize self.cols with the columns visible in our effective version"""
+ import pixeltable.exprs as exprs
+ self.cols = []
+ self.cols_by_name = {}
+ self.cols_by_id = {}
+ for col_md in tbl_md.column_md.values():
+ col_name = schema_version_md.columns[col_md.id].name if col_md.id in schema_version_md.columns else None
+ col = Column(
+ col_id=col_md.id, name=col_name, col_type=ts.ColumnType.from_dict(col_md.col_type),
+ is_pk=col_md.is_pk, stored=col_md.stored,
+ schema_version_add=col_md.schema_version_add, schema_version_drop=col_md.schema_version_drop)
  col.tbl = self
+ self.cols.append(col)
+
+ # populate the lookup structures before Expr.from_dict()
+ if col_md.schema_version_add > self.schema_version:
+ # column was added after this version
+ continue
+ if col_md.schema_version_drop is not None and col_md.schema_version_drop <= self.schema_version:
+ # column was dropped
+ continue
+ if col.name is not None:
+ self.cols_by_name[col.name] = col
+ self.cols_by_id[col.id] = col
+
+ # make sure to traverse columns ordered by position = order in which cols were created;
+ # this guarantees that references always point backwards
  if col_md.value_expr is not None:
  col.value_expr = exprs.Expr.from_dict(col_md.value_expr)
  self._record_value_expr(col)
 
+ def _init_idxs(self, tbl_md: schema.TableMd) -> None:
+ self.idx_md = tbl_md.index_md
+ self.idxs_by_name = {}
+ import pixeltable.index as index_module
+ for md in tbl_md.index_md.values():
+ if md.schema_version_add > self.schema_version \
+ or md.schema_version_drop is not None and md.schema_version_drop <= self.schema_version:
+ # column not visible in this schema version
+ continue
+
+ # instantiate index object
+ cls_name = md.class_fqn.rsplit('.', 1)[-1]
+ cls = getattr(index_module, cls_name)
+ idx_col = self.cols_by_id[md.indexed_col_id]
+ idx = cls.from_dict(idx_col, md.init_args)
+
+ # fix up the sa column type of the index value and undo columns
+ val_col = self.cols_by_id[md.index_val_col_id]
+ val_col.sa_col_type = idx.index_sa_type()
+ undo_col = self.cols_by_id[md.index_val_undo_col_id]
+ undo_col.sa_col_type = idx.index_sa_type()
+ idx_info = self.IndexInfo(id=md.id, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
+ self.idxs_by_name[md.name] = idx_info
+
+ def _init_sa_schema(self) -> None:
  # create the sqlalchemy schema; do this after instantiating columns, in order to determine whether they
  # need to record errors
  from pixeltable.store import StoreBase, StoreTable, StoreView, StoreComponentView
@@ -227,8 +286,7 @@ class TableVersion:
  else:
  self.store_tbl: StoreBase = StoreTable(self)
 
- def _update_md(
- self, ts: float, preceding_schema_version: Optional[int], conn: sql.engine.Connection) -> None:
+ def _update_md(self, ts: float, preceding_schema_version: Optional[int], conn: sql.engine.Connection) -> None:
  """Update all recorded metadata in response to a data or schema change.
  Args:
  ts: timestamp of the change
@@ -236,8 +294,9 @@
  """
  conn.execute(
  sql.update(schema.Table.__table__)
- .values({schema.Table.md: dataclasses.asdict(self._create_md())})
+ .values({schema.Table.md: dataclasses.asdict(self._create_tbl_md())})
  .where(schema.Table.id == self.id))
+
  version_md = self._create_version_md(ts)
  conn.execute(
  sql.insert(schema.TableVersion.__table__)
@@ -250,6 +309,80 @@ class TableVersion:
  tbl_id=self.id, schema_version=self.schema_version,
  md=dataclasses.asdict(schema_version_md)))
 
+ def _store_idx_name(self, idx_id: int) -> str:
+ """Return name of index in the store, which needs to be globally unique"""
+ return f'idx_{self.id.hex}_{idx_id}'
+
+ def add_index(self, col: Column, idx_name: Optional[str], idx: index.IndexBase) -> UpdateStatus:
+ assert not self.is_snapshot
+ idx_id = self.next_idx_id
+ self.next_idx_id += 1
+ if idx_name is None:
+ idx_name = f'idx{idx_id}'
+ else:
+ assert is_valid_identifier(idx_name)
+ assert idx_name not in [i.name for i in self.idx_md.values()]
+
+ # we're creating a new schema version
+ self.version += 1
+ preceding_schema_version = self.schema_version
+ self.schema_version = self.version
+ with Env.get().engine.begin() as conn:
+ # add the index value and undo columns (which need to be nullable);
+ # we don't create a new schema version, because indices aren't part of the logical schema
+ val_col = Column(
+ col_id=self.next_col_id, name=None, computed_with=idx.index_value_expr(),
+ sa_col_type=idx.index_sa_type(), stored=True,
+ schema_version_add=self.schema_version, schema_version_drop=None)
+ val_col.tbl = self
+ val_col.col_type.nullable = True
+ self.next_col_id += 1
+
+ undo_col = Column(
+ col_id=self.next_col_id, name=None, col_type=val_col.col_type,
+ sa_col_type=val_col.sa_col_type, stored=True,
+ schema_version_add=self.schema_version, schema_version_drop=None)
+ undo_col.tbl = self
+ undo_col.col_type.nullable = True
+ self.next_col_id += 1
+
+ # create and register the index metadata
+ idx_cls = type(idx)
+ idx_md = schema.IndexMd(
+ id=idx_id, name=idx_name,
+ indexed_col_id=col.id, index_val_col_id=val_col.id, index_val_undo_col_id=undo_col.id,
+ schema_version_add=self.schema_version, schema_version_drop=None,
+ class_fqn=idx_cls.__module__ + '.' + idx_cls.__name__, init_args=idx.as_dict())
+ idx_info = self.IndexInfo(id=idx_id, idx=idx, col=col, val_col=val_col, undo_col=undo_col)
+ self.idx_md[idx_id] = idx_md
+ self.idxs_by_name[idx_name] = idx_info
+
+ # add the columns and update the metadata
+ status = self._add_columns([val_col, undo_col], conn, preceding_schema_version=preceding_schema_version)
+ # now create the index structure
+ idx.create_index(self._store_idx_name(idx_id), val_col, conn)
+
+ _logger.info(f'Added index {idx_name} on column {col.name} to table {self.name}')
+ return status
+
+ def drop_index(self, idx_id: int) -> None:
+ assert not self.is_snapshot
+ assert idx_id in self.idx_md
+
+ # we're creating a new schema version
+ self.version += 1
+ preceding_schema_version = self.schema_version
+ self.schema_version = self.version
+ idx_md = self.idx_md[idx_id]
+ idx_md.schema_version_drop = self.schema_version
+ assert idx_md.name in self.idxs_by_name
+ idx_info = self.idxs_by_name[idx_md.name]
+ del self.idxs_by_name[idx_md.name]
+
+ with Env.get().engine.begin() as conn:
+ self._drop_columns([idx_info.val_col, idx_info.undo_col], conn, preceding_schema_version)
+ _logger.info(f'Dropped index {idx_md.name} on table {self.name}')
+
  def add_column(self, col: Column, print_stats: bool = False) -> UpdateStatus:
  """Adds a column to the table.
  """
@@ -268,60 +401,86 @@ class TableVersion:
  col.check_value_expr()
  self._record_value_expr(col)
 
- row_count = self.store_tbl.count()
- if row_count > 0 and not col.col_type.nullable and not col.is_computed:
- raise excs.Error(f'Cannot add non-nullable column "{col.name}" to table {self.name} with existing rows')
-
  # we're creating a new schema version
- ts = time.time()
  self.version += 1
  preceding_schema_version = self.schema_version
  self.schema_version = self.version
+ with Env.get().engine.begin() as conn:
+ status = self._add_columns([col], conn, preceding_schema_version, print_stats=print_stats)
+ _logger.info(f'Added column {col.name} to table {self.name}, new version: {self.version}')
 
- self.cols.append(col)
- self.cols_by_name[col.name] = col
- self.cols_by_id[col.id] = col
- self.column_history[col.id] = schema.ColumnHistory(col.id, self.schema_version, None)
+ msg = (
+ f'Added {status.num_rows} column value{"" if status.num_rows == 1 else "s"} '
+ f'with {status.num_excs} error{"" if status.num_excs == 1 else "s"}.'
+ )
+ print(msg)
+ _logger.info(f'Column {col.name}: {msg}')
+ return status
+
+ def _add_columns(
+ self, cols: List[Column], conn: sql.engine.Connection, preceding_schema_version: Optional[int] = None,
+ print_stats: bool = False
+ ) -> UpdateStatus:
+ """Add and populate columns within the current transaction"""
+ ts = time.time()
+
+ row_count = self.store_tbl.count(conn=conn)
+ for col in cols:
+ if not col.col_type.nullable and not col.is_computed:
+ if row_count > 0:
+ raise excs.Error(
+ f'Cannot add non-nullable column "{col.name}" to table {self.name} with existing rows')
+
+ num_excs = 0
+ cols_with_excs: List[Column] = []
+ for col in cols:
+ col.schema_version_add = self.schema_version
+ # add the column to the lookup structures now, rather than after the store changes executed successfully,
+ # because it might be referenced by the next column's value_expr
+ self.cols.append(col)
+ if col.name is not None:
+ self.cols_by_name[col.name] = col
+ self.cols_by_id[col.id] = col
 
- with Env.get().engine.begin() as conn:
- self._update_md(ts, preceding_schema_version, conn)
- _logger.info(f'Added column {col.name} to table {self.name}, new version: {self.version}')
  if col.is_stored:
  self.store_tbl.add_column(col, conn)
 
- print(f'Added column `{col.name}` to table `{self.name}`.')
- if row_count == 0:
- return UpdateStatus()
- if (not col.is_computed or not col.is_stored) and not col.is_indexed:
- return UpdateStatus(num_rows=row_count)
- # compute values for the existing rows and compute embeddings, if this column is indexed;
- # for some reason, it's not possible to run the following updates in the same transaction as the one
- # that we just used to create the metadata (sqlalchemy hangs when exec() tries to run the query)
- from pixeltable.plan import Planner
- plan, value_expr_slot_idx, embedding_slot_idx = Planner.create_add_column_plan(self.path, col)
- plan.ctx.num_rows = row_count
- # TODO: create pgvector index, if col is indexed
+ if not col.is_computed or not col.is_stored or row_count == 0:
+ continue
+
+ # populate the column
+ from pixeltable.plan import Planner
+ plan, value_expr_slot_idx = Planner.create_add_column_plan(self.path, col)
+ plan.ctx.num_rows = row_count
 
- try:
- # TODO: do this in the same transaction as the metadata update
- with Env.get().engine.begin() as conn:
+ try:
  plan.ctx.conn = conn
  plan.open()
- num_excs = self.store_tbl.load_column(col, plan, value_expr_slot_idx, embedding_slot_idx, conn)
- except sql.exc.DBAPIError as e:
- self.drop_column(col.name)
- raise excs.Error(f'Error during SQL execution:\n{e}')
- finally:
- plan.close()
-
- msg = f'Added {row_count} column value{"" if row_count == 1 else "s"} with {num_excs} error{"" if num_excs == 1 else "s"}.'
- print(msg)
- _logger.info(f'Column {col.name}: {msg}')
+ num_excs = self.store_tbl.load_column(col, plan, value_expr_slot_idx, conn)
+ if num_excs > 0:
+ cols_with_excs.append(col)
+ except sql.exc.DBAPIError as e:
+ self.cols.pop()
+ for col in cols:
+ # remove columns that we already added
+ if col.id not in self.cols_by_id:
+ continue
+ if col.name is not None:
+ del self.cols_by_name[col.name]
+ del self.cols_by_id[col.id]
+ # we need to re-initialize the sqlalchemy schema
+ self.store_tbl.create_sa_tbl()
+ raise excs.Error(f'Error during SQL execution:\n{e}')
+ finally:
+ plan.close()
+
+ self._update_md(ts, preceding_schema_version, conn)
  if print_stats:
  plan.ctx.profile.print(num_rows=row_count)
+ # TODO(mkornacker): what to do about system columns with exceptions?
  return UpdateStatus(
  num_rows=row_count, num_computed_values=row_count, num_excs=num_excs,
- cols_with_excs=[f'{self.name}.{col.name}'] if num_excs > 0 else [])
+ cols_with_excs=[f'{col.tbl.name}.{col.name}'for col in cols_with_excs if col.name is not None])
 
  def drop_column(self, name: str) -> None:
  """Drop a column from the table.
@@ -330,35 +489,58 @@ class TableVersion:
  if name not in self.cols_by_name:
  raise excs.Error(f'Unknown column: {name}')
  col = self.cols_by_name[name]
- if len(col.dependent_cols) > 0:
+ dependent_user_cols = [c for c in col.dependent_cols if c.name is not None]
+ if len(dependent_user_cols) > 0:
  raise excs.Error(
  f'Cannot drop column {name} because the following columns depend on it:\n',
- f'{", ".join([c.name for c in col.dependent_cols])}')
-
- if col.value_expr is not None:
- # update Column.dependent_cols
- for c in self.cols:
- if c == col:
- break
- c.dependent_cols.discard(col)
+ f'{", ".join([c.name for c in dependent_user_cols])}')
 
  # we're creating a new schema version
- ts = time.time()
  self.version += 1
  preceding_schema_version = self.schema_version
  self.schema_version = self.version
 
- self.cols.remove(col)
- del self.cols_by_name[name]
- del self.cols_by_id[col.id]
- self.column_history[col.id].schema_version_drop = self.schema_version
-
  with Env.get().engine.begin() as conn:
- self._update_md(ts, preceding_schema_version, conn)
- if col.is_stored:
- self.store_tbl.drop_column()
+ # drop this column and all dependent index columns and indices
+ dropped_cols = [col]
+ dropped_idx_names: List[str] = []
+ for idx_info in self.idxs_by_name.values():
+ if idx_info.col != col:
+ continue
+ dropped_cols.extend([idx_info.val_col, idx_info.undo_col])
+ idx_md = self.idx_md[idx_info.id]
+ idx_md.schema_version_drop = self.schema_version
+ assert idx_md.name in self.idxs_by_name
+ dropped_idx_names.append(idx_md.name)
+ # update idxs_by_name
+ for idx_name in dropped_idx_names:
+ del self.idxs_by_name[idx_name]
+ self._drop_columns(dropped_cols, conn, preceding_schema_version)
  _logger.info(f'Dropped column {name} from table {self.name}, new version: {self.version}')
 
+ def _drop_columns(self, cols: list[Column], conn: sql.engine.Connection, preceding_schema_version: int) -> None:
+ """Mark columns as dropped"""
+ assert not self.is_snapshot
+
+ ts = time.time()
+ for col in cols:
+ if col.value_expr is not None:
+ # update Column.dependent_cols
+ for c in self.cols:
+ if c == col:
+ break
+ c.dependent_cols.discard(col)
+
+ col.schema_version_drop = self.schema_version
+ if col.name is not None:
+ assert col.name in self.cols_by_name
+ del self.cols_by_name[col.name]
+ assert col.id in self.cols_by_id
+ del self.cols_by_id[col.id]
+
+ self._update_md(ts, preceding_schema_version, conn)
+ self.store_tbl.create_sa_tbl()
+
  def rename_column(self, old_name: str, new_name: str) -> None:
  """Rename a column.
  """
@@ -387,14 +569,14 @@ class TableVersion:
  def set_comment(self, new_comment: Optional[str]):
  _logger.info(f'[{self.name}] Updating comment: {new_comment}')
  self.comment = new_comment
- self._commit_new_schema_version()
+ self._create_schema_version()
 
  def set_num_retained_versions(self, new_num_retained_versions: int):
  _logger.info(f'[{self.name}] Updating num_retained_versions: {new_num_retained_versions} (was {self.num_retained_versions})')
  self.num_retained_versions = new_num_retained_versions
- self._commit_new_schema_version()
+ self._create_schema_version()
 
- def _commit_new_schema_version(self):
+ def _create_schema_version(self):
  # we're creating a new schema version
  ts = time.time()
  self.version += 1
@@ -448,7 +630,67 @@ class TableVersion:
  return result
 
  def update(
- self, update_targets: Optional[List[Tuple[Column, 'pixeltable.exprs.Expr']]] = None,
+ self, update_targets: dict[Column, 'pixeltable.exprs.Expr'],
+ where_clause: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
+ ) -> UpdateStatus:
+ with Env.get().engine.begin() as conn:
+ return self._update(conn, update_targets, where_clause, cascade)
+
+ def batch_update(
+ self, batch: list[dict[Column, 'pixeltable.exprs.Expr']], rowids: list[Tuple[int, ...]],
+ cascade: bool = True
+ ) -> UpdateStatus:
+ """Update rows in batch.
+ Args:
+ batch: one dict per row, each mapping Columns to LiteralExprs representing the new values
+ rowids: if not empty, one tuple per row, each containing the rowid values for the corresponding row in batch
+ """
+ # if we do lookups of rowids, we must have one for each row in the batch
+ assert len(rowids) == 0 or len(rowids) == len(batch)
+ import pixeltable.exprs as exprs
+ result_status = UpdateStatus()
+ cols_with_excs: set[str] = set()
+ updated_cols: set[str] = set()
+ pk_cols = self.primary_key_columns()
+ use_rowids = len(rowids) > 0
+
+ with Env.get().engine.begin() as conn:
+ for i, row in enumerate(batch):
+ where_clause: Optional[exprs.Expr] = None
+ if use_rowids:
+ # construct Where clause to match rowid
+ num_rowid_cols = len(self.store_tbl.rowid_columns())
+ for col_idx in range(num_rowid_cols):
+ assert len(rowids[i]) == num_rowid_cols
+ clause = exprs.RowidRef(self, col_idx) == rowids[i][col_idx]
+ if where_clause is None:
+ where_clause = clause
+ else:
+ where_clause = where_clause & clause
+ else:
+ # construct Where clause for primary key columns
+ for col in pk_cols:
+ assert col in row
+ clause = exprs.ColumnRef(col) == row[col]
+ if where_clause is None:
+ where_clause = clause
+ else:
+ where_clause = where_clause & clause
+
+ update_targets = {col: row[col] for col in row if col not in pk_cols}
+ status = self._update(conn, update_targets, where_clause, cascade)
+ result_status.num_rows += status.num_rows
+ result_status.num_excs += status.num_excs
+ result_status.num_computed_values += status.num_computed_values
+ cols_with_excs.update(status.cols_with_excs)
+ updated_cols.update(status.updated_cols)
+
+ result_status.cols_with_excs = list(cols_with_excs)
+ result_status.updated_cols = list(updated_cols)
+ return result_status
+
+ def _update(
+ self, conn: sql.engine.Connection, update_targets: dict[Column, 'pixeltable.exprs.Expr'],
  where_clause: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
  ) -> UpdateStatus:
  """Update rows in this table.
@@ -458,21 +700,18 @@ class TableVersion:
  cascade: if True, also update all computed columns that transitively depend on the updated columns,
  including within views.
  """
- if update_targets is None:
- update_targets = []
  assert not self.is_snapshot
  from pixeltable.plan import Planner
  plan, updated_cols, recomputed_cols = \
  Planner.create_update_plan(self.path, update_targets, [], where_clause, cascade)
- with Env.get().engine.begin() as conn:
- ts = time.time()
- result = self._update(
- plan, where_clause.sql_expr() if where_clause is not None else None, recomputed_cols,
- base_versions=[], conn=conn, ts=ts, cascade=cascade)
- result.updated_cols = updated_cols
- return result
+ ts = time.time()
+ result = self._propagate_update(
+ plan, where_clause.sql_expr() if where_clause is not None else None, recomputed_cols,
+ base_versions=[], conn=conn, ts=ts, cascade=cascade)
+ result.updated_cols = updated_cols
+ return result
 
- def _update(
+ def _propagate_update(
  self, plan: Optional[exec.ExecNode], where_clause: Optional[sql.ClauseElement],
  recomputed_view_cols: List[Column], base_versions: List[Optional[int]], conn: sql.engine.Connection,
  ts: float, cascade: bool
@@ -497,7 +736,7 @@ class TableVersion:
  if len(recomputed_cols) > 0:
  from pixeltable.plan import Planner
  plan = Planner.create_view_update_plan(view.path, recompute_targets=recomputed_cols)
- status = view._update(
+ status = view._propagate_update(
  plan, None, recomputed_view_cols, base_versions=base_versions, conn=conn, ts=ts, cascade=True)
  result.num_rows += status.num_rows
  result.num_excs += status.num_excs
@@ -554,6 +793,15 @@ class TableVersion:
  self._revert(session)
  session.commit()
 
+ def _delete_column(self, col: Column, conn: sql.engine.Connection) -> None:
+ """Physically remove the column from the schema and the store table"""
+ if col.is_stored:
+ self.store_tbl.drop_column(col, conn)
+ self.cols.remove(col)
+ if col.name is not None:
+ del self.cols_by_name[col.name]
+ del self.cols_by_id[col.id]
+
  def _revert(self, session: orm.Session) -> None:
  """Reverts this table version and propagates to views"""
  conn = session.connection()
@@ -577,28 +825,47 @@ class TableVersion:
  # delete newly-added data
  MediaStore.delete(self.id, version=self.version)
  conn.execute(sql.delete(self.store_tbl.sa_tbl).where(self.store_tbl.sa_tbl.c.v_min == self.version))
- # revert new deletions
- conn.execute(
- sql.update(self.store_tbl.sa_tbl) \
- .values({self.store_tbl.sa_tbl.c.v_max: schema.Table.MAX_VERSION})
- .where(self.store_tbl.sa_tbl.c.v_max == self.version))
 
+ # revert new deletions
+ set_clause = {self.store_tbl.sa_tbl.c.v_max: schema.Table.MAX_VERSION}
+ for index_info in self.idxs_by_name.values():
+ # copy the index value back from the undo column and reset the undo column to NULL
+ set_clause[index_info.val_col.sa_col] = index_info.undo_col.sa_col
+ set_clause[index_info.undo_col.sa_col] = None
+ stmt = sql.update(self.store_tbl.sa_tbl) \
+ .values(set_clause) \
+ .where(self.store_tbl.sa_tbl.c.v_max == self.version)
+ conn.execute(stmt)
+
+ # revert schema changes
  if self.version == self.schema_version:
- # the current version involved a schema change:
- # if the schema change was to add a column, we now need to drop it
- added_col_ids = [
- col_history.col_id for col_history in self.column_history.values()
- if col_history.schema_version_add == self.schema_version
- ]
- assert len(added_col_ids) <= 1
- added_col: Optional[Column] = None
- if len(added_col_ids) == 1:
- added_col_id = added_col_ids[0]
- # drop this newly-added column and its ColumnHistory record
- c = self.cols_by_id[added_col_id]
- if c.is_stored:
- added_col = c
- del self.column_history[c.id]
+ # delete newly-added columns
+ added_cols = [col for col in self.cols if col.schema_version_add == self.schema_version]
+ if len(added_cols) > 0:
+ next_col_id = min(col.id for col in added_cols)
+ for col in added_cols:
+ self._delete_column(col, conn)
+ self.next_col_id = next_col_id
+
+ # remove newly-added indices from the lookup structures
+ # (the value and undo columns got removed in the preceding step)
+ added_idx_md = [md for md in self.idx_md.values() if md.schema_version_add == self.schema_version]
+ if len(added_idx_md) > 0:
+ next_idx_id = min(md.id for md in added_idx_md)
+ for md in added_idx_md:
+ del self.idx_md[md.id]
+ del self.idxs_by_name[md.name]
+ self.next_idx_id = next_idx_id
+
+ # make newly-dropped columns visible again
+ dropped_cols = [col for col in self.cols if col.schema_version_drop == self.schema_version]
+ for col in dropped_cols:
+ col.schema_version_drop = None
+
+ # make newly-dropped indices visible again
+ dropped_idx_md = [md for md in self.idx_md.values() if md.schema_version_drop == self.schema_version]
+ for md in dropped_idx_md:
+ md.schema_version_drop = None
 
  # we need to determine the preceding schema version and reload the schema
  schema_version_md_dict = session.query(schema.TableSchemaVersion.md) \
@@ -612,11 +879,8 @@ class TableVersion:
  .scalar()
  preceding_schema_version_md = schema.md_from_dict(
  schema.TableSchemaVersionMd, preceding_schema_version_md_dict)
- self._init_schema(preceding_schema_version_md)
-
- # physically drop the column, but only after we have re-created the schema
- if added_col is not None:
- self.store_tbl.drop_column(added_col, conn)
+ tbl_md = self._create_tbl_md()
+ self._init_schema(tbl_md, preceding_schema_version_md)
 
  conn.execute(
  sql.delete(schema.TableSchemaVersion.__table__)
@@ -634,7 +898,7 @@ class TableVersion:
  self.version -= 1
  conn.execute(
  sql.update(schema.Table.__table__)
- .values({schema.Table.md: dataclasses.asdict(self._create_md())})
+ .values({schema.Table.md: dataclasses.asdict(self._create_tbl_md())})
  .where(schema.Table.id == self.id))
 
  # propagate to views
@@ -667,6 +931,10 @@ class TableVersion:
  """Return all non-system columns"""
  return [c for c in self.cols if not self.is_system_column(c)]
 
+ def primary_key_columns(self) -> List[Column]:
+ """Return all non-system columns"""
+ return [c for c in self.cols if c.is_pk]
+
  def get_required_col_names(self) -> List[str]:
  """Return the names of all columns for which values must be specified in insert()"""
  assert not self.is_view()
@@ -727,22 +995,30 @@ class TableVersion:
  return 1 + self.base.num_rowid_columns()
  return 1
 
- def _create_md(self) -> schema.TableMd:
+ @classmethod
+ def _create_column_md(cls, cols: List[Column]) -> dict[int, schema.ColumnMd]:
+ column_md: Dict[int, schema.ColumnMd] = {}
+ for col in cols:
+ value_expr_dict = col.value_expr.as_dict() if col.value_expr is not None else None
+ column_md[col.id] = schema.ColumnMd(
+ id=col.id, col_type=col.col_type.as_dict(), is_pk=col.is_pk,
+ schema_version_add=col.schema_version_add, schema_version_drop=col.schema_version_drop,
+ value_expr=value_expr_dict, stored=col.stored)
+ return column_md
+
+ def _create_tbl_md(self) -> schema.TableMd:
  return schema.TableMd(
  name=self.name, current_version=self.version, current_schema_version=self.schema_version,
- next_col_id=self.next_col_id, next_row_id=self.next_rowid, column_history=self.column_history,
- view_md=self.view_md)
+ next_col_id=self.next_col_id, next_idx_id=self.next_idx_id, next_row_id=self.next_rowid,
+ column_md=self._create_column_md(self.cols), index_md=self.idx_md, view_md=self.view_md)
 
  def _create_version_md(self, ts: float) -> schema.TableVersionMd:
  return schema.TableVersionMd(created_at=ts, version=self.version, schema_version=self.schema_version)
 
  def _create_schema_version_md(self, preceding_schema_version: int) -> schema.TableSchemaVersionMd:
  column_md: Dict[int, schema.SchemaColumn] = {}
- for pos, col in enumerate(self.cols):
- value_expr_dict = col.value_expr.as_dict() if col.value_expr is not None else None
- column_md[col.id] = schema.SchemaColumn(
- pos=pos, name=col.name, col_type=col.col_type.as_dict(),
- is_pk=col.primary_key, value_expr=value_expr_dict, stored=col.stored, is_indexed=col.is_indexed)
+ for pos, col in enumerate(self.cols_by_name.values()):
+ column_md[col.id] = schema.SchemaColumn(pos=pos, name=col.name)
  # preceding_schema_version to be set by the caller
  return schema.TableSchemaVersionMd(
  schema_version=self.schema_version, preceding_schema_version=preceding_schema_version,