pixeltable 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (99)
  1. pixeltable/__init__.py +18 -9
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/column.py +31 -50
  4. pixeltable/catalog/insertable_table.py +7 -6
  5. pixeltable/catalog/table.py +171 -57
  6. pixeltable/catalog/table_version.py +417 -140
  7. pixeltable/catalog/table_version_path.py +2 -2
  8. pixeltable/dataframe.py +239 -121
  9. pixeltable/env.py +82 -16
  10. pixeltable/exec/__init__.py +2 -1
  11. pixeltable/exec/cache_prefetch_node.py +1 -1
  12. pixeltable/exec/data_row_batch.py +6 -7
  13. pixeltable/exec/expr_eval_node.py +28 -28
  14. pixeltable/exec/in_memory_data_node.py +11 -7
  15. pixeltable/exec/sql_scan_node.py +7 -6
  16. pixeltable/exprs/__init__.py +4 -3
  17. pixeltable/exprs/column_ref.py +9 -0
  18. pixeltable/exprs/comparison.py +3 -3
  19. pixeltable/exprs/data_row.py +5 -1
  20. pixeltable/exprs/expr.py +15 -7
  21. pixeltable/exprs/function_call.py +17 -15
  22. pixeltable/exprs/image_member_access.py +9 -28
  23. pixeltable/exprs/in_predicate.py +96 -0
  24. pixeltable/exprs/inline_array.py +13 -11
  25. pixeltable/exprs/inline_dict.py +15 -13
  26. pixeltable/exprs/literal.py +16 -4
  27. pixeltable/exprs/row_builder.py +15 -41
  28. pixeltable/exprs/similarity_expr.py +65 -0
  29. pixeltable/ext/__init__.py +5 -0
  30. pixeltable/ext/functions/yolox.py +92 -0
  31. pixeltable/func/__init__.py +0 -2
  32. pixeltable/func/aggregate_function.py +18 -15
  33. pixeltable/func/callable_function.py +57 -13
  34. pixeltable/func/expr_template_function.py +20 -3
  35. pixeltable/func/function.py +35 -4
  36. pixeltable/func/globals.py +24 -14
  37. pixeltable/func/signature.py +23 -27
  38. pixeltable/func/udf.py +13 -12
  39. pixeltable/functions/__init__.py +8 -8
  40. pixeltable/functions/eval.py +7 -8
  41. pixeltable/functions/huggingface.py +64 -17
  42. pixeltable/functions/openai.py +36 -3
  43. pixeltable/functions/pil/image.py +61 -64
  44. pixeltable/functions/together.py +21 -0
  45. pixeltable/functions/util.py +11 -0
  46. pixeltable/globals.py +425 -0
  47. pixeltable/index/__init__.py +2 -0
  48. pixeltable/index/base.py +51 -0
  49. pixeltable/index/embedding_index.py +168 -0
  50. pixeltable/io/__init__.py +3 -0
  51. pixeltable/{utils → io}/hf_datasets.py +48 -17
  52. pixeltable/io/pandas.py +148 -0
  53. pixeltable/{utils → io}/parquet.py +58 -33
  54. pixeltable/iterators/__init__.py +1 -1
  55. pixeltable/iterators/base.py +4 -0
  56. pixeltable/iterators/document.py +218 -97
  57. pixeltable/iterators/video.py +8 -9
  58. pixeltable/metadata/__init__.py +7 -3
  59. pixeltable/metadata/converters/convert_12.py +3 -0
  60. pixeltable/metadata/converters/convert_13.py +41 -0
  61. pixeltable/metadata/schema.py +45 -22
  62. pixeltable/plan.py +15 -51
  63. pixeltable/store.py +38 -41
  64. pixeltable/tool/create_test_db_dump.py +39 -4
  65. pixeltable/type_system.py +47 -96
  66. pixeltable/utils/documents.py +42 -12
  67. pixeltable/utils/http_server.py +70 -0
  68. {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/METADATA +14 -10
  69. pixeltable-0.2.6.dist-info/RECORD +119 -0
  70. {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/WHEEL +1 -1
  71. pixeltable/client.py +0 -604
  72. pixeltable/exprs/image_similarity_predicate.py +0 -58
  73. pixeltable/func/batched_function.py +0 -53
  74. pixeltable/tests/conftest.py +0 -177
  75. pixeltable/tests/functions/test_fireworks.py +0 -42
  76. pixeltable/tests/functions/test_functions.py +0 -60
  77. pixeltable/tests/functions/test_huggingface.py +0 -158
  78. pixeltable/tests/functions/test_openai.py +0 -152
  79. pixeltable/tests/functions/test_together.py +0 -111
  80. pixeltable/tests/test_audio.py +0 -65
  81. pixeltable/tests/test_catalog.py +0 -27
  82. pixeltable/tests/test_client.py +0 -21
  83. pixeltable/tests/test_component_view.py +0 -370
  84. pixeltable/tests/test_dataframe.py +0 -439
  85. pixeltable/tests/test_dirs.py +0 -107
  86. pixeltable/tests/test_document.py +0 -120
  87. pixeltable/tests/test_exprs.py +0 -805
  88. pixeltable/tests/test_function.py +0 -324
  89. pixeltable/tests/test_migration.py +0 -43
  90. pixeltable/tests/test_nos.py +0 -54
  91. pixeltable/tests/test_snapshot.py +0 -208
  92. pixeltable/tests/test_table.py +0 -1267
  93. pixeltable/tests/test_transactional_directory.py +0 -42
  94. pixeltable/tests/test_types.py +0 -22
  95. pixeltable/tests/test_video.py +0 -159
  96. pixeltable/tests/test_view.py +0 -530
  97. pixeltable/tests/utils.py +0 -408
  98. pixeltable-0.2.4.dist-info/RECORD +0 -132
  99. {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/LICENSE +0 -0
@@ -13,7 +13,9 @@ import sqlalchemy.orm as orm
13
13
 
14
14
  import pixeltable
15
15
  import pixeltable.func as func
16
- from pixeltable import exceptions as excs
16
+ import pixeltable.type_system as ts
17
+ import pixeltable.exceptions as excs
18
+ import pixeltable.index as index
17
19
  from pixeltable.env import Env
18
20
  from pixeltable.iterators import ComponentIterator
19
21
  from pixeltable.metadata import schema
@@ -26,7 +28,8 @@ _logger = logging.getLogger('pixeltable')
26
28
 
27
29
  class TableVersion:
28
30
  """
29
- TableVersion represents a particular version of a table/view along with its store table:
31
+ TableVersion represents a particular version of a table/view along with its physical representation:
32
+ - the physical representation is a store table with indices
30
33
  - the version can be mutable or a snapshot
31
34
  - tables and their recursive views form a tree, and a mutable TableVersion also records its own
32
35
  mutable views in order to propagate updates
@@ -37,6 +40,15 @@ class TableVersion:
37
40
  have TableVersions reference those
38
41
  - mutable TableVersions record their TableVersionPath, which is needed for expr evaluation in updates
39
42
  """
43
+ @dataclasses.dataclass
44
+ class IndexInfo:
45
+ id: int
46
+ name: str
47
+ idx: index.IndexBase
48
+ col: Column
49
+ val_col: Column
50
+ undo_col: Column
51
+
40
52
 
41
53
  def __init__(
42
54
  self, id: UUID, tbl_md: schema.TableMd, version: int, schema_version_md: schema.TableSchemaVersionMd,
@@ -67,12 +79,13 @@ class TableVersion:
67
79
  self.base = base_path.tbl_version if base_path is not None else base
68
80
  if self.is_snapshot:
69
81
  self.next_col_id = -1
82
+ self.next_idx_id = -1 # TODO: can snapshots have separate indices?
70
83
  self.next_rowid = -1
71
84
  else:
72
85
  assert tbl_md.current_version == self.version
73
86
  self.next_col_id = tbl_md.next_col_id
87
+ self.next_idx_id = tbl_md.next_idx_id
74
88
  self.next_rowid = tbl_md.next_row_id
75
- self.column_history = tbl_md.column_history
76
89
 
77
90
  # view-specific initialization
78
91
  from pixeltable import exprs
@@ -101,8 +114,13 @@ class TableVersion:
101
114
  cat = catalog.Catalog.get()
102
115
  cat.tbl_versions[(self.id, self.effective_version)] = self
103
116
 
104
- # do this after we determined whether we're a component view, and before we create the store table
105
- self._init_schema(schema_version_md)
117
+ # init schema after we determined whether we're a component view, and before we create the store table
118
+ self.cols: List[Column] = [] # contains complete history of columns, incl dropped ones
119
+ self.cols_by_name: dict[str, Column] = {} # contains only user-facing (named) columns visible in this version
120
+ self.cols_by_id: dict[int, Column] = {} # contains only columns visible in this version
121
+ self.idx_md = tbl_md.index_md # needed for _create_tbl_md()
122
+ self.idxs_by_name: dict[str, TableVersion.IndexInfo] = {} # contains only actively maintained indices
123
+ self._init_schema(tbl_md, schema_version_md)
106
124
 
107
125
  def __hash__(self) -> int:
108
126
  return hash(self.id)
@@ -111,19 +129,21 @@ class TableVersion:
111
129
  """Create a snapshot copy of this TableVersion"""
112
130
  assert not self.is_snapshot
113
131
  return TableVersion(
114
- self.id, self._create_md(), self.version,
132
+ self.id, self._create_tbl_md(), self.version,
115
133
  self._create_schema_version_md(preceding_schema_version=0), # preceding_schema_version: dummy value
116
134
  is_snapshot=True, base=self.base)
117
135
 
118
136
  @classmethod
119
137
  def create(
120
- cls, session: orm.Session, dir_id: UUID, name: str, cols: List[Column], num_retained_versions: int, comment: str,
121
- base_path: Optional['pixeltable.catalog.TableVersionPath'] = None, view_md: Optional[schema.ViewMd] = None
138
+ cls, session: orm.Session, dir_id: UUID, name: str, cols: List[Column], num_retained_versions: int,
139
+ comment: str, base_path: Optional['pixeltable.catalog.TableVersionPath'] = None,
140
+ view_md: Optional[schema.ViewMd] = None
122
141
  ) -> Tuple[UUID, Optional[TableVersion]]:
123
142
  # assign ids
124
143
  cols_by_name: Dict[str, Column] = {}
125
144
  for pos, col in enumerate(cols):
126
145
  col.id = pos
146
+ col.schema_version_add = 0
127
147
  cols_by_name[col.name] = col
128
148
  if col.value_expr is None and col.compute_func is not None:
129
149
  cls._create_value_expr(col, base_path)
@@ -132,14 +152,11 @@ class TableVersion:
132
152
 
133
153
  ts = time.time()
134
154
  # create schema.Table
135
- column_history = {
136
- col.id: schema.ColumnHistory(col_id=col.id, schema_version_add=0, schema_version_drop=None)
137
- for col in cols
138
- }
155
+ # Column.dependent_cols for existing cols is wrong at this point, but init() will set it correctly
156
+ column_md = cls._create_column_md(cols)
139
157
  table_md = schema.TableMd(
140
158
  name=name, current_version=0, current_schema_version=0,
141
- next_col_id=len(cols), next_row_id=0, column_history=column_history,
142
- view_md=view_md)
159
+ next_col_id=len(cols), next_idx_id=0, next_row_id=0, column_md=column_md, index_md={}, view_md=view_md)
143
160
  tbl_record = schema.Table(dir_id=dir_id, md=dataclasses.asdict(table_md))
144
161
  session.add(tbl_record)
145
162
  session.flush() # sets tbl_record.id
@@ -152,16 +169,10 @@ class TableVersion:
152
169
  session.add(tbl_version_record)
153
170
 
154
171
  # create schema.TableSchemaVersion
155
- column_md: Dict[int, schema.SchemaColumn] = {}
156
- for pos, col in enumerate(cols):
157
- # Column.dependent_cols for existing cols is wrong at this point, but init() will set it correctly
158
- value_expr_dict = col.value_expr.as_dict() if col.value_expr is not None else None
159
- column_md[col.id] = schema.SchemaColumn(
160
- pos=pos, name=col.name, col_type=col.col_type.as_dict(),
161
- is_pk=col.primary_key, value_expr=value_expr_dict, stored=col.stored, is_indexed=col.is_indexed)
172
+ schema_col_md = {col.id: schema.SchemaColumn(pos=pos, name=col.name) for pos, col in enumerate(cols)}
162
173
 
163
174
  schema_version_md = schema.TableSchemaVersionMd(
164
- schema_version=0, preceding_schema_version=None, columns=column_md,
175
+ schema_version=0, preceding_schema_version=None, columns=schema_col_md,
165
176
  num_retained_versions=num_retained_versions, comment=comment)
166
177
  schema_version_record = schema.TableSchemaVersion(
167
178
  tbl_id=tbl_record.id, schema_version=0, md=dataclasses.asdict(schema_version_md))
@@ -202,21 +213,70 @@ class TableVersion:
202
213
  del cat.tbl_versions[(self.id, self.effective_version)]
203
214
  # TODO: remove from tbl_dependents
204
215
 
205
- def _init_schema(self, schema_version_md: schema.TableSchemaVersionMd) -> None:
206
- """Initialize self.cols as well as self.store_tbl"""
207
- self.cols = [Column.from_md(col_id, col_md, self) for col_id, col_md in schema_version_md.columns.items()]
208
- self.cols_by_name = {col.name: col for col in self.cols}
209
- self.cols_by_id = {col.id: col for col in self.cols}
210
-
211
- # make sure to traverse columns ordered by position = order in which cols were created;
212
- # this guarantees that references always point backwards
213
- from pixeltable import exprs
214
- for col, col_md in zip(self.cols, schema_version_md.columns.values()):
216
+ def _init_schema(self, tbl_md: schema.TableMd, schema_version_md: schema.TableSchemaVersionMd) -> None:
217
+ # create columns first, so the indices can reference them
218
+ self._init_cols(tbl_md, schema_version_md)
219
+ self._init_idxs(tbl_md)
220
+ # create the sa schema only after creating the columns and indices
221
+ self._init_sa_schema()
222
+
223
+ def _init_cols(self, tbl_md: schema.TableMd, schema_version_md: schema.TableSchemaVersionMd) -> None:
224
+ """Initialize self.cols with the columns visible in our effective version"""
225
+ import pixeltable.exprs as exprs
226
+ self.cols = []
227
+ self.cols_by_name = {}
228
+ self.cols_by_id = {}
229
+ for col_md in tbl_md.column_md.values():
230
+ col_name = schema_version_md.columns[col_md.id].name if col_md.id in schema_version_md.columns else None
231
+ col = Column(
232
+ col_id=col_md.id, name=col_name, col_type=ts.ColumnType.from_dict(col_md.col_type),
233
+ is_pk=col_md.is_pk, stored=col_md.stored,
234
+ schema_version_add=col_md.schema_version_add, schema_version_drop=col_md.schema_version_drop)
215
235
  col.tbl = self
236
+ self.cols.append(col)
237
+
238
+ # populate the lookup structures before Expr.from_dict()
239
+ if col_md.schema_version_add > self.schema_version:
240
+ # column was added after this version
241
+ continue
242
+ if col_md.schema_version_drop is not None and col_md.schema_version_drop <= self.schema_version:
243
+ # column was dropped
244
+ continue
245
+ if col.name is not None:
246
+ self.cols_by_name[col.name] = col
247
+ self.cols_by_id[col.id] = col
248
+
249
+ # make sure to traverse columns ordered by position = order in which cols were created;
250
+ # this guarantees that references always point backwards
216
251
  if col_md.value_expr is not None:
217
252
  col.value_expr = exprs.Expr.from_dict(col_md.value_expr)
218
253
  self._record_value_expr(col)
219
254
 
255
+ def _init_idxs(self, tbl_md: schema.TableMd) -> None:
256
+ self.idx_md = tbl_md.index_md
257
+ self.idxs_by_name = {}
258
+ import pixeltable.index as index_module
259
+ for md in tbl_md.index_md.values():
260
+ if md.schema_version_add > self.schema_version \
261
+ or md.schema_version_drop is not None and md.schema_version_drop <= self.schema_version:
262
+ # column not visible in this schema version
263
+ continue
264
+
265
+ # instantiate index object
266
+ cls_name = md.class_fqn.rsplit('.', 1)[-1]
267
+ cls = getattr(index_module, cls_name)
268
+ idx_col = self.cols_by_id[md.indexed_col_id]
269
+ idx = cls.from_dict(idx_col, md.init_args)
270
+
271
+ # fix up the sa column type of the index value and undo columns
272
+ val_col = self.cols_by_id[md.index_val_col_id]
273
+ val_col.sa_col_type = idx.index_sa_type()
274
+ undo_col = self.cols_by_id[md.index_val_undo_col_id]
275
+ undo_col.sa_col_type = idx.index_sa_type()
276
+ idx_info = self.IndexInfo(id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
277
+ self.idxs_by_name[md.name] = idx_info
278
+
279
+ def _init_sa_schema(self) -> None:
220
280
  # create the sqlalchemy schema; do this after instantiating columns, in order to determine whether they
221
281
  # need to record errors
222
282
  from pixeltable.store import StoreBase, StoreTable, StoreView, StoreComponentView
@@ -227,8 +287,7 @@ class TableVersion:
227
287
  else:
228
288
  self.store_tbl: StoreBase = StoreTable(self)
229
289
 
230
- def _update_md(
231
- self, ts: float, preceding_schema_version: Optional[int], conn: sql.engine.Connection) -> None:
290
+ def _update_md(self, ts: float, preceding_schema_version: Optional[int], conn: sql.engine.Connection) -> None:
232
291
  """Update all recorded metadata in response to a data or schema change.
233
292
  Args:
234
293
  ts: timestamp of the change
@@ -236,8 +295,9 @@ class TableVersion:
236
295
  """
237
296
  conn.execute(
238
297
  sql.update(schema.Table.__table__)
239
- .values({schema.Table.md: dataclasses.asdict(self._create_md())})
298
+ .values({schema.Table.md: dataclasses.asdict(self._create_tbl_md())})
240
299
  .where(schema.Table.id == self.id))
300
+
241
301
  version_md = self._create_version_md(ts)
242
302
  conn.execute(
243
303
  sql.insert(schema.TableVersion.__table__)
@@ -250,6 +310,80 @@ class TableVersion:
250
310
  tbl_id=self.id, schema_version=self.schema_version,
251
311
  md=dataclasses.asdict(schema_version_md)))
252
312
 
313
+ def _store_idx_name(self, idx_id: int) -> str:
314
+ """Return name of index in the store, which needs to be globally unique"""
315
+ return f'idx_{self.id.hex}_{idx_id}'
316
+
317
+ def add_index(self, col: Column, idx_name: Optional[str], idx: index.IndexBase) -> UpdateStatus:
318
+ assert not self.is_snapshot
319
+ idx_id = self.next_idx_id
320
+ self.next_idx_id += 1
321
+ if idx_name is None:
322
+ idx_name = f'idx{idx_id}'
323
+ else:
324
+ assert is_valid_identifier(idx_name)
325
+ assert idx_name not in [i.name for i in self.idx_md.values()]
326
+
327
+ # we're creating a new schema version
328
+ self.version += 1
329
+ preceding_schema_version = self.schema_version
330
+ self.schema_version = self.version
331
+ with Env.get().engine.begin() as conn:
332
+ # add the index value and undo columns (which need to be nullable);
333
+ # we don't create a new schema version, because indices aren't part of the logical schema
334
+ val_col = Column(
335
+ col_id=self.next_col_id, name=None, computed_with=idx.index_value_expr(),
336
+ sa_col_type=idx.index_sa_type(), stored=True,
337
+ schema_version_add=self.schema_version, schema_version_drop=None)
338
+ val_col.tbl = self
339
+ val_col.col_type.nullable = True
340
+ self.next_col_id += 1
341
+
342
+ undo_col = Column(
343
+ col_id=self.next_col_id, name=None, col_type=val_col.col_type,
344
+ sa_col_type=val_col.sa_col_type, stored=True,
345
+ schema_version_add=self.schema_version, schema_version_drop=None)
346
+ undo_col.tbl = self
347
+ undo_col.col_type.nullable = True
348
+ self.next_col_id += 1
349
+
350
+ # create and register the index metadata
351
+ idx_cls = type(idx)
352
+ idx_md = schema.IndexMd(
353
+ id=idx_id, name=idx_name,
354
+ indexed_col_id=col.id, index_val_col_id=val_col.id, index_val_undo_col_id=undo_col.id,
355
+ schema_version_add=self.schema_version, schema_version_drop=None,
356
+ class_fqn=idx_cls.__module__ + '.' + idx_cls.__name__, init_args=idx.as_dict())
357
+ idx_info = self.IndexInfo(id=idx_id, name=idx_name, idx=idx, col=col, val_col=val_col, undo_col=undo_col)
358
+ self.idx_md[idx_id] = idx_md
359
+ self.idxs_by_name[idx_name] = idx_info
360
+
361
+ # add the columns and update the metadata
362
+ status = self._add_columns([val_col, undo_col], conn, preceding_schema_version=preceding_schema_version)
363
+ # now create the index structure
364
+ idx.create_index(self._store_idx_name(idx_id), val_col, conn)
365
+
366
+ _logger.info(f'Added index {idx_name} on column {col.name} to table {self.name}')
367
+ return status
368
+
369
+ def drop_index(self, idx_id: int) -> None:
370
+ assert not self.is_snapshot
371
+ assert idx_id in self.idx_md
372
+
373
+ # we're creating a new schema version
374
+ self.version += 1
375
+ preceding_schema_version = self.schema_version
376
+ self.schema_version = self.version
377
+ idx_md = self.idx_md[idx_id]
378
+ idx_md.schema_version_drop = self.schema_version
379
+ assert idx_md.name in self.idxs_by_name
380
+ idx_info = self.idxs_by_name[idx_md.name]
381
+ del self.idxs_by_name[idx_md.name]
382
+
383
+ with Env.get().engine.begin() as conn:
384
+ self._drop_columns([idx_info.val_col, idx_info.undo_col], conn, preceding_schema_version)
385
+ _logger.info(f'Dropped index {idx_md.name} on table {self.name}')
386
+
253
387
  def add_column(self, col: Column, print_stats: bool = False) -> UpdateStatus:
254
388
  """Adds a column to the table.
255
389
  """
@@ -268,60 +402,86 @@ class TableVersion:
268
402
  col.check_value_expr()
269
403
  self._record_value_expr(col)
270
404
 
271
- row_count = self.store_tbl.count()
272
- if row_count > 0 and not col.col_type.nullable and not col.is_computed:
273
- raise excs.Error(f'Cannot add non-nullable column "{col.name}" to table {self.name} with existing rows')
274
-
275
405
  # we're creating a new schema version
276
- ts = time.time()
277
406
  self.version += 1
278
407
  preceding_schema_version = self.schema_version
279
408
  self.schema_version = self.version
409
+ with Env.get().engine.begin() as conn:
410
+ status = self._add_columns([col], conn, preceding_schema_version, print_stats=print_stats)
411
+ _logger.info(f'Added column {col.name} to table {self.name}, new version: {self.version}')
280
412
 
281
- self.cols.append(col)
282
- self.cols_by_name[col.name] = col
283
- self.cols_by_id[col.id] = col
284
- self.column_history[col.id] = schema.ColumnHistory(col.id, self.schema_version, None)
413
+ msg = (
414
+ f'Added {status.num_rows} column value{"" if status.num_rows == 1 else "s"} '
415
+ f'with {status.num_excs} error{"" if status.num_excs == 1 else "s"}.'
416
+ )
417
+ print(msg)
418
+ _logger.info(f'Column {col.name}: {msg}')
419
+ return status
420
+
421
+ def _add_columns(
422
+ self, cols: List[Column], conn: sql.engine.Connection, preceding_schema_version: Optional[int] = None,
423
+ print_stats: bool = False
424
+ ) -> UpdateStatus:
425
+ """Add and populate columns within the current transaction"""
426
+ ts = time.time()
427
+
428
+ row_count = self.store_tbl.count(conn=conn)
429
+ for col in cols:
430
+ if not col.col_type.nullable and not col.is_computed:
431
+ if row_count > 0:
432
+ raise excs.Error(
433
+ f'Cannot add non-nullable column "{col.name}" to table {self.name} with existing rows')
434
+
435
+ num_excs = 0
436
+ cols_with_excs: List[Column] = []
437
+ for col in cols:
438
+ col.schema_version_add = self.schema_version
439
+ # add the column to the lookup structures now, rather than after the store changes executed successfully,
440
+ # because it might be referenced by the next column's value_expr
441
+ self.cols.append(col)
442
+ if col.name is not None:
443
+ self.cols_by_name[col.name] = col
444
+ self.cols_by_id[col.id] = col
285
445
 
286
- with Env.get().engine.begin() as conn:
287
- self._update_md(ts, preceding_schema_version, conn)
288
- _logger.info(f'Added column {col.name} to table {self.name}, new version: {self.version}')
289
446
  if col.is_stored:
290
447
  self.store_tbl.add_column(col, conn)
291
448
 
292
- print(f'Added column `{col.name}` to table `{self.name}`.')
293
- if row_count == 0:
294
- return UpdateStatus()
295
- if (not col.is_computed or not col.is_stored) and not col.is_indexed:
296
- return UpdateStatus(num_rows=row_count)
297
- # compute values for the existing rows and compute embeddings, if this column is indexed;
298
- # for some reason, it's not possible to run the following updates in the same transaction as the one
299
- # that we just used to create the metadata (sqlalchemy hangs when exec() tries to run the query)
300
- from pixeltable.plan import Planner
301
- plan, value_expr_slot_idx, embedding_slot_idx = Planner.create_add_column_plan(self.path, col)
302
- plan.ctx.num_rows = row_count
303
- # TODO: create pgvector index, if col is indexed
449
+ if not col.is_computed or not col.is_stored or row_count == 0:
450
+ continue
451
+
452
+ # populate the column
453
+ from pixeltable.plan import Planner
454
+ plan, value_expr_slot_idx = Planner.create_add_column_plan(self.path, col)
455
+ plan.ctx.num_rows = row_count
304
456
 
305
- try:
306
- # TODO: do this in the same transaction as the metadata update
307
- with Env.get().engine.begin() as conn:
457
+ try:
308
458
  plan.ctx.conn = conn
309
459
  plan.open()
310
- num_excs = self.store_tbl.load_column(col, plan, value_expr_slot_idx, embedding_slot_idx, conn)
311
- except sql.exc.DBAPIError as e:
312
- self.drop_column(col.name)
313
- raise excs.Error(f'Error during SQL execution:\n{e}')
314
- finally:
315
- plan.close()
316
-
317
- msg = f'Added {row_count} column value{"" if row_count == 1 else "s"} with {num_excs} error{"" if num_excs == 1 else "s"}.'
318
- print(msg)
319
- _logger.info(f'Column {col.name}: {msg}')
460
+ num_excs = self.store_tbl.load_column(col, plan, value_expr_slot_idx, conn)
461
+ if num_excs > 0:
462
+ cols_with_excs.append(col)
463
+ except sql.exc.DBAPIError as e:
464
+ self.cols.pop()
465
+ for col in cols:
466
+ # remove columns that we already added
467
+ if col.id not in self.cols_by_id:
468
+ continue
469
+ if col.name is not None:
470
+ del self.cols_by_name[col.name]
471
+ del self.cols_by_id[col.id]
472
+ # we need to re-initialize the sqlalchemy schema
473
+ self.store_tbl.create_sa_tbl()
474
+ raise excs.Error(f'Error during SQL execution:\n{e}')
475
+ finally:
476
+ plan.close()
477
+
478
+ self._update_md(ts, preceding_schema_version, conn)
320
479
  if print_stats:
321
480
  plan.ctx.profile.print(num_rows=row_count)
481
+ # TODO(mkornacker): what to do about system columns with exceptions?
322
482
  return UpdateStatus(
323
483
  num_rows=row_count, num_computed_values=row_count, num_excs=num_excs,
324
- cols_with_excs=[f'{self.name}.{col.name}'] if num_excs > 0 else [])
484
+ cols_with_excs=[f'{col.tbl.name}.{col.name}'for col in cols_with_excs if col.name is not None])
325
485
 
326
486
  def drop_column(self, name: str) -> None:
327
487
  """Drop a column from the table.
@@ -330,35 +490,58 @@ class TableVersion:
330
490
  if name not in self.cols_by_name:
331
491
  raise excs.Error(f'Unknown column: {name}')
332
492
  col = self.cols_by_name[name]
333
- if len(col.dependent_cols) > 0:
493
+ dependent_user_cols = [c for c in col.dependent_cols if c.name is not None]
494
+ if len(dependent_user_cols) > 0:
334
495
  raise excs.Error(
335
496
  f'Cannot drop column {name} because the following columns depend on it:\n',
336
- f'{", ".join([c.name for c in col.dependent_cols])}')
337
-
338
- if col.value_expr is not None:
339
- # update Column.dependent_cols
340
- for c in self.cols:
341
- if c == col:
342
- break
343
- c.dependent_cols.discard(col)
497
+ f'{", ".join([c.name for c in dependent_user_cols])}')
344
498
 
345
499
  # we're creating a new schema version
346
- ts = time.time()
347
500
  self.version += 1
348
501
  preceding_schema_version = self.schema_version
349
502
  self.schema_version = self.version
350
503
 
351
- self.cols.remove(col)
352
- del self.cols_by_name[name]
353
- del self.cols_by_id[col.id]
354
- self.column_history[col.id].schema_version_drop = self.schema_version
355
-
356
504
  with Env.get().engine.begin() as conn:
357
- self._update_md(ts, preceding_schema_version, conn)
358
- if col.is_stored:
359
- self.store_tbl.drop_column()
505
+ # drop this column and all dependent index columns and indices
506
+ dropped_cols = [col]
507
+ dropped_idx_names: List[str] = []
508
+ for idx_info in self.idxs_by_name.values():
509
+ if idx_info.col != col:
510
+ continue
511
+ dropped_cols.extend([idx_info.val_col, idx_info.undo_col])
512
+ idx_md = self.idx_md[idx_info.id]
513
+ idx_md.schema_version_drop = self.schema_version
514
+ assert idx_md.name in self.idxs_by_name
515
+ dropped_idx_names.append(idx_md.name)
516
+ # update idxs_by_name
517
+ for idx_name in dropped_idx_names:
518
+ del self.idxs_by_name[idx_name]
519
+ self._drop_columns(dropped_cols, conn, preceding_schema_version)
360
520
  _logger.info(f'Dropped column {name} from table {self.name}, new version: {self.version}')
361
521
 
522
+ def _drop_columns(self, cols: list[Column], conn: sql.engine.Connection, preceding_schema_version: int) -> None:
523
+ """Mark columns as dropped"""
524
+ assert not self.is_snapshot
525
+
526
+ ts = time.time()
527
+ for col in cols:
528
+ if col.value_expr is not None:
529
+ # update Column.dependent_cols
530
+ for c in self.cols:
531
+ if c == col:
532
+ break
533
+ c.dependent_cols.discard(col)
534
+
535
+ col.schema_version_drop = self.schema_version
536
+ if col.name is not None:
537
+ assert col.name in self.cols_by_name
538
+ del self.cols_by_name[col.name]
539
+ assert col.id in self.cols_by_id
540
+ del self.cols_by_id[col.id]
541
+
542
+ self._update_md(ts, preceding_schema_version, conn)
543
+ self.store_tbl.create_sa_tbl()
544
+
362
545
  def rename_column(self, old_name: str, new_name: str) -> None:
363
546
  """Rename a column.
364
547
  """
@@ -387,14 +570,14 @@ class TableVersion:
387
570
  def set_comment(self, new_comment: Optional[str]):
388
571
  _logger.info(f'[{self.name}] Updating comment: {new_comment}')
389
572
  self.comment = new_comment
390
- self._commit_new_schema_version()
573
+ self._create_schema_version()
391
574
 
392
575
  def set_num_retained_versions(self, new_num_retained_versions: int):
393
576
  _logger.info(f'[{self.name}] Updating num_retained_versions: {new_num_retained_versions} (was {self.num_retained_versions})')
394
577
  self.num_retained_versions = new_num_retained_versions
395
- self._commit_new_schema_version()
578
+ self._create_schema_version()
396
579
 
397
- def _commit_new_schema_version(self):
580
+ def _create_schema_version(self):
398
581
  # we're creating a new schema version
399
582
  ts = time.time()
400
583
  self.version += 1
@@ -448,7 +631,67 @@ class TableVersion:
448
631
  return result
449
632
 
450
633
  def update(
451
- self, update_targets: Optional[List[Tuple[Column, 'pixeltable.exprs.Expr']]] = None,
634
+ self, update_targets: dict[Column, 'pixeltable.exprs.Expr'],
635
+ where_clause: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
636
+ ) -> UpdateStatus:
637
+ with Env.get().engine.begin() as conn:
638
+ return self._update(conn, update_targets, where_clause, cascade)
639
+
640
+ def batch_update(
641
+ self, batch: list[dict[Column, 'pixeltable.exprs.Expr']], rowids: list[Tuple[int, ...]],
642
+ cascade: bool = True
643
+ ) -> UpdateStatus:
644
+ """Update rows in batch.
645
+ Args:
646
+ batch: one dict per row, each mapping Columns to LiteralExprs representing the new values
647
+ rowids: if not empty, one tuple per row, each containing the rowid values for the corresponding row in batch
648
+ """
649
+ # if we do lookups of rowids, we must have one for each row in the batch
650
+ assert len(rowids) == 0 or len(rowids) == len(batch)
651
+ import pixeltable.exprs as exprs
652
+ result_status = UpdateStatus()
653
+ cols_with_excs: set[str] = set()
654
+ updated_cols: set[str] = set()
655
+ pk_cols = self.primary_key_columns()
656
+ use_rowids = len(rowids) > 0
657
+
658
+ with Env.get().engine.begin() as conn:
659
+ for i, row in enumerate(batch):
660
+ where_clause: Optional[exprs.Expr] = None
661
+ if use_rowids:
662
+ # construct Where clause to match rowid
663
+ num_rowid_cols = len(self.store_tbl.rowid_columns())
664
+ for col_idx in range(num_rowid_cols):
665
+ assert len(rowids[i]) == num_rowid_cols
666
+ clause = exprs.RowidRef(self, col_idx) == rowids[i][col_idx]
667
+ if where_clause is None:
668
+ where_clause = clause
669
+ else:
670
+ where_clause = where_clause & clause
671
+ else:
672
+ # construct Where clause for primary key columns
673
+ for col in pk_cols:
674
+ assert col in row
675
+ clause = exprs.ColumnRef(col) == row[col]
676
+ if where_clause is None:
677
+ where_clause = clause
678
+ else:
679
+ where_clause = where_clause & clause
680
+
681
+ update_targets = {col: row[col] for col in row if col not in pk_cols}
682
+ status = self._update(conn, update_targets, where_clause, cascade)
683
+ result_status.num_rows += status.num_rows
684
+ result_status.num_excs += status.num_excs
685
+ result_status.num_computed_values += status.num_computed_values
686
+ cols_with_excs.update(status.cols_with_excs)
687
+ updated_cols.update(status.updated_cols)
688
+
689
+ result_status.cols_with_excs = list(cols_with_excs)
690
+ result_status.updated_cols = list(updated_cols)
691
+ return result_status
692
+
693
+ def _update(
694
+ self, conn: sql.engine.Connection, update_targets: dict[Column, 'pixeltable.exprs.Expr'],
452
695
  where_clause: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
453
696
  ) -> UpdateStatus:
454
697
  """Update rows in this table.
@@ -458,21 +701,18 @@ class TableVersion:
458
701
  cascade: if True, also update all computed columns that transitively depend on the updated columns,
459
702
  including within views.
460
703
  """
461
- if update_targets is None:
462
- update_targets = []
463
704
  assert not self.is_snapshot
464
705
  from pixeltable.plan import Planner
465
706
  plan, updated_cols, recomputed_cols = \
466
707
  Planner.create_update_plan(self.path, update_targets, [], where_clause, cascade)
467
- with Env.get().engine.begin() as conn:
468
- ts = time.time()
469
- result = self._update(
470
- plan, where_clause.sql_expr() if where_clause is not None else None, recomputed_cols,
471
- base_versions=[], conn=conn, ts=ts, cascade=cascade)
472
- result.updated_cols = updated_cols
473
- return result
708
+ ts = time.time()
709
+ result = self._propagate_update(
710
+ plan, where_clause.sql_expr() if where_clause is not None else None, recomputed_cols,
711
+ base_versions=[], conn=conn, ts=ts, cascade=cascade)
712
+ result.updated_cols = updated_cols
713
+ return result
474
714
 
475
- def _update(
715
+ def _propagate_update(
476
716
  self, plan: Optional[exec.ExecNode], where_clause: Optional[sql.ClauseElement],
477
717
  recomputed_view_cols: List[Column], base_versions: List[Optional[int]], conn: sql.engine.Connection,
478
718
  ts: float, cascade: bool
@@ -497,7 +737,7 @@ class TableVersion:
497
737
  if len(recomputed_cols) > 0:
498
738
  from pixeltable.plan import Planner
499
739
  plan = Planner.create_view_update_plan(view.path, recompute_targets=recomputed_cols)
500
- status = view._update(
740
+ status = view._propagate_update(
501
741
  plan, None, recomputed_view_cols, base_versions=base_versions, conn=conn, ts=ts, cascade=True)
502
742
  result.num_rows += status.num_rows
503
743
  result.num_excs += status.num_excs
@@ -554,6 +794,15 @@ class TableVersion:
554
794
  self._revert(session)
555
795
  session.commit()
556
796
 
797
+ def _delete_column(self, col: Column, conn: sql.engine.Connection) -> None:
798
+ """Physically remove the column from the schema and the store table"""
799
+ if col.is_stored:
800
+ self.store_tbl.drop_column(col, conn)
801
+ self.cols.remove(col)
802
+ if col.name is not None:
803
+ del self.cols_by_name[col.name]
804
+ del self.cols_by_id[col.id]
805
+
557
806
  def _revert(self, session: orm.Session) -> None:
558
807
  """Reverts this table version and propagates to views"""
559
808
  conn = session.connection()
@@ -577,28 +826,47 @@ class TableVersion:
577
826
  # delete newly-added data
578
827
  MediaStore.delete(self.id, version=self.version)
579
828
  conn.execute(sql.delete(self.store_tbl.sa_tbl).where(self.store_tbl.sa_tbl.c.v_min == self.version))
580
- # revert new deletions
581
- conn.execute(
582
- sql.update(self.store_tbl.sa_tbl) \
583
- .values({self.store_tbl.sa_tbl.c.v_max: schema.Table.MAX_VERSION})
584
- .where(self.store_tbl.sa_tbl.c.v_max == self.version))
585
829
 
830
+ # revert new deletions
831
+ set_clause = {self.store_tbl.sa_tbl.c.v_max: schema.Table.MAX_VERSION}
832
+ for index_info in self.idxs_by_name.values():
833
+ # copy the index value back from the undo column and reset the undo column to NULL
834
+ set_clause[index_info.val_col.sa_col] = index_info.undo_col.sa_col
835
+ set_clause[index_info.undo_col.sa_col] = None
836
+ stmt = sql.update(self.store_tbl.sa_tbl) \
837
+ .values(set_clause) \
838
+ .where(self.store_tbl.sa_tbl.c.v_max == self.version)
839
+ conn.execute(stmt)
840
+
841
+ # revert schema changes
586
842
  if self.version == self.schema_version:
587
- # the current version involved a schema change:
588
- # if the schema change was to add a column, we now need to drop it
589
- added_col_ids = [
590
- col_history.col_id for col_history in self.column_history.values()
591
- if col_history.schema_version_add == self.schema_version
592
- ]
593
- assert len(added_col_ids) <= 1
594
- added_col: Optional[Column] = None
595
- if len(added_col_ids) == 1:
596
- added_col_id = added_col_ids[0]
597
- # drop this newly-added column and its ColumnHistory record
598
- c = self.cols_by_id[added_col_id]
599
- if c.is_stored:
600
- added_col = c
601
- del self.column_history[c.id]
843
+ # delete newly-added columns
844
+ added_cols = [col for col in self.cols if col.schema_version_add == self.schema_version]
845
+ if len(added_cols) > 0:
846
+ next_col_id = min(col.id for col in added_cols)
847
+ for col in added_cols:
848
+ self._delete_column(col, conn)
849
+ self.next_col_id = next_col_id
850
+
851
+ # remove newly-added indices from the lookup structures
852
+ # (the value and undo columns got removed in the preceding step)
853
+ added_idx_md = [md for md in self.idx_md.values() if md.schema_version_add == self.schema_version]
854
+ if len(added_idx_md) > 0:
855
+ next_idx_id = min(md.id for md in added_idx_md)
856
+ for md in added_idx_md:
857
+ del self.idx_md[md.id]
858
+ del self.idxs_by_name[md.name]
859
+ self.next_idx_id = next_idx_id
860
+
861
+ # make newly-dropped columns visible again
862
+ dropped_cols = [col for col in self.cols if col.schema_version_drop == self.schema_version]
863
+ for col in dropped_cols:
864
+ col.schema_version_drop = None
865
+
866
+ # make newly-dropped indices visible again
867
+ dropped_idx_md = [md for md in self.idx_md.values() if md.schema_version_drop == self.schema_version]
868
+ for md in dropped_idx_md:
869
+ md.schema_version_drop = None
602
870
 
603
871
  # we need to determine the preceding schema version and reload the schema
604
872
  schema_version_md_dict = session.query(schema.TableSchemaVersion.md) \
@@ -612,11 +880,8 @@ class TableVersion:
612
880
  .scalar()
613
881
  preceding_schema_version_md = schema.md_from_dict(
614
882
  schema.TableSchemaVersionMd, preceding_schema_version_md_dict)
615
- self._init_schema(preceding_schema_version_md)
616
-
617
- # physically drop the column, but only after we have re-created the schema
618
- if added_col is not None:
619
- self.store_tbl.drop_column(added_col, conn)
883
+ tbl_md = self._create_tbl_md()
884
+ self._init_schema(tbl_md, preceding_schema_version_md)
620
885
 
621
886
  conn.execute(
622
887
  sql.delete(schema.TableSchemaVersion.__table__)
@@ -634,7 +899,7 @@ class TableVersion:
634
899
  self.version -= 1
635
900
  conn.execute(
636
901
  sql.update(schema.Table.__table__)
637
- .values({schema.Table.md: dataclasses.asdict(self._create_md())})
902
+ .values({schema.Table.md: dataclasses.asdict(self._create_tbl_md())})
638
903
  .where(schema.Table.id == self.id))
639
904
 
640
905
  # propagate to views
@@ -667,6 +932,10 @@ class TableVersion:
667
932
  """Return all non-system columns"""
668
933
  return [c for c in self.cols if not self.is_system_column(c)]
669
934
 
935
+ def primary_key_columns(self) -> List[Column]:
936
+ """Return all primary key columns"""
937
+ return [c for c in self.cols if c.is_pk]
938
+
670
939
  def get_required_col_names(self) -> List[str]:
671
940
  """Return the names of all columns for which values must be specified in insert()"""
672
941
  assert not self.is_view()
@@ -727,22 +996,30 @@ class TableVersion:
727
996
  return 1 + self.base.num_rowid_columns()
728
997
  return 1
729
998
 
730
- def _create_md(self) -> schema.TableMd:
999
+ @classmethod
1000
+ def _create_column_md(cls, cols: List[Column]) -> dict[int, schema.ColumnMd]:
1001
+ column_md: Dict[int, schema.ColumnMd] = {}
1002
+ for col in cols:
1003
+ value_expr_dict = col.value_expr.as_dict() if col.value_expr is not None else None
1004
+ column_md[col.id] = schema.ColumnMd(
1005
+ id=col.id, col_type=col.col_type.as_dict(), is_pk=col.is_pk,
1006
+ schema_version_add=col.schema_version_add, schema_version_drop=col.schema_version_drop,
1007
+ value_expr=value_expr_dict, stored=col.stored)
1008
+ return column_md
1009
+
1010
+ def _create_tbl_md(self) -> schema.TableMd:
731
1011
  return schema.TableMd(
732
1012
  name=self.name, current_version=self.version, current_schema_version=self.schema_version,
733
- next_col_id=self.next_col_id, next_row_id=self.next_rowid, column_history=self.column_history,
734
- view_md=self.view_md)
1013
+ next_col_id=self.next_col_id, next_idx_id=self.next_idx_id, next_row_id=self.next_rowid,
1014
+ column_md=self._create_column_md(self.cols), index_md=self.idx_md, view_md=self.view_md)
735
1015
 
736
1016
  def _create_version_md(self, ts: float) -> schema.TableVersionMd:
737
1017
  return schema.TableVersionMd(created_at=ts, version=self.version, schema_version=self.schema_version)
738
1018
 
739
1019
  def _create_schema_version_md(self, preceding_schema_version: int) -> schema.TableSchemaVersionMd:
740
1020
  column_md: Dict[int, schema.SchemaColumn] = {}
741
- for pos, col in enumerate(self.cols):
742
- value_expr_dict = col.value_expr.as_dict() if col.value_expr is not None else None
743
- column_md[col.id] = schema.SchemaColumn(
744
- pos=pos, name=col.name, col_type=col.col_type.as_dict(),
745
- is_pk=col.primary_key, value_expr=value_expr_dict, stored=col.stored, is_indexed=col.is_indexed)
1021
+ for pos, col in enumerate(self.cols_by_name.values()):
1022
+ column_md[col.id] = schema.SchemaColumn(pos=pos, name=col.name)
746
1023
  # preceding_schema_version to be set by the caller
747
1024
  return schema.TableSchemaVersionMd(
748
1025
  schema_version=self.schema_version, preceding_schema_version=preceding_schema_version,