pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic; see the registry's advisory page for more details.

Files changed (110)
  1. pixeltable/__init__.py +20 -9
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/column.py +23 -7
  4. pixeltable/catalog/insertable_table.py +32 -19
  5. pixeltable/catalog/table.py +210 -20
  6. pixeltable/catalog/table_version.py +272 -111
  7. pixeltable/catalog/table_version_path.py +6 -1
  8. pixeltable/dataframe.py +184 -110
  9. pixeltable/datatransfer/__init__.py +1 -0
  10. pixeltable/datatransfer/label_studio.py +526 -0
  11. pixeltable/datatransfer/remote.py +113 -0
  12. pixeltable/env.py +213 -79
  13. pixeltable/exec/__init__.py +2 -1
  14. pixeltable/exec/data_row_batch.py +6 -7
  15. pixeltable/exec/expr_eval_node.py +28 -28
  16. pixeltable/exec/sql_scan_node.py +7 -6
  17. pixeltable/exprs/__init__.py +4 -3
  18. pixeltable/exprs/column_ref.py +11 -2
  19. pixeltable/exprs/comparison.py +39 -1
  20. pixeltable/exprs/data_row.py +7 -0
  21. pixeltable/exprs/expr.py +26 -19
  22. pixeltable/exprs/function_call.py +17 -18
  23. pixeltable/exprs/globals.py +14 -2
  24. pixeltable/exprs/image_member_access.py +9 -28
  25. pixeltable/exprs/in_predicate.py +96 -0
  26. pixeltable/exprs/inline_array.py +13 -11
  27. pixeltable/exprs/inline_dict.py +15 -13
  28. pixeltable/exprs/row_builder.py +7 -1
  29. pixeltable/exprs/similarity_expr.py +67 -0
  30. pixeltable/ext/functions/whisperx.py +30 -0
  31. pixeltable/ext/functions/yolox.py +16 -0
  32. pixeltable/func/__init__.py +0 -2
  33. pixeltable/func/aggregate_function.py +5 -2
  34. pixeltable/func/callable_function.py +57 -13
  35. pixeltable/func/expr_template_function.py +14 -3
  36. pixeltable/func/function.py +35 -4
  37. pixeltable/func/signature.py +5 -15
  38. pixeltable/func/udf.py +8 -12
  39. pixeltable/functions/fireworks.py +9 -4
  40. pixeltable/functions/huggingface.py +48 -5
  41. pixeltable/functions/openai.py +49 -11
  42. pixeltable/functions/pil/image.py +61 -64
  43. pixeltable/functions/together.py +32 -6
  44. pixeltable/functions/util.py +0 -43
  45. pixeltable/functions/video.py +46 -8
  46. pixeltable/globals.py +443 -0
  47. pixeltable/index/__init__.py +1 -0
  48. pixeltable/index/base.py +9 -2
  49. pixeltable/index/btree.py +54 -0
  50. pixeltable/index/embedding_index.py +91 -15
  51. pixeltable/io/__init__.py +4 -0
  52. pixeltable/io/globals.py +59 -0
  53. pixeltable/{utils → io}/hf_datasets.py +48 -17
  54. pixeltable/io/pandas.py +148 -0
  55. pixeltable/{utils → io}/parquet.py +58 -33
  56. pixeltable/iterators/__init__.py +1 -1
  57. pixeltable/iterators/base.py +8 -4
  58. pixeltable/iterators/document.py +225 -93
  59. pixeltable/iterators/video.py +16 -9
  60. pixeltable/metadata/__init__.py +8 -4
  61. pixeltable/metadata/converters/convert_12.py +3 -0
  62. pixeltable/metadata/converters/convert_13.py +41 -0
  63. pixeltable/metadata/converters/convert_14.py +13 -0
  64. pixeltable/metadata/converters/convert_15.py +29 -0
  65. pixeltable/metadata/converters/util.py +63 -0
  66. pixeltable/metadata/schema.py +12 -6
  67. pixeltable/plan.py +11 -24
  68. pixeltable/store.py +16 -23
  69. pixeltable/tool/create_test_db_dump.py +49 -14
  70. pixeltable/type_system.py +27 -58
  71. pixeltable/utils/coco.py +94 -0
  72. pixeltable/utils/documents.py +42 -12
  73. pixeltable/utils/http_server.py +70 -0
  74. pixeltable-0.2.7.dist-info/METADATA +137 -0
  75. pixeltable-0.2.7.dist-info/RECORD +126 -0
  76. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
  77. pixeltable/client.py +0 -600
  78. pixeltable/exprs/image_similarity_predicate.py +0 -58
  79. pixeltable/func/batched_function.py +0 -53
  80. pixeltable/func/nos_function.py +0 -202
  81. pixeltable/tests/conftest.py +0 -171
  82. pixeltable/tests/ext/test_yolox.py +0 -21
  83. pixeltable/tests/functions/test_fireworks.py +0 -43
  84. pixeltable/tests/functions/test_functions.py +0 -60
  85. pixeltable/tests/functions/test_huggingface.py +0 -158
  86. pixeltable/tests/functions/test_openai.py +0 -162
  87. pixeltable/tests/functions/test_together.py +0 -112
  88. pixeltable/tests/test_audio.py +0 -65
  89. pixeltable/tests/test_catalog.py +0 -27
  90. pixeltable/tests/test_client.py +0 -21
  91. pixeltable/tests/test_component_view.py +0 -379
  92. pixeltable/tests/test_dataframe.py +0 -440
  93. pixeltable/tests/test_dirs.py +0 -107
  94. pixeltable/tests/test_document.py +0 -120
  95. pixeltable/tests/test_exprs.py +0 -802
  96. pixeltable/tests/test_function.py +0 -332
  97. pixeltable/tests/test_index.py +0 -138
  98. pixeltable/tests/test_migration.py +0 -44
  99. pixeltable/tests/test_nos.py +0 -54
  100. pixeltable/tests/test_snapshot.py +0 -231
  101. pixeltable/tests/test_table.py +0 -1343
  102. pixeltable/tests/test_transactional_directory.py +0 -42
  103. pixeltable/tests/test_types.py +0 -52
  104. pixeltable/tests/test_video.py +0 -159
  105. pixeltable/tests/test_view.py +0 -535
  106. pixeltable/tests/utils.py +0 -442
  107. pixeltable/utils/clip.py +0 -18
  108. pixeltable-0.2.5.dist-info/METADATA +0 -128
  109. pixeltable-0.2.5.dist-info/RECORD +0 -139
  110. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
@@ -5,7 +5,8 @@ import importlib
5
5
  import inspect
6
6
  import logging
7
7
  import time
8
- from typing import Optional, List, Dict, Any, Tuple, Type, Set
8
+ from typing import Optional, List, Dict, Any, Tuple, Type, Set, Iterable
9
+ import uuid
9
10
  from uuid import UUID
10
11
 
11
12
  import sqlalchemy as sql
@@ -23,6 +24,7 @@ from pixeltable.utils.filecache import FileCache
23
24
  from pixeltable.utils.media_store import MediaStore
24
25
  from .column import Column
25
26
  from .globals import UpdateStatus, POS_COLUMN_NAME, is_valid_identifier
27
+ from ..func.globals import resolve_symbol
26
28
 
27
29
  _logger = logging.getLogger('pixeltable')
28
30
 
@@ -43,6 +45,7 @@ class TableVersion:
43
45
  @dataclasses.dataclass
44
46
  class IndexInfo:
45
47
  id: int
48
+ name: str
46
49
  idx: index.IndexBase
47
50
  col: Column
48
51
  val_col: Column
@@ -86,6 +89,8 @@ class TableVersion:
86
89
  self.next_idx_id = tbl_md.next_idx_id
87
90
  self.next_rowid = tbl_md.next_row_id
88
91
 
92
+ self.remotes = dict(TableVersion._init_remote(remote_md) for remote_md in tbl_md.remotes)
93
+
89
94
  # view-specific initialization
90
95
  from pixeltable import exprs
91
96
  predicate_dict = None if not is_view or tbl_md.view_md.predicate is None else tbl_md.view_md.predicate
@@ -114,9 +119,9 @@ class TableVersion:
114
119
  cat.tbl_versions[(self.id, self.effective_version)] = self
115
120
 
116
121
  # init schema after we determined whether we're a component view, and before we create the store table
117
- self.cols: List[Column] = [] # contains complete history of columns, incl dropped ones
122
+ self.cols: list[Column] = [] # contains complete history of columns, incl dropped ones
118
123
  self.cols_by_name: dict[str, Column] = {} # contains only user-facing (named) columns visible in this version
119
- self.cols_by_id: dict[int, Column] = {} # contains only columns visible in this version
124
+ self.cols_by_id: dict[int, Column] = {} # contains only columns visible in this version, both system and user
120
125
  self.idx_md = tbl_md.index_md # needed for _create_tbl_md()
121
126
  self.idxs_by_name: dict[str, TableVersion.IndexInfo] = {} # contains only actively maintained indices
122
127
  self._init_schema(tbl_md, schema_version_md)
@@ -149,23 +154,22 @@ class TableVersion:
149
154
  if col.is_computed:
150
155
  col.check_value_expr()
151
156
 
152
- ts = time.time()
157
+ timestamp = time.time()
153
158
  # create schema.Table
154
159
  # Column.dependent_cols for existing cols is wrong at this point, but init() will set it correctly
155
160
  column_md = cls._create_column_md(cols)
156
161
  table_md = schema.TableMd(
157
- name=name, current_version=0, current_schema_version=0,
158
- next_col_id=len(cols), next_idx_id=0, next_row_id=0, column_md=column_md, index_md={}, view_md=view_md)
159
- tbl_record = schema.Table(dir_id=dir_id, md=dataclasses.asdict(table_md))
160
- session.add(tbl_record)
161
- session.flush() # sets tbl_record.id
162
- assert tbl_record.id is not None
162
+ name=name, current_version=0, current_schema_version=0, next_col_id=len(cols),
163
+ next_idx_id=0, next_row_id=0, column_md=column_md, index_md={}, remotes=[], view_md=view_md)
164
+ # create a schema.Table here, we need it to call our c'tor;
165
+ # don't add it to the session yet, we might add index metadata
166
+ tbl_id = uuid.uuid4()
167
+ tbl_record = schema.Table(id=tbl_id, dir_id=dir_id, md=dataclasses.asdict(table_md))
163
168
 
164
169
  # create schema.TableVersion
165
- table_version_md = schema.TableVersionMd(created_at=ts, version=0, schema_version=0)
170
+ table_version_md = schema.TableVersionMd(created_at=timestamp, version=0, schema_version=0)
166
171
  tbl_version_record = schema.TableVersion(
167
172
  tbl_id=tbl_record.id, version=0, md=dataclasses.asdict(table_version_md))
168
- session.add(tbl_version_record)
169
173
 
170
174
  # create schema.TableSchemaVersion
171
175
  schema_col_md = {col.id: schema.SchemaColumn(pos=pos, name=col.name) for pos, col in enumerate(cols)}
@@ -175,19 +179,33 @@ class TableVersion:
175
179
  num_retained_versions=num_retained_versions, comment=comment)
176
180
  schema_version_record = schema.TableSchemaVersion(
177
181
  tbl_id=tbl_record.id, schema_version=0, md=dataclasses.asdict(schema_version_md))
178
- session.add(schema_version_record)
179
182
 
180
183
  # if this is purely a snapshot (it doesn't require any additional storage for columns and it # doesn't have a
181
184
  # predicate to apply at runtime), we don't create a physical table and simply use the base's table version path
182
185
  if view_md is not None and view_md.is_snapshot and view_md.predicate is None and len(cols) == 0:
186
+ session.add(tbl_record)
187
+ session.add(tbl_version_record)
188
+ session.add(schema_version_record)
183
189
  return tbl_record.id, None
184
190
 
185
191
  assert (base_path is not None) == (view_md is not None)
186
192
  base = base_path.tbl_version if base_path is not None and view_md.is_snapshot else None
187
193
  base_path = base_path if base_path is not None and not view_md.is_snapshot else None
188
194
  tbl_version = cls(tbl_record.id, table_md, 0, schema_version_md, base=base, base_path=base_path)
189
- tbl_version.store_tbl.create(session.connection())
190
- # TODO: create pgvector indices
195
+
196
+ conn = session.connection()
197
+ tbl_version.store_tbl.create(conn)
198
+ if view_md is None or not view_md.is_snapshot:
199
+ # add default indices, after creating the store table
200
+ for col in tbl_version.cols_by_name.values():
201
+ status = tbl_version._add_default_index(col, conn=conn)
202
+ assert status is None or status.num_excs == 0
203
+
204
+ # we re-create the tbl_record here, now that we have new index metadata
205
+ tbl_record = schema.Table(id=tbl_id, dir_id=dir_id, md=dataclasses.asdict(tbl_version._create_tbl_md()))
206
+ session.add(tbl_record)
207
+ session.add(tbl_version_record)
208
+ session.add(schema_version_record)
191
209
  return tbl_record.id, tbl_version
192
210
 
193
211
  @classmethod
@@ -251,6 +269,16 @@ class TableVersion:
251
269
  col.value_expr = exprs.Expr.from_dict(col_md.value_expr)
252
270
  self._record_value_expr(col)
253
271
 
272
+ # if this is a stored proxy column, resolve the relationships with its proxy base.
273
+ if col_md.proxy_base is not None:
274
+ # proxy_base must have a strictly smaller id, so we must already have encountered it
275
+ # in traversal order; and if the proxy column is active at this version, then the
276
+ # proxy base must necessarily be active as well. This motivates the following assertion.
277
+ assert col_md.proxy_base in self.cols_by_id
278
+ base_col = self.cols_by_id[col_md.proxy_base]
279
+ base_col.stored_proxy = col
280
+ col.proxy_base = base_col
281
+
254
282
  def _init_idxs(self, tbl_md: schema.TableMd) -> None:
255
283
  self.idx_md = tbl_md.index_md
256
284
  self.idxs_by_name = {}
@@ -258,7 +286,7 @@ class TableVersion:
258
286
  for md in tbl_md.index_md.values():
259
287
  if md.schema_version_add > self.schema_version \
260
288
  or md.schema_version_drop is not None and md.schema_version_drop <= self.schema_version:
261
- # column not visible in this schema version
289
+ # index not visible in this schema version
262
290
  continue
263
291
 
264
292
  # instantiate index object
@@ -270,9 +298,11 @@ class TableVersion:
270
298
  # fix up the sa column type of the index value and undo columns
271
299
  val_col = self.cols_by_id[md.index_val_col_id]
272
300
  val_col.sa_col_type = idx.index_sa_type()
301
+ val_col._records_errors = False
273
302
  undo_col = self.cols_by_id[md.index_val_undo_col_id]
274
303
  undo_col.sa_col_type = idx.index_sa_type()
275
- idx_info = self.IndexInfo(id=md.id, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
304
+ undo_col._records_errors = False
305
+ idx_info = self.IndexInfo(id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
276
306
  self.idxs_by_name[md.name] = idx_info
277
307
 
278
308
  def _init_sa_schema(self) -> None:
@@ -286,10 +316,12 @@ class TableVersion:
286
316
  else:
287
317
  self.store_tbl: StoreBase = StoreTable(self)
288
318
 
289
- def _update_md(self, ts: float, preceding_schema_version: Optional[int], conn: sql.engine.Connection) -> None:
319
+ def _update_md(
320
+ self, timestamp: float, preceding_schema_version: Optional[int], conn: sql.engine.Connection
321
+ ) -> None:
290
322
  """Update all recorded metadata in response to a data or schema change.
291
323
  Args:
292
- ts: timestamp of the change
324
+ timestamp: timestamp of the change
293
325
  preceding_schema_version: last schema version if schema change, else None
294
326
  """
295
327
  conn.execute(
@@ -297,7 +329,7 @@ class TableVersion:
297
329
  .values({schema.Table.md: dataclasses.asdict(self._create_tbl_md())})
298
330
  .where(schema.Table.id == self.id))
299
331
 
300
- version_md = self._create_version_md(ts)
332
+ version_md = self._create_version_md(timestamp)
301
333
  conn.execute(
302
334
  sql.insert(schema.TableVersion.__table__)
303
335
  .values(tbl_id=self.id, version=self.version, md=dataclasses.asdict(version_md)))
@@ -314,6 +346,33 @@ class TableVersion:
314
346
  return f'idx_{self.id.hex}_{idx_id}'
315
347
 
316
348
  def add_index(self, col: Column, idx_name: Optional[str], idx: index.IndexBase) -> UpdateStatus:
349
+ # we're creating a new schema version
350
+ self.version += 1
351
+ preceding_schema_version = self.schema_version
352
+ self.schema_version = self.version
353
+ with Env.get().engine.begin() as conn:
354
+ status = self._add_index(col, idx_name, idx, conn)
355
+ self._update_md(time.time(), preceding_schema_version, conn)
356
+ _logger.info(f'Added index {idx_name} on column {col.name} to table {self.name}')
357
+ return status
358
+
359
+ def _add_default_index(self, col: Column, conn: sql.engine.Connection) -> Optional[UpdateStatus]:
360
+ """Add a B-tree index on this column if it has a compatible type"""
361
+ if not col.stored:
362
+ # if the column is intentionally not stored, we want to avoid the overhead of an index
363
+ return None
364
+ if not col.col_type.is_scalar_type() and not (col.col_type.is_media_type() and not col.is_computed):
365
+ # wrong type for a B-tree
366
+ return None
367
+ if col.col_type.is_bool_type():
368
+ # B-trees on bools aren't useful
369
+ return None
370
+ status = self._add_index(col, idx_name=None, idx=index.BtreeIndex(col), conn=conn)
371
+ return status
372
+
373
+ def _add_index(
374
+ self, col: Column, idx_name: Optional[str], idx: index.IndexBase, conn: sql.engine.Connection
375
+ ) -> UpdateStatus:
317
376
  assert not self.is_snapshot
318
377
  idx_id = self.next_idx_id
319
378
  self.next_idx_id += 1
@@ -323,46 +382,41 @@ class TableVersion:
323
382
  assert is_valid_identifier(idx_name)
324
383
  assert idx_name not in [i.name for i in self.idx_md.values()]
325
384
 
326
- # we're creating a new schema version
327
- self.version += 1
328
- preceding_schema_version = self.schema_version
329
- self.schema_version = self.version
330
- with Env.get().engine.begin() as conn:
331
- # add the index value and undo columns (which need to be nullable);
332
- # we don't create a new schema version, because indices aren't part of the logical schema
333
- val_col = Column(
334
- col_id=self.next_col_id, name=None, computed_with=idx.index_value_expr(),
335
- sa_col_type=idx.index_sa_type(), stored=True,
336
- schema_version_add=self.schema_version, schema_version_drop=None)
337
- val_col.tbl = self
338
- val_col.col_type.nullable = True
339
- self.next_col_id += 1
340
-
341
- undo_col = Column(
342
- col_id=self.next_col_id, name=None, col_type=val_col.col_type,
343
- sa_col_type=val_col.sa_col_type, stored=True,
344
- schema_version_add=self.schema_version, schema_version_drop=None)
345
- undo_col.tbl = self
346
- undo_col.col_type.nullable = True
347
- self.next_col_id += 1
348
-
349
- # create and register the index metadata
350
- idx_cls = type(idx)
351
- idx_md = schema.IndexMd(
352
- id=idx_id, name=idx_name,
353
- indexed_col_id=col.id, index_val_col_id=val_col.id, index_val_undo_col_id=undo_col.id,
354
- schema_version_add=self.schema_version, schema_version_drop=None,
355
- class_fqn=idx_cls.__module__ + '.' + idx_cls.__name__, init_args=idx.as_dict())
356
- idx_info = self.IndexInfo(id=idx_id, idx=idx, col=col, val_col=val_col, undo_col=undo_col)
357
- self.idx_md[idx_id] = idx_md
358
- self.idxs_by_name[idx_name] = idx_info
359
-
360
- # add the columns and update the metadata
361
- status = self._add_columns([val_col, undo_col], conn, preceding_schema_version=preceding_schema_version)
362
- # now create the index structure
363
- idx.create_index(self._store_idx_name(idx_id), val_col, conn)
364
-
365
- _logger.info(f'Added index {idx_name} on column {col.name} to table {self.name}')
385
+ # add the index value and undo columns (which need to be nullable)
386
+ val_col = Column(
387
+ col_id=self.next_col_id, name=None, computed_with=idx.index_value_expr(),
388
+ sa_col_type=idx.index_sa_type(), stored=True,
389
+ schema_version_add=self.schema_version, schema_version_drop=None,
390
+ records_errors=idx.records_value_errors())
391
+ val_col.tbl = self
392
+ val_col.col_type = val_col.col_type.copy(nullable=True)
393
+ self.next_col_id += 1
394
+
395
+ undo_col = Column(
396
+ col_id=self.next_col_id, name=None, col_type=val_col.col_type,
397
+ sa_col_type=val_col.sa_col_type, stored=True,
398
+ schema_version_add=self.schema_version, schema_version_drop=None,
399
+ records_errors=False)
400
+ undo_col.tbl = self
401
+ undo_col.col_type = undo_col.col_type.copy(nullable=True)
402
+ self.next_col_id += 1
403
+
404
+ # create and register the index metadata
405
+ idx_cls = type(idx)
406
+ idx_md = schema.IndexMd(
407
+ id=idx_id, name=idx_name,
408
+ indexed_col_id=col.id, index_val_col_id=val_col.id, index_val_undo_col_id=undo_col.id,
409
+ schema_version_add=self.schema_version, schema_version_drop=None,
410
+ class_fqn=idx_cls.__module__ + '.' + idx_cls.__name__, init_args=idx.as_dict())
411
+ idx_info = self.IndexInfo(id=idx_id, name=idx_name, idx=idx, col=col, val_col=val_col, undo_col=undo_col)
412
+ self.idx_md[idx_id] = idx_md
413
+ self.idxs_by_name[idx_name] = idx_info
414
+
415
+ # add the columns and update the metadata
416
+ status = self._add_columns([val_col, undo_col], conn)
417
+ # now create the index structure
418
+ idx.create_index(self._store_idx_name(idx_id), val_col, conn)
419
+
366
420
  return status
367
421
 
368
422
  def drop_index(self, idx_id: int) -> None:
@@ -380,7 +434,8 @@ class TableVersion:
380
434
  del self.idxs_by_name[idx_md.name]
381
435
 
382
436
  with Env.get().engine.begin() as conn:
383
- self._drop_columns([idx_info.val_col, idx_info.undo_col], conn, preceding_schema_version)
437
+ self._drop_columns([idx_info.val_col, idx_info.undo_col])
438
+ self._update_md(time.time(), preceding_schema_version, conn)
384
439
  _logger.info(f'Dropped index {idx_md.name} on table {self.name}')
385
440
 
386
441
  def add_column(self, col: Column, print_stats: bool = False) -> UpdateStatus:
@@ -397,16 +452,16 @@ class TableVersion:
397
452
  if col.compute_func is not None:
398
453
  # create value_expr from compute_func
399
454
  self._create_value_expr(col, self.path)
400
- if col.value_expr is not None:
401
- col.check_value_expr()
402
- self._record_value_expr(col)
403
455
 
404
456
  # we're creating a new schema version
405
457
  self.version += 1
406
458
  preceding_schema_version = self.schema_version
407
459
  self.schema_version = self.version
408
460
  with Env.get().engine.begin() as conn:
409
- status = self._add_columns([col], conn, preceding_schema_version, print_stats=print_stats)
461
+ status = self._add_columns([col], conn, print_stats=print_stats)
462
+ _ = self._add_default_index(col, conn)
463
+ # TODO: what to do about errors?
464
+ self._update_md(time.time(), preceding_schema_version, conn)
410
465
  _logger.info(f'Added column {col.name} to table {self.name}, new version: {self.version}')
411
466
 
412
467
  msg = (
@@ -417,13 +472,8 @@ class TableVersion:
417
472
  _logger.info(f'Column {col.name}: {msg}')
418
473
  return status
419
474
 
420
- def _add_columns(
421
- self, cols: List[Column], conn: sql.engine.Connection, preceding_schema_version: Optional[int] = None,
422
- print_stats: bool = False
423
- ) -> UpdateStatus:
475
+ def _add_columns(self, cols: List[Column], conn: sql.engine.Connection, print_stats: bool = False) -> UpdateStatus:
424
476
  """Add and populate columns within the current transaction"""
425
- ts = time.time()
426
-
427
477
  row_count = self.store_tbl.count(conn=conn)
428
478
  for col in cols:
429
479
  if not col.col_type.nullable and not col.is_computed:
@@ -441,6 +491,9 @@ class TableVersion:
441
491
  if col.name is not None:
442
492
  self.cols_by_name[col.name] = col
443
493
  self.cols_by_id[col.id] = col
494
+ if col.value_expr is not None:
495
+ col.check_value_expr()
496
+ self._record_value_expr(col)
444
497
 
445
498
  if col.is_stored:
446
499
  self.store_tbl.add_column(col, conn)
@@ -474,7 +527,6 @@ class TableVersion:
474
527
  finally:
475
528
  plan.close()
476
529
 
477
- self._update_md(ts, preceding_schema_version, conn)
478
530
  if print_stats:
479
531
  plan.ctx.profile.print(num_rows=row_count)
480
532
  # TODO(mkornacker): what to do about system columns with exceptions?
@@ -492,8 +544,16 @@ class TableVersion:
492
544
  dependent_user_cols = [c for c in col.dependent_cols if c.name is not None]
493
545
  if len(dependent_user_cols) > 0:
494
546
  raise excs.Error(
495
- f'Cannot drop column {name} because the following columns depend on it:\n',
496
- f'{", ".join([c.name for c in dependent_user_cols])}')
547
+ f'Cannot drop column `{name}` because the following columns depend on it:\n'
548
+ f'{", ".join(c.name for c in dependent_user_cols)}'
549
+ )
550
+ dependent_remotes = [remote for remote, col_mapping in self.remotes.items() if name in col_mapping]
551
+ if len(dependent_remotes) > 0:
552
+ raise excs.Error(
553
+ f'Cannot drop column `{name}` because the following remotes depend on it:\n'
554
+ f'{", ".join(str(r) for r in dependent_remotes)}'
555
+ )
556
+ assert col.stored_proxy is None # since there are no dependent remotes
497
557
 
498
558
  # we're creating a new schema version
499
559
  self.version += 1
@@ -515,14 +575,14 @@ class TableVersion:
515
575
  # update idxs_by_name
516
576
  for idx_name in dropped_idx_names:
517
577
  del self.idxs_by_name[idx_name]
518
- self._drop_columns(dropped_cols, conn, preceding_schema_version)
578
+ self._drop_columns(dropped_cols)
579
+ self._update_md(time.time(), preceding_schema_version, conn)
519
580
  _logger.info(f'Dropped column {name} from table {self.name}, new version: {self.version}')
520
581
 
521
- def _drop_columns(self, cols: list[Column], conn: sql.engine.Connection, preceding_schema_version: int) -> None:
582
+ def _drop_columns(self, cols: list[Column]) -> None:
522
583
  """Mark columns as dropped"""
523
584
  assert not self.is_snapshot
524
585
 
525
- ts = time.time()
526
586
  for col in cols:
527
587
  if col.value_expr is not None:
528
588
  # update Column.dependent_cols
@@ -538,7 +598,6 @@ class TableVersion:
538
598
  assert col.id in self.cols_by_id
539
599
  del self.cols_by_id[col.id]
540
600
 
541
- self._update_md(ts, preceding_schema_version, conn)
542
601
  self.store_tbl.create_sa_tbl()
543
602
 
544
603
  def rename_column(self, old_name: str, new_name: str) -> None:
@@ -557,13 +616,12 @@ class TableVersion:
557
616
  self.cols_by_name[new_name] = col
558
617
 
559
618
  # we're creating a new schema version
560
- ts = time.time()
561
619
  self.version += 1
562
620
  preceding_schema_version = self.schema_version
563
621
  self.schema_version = self.version
564
622
 
565
623
  with Env.get().engine.begin() as conn:
566
- self._update_md(ts, preceding_schema_version, conn)
624
+ self._update_md(time.time(), preceding_schema_version, conn)
567
625
  _logger.info(f'Renamed column {old_name} to {new_name} in table {self.name}, new version: {self.version}')
568
626
 
569
627
  def set_comment(self, new_comment: Optional[str]):
@@ -578,12 +636,11 @@ class TableVersion:
578
636
 
579
637
  def _create_schema_version(self):
580
638
  # we're creating a new schema version
581
- ts = time.time()
582
639
  self.version += 1
583
640
  preceding_schema_version = self.schema_version
584
641
  self.schema_version = self.version
585
642
  with Env.get().engine.begin() as conn:
586
- self._update_md(ts, preceding_schema_version, conn)
643
+ self._update_md(time.time(), preceding_schema_version, conn)
587
644
  _logger.info(f'[{self.name}] Updating table schema to version: {self.version}')
588
645
 
589
646
  def insert(
@@ -594,12 +651,11 @@ class TableVersion:
594
651
  assert self.is_insertable()
595
652
  from pixeltable.plan import Planner
596
653
  plan = Planner.create_insert_plan(self, rows, ignore_errors=not fail_on_exception)
597
- ts = time.time()
598
654
  with Env.get().engine.begin() as conn:
599
- return self._insert(plan, conn, ts, print_stats)
655
+ return self._insert(plan, conn, time.time(), print_stats)
600
656
 
601
657
  def _insert(
602
- self, exec_plan: exec.ExecNode, conn: sql.engine.Connection, ts: float, print_stats: bool = False,
658
+ self, exec_plan: exec.ExecNode, conn: sql.engine.Connection, timestamp: float, print_stats: bool = False,
603
659
  ) -> UpdateStatus:
604
660
  """Insert rows produced by exec_plan and propagate to views"""
605
661
  # we're creating a new version
@@ -611,13 +667,13 @@ class TableVersion:
611
667
  result.num_excs = num_excs
612
668
  result.num_computed_values += exec_plan.ctx.num_computed_exprs * num_rows
613
669
  result.cols_with_excs = [f'{self.name}.{self.cols_by_id[cid].name}' for cid in cols_with_excs]
614
- self._update_md(ts, None, conn)
670
+ self._update_md(timestamp, None, conn)
615
671
 
616
672
  # update views
617
673
  for view in self.mutable_views:
618
674
  from pixeltable.plan import Planner
619
675
  plan, _ = Planner.create_view_load_plan(view.path, propagates_insert=True)
620
- status = view._insert(plan, conn, ts, print_stats)
676
+ status = view._insert(plan, conn, timestamp, print_stats)
621
677
  result.num_rows += status.num_rows
622
678
  result.num_excs += status.num_excs
623
679
  result.num_computed_values += status.num_computed_values
@@ -661,7 +717,7 @@ class TableVersion:
661
717
  # construct Where clause to match rowid
662
718
  num_rowid_cols = len(self.store_tbl.rowid_columns())
663
719
  for col_idx in range(num_rowid_cols):
664
- assert len(rowids[i]) == num_rowid_cols
720
+ assert len(rowids[i]) == num_rowid_cols, f'len({rowids[i]}) != {num_rowid_cols}'
665
721
  clause = exprs.RowidRef(self, col_idx) == rowids[i][col_idx]
666
722
  if where_clause is None:
667
723
  where_clause = clause
@@ -678,7 +734,7 @@ class TableVersion:
678
734
  where_clause = where_clause & clause
679
735
 
680
736
  update_targets = {col: row[col] for col in row if col not in pk_cols}
681
- status = self._update(conn, update_targets, where_clause, cascade)
737
+ status = self._update(conn, update_targets, where_clause, cascade, show_progress=False)
682
738
  result_status.num_rows += status.num_rows
683
739
  result_status.num_excs += status.num_excs
684
740
  result_status.num_computed_values += status.num_computed_values
@@ -691,7 +747,8 @@ class TableVersion:
691
747
 
692
748
  def _update(
693
749
  self, conn: sql.engine.Connection, update_targets: dict[Column, 'pixeltable.exprs.Expr'],
694
- where_clause: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
750
+ where_clause: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True,
751
+ show_progress: bool = True
695
752
  ) -> UpdateStatus:
696
753
  """Update rows in this table.
697
754
  Args:
@@ -704,28 +761,27 @@ class TableVersion:
704
761
  from pixeltable.plan import Planner
705
762
  plan, updated_cols, recomputed_cols = \
706
763
  Planner.create_update_plan(self.path, update_targets, [], where_clause, cascade)
707
- ts = time.time()
708
764
  result = self._propagate_update(
709
765
  plan, where_clause.sql_expr() if where_clause is not None else None, recomputed_cols,
710
- base_versions=[], conn=conn, ts=ts, cascade=cascade)
766
+ base_versions=[], conn=conn, timestamp=time.time(), cascade=cascade, show_progress=show_progress)
711
767
  result.updated_cols = updated_cols
712
768
  return result
713
769
 
714
770
  def _propagate_update(
715
771
  self, plan: Optional[exec.ExecNode], where_clause: Optional[sql.ClauseElement],
716
772
  recomputed_view_cols: List[Column], base_versions: List[Optional[int]], conn: sql.engine.Connection,
717
- ts: float, cascade: bool
773
+ timestamp: float, cascade: bool, show_progress: bool = True
718
774
  ) -> UpdateStatus:
719
775
  result = UpdateStatus()
720
776
  if plan is not None:
721
777
  # we're creating a new version
722
778
  self.version += 1
723
779
  result.num_rows, result.num_excs, cols_with_excs = \
724
- self.store_tbl.insert_rows(plan, conn, v_min=self.version)
780
+ self.store_tbl.insert_rows(plan, conn, v_min=self.version, show_progress=show_progress)
725
781
  result.cols_with_excs = [f'{self.name}.{self.cols_by_id[cid].name}' for cid in cols_with_excs]
726
782
  self.store_tbl.delete_rows(
727
783
  self.version, base_versions=base_versions, match_on_vmin=True, where_clause=where_clause, conn=conn)
728
- self._update_md(ts, None, conn)
784
+ self._update_md(timestamp, None, conn)
729
785
 
730
786
  if cascade:
731
787
  base_versions = [None if plan is None else self.version] + base_versions # don't update in place
@@ -737,7 +793,7 @@ class TableVersion:
737
793
  from pixeltable.plan import Planner
738
794
  plan = Planner.create_view_update_plan(view.path, recompute_targets=recomputed_cols)
739
795
  status = view._propagate_update(
740
- plan, None, recomputed_view_cols, base_versions=base_versions, conn=conn, ts=ts, cascade=True)
796
+ plan, None, recomputed_view_cols, base_versions=base_versions, conn=conn, timestamp=timestamp, cascade=True)
741
797
  result.num_rows += status.num_rows
742
798
  result.num_excs += status.num_excs
743
799
  result.cols_with_excs += status.cols_with_excs
@@ -753,16 +809,15 @@ class TableVersion:
753
809
  assert self.is_insertable()
754
810
  from pixeltable.plan import Planner
755
811
  analysis_info = Planner.analyze(self, where)
756
- ts = time.time()
757
812
  with Env.get().engine.begin() as conn:
758
- num_rows = self._delete(analysis_info.sql_where_clause, base_versions=[], conn=conn, ts=ts)
813
+ num_rows = self._delete(analysis_info.sql_where_clause, base_versions=[], conn=conn, timestamp=time.time())
759
814
 
760
815
  status = UpdateStatus(num_rows=num_rows)
761
816
  return status
762
817
 
763
818
  def _delete(
764
819
  self, where: Optional['pixeltable.exprs.Predicate'], base_versions: List[Optional[int]],
765
- conn: sql.engine.Connection, ts: float) -> int:
820
+ conn: sql.engine.Connection, timestamp: float) -> int:
766
821
  """Delete rows in this table and propagate to views.
767
822
  Args:
768
823
  where: a Predicate to filter rows to delete.
@@ -776,11 +831,12 @@ class TableVersion:
776
831
  if num_rows > 0:
777
832
  # we're creating a new version
778
833
  self.version += 1
779
- self._update_md(ts, None, conn)
834
+ self._update_md(timestamp, None, conn)
780
835
  else:
781
836
  pass
782
837
  for view in self.mutable_views:
783
- num_rows += view._delete(where=None, base_versions=[self.version] + base_versions, conn=conn, ts=ts)
838
+ num_rows += view._delete(
839
+ where=None, base_versions=[self.version] + base_versions, conn=conn, timestamp=timestamp)
784
840
  return num_rows
785
841
 
786
842
  def revert(self) -> None:
@@ -906,6 +962,94 @@ class TableVersion:
906
962
  view._revert(session)
907
963
  _logger.info(f'TableVersion {self.name}: reverted to version {self.version}')
908
964
 
965
+ @classmethod
966
+ def _init_remote(cls, remote_md: dict[str, Any]) -> Tuple[pixeltable.datatransfer.Remote, dict[str, str]]:
967
+ remote_cls = resolve_symbol(remote_md['class'])
968
+ assert isinstance(remote_cls, type) and issubclass(remote_cls, pixeltable.datatransfer.Remote)
969
+ remote = remote_cls.from_dict(remote_md['remote_md'])
970
+ col_mapping = remote_md['col_mapping']
971
+ return remote, col_mapping
972
+
973
+ def link(self, remote: pixeltable.datatransfer.Remote, col_mapping: dict[str, str]) -> None:
974
+ # All of the media columns being linked need to either be stored, computed columns or have stored proxies.
975
+ # This ensures that the media in those columns resides in the media cache, where it can be served.
976
+ # First determine which columns (if any) need stored proxies, but don't have one yet.
977
+ cols_by_name = self.path.cols_by_name() # Includes base columns
978
+ stored_proxies_needed = []
979
+ for col_name in col_mapping.keys():
980
+ col = cols_by_name[col_name]
981
+ if col.col_type.is_media_type() and not (col.is_stored and col.compute_func) and not col.stored_proxy:
982
+ stored_proxies_needed.append(col)
983
+ with Env.get().engine.begin() as conn:
984
+ self.version += 1
985
+ self.remotes[remote] = col_mapping
986
+ preceding_schema_version = None
987
+ if len(stored_proxies_needed) > 0:
988
+ _logger.info(f'Creating stored proxies for columns: {[col.name for col in stored_proxies_needed]}')
989
+ # Create stored proxies for columns that need one. Increment the schema version
990
+ # accordingly.
991
+ preceding_schema_version = self.schema_version
992
+ self.schema_version = self.version
993
+ proxy_cols = [self.create_stored_proxy(col) for col in stored_proxies_needed]
994
+ # Add the columns; this will also update table metadata.
995
+ # TODO Add to base tables
996
+ self._add_columns(proxy_cols, conn)
997
+ # We don't need to retain `UpdateStatus` since the stored proxies are intended to be
998
+ # invisible to the user.
999
+ self._update_md(time.time(), preceding_schema_version, conn)
1000
+
1001
+ def create_stored_proxy(self, col: Column) -> Column:
1002
+ from pixeltable import exprs
1003
+
1004
+ assert col.col_type.is_media_type() and not (col.is_stored and col.compute_func) and not col.stored_proxy
1005
+ proxy_col = Column(
1006
+ name=None,
1007
+ computed_with=exprs.ColumnRef(col).apply(lambda x: x, col_type=col.col_type),
1008
+ stored=True,
1009
+ col_id=self.next_col_id,
1010
+ sa_col_type=col.col_type.to_sa_type(),
1011
+ schema_version_add=self.schema_version
1012
+ )
1013
+ proxy_col.tbl = self
1014
+ self.next_col_id += 1
1015
+ col.stored_proxy = proxy_col
1016
+ proxy_col.proxy_base = col
1017
+ return proxy_col
1018
+
1019
+ def unlink(self, remote: pixeltable.datatransfer.Remote) -> None:
1020
+ assert remote in self.remotes
1021
+ timestamp = time.time()
1022
+ this_remote_col_names = list(self.remotes[remote].keys())
1023
+ other_remote_col_names = {
1024
+ col_name
1025
+ for other_remote, col_mapping in self.remotes.items() if other_remote != remote
1026
+ for col_name in col_mapping.keys()
1027
+ }
1028
+ cols_by_name = self.path.cols_by_name() # Includes base columns
1029
+ stored_proxy_deletions_needed = [
1030
+ cols_by_name[col_name]
1031
+ for col_name in this_remote_col_names
1032
+ if col_name not in other_remote_col_names and cols_by_name[col_name].stored_proxy
1033
+ ]
1034
+ with Env.get().engine.begin() as conn:
1035
+ self.version += 1
1036
+ del self.remotes[remote]
1037
+ preceding_schema_version = None
1038
+ if len(stored_proxy_deletions_needed) > 0:
1039
+ preceding_schema_version = self.schema_version
1040
+ self.schema_version = self.version
1041
+ proxy_cols = [col.stored_proxy for col in stored_proxy_deletions_needed]
1042
+ for col in stored_proxy_deletions_needed:
1043
+ assert col.stored_proxy is not None and col.stored_proxy.proxy_base == col
1044
+ col.stored_proxy.proxy_base = None
1045
+ col.stored_proxy = None
1046
+ # TODO Drop from base tables
1047
+ self._drop_columns(proxy_cols)
1048
+ self._update_md(timestamp, preceding_schema_version, conn)
1049
+
1050
+ def get_remotes(self) -> dict[pixeltable.datatransfer.Remote, dict[str, str]]:
1051
+ return self.remotes
1052
+
909
1053
  def is_view(self) -> bool:
910
1054
  return self.base is not None
911
1055
 
@@ -938,16 +1082,16 @@ class TableVersion:
938
1082
  def get_required_col_names(self) -> List[str]:
939
1083
  """Return the names of all columns for which values must be specified in insert()"""
940
1084
  assert not self.is_view()
941
- names = [c.name for c in self.cols if not c.is_computed and not c.col_type.nullable]
1085
+ names = [c.name for c in self.cols_by_name.values() if not c.is_computed and not c.col_type.nullable]
942
1086
  return names
943
1087
 
944
1088
  def get_computed_col_names(self) -> List[str]:
945
1089
  """Return the names of all computed columns"""
946
- names = [c.name for c in self.cols if c.is_computed]
1090
+ names = [c.name for c in self.cols_by_name.values() if c.is_computed]
947
1091
  return names
948
1092
 
949
1093
  @classmethod
950
- def _create_value_expr(cls, col: Column, path: 'TableVersionPath') -> None:
1094
+ def _create_value_expr(cls, col: Column, path: 'pixeltable.catalog.TableVersionPath') -> None:
951
1095
  """
952
1096
  Create col.value_expr, given col.compute_func.
953
1097
  Interprets compute_func's parameters to be references to columns and construct ColumnRefs as args.
@@ -977,13 +1121,17 @@ class TableVersion:
977
1121
  for refd_col in refd_cols:
978
1122
  refd_col.dependent_cols.add(col)
979
1123
 
980
- def get_dependent_columns(self, cols: List[Column]) -> Set[Column]:
1124
+ def get_idx_val_columns(self, cols: Iterable[Column]) -> set[Column]:
1125
+ result = {info.val_col for col in cols for info in col.get_idx_info().values()}
1126
+ return result
1127
+
1128
+ def get_dependent_columns(self, cols: list[Column]) -> set[Column]:
981
1129
  """
982
1130
  Return the set of columns that transitively depend on any of the given ones.
983
1131
  """
984
1132
  if len(cols) == 0:
985
- return []
986
- result: Set[Column] = set()
1133
+ return set()
1134
+ result: set[Column] = set()
987
1135
  for col in cols:
988
1136
  result.update(col.dependent_cols)
989
1137
  result.update(self.get_dependent_columns(result))
@@ -1003,17 +1151,30 @@ class TableVersion:
1003
1151
  column_md[col.id] = schema.ColumnMd(
1004
1152
  id=col.id, col_type=col.col_type.as_dict(), is_pk=col.is_pk,
1005
1153
  schema_version_add=col.schema_version_add, schema_version_drop=col.schema_version_drop,
1006
- value_expr=value_expr_dict, stored=col.stored)
1154
+ value_expr=value_expr_dict, stored=col.stored,
1155
+ proxy_base=col.proxy_base.id if col.proxy_base else None)
1007
1156
  return column_md
1008
1157
 
1158
+ @classmethod
1159
+ def _create_remotes_md(cls, remotes: dict['pixeltable.datatransfer.Remote', dict[str, str]]) -> list[dict[str, Any]]:
1160
+ return [
1161
+ {
1162
+ 'class': f'{type(remote).__module__}.{type(remote).__qualname__}',
1163
+ 'remote_md': remote.to_dict(),
1164
+ 'col_mapping': col_mapping
1165
+ }
1166
+ for remote, col_mapping in remotes.items()
1167
+ ]
1168
+
1009
1169
  def _create_tbl_md(self) -> schema.TableMd:
1010
1170
  return schema.TableMd(
1011
1171
  name=self.name, current_version=self.version, current_schema_version=self.schema_version,
1012
1172
  next_col_id=self.next_col_id, next_idx_id=self.next_idx_id, next_row_id=self.next_rowid,
1013
- column_md=self._create_column_md(self.cols), index_md=self.idx_md, view_md=self.view_md)
1173
+ column_md=self._create_column_md(self.cols), index_md=self.idx_md,
1174
+ remotes=self._create_remotes_md(self.remotes), view_md=self.view_md)
1014
1175
 
1015
- def _create_version_md(self, ts: float) -> schema.TableVersionMd:
1016
- return schema.TableVersionMd(created_at=ts, version=self.version, schema_version=self.schema_version)
1176
+ def _create_version_md(self, timestamp: float) -> schema.TableVersionMd:
1177
+ return schema.TableVersionMd(created_at=timestamp, version=self.version, schema_version=self.schema_version)
1017
1178
 
1018
1179
  def _create_schema_version_md(self, preceding_schema_version: int) -> schema.TableSchemaVersionMd:
1019
1180
  column_md: Dict[int, schema.SchemaColumn] = {}