pixeltable 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. See the package registry listing for more details.

Files changed (56):
  1. pixeltable/__init__.py +3 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/column.py +14 -2
  4. pixeltable/catalog/insertable_table.py +32 -17
  5. pixeltable/catalog/table.py +194 -12
  6. pixeltable/catalog/table_version.py +270 -110
  7. pixeltable/catalog/table_version_path.py +6 -1
  8. pixeltable/datatransfer/__init__.py +1 -0
  9. pixeltable/datatransfer/label_studio.py +526 -0
  10. pixeltable/datatransfer/remote.py +113 -0
  11. pixeltable/env.py +156 -73
  12. pixeltable/exprs/column_ref.py +2 -2
  13. pixeltable/exprs/comparison.py +39 -1
  14. pixeltable/exprs/data_row.py +7 -0
  15. pixeltable/exprs/expr.py +11 -12
  16. pixeltable/exprs/function_call.py +0 -3
  17. pixeltable/exprs/globals.py +14 -2
  18. pixeltable/exprs/similarity_expr.py +5 -3
  19. pixeltable/ext/functions/whisperx.py +30 -0
  20. pixeltable/ext/functions/yolox.py +16 -0
  21. pixeltable/func/aggregate_function.py +2 -2
  22. pixeltable/func/expr_template_function.py +3 -1
  23. pixeltable/func/udf.py +2 -2
  24. pixeltable/functions/fireworks.py +9 -4
  25. pixeltable/functions/huggingface.py +25 -1
  26. pixeltable/functions/openai.py +15 -10
  27. pixeltable/functions/together.py +11 -6
  28. pixeltable/functions/util.py +0 -43
  29. pixeltable/functions/video.py +46 -8
  30. pixeltable/globals.py +20 -2
  31. pixeltable/index/__init__.py +1 -0
  32. pixeltable/index/base.py +6 -1
  33. pixeltable/index/btree.py +54 -0
  34. pixeltable/index/embedding_index.py +4 -1
  35. pixeltable/io/__init__.py +1 -0
  36. pixeltable/io/globals.py +59 -0
  37. pixeltable/iterators/base.py +4 -4
  38. pixeltable/iterators/document.py +26 -15
  39. pixeltable/iterators/video.py +9 -1
  40. pixeltable/metadata/__init__.py +2 -2
  41. pixeltable/metadata/converters/convert_14.py +13 -0
  42. pixeltable/metadata/converters/convert_15.py +29 -0
  43. pixeltable/metadata/converters/util.py +63 -0
  44. pixeltable/metadata/schema.py +12 -6
  45. pixeltable/plan.py +9 -5
  46. pixeltable/store.py +14 -21
  47. pixeltable/tool/create_test_db_dump.py +16 -0
  48. pixeltable/type_system.py +14 -4
  49. pixeltable/utils/coco.py +94 -0
  50. pixeltable-0.2.7.dist-info/METADATA +137 -0
  51. {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/RECORD +53 -46
  52. pixeltable/func/nos_function.py +0 -202
  53. pixeltable/utils/clip.py +0 -18
  54. pixeltable-0.2.6.dist-info/METADATA +0 -131
  55. {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
  56. {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +0 -0
@@ -5,7 +5,8 @@ import importlib
5
5
  import inspect
6
6
  import logging
7
7
  import time
8
- from typing import Optional, List, Dict, Any, Tuple, Type, Set
8
+ from typing import Optional, List, Dict, Any, Tuple, Type, Set, Iterable
9
+ import uuid
9
10
  from uuid import UUID
10
11
 
11
12
  import sqlalchemy as sql
@@ -23,6 +24,7 @@ from pixeltable.utils.filecache import FileCache
23
24
  from pixeltable.utils.media_store import MediaStore
24
25
  from .column import Column
25
26
  from .globals import UpdateStatus, POS_COLUMN_NAME, is_valid_identifier
27
+ from ..func.globals import resolve_symbol
26
28
 
27
29
  _logger = logging.getLogger('pixeltable')
28
30
 
@@ -87,6 +89,8 @@ class TableVersion:
87
89
  self.next_idx_id = tbl_md.next_idx_id
88
90
  self.next_rowid = tbl_md.next_row_id
89
91
 
92
+ self.remotes = dict(TableVersion._init_remote(remote_md) for remote_md in tbl_md.remotes)
93
+
90
94
  # view-specific initialization
91
95
  from pixeltable import exprs
92
96
  predicate_dict = None if not is_view or tbl_md.view_md.predicate is None else tbl_md.view_md.predicate
@@ -115,9 +119,9 @@ class TableVersion:
115
119
  cat.tbl_versions[(self.id, self.effective_version)] = self
116
120
 
117
121
  # init schema after we determined whether we're a component view, and before we create the store table
118
- self.cols: List[Column] = [] # contains complete history of columns, incl dropped ones
122
+ self.cols: list[Column] = [] # contains complete history of columns, incl dropped ones
119
123
  self.cols_by_name: dict[str, Column] = {} # contains only user-facing (named) columns visible in this version
120
- self.cols_by_id: dict[int, Column] = {} # contains only columns visible in this version
124
+ self.cols_by_id: dict[int, Column] = {} # contains only columns visible in this version, both system and user
121
125
  self.idx_md = tbl_md.index_md # needed for _create_tbl_md()
122
126
  self.idxs_by_name: dict[str, TableVersion.IndexInfo] = {} # contains only actively maintained indices
123
127
  self._init_schema(tbl_md, schema_version_md)
@@ -150,23 +154,22 @@ class TableVersion:
150
154
  if col.is_computed:
151
155
  col.check_value_expr()
152
156
 
153
- ts = time.time()
157
+ timestamp = time.time()
154
158
  # create schema.Table
155
159
  # Column.dependent_cols for existing cols is wrong at this point, but init() will set it correctly
156
160
  column_md = cls._create_column_md(cols)
157
161
  table_md = schema.TableMd(
158
- name=name, current_version=0, current_schema_version=0,
159
- next_col_id=len(cols), next_idx_id=0, next_row_id=0, column_md=column_md, index_md={}, view_md=view_md)
160
- tbl_record = schema.Table(dir_id=dir_id, md=dataclasses.asdict(table_md))
161
- session.add(tbl_record)
162
- session.flush() # sets tbl_record.id
163
- assert tbl_record.id is not None
162
+ name=name, current_version=0, current_schema_version=0, next_col_id=len(cols),
163
+ next_idx_id=0, next_row_id=0, column_md=column_md, index_md={}, remotes=[], view_md=view_md)
164
+ # create a schema.Table here, we need it to call our c'tor;
165
+ # don't add it to the session yet, we might add index metadata
166
+ tbl_id = uuid.uuid4()
167
+ tbl_record = schema.Table(id=tbl_id, dir_id=dir_id, md=dataclasses.asdict(table_md))
164
168
 
165
169
  # create schema.TableVersion
166
- table_version_md = schema.TableVersionMd(created_at=ts, version=0, schema_version=0)
170
+ table_version_md = schema.TableVersionMd(created_at=timestamp, version=0, schema_version=0)
167
171
  tbl_version_record = schema.TableVersion(
168
172
  tbl_id=tbl_record.id, version=0, md=dataclasses.asdict(table_version_md))
169
- session.add(tbl_version_record)
170
173
 
171
174
  # create schema.TableSchemaVersion
172
175
  schema_col_md = {col.id: schema.SchemaColumn(pos=pos, name=col.name) for pos, col in enumerate(cols)}
@@ -176,19 +179,33 @@ class TableVersion:
176
179
  num_retained_versions=num_retained_versions, comment=comment)
177
180
  schema_version_record = schema.TableSchemaVersion(
178
181
  tbl_id=tbl_record.id, schema_version=0, md=dataclasses.asdict(schema_version_md))
179
- session.add(schema_version_record)
180
182
 
181
183
  # if this is purely a snapshot (it doesn't require any additional storage for columns and it # doesn't have a
182
184
  # predicate to apply at runtime), we don't create a physical table and simply use the base's table version path
183
185
  if view_md is not None and view_md.is_snapshot and view_md.predicate is None and len(cols) == 0:
186
+ session.add(tbl_record)
187
+ session.add(tbl_version_record)
188
+ session.add(schema_version_record)
184
189
  return tbl_record.id, None
185
190
 
186
191
  assert (base_path is not None) == (view_md is not None)
187
192
  base = base_path.tbl_version if base_path is not None and view_md.is_snapshot else None
188
193
  base_path = base_path if base_path is not None and not view_md.is_snapshot else None
189
194
  tbl_version = cls(tbl_record.id, table_md, 0, schema_version_md, base=base, base_path=base_path)
190
- tbl_version.store_tbl.create(session.connection())
191
- # TODO: create pgvector indices
195
+
196
+ conn = session.connection()
197
+ tbl_version.store_tbl.create(conn)
198
+ if view_md is None or not view_md.is_snapshot:
199
+ # add default indices, after creating the store table
200
+ for col in tbl_version.cols_by_name.values():
201
+ status = tbl_version._add_default_index(col, conn=conn)
202
+ assert status is None or status.num_excs == 0
203
+
204
+ # we re-create the tbl_record here, now that we have new index metadata
205
+ tbl_record = schema.Table(id=tbl_id, dir_id=dir_id, md=dataclasses.asdict(tbl_version._create_tbl_md()))
206
+ session.add(tbl_record)
207
+ session.add(tbl_version_record)
208
+ session.add(schema_version_record)
192
209
  return tbl_record.id, tbl_version
193
210
 
194
211
  @classmethod
@@ -252,6 +269,16 @@ class TableVersion:
252
269
  col.value_expr = exprs.Expr.from_dict(col_md.value_expr)
253
270
  self._record_value_expr(col)
254
271
 
272
+ # if this is a stored proxy column, resolve the relationships with its proxy base.
273
+ if col_md.proxy_base is not None:
274
+ # proxy_base must have a strictly smaller id, so we must already have encountered it
275
+ # in traversal order; and if the proxy column is active at this version, then the
276
+ # proxy base must necessarily be active as well. This motivates the following assertion.
277
+ assert col_md.proxy_base in self.cols_by_id
278
+ base_col = self.cols_by_id[col_md.proxy_base]
279
+ base_col.stored_proxy = col
280
+ col.proxy_base = base_col
281
+
255
282
  def _init_idxs(self, tbl_md: schema.TableMd) -> None:
256
283
  self.idx_md = tbl_md.index_md
257
284
  self.idxs_by_name = {}
@@ -259,7 +286,7 @@ class TableVersion:
259
286
  for md in tbl_md.index_md.values():
260
287
  if md.schema_version_add > self.schema_version \
261
288
  or md.schema_version_drop is not None and md.schema_version_drop <= self.schema_version:
262
- # column not visible in this schema version
289
+ # index not visible in this schema version
263
290
  continue
264
291
 
265
292
  # instantiate index object
@@ -271,8 +298,10 @@ class TableVersion:
271
298
  # fix up the sa column type of the index value and undo columns
272
299
  val_col = self.cols_by_id[md.index_val_col_id]
273
300
  val_col.sa_col_type = idx.index_sa_type()
301
+ val_col._records_errors = False
274
302
  undo_col = self.cols_by_id[md.index_val_undo_col_id]
275
303
  undo_col.sa_col_type = idx.index_sa_type()
304
+ undo_col._records_errors = False
276
305
  idx_info = self.IndexInfo(id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
277
306
  self.idxs_by_name[md.name] = idx_info
278
307
 
@@ -287,10 +316,12 @@ class TableVersion:
287
316
  else:
288
317
  self.store_tbl: StoreBase = StoreTable(self)
289
318
 
290
- def _update_md(self, ts: float, preceding_schema_version: Optional[int], conn: sql.engine.Connection) -> None:
319
+ def _update_md(
320
+ self, timestamp: float, preceding_schema_version: Optional[int], conn: sql.engine.Connection
321
+ ) -> None:
291
322
  """Update all recorded metadata in response to a data or schema change.
292
323
  Args:
293
- ts: timestamp of the change
324
+ timestamp: timestamp of the change
294
325
  preceding_schema_version: last schema version if schema change, else None
295
326
  """
296
327
  conn.execute(
@@ -298,7 +329,7 @@ class TableVersion:
298
329
  .values({schema.Table.md: dataclasses.asdict(self._create_tbl_md())})
299
330
  .where(schema.Table.id == self.id))
300
331
 
301
- version_md = self._create_version_md(ts)
332
+ version_md = self._create_version_md(timestamp)
302
333
  conn.execute(
303
334
  sql.insert(schema.TableVersion.__table__)
304
335
  .values(tbl_id=self.id, version=self.version, md=dataclasses.asdict(version_md)))
@@ -315,6 +346,33 @@ class TableVersion:
315
346
  return f'idx_{self.id.hex}_{idx_id}'
316
347
 
317
348
  def add_index(self, col: Column, idx_name: Optional[str], idx: index.IndexBase) -> UpdateStatus:
349
+ # we're creating a new schema version
350
+ self.version += 1
351
+ preceding_schema_version = self.schema_version
352
+ self.schema_version = self.version
353
+ with Env.get().engine.begin() as conn:
354
+ status = self._add_index(col, idx_name, idx, conn)
355
+ self._update_md(time.time(), preceding_schema_version, conn)
356
+ _logger.info(f'Added index {idx_name} on column {col.name} to table {self.name}')
357
+ return status
358
+
359
+ def _add_default_index(self, col: Column, conn: sql.engine.Connection) -> Optional[UpdateStatus]:
360
+ """Add a B-tree index on this column if it has a compatible type"""
361
+ if not col.stored:
362
+ # if the column is intentionally not stored, we want to avoid the overhead of an index
363
+ return None
364
+ if not col.col_type.is_scalar_type() and not (col.col_type.is_media_type() and not col.is_computed):
365
+ # wrong type for a B-tree
366
+ return None
367
+ if col.col_type.is_bool_type():
368
+ # B-trees on bools aren't useful
369
+ return None
370
+ status = self._add_index(col, idx_name=None, idx=index.BtreeIndex(col), conn=conn)
371
+ return status
372
+
373
+ def _add_index(
374
+ self, col: Column, idx_name: Optional[str], idx: index.IndexBase, conn: sql.engine.Connection
375
+ ) -> UpdateStatus:
318
376
  assert not self.is_snapshot
319
377
  idx_id = self.next_idx_id
320
378
  self.next_idx_id += 1
@@ -324,46 +382,41 @@ class TableVersion:
324
382
  assert is_valid_identifier(idx_name)
325
383
  assert idx_name not in [i.name for i in self.idx_md.values()]
326
384
 
327
- # we're creating a new schema version
328
- self.version += 1
329
- preceding_schema_version = self.schema_version
330
- self.schema_version = self.version
331
- with Env.get().engine.begin() as conn:
332
- # add the index value and undo columns (which need to be nullable);
333
- # we don't create a new schema version, because indices aren't part of the logical schema
334
- val_col = Column(
335
- col_id=self.next_col_id, name=None, computed_with=idx.index_value_expr(),
336
- sa_col_type=idx.index_sa_type(), stored=True,
337
- schema_version_add=self.schema_version, schema_version_drop=None)
338
- val_col.tbl = self
339
- val_col.col_type.nullable = True
340
- self.next_col_id += 1
341
-
342
- undo_col = Column(
343
- col_id=self.next_col_id, name=None, col_type=val_col.col_type,
344
- sa_col_type=val_col.sa_col_type, stored=True,
345
- schema_version_add=self.schema_version, schema_version_drop=None)
346
- undo_col.tbl = self
347
- undo_col.col_type.nullable = True
348
- self.next_col_id += 1
349
-
350
- # create and register the index metadata
351
- idx_cls = type(idx)
352
- idx_md = schema.IndexMd(
353
- id=idx_id, name=idx_name,
354
- indexed_col_id=col.id, index_val_col_id=val_col.id, index_val_undo_col_id=undo_col.id,
355
- schema_version_add=self.schema_version, schema_version_drop=None,
356
- class_fqn=idx_cls.__module__ + '.' + idx_cls.__name__, init_args=idx.as_dict())
357
- idx_info = self.IndexInfo(id=idx_id, name=idx_name, idx=idx, col=col, val_col=val_col, undo_col=undo_col)
358
- self.idx_md[idx_id] = idx_md
359
- self.idxs_by_name[idx_name] = idx_info
360
-
361
- # add the columns and update the metadata
362
- status = self._add_columns([val_col, undo_col], conn, preceding_schema_version=preceding_schema_version)
363
- # now create the index structure
364
- idx.create_index(self._store_idx_name(idx_id), val_col, conn)
365
-
366
- _logger.info(f'Added index {idx_name} on column {col.name} to table {self.name}')
385
+ # add the index value and undo columns (which need to be nullable)
386
+ val_col = Column(
387
+ col_id=self.next_col_id, name=None, computed_with=idx.index_value_expr(),
388
+ sa_col_type=idx.index_sa_type(), stored=True,
389
+ schema_version_add=self.schema_version, schema_version_drop=None,
390
+ records_errors=idx.records_value_errors())
391
+ val_col.tbl = self
392
+ val_col.col_type = val_col.col_type.copy(nullable=True)
393
+ self.next_col_id += 1
394
+
395
+ undo_col = Column(
396
+ col_id=self.next_col_id, name=None, col_type=val_col.col_type,
397
+ sa_col_type=val_col.sa_col_type, stored=True,
398
+ schema_version_add=self.schema_version, schema_version_drop=None,
399
+ records_errors=False)
400
+ undo_col.tbl = self
401
+ undo_col.col_type = undo_col.col_type.copy(nullable=True)
402
+ self.next_col_id += 1
403
+
404
+ # create and register the index metadata
405
+ idx_cls = type(idx)
406
+ idx_md = schema.IndexMd(
407
+ id=idx_id, name=idx_name,
408
+ indexed_col_id=col.id, index_val_col_id=val_col.id, index_val_undo_col_id=undo_col.id,
409
+ schema_version_add=self.schema_version, schema_version_drop=None,
410
+ class_fqn=idx_cls.__module__ + '.' + idx_cls.__name__, init_args=idx.as_dict())
411
+ idx_info = self.IndexInfo(id=idx_id, name=idx_name, idx=idx, col=col, val_col=val_col, undo_col=undo_col)
412
+ self.idx_md[idx_id] = idx_md
413
+ self.idxs_by_name[idx_name] = idx_info
414
+
415
+ # add the columns and update the metadata
416
+ status = self._add_columns([val_col, undo_col], conn)
417
+ # now create the index structure
418
+ idx.create_index(self._store_idx_name(idx_id), val_col, conn)
419
+
367
420
  return status
368
421
 
369
422
  def drop_index(self, idx_id: int) -> None:
@@ -381,7 +434,8 @@ class TableVersion:
381
434
  del self.idxs_by_name[idx_md.name]
382
435
 
383
436
  with Env.get().engine.begin() as conn:
384
- self._drop_columns([idx_info.val_col, idx_info.undo_col], conn, preceding_schema_version)
437
+ self._drop_columns([idx_info.val_col, idx_info.undo_col])
438
+ self._update_md(time.time(), preceding_schema_version, conn)
385
439
  _logger.info(f'Dropped index {idx_md.name} on table {self.name}')
386
440
 
387
441
  def add_column(self, col: Column, print_stats: bool = False) -> UpdateStatus:
@@ -398,16 +452,16 @@ class TableVersion:
398
452
  if col.compute_func is not None:
399
453
  # create value_expr from compute_func
400
454
  self._create_value_expr(col, self.path)
401
- if col.value_expr is not None:
402
- col.check_value_expr()
403
- self._record_value_expr(col)
404
455
 
405
456
  # we're creating a new schema version
406
457
  self.version += 1
407
458
  preceding_schema_version = self.schema_version
408
459
  self.schema_version = self.version
409
460
  with Env.get().engine.begin() as conn:
410
- status = self._add_columns([col], conn, preceding_schema_version, print_stats=print_stats)
461
+ status = self._add_columns([col], conn, print_stats=print_stats)
462
+ _ = self._add_default_index(col, conn)
463
+ # TODO: what to do about errors?
464
+ self._update_md(time.time(), preceding_schema_version, conn)
411
465
  _logger.info(f'Added column {col.name} to table {self.name}, new version: {self.version}')
412
466
 
413
467
  msg = (
@@ -418,13 +472,8 @@ class TableVersion:
418
472
  _logger.info(f'Column {col.name}: {msg}')
419
473
  return status
420
474
 
421
- def _add_columns(
422
- self, cols: List[Column], conn: sql.engine.Connection, preceding_schema_version: Optional[int] = None,
423
- print_stats: bool = False
424
- ) -> UpdateStatus:
475
+ def _add_columns(self, cols: List[Column], conn: sql.engine.Connection, print_stats: bool = False) -> UpdateStatus:
425
476
  """Add and populate columns within the current transaction"""
426
- ts = time.time()
427
-
428
477
  row_count = self.store_tbl.count(conn=conn)
429
478
  for col in cols:
430
479
  if not col.col_type.nullable and not col.is_computed:
@@ -442,6 +491,9 @@ class TableVersion:
442
491
  if col.name is not None:
443
492
  self.cols_by_name[col.name] = col
444
493
  self.cols_by_id[col.id] = col
494
+ if col.value_expr is not None:
495
+ col.check_value_expr()
496
+ self._record_value_expr(col)
445
497
 
446
498
  if col.is_stored:
447
499
  self.store_tbl.add_column(col, conn)
@@ -475,7 +527,6 @@ class TableVersion:
475
527
  finally:
476
528
  plan.close()
477
529
 
478
- self._update_md(ts, preceding_schema_version, conn)
479
530
  if print_stats:
480
531
  plan.ctx.profile.print(num_rows=row_count)
481
532
  # TODO(mkornacker): what to do about system columns with exceptions?
@@ -493,8 +544,16 @@ class TableVersion:
493
544
  dependent_user_cols = [c for c in col.dependent_cols if c.name is not None]
494
545
  if len(dependent_user_cols) > 0:
495
546
  raise excs.Error(
496
- f'Cannot drop column {name} because the following columns depend on it:\n',
497
- f'{", ".join([c.name for c in dependent_user_cols])}')
547
+ f'Cannot drop column `{name}` because the following columns depend on it:\n'
548
+ f'{", ".join(c.name for c in dependent_user_cols)}'
549
+ )
550
+ dependent_remotes = [remote for remote, col_mapping in self.remotes.items() if name in col_mapping]
551
+ if len(dependent_remotes) > 0:
552
+ raise excs.Error(
553
+ f'Cannot drop column `{name}` because the following remotes depend on it:\n'
554
+ f'{", ".join(str(r) for r in dependent_remotes)}'
555
+ )
556
+ assert col.stored_proxy is None # since there are no dependent remotes
498
557
 
499
558
  # we're creating a new schema version
500
559
  self.version += 1
@@ -516,14 +575,14 @@ class TableVersion:
516
575
  # update idxs_by_name
517
576
  for idx_name in dropped_idx_names:
518
577
  del self.idxs_by_name[idx_name]
519
- self._drop_columns(dropped_cols, conn, preceding_schema_version)
578
+ self._drop_columns(dropped_cols)
579
+ self._update_md(time.time(), preceding_schema_version, conn)
520
580
  _logger.info(f'Dropped column {name} from table {self.name}, new version: {self.version}')
521
581
 
522
- def _drop_columns(self, cols: list[Column], conn: sql.engine.Connection, preceding_schema_version: int) -> None:
582
+ def _drop_columns(self, cols: list[Column]) -> None:
523
583
  """Mark columns as dropped"""
524
584
  assert not self.is_snapshot
525
585
 
526
- ts = time.time()
527
586
  for col in cols:
528
587
  if col.value_expr is not None:
529
588
  # update Column.dependent_cols
@@ -539,7 +598,6 @@ class TableVersion:
539
598
  assert col.id in self.cols_by_id
540
599
  del self.cols_by_id[col.id]
541
600
 
542
- self._update_md(ts, preceding_schema_version, conn)
543
601
  self.store_tbl.create_sa_tbl()
544
602
 
545
603
  def rename_column(self, old_name: str, new_name: str) -> None:
@@ -558,13 +616,12 @@ class TableVersion:
558
616
  self.cols_by_name[new_name] = col
559
617
 
560
618
  # we're creating a new schema version
561
- ts = time.time()
562
619
  self.version += 1
563
620
  preceding_schema_version = self.schema_version
564
621
  self.schema_version = self.version
565
622
 
566
623
  with Env.get().engine.begin() as conn:
567
- self._update_md(ts, preceding_schema_version, conn)
624
+ self._update_md(time.time(), preceding_schema_version, conn)
568
625
  _logger.info(f'Renamed column {old_name} to {new_name} in table {self.name}, new version: {self.version}')
569
626
 
570
627
  def set_comment(self, new_comment: Optional[str]):
@@ -579,12 +636,11 @@ class TableVersion:
579
636
 
580
637
  def _create_schema_version(self):
581
638
  # we're creating a new schema version
582
- ts = time.time()
583
639
  self.version += 1
584
640
  preceding_schema_version = self.schema_version
585
641
  self.schema_version = self.version
586
642
  with Env.get().engine.begin() as conn:
587
- self._update_md(ts, preceding_schema_version, conn)
643
+ self._update_md(time.time(), preceding_schema_version, conn)
588
644
  _logger.info(f'[{self.name}] Updating table schema to version: {self.version}')
589
645
 
590
646
  def insert(
@@ -595,12 +651,11 @@ class TableVersion:
595
651
  assert self.is_insertable()
596
652
  from pixeltable.plan import Planner
597
653
  plan = Planner.create_insert_plan(self, rows, ignore_errors=not fail_on_exception)
598
- ts = time.time()
599
654
  with Env.get().engine.begin() as conn:
600
- return self._insert(plan, conn, ts, print_stats)
655
+ return self._insert(plan, conn, time.time(), print_stats)
601
656
 
602
657
  def _insert(
603
- self, exec_plan: exec.ExecNode, conn: sql.engine.Connection, ts: float, print_stats: bool = False,
658
+ self, exec_plan: exec.ExecNode, conn: sql.engine.Connection, timestamp: float, print_stats: bool = False,
604
659
  ) -> UpdateStatus:
605
660
  """Insert rows produced by exec_plan and propagate to views"""
606
661
  # we're creating a new version
@@ -612,13 +667,13 @@ class TableVersion:
612
667
  result.num_excs = num_excs
613
668
  result.num_computed_values += exec_plan.ctx.num_computed_exprs * num_rows
614
669
  result.cols_with_excs = [f'{self.name}.{self.cols_by_id[cid].name}' for cid in cols_with_excs]
615
- self._update_md(ts, None, conn)
670
+ self._update_md(timestamp, None, conn)
616
671
 
617
672
  # update views
618
673
  for view in self.mutable_views:
619
674
  from pixeltable.plan import Planner
620
675
  plan, _ = Planner.create_view_load_plan(view.path, propagates_insert=True)
621
- status = view._insert(plan, conn, ts, print_stats)
676
+ status = view._insert(plan, conn, timestamp, print_stats)
622
677
  result.num_rows += status.num_rows
623
678
  result.num_excs += status.num_excs
624
679
  result.num_computed_values += status.num_computed_values
@@ -662,7 +717,7 @@ class TableVersion:
662
717
  # construct Where clause to match rowid
663
718
  num_rowid_cols = len(self.store_tbl.rowid_columns())
664
719
  for col_idx in range(num_rowid_cols):
665
- assert len(rowids[i]) == num_rowid_cols
720
+ assert len(rowids[i]) == num_rowid_cols, f'len({rowids[i]}) != {num_rowid_cols}'
666
721
  clause = exprs.RowidRef(self, col_idx) == rowids[i][col_idx]
667
722
  if where_clause is None:
668
723
  where_clause = clause
@@ -679,7 +734,7 @@ class TableVersion:
679
734
  where_clause = where_clause & clause
680
735
 
681
736
  update_targets = {col: row[col] for col in row if col not in pk_cols}
682
- status = self._update(conn, update_targets, where_clause, cascade)
737
+ status = self._update(conn, update_targets, where_clause, cascade, show_progress=False)
683
738
  result_status.num_rows += status.num_rows
684
739
  result_status.num_excs += status.num_excs
685
740
  result_status.num_computed_values += status.num_computed_values
@@ -692,7 +747,8 @@ class TableVersion:
692
747
 
693
748
  def _update(
694
749
  self, conn: sql.engine.Connection, update_targets: dict[Column, 'pixeltable.exprs.Expr'],
695
- where_clause: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
750
+ where_clause: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True,
751
+ show_progress: bool = True
696
752
  ) -> UpdateStatus:
697
753
  """Update rows in this table.
698
754
  Args:
@@ -705,28 +761,27 @@ class TableVersion:
705
761
  from pixeltable.plan import Planner
706
762
  plan, updated_cols, recomputed_cols = \
707
763
  Planner.create_update_plan(self.path, update_targets, [], where_clause, cascade)
708
- ts = time.time()
709
764
  result = self._propagate_update(
710
765
  plan, where_clause.sql_expr() if where_clause is not None else None, recomputed_cols,
711
- base_versions=[], conn=conn, ts=ts, cascade=cascade)
766
+ base_versions=[], conn=conn, timestamp=time.time(), cascade=cascade, show_progress=show_progress)
712
767
  result.updated_cols = updated_cols
713
768
  return result
714
769
 
715
770
  def _propagate_update(
716
771
  self, plan: Optional[exec.ExecNode], where_clause: Optional[sql.ClauseElement],
717
772
  recomputed_view_cols: List[Column], base_versions: List[Optional[int]], conn: sql.engine.Connection,
718
- ts: float, cascade: bool
773
+ timestamp: float, cascade: bool, show_progress: bool = True
719
774
  ) -> UpdateStatus:
720
775
  result = UpdateStatus()
721
776
  if plan is not None:
722
777
  # we're creating a new version
723
778
  self.version += 1
724
779
  result.num_rows, result.num_excs, cols_with_excs = \
725
- self.store_tbl.insert_rows(plan, conn, v_min=self.version)
780
+ self.store_tbl.insert_rows(plan, conn, v_min=self.version, show_progress=show_progress)
726
781
  result.cols_with_excs = [f'{self.name}.{self.cols_by_id[cid].name}' for cid in cols_with_excs]
727
782
  self.store_tbl.delete_rows(
728
783
  self.version, base_versions=base_versions, match_on_vmin=True, where_clause=where_clause, conn=conn)
729
- self._update_md(ts, None, conn)
784
+ self._update_md(timestamp, None, conn)
730
785
 
731
786
  if cascade:
732
787
  base_versions = [None if plan is None else self.version] + base_versions # don't update in place
@@ -738,7 +793,7 @@ class TableVersion:
738
793
  from pixeltable.plan import Planner
739
794
  plan = Planner.create_view_update_plan(view.path, recompute_targets=recomputed_cols)
740
795
  status = view._propagate_update(
741
- plan, None, recomputed_view_cols, base_versions=base_versions, conn=conn, ts=ts, cascade=True)
796
+ plan, None, recomputed_view_cols, base_versions=base_versions, conn=conn, timestamp=timestamp, cascade=True)
742
797
  result.num_rows += status.num_rows
743
798
  result.num_excs += status.num_excs
744
799
  result.cols_with_excs += status.cols_with_excs
@@ -754,16 +809,15 @@ class TableVersion:
754
809
  assert self.is_insertable()
755
810
  from pixeltable.plan import Planner
756
811
  analysis_info = Planner.analyze(self, where)
757
- ts = time.time()
758
812
  with Env.get().engine.begin() as conn:
759
- num_rows = self._delete(analysis_info.sql_where_clause, base_versions=[], conn=conn, ts=ts)
813
+ num_rows = self._delete(analysis_info.sql_where_clause, base_versions=[], conn=conn, timestamp=time.time())
760
814
 
761
815
  status = UpdateStatus(num_rows=num_rows)
762
816
  return status
763
817
 
764
818
  def _delete(
765
819
  self, where: Optional['pixeltable.exprs.Predicate'], base_versions: List[Optional[int]],
766
- conn: sql.engine.Connection, ts: float) -> int:
820
+ conn: sql.engine.Connection, timestamp: float) -> int:
767
821
  """Delete rows in this table and propagate to views.
768
822
  Args:
769
823
  where: a Predicate to filter rows to delete.
@@ -777,11 +831,12 @@ class TableVersion:
777
831
  if num_rows > 0:
778
832
  # we're creating a new version
779
833
  self.version += 1
780
- self._update_md(ts, None, conn)
834
+ self._update_md(timestamp, None, conn)
781
835
  else:
782
836
  pass
783
837
  for view in self.mutable_views:
784
- num_rows += view._delete(where=None, base_versions=[self.version] + base_versions, conn=conn, ts=ts)
838
+ num_rows += view._delete(
839
+ where=None, base_versions=[self.version] + base_versions, conn=conn, timestamp=timestamp)
785
840
  return num_rows
786
841
 
787
842
  def revert(self) -> None:
@@ -907,6 +962,94 @@ class TableVersion:
907
962
  view._revert(session)
908
963
  _logger.info(f'TableVersion {self.name}: reverted to version {self.version}')
909
964
 
965
+ @classmethod
966
+ def _init_remote(cls, remote_md: dict[str, Any]) -> Tuple[pixeltable.datatransfer.Remote, dict[str, str]]:
967
+ remote_cls = resolve_symbol(remote_md['class'])
968
+ assert isinstance(remote_cls, type) and issubclass(remote_cls, pixeltable.datatransfer.Remote)
969
+ remote = remote_cls.from_dict(remote_md['remote_md'])
970
+ col_mapping = remote_md['col_mapping']
971
+ return remote, col_mapping
972
+
973
+ def link(self, remote: pixeltable.datatransfer.Remote, col_mapping: dict[str, str]) -> None:
974
+ # All of the media columns being linked need to either be stored, computed columns or have stored proxies.
975
+ # This ensures that the media in those columns resides in the media cache, where it can be served.
976
+ # First determine which columns (if any) need stored proxies, but don't have one yet.
977
+ cols_by_name = self.path.cols_by_name() # Includes base columns
978
+ stored_proxies_needed = []
979
+ for col_name in col_mapping.keys():
980
+ col = cols_by_name[col_name]
981
+ if col.col_type.is_media_type() and not (col.is_stored and col.compute_func) and not col.stored_proxy:
982
+ stored_proxies_needed.append(col)
983
+ with Env.get().engine.begin() as conn:
984
+ self.version += 1
985
+ self.remotes[remote] = col_mapping
986
+ preceding_schema_version = None
987
+ if len(stored_proxies_needed) > 0:
988
+ _logger.info(f'Creating stored proxies for columns: {[col.name for col in stored_proxies_needed]}')
989
+ # Create stored proxies for columns that need one. Increment the schema version
990
+ # accordingly.
991
+ preceding_schema_version = self.schema_version
992
+ self.schema_version = self.version
993
+ proxy_cols = [self.create_stored_proxy(col) for col in stored_proxies_needed]
994
+ # Add the columns; this will also update table metadata.
995
+ # TODO Add to base tables
996
+ self._add_columns(proxy_cols, conn)
997
+ # We don't need to retain `UpdateStatus` since the stored proxies are intended to be
998
+ # invisible to the user.
999
+ self._update_md(time.time(), preceding_schema_version, conn)
1000
+
1001
+ def create_stored_proxy(self, col: Column) -> Column:
1002
+ from pixeltable import exprs
1003
+
1004
+ assert col.col_type.is_media_type() and not (col.is_stored and col.compute_func) and not col.stored_proxy
1005
+ proxy_col = Column(
1006
+ name=None,
1007
+ computed_with=exprs.ColumnRef(col).apply(lambda x: x, col_type=col.col_type),
1008
+ stored=True,
1009
+ col_id=self.next_col_id,
1010
+ sa_col_type=col.col_type.to_sa_type(),
1011
+ schema_version_add=self.schema_version
1012
+ )
1013
+ proxy_col.tbl = self
1014
+ self.next_col_id += 1
1015
+ col.stored_proxy = proxy_col
1016
+ proxy_col.proxy_base = col
1017
+ return proxy_col
1018
+
1019
+ def unlink(self, remote: pixeltable.datatransfer.Remote) -> None:
1020
+ assert remote in self.remotes
1021
+ timestamp = time.time()
1022
+ this_remote_col_names = list(self.remotes[remote].keys())
1023
+ other_remote_col_names = {
1024
+ col_name
1025
+ for other_remote, col_mapping in self.remotes.items() if other_remote != remote
1026
+ for col_name in col_mapping.keys()
1027
+ }
1028
+ cols_by_name = self.path.cols_by_name() # Includes base columns
1029
+ stored_proxy_deletions_needed = [
1030
+ cols_by_name[col_name]
1031
+ for col_name in this_remote_col_names
1032
+ if col_name not in other_remote_col_names and cols_by_name[col_name].stored_proxy
1033
+ ]
1034
+ with Env.get().engine.begin() as conn:
1035
+ self.version += 1
1036
+ del self.remotes[remote]
1037
+ preceding_schema_version = None
1038
+ if len(stored_proxy_deletions_needed) > 0:
1039
+ preceding_schema_version = self.schema_version
1040
+ self.schema_version = self.version
1041
+ proxy_cols = [col.stored_proxy for col in stored_proxy_deletions_needed]
1042
+ for col in stored_proxy_deletions_needed:
1043
+ assert col.stored_proxy is not None and col.stored_proxy.proxy_base == col
1044
+ col.stored_proxy.proxy_base = None
1045
+ col.stored_proxy = None
1046
+ # TODO Drop from base tables
1047
+ self._drop_columns(proxy_cols)
1048
+ self._update_md(timestamp, preceding_schema_version, conn)
1049
+
1050
+ def get_remotes(self) -> dict[pixeltable.datatransfer.Remote, dict[str, str]]:
1051
+ return self.remotes
1052
+
910
1053
  def is_view(self) -> bool:
911
1054
  return self.base is not None
912
1055
 
@@ -939,16 +1082,16 @@ class TableVersion:
939
1082
  def get_required_col_names(self) -> List[str]:
940
1083
  """Return the names of all columns for which values must be specified in insert()"""
941
1084
  assert not self.is_view()
942
- names = [c.name for c in self.cols if not c.is_computed and not c.col_type.nullable]
1085
+ names = [c.name for c in self.cols_by_name.values() if not c.is_computed and not c.col_type.nullable]
943
1086
  return names
944
1087
 
945
1088
  def get_computed_col_names(self) -> List[str]:
946
1089
  """Return the names of all computed columns"""
947
- names = [c.name for c in self.cols if c.is_computed]
1090
+ names = [c.name for c in self.cols_by_name.values() if c.is_computed]
948
1091
  return names
949
1092
 
950
1093
  @classmethod
951
- def _create_value_expr(cls, col: Column, path: 'TableVersionPath') -> None:
1094
+ def _create_value_expr(cls, col: Column, path: 'pixeltable.catalog.TableVersionPath') -> None:
952
1095
  """
953
1096
  Create col.value_expr, given col.compute_func.
954
1097
  Interprets compute_func's parameters to be references to columns and construct ColumnRefs as args.
@@ -978,13 +1121,17 @@ class TableVersion:
978
1121
  for refd_col in refd_cols:
979
1122
  refd_col.dependent_cols.add(col)
980
1123
 
981
- def get_dependent_columns(self, cols: List[Column]) -> Set[Column]:
1124
+ def get_idx_val_columns(self, cols: Iterable[Column]) -> set[Column]:
1125
+ result = {info.val_col for col in cols for info in col.get_idx_info().values()}
1126
+ return result
1127
+
1128
+ def get_dependent_columns(self, cols: list[Column]) -> set[Column]:
982
1129
  """
983
1130
  Return the set of columns that transitively depend on any of the given ones.
984
1131
  """
985
1132
  if len(cols) == 0:
986
- return []
987
- result: Set[Column] = set()
1133
+ return set()
1134
+ result: set[Column] = set()
988
1135
  for col in cols:
989
1136
  result.update(col.dependent_cols)
990
1137
  result.update(self.get_dependent_columns(result))
@@ -1004,17 +1151,30 @@ class TableVersion:
1004
1151
  column_md[col.id] = schema.ColumnMd(
1005
1152
  id=col.id, col_type=col.col_type.as_dict(), is_pk=col.is_pk,
1006
1153
  schema_version_add=col.schema_version_add, schema_version_drop=col.schema_version_drop,
1007
- value_expr=value_expr_dict, stored=col.stored)
1154
+ value_expr=value_expr_dict, stored=col.stored,
1155
+ proxy_base=col.proxy_base.id if col.proxy_base else None)
1008
1156
  return column_md
1009
1157
 
1158
+ @classmethod
1159
+ def _create_remotes_md(cls, remotes: dict['pixeltable.datatransfer.Remote', dict[str, str]]) -> list[dict[str, Any]]:
1160
+ return [
1161
+ {
1162
+ 'class': f'{type(remote).__module__}.{type(remote).__qualname__}',
1163
+ 'remote_md': remote.to_dict(),
1164
+ 'col_mapping': col_mapping
1165
+ }
1166
+ for remote, col_mapping in remotes.items()
1167
+ ]
1168
+
1010
1169
  def _create_tbl_md(self) -> schema.TableMd:
1011
1170
  return schema.TableMd(
1012
1171
  name=self.name, current_version=self.version, current_schema_version=self.schema_version,
1013
1172
  next_col_id=self.next_col_id, next_idx_id=self.next_idx_id, next_row_id=self.next_rowid,
1014
- column_md=self._create_column_md(self.cols), index_md=self.idx_md, view_md=self.view_md)
1173
+ column_md=self._create_column_md(self.cols), index_md=self.idx_md,
1174
+ remotes=self._create_remotes_md(self.remotes), view_md=self.view_md)
1015
1175
 
1016
- def _create_version_md(self, ts: float) -> schema.TableVersionMd:
1017
- return schema.TableVersionMd(created_at=ts, version=self.version, schema_version=self.schema_version)
1176
+ def _create_version_md(self, timestamp: float) -> schema.TableVersionMd:
1177
+ return schema.TableVersionMd(created_at=timestamp, version=self.version, schema_version=self.schema_version)
1018
1178
 
1019
1179
  def _create_schema_version_md(self, preceding_schema_version: int) -> schema.TableSchemaVersionMd:
1020
1180
  column_md: Dict[int, schema.SchemaColumn] = {}