pixeltable 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (53) hide show
  1. pixeltable/__init__.py +3 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/column.py +8 -2
  4. pixeltable/catalog/insertable_table.py +32 -17
  5. pixeltable/catalog/table.py +167 -12
  6. pixeltable/catalog/table_version.py +185 -106
  7. pixeltable/datatransfer/__init__.py +1 -0
  8. pixeltable/datatransfer/label_studio.py +452 -0
  9. pixeltable/datatransfer/remote.py +85 -0
  10. pixeltable/env.py +148 -69
  11. pixeltable/exprs/column_ref.py +2 -2
  12. pixeltable/exprs/comparison.py +39 -1
  13. pixeltable/exprs/data_row.py +7 -0
  14. pixeltable/exprs/expr.py +11 -12
  15. pixeltable/exprs/function_call.py +0 -3
  16. pixeltable/exprs/globals.py +14 -2
  17. pixeltable/exprs/similarity_expr.py +5 -3
  18. pixeltable/ext/functions/whisperx.py +30 -0
  19. pixeltable/ext/functions/yolox.py +16 -0
  20. pixeltable/func/aggregate_function.py +2 -2
  21. pixeltable/func/expr_template_function.py +3 -1
  22. pixeltable/func/udf.py +2 -2
  23. pixeltable/functions/fireworks.py +9 -4
  24. pixeltable/functions/huggingface.py +25 -1
  25. pixeltable/functions/openai.py +15 -10
  26. pixeltable/functions/together.py +11 -6
  27. pixeltable/functions/util.py +0 -43
  28. pixeltable/functions/video.py +46 -8
  29. pixeltable/globals.py +20 -2
  30. pixeltable/index/__init__.py +1 -0
  31. pixeltable/index/base.py +6 -1
  32. pixeltable/index/btree.py +54 -0
  33. pixeltable/index/embedding_index.py +4 -1
  34. pixeltable/io/__init__.py +1 -0
  35. pixeltable/io/globals.py +58 -0
  36. pixeltable/iterators/base.py +4 -4
  37. pixeltable/iterators/document.py +26 -15
  38. pixeltable/iterators/video.py +9 -1
  39. pixeltable/metadata/__init__.py +2 -2
  40. pixeltable/metadata/converters/convert_14.py +13 -0
  41. pixeltable/metadata/schema.py +9 -6
  42. pixeltable/plan.py +9 -5
  43. pixeltable/store.py +14 -21
  44. pixeltable/tool/create_test_db_dump.py +14 -0
  45. pixeltable/type_system.py +14 -4
  46. pixeltable/utils/coco.py +94 -0
  47. pixeltable-0.2.8.dist-info/METADATA +137 -0
  48. {pixeltable-0.2.6.dist-info → pixeltable-0.2.8.dist-info}/RECORD +50 -45
  49. pixeltable/func/nos_function.py +0 -202
  50. pixeltable/utils/clip.py +0 -18
  51. pixeltable-0.2.6.dist-info/METADATA +0 -131
  52. {pixeltable-0.2.6.dist-info → pixeltable-0.2.8.dist-info}/LICENSE +0 -0
  53. {pixeltable-0.2.6.dist-info → pixeltable-0.2.8.dist-info}/WHEEL +0 -0
@@ -5,7 +5,8 @@ import importlib
5
5
  import inspect
6
6
  import logging
7
7
  import time
8
- from typing import Optional, List, Dict, Any, Tuple, Type, Set
8
+ from typing import Optional, List, Dict, Any, Tuple, Type, Set, Iterable
9
+ import uuid
9
10
  from uuid import UUID
10
11
 
11
12
  import sqlalchemy as sql
@@ -87,6 +88,8 @@ class TableVersion:
87
88
  self.next_idx_id = tbl_md.next_idx_id
88
89
  self.next_rowid = tbl_md.next_row_id
89
90
 
91
+ self.remotes = dict(TableVersion._init_remote(remote_md) for remote_md in tbl_md.remotes)
92
+
90
93
  # view-specific initialization
91
94
  from pixeltable import exprs
92
95
  predicate_dict = None if not is_view or tbl_md.view_md.predicate is None else tbl_md.view_md.predicate
@@ -115,7 +118,7 @@ class TableVersion:
115
118
  cat.tbl_versions[(self.id, self.effective_version)] = self
116
119
 
117
120
  # init schema after we determined whether we're a component view, and before we create the store table
118
- self.cols: List[Column] = [] # contains complete history of columns, incl dropped ones
121
+ self.cols: list[Column] = [] # contains complete history of columns, incl dropped ones
119
122
  self.cols_by_name: dict[str, Column] = {} # contains only user-facing (named) columns visible in this version
120
123
  self.cols_by_id: dict[int, Column] = {} # contains only columns visible in this version
121
124
  self.idx_md = tbl_md.index_md # needed for _create_tbl_md()
@@ -150,23 +153,22 @@ class TableVersion:
150
153
  if col.is_computed:
151
154
  col.check_value_expr()
152
155
 
153
- ts = time.time()
156
+ timestamp = time.time()
154
157
  # create schema.Table
155
158
  # Column.dependent_cols for existing cols is wrong at this point, but init() will set it correctly
156
159
  column_md = cls._create_column_md(cols)
157
160
  table_md = schema.TableMd(
158
- name=name, current_version=0, current_schema_version=0,
159
- next_col_id=len(cols), next_idx_id=0, next_row_id=0, column_md=column_md, index_md={}, view_md=view_md)
160
- tbl_record = schema.Table(dir_id=dir_id, md=dataclasses.asdict(table_md))
161
- session.add(tbl_record)
162
- session.flush() # sets tbl_record.id
163
- assert tbl_record.id is not None
161
+ name=name, current_version=0, current_schema_version=0, next_col_id=len(cols),
162
+ next_idx_id=0, next_row_id=0, column_md=column_md, index_md={}, remotes=[], view_md=view_md)
163
+ # create a schema.Table here, we need it to call our c'tor;
164
+ # don't add it to the session yet, we might add index metadata
165
+ tbl_id = uuid.uuid4()
166
+ tbl_record = schema.Table(id=tbl_id, dir_id=dir_id, md=dataclasses.asdict(table_md))
164
167
 
165
168
  # create schema.TableVersion
166
- table_version_md = schema.TableVersionMd(created_at=ts, version=0, schema_version=0)
169
+ table_version_md = schema.TableVersionMd(created_at=timestamp, version=0, schema_version=0)
167
170
  tbl_version_record = schema.TableVersion(
168
171
  tbl_id=tbl_record.id, version=0, md=dataclasses.asdict(table_version_md))
169
- session.add(tbl_version_record)
170
172
 
171
173
  # create schema.TableSchemaVersion
172
174
  schema_col_md = {col.id: schema.SchemaColumn(pos=pos, name=col.name) for pos, col in enumerate(cols)}
@@ -176,19 +178,33 @@ class TableVersion:
176
178
  num_retained_versions=num_retained_versions, comment=comment)
177
179
  schema_version_record = schema.TableSchemaVersion(
178
180
  tbl_id=tbl_record.id, schema_version=0, md=dataclasses.asdict(schema_version_md))
179
- session.add(schema_version_record)
180
181
 
181
182
  # if this is purely a snapshot (it doesn't require any additional storage for columns and it doesn't have a
182
183
  # predicate to apply at runtime), we don't create a physical table and simply use the base's table version path
183
184
  if view_md is not None and view_md.is_snapshot and view_md.predicate is None and len(cols) == 0:
185
+ session.add(tbl_record)
186
+ session.add(tbl_version_record)
187
+ session.add(schema_version_record)
184
188
  return tbl_record.id, None
185
189
 
186
190
  assert (base_path is not None) == (view_md is not None)
187
191
  base = base_path.tbl_version if base_path is not None and view_md.is_snapshot else None
188
192
  base_path = base_path if base_path is not None and not view_md.is_snapshot else None
189
193
  tbl_version = cls(tbl_record.id, table_md, 0, schema_version_md, base=base, base_path=base_path)
190
- tbl_version.store_tbl.create(session.connection())
191
- # TODO: create pgvector indices
194
+
195
+ conn = session.connection()
196
+ tbl_version.store_tbl.create(conn)
197
+ if view_md is None or not view_md.is_snapshot:
198
+ # add default indices, after creating the store table
199
+ for col in tbl_version.cols_by_name.values():
200
+ status = tbl_version._add_default_index(col, conn=conn)
201
+ assert status is None or status.num_excs == 0
202
+
203
+ # we re-create the tbl_record here, now that we have new index metadata
204
+ tbl_record = schema.Table(id=tbl_id, dir_id=dir_id, md=dataclasses.asdict(tbl_version._create_tbl_md()))
205
+ session.add(tbl_record)
206
+ session.add(tbl_version_record)
207
+ session.add(schema_version_record)
192
208
  return tbl_record.id, tbl_version
193
209
 
194
210
  @classmethod
@@ -259,7 +275,7 @@ class TableVersion:
259
275
  for md in tbl_md.index_md.values():
260
276
  if md.schema_version_add > self.schema_version \
261
277
  or md.schema_version_drop is not None and md.schema_version_drop <= self.schema_version:
262
- # column not visible in this schema version
278
+ # index not visible in this schema version
263
279
  continue
264
280
 
265
281
  # instantiate index object
@@ -271,8 +287,10 @@ class TableVersion:
271
287
  # fix up the sa column type of the index value and undo columns
272
288
  val_col = self.cols_by_id[md.index_val_col_id]
273
289
  val_col.sa_col_type = idx.index_sa_type()
290
+ val_col._records_errors = False
274
291
  undo_col = self.cols_by_id[md.index_val_undo_col_id]
275
292
  undo_col.sa_col_type = idx.index_sa_type()
293
+ undo_col._records_errors = False
276
294
  idx_info = self.IndexInfo(id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
277
295
  self.idxs_by_name[md.name] = idx_info
278
296
 
@@ -287,10 +305,12 @@ class TableVersion:
287
305
  else:
288
306
  self.store_tbl: StoreBase = StoreTable(self)
289
307
 
290
- def _update_md(self, ts: float, preceding_schema_version: Optional[int], conn: sql.engine.Connection) -> None:
308
+ def _update_md(
309
+ self, timestamp: float, preceding_schema_version: Optional[int], conn: sql.engine.Connection
310
+ ) -> None:
291
311
  """Update all recorded metadata in response to a data or schema change.
292
312
  Args:
293
- ts: timestamp of the change
313
+ timestamp: timestamp of the change
294
314
  preceding_schema_version: last schema version if schema change, else None
295
315
  """
296
316
  conn.execute(
@@ -298,7 +318,7 @@ class TableVersion:
298
318
  .values({schema.Table.md: dataclasses.asdict(self._create_tbl_md())})
299
319
  .where(schema.Table.id == self.id))
300
320
 
301
- version_md = self._create_version_md(ts)
321
+ version_md = self._create_version_md(timestamp)
302
322
  conn.execute(
303
323
  sql.insert(schema.TableVersion.__table__)
304
324
  .values(tbl_id=self.id, version=self.version, md=dataclasses.asdict(version_md)))
@@ -315,6 +335,33 @@ class TableVersion:
315
335
  return f'idx_{self.id.hex}_{idx_id}'
316
336
 
317
337
  def add_index(self, col: Column, idx_name: Optional[str], idx: index.IndexBase) -> UpdateStatus:
338
+ # we're creating a new schema version
339
+ self.version += 1
340
+ preceding_schema_version = self.schema_version
341
+ self.schema_version = self.version
342
+ with Env.get().engine.begin() as conn:
343
+ status = self._add_index(col, idx_name, idx, conn)
344
+ self._update_md(time.time(), preceding_schema_version, conn)
345
+ _logger.info(f'Added index {idx_name} on column {col.name} to table {self.name}')
346
+ return status
347
+
348
+ def _add_default_index(self, col: Column, conn: sql.engine.Connection) -> Optional[UpdateStatus]:
349
+ """Add a B-tree index on this column if it has a compatible type"""
350
+ if not col.stored:
351
+ # if the column is intentionally not stored, we want to avoid the overhead of an index
352
+ return None
353
+ if not col.col_type.is_scalar_type() and not (col.col_type.is_media_type() and not col.is_computed):
354
+ # wrong type for a B-tree
355
+ return None
356
+ if col.col_type.is_bool_type():
357
+ # B-trees on bools aren't useful
358
+ return None
359
+ status = self._add_index(col, idx_name=None, idx=index.BtreeIndex(col), conn=conn)
360
+ return status
361
+
362
+ def _add_index(
363
+ self, col: Column, idx_name: Optional[str], idx: index.IndexBase, conn: sql.engine.Connection
364
+ ) -> UpdateStatus:
318
365
  assert not self.is_snapshot
319
366
  idx_id = self.next_idx_id
320
367
  self.next_idx_id += 1
@@ -324,46 +371,41 @@ class TableVersion:
324
371
  assert is_valid_identifier(idx_name)
325
372
  assert idx_name not in [i.name for i in self.idx_md.values()]
326
373
 
327
- # we're creating a new schema version
328
- self.version += 1
329
- preceding_schema_version = self.schema_version
330
- self.schema_version = self.version
331
- with Env.get().engine.begin() as conn:
332
- # add the index value and undo columns (which need to be nullable);
333
- # we don't create a new schema version, because indices aren't part of the logical schema
334
- val_col = Column(
335
- col_id=self.next_col_id, name=None, computed_with=idx.index_value_expr(),
336
- sa_col_type=idx.index_sa_type(), stored=True,
337
- schema_version_add=self.schema_version, schema_version_drop=None)
338
- val_col.tbl = self
339
- val_col.col_type.nullable = True
340
- self.next_col_id += 1
341
-
342
- undo_col = Column(
343
- col_id=self.next_col_id, name=None, col_type=val_col.col_type,
344
- sa_col_type=val_col.sa_col_type, stored=True,
345
- schema_version_add=self.schema_version, schema_version_drop=None)
346
- undo_col.tbl = self
347
- undo_col.col_type.nullable = True
348
- self.next_col_id += 1
349
-
350
- # create and register the index metadata
351
- idx_cls = type(idx)
352
- idx_md = schema.IndexMd(
353
- id=idx_id, name=idx_name,
354
- indexed_col_id=col.id, index_val_col_id=val_col.id, index_val_undo_col_id=undo_col.id,
355
- schema_version_add=self.schema_version, schema_version_drop=None,
356
- class_fqn=idx_cls.__module__ + '.' + idx_cls.__name__, init_args=idx.as_dict())
357
- idx_info = self.IndexInfo(id=idx_id, name=idx_name, idx=idx, col=col, val_col=val_col, undo_col=undo_col)
358
- self.idx_md[idx_id] = idx_md
359
- self.idxs_by_name[idx_name] = idx_info
360
-
361
- # add the columns and update the metadata
362
- status = self._add_columns([val_col, undo_col], conn, preceding_schema_version=preceding_schema_version)
363
- # now create the index structure
364
- idx.create_index(self._store_idx_name(idx_id), val_col, conn)
365
-
366
- _logger.info(f'Added index {idx_name} on column {col.name} to table {self.name}')
374
+ # add the index value and undo columns (which need to be nullable)
375
+ val_col = Column(
376
+ col_id=self.next_col_id, name=None, computed_with=idx.index_value_expr(),
377
+ sa_col_type=idx.index_sa_type(), stored=True,
378
+ schema_version_add=self.schema_version, schema_version_drop=None,
379
+ records_errors=idx.records_value_errors())
380
+ val_col.tbl = self
381
+ val_col.col_type = val_col.col_type.copy(nullable=True)
382
+ self.next_col_id += 1
383
+
384
+ undo_col = Column(
385
+ col_id=self.next_col_id, name=None, col_type=val_col.col_type,
386
+ sa_col_type=val_col.sa_col_type, stored=True,
387
+ schema_version_add=self.schema_version, schema_version_drop=None,
388
+ records_errors=False)
389
+ undo_col.tbl = self
390
+ undo_col.col_type = undo_col.col_type.copy(nullable=True)
391
+ self.next_col_id += 1
392
+
393
+ # create and register the index metadata
394
+ idx_cls = type(idx)
395
+ idx_md = schema.IndexMd(
396
+ id=idx_id, name=idx_name,
397
+ indexed_col_id=col.id, index_val_col_id=val_col.id, index_val_undo_col_id=undo_col.id,
398
+ schema_version_add=self.schema_version, schema_version_drop=None,
399
+ class_fqn=idx_cls.__module__ + '.' + idx_cls.__name__, init_args=idx.as_dict())
400
+ idx_info = self.IndexInfo(id=idx_id, name=idx_name, idx=idx, col=col, val_col=val_col, undo_col=undo_col)
401
+ self.idx_md[idx_id] = idx_md
402
+ self.idxs_by_name[idx_name] = idx_info
403
+
404
+ # add the columns and update the metadata
405
+ status = self._add_columns([val_col, undo_col], conn)
406
+ # now create the index structure
407
+ idx.create_index(self._store_idx_name(idx_id), val_col, conn)
408
+
367
409
  return status
368
410
 
369
411
  def drop_index(self, idx_id: int) -> None:
@@ -381,7 +423,8 @@ class TableVersion:
381
423
  del self.idxs_by_name[idx_md.name]
382
424
 
383
425
  with Env.get().engine.begin() as conn:
384
- self._drop_columns([idx_info.val_col, idx_info.undo_col], conn, preceding_schema_version)
426
+ self._drop_columns([idx_info.val_col, idx_info.undo_col])
427
+ self._update_md(time.time(), preceding_schema_version, conn)
385
428
  _logger.info(f'Dropped index {idx_md.name} on table {self.name}')
386
429
 
387
430
  def add_column(self, col: Column, print_stats: bool = False) -> UpdateStatus:
@@ -398,16 +441,16 @@ class TableVersion:
398
441
  if col.compute_func is not None:
399
442
  # create value_expr from compute_func
400
443
  self._create_value_expr(col, self.path)
401
- if col.value_expr is not None:
402
- col.check_value_expr()
403
- self._record_value_expr(col)
404
444
 
405
445
  # we're creating a new schema version
406
446
  self.version += 1
407
447
  preceding_schema_version = self.schema_version
408
448
  self.schema_version = self.version
409
449
  with Env.get().engine.begin() as conn:
410
- status = self._add_columns([col], conn, preceding_schema_version, print_stats=print_stats)
450
+ status = self._add_columns([col], conn, print_stats=print_stats)
451
+ _ = self._add_default_index(col, conn)
452
+ # TODO: what to do about errors?
453
+ self._update_md(time.time(), preceding_schema_version, conn)
411
454
  _logger.info(f'Added column {col.name} to table {self.name}, new version: {self.version}')
412
455
 
413
456
  msg = (
@@ -418,13 +461,8 @@ class TableVersion:
418
461
  _logger.info(f'Column {col.name}: {msg}')
419
462
  return status
420
463
 
421
- def _add_columns(
422
- self, cols: List[Column], conn: sql.engine.Connection, preceding_schema_version: Optional[int] = None,
423
- print_stats: bool = False
424
- ) -> UpdateStatus:
464
+ def _add_columns(self, cols: List[Column], conn: sql.engine.Connection, print_stats: bool = False) -> UpdateStatus:
425
465
  """Add and populate columns within the current transaction"""
426
- ts = time.time()
427
-
428
466
  row_count = self.store_tbl.count(conn=conn)
429
467
  for col in cols:
430
468
  if not col.col_type.nullable and not col.is_computed:
@@ -442,6 +480,9 @@ class TableVersion:
442
480
  if col.name is not None:
443
481
  self.cols_by_name[col.name] = col
444
482
  self.cols_by_id[col.id] = col
483
+ if col.value_expr is not None:
484
+ col.check_value_expr()
485
+ self._record_value_expr(col)
445
486
 
446
487
  if col.is_stored:
447
488
  self.store_tbl.add_column(col, conn)
@@ -475,7 +516,6 @@ class TableVersion:
475
516
  finally:
476
517
  plan.close()
477
518
 
478
- self._update_md(ts, preceding_schema_version, conn)
479
519
  if print_stats:
480
520
  plan.ctx.profile.print(num_rows=row_count)
481
521
  # TODO(mkornacker): what to do about system columns with exceptions?
@@ -516,14 +556,14 @@ class TableVersion:
516
556
  # update idxs_by_name
517
557
  for idx_name in dropped_idx_names:
518
558
  del self.idxs_by_name[idx_name]
519
- self._drop_columns(dropped_cols, conn, preceding_schema_version)
559
+ self._drop_columns(dropped_cols)
560
+ self._update_md(time.time(), preceding_schema_version, conn)
520
561
  _logger.info(f'Dropped column {name} from table {self.name}, new version: {self.version}')
521
562
 
522
- def _drop_columns(self, cols: list[Column], conn: sql.engine.Connection, preceding_schema_version: int) -> None:
563
+ def _drop_columns(self, cols: list[Column]) -> None:
523
564
  """Mark columns as dropped"""
524
565
  assert not self.is_snapshot
525
566
 
526
- ts = time.time()
527
567
  for col in cols:
528
568
  if col.value_expr is not None:
529
569
  # update Column.dependent_cols
@@ -539,7 +579,6 @@ class TableVersion:
539
579
  assert col.id in self.cols_by_id
540
580
  del self.cols_by_id[col.id]
541
581
 
542
- self._update_md(ts, preceding_schema_version, conn)
543
582
  self.store_tbl.create_sa_tbl()
544
583
 
545
584
  def rename_column(self, old_name: str, new_name: str) -> None:
@@ -558,13 +597,12 @@ class TableVersion:
558
597
  self.cols_by_name[new_name] = col
559
598
 
560
599
  # we're creating a new schema version
561
- ts = time.time()
562
600
  self.version += 1
563
601
  preceding_schema_version = self.schema_version
564
602
  self.schema_version = self.version
565
603
 
566
604
  with Env.get().engine.begin() as conn:
567
- self._update_md(ts, preceding_schema_version, conn)
605
+ self._update_md(time.time(), preceding_schema_version, conn)
568
606
  _logger.info(f'Renamed column {old_name} to {new_name} in table {self.name}, new version: {self.version}')
569
607
 
570
608
  def set_comment(self, new_comment: Optional[str]):
@@ -579,12 +617,11 @@ class TableVersion:
579
617
 
580
618
  def _create_schema_version(self):
581
619
  # we're creating a new schema version
582
- ts = time.time()
583
620
  self.version += 1
584
621
  preceding_schema_version = self.schema_version
585
622
  self.schema_version = self.version
586
623
  with Env.get().engine.begin() as conn:
587
- self._update_md(ts, preceding_schema_version, conn)
624
+ self._update_md(time.time(), preceding_schema_version, conn)
588
625
  _logger.info(f'[{self.name}] Updating table schema to version: {self.version}')
589
626
 
590
627
  def insert(
@@ -595,12 +632,11 @@ class TableVersion:
595
632
  assert self.is_insertable()
596
633
  from pixeltable.plan import Planner
597
634
  plan = Planner.create_insert_plan(self, rows, ignore_errors=not fail_on_exception)
598
- ts = time.time()
599
635
  with Env.get().engine.begin() as conn:
600
- return self._insert(plan, conn, ts, print_stats)
636
+ return self._insert(plan, conn, time.time(), print_stats)
601
637
 
602
638
  def _insert(
603
- self, exec_plan: exec.ExecNode, conn: sql.engine.Connection, ts: float, print_stats: bool = False,
639
+ self, exec_plan: exec.ExecNode, conn: sql.engine.Connection, timestamp: float, print_stats: bool = False,
604
640
  ) -> UpdateStatus:
605
641
  """Insert rows produced by exec_plan and propagate to views"""
606
642
  # we're creating a new version
@@ -612,13 +648,13 @@ class TableVersion:
612
648
  result.num_excs = num_excs
613
649
  result.num_computed_values += exec_plan.ctx.num_computed_exprs * num_rows
614
650
  result.cols_with_excs = [f'{self.name}.{self.cols_by_id[cid].name}' for cid in cols_with_excs]
615
- self._update_md(ts, None, conn)
651
+ self._update_md(timestamp, None, conn)
616
652
 
617
653
  # update views
618
654
  for view in self.mutable_views:
619
655
  from pixeltable.plan import Planner
620
656
  plan, _ = Planner.create_view_load_plan(view.path, propagates_insert=True)
621
- status = view._insert(plan, conn, ts, print_stats)
657
+ status = view._insert(plan, conn, timestamp, print_stats)
622
658
  result.num_rows += status.num_rows
623
659
  result.num_excs += status.num_excs
624
660
  result.num_computed_values += status.num_computed_values
@@ -662,7 +698,7 @@ class TableVersion:
662
698
  # construct Where clause to match rowid
663
699
  num_rowid_cols = len(self.store_tbl.rowid_columns())
664
700
  for col_idx in range(num_rowid_cols):
665
- assert len(rowids[i]) == num_rowid_cols
701
+ assert len(rowids[i]) == num_rowid_cols, f'len({rowids[i]}) != {num_rowid_cols}'
666
702
  clause = exprs.RowidRef(self, col_idx) == rowids[i][col_idx]
667
703
  if where_clause is None:
668
704
  where_clause = clause
@@ -679,7 +715,7 @@ class TableVersion:
679
715
  where_clause = where_clause & clause
680
716
 
681
717
  update_targets = {col: row[col] for col in row if col not in pk_cols}
682
- status = self._update(conn, update_targets, where_clause, cascade)
718
+ status = self._update(conn, update_targets, where_clause, cascade, show_progress=False)
683
719
  result_status.num_rows += status.num_rows
684
720
  result_status.num_excs += status.num_excs
685
721
  result_status.num_computed_values += status.num_computed_values
@@ -692,7 +728,8 @@ class TableVersion:
692
728
 
693
729
  def _update(
694
730
  self, conn: sql.engine.Connection, update_targets: dict[Column, 'pixeltable.exprs.Expr'],
695
- where_clause: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
731
+ where_clause: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True,
732
+ show_progress: bool = True
696
733
  ) -> UpdateStatus:
697
734
  """Update rows in this table.
698
735
  Args:
@@ -705,28 +742,27 @@ class TableVersion:
705
742
  from pixeltable.plan import Planner
706
743
  plan, updated_cols, recomputed_cols = \
707
744
  Planner.create_update_plan(self.path, update_targets, [], where_clause, cascade)
708
- ts = time.time()
709
745
  result = self._propagate_update(
710
746
  plan, where_clause.sql_expr() if where_clause is not None else None, recomputed_cols,
711
- base_versions=[], conn=conn, ts=ts, cascade=cascade)
747
+ base_versions=[], conn=conn, timestamp=time.time(), cascade=cascade, show_progress=show_progress)
712
748
  result.updated_cols = updated_cols
713
749
  return result
714
750
 
715
751
  def _propagate_update(
716
752
  self, plan: Optional[exec.ExecNode], where_clause: Optional[sql.ClauseElement],
717
753
  recomputed_view_cols: List[Column], base_versions: List[Optional[int]], conn: sql.engine.Connection,
718
- ts: float, cascade: bool
754
+ timestamp: float, cascade: bool, show_progress: bool = True
719
755
  ) -> UpdateStatus:
720
756
  result = UpdateStatus()
721
757
  if plan is not None:
722
758
  # we're creating a new version
723
759
  self.version += 1
724
760
  result.num_rows, result.num_excs, cols_with_excs = \
725
- self.store_tbl.insert_rows(plan, conn, v_min=self.version)
761
+ self.store_tbl.insert_rows(plan, conn, v_min=self.version, show_progress=show_progress)
726
762
  result.cols_with_excs = [f'{self.name}.{self.cols_by_id[cid].name}' for cid in cols_with_excs]
727
763
  self.store_tbl.delete_rows(
728
764
  self.version, base_versions=base_versions, match_on_vmin=True, where_clause=where_clause, conn=conn)
729
- self._update_md(ts, None, conn)
765
+ self._update_md(timestamp, None, conn)
730
766
 
731
767
  if cascade:
732
768
  base_versions = [None if plan is None else self.version] + base_versions # don't update in place
@@ -738,7 +774,7 @@ class TableVersion:
738
774
  from pixeltable.plan import Planner
739
775
  plan = Planner.create_view_update_plan(view.path, recompute_targets=recomputed_cols)
740
776
  status = view._propagate_update(
741
- plan, None, recomputed_view_cols, base_versions=base_versions, conn=conn, ts=ts, cascade=True)
777
+ plan, None, recomputed_view_cols, base_versions=base_versions, conn=conn, timestamp=timestamp, cascade=True)
742
778
  result.num_rows += status.num_rows
743
779
  result.num_excs += status.num_excs
744
780
  result.cols_with_excs += status.cols_with_excs
@@ -754,16 +790,15 @@ class TableVersion:
754
790
  assert self.is_insertable()
755
791
  from pixeltable.plan import Planner
756
792
  analysis_info = Planner.analyze(self, where)
757
- ts = time.time()
758
793
  with Env.get().engine.begin() as conn:
759
- num_rows = self._delete(analysis_info.sql_where_clause, base_versions=[], conn=conn, ts=ts)
794
+ num_rows = self._delete(analysis_info.sql_where_clause, base_versions=[], conn=conn, timestamp=time.time())
760
795
 
761
796
  status = UpdateStatus(num_rows=num_rows)
762
797
  return status
763
798
 
764
799
  def _delete(
765
800
  self, where: Optional['pixeltable.exprs.Predicate'], base_versions: List[Optional[int]],
766
- conn: sql.engine.Connection, ts: float) -> int:
801
+ conn: sql.engine.Connection, timestamp: float) -> int:
767
802
  """Delete rows in this table and propagate to views.
768
803
  Args:
769
804
  where: a Predicate to filter rows to delete.
@@ -777,11 +812,12 @@ class TableVersion:
777
812
  if num_rows > 0:
778
813
  # we're creating a new version
779
814
  self.version += 1
780
- self._update_md(ts, None, conn)
815
+ self._update_md(timestamp, None, conn)
781
816
  else:
782
817
  pass
783
818
  for view in self.mutable_views:
784
- num_rows += view._delete(where=None, base_versions=[self.version] + base_versions, conn=conn, ts=ts)
819
+ num_rows += view._delete(
820
+ where=None, base_versions=[self.version] + base_versions, conn=conn, timestamp=timestamp)
785
821
  return num_rows
786
822
 
787
823
  def revert(self) -> None:
@@ -907,6 +943,32 @@ class TableVersion:
907
943
  view._revert(session)
908
944
  _logger.info(f'TableVersion {self.name}: reverted to version {self.version}')
909
945
 
946
+ @classmethod
947
+ def _init_remote(cls, remote_md: dict[str, Any]) -> Tuple[pixeltable.datatransfer.Remote, dict[str, str]]:
948
+ module = importlib.import_module(remote_md['module'])
949
+ remote_cls = getattr(module, remote_md['class'])
950
+ remote = remote_cls.from_dict(remote_md['remote_md'])
951
+ col_mapping = remote_md['col_mapping']
952
+ return remote, col_mapping
953
+
954
+ def link(self, remote: pixeltable.datatransfer.Remote, col_mapping: dict[str, str]) -> None:
955
+ timestamp = time.time()
956
+ self.version += 1
957
+ self.remotes[remote] = col_mapping
958
+ with Env.get().engine.begin() as conn:
959
+ self._update_md(timestamp, None, conn)
960
+
961
+ def unlink(self, remote: pixeltable.datatransfer.Remote) -> None:
962
+ assert remote in self.remotes
963
+ timestamp = time.time()
964
+ self.version += 1
965
+ del self.remotes[remote]
966
+ with Env.get().engine.begin() as conn:
967
+ self._update_md(timestamp, None, conn)
968
+
969
+ def get_remotes(self) -> dict[pixeltable.datatransfer.Remote, dict[str, str]]:
970
+ return self.remotes
971
+
910
972
  def is_view(self) -> bool:
911
973
  return self.base is not None
912
974
 
@@ -939,16 +1001,16 @@ class TableVersion:
939
1001
  def get_required_col_names(self) -> List[str]:
940
1002
  """Return the names of all columns for which values must be specified in insert()"""
941
1003
  assert not self.is_view()
942
- names = [c.name for c in self.cols if not c.is_computed and not c.col_type.nullable]
1004
+ names = [c.name for c in self.cols_by_name.values() if not c.is_computed and not c.col_type.nullable]
943
1005
  return names
944
1006
 
945
1007
  def get_computed_col_names(self) -> List[str]:
946
1008
  """Return the names of all computed columns"""
947
- names = [c.name for c in self.cols if c.is_computed]
1009
+ names = [c.name for c in self.cols_by_name.values() if c.is_computed]
948
1010
  return names
949
1011
 
950
1012
  @classmethod
951
- def _create_value_expr(cls, col: Column, path: 'TableVersionPath') -> None:
1013
+ def _create_value_expr(cls, col: Column, path: 'pixeltable.catalog.TableVersionPath') -> None:
952
1014
  """
953
1015
  Create col.value_expr, given col.compute_func.
954
1016
  Interprets compute_func's parameters to be references to columns and construct ColumnRefs as args.
@@ -978,13 +1040,17 @@ class TableVersion:
978
1040
  for refd_col in refd_cols:
979
1041
  refd_col.dependent_cols.add(col)
980
1042
 
981
- def get_dependent_columns(self, cols: List[Column]) -> Set[Column]:
1043
+ def get_idx_val_columns(self, cols: Iterable[Column]) -> set[Column]:
1044
+ result = {info.val_col for col in cols for info in col.get_idx_info().values()}
1045
+ return result
1046
+
1047
+ def get_dependent_columns(self, cols: list[Column]) -> set[Column]:
982
1048
  """
983
1049
  Return the set of columns that transitively depend on any of the given ones.
984
1050
  """
985
1051
  if len(cols) == 0:
986
- return []
987
- result: Set[Column] = set()
1052
+ return set()
1053
+ result: set[Column] = set()
988
1054
  for col in cols:
989
1055
  result.update(col.dependent_cols)
990
1056
  result.update(self.get_dependent_columns(result))
@@ -1007,14 +1073,27 @@ class TableVersion:
1007
1073
  value_expr=value_expr_dict, stored=col.stored)
1008
1074
  return column_md
1009
1075
 
1076
+ @classmethod
1077
+ def _create_remotes_md(cls, remotes: dict['pixeltable.datatransfer.Remote', dict[str, str]]) -> list[dict[str, Any]]:
1078
+ return [
1079
+ {
1080
+ 'module': type(remote).__module__,
1081
+ 'class': type(remote).__qualname__,
1082
+ 'remote_md': remote.to_dict(),
1083
+ 'col_mapping': col_mapping
1084
+ }
1085
+ for remote, col_mapping in remotes.items()
1086
+ ]
1087
+
1010
1088
  def _create_tbl_md(self) -> schema.TableMd:
1011
1089
  return schema.TableMd(
1012
1090
  name=self.name, current_version=self.version, current_schema_version=self.schema_version,
1013
1091
  next_col_id=self.next_col_id, next_idx_id=self.next_idx_id, next_row_id=self.next_rowid,
1014
- column_md=self._create_column_md(self.cols), index_md=self.idx_md, view_md=self.view_md)
1092
+ column_md=self._create_column_md(self.cols), index_md=self.idx_md,
1093
+ remotes=self._create_remotes_md(self.remotes), view_md=self.view_md)
1015
1094
 
1016
- def _create_version_md(self, ts: float) -> schema.TableVersionMd:
1017
- return schema.TableVersionMd(created_at=ts, version=self.version, schema_version=self.schema_version)
1095
+ def _create_version_md(self, timestamp: float) -> schema.TableVersionMd:
1096
+ return schema.TableVersionMd(created_at=timestamp, version=self.version, schema_version=self.schema_version)
1018
1097
 
1019
1098
  def _create_schema_version_md(self, preceding_schema_version: int) -> schema.TableSchemaVersionMd:
1020
1099
  column_md: Dict[int, schema.SchemaColumn] = {}
@@ -0,0 +1 @@
1
+ from .remote import Remote