pixeltable 0.2.30__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (60) hide show
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/table.py +212 -173
  4. pixeltable/catalog/table_version.py +2 -1
  5. pixeltable/catalog/view.py +3 -5
  6. pixeltable/dataframe.py +52 -39
  7. pixeltable/env.py +94 -5
  8. pixeltable/exec/__init__.py +1 -1
  9. pixeltable/exec/aggregation_node.py +3 -3
  10. pixeltable/exec/cache_prefetch_node.py +13 -7
  11. pixeltable/exec/component_iteration_node.py +3 -9
  12. pixeltable/exec/data_row_batch.py +17 -5
  13. pixeltable/exec/exec_node.py +32 -12
  14. pixeltable/exec/expr_eval/__init__.py +1 -0
  15. pixeltable/exec/expr_eval/evaluators.py +245 -0
  16. pixeltable/exec/expr_eval/expr_eval_node.py +404 -0
  17. pixeltable/exec/expr_eval/globals.py +114 -0
  18. pixeltable/exec/expr_eval/row_buffer.py +76 -0
  19. pixeltable/exec/expr_eval/schedulers.py +232 -0
  20. pixeltable/exec/in_memory_data_node.py +2 -2
  21. pixeltable/exec/row_update_node.py +14 -14
  22. pixeltable/exec/sql_node.py +2 -2
  23. pixeltable/exprs/column_ref.py +5 -1
  24. pixeltable/exprs/data_row.py +50 -40
  25. pixeltable/exprs/expr.py +57 -12
  26. pixeltable/exprs/function_call.py +54 -19
  27. pixeltable/exprs/inline_expr.py +12 -21
  28. pixeltable/exprs/literal.py +25 -8
  29. pixeltable/exprs/row_builder.py +23 -0
  30. pixeltable/exprs/similarity_expr.py +4 -4
  31. pixeltable/func/__init__.py +5 -5
  32. pixeltable/func/aggregate_function.py +4 -0
  33. pixeltable/func/callable_function.py +54 -6
  34. pixeltable/func/expr_template_function.py +5 -1
  35. pixeltable/func/function.py +54 -13
  36. pixeltable/func/query_template_function.py +56 -10
  37. pixeltable/func/tools.py +51 -14
  38. pixeltable/func/udf.py +7 -1
  39. pixeltable/functions/__init__.py +1 -1
  40. pixeltable/functions/anthropic.py +108 -21
  41. pixeltable/functions/gemini.py +2 -6
  42. pixeltable/functions/huggingface.py +10 -28
  43. pixeltable/functions/openai.py +225 -28
  44. pixeltable/globals.py +8 -5
  45. pixeltable/index/embedding_index.py +90 -38
  46. pixeltable/io/label_studio.py +1 -1
  47. pixeltable/metadata/__init__.py +1 -1
  48. pixeltable/metadata/converters/convert_24.py +11 -2
  49. pixeltable/metadata/converters/convert_25.py +19 -0
  50. pixeltable/metadata/notes.py +1 -0
  51. pixeltable/plan.py +24 -9
  52. pixeltable/store.py +6 -0
  53. pixeltable/type_system.py +4 -7
  54. pixeltable/utils/arrow.py +3 -3
  55. {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/METADATA +5 -11
  56. {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/RECORD +59 -53
  57. pixeltable/exec/expr_eval_node.py +0 -232
  58. {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/LICENSE +0 -0
  59. {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/WHEEL +0 -0
  60. {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/entry_points.txt +0 -0
pixeltable/__init__.py CHANGED
@@ -2,7 +2,7 @@ from .catalog import Column, InsertableTable, Table, UpdateStatus, View
2
2
  from .dataframe import DataFrame
3
3
  from .exceptions import Error
4
4
  from .exprs import RELATIVE_PATH_ROOT
5
- from .func import Aggregator, Function, expr_udf, uda, udf
5
+ from .func import Aggregator, Function, expr_udf, query, uda, udf
6
6
  from .globals import (array, configure_logging, create_dir, create_snapshot, create_table, create_view, drop_dir,
7
7
  drop_table, get_table, init, list_dirs, list_functions, list_tables, move, tool, tools)
8
8
  from .type_system import (Array, ArrayType, Audio, AudioType, Bool, BoolType, ColumnType, Document, DocumentType, Float,
pixeltable/__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  # These version placeholders will be replaced during build.
2
- __version__ = "0.2.30"
3
- __version_tuple__ = (0, 2, 30)
2
+ __version__ = "0.3.1"
3
+ __version_tuple__ = (0, 3, 1)
@@ -25,13 +25,15 @@ from ..exprs import ColumnRef
25
25
  from ..utils.description_helper import DescriptionHelper
26
26
  from ..utils.filecache import FileCache
27
27
  from .column import Column
28
- from .globals import _ROWID_COLUMN_NAME, MediaValidation, UpdateStatus, is_system_column_name, is_valid_identifier, IfNotExistsParam
28
+ from .globals import (_ROWID_COLUMN_NAME, IfExistsParam, IfNotExistsParam, MediaValidation, UpdateStatus,
29
+ is_system_column_name, is_valid_identifier)
29
30
  from .schema_object import SchemaObject
30
31
  from .table_version import TableVersion
31
32
  from .table_version_path import TableVersionPath
32
33
 
33
34
  if TYPE_CHECKING:
34
35
  import torch.utils.data
36
+
35
37
  import pixeltable.plan
36
38
 
37
39
  _logger = logging.getLogger('pixeltable')
@@ -48,20 +50,6 @@ class Table(SchemaObject):
48
50
  super().__init__(id, name, dir_id)
49
51
  self._is_dropped = False
50
52
  self.__tbl_version_path = tbl_version_path
51
- self.__query_scope = self.QueryScope(self)
52
-
53
- class QueryScope:
54
- __table: 'Table'
55
- _queries: dict[str, pxt.func.QueryTemplateFunction]
56
-
57
- def __init__(self, table: 'Table') -> None:
58
- self.__table = table
59
- self._queries = {}
60
-
61
- def __getattr__(self, name: str) -> pxt.func.QueryTemplateFunction:
62
- if name in self._queries:
63
- return self._queries[name]
64
- raise AttributeError(f'Table {self.__table._name!r} has no query with that name: {name!r}')
65
53
 
66
54
  @property
67
55
  def _has_dependents(self) -> bool:
@@ -138,23 +126,12 @@ class Table(SchemaObject):
138
126
  raise excs.Error(f'{self._display_name()} {self._name} has been dropped')
139
127
 
140
128
  def __getattr__(self, name: str) -> 'pxt.exprs.ColumnRef':
141
- """Return a ColumnRef for the given name.
142
- """
129
+ """Return a ColumnRef for the given name."""
143
130
  return self._tbl_version_path.get_column_ref(name)
144
131
 
145
- @overload
146
- def __getitem__(self, name: str) -> 'pxt.exprs.ColumnRef': ...
147
-
148
- @overload
149
- def __getitem__(self, index: Union[exprs.Expr, Sequence[exprs.Expr]]) -> 'pxt.DataFrame': ...
150
-
151
- def __getitem__(self, index):
152
- """Return a ColumnRef or QueryTemplateFunction for the given name, or a DataFrame for the given slice.
153
- """
154
- if isinstance(index, str):
155
- return getattr(self, index)
156
- else:
157
- return self._df()[index]
132
+ def __getitem__(self, name: str) -> 'pxt.exprs.ColumnRef':
133
+ """Return a ColumnRef for the given name."""
134
+ return getattr(self, name)
158
135
 
159
136
  def list_views(self, *, recursive: bool = True) -> list[str]:
160
137
  """
@@ -184,10 +161,6 @@ class Table(SchemaObject):
184
161
  from pixeltable.plan import FromClause
185
162
  return pxt.DataFrame(FromClause(tbls=[self._tbl_version_path]))
186
163
 
187
- @property
188
- def queries(self) -> 'Table.QueryScope':
189
- return self.__query_scope
190
-
191
164
  def select(self, *items: Any, **named_items: Any) -> 'pxt.DataFrame':
192
165
  """ Select columns or expressions from this table.
193
166
 
@@ -264,11 +237,6 @@ class Table(SchemaObject):
264
237
  """Return the schema (column names and column types) of this table."""
265
238
  return {c.name: c.col_type for c in self._tbl_version_path.columns()}
266
239
 
267
- @property
268
- def _query_names(self) -> list[str]:
269
- """Return the names of the registered queries for this table."""
270
- return list(self.__query_scope._queries.keys())
271
-
272
240
  @property
273
241
  def _base(self) -> Optional['Table']:
274
242
  """
@@ -422,28 +390,54 @@ class Table(SchemaObject):
422
390
  """
423
391
  return self._df().to_coco_dataset()
424
392
 
425
- def __setitem__(self, col_name: str, spec: Union[ts.ColumnType, exprs.Expr]) -> None:
426
- """
427
- Adds a column to the table. This is an alternate syntax for `add_column()`; the meaning of
428
-
429
- >>> tbl['new_col'] = pxt.Int
430
-
431
- is exactly equivalent to
393
+ def _column_has_dependents(self, col: Column) -> bool:
394
+ """Returns True if the column has dependents, False otherwise."""
395
+ assert col is not None
396
+ assert col.name in self._schema.keys()
397
+ if any(c.name is not None for c in col.dependent_cols):
398
+ return True
399
+ return any(
400
+ col in store.get_local_columns()
401
+ for view in [self] + self._get_views(recursive=True)
402
+ for store in view._tbl_version.external_stores.values())
432
403
 
433
- >>> tbl.add_column(new_col=pxt.Int)
404
+ def _ignore_or_drop_existing_columns(self, new_col_names: list[str], if_exists: IfExistsParam) -> list[str]:
405
+ """ Check and handle existing columns in the new column specification based on the if_exists parameter.
434
406
 
435
- For details, see the documentation for [`add_column()`][pixeltable.catalog.Table.add_column].
407
+ If `if_exists='ignore'`, returns a list of existing columns, if any, in `new_col_names`.
436
408
  """
437
- self._check_is_dropped()
438
- if not isinstance(col_name, str):
439
- raise excs.Error(f'Column name must be a string, got {type(col_name)}')
440
- if not isinstance(spec, (ts.ColumnType, exprs.Expr, type, _GenericAlias)):
441
- raise excs.Error(f'Column spec must be a ColumnType, Expr, or type, got {type(spec)}')
442
- self.add_column(stored=None, print_stats=False, on_error='abort', **{col_name: spec})
409
+ assert not self.get_metadata()['is_snapshot']
410
+ existing_col_names = set(self._schema.keys())
411
+ cols_to_ignore = []
412
+ for new_col_name in new_col_names:
413
+ if new_col_name in existing_col_names:
414
+ if if_exists == IfExistsParam.ERROR:
415
+ raise excs.Error(f'Duplicate column name: {new_col_name!r}')
416
+ elif if_exists == IfExistsParam.IGNORE:
417
+ cols_to_ignore.append(new_col_name)
418
+ elif if_exists == IfExistsParam.REPLACE or if_exists == IfExistsParam.REPLACE_FORCE:
419
+ if new_col_name not in self._tbl_version.cols_by_name:
420
+ # for views, it is possible that the existing column
421
+ # is a base table column; in that case, we should not
422
+ # drop/replace that column. Continue to raise error.
423
+ raise excs.Error(
424
+ f'Column {new_col_name!r} is a base table column. Cannot replace it.'
425
+ )
426
+ col = self._tbl_version.cols_by_name[new_col_name]
427
+ # cannot drop a column with dependents; so reject
428
+ # replace directive if column has dependents.
429
+ if self._column_has_dependents(col):
430
+ raise excs.Error(
431
+ f'Column {new_col_name!r} already exists and has dependents. Cannot {if_exists.name.lower()} it.'
432
+ )
433
+ self.drop_column(new_col_name)
434
+ assert new_col_name not in self._tbl_version.cols_by_name
435
+ return cols_to_ignore
443
436
 
444
437
  def add_columns(
445
438
  self,
446
- schema: dict[str, Union[ts.ColumnType, builtins.type, _GenericAlias]]
439
+ schema: dict[str, Union[ts.ColumnType, builtins.type, _GenericAlias]],
440
+ if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error'
447
441
  ) -> UpdateStatus:
448
442
  """
449
443
  Adds multiple columns to the table. The columns must be concrete (non-computed) columns; to add computed columns,
@@ -454,12 +448,21 @@ class Table(SchemaObject):
454
448
 
455
449
  Args:
456
450
  schema: A dictionary mapping column names to types.
451
+ if_exists: Determines the behavior if a column already exists. Must be one of the following:
452
+
453
+ - `'error'`: an exception will be raised.
454
+ - `'ignore'`: do nothing and return.
455
+ - `'replace'` or `'replace_force'`: drop the existing column and add the new column, if it has no dependents.
456
+
457
+ Note that the `if_exists` parameter is applied to all columns in the schema.
458
+ To apply different behaviors to different columns, please use [`add_column()`][pixeltable.Table.add_column] for each column.
457
459
 
458
460
  Returns:
459
461
  Information about the execution status of the operation.
460
462
 
461
463
  Raises:
462
- Error: If any column name is invalid or already exists.
464
+ Error: If any column name is invalid, or already exists and `if_exists='error'`,
465
+ or `if_exists='replace*'` but the column has dependents or is a basetable column.
463
466
 
464
467
  Examples:
465
468
  Add multiple columns to the table `my_table`:
@@ -472,49 +475,51 @@ class Table(SchemaObject):
472
475
  ... tbl.add_columns(schema)
473
476
  """
474
477
  self._check_is_dropped()
478
+ if self.get_metadata()['is_snapshot']:
479
+ raise excs.Error('Cannot add column to a snapshot.')
475
480
  col_schema = {
476
481
  col_name: {'type': ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)}
477
482
  for col_name, spec in schema.items()
478
483
  }
484
+ # handle existing columns based on if_exists parameter
485
+ cols_to_ignore = self._ignore_or_drop_existing_columns(list(col_schema.keys()), IfExistsParam.validated(if_exists, 'if_exists'))
486
+ # if all columns to be added already exist and user asked to ignore
487
+ # existing columns, there's nothing to do.
488
+ for cname in cols_to_ignore:
489
+ assert cname in col_schema
490
+ del col_schema[cname]
491
+ if len(col_schema) == 0:
492
+ return UpdateStatus()
479
493
  new_cols = self._create_columns(col_schema)
480
494
  for new_col in new_cols:
481
- self._verify_column(new_col, set(self._schema.keys()), set(self._query_names))
495
+ self._verify_column(new_col)
482
496
  status = self._tbl_version.add_columns(new_cols, print_stats=False, on_error='abort')
483
497
  FileCache.get().emit_eviction_warnings()
484
498
  return status
485
499
 
486
- # TODO: add_column() still supports computed columns for backward-compatibility. In the future, computed columns
487
- # will be supported only through add_computed_column(). At that point, we can remove the `stored`,
488
- # `print_stats`, and `on_error` parameters, and change the method body to simply call self.add_columns(kwargs),
489
- # simplifying the code. For the time being, there's some obvious code duplication.
490
500
  def add_column(
491
501
  self,
492
502
  *,
493
- stored: Optional[bool] = None,
494
- print_stats: bool = False,
495
- on_error: Literal['abort', 'ignore'] = 'abort',
503
+ if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
496
504
  **kwargs: Union[ts.ColumnType, builtins.type, _GenericAlias, exprs.Expr]
497
505
  ) -> UpdateStatus:
498
506
  """
499
- Adds a column to the table.
507
+ Adds an ordinary (non-computed) column to the table.
500
508
 
501
509
  Args:
502
510
  kwargs: Exactly one keyword argument of the form `col_name=col_type`.
503
- stored: Whether the column is materialized and stored or computed on demand. Only valid for image columns.
504
- print_stats: If `True`, print execution metrics during evaluation.
505
- on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
506
- row.
511
+ if_exists: Determines the behavior if the column already exists. Must be one of the following:
507
512
 
508
- - `'abort'`: an exception will be raised and the column will not be added.
509
- - `'ignore'`: execution will continue and the column will be added. Any rows
510
- with errors will have a `None` value for the column, with information about the error stored in the
511
- corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
513
+ - `'error'`: an exception will be raised.
514
+ - `'ignore'`: do nothing and return.
515
+ - `'replace'` or `'replace_force'`: drop the existing column and add the new column, if it has no dependents.
512
516
 
513
517
  Returns:
514
518
  Information about the execution status of the operation.
515
519
 
516
520
  Raises:
517
- Error: If the column name is invalid or already exists.
521
+ Error: If the column name is invalid, or already exists and `if_exists='error'`,
522
+ or `if_exists='replace*'` but the column has dependents or is a basetable column.
518
523
 
519
524
  Examples:
520
525
  Add an int column:
@@ -526,29 +531,22 @@ class Table(SchemaObject):
526
531
  >>> tbl['new_col'] = pxt.Int
527
532
  """
528
533
  self._check_is_dropped()
534
+ # verify kwargs
535
+ if self._tbl_version.is_snapshot:
536
+ raise excs.Error('Cannot add column to a snapshot.')
529
537
  # verify kwargs and construct column schema dict
530
538
  if len(kwargs) != 1:
531
539
  raise excs.Error(
532
540
  f'add_column() requires exactly one keyword argument of the form "col_name=col_type"; '
533
- f'got {len(kwargs)} instead ({", ".join(list(kwargs.keys()))})'
541
+ f'got {len(kwargs)} instead ({", ".join(kwargs.keys())})'
534
542
  )
535
- col_name, spec = next(iter(kwargs.items()))
536
- if not is_valid_identifier(col_name):
537
- raise excs.Error(f'Invalid column name: {col_name!r}')
538
-
539
- col_schema: dict[str, Any] = {}
540
- if isinstance(spec, (ts.ColumnType, builtins.type, _GenericAlias)):
541
- col_schema['type'] = ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)
542
- else:
543
- col_schema['value'] = spec
544
- if stored is not None:
545
- col_schema['stored'] = stored
543
+ col_type = next(iter(kwargs.values()))
544
+ if not isinstance(col_type, (ts.ColumnType, type, _GenericAlias)):
545
+ raise excs.Error(
546
+ f'The argument to add_column() must be a type; did you intend to use add_computed_column() instead?'
547
+ )
548
+ return self.add_columns(kwargs, if_exists=if_exists)
546
549
 
547
- new_col = self._create_columns({col_name: col_schema})[0]
548
- self._verify_column(new_col, set(self._schema.keys()), set(self._query_names))
549
- status = self._tbl_version.add_columns([new_col], print_stats=print_stats, on_error=on_error)
550
- FileCache.get().emit_eviction_warnings()
551
- return status
552
550
 
553
551
  def add_computed_column(
554
552
  self,
@@ -556,6 +554,7 @@ class Table(SchemaObject):
556
554
  stored: Optional[bool] = None,
557
555
  print_stats: bool = False,
558
556
  on_error: Literal['abort', 'ignore'] = 'abort',
557
+ if_exists: Literal['error', 'ignore', 'replace'] = 'error',
559
558
  **kwargs: exprs.Expr
560
559
  ) -> UpdateStatus:
561
560
  """
@@ -563,12 +562,27 @@ class Table(SchemaObject):
563
562
 
564
563
  Args:
565
564
  kwargs: Exactly one keyword argument of the form `col_name=expression`.
565
+ stored: Whether the column is materialized and stored or computed on demand. Only valid for image columns.
566
+ print_stats: If `True`, print execution metrics during evaluation.
567
+ on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
568
+ row.
569
+
570
+ - `'abort'`: an exception will be raised and the column will not be added.
571
+ - `'ignore'`: execution will continue and the column will be added. Any rows
572
+ with errors will have a `None` value for the column, with information about the error stored in the
573
+ corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
574
+ if_exists: Determines the behavior if the column already exists. Must be one of the following:
575
+
576
+ - `'error'`: an exception will be raised.
577
+ - `'ignore'`: do nothing and return.
578
+ - `'replace'` or `'replace_force'`: drop the existing column and add the new column, iff it has no dependents.
566
579
 
567
580
  Returns:
568
581
  Information about the execution status of the operation.
569
582
 
570
583
  Raises:
571
- Error: If the column name is invalid or already exists.
584
+ Error: If the column name is invalid or already exists and `if_exists='error'`,
585
+ or `if_exists='replace*'` but the column has dependents or is a basetable column.
572
586
 
573
587
  Examples:
574
588
  For a table with an image column `frame`, add an image column `rotated` that rotates the image by
@@ -581,6 +595,8 @@ class Table(SchemaObject):
581
595
  >>> tbl.add_computed_column(rotated=tbl.frame.rotate(90), stored=False)
582
596
  """
583
597
  self._check_is_dropped()
598
+ if self.get_metadata()['is_snapshot']:
599
+ raise excs.Error('Cannot add column to a snapshot.')
584
600
  if len(kwargs) != 1:
585
601
  raise excs.Error(
586
602
  f'add_computed_column() requires exactly one keyword argument of the form "column-name=type|value-expression"; '
@@ -594,8 +610,16 @@ class Table(SchemaObject):
594
610
  if stored is not None:
595
611
  col_schema['stored'] = stored
596
612
 
613
+ # handle existing columns based on if_exists parameter
614
+ cols_to_ignore = self._ignore_or_drop_existing_columns([col_name], IfExistsParam.validated(if_exists, 'if_exists'))
615
+ # if the column to add already exists and user asked to ignore
616
+ # existing column, there's nothing to do.
617
+ if len(cols_to_ignore) != 0:
618
+ assert cols_to_ignore[0] == col_name
619
+ return UpdateStatus()
620
+
597
621
  new_col = self._create_columns({col_name: col_schema})[0]
598
- self._verify_column(new_col, set(self._schema.keys()), set(self._query_names))
622
+ self._verify_column(new_col)
599
623
  status = self._tbl_version.add_columns([new_col], print_stats=print_stats, on_error=on_error)
600
624
  FileCache.get().emit_eviction_warnings()
601
625
  return status
@@ -675,18 +699,12 @@ class Table(SchemaObject):
675
699
  return columns
676
700
 
677
701
  @classmethod
678
- def _verify_column(
679
- cls, col: Column, existing_column_names: set[str], existing_query_names: Optional[set[str]] = None
680
- ) -> None:
702
+ def _verify_column(cls, col: Column) -> None:
681
703
  """Check integrity of user-supplied Column and supply defaults"""
682
704
  if is_system_column_name(col.name):
683
705
  raise excs.Error(f'{col.name!r} is a reserved name in Pixeltable; please choose a different column name.')
684
706
  if not is_valid_identifier(col.name):
685
707
  raise excs.Error(f"Invalid column name: {col.name!r}")
686
- if col.name in existing_column_names:
687
- raise excs.Error(f'Duplicate column name: {col.name!r}')
688
- if existing_query_names is not None and col.name in existing_query_names:
689
- raise excs.Error(f'Column name conflicts with a registered query: {col.name!r}')
690
708
  if col.stored is False and not (col.is_computed and col.col_type.is_image_type()):
691
709
  raise excs.Error(f'Column {col.name!r}: stored={col.stored} only applies to computed image columns')
692
710
  if col.stored is False and col.has_window_fn_call():
@@ -699,7 +717,7 @@ class Table(SchemaObject):
699
717
  """Check integrity of user-supplied schema and set defaults"""
700
718
  column_names: set[str] = set()
701
719
  for col in schema:
702
- cls._verify_column(col, column_names)
720
+ cls._verify_column(col)
703
721
  column_names.add(col.name)
704
722
 
705
723
  def __check_column_name_exists(self, column_name: str, include_bases: bool = False) -> None:
@@ -809,61 +827,108 @@ class Table(SchemaObject):
809
827
  """
810
828
  self._tbl_version.rename_column(old_name, new_name)
811
829
 
830
+ def _list_index_info_for_test(self) -> list[dict[str, Any]]:
831
+ """
832
+ Returns list of all the indexes on this table. Used for testing.
833
+
834
+ Returns:
835
+ A list of index information, each containing the index's
836
+ id, name, and the name of the column it indexes.
837
+ """
838
+ assert not self._is_dropped
839
+ index_info = []
840
+ for idx_name, idx in self._tbl_version.idxs_by_name.items():
841
+ index_info.append({
842
+ '_id': idx.id,
843
+ '_name': idx_name,
844
+ '_column': idx.col.name
845
+ })
846
+ return index_info
847
+
812
848
  def add_embedding_index(
813
849
  self, column: Union[str, ColumnRef], *, idx_name: Optional[str] = None,
850
+ embedding: Optional[pxt.Function] = None,
814
851
  string_embed: Optional[pxt.Function] = None, image_embed: Optional[pxt.Function] = None,
815
- metric: str = 'cosine'
852
+ metric: str = 'cosine',
853
+ if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error'
816
854
  ) -> None:
817
855
  """
818
- Add an embedding index to the table. Once the index is added, it will be automatically kept up to data as new
856
+ Add an embedding index to the table. Once the index is created, it will be automatically kept up-to-date as new
819
857
  rows are inserted into the table.
820
858
 
821
- Indices are currently supported only for `String` and `Image` columns. The index must specify, at
822
- minimum, an embedding of the appropriate type (string or image). It may optionally specify _both_ a string
823
- and image embedding (into the same vector space); in particular, this can be used to provide similarity search
824
- of text over an image column.
859
+ To add an embedding index, one must specify, at minimum, the column to be indexed and an embedding UDF.
860
+ Only `String` and `Image` columns are currently supported. Here's an example that uses a
861
+ [CLIP embedding][pixeltable.functions.huggingface.clip] to index an image column:
862
+
863
+ >>> from pixeltable.functions.huggingface import clip
864
+ ... embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
865
+ ... tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
866
+
867
+ Once the index is created, similarity lookups can be performed using the `similarity` pseudo-function.
868
+
869
+ >>> reference_img = PIL.Image.open('my_image.jpg')
870
+ ... sim = tbl.img.similarity(reference_img)
871
+ ... tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
872
+
873
+ If the embedding UDF is a multimodal embedding (supporting more than one data type), then lookups may be
874
+ performed using any of its supported types. In our example, CLIP supports both text and images, so we can
875
+ also search for images using a text description:
876
+
877
+ >>> sim = tbl.img.similarity('a picture of a train')
878
+ ... tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
825
879
 
826
880
  Args:
827
- column: The name of, or reference to, the column to index; must be a `String` or `Image` column.
828
- idx_name: The name of index. If not specified, a name such as `'idx0'` will be generated automatically.
829
- If specified, the name must be unique for this table.
830
- string_embed: A function to embed text; required if the column is a `String` column.
831
- image_embed: A function to embed images; required if the column is an `Image` column.
832
- metric: Distance metric to use for the index; one of `'cosine'`, `'ip'`, or `'l2'`;
833
- the default is `'cosine'`.
881
+ column: The name of, or reference to, the column to be indexed; must be a `String` or `Image` column.
882
+ idx_name: An optional name for the index. If not specified, a name such as `'idx0'` will be generated
883
+ automatically. If specified, the name must be unique for this table.
884
+ embedding: The UDF to use for the embedding. Must be a UDF that accepts a single argument of type `String`
885
+ or `Image` (as appropriate for the column being indexed) and returns a fixed-size 1-dimensional
886
+ array of floats.
887
+ string_embed: An optional UDF to use for the string embedding component of this index.
888
+ Can be used in conjunction with `image_embed` to construct multimodal embeddings manually, by
889
+ specifying different embedding functions for different data types.
890
+ image_embed: An optional UDF to use for the image embedding component of this index.
891
+ Can be used in conjunction with `string_embed` to construct multimodal embeddings manually, by
892
+ specifying different embedding functions for different data types.
893
+ metric: Distance metric to use for the index; one of `'cosine'`, `'ip'`, or `'l2'`.
894
+ The default is `'cosine'`.
895
+ if_exists: Directive for handling an existing index with the same name. Must be one of the following:
896
+
897
+ - `'error'`: raise an error if an index with the same name already exists.
898
+ - `'ignore'`: do nothing if an index with the same name already exists.
899
+ - `'replace'` or `'replace_force'`: replace the existing index with the new one.
834
900
 
835
901
  Raises:
836
- Error: If an index with that name already exists for the table, or if the specified column does not exist.
902
+ Error: If an index with the specified name already exists for the table and `if_exists='error'`, or if the specified column does not exist.
837
903
 
838
904
  Examples:
839
- Add an index to the `img` column of the table `my_table` by column name:
905
+ Add an index to the `img` column of the table `my_table`:
840
906
 
841
- >>> tbl = pxt.get_table('my_table')
842
- ... tbl.add_embedding_index('img', image_embed=my_image_func)
907
+ >>> from pixeltable.functions.huggingface import clip
908
+ ... tbl = pxt.get_table('my_table')
909
+ ... embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
910
+ ... tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
843
911
 
844
- Add an index to the `img` column of the table `my_table` by column reference:
845
- >>> tbl = pxt.get_table('my_table')
846
- ... tbl.add_embedding_index(tbl.img, image_embed=my_image_func)
912
+ Alternatively, the `img` column may be specified by name:
913
+
914
+ >>> tbl.add_embedding_index('img', embedding=embedding_fn)
847
915
 
848
- Add another index to the `img` column, using the inner product as the distance metric,
849
- and with a specific name; `string_embed` is also specified in order to search with text:
916
+ Add a second index to the `img` column, using the inner product as the distance metric,
917
+ and with a specific name:
850
918
 
851
919
  >>> tbl.add_embedding_index(
852
- ... 'img',
853
- ... idx_name='clip_idx',
854
- ... image_embed=my_image_func,
855
- ... string_embed=my_string_func,
920
+ ... tbl.img,
921
+ ... idx_name='ip_idx',
922
+ ... embedding=embedding_fn,
856
923
  ... metric='ip'
857
924
  ... )
858
925
 
859
- Alternatively:
926
+ Add an index using separately specified string and image embeddings:
860
927
 
861
928
  >>> tbl.add_embedding_index(
862
929
  ... tbl.img,
863
- ... idx_name='clip_idx',
864
- ... image_embed=my_image_func,
865
- ... string_embed=my_string_func,
866
- ... metric='ip'
930
+ ... string_embed=string_embedding_fn,
931
+ ... image_embed=image_embedding_fn
867
932
  ... )
868
933
  """
869
934
  if self._tbl_version_path.is_snapshot():
@@ -877,11 +942,22 @@ class Table(SchemaObject):
877
942
  col = column.col
878
943
 
879
944
  if idx_name is not None and idx_name in self._tbl_version.idxs_by_name:
880
- raise excs.Error(f'Duplicate index name: {idx_name}')
945
+ _if_exists = IfExistsParam.validated(if_exists, 'if_exists')
946
+ # An index with the same name already exists.
947
+ # Handle it according to if_exists.
948
+ if _if_exists == IfExistsParam.ERROR:
949
+ raise excs.Error(f'Duplicate index name: {idx_name}')
950
+ if not isinstance(self._tbl_version.idxs_by_name[idx_name].idx, index.EmbeddingIndex):
951
+ raise excs.Error(f'Index `{idx_name}` is not an embedding index. Cannot {_if_exists.name.lower()} it.')
952
+ if _if_exists == IfExistsParam.IGNORE:
953
+ return
954
+ assert _if_exists == IfExistsParam.REPLACE or _if_exists == IfExistsParam.REPLACE_FORCE
955
+ self.drop_index(idx_name=idx_name)
956
+ assert idx_name not in self._tbl_version.idxs_by_name
881
957
  from pixeltable.index import EmbeddingIndex
882
958
 
883
959
  # create the EmbeddingIndex instance to verify args
884
- idx = EmbeddingIndex(col, metric=metric, string_embed=string_embed, image_embed=image_embed)
960
+ idx = EmbeddingIndex(col, metric=metric, embed=embedding, string_embed=string_embed, image_embed=image_embed)
885
961
  status = self._tbl_version.add_index(col, idx_name=idx_name, idx=idx)
886
962
  # TODO: how to deal with exceptions here? drop the index and raise?
887
963
  FileCache.get().emit_eviction_warnings()
@@ -1255,43 +1331,6 @@ class Table(SchemaObject):
1255
1331
  raise excs.Error('Cannot revert a snapshot')
1256
1332
  self._tbl_version.revert()
1257
1333
 
1258
- @overload
1259
- def query(self, py_fn: Callable) -> 'pxt.func.QueryTemplateFunction': ...
1260
-
1261
- @overload
1262
- def query(
1263
- self, *, param_types: Optional[list[ts.ColumnType]] = None
1264
- ) -> Callable[[Callable], 'pxt.func.QueryTemplateFunction']: ...
1265
-
1266
- def query(self, *args: Any, **kwargs: Any) -> Any:
1267
- def make_query_template(
1268
- py_fn: Callable, param_types: Optional[list[ts.ColumnType]]
1269
- ) -> 'pxt.func.QueryTemplateFunction':
1270
- if py_fn.__module__ != '__main__' and py_fn.__name__.isidentifier():
1271
- # this is a named function in a module
1272
- function_path = f'{py_fn.__module__}.{py_fn.__qualname__}'
1273
- else:
1274
- function_path = None
1275
- query_name = py_fn.__name__
1276
- if query_name in self._schema.keys():
1277
- raise excs.Error(f'Query name {query_name!r} conflicts with existing column')
1278
- if query_name in self.__query_scope._queries and function_path is not None:
1279
- raise excs.Error(f'Duplicate query name: {query_name!r}')
1280
- query_fn = pxt.func.QueryTemplateFunction.create(
1281
- py_fn, param_types=param_types, path=function_path, name=query_name)
1282
- self.__query_scope._queries[query_name] = query_fn
1283
- return query_fn
1284
-
1285
- # TODO: verify that the inferred return type matches that of the template
1286
- # TODO: verify that the signature doesn't contain batched parameters
1287
-
1288
- if len(args) == 1:
1289
- assert len(kwargs) == 0 and callable(args[0])
1290
- return make_query_template(args[0], None)
1291
- else:
1292
- assert len(args) == 0 and len(kwargs) == 1 and 'param_types' in kwargs
1293
- return lambda py_fn: make_query_template(py_fn, kwargs['param_types'])
1294
-
1295
1334
  @property
1296
1335
  def external_stores(self) -> list[str]:
1297
1336
  return list(self._tbl_version.external_stores.keys())
@@ -1381,7 +1420,7 @@ class Table(SchemaObject):
1381
1420
  return sync_status
1382
1421
 
1383
1422
  def __dir__(self) -> list[str]:
1384
- return list(super().__dir__()) + list(self._schema.keys()) + self._query_names
1423
+ return list(super().__dir__()) + list(self._schema.keys())
1385
1424
 
1386
1425
  def _ipython_key_completions_(self) -> list[str]:
1387
- return list(self._schema.keys()) + self._query_names
1426
+ return list(self._schema.keys())
@@ -734,7 +734,8 @@ class TableVersion:
734
734
  if conn is None:
735
735
  with Env.get().engine.begin() as conn:
736
736
  return self._insert(
737
- plan, conn, time.time(), print_stats=print_stats, rowids=rowids(), abort_on_exc=fail_on_exception)
737
+ plan, conn, time.time(), print_stats=print_stats, rowids=rowids(),
738
+ abort_on_exc=fail_on_exception)
738
739
  else:
739
740
  return self._insert(
740
741
  plan, conn, time.time(), print_stats=print_stats, rowids=rowids(), abort_on_exc=fail_on_exception)
@@ -16,7 +16,7 @@ from pixeltable.iterators import ComponentIterator
16
16
 
17
17
  from .catalog import Catalog
18
18
  from .column import Column
19
- from .globals import _POS_COLUMN_NAME, UpdateStatus, MediaValidation
19
+ from .globals import _POS_COLUMN_NAME, MediaValidation, UpdateStatus
20
20
  from .table import Table
21
21
  from .table_version import TableVersion
22
22
  from .table_version_path import TableVersionPath
@@ -166,13 +166,11 @@ class View(Table):
166
166
  return view
167
167
 
168
168
  @classmethod
169
- def _verify_column(
170
- cls, col: Column, existing_column_names: set[str], existing_query_names: Optional[set[str]] = None
171
- ) -> None:
169
+ def _verify_column(cls, col: Column) -> None:
172
170
  # make sure that columns are nullable or have a default
173
171
  if not col.col_type.nullable and not col.is_computed:
174
172
  raise excs.Error(f'Column {col.name}: non-computed columns in views must be nullable')
175
- super()._verify_column(col, existing_column_names, existing_query_names)
173
+ super()._verify_column(col)
176
174
 
177
175
  @classmethod
178
176
  def _get_snapshot_path(cls, tbl_version_path: TableVersionPath) -> TableVersionPath: