pixeltable 0.2.30__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/table.py +212 -173
- pixeltable/catalog/table_version.py +2 -1
- pixeltable/catalog/view.py +3 -5
- pixeltable/dataframe.py +52 -39
- pixeltable/env.py +94 -5
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/aggregation_node.py +3 -3
- pixeltable/exec/cache_prefetch_node.py +13 -7
- pixeltable/exec/component_iteration_node.py +3 -9
- pixeltable/exec/data_row_batch.py +17 -5
- pixeltable/exec/exec_node.py +32 -12
- pixeltable/exec/expr_eval/__init__.py +1 -0
- pixeltable/exec/expr_eval/evaluators.py +245 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +404 -0
- pixeltable/exec/expr_eval/globals.py +114 -0
- pixeltable/exec/expr_eval/row_buffer.py +76 -0
- pixeltable/exec/expr_eval/schedulers.py +232 -0
- pixeltable/exec/in_memory_data_node.py +2 -2
- pixeltable/exec/row_update_node.py +14 -14
- pixeltable/exec/sql_node.py +2 -2
- pixeltable/exprs/column_ref.py +5 -1
- pixeltable/exprs/data_row.py +50 -40
- pixeltable/exprs/expr.py +57 -12
- pixeltable/exprs/function_call.py +54 -19
- pixeltable/exprs/inline_expr.py +12 -21
- pixeltable/exprs/literal.py +25 -8
- pixeltable/exprs/row_builder.py +23 -0
- pixeltable/exprs/similarity_expr.py +4 -4
- pixeltable/func/__init__.py +5 -5
- pixeltable/func/aggregate_function.py +4 -0
- pixeltable/func/callable_function.py +54 -6
- pixeltable/func/expr_template_function.py +5 -1
- pixeltable/func/function.py +54 -13
- pixeltable/func/query_template_function.py +56 -10
- pixeltable/func/tools.py +51 -14
- pixeltable/func/udf.py +7 -1
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/anthropic.py +108 -21
- pixeltable/functions/gemini.py +2 -6
- pixeltable/functions/huggingface.py +10 -28
- pixeltable/functions/openai.py +225 -28
- pixeltable/globals.py +8 -5
- pixeltable/index/embedding_index.py +90 -38
- pixeltable/io/label_studio.py +1 -1
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_24.py +11 -2
- pixeltable/metadata/converters/convert_25.py +19 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/plan.py +24 -9
- pixeltable/store.py +6 -0
- pixeltable/type_system.py +4 -7
- pixeltable/utils/arrow.py +3 -3
- {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/METADATA +5 -11
- {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/RECORD +59 -53
- pixeltable/exec/expr_eval_node.py +0 -232
- {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/entry_points.txt +0 -0
pixeltable/__init__.py
CHANGED
|
@@ -2,7 +2,7 @@ from .catalog import Column, InsertableTable, Table, UpdateStatus, View
|
|
|
2
2
|
from .dataframe import DataFrame
|
|
3
3
|
from .exceptions import Error
|
|
4
4
|
from .exprs import RELATIVE_PATH_ROOT
|
|
5
|
-
from .func import Aggregator, Function, expr_udf, uda, udf
|
|
5
|
+
from .func import Aggregator, Function, expr_udf, query, uda, udf
|
|
6
6
|
from .globals import (array, configure_logging, create_dir, create_snapshot, create_table, create_view, drop_dir,
|
|
7
7
|
drop_table, get_table, init, list_dirs, list_functions, list_tables, move, tool, tools)
|
|
8
8
|
from .type_system import (Array, ArrayType, Audio, AudioType, Bool, BoolType, ColumnType, Document, DocumentType, Float,
|
pixeltable/__version__.py
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
# These version placeholders will be replaced during build.
|
|
2
|
-
__version__ = "0.
|
|
3
|
-
__version_tuple__ = (0,
|
|
2
|
+
__version__ = "0.3.1"
|
|
3
|
+
__version_tuple__ = (0, 3, 1)
|
pixeltable/catalog/table.py
CHANGED
|
@@ -25,13 +25,15 @@ from ..exprs import ColumnRef
|
|
|
25
25
|
from ..utils.description_helper import DescriptionHelper
|
|
26
26
|
from ..utils.filecache import FileCache
|
|
27
27
|
from .column import Column
|
|
28
|
-
from .globals import _ROWID_COLUMN_NAME,
|
|
28
|
+
from .globals import (_ROWID_COLUMN_NAME, IfExistsParam, IfNotExistsParam, MediaValidation, UpdateStatus,
|
|
29
|
+
is_system_column_name, is_valid_identifier)
|
|
29
30
|
from .schema_object import SchemaObject
|
|
30
31
|
from .table_version import TableVersion
|
|
31
32
|
from .table_version_path import TableVersionPath
|
|
32
33
|
|
|
33
34
|
if TYPE_CHECKING:
|
|
34
35
|
import torch.utils.data
|
|
36
|
+
|
|
35
37
|
import pixeltable.plan
|
|
36
38
|
|
|
37
39
|
_logger = logging.getLogger('pixeltable')
|
|
@@ -48,20 +50,6 @@ class Table(SchemaObject):
|
|
|
48
50
|
super().__init__(id, name, dir_id)
|
|
49
51
|
self._is_dropped = False
|
|
50
52
|
self.__tbl_version_path = tbl_version_path
|
|
51
|
-
self.__query_scope = self.QueryScope(self)
|
|
52
|
-
|
|
53
|
-
class QueryScope:
|
|
54
|
-
__table: 'Table'
|
|
55
|
-
_queries: dict[str, pxt.func.QueryTemplateFunction]
|
|
56
|
-
|
|
57
|
-
def __init__(self, table: 'Table') -> None:
|
|
58
|
-
self.__table = table
|
|
59
|
-
self._queries = {}
|
|
60
|
-
|
|
61
|
-
def __getattr__(self, name: str) -> pxt.func.QueryTemplateFunction:
|
|
62
|
-
if name in self._queries:
|
|
63
|
-
return self._queries[name]
|
|
64
|
-
raise AttributeError(f'Table {self.__table._name!r} has no query with that name: {name!r}')
|
|
65
53
|
|
|
66
54
|
@property
|
|
67
55
|
def _has_dependents(self) -> bool:
|
|
@@ -138,23 +126,12 @@ class Table(SchemaObject):
|
|
|
138
126
|
raise excs.Error(f'{self._display_name()} {self._name} has been dropped')
|
|
139
127
|
|
|
140
128
|
def __getattr__(self, name: str) -> 'pxt.exprs.ColumnRef':
|
|
141
|
-
"""Return a ColumnRef for the given name.
|
|
142
|
-
"""
|
|
129
|
+
"""Return a ColumnRef for the given name."""
|
|
143
130
|
return self._tbl_version_path.get_column_ref(name)
|
|
144
131
|
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
@overload
|
|
149
|
-
def __getitem__(self, index: Union[exprs.Expr, Sequence[exprs.Expr]]) -> 'pxt.DataFrame': ...
|
|
150
|
-
|
|
151
|
-
def __getitem__(self, index):
|
|
152
|
-
"""Return a ColumnRef or QueryTemplateFunction for the given name, or a DataFrame for the given slice.
|
|
153
|
-
"""
|
|
154
|
-
if isinstance(index, str):
|
|
155
|
-
return getattr(self, index)
|
|
156
|
-
else:
|
|
157
|
-
return self._df()[index]
|
|
132
|
+
def __getitem__(self, name: str) -> 'pxt.exprs.ColumnRef':
|
|
133
|
+
"""Return a ColumnRef for the given name."""
|
|
134
|
+
return getattr(self, name)
|
|
158
135
|
|
|
159
136
|
def list_views(self, *, recursive: bool = True) -> list[str]:
|
|
160
137
|
"""
|
|
@@ -184,10 +161,6 @@ class Table(SchemaObject):
|
|
|
184
161
|
from pixeltable.plan import FromClause
|
|
185
162
|
return pxt.DataFrame(FromClause(tbls=[self._tbl_version_path]))
|
|
186
163
|
|
|
187
|
-
@property
|
|
188
|
-
def queries(self) -> 'Table.QueryScope':
|
|
189
|
-
return self.__query_scope
|
|
190
|
-
|
|
191
164
|
def select(self, *items: Any, **named_items: Any) -> 'pxt.DataFrame':
|
|
192
165
|
""" Select columns or expressions from this table.
|
|
193
166
|
|
|
@@ -264,11 +237,6 @@ class Table(SchemaObject):
|
|
|
264
237
|
"""Return the schema (column names and column types) of this table."""
|
|
265
238
|
return {c.name: c.col_type for c in self._tbl_version_path.columns()}
|
|
266
239
|
|
|
267
|
-
@property
|
|
268
|
-
def _query_names(self) -> list[str]:
|
|
269
|
-
"""Return the names of the registered queries for this table."""
|
|
270
|
-
return list(self.__query_scope._queries.keys())
|
|
271
|
-
|
|
272
240
|
@property
|
|
273
241
|
def _base(self) -> Optional['Table']:
|
|
274
242
|
"""
|
|
@@ -422,28 +390,54 @@ class Table(SchemaObject):
|
|
|
422
390
|
"""
|
|
423
391
|
return self._df().to_coco_dataset()
|
|
424
392
|
|
|
425
|
-
def
|
|
426
|
-
"""
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
393
|
+
def _column_has_dependents(self, col: Column) -> bool:
|
|
394
|
+
"""Returns True if the column has dependents, False otherwise."""
|
|
395
|
+
assert col is not None
|
|
396
|
+
assert col.name in self._schema.keys()
|
|
397
|
+
if any(c.name is not None for c in col.dependent_cols):
|
|
398
|
+
return True
|
|
399
|
+
return any(
|
|
400
|
+
col in store.get_local_columns()
|
|
401
|
+
for view in [self] + self._get_views(recursive=True)
|
|
402
|
+
for store in view._tbl_version.external_stores.values())
|
|
432
403
|
|
|
433
|
-
|
|
404
|
+
def _ignore_or_drop_existing_columns(self, new_col_names: list[str], if_exists: IfExistsParam) -> list[str]:
|
|
405
|
+
""" Check and handle existing columns in the new column specification based on the if_exists parameter.
|
|
434
406
|
|
|
435
|
-
|
|
407
|
+
If `if_exists='ignore'`, returns a list of existing columns, if any, in `new_col_names`.
|
|
436
408
|
"""
|
|
437
|
-
self.
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
409
|
+
assert not self.get_metadata()['is_snapshot']
|
|
410
|
+
existing_col_names = set(self._schema.keys())
|
|
411
|
+
cols_to_ignore = []
|
|
412
|
+
for new_col_name in new_col_names:
|
|
413
|
+
if new_col_name in existing_col_names:
|
|
414
|
+
if if_exists == IfExistsParam.ERROR:
|
|
415
|
+
raise excs.Error(f'Duplicate column name: {new_col_name!r}')
|
|
416
|
+
elif if_exists == IfExistsParam.IGNORE:
|
|
417
|
+
cols_to_ignore.append(new_col_name)
|
|
418
|
+
elif if_exists == IfExistsParam.REPLACE or if_exists == IfExistsParam.REPLACE_FORCE:
|
|
419
|
+
if new_col_name not in self._tbl_version.cols_by_name:
|
|
420
|
+
# for views, it is possible that the existing column
|
|
421
|
+
# is a base table column; in that case, we should not
|
|
422
|
+
# drop/replace that column. Continue to raise error.
|
|
423
|
+
raise excs.Error(
|
|
424
|
+
f'Column {new_col_name!r} is a base table column. Cannot replace it.'
|
|
425
|
+
)
|
|
426
|
+
col = self._tbl_version.cols_by_name[new_col_name]
|
|
427
|
+
# cannot drop a column with dependents; so reject
|
|
428
|
+
# replace directive if column has dependents.
|
|
429
|
+
if self._column_has_dependents(col):
|
|
430
|
+
raise excs.Error(
|
|
431
|
+
f'Column {new_col_name!r} already exists and has dependents. Cannot {if_exists.name.lower()} it.'
|
|
432
|
+
)
|
|
433
|
+
self.drop_column(new_col_name)
|
|
434
|
+
assert new_col_name not in self._tbl_version.cols_by_name
|
|
435
|
+
return cols_to_ignore
|
|
443
436
|
|
|
444
437
|
def add_columns(
|
|
445
438
|
self,
|
|
446
|
-
schema: dict[str, Union[ts.ColumnType, builtins.type, _GenericAlias]]
|
|
439
|
+
schema: dict[str, Union[ts.ColumnType, builtins.type, _GenericAlias]],
|
|
440
|
+
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error'
|
|
447
441
|
) -> UpdateStatus:
|
|
448
442
|
"""
|
|
449
443
|
Adds multiple columns to the table. The columns must be concrete (non-computed) columns; to add computed columns,
|
|
@@ -454,12 +448,21 @@ class Table(SchemaObject):
|
|
|
454
448
|
|
|
455
449
|
Args:
|
|
456
450
|
schema: A dictionary mapping column names to types.
|
|
451
|
+
if_exists: Determines the behavior if a column already exists. Must be one of the following:
|
|
452
|
+
|
|
453
|
+
- `'error'`: an exception will be raised.
|
|
454
|
+
- `'ignore'`: do nothing and return.
|
|
455
|
+
- `'replace'` or `'replace_force'`: drop the existing column and add the new column, if it has no dependents.
|
|
456
|
+
|
|
457
|
+
Note that the `if_exists` parameter is applied to all columns in the schema.
|
|
458
|
+
To apply different behaviors to different columns, please use [`add_column()`][pixeltable.Table.add_column] for each column.
|
|
457
459
|
|
|
458
460
|
Returns:
|
|
459
461
|
Information about the execution status of the operation.
|
|
460
462
|
|
|
461
463
|
Raises:
|
|
462
|
-
Error: If any column name is invalid or already exists
|
|
464
|
+
Error: If any column name is invalid, or already exists and `if_exists='error'`,
|
|
465
|
+
or `if_exists='replace*'` but the column has dependents or is a base table column.
|
|
463
466
|
|
|
464
467
|
Examples:
|
|
465
468
|
Add multiple columns to the table `my_table`:
|
|
@@ -472,49 +475,51 @@ class Table(SchemaObject):
|
|
|
472
475
|
... tbl.add_columns(schema)
|
|
473
476
|
"""
|
|
474
477
|
self._check_is_dropped()
|
|
478
|
+
if self.get_metadata()['is_snapshot']:
|
|
479
|
+
raise excs.Error('Cannot add column to a snapshot.')
|
|
475
480
|
col_schema = {
|
|
476
481
|
col_name: {'type': ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)}
|
|
477
482
|
for col_name, spec in schema.items()
|
|
478
483
|
}
|
|
484
|
+
# handle existing columns based on if_exists parameter
|
|
485
|
+
cols_to_ignore = self._ignore_or_drop_existing_columns(list(col_schema.keys()), IfExistsParam.validated(if_exists, 'if_exists'))
|
|
486
|
+
# if all columns to be added already exist and user asked to ignore
|
|
487
|
+
# existing columns, there's nothing to do.
|
|
488
|
+
for cname in cols_to_ignore:
|
|
489
|
+
assert cname in col_schema
|
|
490
|
+
del col_schema[cname]
|
|
491
|
+
if len(col_schema) == 0:
|
|
492
|
+
return UpdateStatus()
|
|
479
493
|
new_cols = self._create_columns(col_schema)
|
|
480
494
|
for new_col in new_cols:
|
|
481
|
-
self._verify_column(new_col
|
|
495
|
+
self._verify_column(new_col)
|
|
482
496
|
status = self._tbl_version.add_columns(new_cols, print_stats=False, on_error='abort')
|
|
483
497
|
FileCache.get().emit_eviction_warnings()
|
|
484
498
|
return status
|
|
485
499
|
|
|
486
|
-
# TODO: add_column() still supports computed columns for backward-compatibility. In the future, computed columns
|
|
487
|
-
# will be supported only through add_computed_column(). At that point, we can remove the `stored`,
|
|
488
|
-
# `print_stats`, and `on_error` parameters, and change the method body to simply call self.add_columns(kwargs),
|
|
489
|
-
# simplifying the code. For the time being, there's some obvious code duplication.
|
|
490
500
|
def add_column(
|
|
491
501
|
self,
|
|
492
502
|
*,
|
|
493
|
-
|
|
494
|
-
print_stats: bool = False,
|
|
495
|
-
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
503
|
+
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
|
|
496
504
|
**kwargs: Union[ts.ColumnType, builtins.type, _GenericAlias, exprs.Expr]
|
|
497
505
|
) -> UpdateStatus:
|
|
498
506
|
"""
|
|
499
|
-
Adds
|
|
507
|
+
Adds an ordinary (non-computed) column to the table.
|
|
500
508
|
|
|
501
509
|
Args:
|
|
502
510
|
kwargs: Exactly one keyword argument of the form `col_name=col_type`.
|
|
503
|
-
|
|
504
|
-
print_stats: If `True`, print execution metrics during evaluation.
|
|
505
|
-
on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
|
|
506
|
-
row.
|
|
511
|
+
if_exists: Determines the behavior if the column already exists. Must be one of the following:
|
|
507
512
|
|
|
508
|
-
- `'
|
|
509
|
-
- `'ignore'`:
|
|
510
|
-
|
|
511
|
-
corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
|
|
513
|
+
- `'error'`: an exception will be raised.
|
|
514
|
+
- `'ignore'`: do nothing and return.
|
|
515
|
+
- `'replace'` or `'replace_force'`: drop the existing column and add the new column, if it has no dependents.
|
|
512
516
|
|
|
513
517
|
Returns:
|
|
514
518
|
Information about the execution status of the operation.
|
|
515
519
|
|
|
516
520
|
Raises:
|
|
517
|
-
Error: If the column name is invalid or already exists
|
|
521
|
+
Error: If the column name is invalid, or already exists and `if_exists='error'`,
|
|
522
|
+
or `if_exists='replace*'` but the column has dependents or is a base table column.
|
|
518
523
|
|
|
519
524
|
Examples:
|
|
520
525
|
Add an int column:
|
|
@@ -526,29 +531,22 @@ class Table(SchemaObject):
|
|
|
526
531
|
>>> tbl['new_col'] = pxt.Int
|
|
527
532
|
"""
|
|
528
533
|
self._check_is_dropped()
|
|
534
|
+
# verify kwargs
|
|
535
|
+
if self._tbl_version.is_snapshot:
|
|
536
|
+
raise excs.Error('Cannot add column to a snapshot.')
|
|
529
537
|
# verify kwargs and construct column schema dict
|
|
530
538
|
if len(kwargs) != 1:
|
|
531
539
|
raise excs.Error(
|
|
532
540
|
f'add_column() requires exactly one keyword argument of the form "col_name=col_type"; '
|
|
533
|
-
f'got {len(kwargs)} instead ({", ".join(
|
|
541
|
+
f'got {len(kwargs)} instead ({", ".join(kwargs.keys())})'
|
|
534
542
|
)
|
|
535
|
-
|
|
536
|
-
if not
|
|
537
|
-
raise excs.Error(
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
col_schema['type'] = ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)
|
|
542
|
-
else:
|
|
543
|
-
col_schema['value'] = spec
|
|
544
|
-
if stored is not None:
|
|
545
|
-
col_schema['stored'] = stored
|
|
543
|
+
col_type = next(iter(kwargs.values()))
|
|
544
|
+
if not isinstance(col_type, (ts.ColumnType, type, _GenericAlias)):
|
|
545
|
+
raise excs.Error(
|
|
546
|
+
f'The argument to add_column() must be a type; did you intend to use add_computed_column() instead?'
|
|
547
|
+
)
|
|
548
|
+
return self.add_columns(kwargs, if_exists=if_exists)
|
|
546
549
|
|
|
547
|
-
new_col = self._create_columns({col_name: col_schema})[0]
|
|
548
|
-
self._verify_column(new_col, set(self._schema.keys()), set(self._query_names))
|
|
549
|
-
status = self._tbl_version.add_columns([new_col], print_stats=print_stats, on_error=on_error)
|
|
550
|
-
FileCache.get().emit_eviction_warnings()
|
|
551
|
-
return status
|
|
552
550
|
|
|
553
551
|
def add_computed_column(
|
|
554
552
|
self,
|
|
@@ -556,6 +554,7 @@ class Table(SchemaObject):
|
|
|
556
554
|
stored: Optional[bool] = None,
|
|
557
555
|
print_stats: bool = False,
|
|
558
556
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
557
|
+
if_exists: Literal['error', 'ignore', 'replace'] = 'error',
|
|
559
558
|
**kwargs: exprs.Expr
|
|
560
559
|
) -> UpdateStatus:
|
|
561
560
|
"""
|
|
@@ -563,12 +562,27 @@ class Table(SchemaObject):
|
|
|
563
562
|
|
|
564
563
|
Args:
|
|
565
564
|
kwargs: Exactly one keyword argument of the form `col_name=expression`.
|
|
565
|
+
stored: Whether the column is materialized and stored or computed on demand. Only valid for image columns.
|
|
566
|
+
print_stats: If `True`, print execution metrics during evaluation.
|
|
567
|
+
on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
|
|
568
|
+
row.
|
|
569
|
+
|
|
570
|
+
- `'abort'`: an exception will be raised and the column will not be added.
|
|
571
|
+
- `'ignore'`: execution will continue and the column will be added. Any rows
|
|
572
|
+
with errors will have a `None` value for the column, with information about the error stored in the
|
|
573
|
+
corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
|
|
574
|
+
if_exists: Determines the behavior if the column already exists. Must be one of the following:
|
|
575
|
+
|
|
576
|
+
- `'error'`: an exception will be raised.
|
|
577
|
+
- `'ignore'`: do nothing and return.
|
|
578
|
+
- `'replace'` or `'replace_force'`: drop the existing column and add the new column, iff it has no dependents.
|
|
566
579
|
|
|
567
580
|
Returns:
|
|
568
581
|
Information about the execution status of the operation.
|
|
569
582
|
|
|
570
583
|
Raises:
|
|
571
|
-
Error: If the column name is invalid or already exists
|
|
584
|
+
Error: If the column name is invalid or already exists and `if_exists='error'`,
|
|
585
|
+
or `if_exists='replace*'` but the column has dependents or is a base table column.
|
|
572
586
|
|
|
573
587
|
Examples:
|
|
574
588
|
For a table with an image column `frame`, add an image column `rotated` that rotates the image by
|
|
@@ -581,6 +595,8 @@ class Table(SchemaObject):
|
|
|
581
595
|
>>> tbl.add_computed_column(rotated=tbl.frame.rotate(90), stored=False)
|
|
582
596
|
"""
|
|
583
597
|
self._check_is_dropped()
|
|
598
|
+
if self.get_metadata()['is_snapshot']:
|
|
599
|
+
raise excs.Error('Cannot add column to a snapshot.')
|
|
584
600
|
if len(kwargs) != 1:
|
|
585
601
|
raise excs.Error(
|
|
586
602
|
f'add_computed_column() requires exactly one keyword argument of the form "column-name=type|value-expression"; '
|
|
@@ -594,8 +610,16 @@ class Table(SchemaObject):
|
|
|
594
610
|
if stored is not None:
|
|
595
611
|
col_schema['stored'] = stored
|
|
596
612
|
|
|
613
|
+
# handle existing columns based on if_exists parameter
|
|
614
|
+
cols_to_ignore = self._ignore_or_drop_existing_columns([col_name], IfExistsParam.validated(if_exists, 'if_exists'))
|
|
615
|
+
# if the column to add already exists and user asked to ignore
|
|
616
|
+
# existing column, there's nothing to do.
|
|
617
|
+
if len(cols_to_ignore) != 0:
|
|
618
|
+
assert cols_to_ignore[0] == col_name
|
|
619
|
+
return UpdateStatus()
|
|
620
|
+
|
|
597
621
|
new_col = self._create_columns({col_name: col_schema})[0]
|
|
598
|
-
self._verify_column(new_col
|
|
622
|
+
self._verify_column(new_col)
|
|
599
623
|
status = self._tbl_version.add_columns([new_col], print_stats=print_stats, on_error=on_error)
|
|
600
624
|
FileCache.get().emit_eviction_warnings()
|
|
601
625
|
return status
|
|
@@ -675,18 +699,12 @@ class Table(SchemaObject):
|
|
|
675
699
|
return columns
|
|
676
700
|
|
|
677
701
|
@classmethod
|
|
678
|
-
def _verify_column(
|
|
679
|
-
cls, col: Column, existing_column_names: set[str], existing_query_names: Optional[set[str]] = None
|
|
680
|
-
) -> None:
|
|
702
|
+
def _verify_column(cls, col: Column) -> None:
|
|
681
703
|
"""Check integrity of user-supplied Column and supply defaults"""
|
|
682
704
|
if is_system_column_name(col.name):
|
|
683
705
|
raise excs.Error(f'{col.name!r} is a reserved name in Pixeltable; please choose a different column name.')
|
|
684
706
|
if not is_valid_identifier(col.name):
|
|
685
707
|
raise excs.Error(f"Invalid column name: {col.name!r}")
|
|
686
|
-
if col.name in existing_column_names:
|
|
687
|
-
raise excs.Error(f'Duplicate column name: {col.name!r}')
|
|
688
|
-
if existing_query_names is not None and col.name in existing_query_names:
|
|
689
|
-
raise excs.Error(f'Column name conflicts with a registered query: {col.name!r}')
|
|
690
708
|
if col.stored is False and not (col.is_computed and col.col_type.is_image_type()):
|
|
691
709
|
raise excs.Error(f'Column {col.name!r}: stored={col.stored} only applies to computed image columns')
|
|
692
710
|
if col.stored is False and col.has_window_fn_call():
|
|
@@ -699,7 +717,7 @@ class Table(SchemaObject):
|
|
|
699
717
|
"""Check integrity of user-supplied schema and set defaults"""
|
|
700
718
|
column_names: set[str] = set()
|
|
701
719
|
for col in schema:
|
|
702
|
-
cls._verify_column(col
|
|
720
|
+
cls._verify_column(col)
|
|
703
721
|
column_names.add(col.name)
|
|
704
722
|
|
|
705
723
|
def __check_column_name_exists(self, column_name: str, include_bases: bool = False) -> None:
|
|
@@ -809,61 +827,108 @@ class Table(SchemaObject):
|
|
|
809
827
|
"""
|
|
810
828
|
self._tbl_version.rename_column(old_name, new_name)
|
|
811
829
|
|
|
830
|
+
def _list_index_info_for_test(self) -> list[dict[str, Any]]:
|
|
831
|
+
"""
|
|
832
|
+
Returns list of all the indexes on this table. Used for testing.
|
|
833
|
+
|
|
834
|
+
Returns:
|
|
835
|
+
A list of index information, each containing the index's
|
|
836
|
+
id, name, and the name of the column it indexes.
|
|
837
|
+
"""
|
|
838
|
+
assert not self._is_dropped
|
|
839
|
+
index_info = []
|
|
840
|
+
for idx_name, idx in self._tbl_version.idxs_by_name.items():
|
|
841
|
+
index_info.append({
|
|
842
|
+
'_id': idx.id,
|
|
843
|
+
'_name': idx_name,
|
|
844
|
+
'_column': idx.col.name
|
|
845
|
+
})
|
|
846
|
+
return index_info
|
|
847
|
+
|
|
812
848
|
def add_embedding_index(
|
|
813
849
|
self, column: Union[str, ColumnRef], *, idx_name: Optional[str] = None,
|
|
850
|
+
embedding: Optional[pxt.Function] = None,
|
|
814
851
|
string_embed: Optional[pxt.Function] = None, image_embed: Optional[pxt.Function] = None,
|
|
815
|
-
metric: str = 'cosine'
|
|
852
|
+
metric: str = 'cosine',
|
|
853
|
+
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error'
|
|
816
854
|
) -> None:
|
|
817
855
|
"""
|
|
818
|
-
Add an embedding index to the table. Once the index is
|
|
856
|
+
Add an embedding index to the table. Once the index is created, it will be automatically kept up-to-date as new
|
|
819
857
|
rows are inserted into the table.
|
|
820
858
|
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
859
|
+
To add an embedding index, one must specify, at minimum, the column to be indexed and an embedding UDF.
|
|
860
|
+
Only `String` and `Image` columns are currently supported. Here's an example that uses a
|
|
861
|
+
[CLIP embedding][pixeltable.functions.huggingface.clip] to index an image column:
|
|
862
|
+
|
|
863
|
+
>>> from pixeltable.functions.huggingface import clip
|
|
864
|
+
... embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
|
|
865
|
+
... tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
|
|
866
|
+
|
|
867
|
+
Once the index is created, similarity lookups can be performed using the `similarity` pseudo-function.
|
|
868
|
+
|
|
869
|
+
>>> reference_img = PIL.Image.open('my_image.jpg')
|
|
870
|
+
... sim = tbl.img.similarity(reference_img)
|
|
871
|
+
... tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
|
|
872
|
+
|
|
873
|
+
If the embedding UDF is a multimodal embedding (supporting more than one data type), then lookups may be
|
|
874
|
+
performed using any of its supported types. In our example, CLIP supports both text and images, so we can
|
|
875
|
+
also search for images using a text description:
|
|
876
|
+
|
|
877
|
+
>>> sim = tbl.img.similarity('a picture of a train')
|
|
878
|
+
... tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
|
|
825
879
|
|
|
826
880
|
Args:
|
|
827
|
-
column: The name of, or reference to, the column to
|
|
828
|
-
idx_name:
|
|
829
|
-
If specified, the name must be unique for this table.
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
881
|
+
column: The name of, or reference to, the column to be indexed; must be a `String` or `Image` column.
|
|
882
|
+
idx_name: An optional name for the index. If not specified, a name such as `'idx0'` will be generated
|
|
883
|
+
automatically. If specified, the name must be unique for this table.
|
|
884
|
+
embedding: The UDF to use for the embedding. Must be a UDF that accepts a single argument of type `String`
|
|
885
|
+
or `Image` (as appropriate for the column being indexed) and returns a fixed-size 1-dimensional
|
|
886
|
+
array of floats.
|
|
887
|
+
string_embed: An optional UDF to use for the string embedding component of this index.
|
|
888
|
+
Can be used in conjunction with `image_embed` to construct multimodal embeddings manually, by
|
|
889
|
+
specifying different embedding functions for different data types.
|
|
890
|
+
image_embed: An optional UDF to use for the image embedding component of this index.
|
|
891
|
+
Can be used in conjunction with `string_embed` to construct multimodal embeddings manually, by
|
|
892
|
+
specifying different embedding functions for different data types.
|
|
893
|
+
metric: Distance metric to use for the index; one of `'cosine'`, `'ip'`, or `'l2'`.
|
|
894
|
+
The default is `'cosine'`.
|
|
895
|
+
if_exists: Directive for handling an existing index with the same name. Must be one of the following:
|
|
896
|
+
|
|
897
|
+
- `'error'`: raise an error if an index with the same name already exists.
|
|
898
|
+
- `'ignore'`: do nothing if an index with the same name already exists.
|
|
899
|
+
- `'replace'` or `'replace_force'`: replace the existing index with the new one.
|
|
834
900
|
|
|
835
901
|
Raises:
|
|
836
|
-
Error: If an index with
|
|
902
|
+
Error: If an index with the specified name already exists for the table and `if_exists='error'`, or if the specified column does not exist.
|
|
837
903
|
|
|
838
904
|
Examples:
|
|
839
|
-
Add an index to the `img` column of the table `my_table
|
|
905
|
+
Add an index to the `img` column of the table `my_table`:
|
|
840
906
|
|
|
841
|
-
>>>
|
|
842
|
-
... tbl.
|
|
907
|
+
>>> from pixeltable.functions.huggingface import clip
|
|
908
|
+
... tbl = pxt.get_table('my_table')
|
|
909
|
+
... embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
|
|
910
|
+
... tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
|
|
843
911
|
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
912
|
+
Alternatively, the `img` column may be specified by name:
|
|
913
|
+
|
|
914
|
+
>>> tbl.add_embedding_index('img', embedding=embedding_fn)
|
|
847
915
|
|
|
848
|
-
Add
|
|
849
|
-
and with a specific name
|
|
916
|
+
Add a second index to the `img` column, using the inner product as the distance metric,
|
|
917
|
+
and with a specific name:
|
|
850
918
|
|
|
851
919
|
>>> tbl.add_embedding_index(
|
|
852
|
-
...
|
|
853
|
-
... idx_name='
|
|
854
|
-
...
|
|
855
|
-
... string_embed=my_string_func,
|
|
920
|
+
... tbl.img,
|
|
921
|
+
... idx_name='ip_idx',
|
|
922
|
+
... embedding=embedding_fn,
|
|
856
923
|
... metric='ip'
|
|
857
924
|
... )
|
|
858
925
|
|
|
859
|
-
|
|
926
|
+
Add an index using separately specified string and image embeddings:
|
|
860
927
|
|
|
861
928
|
>>> tbl.add_embedding_index(
|
|
862
929
|
... tbl.img,
|
|
863
|
-
...
|
|
864
|
-
... image_embed=
|
|
865
|
-
... string_embed=my_string_func,
|
|
866
|
-
... metric='ip'
|
|
930
|
+
... string_embed=string_embedding_fn,
|
|
931
|
+
... image_embed=image_embedding_fn
|
|
867
932
|
... )
|
|
868
933
|
"""
|
|
869
934
|
if self._tbl_version_path.is_snapshot():
|
|
@@ -877,11 +942,22 @@ class Table(SchemaObject):
|
|
|
877
942
|
col = column.col
|
|
878
943
|
|
|
879
944
|
if idx_name is not None and idx_name in self._tbl_version.idxs_by_name:
|
|
880
|
-
|
|
945
|
+
_if_exists = IfExistsParam.validated(if_exists, 'if_exists')
|
|
946
|
+
# An index with the same name already exists.
|
|
947
|
+
# Handle it according to if_exists.
|
|
948
|
+
if _if_exists == IfExistsParam.ERROR:
|
|
949
|
+
raise excs.Error(f'Duplicate index name: {idx_name}')
|
|
950
|
+
if not isinstance(self._tbl_version.idxs_by_name[idx_name].idx, index.EmbeddingIndex):
|
|
951
|
+
raise excs.Error(f'Index `{idx_name}` is not an embedding index. Cannot {_if_exists.name.lower()} it.')
|
|
952
|
+
if _if_exists == IfExistsParam.IGNORE:
|
|
953
|
+
return
|
|
954
|
+
assert _if_exists == IfExistsParam.REPLACE or _if_exists == IfExistsParam.REPLACE_FORCE
|
|
955
|
+
self.drop_index(idx_name=idx_name)
|
|
956
|
+
assert idx_name not in self._tbl_version.idxs_by_name
|
|
881
957
|
from pixeltable.index import EmbeddingIndex
|
|
882
958
|
|
|
883
959
|
# create the EmbeddingIndex instance to verify args
|
|
884
|
-
idx = EmbeddingIndex(col, metric=metric, string_embed=string_embed, image_embed=image_embed)
|
|
960
|
+
idx = EmbeddingIndex(col, metric=metric, embed=embedding, string_embed=string_embed, image_embed=image_embed)
|
|
885
961
|
status = self._tbl_version.add_index(col, idx_name=idx_name, idx=idx)
|
|
886
962
|
# TODO: how to deal with exceptions here? drop the index and raise?
|
|
887
963
|
FileCache.get().emit_eviction_warnings()
|
|
@@ -1255,43 +1331,6 @@ class Table(SchemaObject):
|
|
|
1255
1331
|
raise excs.Error('Cannot revert a snapshot')
|
|
1256
1332
|
self._tbl_version.revert()
|
|
1257
1333
|
|
|
1258
|
-
@overload
|
|
1259
|
-
def query(self, py_fn: Callable) -> 'pxt.func.QueryTemplateFunction': ...
|
|
1260
|
-
|
|
1261
|
-
@overload
|
|
1262
|
-
def query(
|
|
1263
|
-
self, *, param_types: Optional[list[ts.ColumnType]] = None
|
|
1264
|
-
) -> Callable[[Callable], 'pxt.func.QueryTemplateFunction']: ...
|
|
1265
|
-
|
|
1266
|
-
def query(self, *args: Any, **kwargs: Any) -> Any:
|
|
1267
|
-
def make_query_template(
|
|
1268
|
-
py_fn: Callable, param_types: Optional[list[ts.ColumnType]]
|
|
1269
|
-
) -> 'pxt.func.QueryTemplateFunction':
|
|
1270
|
-
if py_fn.__module__ != '__main__' and py_fn.__name__.isidentifier():
|
|
1271
|
-
# this is a named function in a module
|
|
1272
|
-
function_path = f'{py_fn.__module__}.{py_fn.__qualname__}'
|
|
1273
|
-
else:
|
|
1274
|
-
function_path = None
|
|
1275
|
-
query_name = py_fn.__name__
|
|
1276
|
-
if query_name in self._schema.keys():
|
|
1277
|
-
raise excs.Error(f'Query name {query_name!r} conflicts with existing column')
|
|
1278
|
-
if query_name in self.__query_scope._queries and function_path is not None:
|
|
1279
|
-
raise excs.Error(f'Duplicate query name: {query_name!r}')
|
|
1280
|
-
query_fn = pxt.func.QueryTemplateFunction.create(
|
|
1281
|
-
py_fn, param_types=param_types, path=function_path, name=query_name)
|
|
1282
|
-
self.__query_scope._queries[query_name] = query_fn
|
|
1283
|
-
return query_fn
|
|
1284
|
-
|
|
1285
|
-
# TODO: verify that the inferred return type matches that of the template
|
|
1286
|
-
# TODO: verify that the signature doesn't contain batched parameters
|
|
1287
|
-
|
|
1288
|
-
if len(args) == 1:
|
|
1289
|
-
assert len(kwargs) == 0 and callable(args[0])
|
|
1290
|
-
return make_query_template(args[0], None)
|
|
1291
|
-
else:
|
|
1292
|
-
assert len(args) == 0 and len(kwargs) == 1 and 'param_types' in kwargs
|
|
1293
|
-
return lambda py_fn: make_query_template(py_fn, kwargs['param_types'])
|
|
1294
|
-
|
|
1295
1334
|
@property
|
|
1296
1335
|
def external_stores(self) -> list[str]:
|
|
1297
1336
|
return list(self._tbl_version.external_stores.keys())
|
|
@@ -1381,7 +1420,7 @@ class Table(SchemaObject):
|
|
|
1381
1420
|
return sync_status
|
|
1382
1421
|
|
|
1383
1422
|
def __dir__(self) -> list[str]:
|
|
1384
|
-
return list(super().__dir__()) + list(self._schema.keys())
|
|
1423
|
+
return list(super().__dir__()) + list(self._schema.keys())
|
|
1385
1424
|
|
|
1386
1425
|
def _ipython_key_completions_(self) -> list[str]:
|
|
1387
|
-
return list(self._schema.keys())
|
|
1426
|
+
return list(self._schema.keys())
|
|
@@ -734,7 +734,8 @@ class TableVersion:
|
|
|
734
734
|
if conn is None:
|
|
735
735
|
with Env.get().engine.begin() as conn:
|
|
736
736
|
return self._insert(
|
|
737
|
-
plan, conn, time.time(), print_stats=print_stats, rowids=rowids(), abort_on_exc=fail_on_exception)
|
|
737
|
+
plan, conn, time.time(), print_stats=print_stats, rowids=rowids(),
|
|
738
|
+
abort_on_exc=fail_on_exception)
|
|
738
739
|
else:
|
|
739
740
|
return self._insert(
|
|
740
741
|
plan, conn, time.time(), print_stats=print_stats, rowids=rowids(), abort_on_exc=fail_on_exception)
|
pixeltable/catalog/view.py
CHANGED
|
@@ -16,7 +16,7 @@ from pixeltable.iterators import ComponentIterator
|
|
|
16
16
|
|
|
17
17
|
from .catalog import Catalog
|
|
18
18
|
from .column import Column
|
|
19
|
-
from .globals import _POS_COLUMN_NAME,
|
|
19
|
+
from .globals import _POS_COLUMN_NAME, MediaValidation, UpdateStatus
|
|
20
20
|
from .table import Table
|
|
21
21
|
from .table_version import TableVersion
|
|
22
22
|
from .table_version_path import TableVersionPath
|
|
@@ -166,13 +166,11 @@ class View(Table):
|
|
|
166
166
|
return view
|
|
167
167
|
|
|
168
168
|
@classmethod
|
|
169
|
-
def _verify_column(
|
|
170
|
-
cls, col: Column, existing_column_names: set[str], existing_query_names: Optional[set[str]] = None
|
|
171
|
-
) -> None:
|
|
169
|
+
def _verify_column(cls, col: Column) -> None:
|
|
172
170
|
# make sure that columns are nullable or have a default
|
|
173
171
|
if not col.col_type.nullable and not col.is_computed:
|
|
174
172
|
raise excs.Error(f'Column {col.name}: non-computed columns in views must be nullable')
|
|
175
|
-
super()._verify_column(col, existing_column_names, existing_query_names)
|
|
173
|
+
super()._verify_column(col)
|
|
176
174
|
|
|
177
175
|
@classmethod
|
|
178
176
|
def _get_snapshot_path(cls, tbl_version_path: TableVersionPath) -> TableVersionPath:
|