pixeltable 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +18 -9
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/column.py +31 -50
- pixeltable/catalog/insertable_table.py +7 -6
- pixeltable/catalog/table.py +171 -57
- pixeltable/catalog/table_version.py +417 -140
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/dataframe.py +239 -121
- pixeltable/env.py +82 -16
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/data_row_batch.py +6 -7
- pixeltable/exec/expr_eval_node.py +28 -28
- pixeltable/exec/in_memory_data_node.py +11 -7
- pixeltable/exec/sql_scan_node.py +7 -6
- pixeltable/exprs/__init__.py +4 -3
- pixeltable/exprs/column_ref.py +9 -0
- pixeltable/exprs/comparison.py +3 -3
- pixeltable/exprs/data_row.py +5 -1
- pixeltable/exprs/expr.py +15 -7
- pixeltable/exprs/function_call.py +17 -15
- pixeltable/exprs/image_member_access.py +9 -28
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +13 -11
- pixeltable/exprs/inline_dict.py +15 -13
- pixeltable/exprs/literal.py +16 -4
- pixeltable/exprs/row_builder.py +15 -41
- pixeltable/exprs/similarity_expr.py +65 -0
- pixeltable/ext/__init__.py +5 -0
- pixeltable/ext/functions/yolox.py +92 -0
- pixeltable/func/__init__.py +0 -2
- pixeltable/func/aggregate_function.py +18 -15
- pixeltable/func/callable_function.py +57 -13
- pixeltable/func/expr_template_function.py +20 -3
- pixeltable/func/function.py +35 -4
- pixeltable/func/globals.py +24 -14
- pixeltable/func/signature.py +23 -27
- pixeltable/func/udf.py +13 -12
- pixeltable/functions/__init__.py +8 -8
- pixeltable/functions/eval.py +7 -8
- pixeltable/functions/huggingface.py +64 -17
- pixeltable/functions/openai.py +36 -3
- pixeltable/functions/pil/image.py +61 -64
- pixeltable/functions/together.py +21 -0
- pixeltable/functions/util.py +11 -0
- pixeltable/globals.py +425 -0
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +51 -0
- pixeltable/index/embedding_index.py +168 -0
- pixeltable/io/__init__.py +3 -0
- pixeltable/{utils → io}/hf_datasets.py +48 -17
- pixeltable/io/pandas.py +148 -0
- pixeltable/{utils → io}/parquet.py +58 -33
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/iterators/base.py +4 -0
- pixeltable/iterators/document.py +218 -97
- pixeltable/iterators/video.py +8 -9
- pixeltable/metadata/__init__.py +7 -3
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/schema.py +45 -22
- pixeltable/plan.py +15 -51
- pixeltable/store.py +38 -41
- pixeltable/tool/create_test_db_dump.py +39 -4
- pixeltable/type_system.py +47 -96
- pixeltable/utils/documents.py +42 -12
- pixeltable/utils/http_server.py +70 -0
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/METADATA +14 -10
- pixeltable-0.2.6.dist-info/RECORD +119 -0
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/WHEEL +1 -1
- pixeltable/client.py +0 -604
- pixeltable/exprs/image_similarity_predicate.py +0 -58
- pixeltable/func/batched_function.py +0 -53
- pixeltable/tests/conftest.py +0 -177
- pixeltable/tests/functions/test_fireworks.py +0 -42
- pixeltable/tests/functions/test_functions.py +0 -60
- pixeltable/tests/functions/test_huggingface.py +0 -158
- pixeltable/tests/functions/test_openai.py +0 -152
- pixeltable/tests/functions/test_together.py +0 -111
- pixeltable/tests/test_audio.py +0 -65
- pixeltable/tests/test_catalog.py +0 -27
- pixeltable/tests/test_client.py +0 -21
- pixeltable/tests/test_component_view.py +0 -370
- pixeltable/tests/test_dataframe.py +0 -439
- pixeltable/tests/test_dirs.py +0 -107
- pixeltable/tests/test_document.py +0 -120
- pixeltable/tests/test_exprs.py +0 -805
- pixeltable/tests/test_function.py +0 -324
- pixeltable/tests/test_migration.py +0 -43
- pixeltable/tests/test_nos.py +0 -54
- pixeltable/tests/test_snapshot.py +0 -208
- pixeltable/tests/test_table.py +0 -1267
- pixeltable/tests/test_transactional_directory.py +0 -42
- pixeltable/tests/test_types.py +0 -22
- pixeltable/tests/test_video.py +0 -159
- pixeltable/tests/test_view.py +0 -530
- pixeltable/tests/utils.py +0 -408
- pixeltable-0.2.4.dist-info/RECORD +0 -132
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/LICENSE +0 -0
pixeltable/__init__.py
CHANGED
|
@@ -1,18 +1,30 @@
|
|
|
1
1
|
from .catalog import Column, Table, InsertableTable, View
|
|
2
|
-
from .client import Client
|
|
3
2
|
from .dataframe import DataFrame
|
|
4
3
|
from .exceptions import Error, Error
|
|
5
4
|
from .exprs import RELATIVE_PATH_ROOT
|
|
6
5
|
from .func import Function, udf, uda, Aggregator, expr_udf
|
|
7
|
-
from .
|
|
8
|
-
|
|
9
|
-
|
|
6
|
+
from .globals import *
|
|
7
|
+
from .type_system import (
|
|
8
|
+
ColumnType,
|
|
9
|
+
StringType,
|
|
10
|
+
IntType,
|
|
11
|
+
FloatType,
|
|
12
|
+
BoolType,
|
|
13
|
+
TimestampType,
|
|
14
|
+
JsonType,
|
|
15
|
+
ArrayType,
|
|
16
|
+
ImageType,
|
|
17
|
+
VideoType,
|
|
18
|
+
AudioType,
|
|
19
|
+
DocumentType,
|
|
20
|
+
)
|
|
10
21
|
from .utils.help import help
|
|
22
|
+
|
|
11
23
|
# noinspection PyUnresolvedReferences
|
|
12
|
-
from . import functions
|
|
24
|
+
from . import functions, io
|
|
25
|
+
from .__version__ import __version__, __version_tuple__
|
|
13
26
|
|
|
14
27
|
__all__ = [
|
|
15
|
-
'Client',
|
|
16
28
|
'DataFrame',
|
|
17
29
|
'Column',
|
|
18
30
|
'Table',
|
|
@@ -39,6 +51,3 @@ __all__ = [
|
|
|
39
51
|
'uda',
|
|
40
52
|
'expr_udf',
|
|
41
53
|
]
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
pixeltable/catalog/column.py
CHANGED
|
@@ -4,11 +4,9 @@ import logging
|
|
|
4
4
|
from typing import Optional, Union, Callable, Set
|
|
5
5
|
|
|
6
6
|
import sqlalchemy as sql
|
|
7
|
-
from pgvector.sqlalchemy import Vector
|
|
8
7
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
from pixeltable.type_system import ColumnType, StringType
|
|
8
|
+
import pixeltable.exceptions as excs
|
|
9
|
+
import pixeltable.type_system as ts
|
|
12
10
|
from .globals import is_valid_identifier
|
|
13
11
|
|
|
14
12
|
_logger = logging.getLogger('pixeltable')
|
|
@@ -20,44 +18,38 @@ class Column:
|
|
|
20
18
|
table/view.
|
|
21
19
|
"""
|
|
22
20
|
def __init__(
|
|
23
|
-
self, name: str, col_type: Optional[ColumnType] = None,
|
|
21
|
+
self, name: Optional[str], col_type: Optional[ts.ColumnType] = None,
|
|
24
22
|
computed_with: Optional[Union['Expr', Callable]] = None,
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
23
|
+
is_pk: bool = False, stored: Optional[bool] = None,
|
|
24
|
+
col_id: Optional[int] = None, schema_version_add: Optional[int] = None,
|
|
25
|
+
schema_version_drop: Optional[int] = None, sa_col_type: Optional[sql.sqltypes.TypeEngine] = None
|
|
26
|
+
):
|
|
29
27
|
"""Column constructor.
|
|
30
28
|
|
|
31
29
|
Args:
|
|
32
|
-
name: column name
|
|
30
|
+
name: column name; None for system columns (eg, index columns)
|
|
33
31
|
col_type: column type; can be None if the type can be derived from ``computed_with``
|
|
34
32
|
computed_with: a callable or an Expr object that computes the column value
|
|
35
|
-
|
|
33
|
+
is_pk: if True, this column is part of the primary key
|
|
36
34
|
stored: determines whether a computed column is present in the stored table or recomputed on demand
|
|
37
|
-
indexed: if True, this column has a nearest neighbor index (only valid for image columns)
|
|
38
35
|
col_id: column ID (only used internally)
|
|
39
36
|
|
|
40
37
|
Computed columns: those have a non-None ``computed_with`` argument
|
|
41
|
-
|
|
42
38
|
- when constructed by the user: ``computed_with`` was constructed explicitly and is passed in;
|
|
43
39
|
col_type is None
|
|
44
40
|
- when loaded from md store: ``computed_with`` is set and col_type is set
|
|
45
41
|
|
|
46
42
|
``computed_with`` is a Callable:
|
|
47
|
-
|
|
48
43
|
- the callable's parameter names must correspond to existing columns in the table for which this Column
|
|
49
44
|
is being used
|
|
50
45
|
- ``col_type`` needs to be set to the callable's return type
|
|
51
46
|
|
|
52
47
|
``stored`` (only valid for computed image columns):
|
|
53
|
-
|
|
54
48
|
- if True: the column is present in the stored table
|
|
55
49
|
- if False: the column is not present in the stored table and recomputed during a query
|
|
56
50
|
- if None: the system chooses for you (at present, this is always False, but this may change in the future)
|
|
57
|
-
|
|
58
|
-
indexed: only valid for image columns; if true, maintains an NN index for this column
|
|
59
51
|
"""
|
|
60
|
-
if not is_valid_identifier(name):
|
|
52
|
+
if name is not None and not is_valid_identifier(name):
|
|
61
53
|
raise excs.Error(f"Invalid column name: '{name}'")
|
|
62
54
|
self.name = name
|
|
63
55
|
if col_type is None and computed_with is None:
|
|
@@ -90,35 +82,20 @@ class Column:
|
|
|
90
82
|
self.stored = stored
|
|
91
83
|
self.dependent_cols: Set[Column] = set() # cols with value_exprs that reference us; set by TableVersion
|
|
92
84
|
self.id = col_id
|
|
93
|
-
self.
|
|
85
|
+
self.is_pk = is_pk
|
|
86
|
+
self.schema_version_add = schema_version_add
|
|
87
|
+
self.schema_version_drop = schema_version_drop
|
|
94
88
|
|
|
95
89
|
# column in the stored table for the values of this Column
|
|
96
90
|
self.sa_col: Optional[sql.schema.Column] = None
|
|
91
|
+
self.sa_col_type = sa_col_type
|
|
97
92
|
|
|
98
93
|
# computed cols also have storage columns for the exception string and type
|
|
99
94
|
self.sa_errormsg_col: Optional[sql.schema.Column] = None
|
|
100
95
|
self.sa_errortype_col: Optional[sql.schema.Column] = None
|
|
101
|
-
# indexed columns also have a column for the embeddings
|
|
102
|
-
self.sa_idx_col: Optional[sql.schema.Column] = None
|
|
103
96
|
from .table_version import TableVersion
|
|
104
97
|
self.tbl: Optional[TableVersion] = None # set by owning TableVersion
|
|
105
98
|
|
|
106
|
-
if indexed and not self.col_type.is_image_type():
|
|
107
|
-
raise excs.Error(f'Column {name}: indexed=True requires ImageType')
|
|
108
|
-
self.is_indexed = indexed
|
|
109
|
-
|
|
110
|
-
@classmethod
|
|
111
|
-
def from_md(cls, col_id: int, md: schema.SchemaColumn, tbl: 'TableVersion') -> Column:
|
|
112
|
-
"""Construct a Column from metadata.
|
|
113
|
-
|
|
114
|
-
Leaves out value_expr, because that requires TableVersion.cols to be complete.
|
|
115
|
-
"""
|
|
116
|
-
col = cls(
|
|
117
|
-
md.name, col_type=ColumnType.from_dict(md.col_type), primary_key=md.is_pk,
|
|
118
|
-
stored=md.stored, indexed=md.is_indexed, col_id=col_id)
|
|
119
|
-
col.tbl = tbl
|
|
120
|
-
return col
|
|
121
|
-
|
|
122
99
|
def __hash__(self) -> int:
|
|
123
100
|
assert self.tbl is not None
|
|
124
101
|
return hash((self.tbl.id, self.id))
|
|
@@ -137,6 +114,10 @@ class Column:
|
|
|
137
114
|
l = list(self.value_expr.subexprs(filter=lambda e: isinstance(e, exprs.FunctionCall) and e.is_window_fn_call))
|
|
138
115
|
return len(l) > 0
|
|
139
116
|
|
|
117
|
+
def get_idx_info(self) -> dict[str, 'pixeltable.catalog.TableVersion.IndexInfo']:
|
|
118
|
+
assert self.tbl is not None
|
|
119
|
+
return {name: info for name, info in self.tbl.idxs_by_name.items() if info.col == self}
|
|
120
|
+
|
|
140
121
|
@property
|
|
141
122
|
def is_computed(self) -> bool:
|
|
142
123
|
return self.compute_func is not None or self.value_expr is not None
|
|
@@ -167,26 +148,26 @@ class Column:
|
|
|
167
148
|
"""
|
|
168
149
|
assert self.is_stored
|
|
169
150
|
# all storage columns are nullable (we deal with null errors in Pixeltable directly)
|
|
170
|
-
self.sa_col = sql.Column(
|
|
151
|
+
self.sa_col = sql.Column(
|
|
152
|
+
self.store_name(), self.col_type.to_sa_type() if self.sa_col_type is None else self.sa_col_type,
|
|
153
|
+
nullable=True)
|
|
171
154
|
if self.is_computed or self.col_type.is_media_type():
|
|
172
|
-
self.sa_errormsg_col = sql.Column(self.
|
|
173
|
-
self.sa_errortype_col = sql.Column(self.
|
|
174
|
-
|
|
175
|
-
|
|
155
|
+
self.sa_errormsg_col = sql.Column(self.errormsg_store_name(), ts.StringType().to_sa_type(), nullable=True)
|
|
156
|
+
self.sa_errortype_col = sql.Column(self.errortype_store_name(), ts.StringType().to_sa_type(), nullable=True)
|
|
157
|
+
|
|
158
|
+
def get_sa_col_type(self) -> sql.sqltypes.TypeEngine:
|
|
159
|
+
return self.col_type.to_sa_type() if self.sa_col_type is None else self.sa_col_type
|
|
176
160
|
|
|
177
|
-
def
|
|
161
|
+
def store_name(self) -> str:
|
|
178
162
|
assert self.id is not None
|
|
179
163
|
assert self.is_stored
|
|
180
164
|
return f'col_{self.id}'
|
|
181
165
|
|
|
182
|
-
def
|
|
183
|
-
return f'{self.
|
|
184
|
-
|
|
185
|
-
def errortype_storage_name(self) -> str:
|
|
186
|
-
return f'{self.storage_name()}_errortype'
|
|
166
|
+
def errormsg_store_name(self) -> str:
|
|
167
|
+
return f'{self.store_name()}_errormsg'
|
|
187
168
|
|
|
188
|
-
def
|
|
189
|
-
return f'{self.
|
|
169
|
+
def errortype_store_name(self) -> str:
|
|
170
|
+
return f'{self.store_name()}_errortype'
|
|
190
171
|
|
|
191
172
|
def __str__(self) -> str:
|
|
192
173
|
return f'{self.name}: {self.col_type}'
|
|
@@ -11,14 +11,17 @@ import pixeltable.type_system as ts
|
|
|
11
11
|
from pixeltable import exceptions as excs
|
|
12
12
|
from pixeltable.env import Env
|
|
13
13
|
from .catalog import Catalog
|
|
14
|
+
from .globals import UpdateStatus
|
|
14
15
|
from .table import Table
|
|
15
16
|
from .table_version import TableVersion
|
|
16
17
|
from .table_version_path import TableVersionPath
|
|
17
18
|
|
|
18
19
|
_logger = logging.getLogger('pixeltable')
|
|
19
20
|
|
|
21
|
+
|
|
20
22
|
class InsertableTable(Table):
|
|
21
23
|
"""A `Table` that allows inserting and deleting rows."""
|
|
24
|
+
|
|
22
25
|
def __init__(self, dir_id: UUID, tbl_version: TableVersion):
|
|
23
26
|
tbl_version_path = TableVersionPath(tbl_version)
|
|
24
27
|
super().__init__(tbl_version.id, dir_id, tbl_version.name, tbl_version_path)
|
|
@@ -42,7 +45,7 @@ class InsertableTable(Table):
|
|
|
42
45
|
col = columns[column_names.index(pk_col)]
|
|
43
46
|
if col.col_type.nullable:
|
|
44
47
|
raise excs.Error(f'Primary key column {pk_col} cannot be nullable')
|
|
45
|
-
col.
|
|
48
|
+
col.is_pk = True
|
|
46
49
|
|
|
47
50
|
with orm.Session(Env.get().engine, future=True) as session:
|
|
48
51
|
_, tbl_version = TableVersion.create(session, dir_id, name, columns, num_retained_versions, comment)
|
|
@@ -62,7 +65,7 @@ class InsertableTable(Table):
|
|
|
62
65
|
@overload
|
|
63
66
|
def insert(self, print_stats: bool = False, fail_on_exception: bool = True, **kwargs: Any): ...
|
|
64
67
|
|
|
65
|
-
def insert(self, *args, **kwargs) ->
|
|
68
|
+
def insert(self, *args, **kwargs) -> UpdateStatus:
|
|
66
69
|
"""Insert rows into table.
|
|
67
70
|
|
|
68
71
|
To insert multiple rows at a time:
|
|
@@ -161,7 +164,7 @@ class InsertableTable(Table):
|
|
|
161
164
|
msg = str(e)
|
|
162
165
|
raise excs.Error(f'Error in column {col.name}: {msg[0].lower() + msg[1:]}\nRow: {row}')
|
|
163
166
|
|
|
164
|
-
def delete(self, where: Optional['pixeltable.exprs.Predicate'] = None) ->
|
|
167
|
+
def delete(self, where: Optional['pixeltable.exprs.Predicate'] = None) -> UpdateStatus:
|
|
165
168
|
"""Delete rows in this table.
|
|
166
169
|
|
|
167
170
|
Args:
|
|
@@ -181,9 +184,7 @@ class InsertableTable(Table):
|
|
|
181
184
|
if where is not None:
|
|
182
185
|
if not isinstance(where, Predicate):
|
|
183
186
|
raise excs.Error(f"'where' argument must be a Predicate, got {type(where)}")
|
|
184
|
-
analysis_info = Planner.analyze(self.
|
|
185
|
-
if analysis_info.similarity_clause is not None:
|
|
186
|
-
raise excs.Error('nearest() cannot be used with delete()')
|
|
187
|
+
analysis_info = Planner.analyze(self.tbl_version_path, where)
|
|
187
188
|
# for now we require that the updated rows can be identified via SQL, rather than via a Python filter
|
|
188
189
|
if analysis_info.filter is not None:
|
|
189
190
|
raise excs.Error(f'Filter {analysis_info.filter} not expressible in SQL')
|
pixeltable/catalog/table.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
import dataclasses
|
|
4
3
|
import json
|
|
5
4
|
import logging
|
|
6
5
|
from pathlib import Path
|
|
7
|
-
from typing import Union, Any, List, Dict, Optional, Callable, Set, Tuple
|
|
6
|
+
from typing import Union, Any, List, Dict, Optional, Callable, Set, Tuple, Iterable
|
|
8
7
|
from uuid import UUID
|
|
9
8
|
|
|
10
9
|
import pandas as pd
|
|
@@ -18,7 +17,7 @@ import pixeltable.exprs as exprs
|
|
|
18
17
|
import pixeltable.metadata.schema as schema
|
|
19
18
|
import pixeltable.type_system as ts
|
|
20
19
|
from .column import Column
|
|
21
|
-
from .globals import is_valid_identifier, is_system_column_name
|
|
20
|
+
from .globals import is_valid_identifier, is_system_column_name, UpdateStatus
|
|
22
21
|
from .schema_object import SchemaObject
|
|
23
22
|
from .table_version import TableVersion
|
|
24
23
|
from .table_version_path import TableVersionPath
|
|
@@ -28,14 +27,7 @@ _logger = logging.getLogger('pixeltable')
|
|
|
28
27
|
class Table(SchemaObject):
|
|
29
28
|
"""Base class for all tabular SchemaObjects."""
|
|
30
29
|
|
|
31
|
-
|
|
32
|
-
class UpdateStatus:
|
|
33
|
-
num_rows: int = 0
|
|
34
|
-
# TODO: change to num_computed_columns (the number of computed slots isn't really meaningful to the user)
|
|
35
|
-
num_computed_values: int = 0
|
|
36
|
-
num_excs: int = 0
|
|
37
|
-
updated_cols: List[str] = dataclasses.field(default_factory=list)
|
|
38
|
-
cols_with_excs: List[str] = dataclasses.field(default_factory=list)
|
|
30
|
+
ROWID_COLUMN_NAME = '_rowid'
|
|
39
31
|
|
|
40
32
|
def __init__(self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath):
|
|
41
33
|
super().__init__(id, name, dir_id)
|
|
@@ -105,6 +97,11 @@ class Table(SchemaObject):
|
|
|
105
97
|
from pixeltable.dataframe import DataFrame
|
|
106
98
|
return DataFrame(self.tbl_version_path).order_by(*items, asc=asc)
|
|
107
99
|
|
|
100
|
+
def group_by(self, *items: 'exprs.Expr') -> 'pixeltable.dataframe.DataFrame':
|
|
101
|
+
"""Return a DataFrame for this table."""
|
|
102
|
+
from pixeltable.dataframe import DataFrame
|
|
103
|
+
return DataFrame(self.tbl_version_path).group_by(*items)
|
|
104
|
+
|
|
108
105
|
def collect(self) -> 'pixeltable.dataframe.DataFrameResultSet': # type: ignore[name-defined, no-untyped-def]
|
|
109
106
|
"""Return rows from this table.
|
|
110
107
|
"""
|
|
@@ -225,7 +222,7 @@ class Table(SchemaObject):
|
|
|
225
222
|
value: column type or value expression or column specification dictionary:
|
|
226
223
|
column type: a Pixeltable column type (if the table already contains rows, it must be nullable)
|
|
227
224
|
value expression: a Pixeltable expression that computes the column values
|
|
228
|
-
column specification: a dictionary with possible keys 'type', 'value', 'stored'
|
|
225
|
+
column specification: a dictionary with possible keys 'type', 'value', 'stored'
|
|
229
226
|
Examples:
|
|
230
227
|
Add an int column with ``None`` values:
|
|
231
228
|
|
|
@@ -247,11 +244,6 @@ class Table(SchemaObject):
|
|
|
247
244
|
Do the same, but now the column is stored:
|
|
248
245
|
|
|
249
246
|
>>> tbl['rotated'] = {'value': tbl.frame.rotate(90), 'stored': True}
|
|
250
|
-
|
|
251
|
-
Add a resized version of the ``frame`` column and index it. The column does not need to be stored in order
|
|
252
|
-
to be indexed:
|
|
253
|
-
|
|
254
|
-
>>> tbl['small_frame'] = {'value': tbl.frame.resize([224, 224]), 'indexed': True}
|
|
255
247
|
"""
|
|
256
248
|
if not isinstance(column_name, str):
|
|
257
249
|
raise excs.Error(f'Column name must be a string, got {type(column_name)}')
|
|
@@ -264,8 +256,8 @@ class Table(SchemaObject):
|
|
|
264
256
|
|
|
265
257
|
def add_column(
|
|
266
258
|
self, *,
|
|
267
|
-
type: Optional[ts.ColumnType] = None, stored: Optional[bool] = None,
|
|
268
|
-
|
|
259
|
+
type: Optional[ts.ColumnType] = None, stored: Optional[bool] = None, print_stats: bool = False,
|
|
260
|
+
**kwargs: Any
|
|
269
261
|
) -> UpdateStatus:
|
|
270
262
|
"""Adds a column to the table.
|
|
271
263
|
|
|
@@ -273,7 +265,6 @@ class Table(SchemaObject):
|
|
|
273
265
|
kwargs: Exactly one keyword argument of the form ``column-name=type|value-expression``.
|
|
274
266
|
type: The type of the column. Only valid and required if ``value-expression`` is a Callable.
|
|
275
267
|
stored: Whether the column is materialized and stored or computed on demand. Only valid for image columns.
|
|
276
|
-
indexed: Whether the column is indexed.
|
|
277
268
|
print_stats: If ``True``, print execution metrics.
|
|
278
269
|
|
|
279
270
|
Returns:
|
|
@@ -318,15 +309,6 @@ class Table(SchemaObject):
|
|
|
318
309
|
Alternatively, this can also be expressed as:
|
|
319
310
|
|
|
320
311
|
>>> tbl['rotated'] = {'value': tbl.frame.rotate(90), 'stored': True}
|
|
321
|
-
|
|
322
|
-
Add a resized version of the ``frame`` column and index it. The column does not need to be stored in order
|
|
323
|
-
to be indexed:
|
|
324
|
-
|
|
325
|
-
>>> tbl.add_column(small_frame=tbl.frame.resize([224, 224]), indexed=True)
|
|
326
|
-
|
|
327
|
-
Alternatively, this can also be expressed as:
|
|
328
|
-
|
|
329
|
-
>>> tbl['small_frame'] = {'value': tbl.frame.resize([224, 224]), 'indexed': True}
|
|
330
312
|
"""
|
|
331
313
|
self._check_is_dropped()
|
|
332
314
|
# verify kwargs and construct column schema dict
|
|
@@ -349,8 +331,6 @@ class Table(SchemaObject):
|
|
|
349
331
|
col_schema['type'] = type
|
|
350
332
|
if stored is not None:
|
|
351
333
|
col_schema['stored'] = stored
|
|
352
|
-
if indexed is not None:
|
|
353
|
-
col_schema['indexed'] = indexed
|
|
354
334
|
|
|
355
335
|
new_col = self._create_columns({col_name: col_schema})[0]
|
|
356
336
|
self._verify_column(new_col, self.column_names())
|
|
@@ -364,7 +344,7 @@ class Table(SchemaObject):
|
|
|
364
344
|
(on account of containing Python Callables or Exprs).
|
|
365
345
|
"""
|
|
366
346
|
assert isinstance(spec, dict)
|
|
367
|
-
valid_keys = {'type', 'value', 'stored'
|
|
347
|
+
valid_keys = {'type', 'value', 'stored'}
|
|
368
348
|
has_type = False
|
|
369
349
|
for k in spec.keys():
|
|
370
350
|
if k not in valid_keys:
|
|
@@ -393,8 +373,6 @@ class Table(SchemaObject):
|
|
|
393
373
|
|
|
394
374
|
if 'stored' in spec and not isinstance(spec['stored'], bool):
|
|
395
375
|
raise excs.Error(f'Column {name}: "stored" must be a bool, got {spec["stored"]}')
|
|
396
|
-
if 'indexed' in spec and not isinstance(spec['indexed'], bool):
|
|
397
|
-
raise excs.Error(f'Column {name}: "indexed" must be a bool, got {spec["indexed"]}')
|
|
398
376
|
if not has_type:
|
|
399
377
|
raise excs.Error(f'Column {name}: "type" is required')
|
|
400
378
|
|
|
@@ -406,7 +384,6 @@ class Table(SchemaObject):
|
|
|
406
384
|
col_type: Optional[ts.ColumnType] = None
|
|
407
385
|
value_expr: Optional[exprs.Expr] = None
|
|
408
386
|
stored: Optional[bool] = None
|
|
409
|
-
indexed: Optional[bool] = None
|
|
410
387
|
primary_key: Optional[bool] = None
|
|
411
388
|
|
|
412
389
|
if isinstance(spec, ts.ColumnType):
|
|
@@ -428,12 +405,10 @@ class Table(SchemaObject):
|
|
|
428
405
|
# create copy so we can modify it
|
|
429
406
|
value_expr = value_expr.copy()
|
|
430
407
|
stored = spec.get('stored')
|
|
431
|
-
indexed = spec.get('indexed')
|
|
432
408
|
primary_key = spec.get('primary_key')
|
|
433
409
|
|
|
434
410
|
column = Column(
|
|
435
|
-
name, col_type=col_type, computed_with=value_expr, stored=stored,
|
|
436
|
-
primary_key=primary_key)
|
|
411
|
+
name, col_type=col_type, computed_with=value_expr, stored=stored, is_pk=primary_key)
|
|
437
412
|
columns.append(column)
|
|
438
413
|
return columns
|
|
439
414
|
|
|
@@ -498,9 +473,90 @@ class Table(SchemaObject):
|
|
|
498
473
|
self._check_is_dropped()
|
|
499
474
|
self.tbl_version_path.tbl_version.rename_column(old_name, new_name)
|
|
500
475
|
|
|
476
|
+
def add_embedding_index(
|
|
477
|
+
self, col_name: str, *, idx_name: Optional[str] = None,
|
|
478
|
+
text_embed: Optional[pixeltable.Function] = None, img_embed: Optional[pixeltable.Function] = None,
|
|
479
|
+
metric: str = 'cosine'
|
|
480
|
+
) -> None:
|
|
481
|
+
"""Add an index to the table.
|
|
482
|
+
Args:
|
|
483
|
+
col_name: name of column to index
|
|
484
|
+
idx_name: name of index, which needs to be unique for the table; if not provided, a name will be generated
|
|
485
|
+
text_embed: function to embed text; required if the column is a text column
|
|
486
|
+
img_embed: function to embed images; required if the column is an image column
|
|
487
|
+
metric: distance metric to use for the index; one of 'cosine', 'ip', 'l2'; default is 'cosine'
|
|
488
|
+
|
|
489
|
+
Raises:
|
|
490
|
+
Error: If an index with that name already exists for the table or if the column does not exist.
|
|
491
|
+
|
|
492
|
+
Examples:
|
|
493
|
+
Add an index to the ``img`` column:
|
|
494
|
+
|
|
495
|
+
>>> tbl.add_embedding_index('img', img_embed=...)
|
|
496
|
+
|
|
497
|
+
Add another index to the ``img`` column, using the inner product as the distance metric,
|
|
498
|
+
and with a specific name; ``text_embed`` is also specified in order to search with text:
|
|
499
|
+
|
|
500
|
+
>>> tbl.add_embedding_index(
|
|
501
|
+
'img', idx_name='clip_idx', img_embed=..., text_embed=...text_embed..., metric='ip')
|
|
502
|
+
"""
|
|
503
|
+
if self.tbl_version_path.is_snapshot():
|
|
504
|
+
raise excs.Error('Cannot add an index to a snapshot')
|
|
505
|
+
self._check_is_dropped()
|
|
506
|
+
col = self.tbl_version_path.get_column(col_name, include_bases=True)
|
|
507
|
+
if col is None:
|
|
508
|
+
raise excs.Error(f'Column {col_name} unknown')
|
|
509
|
+
if idx_name is not None and idx_name in self.tbl_version_path.tbl_version.idxs_by_name:
|
|
510
|
+
raise excs.Error(f'Duplicate index name: {idx_name}')
|
|
511
|
+
from pixeltable.index import EmbeddingIndex
|
|
512
|
+
# create the EmbeddingIndex instance to verify args
|
|
513
|
+
idx = EmbeddingIndex(col, metric=metric, text_embed=text_embed, img_embed=img_embed)
|
|
514
|
+
status = self.tbl_version_path.tbl_version.add_index(col, idx_name=idx_name, idx=idx)
|
|
515
|
+
# TODO: how to deal with exceptions here? drop the index and raise?
|
|
516
|
+
|
|
517
|
+
def drop_index(self, *, column_name: Optional[str] = None, idx_name: Optional[str] = None) -> None:
|
|
518
|
+
"""Drop an index from the table.
|
|
519
|
+
|
|
520
|
+
Args:
|
|
521
|
+
column_name: The name of the column whose index to drop. Invalid if the column has multiple indices.
|
|
522
|
+
idx_name: The name of the index to drop.
|
|
523
|
+
|
|
524
|
+
Raises:
|
|
525
|
+
Error: If the index does not exist.
|
|
526
|
+
|
|
527
|
+
Examples:
|
|
528
|
+
Drop index on the ``img`` column:
|
|
529
|
+
|
|
530
|
+
>>> tbl.drop_index(column_name='img')
|
|
531
|
+
"""
|
|
532
|
+
if self.tbl_version_path.is_snapshot():
|
|
533
|
+
raise excs.Error('Cannot drop an index from a snapshot')
|
|
534
|
+
self._check_is_dropped()
|
|
535
|
+
if (column_name is None) == (idx_name is None):
|
|
536
|
+
raise excs.Error('Exactly one of column_name or idx_name must be provided')
|
|
537
|
+
tbl_version = self.tbl_version_path.tbl_version
|
|
538
|
+
|
|
539
|
+
if idx_name is not None:
|
|
540
|
+
if idx_name not in tbl_version.idxs_by_name:
|
|
541
|
+
raise excs.Error(f'Index {idx_name} does not exist')
|
|
542
|
+
idx_id = tbl_version.idxs_by_name[idx_name].id
|
|
543
|
+
else:
|
|
544
|
+
col = self.tbl_version_path.get_column(column_name, include_bases=True)
|
|
545
|
+
if col is None:
|
|
546
|
+
raise excs.Error(f'Column {column_name} unknown')
|
|
547
|
+
if col.tbl.id != tbl_version.id:
|
|
548
|
+
raise excs.Error(
|
|
549
|
+
f'Column {column_name}: cannot drop index from column that belongs to base ({col.tbl.name})')
|
|
550
|
+
idx_ids = [info.id for info in tbl_version.idxs_by_name.values() if info.col.id == col.id]
|
|
551
|
+
if len(idx_ids) == 0:
|
|
552
|
+
raise excs.Error(f'Column {column_name} does not have an index')
|
|
553
|
+
if len(idx_ids) > 1:
|
|
554
|
+
raise excs.Error(f'Column {column_name} has multiple indices; specify idx_name instead')
|
|
555
|
+
idx_id = idx_ids[0]
|
|
556
|
+
self.tbl_version_path.tbl_version.drop_index(idx_id)
|
|
557
|
+
|
|
501
558
|
def update(
|
|
502
|
-
self, value_spec:
|
|
503
|
-
where: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
|
|
559
|
+
self, value_spec: dict[str, Any], where: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
|
|
504
560
|
) -> UpdateStatus:
|
|
505
561
|
"""Update rows in this table.
|
|
506
562
|
|
|
@@ -510,11 +566,11 @@ class Table(SchemaObject):
|
|
|
510
566
|
cascade: if True, also update all computed columns that transitively depend on the updated columns.
|
|
511
567
|
|
|
512
568
|
Examples:
|
|
513
|
-
Set
|
|
569
|
+
Set column `int_col` to 1 for all rows:
|
|
514
570
|
|
|
515
571
|
>>> tbl.update({'int_col': 1})
|
|
516
572
|
|
|
517
|
-
Set
|
|
573
|
+
Set column `int_col` to 1 for all rows where `int_col` is 0:
|
|
518
574
|
|
|
519
575
|
>>> tbl.update({'int_col': 1}, where=tbl.int_col == 0)
|
|
520
576
|
|
|
@@ -526,27 +582,93 @@ class Table(SchemaObject):
|
|
|
526
582
|
|
|
527
583
|
>>> tbl.update({'int_col': tbl.int_col + 1}, where=tbl.int_col == 0)
|
|
528
584
|
"""
|
|
585
|
+
if self.tbl_version_path.is_snapshot():
|
|
586
|
+
raise excs.Error('Cannot update a snapshot')
|
|
587
|
+
self._check_is_dropped()
|
|
588
|
+
|
|
589
|
+
update_spec = self._validate_update_spec(value_spec, allow_pk=False, allow_exprs=True)
|
|
590
|
+
from pixeltable.plan import Planner
|
|
591
|
+
if where is not None:
|
|
592
|
+
if not isinstance(where, exprs.Predicate):
|
|
593
|
+
raise excs.Error(f"'where' argument must be a Predicate, got {type(where)}")
|
|
594
|
+
analysis_info = Planner.analyze(self.tbl_version_path, where)
|
|
595
|
+
# for now we require that the updated rows can be identified via SQL, rather than via a Python filter
|
|
596
|
+
if analysis_info.filter is not None:
|
|
597
|
+
raise excs.Error(f'Filter {analysis_info.filter} not expressible in SQL')
|
|
598
|
+
|
|
599
|
+
return self.tbl_version_path.tbl_version.update(update_spec, where, cascade)
|
|
600
|
+
|
|
601
|
+
def batch_update(self, rows: Iterable[dict[str, Any]], cascade: bool = True) -> UpdateStatus:
|
|
602
|
+
"""Update rows in this table.
|
|
603
|
+
|
|
604
|
+
Args:
|
|
605
|
+
rows: an Iterable of dictionaries containing values for the updated columns plus values for the primary key
|
|
606
|
+
columns.
|
|
607
|
+
cascade: if True, also update all computed columns that transitively depend on the updated columns.
|
|
608
|
+
|
|
609
|
+
Examples:
|
|
610
|
+
Update the 'name' and 'age' columns for the rows with ids 1 and 2 (assuming 'id' is the primary key):
|
|
611
|
+
|
|
612
|
+
>>> tbl.update([{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 2, 'name': 'Bob', 'age': 40}])
|
|
613
|
+
"""
|
|
614
|
+
if self.tbl_version_path.is_snapshot():
|
|
615
|
+
raise excs.Error('Cannot update a snapshot')
|
|
616
|
+
self._check_is_dropped()
|
|
617
|
+
|
|
618
|
+
row_updates: List[Dict[Column, exprs.Expr]] = []
|
|
619
|
+
pk_col_names = set(c.name for c in self.tbl_version_path.tbl_version.primary_key_columns())
|
|
620
|
+
|
|
621
|
+
# pseudo-column _rowid: contains the rowid of the row to update and can be used instead of the primary key
|
|
622
|
+
has_rowid = self.ROWID_COLUMN_NAME in rows[0]
|
|
623
|
+
rowids: list[Tuple[int, ...]] = []
|
|
624
|
+
if len(pk_col_names) == 0 and not has_rowid:
|
|
625
|
+
raise excs.Error('Table must have primary key for batch update')
|
|
626
|
+
|
|
627
|
+
for row_spec in rows:
|
|
628
|
+
col_vals = self._validate_update_spec(row_spec, allow_pk=not has_rowid, allow_exprs=False)
|
|
629
|
+
if has_rowid:
|
|
630
|
+
# we expect the _rowid column to be present for each row
|
|
631
|
+
assert self.ROWID_COLUMN_NAME in row_spec
|
|
632
|
+
rowids.append(row_spec[self.ROWID_COLUMN_NAME])
|
|
633
|
+
else:
|
|
634
|
+
col_names = set(col.name for col in col_vals.keys())
|
|
635
|
+
if any(pk_col_name not in col_names for pk_col_name in pk_col_names):
|
|
636
|
+
missing_cols = pk_col_names - set(col.name for col in col_vals.keys())
|
|
637
|
+
raise excs.Error(f'Primary key columns ({", ".join(missing_cols)}) missing in {row_spec}')
|
|
638
|
+
row_updates.append(col_vals)
|
|
639
|
+
return self.tbl_version_path.tbl_version.batch_update(row_updates, rowids, cascade)
|
|
640
|
+
|
|
641
|
+
def _validate_update_spec(
|
|
642
|
+
self, value_spec: dict[str, Any], allow_pk: bool, allow_exprs: bool
|
|
643
|
+
) -> dict[Column, 'pixeltable.exprs.Expr']:
|
|
529
644
|
from pixeltable import exprs
|
|
530
|
-
update_targets:
|
|
645
|
+
update_targets: dict[Column, exprs.Expr] = {}
|
|
531
646
|
for col_name, val in value_spec.items():
|
|
532
647
|
if not isinstance(col_name, str):
|
|
533
648
|
raise excs.Error(f'Update specification: dict key must be column name, got {col_name!r}')
|
|
649
|
+
if col_name == self.ROWID_COLUMN_NAME:
|
|
650
|
+
# ignore pseudo-column _rowid
|
|
651
|
+
continue
|
|
534
652
|
col = self.tbl_version_path.get_column(col_name, include_bases=False)
|
|
535
653
|
if col is None:
|
|
536
654
|
# TODO: return more informative error if this is trying to update a base column
|
|
537
655
|
raise excs.Error(f'Column {col_name} unknown')
|
|
538
656
|
if col.is_computed:
|
|
539
657
|
raise excs.Error(f'Column {col_name} is computed and cannot be updated')
|
|
540
|
-
if col.
|
|
658
|
+
if col.is_pk and not allow_pk:
|
|
541
659
|
raise excs.Error(f'Column {col_name} is a primary key column and cannot be updated')
|
|
542
660
|
if col.col_type.is_media_type():
|
|
543
661
|
raise excs.Error(f'Column {col_name} has type image/video/audio/document and cannot be updated')
|
|
544
662
|
|
|
545
663
|
# make sure that the value is compatible with the column type
|
|
546
|
-
# check if this is a literal
|
|
547
664
|
try:
|
|
665
|
+
# check if this is a literal
|
|
548
666
|
value_expr = exprs.Literal(val, col_type=col.col_type)
|
|
549
667
|
except TypeError:
|
|
668
|
+
if not allow_exprs:
|
|
669
|
+
raise excs.Error(
|
|
670
|
+
f'Column {col_name}: value {val!r} is not a valid literal for this column '
|
|
671
|
+
f'(expected {col.col_type})')
|
|
550
672
|
# it's not a literal, let's try to create an expr from it
|
|
551
673
|
value_expr = exprs.Expr.from_object(val)
|
|
552
674
|
if value_expr is None:
|
|
@@ -556,20 +678,10 @@ class Table(SchemaObject):
|
|
|
556
678
|
f'Type of value {val!r} ({value_expr.col_type}) is not compatible with the type of column '
|
|
557
679
|
f'{col_name} ({col.col_type})'
|
|
558
680
|
))
|
|
559
|
-
update_targets
|
|
681
|
+
update_targets[col] = value_expr
|
|
560
682
|
|
|
561
|
-
|
|
562
|
-
if where is not None:
|
|
563
|
-
if not isinstance(where, exprs.Predicate):
|
|
564
|
-
raise excs.Error(f"'where' argument must be a Predicate, got {type(where)}")
|
|
565
|
-
analysis_info = Planner.analyze(self.tbl_version_path, where)
|
|
566
|
-
if analysis_info.similarity_clause is not None:
|
|
567
|
-
raise excs.Error('nearest() cannot be used with update()')
|
|
568
|
-
# for now we require that the updated rows can be identified via SQL, rather than via a Python filter
|
|
569
|
-
if analysis_info.filter is not None:
|
|
570
|
-
raise excs.Error(f'Filter {analysis_info.filter} not expressible in SQL')
|
|
683
|
+
return update_targets
|
|
571
684
|
|
|
572
|
-
return self.tbl_version_path.tbl_version.update(update_targets, where, cascade)
|
|
573
685
|
|
|
574
686
|
def revert(self) -> None:
|
|
575
687
|
"""Reverts the table to the previous version.
|
|
@@ -577,5 +689,7 @@ class Table(SchemaObject):
|
|
|
577
689
|
.. warning::
|
|
578
690
|
This operation is irreversible.
|
|
579
691
|
"""
|
|
692
|
+
if self.tbl_version_path.is_snapshot():
|
|
693
|
+
raise excs.Error('Cannot revert a snapshot')
|
|
580
694
|
self._check_is_dropped()
|
|
581
695
|
self.tbl_version_path.tbl_version.revert()
|