pixeltable 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/catalog/column.py +25 -48
- pixeltable/catalog/insertable_table.py +7 -4
- pixeltable/catalog/table.py +163 -57
- pixeltable/catalog/table_version.py +416 -140
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/client.py +0 -4
- pixeltable/dataframe.py +65 -21
- pixeltable/env.py +16 -1
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/in_memory_data_node.py +11 -7
- pixeltable/exprs/comparison.py +3 -3
- pixeltable/exprs/data_row.py +5 -1
- pixeltable/exprs/literal.py +16 -4
- pixeltable/exprs/row_builder.py +8 -40
- pixeltable/ext/__init__.py +5 -0
- pixeltable/ext/functions/yolox.py +92 -0
- pixeltable/func/aggregate_function.py +15 -15
- pixeltable/func/expr_template_function.py +9 -1
- pixeltable/func/globals.py +24 -14
- pixeltable/func/signature.py +18 -12
- pixeltable/func/udf.py +7 -2
- pixeltable/functions/__init__.py +8 -8
- pixeltable/functions/eval.py +7 -8
- pixeltable/functions/huggingface.py +47 -19
- pixeltable/functions/openai.py +2 -2
- pixeltable/functions/util.py +11 -0
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +49 -0
- pixeltable/index/embedding_index.py +95 -0
- pixeltable/metadata/schema.py +45 -22
- pixeltable/plan.py +15 -34
- pixeltable/store.py +38 -41
- pixeltable/tests/conftest.py +5 -11
- pixeltable/tests/ext/test_yolox.py +21 -0
- pixeltable/tests/functions/test_fireworks.py +1 -0
- pixeltable/tests/functions/test_huggingface.py +2 -2
- pixeltable/tests/functions/test_openai.py +15 -5
- pixeltable/tests/functions/test_together.py +1 -0
- pixeltable/tests/test_component_view.py +14 -5
- pixeltable/tests/test_dataframe.py +19 -18
- pixeltable/tests/test_exprs.py +99 -102
- pixeltable/tests/test_function.py +51 -43
- pixeltable/tests/test_index.py +138 -0
- pixeltable/tests/test_migration.py +2 -1
- pixeltable/tests/test_snapshot.py +24 -1
- pixeltable/tests/test_table.py +101 -25
- pixeltable/tests/test_types.py +30 -0
- pixeltable/tests/test_video.py +16 -16
- pixeltable/tests/test_view.py +5 -0
- pixeltable/tests/utils.py +43 -9
- pixeltable/tool/create_test_db_dump.py +16 -0
- pixeltable/type_system.py +37 -45
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.5.dist-info}/METADATA +5 -4
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.5.dist-info}/RECORD +56 -49
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.5.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.5.dist-info}/WHEEL +0 -0
pixeltable/catalog/column.py
CHANGED
|
@@ -4,10 +4,8 @@ import logging
|
|
|
4
4
|
from typing import Optional, Union, Callable, Set
|
|
5
5
|
|
|
6
6
|
import sqlalchemy as sql
|
|
7
|
-
from pgvector.sqlalchemy import Vector
|
|
8
7
|
|
|
9
8
|
from pixeltable import exceptions as excs
|
|
10
|
-
from pixeltable.metadata import schema
|
|
11
9
|
from pixeltable.type_system import ColumnType, StringType
|
|
12
10
|
from .globals import is_valid_identifier
|
|
13
11
|
|
|
@@ -20,44 +18,38 @@ class Column:
|
|
|
20
18
|
table/view.
|
|
21
19
|
"""
|
|
22
20
|
def __init__(
|
|
23
|
-
self, name: str, col_type: Optional[ColumnType] = None,
|
|
21
|
+
self, name: Optional[str], col_type: Optional[ColumnType] = None,
|
|
24
22
|
computed_with: Optional[Union['Expr', Callable]] = None,
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
23
|
+
is_pk: bool = False, stored: Optional[bool] = None,
|
|
24
|
+
col_id: Optional[int] = None, schema_version_add: Optional[int] = None,
|
|
25
|
+
schema_version_drop: Optional[int] = None, sa_col_type: Optional[sql.sqltypes.TypeEngine] = None
|
|
26
|
+
):
|
|
29
27
|
"""Column constructor.
|
|
30
28
|
|
|
31
29
|
Args:
|
|
32
|
-
name: column name
|
|
30
|
+
name: column name; None for system columns (e.g., index columns)
|
|
33
31
|
col_type: column type; can be None if the type can be derived from ``computed_with``
|
|
34
32
|
computed_with: a callable or an Expr object that computes the column value
|
|
35
|
-
|
|
33
|
+
is_pk: if True, this column is part of the primary key
|
|
36
34
|
stored: determines whether a computed column is present in the stored table or recomputed on demand
|
|
37
|
-
indexed: if True, this column has a nearest neighbor index (only valid for image columns)
|
|
38
35
|
col_id: column ID (only used internally)
|
|
39
36
|
|
|
40
37
|
Computed columns: those have a non-None ``computed_with`` argument
|
|
41
|
-
|
|
42
38
|
- when constructed by the user: ``computed_with`` was constructed explicitly and is passed in;
|
|
43
39
|
col_type is None
|
|
44
40
|
- when loaded from md store: ``computed_with`` is set and col_type is set
|
|
45
41
|
|
|
46
42
|
``computed_with`` is a Callable:
|
|
47
|
-
|
|
48
43
|
- the callable's parameter names must correspond to existing columns in the table for which this Column
|
|
49
44
|
is being used
|
|
50
45
|
- ``col_type`` needs to be set to the callable's return type
|
|
51
46
|
|
|
52
47
|
``stored`` (only valid for computed image columns):
|
|
53
|
-
|
|
54
48
|
- if True: the column is present in the stored table
|
|
55
49
|
- if False: the column is not present in the stored table and recomputed during a query
|
|
56
50
|
- if None: the system chooses for you (at present, this is always False, but this may change in the future)
|
|
57
|
-
|
|
58
|
-
indexed: only valid for image columns; if true, maintains an NN index for this column
|
|
59
51
|
"""
|
|
60
|
-
if not is_valid_identifier(name):
|
|
52
|
+
if name is not None and not is_valid_identifier(name):
|
|
61
53
|
raise excs.Error(f"Invalid column name: '{name}'")
|
|
62
54
|
self.name = name
|
|
63
55
|
if col_type is None and computed_with is None:
|
|
@@ -90,35 +82,20 @@ class Column:
|
|
|
90
82
|
self.stored = stored
|
|
91
83
|
self.dependent_cols: Set[Column] = set() # cols with value_exprs that reference us; set by TableVersion
|
|
92
84
|
self.id = col_id
|
|
93
|
-
self.
|
|
85
|
+
self.is_pk = is_pk
|
|
86
|
+
self.schema_version_add = schema_version_add
|
|
87
|
+
self.schema_version_drop = schema_version_drop
|
|
94
88
|
|
|
95
89
|
# column in the stored table for the values of this Column
|
|
96
90
|
self.sa_col: Optional[sql.schema.Column] = None
|
|
91
|
+
self.sa_col_type = sa_col_type
|
|
97
92
|
|
|
98
93
|
# computed cols also have storage columns for the exception string and type
|
|
99
94
|
self.sa_errormsg_col: Optional[sql.schema.Column] = None
|
|
100
95
|
self.sa_errortype_col: Optional[sql.schema.Column] = None
|
|
101
|
-
# indexed columns also have a column for the embeddings
|
|
102
|
-
self.sa_idx_col: Optional[sql.schema.Column] = None
|
|
103
96
|
from .table_version import TableVersion
|
|
104
97
|
self.tbl: Optional[TableVersion] = None # set by owning TableVersion
|
|
105
98
|
|
|
106
|
-
if indexed and not self.col_type.is_image_type():
|
|
107
|
-
raise excs.Error(f'Column {name}: indexed=True requires ImageType')
|
|
108
|
-
self.is_indexed = indexed
|
|
109
|
-
|
|
110
|
-
@classmethod
|
|
111
|
-
def from_md(cls, col_id: int, md: schema.SchemaColumn, tbl: 'TableVersion') -> Column:
|
|
112
|
-
"""Construct a Column from metadata.
|
|
113
|
-
|
|
114
|
-
Leaves out value_expr, because that requires TableVersion.cols to be complete.
|
|
115
|
-
"""
|
|
116
|
-
col = cls(
|
|
117
|
-
md.name, col_type=ColumnType.from_dict(md.col_type), primary_key=md.is_pk,
|
|
118
|
-
stored=md.stored, indexed=md.is_indexed, col_id=col_id)
|
|
119
|
-
col.tbl = tbl
|
|
120
|
-
return col
|
|
121
|
-
|
|
122
99
|
def __hash__(self) -> int:
|
|
123
100
|
assert self.tbl is not None
|
|
124
101
|
return hash((self.tbl.id, self.id))
|
|
@@ -167,26 +144,26 @@ class Column:
|
|
|
167
144
|
"""
|
|
168
145
|
assert self.is_stored
|
|
169
146
|
# all storage columns are nullable (we deal with null errors in Pixeltable directly)
|
|
170
|
-
self.sa_col = sql.Column(
|
|
147
|
+
self.sa_col = sql.Column(
|
|
148
|
+
self.store_name(), self.col_type.to_sa_type() if self.sa_col_type is None else self.sa_col_type,
|
|
149
|
+
nullable=True)
|
|
171
150
|
if self.is_computed or self.col_type.is_media_type():
|
|
172
|
-
self.sa_errormsg_col = sql.Column(self.
|
|
173
|
-
self.sa_errortype_col = sql.Column(self.
|
|
174
|
-
if self.is_indexed:
|
|
175
|
-
self.sa_idx_col = sql.Column(self.index_storage_name(), Vector(512), nullable=True)
|
|
151
|
+
self.sa_errormsg_col = sql.Column(self.errormsg_store_name(), StringType().to_sa_type(), nullable=True)
|
|
152
|
+
self.sa_errortype_col = sql.Column(self.errortype_store_name(), StringType().to_sa_type(), nullable=True)
|
|
176
153
|
|
|
177
|
-
def
|
|
154
|
+
def get_sa_col_type(self) -> sql.sqltypes.TypeEngine:
|
|
155
|
+
return self.col_type.to_sa_type() if self.sa_col_type is None else self.sa_col_type
|
|
156
|
+
|
|
157
|
+
def store_name(self) -> str:
|
|
178
158
|
assert self.id is not None
|
|
179
159
|
assert self.is_stored
|
|
180
160
|
return f'col_{self.id}'
|
|
181
161
|
|
|
182
|
-
def
|
|
183
|
-
return f'{self.
|
|
184
|
-
|
|
185
|
-
def errortype_storage_name(self) -> str:
|
|
186
|
-
return f'{self.storage_name()}_errortype'
|
|
162
|
+
def errormsg_store_name(self) -> str:
|
|
163
|
+
return f'{self.store_name()}_errormsg'
|
|
187
164
|
|
|
188
|
-
def
|
|
189
|
-
return f'{self.
|
|
165
|
+
def errortype_store_name(self) -> str:
|
|
166
|
+
return f'{self.store_name()}_errortype'
|
|
190
167
|
|
|
191
168
|
def __str__(self) -> str:
|
|
192
169
|
return f'{self.name}: {self.col_type}'
|
|
@@ -11,14 +11,17 @@ import pixeltable.type_system as ts
|
|
|
11
11
|
from pixeltable import exceptions as excs
|
|
12
12
|
from pixeltable.env import Env
|
|
13
13
|
from .catalog import Catalog
|
|
14
|
+
from .globals import UpdateStatus
|
|
14
15
|
from .table import Table
|
|
15
16
|
from .table_version import TableVersion
|
|
16
17
|
from .table_version_path import TableVersionPath
|
|
17
18
|
|
|
18
19
|
_logger = logging.getLogger('pixeltable')
|
|
19
20
|
|
|
21
|
+
|
|
20
22
|
class InsertableTable(Table):
|
|
21
23
|
"""A `Table` that allows inserting and deleting rows."""
|
|
24
|
+
|
|
22
25
|
def __init__(self, dir_id: UUID, tbl_version: TableVersion):
|
|
23
26
|
tbl_version_path = TableVersionPath(tbl_version)
|
|
24
27
|
super().__init__(tbl_version.id, dir_id, tbl_version.name, tbl_version_path)
|
|
@@ -42,7 +45,7 @@ class InsertableTable(Table):
|
|
|
42
45
|
col = columns[column_names.index(pk_col)]
|
|
43
46
|
if col.col_type.nullable:
|
|
44
47
|
raise excs.Error(f'Primary key column {pk_col} cannot be nullable')
|
|
45
|
-
col.
|
|
48
|
+
col.is_pk = True
|
|
46
49
|
|
|
47
50
|
with orm.Session(Env.get().engine, future=True) as session:
|
|
48
51
|
_, tbl_version = TableVersion.create(session, dir_id, name, columns, num_retained_versions, comment)
|
|
@@ -62,7 +65,7 @@ class InsertableTable(Table):
|
|
|
62
65
|
@overload
|
|
63
66
|
def insert(self, print_stats: bool = False, fail_on_exception: bool = True, **kwargs: Any): ...
|
|
64
67
|
|
|
65
|
-
def insert(self, *args, **kwargs) ->
|
|
68
|
+
def insert(self, *args, **kwargs) -> UpdateStatus:
|
|
66
69
|
"""Insert rows into table.
|
|
67
70
|
|
|
68
71
|
To insert multiple rows at a time:
|
|
@@ -161,7 +164,7 @@ class InsertableTable(Table):
|
|
|
161
164
|
msg = str(e)
|
|
162
165
|
raise excs.Error(f'Error in column {col.name}: {msg[0].lower() + msg[1:]}\nRow: {row}')
|
|
163
166
|
|
|
164
|
-
def delete(self, where: Optional['pixeltable.exprs.Predicate'] = None) ->
|
|
167
|
+
def delete(self, where: Optional['pixeltable.exprs.Predicate'] = None) -> UpdateStatus:
|
|
165
168
|
"""Delete rows in this table.
|
|
166
169
|
|
|
167
170
|
Args:
|
|
@@ -181,7 +184,7 @@ class InsertableTable(Table):
|
|
|
181
184
|
if where is not None:
|
|
182
185
|
if not isinstance(where, Predicate):
|
|
183
186
|
raise excs.Error(f"'where' argument must be a Predicate, got {type(where)}")
|
|
184
|
-
analysis_info = Planner.analyze(self.
|
|
187
|
+
analysis_info = Planner.analyze(self.tbl_version_path, where)
|
|
185
188
|
if analysis_info.similarity_clause is not None:
|
|
186
189
|
raise excs.Error('nearest() cannot be used with delete()')
|
|
187
190
|
# for now we require that the updated rows can be identified via SQL, rather than via a Python filter
|
pixeltable/catalog/table.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
import dataclasses
|
|
4
3
|
import json
|
|
5
4
|
import logging
|
|
6
5
|
from pathlib import Path
|
|
7
|
-
from typing import Union, Any, List, Dict, Optional, Callable, Set, Tuple
|
|
6
|
+
from typing import Union, Any, List, Dict, Optional, Callable, Set, Tuple, Iterable
|
|
8
7
|
from uuid import UUID
|
|
9
8
|
|
|
10
9
|
import pandas as pd
|
|
@@ -18,7 +17,7 @@ import pixeltable.exprs as exprs
|
|
|
18
17
|
import pixeltable.metadata.schema as schema
|
|
19
18
|
import pixeltable.type_system as ts
|
|
20
19
|
from .column import Column
|
|
21
|
-
from .globals import is_valid_identifier, is_system_column_name
|
|
20
|
+
from .globals import is_valid_identifier, is_system_column_name, UpdateStatus
|
|
22
21
|
from .schema_object import SchemaObject
|
|
23
22
|
from .table_version import TableVersion
|
|
24
23
|
from .table_version_path import TableVersionPath
|
|
@@ -28,14 +27,7 @@ _logger = logging.getLogger('pixeltable')
|
|
|
28
27
|
class Table(SchemaObject):
|
|
29
28
|
"""Base class for all tabular SchemaObjects."""
|
|
30
29
|
|
|
31
|
-
|
|
32
|
-
class UpdateStatus:
|
|
33
|
-
num_rows: int = 0
|
|
34
|
-
# TODO: change to num_computed_columns (the number of computed slots isn't really meaningful to the user)
|
|
35
|
-
num_computed_values: int = 0
|
|
36
|
-
num_excs: int = 0
|
|
37
|
-
updated_cols: List[str] = dataclasses.field(default_factory=list)
|
|
38
|
-
cols_with_excs: List[str] = dataclasses.field(default_factory=list)
|
|
30
|
+
ROWID_COLUMN_NAME = '_rowid'
|
|
39
31
|
|
|
40
32
|
def __init__(self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath):
|
|
41
33
|
super().__init__(id, name, dir_id)
|
|
@@ -225,7 +217,7 @@ class Table(SchemaObject):
|
|
|
225
217
|
value: column type or value expression or column specification dictionary:
|
|
226
218
|
column type: a Pixeltable column type (if the table already contains rows, it must be nullable)
|
|
227
219
|
value expression: a Pixeltable expression that computes the column values
|
|
228
|
-
column specification: a dictionary with possible keys 'type', 'value', 'stored'
|
|
220
|
+
column specification: a dictionary with possible keys 'type', 'value', 'stored'
|
|
229
221
|
Examples:
|
|
230
222
|
Add an int column with ``None`` values:
|
|
231
223
|
|
|
@@ -247,11 +239,6 @@ class Table(SchemaObject):
|
|
|
247
239
|
Do the same, but now the column is stored:
|
|
248
240
|
|
|
249
241
|
>>> tbl['rotated'] = {'value': tbl.frame.rotate(90), 'stored': True}
|
|
250
|
-
|
|
251
|
-
Add a resized version of the ``frame`` column and index it. The column does not need to be stored in order
|
|
252
|
-
to be indexed:
|
|
253
|
-
|
|
254
|
-
>>> tbl['small_frame'] = {'value': tbl.frame.resize([224, 224]), 'indexed': True}
|
|
255
242
|
"""
|
|
256
243
|
if not isinstance(column_name, str):
|
|
257
244
|
raise excs.Error(f'Column name must be a string, got {type(column_name)}')
|
|
@@ -264,8 +251,8 @@ class Table(SchemaObject):
|
|
|
264
251
|
|
|
265
252
|
def add_column(
|
|
266
253
|
self, *,
|
|
267
|
-
type: Optional[ts.ColumnType] = None, stored: Optional[bool] = None,
|
|
268
|
-
|
|
254
|
+
type: Optional[ts.ColumnType] = None, stored: Optional[bool] = None, print_stats: bool = False,
|
|
255
|
+
**kwargs: Any
|
|
269
256
|
) -> UpdateStatus:
|
|
270
257
|
"""Adds a column to the table.
|
|
271
258
|
|
|
@@ -273,7 +260,6 @@ class Table(SchemaObject):
|
|
|
273
260
|
kwargs: Exactly one keyword argument of the form ``column-name=type|value-expression``.
|
|
274
261
|
type: The type of the column. Only valid and required if ``value-expression`` is a Callable.
|
|
275
262
|
stored: Whether the column is materialized and stored or computed on demand. Only valid for image columns.
|
|
276
|
-
indexed: Whether the column is indexed.
|
|
277
263
|
print_stats: If ``True``, print execution metrics.
|
|
278
264
|
|
|
279
265
|
Returns:
|
|
@@ -318,15 +304,6 @@ class Table(SchemaObject):
|
|
|
318
304
|
Alternatively, this can also be expressed as:
|
|
319
305
|
|
|
320
306
|
>>> tbl['rotated'] = {'value': tbl.frame.rotate(90), 'stored': True}
|
|
321
|
-
|
|
322
|
-
Add a resized version of the ``frame`` column and index it. The column does not need to be stored in order
|
|
323
|
-
to be indexed:
|
|
324
|
-
|
|
325
|
-
>>> tbl.add_column(small_frame=tbl.frame.resize([224, 224]), indexed=True)
|
|
326
|
-
|
|
327
|
-
Alternatively, this can also be expressed as:
|
|
328
|
-
|
|
329
|
-
>>> tbl['small_frame'] = {'value': tbl.frame.resize([224, 224]), 'indexed': True}
|
|
330
307
|
"""
|
|
331
308
|
self._check_is_dropped()
|
|
332
309
|
# verify kwargs and construct column schema dict
|
|
@@ -349,8 +326,6 @@ class Table(SchemaObject):
|
|
|
349
326
|
col_schema['type'] = type
|
|
350
327
|
if stored is not None:
|
|
351
328
|
col_schema['stored'] = stored
|
|
352
|
-
if indexed is not None:
|
|
353
|
-
col_schema['indexed'] = indexed
|
|
354
329
|
|
|
355
330
|
new_col = self._create_columns({col_name: col_schema})[0]
|
|
356
331
|
self._verify_column(new_col, self.column_names())
|
|
@@ -364,7 +339,7 @@ class Table(SchemaObject):
|
|
|
364
339
|
(on account of containing Python Callables or Exprs).
|
|
365
340
|
"""
|
|
366
341
|
assert isinstance(spec, dict)
|
|
367
|
-
valid_keys = {'type', 'value', 'stored'
|
|
342
|
+
valid_keys = {'type', 'value', 'stored'}
|
|
368
343
|
has_type = False
|
|
369
344
|
for k in spec.keys():
|
|
370
345
|
if k not in valid_keys:
|
|
@@ -393,8 +368,6 @@ class Table(SchemaObject):
|
|
|
393
368
|
|
|
394
369
|
if 'stored' in spec and not isinstance(spec['stored'], bool):
|
|
395
370
|
raise excs.Error(f'Column {name}: "stored" must be a bool, got {spec["stored"]}')
|
|
396
|
-
if 'indexed' in spec and not isinstance(spec['indexed'], bool):
|
|
397
|
-
raise excs.Error(f'Column {name}: "indexed" must be a bool, got {spec["indexed"]}')
|
|
398
371
|
if not has_type:
|
|
399
372
|
raise excs.Error(f'Column {name}: "type" is required')
|
|
400
373
|
|
|
@@ -406,7 +379,6 @@ class Table(SchemaObject):
|
|
|
406
379
|
col_type: Optional[ts.ColumnType] = None
|
|
407
380
|
value_expr: Optional[exprs.Expr] = None
|
|
408
381
|
stored: Optional[bool] = None
|
|
409
|
-
indexed: Optional[bool] = None
|
|
410
382
|
primary_key: Optional[bool] = None
|
|
411
383
|
|
|
412
384
|
if isinstance(spec, ts.ColumnType):
|
|
@@ -428,12 +400,10 @@ class Table(SchemaObject):
|
|
|
428
400
|
# create copy so we can modify it
|
|
429
401
|
value_expr = value_expr.copy()
|
|
430
402
|
stored = spec.get('stored')
|
|
431
|
-
indexed = spec.get('indexed')
|
|
432
403
|
primary_key = spec.get('primary_key')
|
|
433
404
|
|
|
434
405
|
column = Column(
|
|
435
|
-
name, col_type=col_type, computed_with=value_expr, stored=stored,
|
|
436
|
-
primary_key=primary_key)
|
|
406
|
+
name, col_type=col_type, computed_with=value_expr, stored=stored, is_pk=primary_key)
|
|
437
407
|
columns.append(column)
|
|
438
408
|
return columns
|
|
439
409
|
|
|
@@ -498,9 +468,85 @@ class Table(SchemaObject):
|
|
|
498
468
|
self._check_is_dropped()
|
|
499
469
|
self.tbl_version_path.tbl_version.rename_column(old_name, new_name)
|
|
500
470
|
|
|
471
|
+
def add_embedding_index(
|
|
472
|
+
self, col_name: str, *, idx_name: Optional[str] = None,
|
|
473
|
+
text_embed: Optional[pixeltable.Function] = None, img_embed: Optional[pixeltable.Function] = None
|
|
474
|
+
) -> None:
|
|
475
|
+
"""Add an index to the table.
|
|
476
|
+
Args:
|
|
477
|
+
col_name: name of column to index
|
|
478
|
+
idx_name: name of index, which needs to be unique for the table; if not provided, a name will be generated
|
|
479
|
+
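text_embed, img_embed: optional functions passed through to the EmbeddingIndex created for the column (presumably the text and image embedding functions)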
|
|
480
|
+
|
|
481
|
+
Raises:
|
|
482
|
+
Error: If an index with that name already exists for the table or if the column does not exist.
|
|
483
|
+
|
|
484
|
+
Examples:
|
|
485
|
+
Add an index to the ``img`` column:
|
|
486
|
+
|
|
487
|
+
>>> tbl.add_embedding_index('img', text_embed=...)
|
|
488
|
+
|
|
489
|
+
Add another index to the ``img`` column, with a specific name:
|
|
490
|
+
|
|
491
|
+
>>> tbl.add_embedding_index('img', idx_name='clip_idx', text_embed=...)
|
|
492
|
+
"""
|
|
493
|
+
if self.tbl_version_path.is_snapshot():
|
|
494
|
+
raise excs.Error('Cannot add an index to a snapshot')
|
|
495
|
+
self._check_is_dropped()
|
|
496
|
+
col = self.tbl_version_path.get_column(col_name, include_bases=True)
|
|
497
|
+
if col is None:
|
|
498
|
+
raise excs.Error(f'Column {col_name} unknown')
|
|
499
|
+
if idx_name is not None and idx_name in self.tbl_version_path.tbl_version.idxs_by_name:
|
|
500
|
+
raise excs.Error(f'Duplicate index name: {idx_name}')
|
|
501
|
+
from pixeltable.index import EmbeddingIndex
|
|
502
|
+
# create the EmbeddingIndex instance to verify args
|
|
503
|
+
idx = EmbeddingIndex(col, text_embed=text_embed, img_embed=img_embed)
|
|
504
|
+
status = self.tbl_version_path.tbl_version.add_index(col, idx_name=idx_name, idx=idx)
|
|
505
|
+
# TODO: how to deal with exceptions here? drop the index and raise?
|
|
506
|
+
|
|
507
|
+
def drop_index(self, *, column_name: Optional[str] = None, idx_name: Optional[str] = None) -> None:
|
|
508
|
+
"""Drop an index from the table.
|
|
509
|
+
|
|
510
|
+
Args:
|
|
511
|
+
column_name: The name of the column whose index to drop. Invalid if the column has multiple indices.
|
|
512
|
+
idx_name: The name of the index to drop.
|
|
513
|
+
|
|
514
|
+
Raises:
|
|
515
|
+
Error: If the index does not exist.
|
|
516
|
+
|
|
517
|
+
Examples:
|
|
518
|
+
Drop index on the ``img`` column:
|
|
519
|
+
|
|
520
|
+
>>> tbl.drop_index(column_name='img')
|
|
521
|
+
"""
|
|
522
|
+
if self.tbl_version_path.is_snapshot():
|
|
523
|
+
raise excs.Error('Cannot drop an index from a snapshot')
|
|
524
|
+
self._check_is_dropped()
|
|
525
|
+
if (column_name is None) == (idx_name is None):
|
|
526
|
+
raise excs.Error('Exactly one of column_name or idx_name must be provided')
|
|
527
|
+
tbl_version = self.tbl_version_path.tbl_version
|
|
528
|
+
|
|
529
|
+
if idx_name is not None:
|
|
530
|
+
if idx_name not in tbl_version.idxs_by_name:
|
|
531
|
+
raise excs.Error(f'Index {idx_name} does not exist')
|
|
532
|
+
idx_id = tbl_version.idxs_by_name[idx_name].id
|
|
533
|
+
else:
|
|
534
|
+
col = self.tbl_version_path.get_column(column_name, include_bases=True)
|
|
535
|
+
if col is None:
|
|
536
|
+
raise excs.Error(f'Column {column_name} unknown')
|
|
537
|
+
if col.tbl.id != tbl_version.id:
|
|
538
|
+
raise excs.Error(
|
|
539
|
+
f'Column {column_name}: cannot drop index from column that belongs to base ({col.tbl.name})')
|
|
540
|
+
idx_ids = [info.id for info in tbl_version.idxs_by_name.values() if info.col.id == col.id]
|
|
541
|
+
if len(idx_ids) == 0:
|
|
542
|
+
raise excs.Error(f'Column {column_name} does not have an index')
|
|
543
|
+
if len(idx_ids) > 1:
|
|
544
|
+
raise excs.Error(f'Column {column_name} has multiple indices; specify idx_name instead')
|
|
545
|
+
idx_id = idx_ids[0]
|
|
546
|
+
self.tbl_version_path.tbl_version.drop_index(idx_id)
|
|
547
|
+
|
|
501
548
|
def update(
|
|
502
|
-
self, value_spec:
|
|
503
|
-
where: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
|
|
549
|
+
self, value_spec: dict[str, Any], where: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
|
|
504
550
|
) -> UpdateStatus:
|
|
505
551
|
"""Update rows in this table.
|
|
506
552
|
|
|
@@ -510,11 +556,11 @@ class Table(SchemaObject):
|
|
|
510
556
|
cascade: if True, also update all computed columns that transitively depend on the updated columns.
|
|
511
557
|
|
|
512
558
|
Examples:
|
|
513
|
-
Set
|
|
559
|
+
Set column `int_col` to 1 for all rows:
|
|
514
560
|
|
|
515
561
|
>>> tbl.update({'int_col': 1})
|
|
516
562
|
|
|
517
|
-
Set
|
|
563
|
+
Set column `int_col` to 1 for all rows where `int_col` is 0:
|
|
518
564
|
|
|
519
565
|
>>> tbl.update({'int_col': 1}, where=tbl.int_col == 0)
|
|
520
566
|
|
|
@@ -526,27 +572,95 @@ class Table(SchemaObject):
|
|
|
526
572
|
|
|
527
573
|
>>> tbl.update({'int_col': tbl.int_col + 1}, where=tbl.int_col == 0)
|
|
528
574
|
"""
|
|
575
|
+
if self.tbl_version_path.is_snapshot():
|
|
576
|
+
raise excs.Error('Cannot update a snapshot')
|
|
577
|
+
self._check_is_dropped()
|
|
578
|
+
|
|
579
|
+
update_spec = self._validate_update_spec(value_spec, allow_pk=False, allow_exprs=True)
|
|
580
|
+
from pixeltable.plan import Planner
|
|
581
|
+
if where is not None:
|
|
582
|
+
if not isinstance(where, exprs.Predicate):
|
|
583
|
+
raise excs.Error(f"'where' argument must be a Predicate, got {type(where)}")
|
|
584
|
+
analysis_info = Planner.analyze(self.tbl_version_path, where)
|
|
585
|
+
if analysis_info.similarity_clause is not None:
|
|
586
|
+
raise excs.Error('nearest() cannot be used with update()')
|
|
587
|
+
# for now we require that the updated rows can be identified via SQL, rather than via a Python filter
|
|
588
|
+
if analysis_info.filter is not None:
|
|
589
|
+
raise excs.Error(f'Filter {analysis_info.filter} not expressible in SQL')
|
|
590
|
+
|
|
591
|
+
return self.tbl_version_path.tbl_version.update(update_spec, where, cascade)
|
|
592
|
+
|
|
593
|
+
def batch_update(self, rows: Iterable[dict[str, Any]], cascade: bool = True) -> UpdateStatus:
|
|
594
|
+
"""Update rows in this table.
|
|
595
|
+
|
|
596
|
+
Args:
|
|
597
|
+
rows: an Iterable of dictionaries containing values for the updated columns plus values for the primary key
|
|
598
|
+
columns.
|
|
599
|
+
cascade: if True, also update all computed columns that transitively depend on the updated columns.
|
|
600
|
+
|
|
601
|
+
Examples:
|
|
602
|
+
Update the 'name' and 'age' columns for the rows with ids 1 and 2 (assuming 'id' is the primary key):
|
|
603
|
+
|
|
604
|
+
>>> tbl.batch_update([{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 2, 'name': 'Bob', 'age': 40}])
|
|
605
|
+
"""
|
|
606
|
+
if self.tbl_version_path.is_snapshot():
|
|
607
|
+
raise excs.Error('Cannot update a snapshot')
|
|
608
|
+
self._check_is_dropped()
|
|
609
|
+
|
|
610
|
+
row_updates: List[Dict[Column, exprs.Expr]] = []
|
|
611
|
+
pk_col_names = set(c.name for c in self.tbl_version_path.tbl_version.primary_key_columns())
|
|
612
|
+
|
|
613
|
+
# pseudo-column _rowid: contains the rowid of the row to update and can be used instead of the primary key
|
|
614
|
+
has_rowid = self.ROWID_COLUMN_NAME in rows[0]
|
|
615
|
+
rowids: list[Tuple[int, ...]] = []
|
|
616
|
+
if len(pk_col_names) == 0 and not has_rowid:
|
|
617
|
+
raise excs.Error('Table must have primary key for batch update')
|
|
618
|
+
|
|
619
|
+
for row_spec in rows:
|
|
620
|
+
col_vals = self._validate_update_spec(row_spec, allow_pk=not has_rowid, allow_exprs=False)
|
|
621
|
+
if has_rowid:
|
|
622
|
+
# we expect the _rowid column to be present for each row
|
|
623
|
+
assert self.ROWID_COLUMN_NAME in row_spec
|
|
624
|
+
rowids.append(row_spec[self.ROWID_COLUMN_NAME])
|
|
625
|
+
else:
|
|
626
|
+
col_names = set(col.name for col in col_vals.keys())
|
|
627
|
+
if any(pk_col_name not in col_names for pk_col_name in pk_col_names):
|
|
628
|
+
missing_cols = pk_col_names - set(col.name for col in col_vals.keys())
|
|
629
|
+
raise excs.Error(f'Primary key columns ({", ".join(missing_cols)}) missing in {row_spec}')
|
|
630
|
+
row_updates.append(col_vals)
|
|
631
|
+
return self.tbl_version_path.tbl_version.batch_update(row_updates, rowids, cascade)
|
|
632
|
+
|
|
633
|
+
def _validate_update_spec(
|
|
634
|
+
self, value_spec: dict[str, Any], allow_pk: bool, allow_exprs: bool
|
|
635
|
+
) -> dict[Column, 'pixeltable.exprs.Expr']:
|
|
529
636
|
from pixeltable import exprs
|
|
530
|
-
update_targets:
|
|
637
|
+
update_targets: dict[Column, exprs.Expr] = {}
|
|
531
638
|
for col_name, val in value_spec.items():
|
|
532
639
|
if not isinstance(col_name, str):
|
|
533
640
|
raise excs.Error(f'Update specification: dict key must be column name, got {col_name!r}')
|
|
641
|
+
if col_name == self.ROWID_COLUMN_NAME:
|
|
642
|
+
# ignore pseudo-column _rowid
|
|
643
|
+
continue
|
|
534
644
|
col = self.tbl_version_path.get_column(col_name, include_bases=False)
|
|
535
645
|
if col is None:
|
|
536
646
|
# TODO: return more informative error if this is trying to update a base column
|
|
537
647
|
raise excs.Error(f'Column {col_name} unknown')
|
|
538
648
|
if col.is_computed:
|
|
539
649
|
raise excs.Error(f'Column {col_name} is computed and cannot be updated')
|
|
540
|
-
if col.
|
|
650
|
+
if col.is_pk and not allow_pk:
|
|
541
651
|
raise excs.Error(f'Column {col_name} is a primary key column and cannot be updated')
|
|
542
652
|
if col.col_type.is_media_type():
|
|
543
653
|
raise excs.Error(f'Column {col_name} has type image/video/audio/document and cannot be updated')
|
|
544
654
|
|
|
545
655
|
# make sure that the value is compatible with the column type
|
|
546
|
-
# check if this is a literal
|
|
547
656
|
try:
|
|
657
|
+
# check if this is a literal
|
|
548
658
|
value_expr = exprs.Literal(val, col_type=col.col_type)
|
|
549
659
|
except TypeError:
|
|
660
|
+
if not allow_exprs:
|
|
661
|
+
raise excs.Error(
|
|
662
|
+
f'Column {col_name}: value {val!r} is not a valid literal for this column '
|
|
663
|
+
f'(expected {col.col_type})')
|
|
550
664
|
# it's not a literal, let's try to create an expr from it
|
|
551
665
|
value_expr = exprs.Expr.from_object(val)
|
|
552
666
|
if value_expr is None:
|
|
@@ -556,20 +670,10 @@ class Table(SchemaObject):
|
|
|
556
670
|
f'Type of value {val!r} ({value_expr.col_type}) is not compatible with the type of column '
|
|
557
671
|
f'{col_name} ({col.col_type})'
|
|
558
672
|
))
|
|
559
|
-
update_targets
|
|
673
|
+
update_targets[col] = value_expr
|
|
560
674
|
|
|
561
|
-
|
|
562
|
-
if where is not None:
|
|
563
|
-
if not isinstance(where, exprs.Predicate):
|
|
564
|
-
raise excs.Error(f"'where' argument must be a Predicate, got {type(where)}")
|
|
565
|
-
analysis_info = Planner.analyze(self.tbl_version_path, where)
|
|
566
|
-
if analysis_info.similarity_clause is not None:
|
|
567
|
-
raise excs.Error('nearest() cannot be used with update()')
|
|
568
|
-
# for now we require that the updated rows can be identified via SQL, rather than via a Python filter
|
|
569
|
-
if analysis_info.filter is not None:
|
|
570
|
-
raise excs.Error(f'Filter {analysis_info.filter} not expressible in SQL')
|
|
675
|
+
return update_targets
|
|
571
676
|
|
|
572
|
-
return self.tbl_version_path.tbl_version.update(update_targets, where, cascade)
|
|
573
677
|
|
|
574
678
|
def revert(self) -> None:
|
|
575
679
|
"""Reverts the table to the previous version.
|
|
@@ -577,5 +681,7 @@ class Table(SchemaObject):
|
|
|
577
681
|
.. warning::
|
|
578
682
|
This operation is irreversible.
|
|
579
683
|
"""
|
|
684
|
+
if self.tbl_version_path.is_snapshot():
|
|
685
|
+
raise excs.Error('Cannot revert a snapshot')
|
|
580
686
|
self._check_is_dropped()
|
|
581
687
|
self.tbl_version_path.tbl_version.revert()
|