pixeltable 0.2.25__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/__init__.py +2 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/dir.py +6 -0
- pixeltable/catalog/globals.py +25 -0
- pixeltable/catalog/named_function.py +4 -0
- pixeltable/catalog/path_dict.py +37 -11
- pixeltable/catalog/schema_object.py +6 -0
- pixeltable/catalog/table.py +421 -231
- pixeltable/catalog/table_version.py +22 -8
- pixeltable/catalog/view.py +5 -7
- pixeltable/dataframe.py +439 -105
- pixeltable/env.py +19 -5
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/exec_node.py +6 -7
- pixeltable/exec/expr_eval_node.py +1 -1
- pixeltable/exec/sql_node.py +92 -45
- pixeltable/exprs/__init__.py +1 -0
- pixeltable/exprs/arithmetic_expr.py +1 -1
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +1 -1
- pixeltable/exprs/column_ref.py +29 -2
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/compound_predicate.py +1 -1
- pixeltable/exprs/expr.py +12 -5
- pixeltable/exprs/expr_set.py +8 -0
- pixeltable/exprs/function_call.py +147 -39
- pixeltable/exprs/in_predicate.py +1 -1
- pixeltable/exprs/inline_expr.py +25 -5
- pixeltable/exprs/is_null.py +1 -1
- pixeltable/exprs/json_mapper.py +1 -1
- pixeltable/exprs/json_path.py +1 -1
- pixeltable/exprs/method_ref.py +1 -1
- pixeltable/exprs/row_builder.py +1 -1
- pixeltable/exprs/rowid_ref.py +1 -1
- pixeltable/exprs/similarity_expr.py +14 -7
- pixeltable/exprs/sql_element_cache.py +4 -0
- pixeltable/exprs/type_cast.py +2 -2
- pixeltable/exprs/variable.py +3 -0
- pixeltable/func/__init__.py +5 -4
- pixeltable/func/aggregate_function.py +151 -68
- pixeltable/func/callable_function.py +48 -16
- pixeltable/func/expr_template_function.py +64 -23
- pixeltable/func/function.py +195 -27
- pixeltable/func/function_registry.py +2 -1
- pixeltable/func/query_template_function.py +51 -9
- pixeltable/func/signature.py +64 -7
- pixeltable/func/tools.py +153 -0
- pixeltable/func/udf.py +57 -35
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/anthropic.py +51 -4
- pixeltable/functions/gemini.py +85 -0
- pixeltable/functions/globals.py +54 -34
- pixeltable/functions/huggingface.py +10 -28
- pixeltable/functions/json.py +3 -8
- pixeltable/functions/math.py +67 -0
- pixeltable/functions/ollama.py +8 -8
- pixeltable/functions/openai.py +51 -4
- pixeltable/functions/timestamp.py +1 -1
- pixeltable/functions/video.py +3 -9
- pixeltable/functions/vision.py +1 -1
- pixeltable/globals.py +354 -80
- pixeltable/index/embedding_index.py +106 -34
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/label_studio.py +1 -1
- pixeltable/io/parquet.py +39 -19
- pixeltable/iterators/document.py +12 -0
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_16.py +2 -1
- pixeltable/metadata/converters/convert_17.py +2 -1
- pixeltable/metadata/converters/convert_22.py +17 -0
- pixeltable/metadata/converters/convert_23.py +35 -0
- pixeltable/metadata/converters/convert_24.py +56 -0
- pixeltable/metadata/converters/convert_25.py +19 -0
- pixeltable/metadata/converters/util.py +4 -2
- pixeltable/metadata/notes.py +4 -0
- pixeltable/metadata/schema.py +1 -0
- pixeltable/plan.py +128 -50
- pixeltable/store.py +1 -1
- pixeltable/type_system.py +196 -54
- pixeltable/utils/arrow.py +8 -3
- pixeltable/utils/description_helper.py +89 -0
- pixeltable/utils/documents.py +14 -0
- {pixeltable-0.2.25.dist-info → pixeltable-0.3.0.dist-info}/METADATA +30 -20
- pixeltable-0.3.0.dist-info/RECORD +155 -0
- {pixeltable-0.2.25.dist-info → pixeltable-0.3.0.dist-info}/WHEEL +1 -1
- pixeltable-0.3.0.dist-info/entry_points.txt +3 -0
- pixeltable/tool/create_test_db_dump.py +0 -311
- pixeltable/tool/create_test_video.py +0 -81
- pixeltable/tool/doc_plugins/griffe.py +0 -50
- pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
- pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
- pixeltable/tool/embed_udf.py +0 -9
- pixeltable/tool/mypy_plugin.py +0 -55
- pixeltable-0.2.25.dist-info/RECORD +0 -154
- pixeltable-0.2.25.dist-info/entry_points.txt +0 -3
- {pixeltable-0.2.25.dist-info → pixeltable-0.3.0.dist-info}/LICENSE +0 -0
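The largest change in this release is the rework of pixeltable/catalog/table.py, shown below. For orientation, here is a minimal usage sketch of the new Table API surface it introduces (the if_exists/if_not_exists directives and Table.join()), based on the docstrings in that diff; the table, column, and join-key names are hypothetical:

import pixeltable as pxt

tbl = pxt.get_table('my_table')                      # hypothetical table

# add_column()/add_columns() now accept an if_exists directive
tbl.add_column(new_col=pxt.Int, if_exists='ignore')

# drop_column()/drop_index() now accept an if_not_exists directive
tbl.drop_column('new_col', if_not_exists='ignore')

# new Table.join(), which delegates to DataFrame.join()
other = pxt.get_table('other_table')                 # hypothetical table
joined = tbl.join(other, on=tbl.id == other.id, how='inner')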
pixeltable/catalog/table.py
CHANGED
@@ -10,7 +10,6 @@ from typing import TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, Se
 from uuid import UUID
 
 import pandas as pd
-import pandas.io.formats.style
 import sqlalchemy as sql
 
 import pixeltable as pxt
@@ -21,18 +20,22 @@ import pixeltable.exprs as exprs
 import pixeltable.index as index
 import pixeltable.metadata.schema as schema
 import pixeltable.type_system as ts
-from pixeltable.utils.filecache import FileCache
 
+from ..exprs import ColumnRef
+from ..utils.description_helper import DescriptionHelper
+from ..utils.filecache import FileCache
 from .column import Column
-from .globals import _ROWID_COLUMN_NAME,
+from .globals import (_ROWID_COLUMN_NAME, IfExistsParam, IfNotExistsParam, MediaValidation, UpdateStatus,
+                      is_system_column_name, is_valid_identifier)
 from .schema_object import SchemaObject
 from .table_version import TableVersion
 from .table_version_path import TableVersionPath
-from ..exprs import ColumnRef
 
 if TYPE_CHECKING:
     import torch.utils.data
 
+    import pixeltable.plan
+
 _logger = logging.getLogger('pixeltable')
 
 class Table(SchemaObject):
@@ -46,23 +49,15 @@ class Table(SchemaObject):
     def __init__(self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath):
         super().__init__(id, name, dir_id)
         self._is_dropped = False
-        self.
-        self.__query_scope = self.QueryScope(self)
-
-    class QueryScope:
-        __table: 'Table'
-        _queries: dict[str, pxt.func.QueryTemplateFunction]
-
-        def __init__(self, table: 'Table') -> None:
-            self.__table = table
-            self._queries = {}
+        self.__tbl_version_path = tbl_version_path
 
-
-
-
-
+    @property
+    def _has_dependents(self) -> bool:
+        """Returns True if this table has any dependent views, or snapshots."""
+        return len(self._get_views(recursive=False)) > 0
 
     def _move(self, new_name: str, new_dir_id: UUID) -> None:
+        self._check_is_dropped()
         super()._move(new_name, new_dir_id)
         with env.Env.get().engine.begin() as conn:
             stmt = sql.text((
@@ -96,6 +91,7 @@ class Table(SchemaObject):
         }
         ```
         """
+        self._check_is_dropped()
         md = super().get_metadata()
         md['base'] = self._base._path if self._base is not None else None
         md['schema'] = self._schema
@@ -116,6 +112,12 @@ class Table(SchemaObject):
         """Return TableVersion for just this table."""
         return self._tbl_version_path.tbl_version
 
+    @property
+    def _tbl_version_path(self) -> TableVersionPath:
+        """Return TableVersionPath for just this table."""
+        self._check_is_dropped()
+        return self.__tbl_version_path
+
     def __hash__(self) -> int:
         return hash(self._tbl_version.id)
 
@@ -124,23 +126,12 @@ class Table(SchemaObject):
             raise excs.Error(f'{self._display_name()} {self._name} has been dropped')
 
     def __getattr__(self, name: str) -> 'pxt.exprs.ColumnRef':
-        """Return a ColumnRef for the given name.
-        """
+        """Return a ColumnRef for the given name."""
         return self._tbl_version_path.get_column_ref(name)
 
-
-
-
-    @overload
-    def __getitem__(self, index: Union[exprs.Expr, Sequence[exprs.Expr]]) -> 'pxt.DataFrame': ...
-
-    def __getitem__(self, index):
-        """Return a ColumnRef or QueryTemplateFunction for the given name, or a DataFrame for the given slice.
-        """
-        if isinstance(index, str):
-            return getattr(self, index)
-        else:
-            return self._df()[index]
+    def __getitem__(self, name: str) -> 'pxt.exprs.ColumnRef':
+        """Return a ColumnRef for the given name."""
+        return getattr(self, name)
 
     def list_views(self, *, recursive: bool = True) -> list[str]:
         """
@@ -153,6 +144,7 @@ class Table(SchemaObject):
         Returns:
             A list of view paths.
         """
+        self._check_is_dropped()
         return [t._path for t in self._get_views(recursive=recursive)]
 
     def _get_views(self, *, recursive: bool = True) -> list['Table']:
@@ -166,26 +158,42 @@ class Table(SchemaObject):
         """Return a DataFrame for this table.
         """
         # local import: avoid circular imports
-
-
-    @property
-    def queries(self) -> 'Table.QueryScope':
-        return self.__query_scope
+        from pixeltable.plan import FromClause
+        return pxt.DataFrame(FromClause(tbls=[self._tbl_version_path]))
 
     def select(self, *items: Any, **named_items: Any) -> 'pxt.DataFrame':
-        """
+        """ Select columns or expressions from this table.
+
+        See [`DataFrame.select`][pixeltable.DataFrame.select] for more details.
+        """
         return self._df().select(*items, **named_items)
 
     def where(self, pred: 'exprs.Expr') -> 'pxt.DataFrame':
-        """
+        """Filter rows from this table based on the expression.
+
+        See [`DataFrame.where`][pixeltable.DataFrame.where] for more details.
+        """
         return self._df().where(pred)
 
+    def join(
+        self, other: 'Table', *, on: Optional['exprs.Expr'] = None,
+        how: 'pixeltable.plan.JoinType.LiteralType' = 'inner'
+    ) -> 'pxt.DataFrame':
+        """Join this table with another table."""
+        return self._df().join(other, on=on, how=how)
+
     def order_by(self, *items: 'exprs.Expr', asc: bool = True) -> 'pxt.DataFrame':
-        """
+        """Order the rows of this table based on the expression.
+
+        See [`DataFrame.order_by`][pixeltable.DataFrame.order_by] for more details.
+        """
         return self._df().order_by(*items, asc=asc)
 
     def group_by(self, *items: 'exprs.Expr') -> 'pxt.DataFrame':
-        """
+        """Group the rows of this table based on the expression.
+
+        See [`DataFrame.group_by`][pixeltable.DataFrame.group_by] for more details.
+        """
         return self._df().group_by(*items)
 
     def limit(self, n: int) -> 'pxt.DataFrame':
@@ -200,7 +208,6 @@ class Table(SchemaObject):
     ) -> 'pxt.dataframe.DataFrameResultSet':
         """Return rows from this table.
         """
-        self._check_is_dropped()
         return self._df().show(*args, **kwargs)
 
     def head(
@@ -230,11 +237,6 @@ class Table(SchemaObject):
         """Return the schema (column names and column types) of this table."""
         return {c.name: c.col_type for c in self._tbl_version_path.columns()}
 
-    @property
-    def _query_names(self) -> list[str]:
-        """Return the names of the registered queries for this table."""
-        return list(self.__query_scope._queries.keys())
-
     @property
     def _base(self) -> Optional['Table']:
         """
@@ -246,6 +248,18 @@ class Table(SchemaObject):
         base_id = self._tbl_version_path.base.tbl_version.id
         return catalog.Catalog.get().tbls[base_id]
 
+    @property
+    def _bases(self) -> list['Table']:
+        """
+        The ancestor list of bases of this table, starting with its immediate base.
+        """
+        bases = []
+        base = self._base
+        while base is not None:
+            bases.append(base)
+            base = base._base
+        return bases
+
     @property
     def _comment(self) -> str:
         return self._tbl_version.comment
@@ -258,47 +272,98 @@ class Table(SchemaObject):
     def _media_validation(self) -> MediaValidation:
         return self._tbl_version.media_validation
 
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-        .
-
+    def __repr__(self) -> str:
+        return self._descriptors().to_string()
+
+    def _repr_html_(self) -> str:
+        return self._descriptors().to_html()
+
+    def _descriptors(self) -> DescriptionHelper:
+        """
+        Constructs a list of descriptors for this table that can be pretty-printed.
+        """
+        helper = DescriptionHelper()
+        helper.append(self._title_descriptor())
+        helper.append(self._col_descriptor())
+        idxs = self._index_descriptor()
+        if not idxs.empty:
+            helper.append(idxs)
+        stores = self._external_store_descriptor()
+        if not stores.empty:
+            helper.append(stores)
+        if self._comment:
+            helper.append(f'COMMENT: {self._comment}')
+        return helper
+
+    def _title_descriptor(self) -> str:
+        title: str
+        if self._base is None:
+            title = f'Table\n{self._path!r}'
+        else:
+            title = f'View\n{self._path!r}'
+            title += f'\n(of {self.__bases_to_desc()})'
+        return title
+
+    def _col_descriptor(self, columns: Optional[list[str]] = None) -> pd.DataFrame:
+        return pd.DataFrame(
+            {
+                'Column Name': col.name,
+                'Type': col.col_type._to_str(as_schema=True),
+                'Computed With': col.value_expr.display_str(inline=False) if col.value_expr is not None else ''
+            }
+            for col in self.__tbl_version_path.columns()
+            if columns is None or col.name in columns
         )
 
+    def __bases_to_desc(self) -> str:
+        bases = self._bases
+        assert len(bases) >= 1
+        if len(bases) <= 2:
+            return ', '.join(repr(b._path) for b in bases)
+        else:
+            return f'{bases[0]._path!r}, ..., {bases[-1]._path!r}'
+
+    def _index_descriptor(self, columns: Optional[list[str]] = None) -> pd.DataFrame:
+        from pixeltable import index
+
+        pd_rows = []
+        for name, info in self._tbl_version.idxs_by_name.items():
+            if isinstance(info.idx, index.EmbeddingIndex) and (columns is None or info.col.name in columns):
+                display_embed = info.idx.string_embed if info.col.col_type.is_string_type() else info.idx.image_embed
+                if info.idx.string_embed is not None and info.idx.image_embed is not None:
+                    embed_str = f'{display_embed} (+1)'
+                else:
+                    embed_str = str(display_embed)
+                row = {
+                    'Index Name': name,
+                    'Column': info.col.name,
+                    'Metric': str(info.idx.metric.name.lower()),
+                    'Embedding': embed_str,
+                }
+                pd_rows.append(row)
+        return pd.DataFrame(pd_rows)
+
+    def _external_store_descriptor(self) -> pd.DataFrame:
+        pd_rows = []
+        for name, store in self._tbl_version.external_stores.items():
+            row = {
+                'External Store': name,
+                'Type': type(store).__name__,
+            }
+            pd_rows.append(row)
+        return pd.DataFrame(pd_rows)
+
     def describe(self) -> None:
         """
         Print the table schema.
         """
+        self._check_is_dropped()
         if getattr(builtins, '__IPYTHON__', False):
             from IPython.display import display
-            display(self.
+            display(self._repr_html_())
         else:
             print(repr(self))
 
-    # TODO: Display comments in _repr_html()
-    def __repr__(self) -> str:
-        description_str = self._description().to_string(index=False)
-        if self._comment is None:
-            comment = ''
-        else:
-            comment = f'{self._comment}\n'
-        return f'{self._display_name()} \'{self._name}\'\n{comment}{description_str}'
-
-    def _repr_html_(self) -> str:
-        return self._description_html()._repr_html_()  # type: ignore[attr-defined]
-
     def _drop(self) -> None:
         cat = catalog.Catalog.get()
         # verify all dependents are deleted by now
@@ -325,27 +390,54 @@ class Table(SchemaObject):
         """
         return self._df().to_coco_dataset()
 
-    def
-        """
-
-
-
-
-
+    def _column_has_dependents(self, col: Column) -> bool:
+        """Returns True if the column has dependents, False otherwise."""
+        assert col is not None
+        assert col.name in self._schema.keys()
+        if any(c.name is not None for c in col.dependent_cols):
+            return True
+        return any(
+            col in store.get_local_columns()
+            for view in [self] + self._get_views(recursive=True)
+            for store in view._tbl_version.external_stores.values())
 
-
+    def _ignore_or_drop_existing_columns(self, new_col_names: list[str], if_exists: IfExistsParam) -> list[str]:
+        """ Check and handle existing columns in the new column specification based on the if_exists parameter.
 
-
+        If `if_exists='ignore'`, returns a list of existing columns, if any, in `new_col_names`.
         """
-
-
-
-
-
+        assert not self.get_metadata()['is_snapshot']
+        existing_col_names = set(self._schema.keys())
+        cols_to_ignore = []
+        for new_col_name in new_col_names:
+            if new_col_name in existing_col_names:
+                if if_exists == IfExistsParam.ERROR:
+                    raise excs.Error(f'Duplicate column name: {new_col_name!r}')
+                elif if_exists == IfExistsParam.IGNORE:
+                    cols_to_ignore.append(new_col_name)
+                elif if_exists == IfExistsParam.REPLACE or if_exists == IfExistsParam.REPLACE_FORCE:
+                    if new_col_name not in self._tbl_version.cols_by_name:
+                        # for views, it is possible that the existing column
+                        # is a base table column; in that case, we should not
+                        # drop/replace that column. Continue to raise error.
+                        raise excs.Error(
+                            f'Column {new_col_name!r} is a base table column. Cannot replace it.'
+                        )
+                    col = self._tbl_version.cols_by_name[new_col_name]
+                    # cannot drop a column with dependents; so reject
+                    # replace directive if column has dependents.
+                    if self._column_has_dependents(col):
+                        raise excs.Error(
+                            f'Column {new_col_name!r} already exists and has dependents. Cannot {if_exists.name.lower()} it.'
+                        )
+                    self.drop_column(new_col_name)
+                    assert new_col_name not in self._tbl_version.cols_by_name
+        return cols_to_ignore
 
     def add_columns(
         self,
-        schema: dict[str, Union[ts.ColumnType, builtins.type, _GenericAlias]]
+        schema: dict[str, Union[ts.ColumnType, builtins.type, _GenericAlias]],
+        if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error'
     ) -> UpdateStatus:
         """
         Adds multiple columns to the table. The columns must be concrete (non-computed) columns; to add computed columns,
@@ -356,12 +448,21 @@ class Table(SchemaObject):
 
         Args:
             schema: A dictionary mapping column names to types.
+            if_exists: Determines the behavior if a column already exists. Must be one of the following:
+
+                - `'error'`: an exception will be raised.
+                - `'ignore'`: do nothing and return.
+                - `'replace' or 'replace_force'`: drop the existing column and add the new column, if it has no dependents.
+
+                Note that the `if_exists` parameter is applied to all columns in the schema.
+                To apply different behaviors to different columns, please use [`add_column()`][pixeltable.Table.add_column] for each column.
 
         Returns:
             Information about the execution status of the operation.
 
         Raises:
-            Error: If any column name is invalid or already exists
+            Error: If any column name is invalid, or already exists and `if_exists='error'`,
+                or `if_exists='replace*'` but the column has dependents or is a basetable column.
 
         Examples:
             Add multiple columns to the table `my_table`:
@@ -374,49 +475,51 @@ class Table(SchemaObject):
             ... tbl.add_columns(schema)
         """
         self._check_is_dropped()
+        if self.get_metadata()['is_snapshot']:
+            raise excs.Error('Cannot add column to a snapshot.')
         col_schema = {
             col_name: {'type': ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)}
             for col_name, spec in schema.items()
         }
+        # handle existing columns based on if_exists parameter
+        cols_to_ignore = self._ignore_or_drop_existing_columns(list(col_schema.keys()), IfExistsParam.validated(if_exists, 'if_exists'))
+        # if all columns to be added already exist and user asked to ignore
+        # existing columns, there's nothing to do.
+        for cname in cols_to_ignore:
+            assert cname in col_schema
+            del col_schema[cname]
+        if len(col_schema) == 0:
+            return UpdateStatus()
         new_cols = self._create_columns(col_schema)
         for new_col in new_cols:
-            self._verify_column(new_col
+            self._verify_column(new_col)
         status = self._tbl_version.add_columns(new_cols, print_stats=False, on_error='abort')
         FileCache.get().emit_eviction_warnings()
         return status
 
-    # TODO: add_column() still supports computed columns for backward-compatibility. In the future, computed columns
-    # will be supported only through add_computed_column(). At that point, we can remove the `stored`,
-    # `print_stats`, and `on_error` parameters, and change the method body to simply call self.add_columns(kwargs),
-    # simplifying the code. For the time being, there's some obvious code duplication.
     def add_column(
         self,
         *,
-
-        print_stats: bool = False,
-        on_error: Literal['abort', 'ignore'] = 'abort',
+        if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
         **kwargs: Union[ts.ColumnType, builtins.type, _GenericAlias, exprs.Expr]
     ) -> UpdateStatus:
         """
-        Adds
+        Adds an ordinary (non-computed) column to the table.
 
         Args:
             kwargs: Exactly one keyword argument of the form `col_name=col_type`.
-
-            print_stats: If `True`, print execution metrics during evaluation.
-            on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
-                row.
+            if_exists: Determines the behavior if the column already exists. Must be one of the following:
 
-            - `'
-            - `'ignore'`:
-
-                corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
+                - `'error'`: an exception will be raised.
+                - `'ignore'`: do nothing and return.
+                - `'replace' or 'replace_force'`: drop the existing column and add the new column, if it has no dependents.
 
         Returns:
             Information about the execution status of the operation.
 
         Raises:
-            Error: If the column name is invalid or already exists
+            Error: If the column name is invalid, or already exists and `if_exists='erorr'`,
+                or `if_exists='replace*'` but the column has dependents or is a basetable column.
 
         Examples:
             Add an int column:
@@ -428,29 +531,22 @@ class Table(SchemaObject):
             >>> tbl['new_col'] = pxt.Int
         """
         self._check_is_dropped()
+        # verify kwargs
+        if self._tbl_version.is_snapshot:
+            raise excs.Error('Cannot add column to a snapshot.')
         # verify kwargs and construct column schema dict
         if len(kwargs) != 1:
             raise excs.Error(
                 f'add_column() requires exactly one keyword argument of the form "col_name=col_type"; '
-                f'got {len(kwargs)} instead ({", ".join(
+                f'got {len(kwargs)} instead ({", ".join(kwargs.keys())})'
             )
-
-        if not
-            raise excs.Error(
-
-
-
-            col_schema['type'] = ts.ColumnType.normalize_type(spec, nullable_default=True, allow_builtin_types=False)
-        else:
-            col_schema['value'] = spec
-        if stored is not None:
-            col_schema['stored'] = stored
+        col_type = next(iter(kwargs.values()))
+        if not isinstance(col_type, (ts.ColumnType, type, _GenericAlias)):
+            raise excs.Error(
+                f'The argument to add_column() must be a type; did you intend to use add_computed_column() instead?'
+            )
+        return self.add_columns(kwargs, if_exists=if_exists)
 
-        new_col = self._create_columns({col_name: col_schema})[0]
-        self._verify_column(new_col, set(self._schema.keys()), set(self._query_names))
-        status = self._tbl_version.add_columns([new_col], print_stats=print_stats, on_error=on_error)
-        FileCache.get().emit_eviction_warnings()
-        return status
 
     def add_computed_column(
         self,
@@ -458,6 +554,7 @@ class Table(SchemaObject):
         stored: Optional[bool] = None,
         print_stats: bool = False,
         on_error: Literal['abort', 'ignore'] = 'abort',
+        if_exists: Literal['error', 'ignore', 'replace'] = 'error',
         **kwargs: exprs.Expr
     ) -> UpdateStatus:
         """
@@ -465,12 +562,27 @@ class Table(SchemaObject):
 
         Args:
             kwargs: Exactly one keyword argument of the form `col_name=expression`.
+            stored: Whether the column is materialized and stored or computed on demand. Only valid for image columns.
+            print_stats: If `True`, print execution metrics during evaluation.
+            on_error: Determines the behavior if an error occurs while evaluating the column expression for at least one
+                row.
+
+                - `'abort'`: an exception will be raised and the column will not be added.
+                - `'ignore'`: execution will continue and the column will be added. Any rows
+                  with errors will have a `None` value for the column, with information about the error stored in the
+                  corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
+            if_exists: Determines the behavior if the column already exists. Must be one of the following:
+
+                - `'error'`: an exception will be raised.
+                - `'ignore'`: do nothing and return.
+                - `'replace' or 'replace_force'`: drop the existing column and add the new column, iff it has no dependents.
 
         Returns:
             Information about the execution status of the operation.
 
         Raises:
-            Error: If the column name is invalid or already exists
+            Error: If the column name is invalid or already exists and `if_exists='error'`,
+                or `if_exists='replace*'` but the column has dependents or is a basetable column.
 
         Examples:
             For a table with an image column `frame`, add an image column `rotated` that rotates the image by
@@ -483,6 +595,8 @@ class Table(SchemaObject):
             >>> tbl.add_computed_column(rotated=tbl.frame.rotate(90), stored=False)
         """
         self._check_is_dropped()
+        if self.get_metadata()['is_snapshot']:
+            raise excs.Error('Cannot add column to a snapshot.')
         if len(kwargs) != 1:
             raise excs.Error(
                 f'add_computed_column() requires exactly one keyword argument of the form "column-name=type|value-expression"; '
@@ -496,8 +610,16 @@ class Table(SchemaObject):
         if stored is not None:
             col_schema['stored'] = stored
 
+        # handle existing columns based on if_exists parameter
+        cols_to_ignore = self._ignore_or_drop_existing_columns([col_name], IfExistsParam.validated(if_exists, 'if_exists'))
+        # if the column to add already exists and user asked to ignore
+        # exiting column, there's nothing to do.
+        if len(cols_to_ignore) != 0:
+            assert cols_to_ignore[0] == col_name
+            return UpdateStatus()
+
         new_col = self._create_columns({col_name: col_schema})[0]
-        self._verify_column(new_col
+        self._verify_column(new_col)
         status = self._tbl_version.add_columns([new_col], print_stats=print_stats, on_error=on_error)
         FileCache.get().emit_eviction_warnings()
         return status
@@ -577,18 +699,12 @@ class Table(SchemaObject):
         return columns
 
     @classmethod
-    def _verify_column(
-        cls, col: Column, existing_column_names: set[str], existing_query_names: Optional[set[str]] = None
-    ) -> None:
+    def _verify_column(cls, col: Column) -> None:
         """Check integrity of user-supplied Column and supply defaults"""
         if is_system_column_name(col.name):
             raise excs.Error(f'{col.name!r} is a reserved name in Pixeltable; please choose a different column name.')
         if not is_valid_identifier(col.name):
             raise excs.Error(f"Invalid column name: {col.name!r}")
-        if col.name in existing_column_names:
-            raise excs.Error(f'Duplicate column name: {col.name!r}')
-        if existing_query_names is not None and col.name in existing_query_names:
-            raise excs.Error(f'Column name conflicts with a registered query: {col.name!r}')
         if col.stored is False and not (col.is_computed and col.col_type.is_image_type()):
             raise excs.Error(f'Column {col.name!r}: stored={col.stored} only applies to computed image columns')
         if col.stored is False and col.has_window_fn_call():
@@ -601,7 +717,7 @@ class Table(SchemaObject):
         """Check integrity of user-supplied schema and set defaults"""
         column_names: set[str] = set()
         for col in schema:
-            cls._verify_column(col
+            cls._verify_column(col)
             column_names.add(col.name)
 
     def __check_column_name_exists(self, column_name: str, include_bases: bool = False) -> None:
@@ -614,14 +730,19 @@ class Table(SchemaObject):
         if not exists:
             raise excs.Error(f'Unknown column: {col_ref.col.qualified_name}')
 
-    def drop_column(self, column: Union[str, ColumnRef]) -> None:
+    def drop_column(self, column: Union[str, ColumnRef], if_not_exists: Literal['error', 'ignore'] = 'error') -> None:
         """Drop a column from the table.
 
         Args:
             column: The name or reference of the column to drop.
+            if_not_exists: Directive for handling a non-existent column. Must be one of the following:
+
+                - `'error'`: raise an error if the column does not exist.
+                - `'ignore'`: do nothing if the column does not exist.
 
         Raises:
-            Error: If the column does not exist
+            Error: If the column does not exist and `if_exists='error'`,
+                or if it is referenced by a dependent computed column.
 
         Examples:
             Drop the column `col` from the table `my_table` by column name:
@@ -633,14 +754,32 @@ class Table(SchemaObject):
 
             >>> tbl = pxt.get_table('my_table')
             ... tbl.drop_column(tbl.col)
+
+            Drop the column `col` from the table `my_table` if it exists, otherwise do nothing:
+
+            >>> tbl = pxt.get_table('my_table')
+            ... tbl.drop_col(tbl.col, if_not_exists='ignore')
         """
         self._check_is_dropped()
+        if self._tbl_version_path.is_snapshot():
+            raise excs.Error('Cannot drop column from a snapshot.')
         col: Column = None
+        _if_not_exists = IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
         if isinstance(column, str):
-            self.
+            col = self._tbl_version_path.get_column(column, include_bases=False)
+            if col is None:
+                if _if_not_exists == IfNotExistsParam.ERROR:
+                    raise excs.Error(f'Column {column!r} unknown')
+                assert _if_not_exists == IfNotExistsParam.IGNORE
+                return
             col = self._tbl_version.cols_by_name[column]
         else:
-            self.
+            exists = self._tbl_version_path.has_column(column.col, include_bases=False)
+            if not exists:
+                if _if_not_exists == IfNotExistsParam.ERROR:
+                    raise excs.Error(f'Unknown column: {column.col.qualified_name}')
+                assert _if_not_exists == IfNotExistsParam.IGNORE
+                return
             col = column.col
 
         dependent_user_cols = [c for c in col.dependent_cols if c.name is not None]
@@ -686,69 +825,114 @@ class Table(SchemaObject):
             >>> tbl = pxt.get_table('my_table')
             ... tbl.rename_column('col1', 'col2')
         """
-        self._check_is_dropped()
         self._tbl_version.rename_column(old_name, new_name)
 
+    def _list_index_info_for_test(self) -> list[dict[str, Any]]:
+        """
+        Returns list of all the indexes on this table. Used for testing.
+
+        Returns:
+            A list of index information, each containing the index's
+            id, name, and the name of the column it indexes.
+        """
+        assert not self._is_dropped
+        index_info = []
+        for idx_name, idx in self._tbl_version.idxs_by_name.items():
+            index_info.append({
+                '_id': idx.id,
+                '_name': idx_name,
+                '_column': idx.col.name
+            })
+        return index_info
+
     def add_embedding_index(
         self, column: Union[str, ColumnRef], *, idx_name: Optional[str] = None,
+        embedding: Optional[pxt.Function] = None,
         string_embed: Optional[pxt.Function] = None, image_embed: Optional[pxt.Function] = None,
-        metric: str = 'cosine'
+        metric: str = 'cosine',
+        if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error'
     ) -> None:
         """
-        Add an embedding index to the table. Once the index is
+        Add an embedding index to the table. Once the index is created, it will be automatically kept up-to-date as new
        rows are inserted into the table.
 
-
-
-
-
+        To add an embedding index, one must specify, at minimum, the column to be indexed and an embedding UDF.
+        Only `String` and `Image` columns are currently supported. Here's an example that uses a
+        [CLIP embedding][pixeltable.functions.huggingface.clip] to index an image column:
+
+        >>> from pixeltable.functions.huggingface import clip
+        ... embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
+        ... tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
+
+        Once the index is created, similiarity lookups can be performed using the `similarity` pseudo-function.
+
+        >>> reference_img = PIL.Image.open('my_image.jpg')
+        ... sim = tbl.img.similarity(reference_img)
+        ... tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
+
+        If the embedding UDF is a multimodal embedding (supporting more than one data type), then lookups may be
+        performed using any of its supported types. In our example, CLIP supports both text and images, so we can
+        also search for images using a text description:
+
+        >>> sim = tbl.img.similarity('a picture of a train')
+        ... tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
 
         Args:
-            column: The name of, or reference to, the column to
-            idx_name:
-                If specified, the name must be unique for this table.
-
-
-
-
+            column: The name of, or reference to, the column to be indexed; must be a `String` or `Image` column.
+            idx_name: An optional name for the index. If not specified, a name such as `'idx0'` will be generated
+                automatically. If specified, the name must be unique for this table.
+            embedding: The UDF to use for the embedding. Must be a UDF that accepts a single argument of type `String`
+                or `Image` (as appropriate for the column being indexed) and returns a fixed-size 1-dimensional
+                array of floats.
+            string_embed: An optional UDF to use for the string embedding component of this index.
+                Can be used in conjunction with `image_embed` to construct multimodal embeddings manually, by
+                specifying different embedding functions for different data types.
+            image_embed: An optional UDF to use for the image embedding component of this index.
+                Can be used in conjunction with `string_embed` to construct multimodal embeddings manually, by
+                specifying different embedding functions for different data types.
+            metric: Distance metric to use for the index; one of `'cosine'`, `'ip'`, or `'l2'`.
+                The default is `'cosine'`.
+            if_exists: Directive for handling an existing index with the same name. Must be one of the following:
+
+                - `'error'`: raise an error if an index with the same name already exists.
+                - `'ignore'`: do nothing if an index with the same name already exists.
+                - `'replace'` or `'replace_force'`: replace the existing index with the new one.
 
         Raises:
-            Error: If an index with
+            Error: If an index with the specified name already exists for the table and `if_exists='error'`, or if the specified column does not exist.
 
         Examples:
-            Add an index to the `img` column of the table `my_table
+            Add an index to the `img` column of the table `my_table`:
 
-            >>>
-            ... tbl.
+            >>> from pixeltable.functions.huggingface import clip
+            ... tbl = pxt.get_table('my_table')
+            ... embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
+            ... tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
 
-
-
-
+            Alternatively, the `img` column may be specified by name:
+
+            >>> tbl.add_embedding_index('img', embedding=embedding_fn)
 
-            Add
-            and with a specific name
+            Add a second index to the `img` column, using the inner product as the distance metric,
+            and with a specific name:
 
             >>> tbl.add_embedding_index(
-            ...
-            ...     idx_name='
-            ...
-            ...     string_embed=my_string_func,
+            ...     tbl.img,
+            ...     idx_name='ip_idx',
+            ...     embedding=embedding_fn,
             ...     metric='ip'
             ... )
 
-
+            Add an index using separately specified string and image embeddings:
 
             >>> tbl.add_embedding_index(
             ...     tbl.img,
-            ...
-            ...     image_embed=
-            ...     string_embed=my_string_func,
-            ...     metric='ip'
+            ...     string_embed=string_embedding_fn,
+            ...     image_embed=image_embedding_fn
             ... )
         """
         if self._tbl_version_path.is_snapshot():
             raise excs.Error('Cannot add an index to a snapshot')
-        self._check_is_dropped()
         col: Column
         if isinstance(column, str):
             self.__check_column_name_exists(column, include_bases=True)
@@ -758,11 +942,22 @@ class Table(SchemaObject):
             col = column.col
 
         if idx_name is not None and idx_name in self._tbl_version.idxs_by_name:
-
+            _if_exists = IfExistsParam.validated(if_exists, 'if_exists')
+            # An index with the same name already exists.
+            # Handle it according to if_exists.
+            if _if_exists == IfExistsParam.ERROR:
+                raise excs.Error(f'Duplicate index name: {idx_name}')
+            if not isinstance(self._tbl_version.idxs_by_name[idx_name].idx, index.EmbeddingIndex):
+                raise excs.Error(f'Index `{idx_name}` is not an embedding index. Cannot {_if_exists.name.lower()} it.')
+            if _if_exists == IfExistsParam.IGNORE:
+                return
+            assert _if_exists == IfExistsParam.REPLACE or _if_exists == IfExistsParam.REPLACE_FORCE
+            self.drop_index(idx_name=idx_name)
+            assert idx_name not in self._tbl_version.idxs_by_name
         from pixeltable.index import EmbeddingIndex
 
         # create the EmbeddingIndex instance to verify args
-        idx = EmbeddingIndex(col, metric=metric, string_embed=string_embed, image_embed=image_embed)
+        idx = EmbeddingIndex(col, metric=metric, embed=embedding, string_embed=string_embed, image_embed=image_embed)
         status = self._tbl_version.add_index(col, idx_name=idx_name, idx=idx)
         # TODO: how to deal with exceptions here? drop the index and raise?
         FileCache.get().emit_eviction_warnings()
@@ -770,7 +965,9 @@ class Table(SchemaObject):
     def drop_embedding_index(
         self, *,
         column: Union[str, ColumnRef, None] = None,
-        idx_name: Optional[str] = None
+        idx_name: Optional[str] = None,
+        if_not_exists: Literal['error', 'ignore'] = 'error'
+    ) -> None:
         """
         Drop an embedding index from the table. Either a column name or an index name (but not both) must be
         specified. If a column name or reference is specified, it must be a column containing exactly one
@@ -780,11 +977,20 @@ class Table(SchemaObject):
            column: The name of, or reference to, the column from which to drop the index.
                 The column must have only one embedding index.
             idx_name: The name of the index to drop.
+            if_not_exists: Directive for handling a non-existent index. Must be one of the following:
+
+                - `'error'`: raise an error if the index does not exist.
+                - `'ignore'`: do nothing if the index does not exist.
+
+                Note that `if_not_exists` parameter is only applicable when an `idx_name` is specified
+                and it does not exist, or when `column` is specified and it has no index.
+                `if_not_exists` does not apply to non-exisitng column.
 
         Raises:
             Error: If `column` is specified, but the column does not exist, or it contains no embedding
-                indices or multiple embedding indices.
-            Error: If `idx_name` is specified, but the index
+                indices and `if_not_exists='error'`, or the column has multiple embedding indices.
+            Error: If `idx_name` is specified, but the index is not an embedding index, or
+                the index does not exist and `if_not_exists='error'`.
 
         Examples:
             Drop the embedding index on the `img` column of the table `my_table` by column name:
@@ -801,6 +1007,9 @@ class Table(SchemaObject):
             >>> tbl = pxt.get_table('my_table')
             ... tbl.drop_embedding_index(idx_name='idx1')
 
+            Drop the embedding index `idx1` of the table `my_table` by index name, if it exists, otherwise do nothing:
+            >>> tbl = pxt.get_table('my_table')
+            ... tbl.drop_embedding_index(idx_name='idx1', if_not_exists='ignore')
         """
         if (column is None) == (idx_name is None):
             raise excs.Error("Exactly one of 'column' or 'idx_name' must be provided")
@@ -814,12 +1023,14 @@ class Table(SchemaObject):
             self.__check_column_ref_exists(column, include_bases=True)
             col = column.col
             assert col is not None
-        self._drop_index(col=col, idx_name=idx_name, _idx_class=index.EmbeddingIndex)
+        self._drop_index(col=col, idx_name=idx_name, _idx_class=index.EmbeddingIndex, if_not_exists=if_not_exists)
 
     def drop_index(
         self, *,
         column: Union[str, ColumnRef, None] = None,
-        idx_name: Optional[str] = None
+        idx_name: Optional[str] = None,
+        if_not_exists: Literal['error', 'ignore'] = 'error'
+    ) -> None:
         """
         Drop an index from the table. Either a column name or an index name (but not both) must be
         specified. If a column name or reference is specified, it must be a column containing exactly one index;
@@ -829,6 +1040,14 @@ class Table(SchemaObject):
             column: The name of, or reference to, the column from which to drop the index.
                 The column must have only one embedding index.
             idx_name: The name of the index to drop.
+            if_not_exists: Directive for handling a non-existent index. Must be one of the following:
+
+                - `'error'`: raise an error if the index does not exist.
+                - `'ignore'`: do nothing if the index does not exist.
+
+                Note that `if_not_exists` parameter is only applicable when an `idx_name` is specified
+                and it does not exist, or when `column` is specified and it has no index.
+                `if_not_exists` does not apply to non-exisitng column.
 
         Raises:
             Error: If `column` is specified, but the column does not exist, or it contains no
@@ -850,6 +1069,10 @@ class Table(SchemaObject):
             >>> tbl = pxt.get_table('my_table')
             ... tbl.drop_index(idx_name='idx1')
 
+            Drop the index `idx1` of the table `my_table` by index name, if it exists, otherwise do nothing:
+            >>> tbl = pxt.get_table('my_table')
+            ... tbl.drop_index(idx_name='idx1', if_not_exists='ignore')
+
         """
         if (column is None) == (idx_name is None):
             raise excs.Error("Exactly one of 'column' or 'idx_name' must be provided")
@@ -863,21 +1086,25 @@ class Table(SchemaObject):
             self.__check_column_ref_exists(column, include_bases=True)
             col = column.col
             assert col is not None
-        self._drop_index(col=col, idx_name=idx_name)
+        self._drop_index(col=col, idx_name=idx_name, if_not_exists=if_not_exists)
 
     def _drop_index(
         self, *, col: Optional[Column] = None,
         idx_name: Optional[str] = None,
-        _idx_class: Optional[type[index.IndexBase]] = None
+        _idx_class: Optional[type[index.IndexBase]] = None,
+        if_not_exists: Literal['error', 'ignore'] = 'error'
     ) -> None:
         if self._tbl_version_path.is_snapshot():
             raise excs.Error('Cannot drop an index from a snapshot')
-        self._check_is_dropped()
         assert (col is None) != (idx_name is None)
 
         if idx_name is not None:
+            _if_not_exists = IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
             if idx_name not in self._tbl_version.idxs_by_name:
-
+                if _if_not_exists == IfNotExistsParam.ERROR:
+                    raise excs.Error(f'Index {idx_name!r} does not exist')
+                assert _if_not_exists == IfNotExistsParam.IGNORE
+                return
             idx_id = self._tbl_version.idxs_by_name[idx_name].id
         else:
             if col.tbl.id != self._tbl_version.id:
@@ -887,7 +1114,11 @@ class Table(SchemaObject):
             if _idx_class is not None:
                 idx_info = [info for info in idx_info if isinstance(info.idx, _idx_class)]
             if len(idx_info) == 0:
-
+                _if_not_exists = IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
+                if _if_not_exists == IfNotExistsParam.ERROR:
+                    raise excs.Error(f'Column {col.name!r} does not have an index')
+                assert _if_not_exists == IfNotExistsParam.IGNORE
+                return
             if len(idx_info) > 1:
                 raise excs.Error(f"Column {col.name!r} has multiple indices; specify 'idx_name' instead")
             idx_id = idx_info[0].id
@@ -1009,7 +1240,6 @@ class Table(SchemaObject):
 
             >>> tbl.update({'int_col': tbl.int_col + 1}, where=tbl.int_col == 0)
         """
-        self._check_is_dropped()
         status = self._tbl_version.update(value_spec, where, cascade)
         FileCache.get().emit_eviction_warnings()
         return status
@@ -1045,7 +1275,6 @@ class Table(SchemaObject):
         """
         if self._tbl_version_path.is_snapshot():
             raise excs.Error('Cannot update a snapshot')
-        self._check_is_dropped()
         rows = list(rows)
 
         row_updates: list[dict[Column, exprs.Expr]] = []
@@ -1100,46 +1329,8 @@ class Table(SchemaObject):
         """
         if self._tbl_version_path.is_snapshot():
             raise excs.Error('Cannot revert a snapshot')
-        self._check_is_dropped()
         self._tbl_version.revert()
 
-    @overload
-    def query(self, py_fn: Callable) -> 'pxt.func.QueryTemplateFunction': ...
-
-    @overload
-    def query(
-        self, *, param_types: Optional[list[ts.ColumnType]] = None
-    ) -> Callable[[Callable], 'pxt.func.QueryTemplateFunction']: ...
-
-    def query(self, *args: Any, **kwargs: Any) -> Any:
-        def make_query_template(
-            py_fn: Callable, param_types: Optional[list[ts.ColumnType]]
-        ) -> 'pxt.func.QueryTemplateFunction':
-            if py_fn.__module__ != '__main__' and py_fn.__name__.isidentifier():
-                # this is a named function in a module
-                function_path = f'{py_fn.__module__}.{py_fn.__qualname__}'
-            else:
-                function_path = None
-            query_name = py_fn.__name__
-            if query_name in self._schema.keys():
-                raise excs.Error(f'Query name {query_name!r} conflicts with existing column')
-            if query_name in self.__query_scope._queries and function_path is not None:
-                raise excs.Error(f'Duplicate query name: {query_name!r}')
-            query_fn = pxt.func.QueryTemplateFunction.create(
-                py_fn, param_types=param_types, path=function_path, name=query_name)
-            self.__query_scope._queries[query_name] = query_fn
-            return query_fn
-
-        # TODO: verify that the inferred return type matches that of the template
-        # TODO: verify that the signature doesn't contain batched parameters
-
-        if len(args) == 1:
-            assert len(kwargs) == 0 and callable(args[0])
-            return make_query_template(args[0], None)
-        else:
-            assert len(args) == 0 and len(kwargs) == 1 and 'param_types' in kwargs
-            return lambda py_fn: make_query_template(py_fn, kwargs['param_types'])
-
     @property
     def external_stores(self) -> list[str]:
         return list(self._tbl_version.external_stores.keys())
@@ -1150,7 +1341,6 @@ class Table(SchemaObject):
         """
         if self._tbl_version.is_snapshot:
             raise excs.Error(f'Table `{self._name}` is a snapshot, so it cannot be linked to an external store.')
-        self._check_is_dropped()
         if store.name in self.external_stores:
             raise excs.Error(f'Table `{self._name}` already has an external store with that name: {store.name}')
         _logger.info(f'Linking external store `{store.name}` to table `{self._name}`')
@@ -1230,7 +1420,7 @@ class Table(SchemaObject):
         return sync_status
 
     def __dir__(self) -> list[str]:
-        return list(super().__dir__()) + list(self._schema.keys())
+        return list(super().__dir__()) + list(self._schema.keys())
 
     def _ipython_key_completions_(self) -> list[str]:
-        return list(self._schema.keys())
+        return list(self._schema.keys())