pixeltable 0.4.18__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/_version.py +1 -0
- pixeltable/catalog/catalog.py +119 -100
- pixeltable/catalog/column.py +104 -115
- pixeltable/catalog/globals.py +1 -2
- pixeltable/catalog/insertable_table.py +44 -49
- pixeltable/catalog/path.py +3 -4
- pixeltable/catalog/schema_object.py +4 -4
- pixeltable/catalog/table.py +118 -122
- pixeltable/catalog/table_metadata.py +6 -6
- pixeltable/catalog/table_version.py +322 -257
- pixeltable/catalog/table_version_handle.py +4 -4
- pixeltable/catalog/table_version_path.py +9 -10
- pixeltable/catalog/tbl_ops.py +9 -3
- pixeltable/catalog/view.py +34 -28
- pixeltable/config.py +14 -10
- pixeltable/dataframe.py +68 -77
- pixeltable/env.py +74 -64
- pixeltable/exec/aggregation_node.py +6 -6
- pixeltable/exec/cache_prefetch_node.py +10 -10
- pixeltable/exec/data_row_batch.py +3 -3
- pixeltable/exec/exec_context.py +4 -5
- pixeltable/exec/exec_node.py +5 -5
- pixeltable/exec/expr_eval/evaluators.py +6 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
- pixeltable/exec/expr_eval/globals.py +6 -6
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +11 -11
- pixeltable/exec/in_memory_data_node.py +2 -2
- pixeltable/exec/object_store_save_node.py +14 -17
- pixeltable/exec/sql_node.py +25 -25
- pixeltable/exprs/arithmetic_expr.py +4 -4
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/column_property_ref.py +3 -3
- pixeltable/exprs/column_ref.py +61 -74
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +3 -3
- pixeltable/exprs/data_row.py +12 -12
- pixeltable/exprs/expr.py +41 -31
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +3 -3
- pixeltable/exprs/function_call.py +14 -14
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +8 -8
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +6 -6
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +14 -14
- pixeltable/exprs/rowid_ref.py +8 -8
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +2 -2
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +3 -3
- pixeltable/func/function.py +15 -17
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +2 -2
- pixeltable/func/query_template_function.py +16 -16
- pixeltable/func/signature.py +14 -14
- pixeltable/func/tools.py +11 -11
- pixeltable/func/udf.py +16 -18
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +7 -7
- pixeltable/functions/audio.py +76 -0
- pixeltable/functions/bedrock.py +6 -6
- pixeltable/functions/deepseek.py +4 -4
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +6 -6
- pixeltable/functions/globals.py +12 -12
- pixeltable/functions/groq.py +4 -4
- pixeltable/functions/huggingface.py +18 -20
- pixeltable/functions/image.py +7 -10
- pixeltable/functions/llama_cpp.py +7 -7
- pixeltable/functions/math.py +2 -3
- pixeltable/functions/mistralai.py +3 -3
- pixeltable/functions/ollama.py +9 -9
- pixeltable/functions/openai.py +21 -21
- pixeltable/functions/openrouter.py +7 -7
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +7 -8
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/video.py +2 -24
- pixeltable/functions/vision.py +6 -6
- pixeltable/functions/whisper.py +7 -7
- pixeltable/functions/whisperx.py +16 -16
- pixeltable/globals.py +52 -36
- pixeltable/index/base.py +12 -8
- pixeltable/index/btree.py +19 -22
- pixeltable/index/embedding_index.py +30 -39
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/external_store.py +13 -16
- pixeltable/io/fiftyone.py +5 -5
- pixeltable/io/globals.py +5 -5
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +12 -12
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +12 -12
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/audio.py +2 -2
- pixeltable/iterators/video.py +8 -13
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_38.py +2 -2
- pixeltable/metadata/converters/convert_39.py +1 -2
- pixeltable/metadata/converters/util.py +11 -13
- pixeltable/metadata/schema.py +22 -21
- pixeltable/metadata/utils.py +2 -6
- pixeltable/mypy/mypy_plugin.py +5 -5
- pixeltable/plan.py +30 -28
- pixeltable/share/packager.py +7 -7
- pixeltable/share/publish.py +3 -3
- pixeltable/store.py +125 -61
- pixeltable/type_system.py +43 -46
- pixeltable/utils/__init__.py +1 -2
- pixeltable/utils/arrow.py +4 -4
- pixeltable/utils/av.py +8 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +1 -2
- pixeltable/utils/dbms.py +15 -19
- pixeltable/utils/description_helper.py +2 -3
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +2 -2
- pixeltable/utils/filecache.py +5 -5
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +9 -9
- pixeltable/utils/local_store.py +17 -17
- pixeltable/utils/object_stores.py +59 -43
- pixeltable/utils/s3_store.py +35 -30
- {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/METADATA +1 -1
- pixeltable-0.4.19.dist-info/RECORD +213 -0
- pixeltable/__version__.py +0 -3
- pixeltable-0.4.18.dist-info/RECORD +0 -211
- {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.18.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
pixeltable/globals.py
CHANGED
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import logging
|
|
4
4
|
import os
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import TYPE_CHECKING, Any, Iterable, Literal, NamedTuple,
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Iterable, Literal, NamedTuple, Union
|
|
7
7
|
|
|
8
8
|
import pandas as pd
|
|
9
9
|
import pydantic
|
|
@@ -14,6 +14,7 @@ from pixeltable.catalog import Catalog, TableVersionPath
|
|
|
14
14
|
from pixeltable.catalog.insertable_table import OnErrorParameter
|
|
15
15
|
from pixeltable.config import Config
|
|
16
16
|
from pixeltable.env import Env
|
|
17
|
+
from pixeltable.io.table_data_conduit import DFTableDataConduit, TableDataConduit
|
|
17
18
|
from pixeltable.iterators import ComponentIterator
|
|
18
19
|
|
|
19
20
|
if TYPE_CHECKING:
|
|
@@ -36,7 +37,7 @@ if TYPE_CHECKING:
|
|
|
36
37
|
_logger = logging.getLogger('pixeltable')
|
|
37
38
|
|
|
38
39
|
|
|
39
|
-
def init(config_overrides:
|
|
40
|
+
def init(config_overrides: dict[str, Any] | None = None) -> None:
|
|
40
41
|
"""Initializes the Pixeltable environment."""
|
|
41
42
|
if config_overrides is None:
|
|
42
43
|
config_overrides = {}
|
|
@@ -46,18 +47,19 @@ def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
|
|
|
46
47
|
|
|
47
48
|
def create_table(
|
|
48
49
|
path: str,
|
|
49
|
-
schema:
|
|
50
|
+
schema: dict[str, Any] | None = None,
|
|
50
51
|
*,
|
|
51
|
-
source:
|
|
52
|
-
source_format:
|
|
53
|
-
schema_overrides:
|
|
52
|
+
source: TableDataSource | None = None,
|
|
53
|
+
source_format: Literal['csv', 'excel', 'parquet', 'json'] | None = None,
|
|
54
|
+
schema_overrides: dict[str, Any] | None = None,
|
|
55
|
+
create_default_idxs: bool = True,
|
|
54
56
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
55
57
|
primary_key: str | list[str] | None = None,
|
|
56
58
|
num_retained_versions: int = 10,
|
|
57
59
|
comment: str = '',
|
|
58
60
|
media_validation: Literal['on_read', 'on_write'] = 'on_write',
|
|
59
61
|
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
|
|
60
|
-
extra_args:
|
|
62
|
+
extra_args: dict[str, Any] | None = None, # Additional arguments to data source provider
|
|
61
63
|
) -> catalog.Table:
|
|
62
64
|
"""Create a new base table. Exactly one of `schema` or `source` must be provided.
|
|
63
65
|
|
|
@@ -77,6 +79,8 @@ def create_table(
|
|
|
77
79
|
schema_overrides: Must be used in conjunction with a `source`.
|
|
78
80
|
If specified, then columns in `schema_overrides` will be given the specified types.
|
|
79
81
|
(Pixeltable will attempt to infer the types of any columns not specified.)
|
|
82
|
+
create_default_idxs: If True, creates a B-tree index on every scalar and media column that is not computed,
|
|
83
|
+
except for boolean columns.
|
|
80
84
|
on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
|
|
81
85
|
invalid media file (such as a corrupt image) for one of the inserted rows.
|
|
82
86
|
|
|
@@ -138,7 +142,7 @@ def create_table(
|
|
|
138
142
|
|
|
139
143
|
>>> tbl = pxt.create_table('my_table', source='data.csv')
|
|
140
144
|
"""
|
|
141
|
-
from pixeltable.io.table_data_conduit import
|
|
145
|
+
from pixeltable.io.table_data_conduit import UnkTableDataConduit
|
|
142
146
|
from pixeltable.io.utils import normalize_primary_key_parameter
|
|
143
147
|
|
|
144
148
|
if (schema is None) == (source is None):
|
|
@@ -150,11 +154,16 @@ def create_table(
|
|
|
150
154
|
path_obj = catalog.Path.parse(path)
|
|
151
155
|
if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
|
|
152
156
|
media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
|
|
153
|
-
primary_key:
|
|
154
|
-
|
|
155
|
-
tds = None
|
|
156
|
-
data_source = None
|
|
157
|
+
primary_key: list[str] | None = normalize_primary_key_parameter(primary_key)
|
|
158
|
+
data_source: TableDataConduit | None = None
|
|
157
159
|
if source is not None:
|
|
160
|
+
if isinstance(source, str) and source.strip().startswith('pxt://'):
|
|
161
|
+
raise excs.Error(
|
|
162
|
+
'create_table(): Creating a table directly from a cloud URI is not supported.'
|
|
163
|
+
' Please replicate the table locally first using `pxt.replicate()`:\n'
|
|
164
|
+
"replica_tbl = pxt.replicate('pxt://path/to/remote_table', 'local_replica_name')\n"
|
|
165
|
+
"pxt.create_table('new_table_name', source=replica_tbl)"
|
|
166
|
+
)
|
|
158
167
|
tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
|
|
159
168
|
tds.check_source_format()
|
|
160
169
|
data_source = tds.specialize()
|
|
@@ -179,35 +188,43 @@ def create_table(
|
|
|
179
188
|
'Unable to create a proper schema from supplied `source`. Please use appropriate `schema_overrides`.'
|
|
180
189
|
)
|
|
181
190
|
|
|
182
|
-
|
|
191
|
+
tbl, was_created = Catalog.get().create_table(
|
|
183
192
|
path_obj,
|
|
184
193
|
schema,
|
|
185
|
-
data_source.pxt_df if isinstance(data_source, DFTableDataConduit) else None,
|
|
186
194
|
if_exists=if_exists_,
|
|
187
195
|
primary_key=primary_key,
|
|
188
196
|
comment=comment,
|
|
189
197
|
media_validation=media_validation_,
|
|
190
198
|
num_retained_versions=num_retained_versions,
|
|
199
|
+
create_default_idxs=create_default_idxs,
|
|
191
200
|
)
|
|
192
|
-
|
|
201
|
+
|
|
202
|
+
# TODO: combine data loading with table creation into a single transaction
|
|
203
|
+
if was_created:
|
|
193
204
|
fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
|
|
194
|
-
|
|
205
|
+
if isinstance(data_source, DFTableDataConduit):
|
|
206
|
+
df = data_source.pxt_df
|
|
207
|
+
with Catalog.get().begin_xact(tbl=tbl._tbl_version_path, for_write=True, lock_mutable_tree=True):
|
|
208
|
+
tbl._tbl_version.get().insert(None, df, fail_on_exception=fail_on_exception)
|
|
209
|
+
elif data_source is not None and not is_direct_df:
|
|
210
|
+
tbl.insert_table_data_source(data_source=data_source, fail_on_exception=fail_on_exception)
|
|
195
211
|
|
|
196
|
-
return
|
|
212
|
+
return tbl
|
|
197
213
|
|
|
198
214
|
|
|
199
215
|
def create_view(
|
|
200
216
|
path: str,
|
|
201
217
|
base: catalog.Table | DataFrame,
|
|
202
218
|
*,
|
|
203
|
-
additional_columns:
|
|
219
|
+
additional_columns: dict[str, Any] | None = None,
|
|
204
220
|
is_snapshot: bool = False,
|
|
205
|
-
|
|
221
|
+
create_default_idxs: bool = False,
|
|
222
|
+
iterator: tuple[type[ComponentIterator], dict[str, Any]] | None = None,
|
|
206
223
|
num_retained_versions: int = 10,
|
|
207
224
|
comment: str = '',
|
|
208
225
|
media_validation: Literal['on_read', 'on_write'] = 'on_write',
|
|
209
226
|
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
|
|
210
|
-
) ->
|
|
227
|
+
) -> catalog.Table | None:
|
|
211
228
|
"""Create a view of an existing table object (which itself can be a view or a snapshot or a base table).
|
|
212
229
|
|
|
213
230
|
Args:
|
|
@@ -220,6 +237,8 @@ def create_view(
|
|
|
220
237
|
[`create_table`][pixeltable.create_table].
|
|
221
238
|
is_snapshot: Whether the view is a snapshot. Setting this to `True` is equivalent to calling
|
|
222
239
|
[`create_snapshot`][pixeltable.create_snapshot].
|
|
240
|
+
create_default_idxs: Whether to create default indexes on the view's columns (the base's columns are excluded).
|
|
241
|
+
Cannot be `True` for snapshots.
|
|
223
242
|
iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
|
|
224
243
|
the base table.
|
|
225
244
|
num_retained_versions: Number of versions of the view to retain.
|
|
@@ -267,9 +286,11 @@ def create_view(
|
|
|
267
286
|
>>> tbl = pxt.get_table('my_table')
|
|
268
287
|
... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 100), if_exists='replace_force')
|
|
269
288
|
"""
|
|
289
|
+
if is_snapshot and create_default_idxs is True:
|
|
290
|
+
raise excs.Error('Cannot create default indexes on a snapshot')
|
|
270
291
|
tbl_version_path: TableVersionPath
|
|
271
|
-
select_list:
|
|
272
|
-
where:
|
|
292
|
+
select_list: list[tuple[exprs.Expr, str | None]] | None = None
|
|
293
|
+
where: exprs.Expr | None = None
|
|
273
294
|
if isinstance(base, catalog.Table):
|
|
274
295
|
tbl_version_path = base._tbl_version_path
|
|
275
296
|
sample_clause = None
|
|
@@ -297,7 +318,7 @@ def create_view(
|
|
|
297
318
|
if col_name in [c.name for c in tbl_version_path.columns()]:
|
|
298
319
|
raise excs.Error(
|
|
299
320
|
f'Column {col_name!r} already exists in the base table '
|
|
300
|
-
f'{tbl_version_path.get_column(col_name).
|
|
321
|
+
f'{tbl_version_path.get_column(col_name).get_tbl().name}.'
|
|
301
322
|
)
|
|
302
323
|
|
|
303
324
|
return Catalog.get().create_view(
|
|
@@ -308,6 +329,7 @@ def create_view(
|
|
|
308
329
|
sample_clause=sample_clause,
|
|
309
330
|
additional_columns=additional_columns,
|
|
310
331
|
is_snapshot=is_snapshot,
|
|
332
|
+
create_default_idxs=create_default_idxs,
|
|
311
333
|
iterator=iterator,
|
|
312
334
|
num_retained_versions=num_retained_versions,
|
|
313
335
|
comment=comment,
|
|
@@ -320,13 +342,13 @@ def create_snapshot(
|
|
|
320
342
|
path_str: str,
|
|
321
343
|
base: catalog.Table | DataFrame,
|
|
322
344
|
*,
|
|
323
|
-
additional_columns:
|
|
324
|
-
iterator:
|
|
345
|
+
additional_columns: dict[str, Any] | None = None,
|
|
346
|
+
iterator: tuple[type[ComponentIterator], dict[str, Any]] | None = None,
|
|
325
347
|
num_retained_versions: int = 10,
|
|
326
348
|
comment: str = '',
|
|
327
349
|
media_validation: Literal['on_read', 'on_write'] = 'on_write',
|
|
328
350
|
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
|
|
329
|
-
) ->
|
|
351
|
+
) -> catalog.Table | None:
|
|
330
352
|
"""Create a snapshot of an existing table object (which itself can be a view or a snapshot or a base table).
|
|
331
353
|
|
|
332
354
|
Args:
|
|
@@ -680,7 +702,7 @@ def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths:
|
|
|
680
702
|
|
|
681
703
|
def create_dir(
|
|
682
704
|
path: str, *, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
|
|
683
|
-
) ->
|
|
705
|
+
) -> catalog.Dir | None:
|
|
684
706
|
"""Create a directory.
|
|
685
707
|
|
|
686
708
|
Args:
|
|
@@ -835,9 +857,7 @@ def ls(path: str = '') -> pd.DataFrame:
|
|
|
835
857
|
|
|
836
858
|
|
|
837
859
|
def _extract_paths(
|
|
838
|
-
dir_entries: dict[str, Catalog.DirEntry],
|
|
839
|
-
parent: catalog.Path,
|
|
840
|
-
entry_type: Optional[type[catalog.SchemaObject]] = None,
|
|
860
|
+
dir_entries: dict[str, Catalog.DirEntry], parent: catalog.Path, entry_type: type[catalog.SchemaObject] | None = None
|
|
841
861
|
) -> list[catalog.Path]:
|
|
842
862
|
"""Convert nested dir_entries structure to a flattened list of paths."""
|
|
843
863
|
matches: list[str]
|
|
@@ -947,7 +967,7 @@ def tools(*args: func.Function | func.tools.Tool) -> func.tools.Tools:
|
|
|
947
967
|
return func.tools.Tools(tools=[arg if isinstance(arg, func.tools.Tool) else tool(arg) for arg in args])
|
|
948
968
|
|
|
949
969
|
|
|
950
|
-
def tool(fn: func.Function, name:
|
|
970
|
+
def tool(fn: func.Function, name: str | None = None, description: str | None = None) -> func.tools.Tool:
|
|
951
971
|
"""
|
|
952
972
|
Specifies a Pixeltable UDF to be used as an LLM tool with customizable metadata. See the documentation for
|
|
953
973
|
[pxt.tools()][pixeltable.tools] for more details.
|
|
@@ -968,11 +988,7 @@ def tool(fn: func.Function, name: Optional[str] = None, description: Optional[st
|
|
|
968
988
|
|
|
969
989
|
|
|
970
990
|
def configure_logging(
|
|
971
|
-
*,
|
|
972
|
-
to_stdout: Optional[bool] = None,
|
|
973
|
-
level: Optional[int] = None,
|
|
974
|
-
add: Optional[str] = None,
|
|
975
|
-
remove: Optional[str] = None,
|
|
991
|
+
*, to_stdout: bool | None = None, level: int | None = None, add: str | None = None, remove: str | None = None
|
|
976
992
|
) -> None:
|
|
977
993
|
"""Configure logging.
|
|
978
994
|
|
pixeltable/index/base.py
CHANGED
|
@@ -5,7 +5,9 @@ from typing import Any
|
|
|
5
5
|
|
|
6
6
|
import sqlalchemy as sql
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
import pixeltable.catalog as catalog
|
|
9
|
+
import pixeltable.exprs as exprs
|
|
10
|
+
import pixeltable.type_system as ts
|
|
9
11
|
|
|
10
12
|
|
|
11
13
|
class IndexBase(abc.ABC):
|
|
@@ -18,12 +20,14 @@ class IndexBase(abc.ABC):
|
|
|
18
20
|
"""
|
|
19
21
|
|
|
20
22
|
@abc.abstractmethod
|
|
21
|
-
def __init__(self,
|
|
23
|
+
def __init__(self, **kwargs: Any):
|
|
22
24
|
pass
|
|
23
25
|
|
|
24
26
|
@abc.abstractmethod
|
|
25
|
-
def
|
|
26
|
-
"""
|
|
27
|
+
def create_value_expr(self, c: catalog.Column) -> exprs.Expr:
|
|
28
|
+
"""
|
|
29
|
+
Validates that the index can be created on column c and returns an expression that computes the index value.
|
|
30
|
+
"""
|
|
27
31
|
pass
|
|
28
32
|
|
|
29
33
|
@abc.abstractmethod
|
|
@@ -32,13 +36,13 @@ class IndexBase(abc.ABC):
|
|
|
32
36
|
pass
|
|
33
37
|
|
|
34
38
|
@abc.abstractmethod
|
|
35
|
-
def
|
|
39
|
+
def get_index_sa_type(self, value_col_type: ts.ColumnType) -> sql.types.TypeEngine:
|
|
36
40
|
"""Return the sqlalchemy type of the index value column"""
|
|
37
41
|
pass
|
|
38
42
|
|
|
39
43
|
@abc.abstractmethod
|
|
40
|
-
def
|
|
41
|
-
"""
|
|
44
|
+
def sa_index(self, index_name: str, index_value_col: catalog.Column) -> sql.Index:
|
|
45
|
+
"""Return a sqlalchemy Index instance"""
|
|
42
46
|
pass
|
|
43
47
|
|
|
44
48
|
@abc.abstractmethod
|
|
@@ -57,5 +61,5 @@ class IndexBase(abc.ABC):
|
|
|
57
61
|
|
|
58
62
|
@classmethod
|
|
59
63
|
@abc.abstractmethod
|
|
60
|
-
def from_dict(cls,
|
|
64
|
+
def from_dict(cls, d: dict) -> IndexBase:
|
|
61
65
|
pass
|
pixeltable/index/btree.py
CHANGED
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
from typing import TYPE_CHECKING
|
|
1
|
+
from typing import TYPE_CHECKING
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
5
5
|
# TODO: why does this import result in a circular import, but the one im embedding_index.py doesn't?
|
|
6
6
|
# import pixeltable.catalog as catalog
|
|
7
7
|
import pixeltable.exceptions as excs
|
|
8
|
-
|
|
9
|
-
|
|
8
|
+
import pixeltable.exprs as exprs
|
|
9
|
+
import pixeltable.type_system as ts
|
|
10
10
|
from pixeltable.func.udf import udf
|
|
11
11
|
|
|
12
12
|
from .base import IndexBase
|
|
13
13
|
|
|
14
14
|
if TYPE_CHECKING:
|
|
15
|
-
import pixeltable.
|
|
15
|
+
import pixeltable.catalog as catalog
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class BtreeIndex(IndexBase):
|
|
@@ -22,42 +22,39 @@ class BtreeIndex(IndexBase):
|
|
|
22
22
|
|
|
23
23
|
MAX_STRING_LEN = 256
|
|
24
24
|
|
|
25
|
-
value_expr: 'pixeltable.exprs.Expr'
|
|
26
|
-
|
|
27
25
|
@staticmethod
|
|
28
26
|
@udf
|
|
29
|
-
def str_filter(s:
|
|
27
|
+
def str_filter(s: str | None) -> str | None:
|
|
30
28
|
if s is None:
|
|
31
29
|
return None
|
|
32
30
|
return s[: BtreeIndex.MAX_STRING_LEN]
|
|
33
31
|
|
|
34
|
-
def __init__(self
|
|
32
|
+
def __init__(self) -> None:
|
|
33
|
+
pass
|
|
34
|
+
|
|
35
|
+
def create_value_expr(self, c: 'catalog.Column') -> 'exprs.Expr':
|
|
35
36
|
if not c.col_type.is_scalar_type() and not c.col_type.is_media_type():
|
|
36
37
|
raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')
|
|
38
|
+
value_expr: exprs.Expr
|
|
37
39
|
if c.col_type.is_media_type():
|
|
38
40
|
# an index on a media column is an index on the file url
|
|
39
41
|
# no validation for media columns: we're only interested in the string value
|
|
40
|
-
|
|
42
|
+
value_expr = exprs.ColumnRef(c, perform_validation=False)
|
|
41
43
|
else:
|
|
42
|
-
|
|
44
|
+
value_expr = (
|
|
43
45
|
BtreeIndex.str_filter(exprs.ColumnRef(c)) if c.col_type.is_string_type() else exprs.ColumnRef(c)
|
|
44
46
|
)
|
|
45
|
-
|
|
46
|
-
def index_value_expr(self) -> 'exprs.Expr':
|
|
47
|
-
return self.value_expr
|
|
47
|
+
return value_expr
|
|
48
48
|
|
|
49
49
|
def records_value_errors(self) -> bool:
|
|
50
50
|
return False
|
|
51
51
|
|
|
52
|
-
def
|
|
52
|
+
def get_index_sa_type(self, val_col_type: ts.ColumnType) -> sql.types.TypeEngine:
|
|
53
53
|
"""Return the sqlalchemy type of the index value column"""
|
|
54
|
-
return
|
|
54
|
+
return val_col_type.to_sa_type()
|
|
55
55
|
|
|
56
|
-
def
|
|
57
|
-
|
|
58
|
-
idx = sql.Index(index_name, index_value_col.sa_col, postgresql_using='btree')
|
|
59
|
-
conn = Env.get().conn
|
|
60
|
-
idx.create(bind=conn)
|
|
56
|
+
def sa_index(self, store_index_name: str, index_value_col: 'catalog.Column') -> sql.Index:
|
|
57
|
+
return sql.Index(store_index_name, index_value_col.sa_col, postgresql_using='btree')
|
|
61
58
|
|
|
62
59
|
def drop_index(self, index_name: str, index_value_col: 'catalog.Column') -> None:
|
|
63
60
|
"""Drop the index on the index value column"""
|
|
@@ -72,5 +69,5 @@ class BtreeIndex(IndexBase):
|
|
|
72
69
|
return {}
|
|
73
70
|
|
|
74
71
|
@classmethod
|
|
75
|
-
def from_dict(cls,
|
|
76
|
-
return cls(
|
|
72
|
+
def from_dict(cls, d: dict) -> 'BtreeIndex':
|
|
73
|
+
return cls()
|
|
@@ -1,16 +1,18 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import enum
|
|
4
|
-
from typing import Any, ClassVar
|
|
4
|
+
from typing import Any, ClassVar
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pgvector.sqlalchemy # type: ignore[import-untyped]
|
|
8
8
|
import PIL.Image
|
|
9
9
|
import sqlalchemy as sql
|
|
10
10
|
|
|
11
|
+
import pixeltable.catalog as catalog
|
|
11
12
|
import pixeltable.exceptions as excs
|
|
13
|
+
import pixeltable.exprs as exprs
|
|
14
|
+
import pixeltable.func as func
|
|
12
15
|
import pixeltable.type_system as ts
|
|
13
|
-
from pixeltable import catalog, exprs, func
|
|
14
16
|
from pixeltable.env import Env
|
|
15
17
|
|
|
16
18
|
from .base import IndexBase
|
|
@@ -39,28 +41,23 @@ class EmbeddingIndex(IndexBase):
|
|
|
39
41
|
}
|
|
40
42
|
|
|
41
43
|
metric: Metric
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
image_embed: Optional[func.Function]
|
|
44
|
+
string_embed: func.Function | None
|
|
45
|
+
image_embed: func.Function | None
|
|
45
46
|
string_embed_signature_idx: int
|
|
46
47
|
image_embed_signature_idx: int
|
|
47
|
-
index_col_type: pgvector.sqlalchemy.Vector
|
|
48
48
|
|
|
49
49
|
def __init__(
|
|
50
50
|
self,
|
|
51
|
-
c: catalog.Column,
|
|
52
51
|
metric: str,
|
|
53
|
-
embed:
|
|
54
|
-
string_embed:
|
|
55
|
-
image_embed:
|
|
52
|
+
embed: func.Function | None = None,
|
|
53
|
+
string_embed: func.Function | None = None,
|
|
54
|
+
image_embed: func.Function | None = None,
|
|
56
55
|
):
|
|
57
56
|
if embed is None and string_embed is None and image_embed is None:
|
|
58
57
|
raise excs.Error('At least one of `embed`, `string_embed`, or `image_embed` must be specified')
|
|
59
58
|
metric_names = [m.name.lower() for m in self.Metric]
|
|
60
59
|
if metric.lower() not in metric_names:
|
|
61
60
|
raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
|
|
62
|
-
if not c.col_type.is_string_type() and not c.col_type.is_image_type():
|
|
63
|
-
raise excs.Error('Embedding index requires string or image column')
|
|
64
61
|
|
|
65
62
|
self.string_embed = None
|
|
66
63
|
self.image_embed = None
|
|
@@ -102,47 +99,42 @@ class EmbeddingIndex(IndexBase):
|
|
|
102
99
|
)
|
|
103
100
|
|
|
104
101
|
# Now validate the return types of the embedding functions.
|
|
105
|
-
|
|
106
102
|
if self.string_embed is not None:
|
|
107
103
|
self._validate_embedding_fn(self.string_embed)
|
|
108
|
-
|
|
109
104
|
if self.image_embed is not None:
|
|
110
105
|
self._validate_embedding_fn(self.image_embed)
|
|
111
106
|
|
|
107
|
+
self.metric = self.Metric[metric.upper()]
|
|
108
|
+
|
|
109
|
+
def create_value_expr(self, c: catalog.Column) -> exprs.Expr:
|
|
110
|
+
if not c.col_type.is_string_type() and not c.col_type.is_image_type():
|
|
111
|
+
raise excs.Error(
|
|
112
|
+
f'Embedding index requires string or image column, column {c.name!r} has type {c.col_type}'
|
|
113
|
+
)
|
|
112
114
|
if c.col_type.is_string_type() and self.string_embed is None:
|
|
113
115
|
raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
|
|
114
116
|
if c.col_type.is_image_type() and self.image_embed is None:
|
|
115
117
|
raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
|
|
116
118
|
|
|
117
|
-
|
|
118
|
-
self.value_expr = (
|
|
119
|
+
return (
|
|
119
120
|
self.string_embed(exprs.ColumnRef(c))
|
|
120
121
|
if c.col_type.is_string_type()
|
|
121
122
|
else self.image_embed(exprs.ColumnRef(c))
|
|
122
123
|
)
|
|
123
|
-
assert isinstance(self.value_expr.col_type, ts.ArrayType)
|
|
124
|
-
vector_size = self.value_expr.col_type.shape[0]
|
|
125
|
-
assert vector_size is not None
|
|
126
|
-
self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
|
|
127
|
-
|
|
128
|
-
def index_value_expr(self) -> exprs.Expr:
|
|
129
|
-
"""Return expression that computes the value that goes into the index"""
|
|
130
|
-
return self.value_expr
|
|
131
124
|
|
|
132
125
|
def records_value_errors(self) -> bool:
|
|
133
126
|
return True
|
|
134
127
|
|
|
135
|
-
def
|
|
136
|
-
|
|
137
|
-
|
|
128
|
+
def get_index_sa_type(self, val_col_type: ts.ColumnType) -> sql.types.TypeEngine:
|
|
129
|
+
assert isinstance(val_col_type, ts.ArrayType) and val_col_type.shape is not None
|
|
130
|
+
vector_size = val_col_type.shape[0]
|
|
131
|
+
assert vector_size is not None
|
|
132
|
+
return pgvector.sqlalchemy.Vector(vector_size)
|
|
138
133
|
|
|
139
|
-
def
|
|
134
|
+
def sa_index(self, store_index_name: str, index_value_col: 'catalog.Column') -> sql.Index:
|
|
140
135
|
"""Create the index on the index value column"""
|
|
141
|
-
Env.get().dbms.
|
|
142
|
-
|
|
143
|
-
index_value_sa_col=index_value_col.sa_col,
|
|
144
|
-
conn=Env.get().conn,
|
|
145
|
-
metric=self.PGVECTOR_OPS[self.metric],
|
|
136
|
+
return Env.get().dbms.sa_vector_index(
|
|
137
|
+
store_index_name, index_value_col.sa_col, metric=self.PGVECTOR_OPS[self.metric]
|
|
146
138
|
)
|
|
147
139
|
|
|
148
140
|
def drop_index(self, index_name: str, index_value_col: catalog.Column) -> None:
|
|
@@ -153,6 +145,7 @@ class EmbeddingIndex(IndexBase):
|
|
|
153
145
|
def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ColumnElement:
|
|
154
146
|
"""Create a ColumnElement that represents '<val_column> <op> <item>'"""
|
|
155
147
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
148
|
+
embedding: np.ndarray
|
|
156
149
|
if isinstance(item, str):
|
|
157
150
|
assert self.string_embed is not None
|
|
158
151
|
embedding = self.string_embed.exec([item], {})
|
|
@@ -171,7 +164,7 @@ class EmbeddingIndex(IndexBase):
|
|
|
171
164
|
def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.ColumnElement:
|
|
172
165
|
"""Create a ColumnElement that is used in an ORDER BY clause"""
|
|
173
166
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
174
|
-
embedding:
|
|
167
|
+
embedding: np.ndarray | None = None
|
|
175
168
|
if isinstance(item, str):
|
|
176
169
|
assert self.string_embed is not None
|
|
177
170
|
embedding = self.string_embed.exec([item], {})
|
|
@@ -196,9 +189,7 @@ class EmbeddingIndex(IndexBase):
|
|
|
196
189
|
return 'embedding'
|
|
197
190
|
|
|
198
191
|
@classmethod
|
|
199
|
-
def _resolve_embedding_fn(
|
|
200
|
-
cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type
|
|
201
|
-
) -> Optional[func.Function]:
|
|
192
|
+
def _resolve_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> func.Function | None:
|
|
202
193
|
"""Find an overload resolution for `embed_fn` that matches the given type."""
|
|
203
194
|
assert isinstance(embed_fn, func.Function)
|
|
204
195
|
for resolved_fn in embed_fn._resolved_fns:
|
|
@@ -252,7 +243,7 @@ class EmbeddingIndex(IndexBase):
|
|
|
252
243
|
}
|
|
253
244
|
|
|
254
245
|
@classmethod
|
|
255
|
-
def from_dict(cls,
|
|
246
|
+
def from_dict(cls, d: dict) -> EmbeddingIndex:
|
|
256
247
|
string_embed = func.Function.from_dict(d['string_embed']) if d['string_embed'] is not None else None
|
|
257
248
|
image_embed = func.Function.from_dict(d['image_embed']) if d['image_embed'] is not None else None
|
|
258
|
-
return cls(
|
|
249
|
+
return cls(metric=d['metric'], string_embed=string_embed, image_embed=image_embed)
|
pixeltable/io/datarows.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Any, Iterable
|
|
3
|
+
from typing import Any, Iterable
|
|
4
4
|
|
|
5
5
|
import pixeltable as pxt
|
|
6
6
|
import pixeltable.type_system as ts
|
|
@@ -60,7 +60,7 @@ def import_rows(
|
|
|
60
60
|
tbl_path: str,
|
|
61
61
|
rows: list[dict[str, Any]],
|
|
62
62
|
*,
|
|
63
|
-
schema_overrides:
|
|
63
|
+
schema_overrides: dict[str, Any] | None = None,
|
|
64
64
|
primary_key: str | list[str] | None = None,
|
|
65
65
|
num_retained_versions: int = 10,
|
|
66
66
|
comment: str = '',
|
|
@@ -104,7 +104,7 @@ def import_json(
|
|
|
104
104
|
tbl_path: str,
|
|
105
105
|
filepath_or_url: str,
|
|
106
106
|
*,
|
|
107
|
-
schema_overrides:
|
|
107
|
+
schema_overrides: dict[str, Any] | None = None,
|
|
108
108
|
primary_key: str | list[str] | None = None,
|
|
109
109
|
num_retained_versions: int = 10,
|
|
110
110
|
comment: str = '',
|
pixeltable/io/external_store.py
CHANGED
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import abc
|
|
4
4
|
import itertools
|
|
5
5
|
import logging
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any
|
|
7
7
|
|
|
8
8
|
import pixeltable.exceptions as excs
|
|
9
9
|
import pixeltable.type_system as ts
|
|
@@ -68,10 +68,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
68
68
|
stored_proxies: dict[ColumnHandle, ColumnHandle] # original col -> proxy col
|
|
69
69
|
|
|
70
70
|
def __init__(
|
|
71
|
-
self,
|
|
72
|
-
name: str,
|
|
73
|
-
col_mapping: dict[ColumnHandle, str],
|
|
74
|
-
stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]],
|
|
71
|
+
self, name: str, col_mapping: dict[ColumnHandle, str], stored_proxies: dict[ColumnHandle, ColumnHandle] | None
|
|
75
72
|
):
|
|
76
73
|
super().__init__(name)
|
|
77
74
|
self._col_mapping = col_mapping
|
|
@@ -190,7 +187,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
190
187
|
table: Table,
|
|
191
188
|
export_cols: dict[str, ts.ColumnType],
|
|
192
189
|
import_cols: dict[str, ts.ColumnType],
|
|
193
|
-
col_mapping:
|
|
190
|
+
col_mapping: dict[str, str] | None,
|
|
194
191
|
) -> dict[ColumnHandle, str]:
|
|
195
192
|
"""
|
|
196
193
|
Verifies that the specified `col_mapping` is valid. In particular, checks that:
|
|
@@ -217,19 +214,19 @@ class Project(ExternalStore, abc.ABC):
|
|
|
217
214
|
if t_col not in t_cols:
|
|
218
215
|
if is_user_specified_col_mapping:
|
|
219
216
|
raise excs.Error(
|
|
220
|
-
f'Column name
|
|
217
|
+
f'Column name {t_col!r} appears as a key in `col_mapping`, but {table._display_str()} '
|
|
221
218
|
'contains no such column.'
|
|
222
219
|
)
|
|
223
220
|
else:
|
|
224
221
|
raise excs.Error(
|
|
225
|
-
f'Column
|
|
222
|
+
f'Column {t_col!r} does not exist in {table._display_str()}. Either add a column {t_col!r}, '
|
|
226
223
|
f'or specify a `col_mapping` to associate a different column with '
|
|
227
|
-
f'the external field
|
|
224
|
+
f'the external field {ext_col!r}.'
|
|
228
225
|
)
|
|
229
226
|
if ext_col not in export_cols and ext_col not in import_cols:
|
|
230
227
|
raise excs.Error(
|
|
231
|
-
f'Column name
|
|
232
|
-
f'configuration has no column
|
|
228
|
+
f'Column name {ext_col!r} appears as a value in `col_mapping`, but the external store '
|
|
229
|
+
f'configuration has no column {ext_col!r}.'
|
|
233
230
|
)
|
|
234
231
|
col_ref = table[t_col]
|
|
235
232
|
assert isinstance(col_ref, exprs.ColumnRef)
|
|
@@ -244,19 +241,19 @@ class Project(ExternalStore, abc.ABC):
|
|
|
244
241
|
ext_col_type = export_cols[ext_col]
|
|
245
242
|
if not ext_col_type.is_supertype_of(t_col_type, ignore_nullable=True):
|
|
246
243
|
raise excs.Error(
|
|
247
|
-
f'Column
|
|
244
|
+
f'Column {t_col!r} cannot be exported to external column {ext_col!r} '
|
|
248
245
|
f'(incompatible types; expecting `{ext_col_type}`)'
|
|
249
246
|
)
|
|
250
247
|
if ext_col in import_cols:
|
|
251
248
|
# Validate that the external column can be assigned to the table column
|
|
252
249
|
if table._tbl_version_path.get_column(t_col).is_computed:
|
|
253
250
|
raise excs.Error(
|
|
254
|
-
f'Column
|
|
251
|
+
f'Column {t_col!r} is a computed column, which cannot be populated from an external column'
|
|
255
252
|
)
|
|
256
253
|
ext_col_type = import_cols[ext_col]
|
|
257
254
|
if not t_col_type.is_supertype_of(ext_col_type, ignore_nullable=True):
|
|
258
255
|
raise excs.Error(
|
|
259
|
-
f'Column
|
|
256
|
+
f'Column {t_col!r} cannot be imported from external column {ext_col!r} '
|
|
260
257
|
f'(incompatible types; expecting `{ext_col_type}`)'
|
|
261
258
|
)
|
|
262
259
|
return resolved_col_mapping
|
|
@@ -271,7 +268,7 @@ class MockProject(Project):
|
|
|
271
268
|
export_cols: dict[str, ts.ColumnType],
|
|
272
269
|
import_cols: dict[str, ts.ColumnType],
|
|
273
270
|
col_mapping: dict[ColumnHandle, str],
|
|
274
|
-
stored_proxies:
|
|
271
|
+
stored_proxies: dict[ColumnHandle, ColumnHandle] | None = None,
|
|
275
272
|
):
|
|
276
273
|
super().__init__(name, col_mapping, stored_proxies)
|
|
277
274
|
self.export_cols = export_cols
|
|
@@ -285,7 +282,7 @@ class MockProject(Project):
|
|
|
285
282
|
name: str,
|
|
286
283
|
export_cols: dict[str, ts.ColumnType],
|
|
287
284
|
import_cols: dict[str, ts.ColumnType],
|
|
288
|
-
col_mapping:
|
|
285
|
+
col_mapping: dict[str, str] | None = None,
|
|
289
286
|
) -> 'MockProject':
|
|
290
287
|
col_mapping = cls.validate_columns(t, export_cols, import_cols, col_mapping)
|
|
291
288
|
return cls(name, export_cols, import_cols, col_mapping)
|