pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +42 -8
- pixeltable/{dataframe.py → _query.py} +470 -206
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -4
- pixeltable/catalog/catalog.py +1785 -432
- pixeltable/catalog/column.py +190 -113
- pixeltable/catalog/dir.py +2 -4
- pixeltable/catalog/globals.py +19 -46
- pixeltable/catalog/insertable_table.py +191 -98
- pixeltable/catalog/path.py +63 -23
- pixeltable/catalog/schema_object.py +11 -15
- pixeltable/catalog/table.py +843 -436
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +978 -657
- pixeltable/catalog/table_version_handle.py +72 -16
- pixeltable/catalog/table_version_path.py +112 -43
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +134 -90
- pixeltable/config.py +134 -22
- pixeltable/env.py +471 -157
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +4 -1
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +11 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +106 -56
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +19 -19
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +351 -84
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +36 -23
- pixeltable/exprs/column_ref.py +213 -89
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +164 -54
- pixeltable/exprs/expr.py +70 -44
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +100 -40
- pixeltable/exprs/globals.py +2 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +18 -32
- pixeltable/exprs/is_null.py +7 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +27 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +167 -67
- pixeltable/exprs/rowid_ref.py +25 -10
- pixeltable/exprs/similarity_expr.py +58 -40
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +17 -11
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +29 -27
- pixeltable/func/signature.py +46 -19
- pixeltable/func/tools.py +31 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +16 -0
- pixeltable/functions/anthropic.py +123 -77
- pixeltable/functions/audio.py +147 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +7 -4
- pixeltable/functions/deepseek.py +35 -43
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +11 -20
- pixeltable/functions/gemini.py +195 -39
- pixeltable/functions/globals.py +142 -14
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1056 -24
- pixeltable/functions/image.py +115 -57
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +28 -13
- pixeltable/functions/math.py +67 -5
- pixeltable/functions/mistralai.py +18 -55
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +20 -13
- pixeltable/functions/openai.py +240 -226
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +4 -4
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +239 -69
- pixeltable/functions/timestamp.py +16 -16
- pixeltable/functions/together.py +24 -84
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1515 -107
- pixeltable/functions/vision.py +8 -8
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +16 -8
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +362 -115
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +28 -22
- pixeltable/index/embedding_index.py +100 -118
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +8 -7
- pixeltable/io/external_store.py +56 -105
- pixeltable/io/fiftyone.py +13 -13
- pixeltable/io/globals.py +31 -30
- pixeltable/io/hf_datasets.py +61 -16
- pixeltable/io/label_studio.py +74 -70
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +21 -12
- pixeltable/io/parquet.py +25 -105
- pixeltable/io/table_data_conduit.py +250 -123
- pixeltable/io/utils.py +4 -4
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +26 -25
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +112 -78
- pixeltable/iterators/image.py +12 -15
- pixeltable/iterators/string.py +11 -4
- pixeltable/iterators/video.py +523 -120
- pixeltable/metadata/__init__.py +14 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_30.py +34 -21
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +20 -31
- pixeltable/metadata/notes.py +9 -0
- pixeltable/metadata/schema.py +140 -53
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +382 -115
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +547 -83
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +257 -59
- pixeltable/store.py +311 -194
- pixeltable/type_system.py +373 -211
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +131 -17
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +6 -6
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +32 -6
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +7 -18
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +86 -48
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +26 -0
- pixeltable/utils/system.py +30 -0
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -40
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable-0.3.14.dist-info/METADATA +0 -434
- pixeltable-0.3.14.dist-info/RECORD +0 -186
- pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/index/base.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import abc
|
|
4
|
-
from typing import Any
|
|
5
4
|
|
|
6
5
|
import sqlalchemy as sql
|
|
7
6
|
|
|
8
|
-
|
|
7
|
+
import pixeltable.catalog as catalog
|
|
8
|
+
import pixeltable.exprs as exprs
|
|
9
|
+
import pixeltable.type_system as ts
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class IndexBase(abc.ABC):
|
|
@@ -18,39 +19,34 @@ class IndexBase(abc.ABC):
|
|
|
18
19
|
"""
|
|
19
20
|
|
|
20
21
|
@abc.abstractmethod
|
|
21
|
-
def
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def index_value_expr(self) -> exprs.Expr:
|
|
26
|
-
"""Return expression that computes the value that goes into the index"""
|
|
27
|
-
pass
|
|
22
|
+
def create_value_expr(self, c: catalog.Column) -> exprs.Expr:
|
|
23
|
+
"""
|
|
24
|
+
Validates that the index can be created on column c and returns an expression that computes the index value.
|
|
25
|
+
"""
|
|
28
26
|
|
|
29
27
|
@abc.abstractmethod
|
|
30
28
|
def records_value_errors(self) -> bool:
|
|
31
29
|
"""True if index_value_expr() can raise errors"""
|
|
32
|
-
pass
|
|
33
30
|
|
|
34
31
|
@abc.abstractmethod
|
|
35
|
-
def
|
|
32
|
+
def get_index_sa_type(self, value_col_type: ts.ColumnType) -> sql.types.TypeEngine:
|
|
36
33
|
"""Return the sqlalchemy type of the index value column"""
|
|
37
|
-
pass
|
|
38
34
|
|
|
39
35
|
@abc.abstractmethod
|
|
40
|
-
def
|
|
41
|
-
"""
|
|
42
|
-
|
|
36
|
+
def sa_create_stmt(self, store_index_name: str, sa_value_col: sql.Column) -> sql.Compiled:
|
|
37
|
+
"""Return a sqlalchemy statement for creating the index"""
|
|
38
|
+
|
|
39
|
+
@abc.abstractmethod
|
|
40
|
+
def drop_index(self, index_name: str, index_value_col: catalog.Column) -> None:
|
|
41
|
+
"""Drop the index on the index value column"""
|
|
43
42
|
|
|
44
43
|
@classmethod
|
|
45
44
|
@abc.abstractmethod
|
|
46
|
-
def display_name(cls) -> str:
|
|
47
|
-
pass
|
|
45
|
+
def display_name(cls) -> str: ...
|
|
48
46
|
|
|
49
47
|
@abc.abstractmethod
|
|
50
|
-
def as_dict(self) -> dict:
|
|
51
|
-
pass
|
|
48
|
+
def as_dict(self) -> dict: ...
|
|
52
49
|
|
|
53
50
|
@classmethod
|
|
54
51
|
@abc.abstractmethod
|
|
55
|
-
def from_dict(cls,
|
|
56
|
-
pass
|
|
52
|
+
def from_dict(cls, d: dict) -> IndexBase: ...
|
pixeltable/index/btree.py
CHANGED
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
from typing import TYPE_CHECKING
|
|
1
|
+
from typing import TYPE_CHECKING
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
5
5
|
# TODO: why does this import result in a circular import, but the one im embedding_index.py doesn't?
|
|
6
6
|
# import pixeltable.catalog as catalog
|
|
7
7
|
import pixeltable.exceptions as excs
|
|
8
|
-
|
|
9
|
-
|
|
8
|
+
import pixeltable.exprs as exprs
|
|
9
|
+
import pixeltable.type_system as ts
|
|
10
10
|
from pixeltable.func.udf import udf
|
|
11
11
|
|
|
12
12
|
from .base import IndexBase
|
|
13
13
|
|
|
14
14
|
if TYPE_CHECKING:
|
|
15
|
-
import pixeltable.
|
|
15
|
+
import pixeltable.catalog as catalog
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class BtreeIndex(IndexBase):
|
|
@@ -22,42 +22,48 @@ class BtreeIndex(IndexBase):
|
|
|
22
22
|
|
|
23
23
|
MAX_STRING_LEN = 256
|
|
24
24
|
|
|
25
|
-
value_expr: 'pixeltable.exprs.Expr'
|
|
26
|
-
|
|
27
25
|
@staticmethod
|
|
28
26
|
@udf
|
|
29
|
-
def str_filter(s:
|
|
27
|
+
def str_filter(s: str | None) -> str | None:
|
|
30
28
|
if s is None:
|
|
31
29
|
return None
|
|
32
30
|
return s[: BtreeIndex.MAX_STRING_LEN]
|
|
33
31
|
|
|
34
|
-
def __init__(self
|
|
32
|
+
def __init__(self) -> None:
|
|
33
|
+
pass
|
|
34
|
+
|
|
35
|
+
def create_value_expr(self, c: 'catalog.Column') -> 'exprs.Expr':
|
|
35
36
|
if not c.col_type.is_scalar_type() and not c.col_type.is_media_type():
|
|
36
37
|
raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')
|
|
38
|
+
value_expr: exprs.Expr
|
|
37
39
|
if c.col_type.is_media_type():
|
|
38
40
|
# an index on a media column is an index on the file url
|
|
39
41
|
# no validation for media columns: we're only interested in the string value
|
|
40
|
-
|
|
42
|
+
value_expr = exprs.ColumnRef(c, perform_validation=False)
|
|
41
43
|
else:
|
|
42
|
-
|
|
44
|
+
value_expr = (
|
|
43
45
|
BtreeIndex.str_filter(exprs.ColumnRef(c)) if c.col_type.is_string_type() else exprs.ColumnRef(c)
|
|
44
46
|
)
|
|
45
|
-
|
|
46
|
-
def index_value_expr(self) -> 'exprs.Expr':
|
|
47
|
-
return self.value_expr
|
|
47
|
+
return value_expr
|
|
48
48
|
|
|
49
49
|
def records_value_errors(self) -> bool:
|
|
50
50
|
return False
|
|
51
51
|
|
|
52
|
-
def
|
|
52
|
+
def get_index_sa_type(self, val_col_type: ts.ColumnType) -> sql.types.TypeEngine:
|
|
53
53
|
"""Return the sqlalchemy type of the index value column"""
|
|
54
|
-
return
|
|
54
|
+
return val_col_type.to_sa_type()
|
|
55
|
+
|
|
56
|
+
def sa_create_stmt(self, store_index_name: str, sa_value_col: sql.Column) -> sql.Compiled:
|
|
57
|
+
"""Return a sqlalchemy statement for creating the index"""
|
|
58
|
+
from sqlalchemy.dialects import postgresql
|
|
59
|
+
|
|
60
|
+
sa_idx = sql.Index(store_index_name, sa_value_col, postgresql_using='btree')
|
|
61
|
+
return sql.schema.CreateIndex(sa_idx, if_not_exists=True).compile(dialect=postgresql.dialect())
|
|
55
62
|
|
|
56
|
-
def
|
|
57
|
-
"""
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
idx.create(bind=conn)
|
|
63
|
+
def drop_index(self, index_name: str, index_value_col: 'catalog.Column') -> None:
|
|
64
|
+
"""Drop the index on the index value column"""
|
|
65
|
+
# TODO: implement
|
|
66
|
+
raise NotImplementedError()
|
|
61
67
|
|
|
62
68
|
@classmethod
|
|
63
69
|
def display_name(cls) -> str:
|
|
@@ -67,5 +73,5 @@ class BtreeIndex(IndexBase):
|
|
|
67
73
|
return {}
|
|
68
74
|
|
|
69
75
|
@classmethod
|
|
70
|
-
def from_dict(cls,
|
|
71
|
-
return cls(
|
|
76
|
+
def from_dict(cls, d: dict) -> 'BtreeIndex':
|
|
77
|
+
return cls()
|
|
pixeltable/index/embedding_index.py
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import enum
|
|
4
|
-
from typing import Any, ClassVar
|
|
4
|
+
from typing import Any, ClassVar
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pgvector.sqlalchemy # type: ignore[import-untyped]
|
|
8
|
-
import PIL.Image
|
|
9
8
|
import sqlalchemy as sql
|
|
10
9
|
|
|
10
|
+
import pixeltable.catalog as catalog
|
|
11
11
|
import pixeltable.exceptions as excs
|
|
12
|
+
import pixeltable.exprs as exprs
|
|
13
|
+
import pixeltable.func as func
|
|
12
14
|
import pixeltable.type_system as ts
|
|
13
|
-
from pixeltable import catalog, exprs, func
|
|
14
15
|
from pixeltable.env import Env
|
|
15
16
|
|
|
16
17
|
from .base import IndexBase
|
|
@@ -39,124 +40,105 @@ class EmbeddingIndex(IndexBase):
|
|
|
39
40
|
}
|
|
40
41
|
|
|
41
42
|
metric: Metric
|
|
42
|
-
|
|
43
|
-
string_embed: Optional[func.Function]
|
|
44
|
-
image_embed: Optional[func.Function]
|
|
45
|
-
string_embed_signature_idx: int
|
|
46
|
-
image_embed_signature_idx: int
|
|
47
|
-
index_col_type: pgvector.sqlalchemy.Vector
|
|
43
|
+
embeddings: dict[ts.ColumnType.Type, func.Function]
|
|
48
44
|
|
|
49
45
|
def __init__(
|
|
50
46
|
self,
|
|
51
|
-
c: catalog.Column,
|
|
52
47
|
metric: str,
|
|
53
|
-
embed:
|
|
54
|
-
string_embed:
|
|
55
|
-
image_embed:
|
|
48
|
+
embed: func.Function | None = None,
|
|
49
|
+
string_embed: func.Function | None = None,
|
|
50
|
+
image_embed: func.Function | None = None,
|
|
51
|
+
audio_embed: func.Function | None = None,
|
|
52
|
+
video_embed: func.Function | None = None,
|
|
56
53
|
):
|
|
57
54
|
if embed is None and string_embed is None and image_embed is None:
|
|
58
55
|
raise excs.Error('At least one of `embed`, `string_embed`, or `image_embed` must be specified')
|
|
59
56
|
metric_names = [m.name.lower() for m in self.Metric]
|
|
60
57
|
if metric.lower() not in metric_names:
|
|
61
58
|
raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
if
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
elif embed is not None:
|
|
93
|
-
# `embed` is specified; see if it has an image signature.
|
|
94
|
-
self.image_embed = self._resolve_embedding_fn(embed, ts.ColumnType.Type.IMAGE)
|
|
95
|
-
|
|
96
|
-
if self.string_embed is None and self.image_embed is None:
|
|
97
|
-
# No string OR image signature was found. This can only happen if `embed` was specified and
|
|
98
|
-
# contains no matching signatures.
|
|
59
|
+
|
|
60
|
+
self.embeddings = {}
|
|
61
|
+
|
|
62
|
+
# Resolve the specific embedding functions corresponding to the user-provided embedding functions.
|
|
63
|
+
# For string embeddings, for example, `string_embed` will be used if specified; otherwise, `embed` will
|
|
64
|
+
# be used as a fallback, if it has a matching signature.
|
|
65
|
+
|
|
66
|
+
for embed_type, embed_fn in (
|
|
67
|
+
(ts.ColumnType.Type.STRING, string_embed),
|
|
68
|
+
(ts.ColumnType.Type.IMAGE, image_embed),
|
|
69
|
+
(ts.ColumnType.Type.AUDIO, audio_embed),
|
|
70
|
+
(ts.ColumnType.Type.VIDEO, video_embed),
|
|
71
|
+
):
|
|
72
|
+
if embed_fn is not None:
|
|
73
|
+
# Embedding function for the requisite type is specified directly; it MUST be valid.
|
|
74
|
+
resolved_fn = self._resolve_embedding_fn(embed_fn, embed_type)
|
|
75
|
+
if resolved_fn is None:
|
|
76
|
+
raise excs.Error(
|
|
77
|
+
f'The function `{embed_fn.name}` is not a valid {embed_type.name.lower()} '
|
|
78
|
+
f'embedding: it must take a single {embed_type.name.lower()} parameter'
|
|
79
|
+
)
|
|
80
|
+
self.embeddings[embed_type] = resolved_fn
|
|
81
|
+
elif embed is not None:
|
|
82
|
+
# General `embed` is specified; see if it has a matching signature.
|
|
83
|
+
resolved_fn = self._resolve_embedding_fn(embed, embed_type)
|
|
84
|
+
if resolved_fn is not None:
|
|
85
|
+
self.embeddings[embed_type] = resolved_fn
|
|
86
|
+
|
|
87
|
+
if len(self.embeddings) == 0:
|
|
88
|
+
# `embed` was specified and contains no matching signatures.
|
|
99
89
|
assert embed is not None
|
|
100
90
|
raise excs.Error(
|
|
101
|
-
f'The function `{embed.name}` is not a valid embedding:
|
|
91
|
+
f'The function `{embed.name}` is not a valid embedding: '
|
|
92
|
+
'it must take a single string, image, audio, or video parameter'
|
|
102
93
|
)
|
|
103
94
|
|
|
104
95
|
# Now validate the return types of the embedding functions.
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
self._validate_embedding_fn(self.string_embed)
|
|
108
|
-
|
|
109
|
-
if self.image_embed is not None:
|
|
110
|
-
self._validate_embedding_fn(self.image_embed)
|
|
111
|
-
|
|
112
|
-
if c.col_type.is_string_type() and self.string_embed is None:
|
|
113
|
-
raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
|
|
114
|
-
if c.col_type.is_image_type() and self.image_embed is None:
|
|
115
|
-
raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
|
|
96
|
+
for _, embed_fn in self.embeddings.items():
|
|
97
|
+
self._validate_embedding_fn(embed_fn)
|
|
116
98
|
|
|
117
99
|
self.metric = self.Metric[metric.upper()]
|
|
118
|
-
self.value_expr = (
|
|
119
|
-
self.string_embed(exprs.ColumnRef(c))
|
|
120
|
-
if c.col_type.is_string_type()
|
|
121
|
-
else self.image_embed(exprs.ColumnRef(c))
|
|
122
|
-
)
|
|
123
|
-
assert isinstance(self.value_expr.col_type, ts.ArrayType)
|
|
124
|
-
vector_size = self.value_expr.col_type.shape[0]
|
|
125
|
-
assert vector_size is not None
|
|
126
|
-
self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
|
|
127
100
|
|
|
128
|
-
def
|
|
129
|
-
|
|
130
|
-
|
|
101
|
+
def create_value_expr(self, c: catalog.Column) -> exprs.Expr:
|
|
102
|
+
if c.col_type._type not in (
|
|
103
|
+
ts.ColumnType.Type.STRING,
|
|
104
|
+
ts.ColumnType.Type.IMAGE,
|
|
105
|
+
ts.ColumnType.Type.AUDIO,
|
|
106
|
+
ts.ColumnType.Type.VIDEO,
|
|
107
|
+
):
|
|
108
|
+
raise excs.Error(f'Type `{c.col_type}` of column {c.name!r} is not a valid type for an embedding index.')
|
|
109
|
+
if c.col_type._type not in self.embeddings:
|
|
110
|
+
raise excs.Error(
|
|
111
|
+
f'The specified embedding function does not support the type `{c.col_type}` of column {c.name!r}.'
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
embed_fn = self.embeddings[c.col_type._type]
|
|
115
|
+
return embed_fn(exprs.ColumnRef(c))
|
|
131
116
|
|
|
132
117
|
def records_value_errors(self) -> bool:
|
|
133
118
|
return True
|
|
134
119
|
|
|
135
|
-
def
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
postgresql_with={'m': 16, 'ef_construction': 64},
|
|
146
|
-
postgresql_ops={index_value_col.sa_col.name: self.PGVECTOR_OPS[self.metric]},
|
|
120
|
+
def get_index_sa_type(self, val_col_type: ts.ColumnType) -> sql.types.TypeEngine:
|
|
121
|
+
assert isinstance(val_col_type, ts.ArrayType) and val_col_type.shape is not None
|
|
122
|
+
vector_size = val_col_type.shape[0]
|
|
123
|
+
assert vector_size is not None
|
|
124
|
+
return pgvector.sqlalchemy.Vector(vector_size)
|
|
125
|
+
|
|
126
|
+
def sa_create_stmt(self, store_index_name: str, sa_value_col: sql.Column) -> sql.Compiled:
|
|
127
|
+
"""Return a sqlalchemy statement for creating the index"""
|
|
128
|
+
return Env.get().dbms.create_vector_index_stmt(
|
|
129
|
+
store_index_name, sa_value_col, metric=self.PGVECTOR_OPS[self.metric]
|
|
147
130
|
)
|
|
148
|
-
conn = Env.get().conn
|
|
149
|
-
idx.create(bind=conn)
|
|
150
131
|
|
|
151
|
-
def
|
|
132
|
+
def drop_index(self, index_name: str, index_value_col: catalog.Column) -> None:
|
|
133
|
+
"""Drop the index on the index value column"""
|
|
134
|
+
# TODO: implement
|
|
135
|
+
raise NotImplementedError()
|
|
136
|
+
|
|
137
|
+
def similarity_clause(self, val_column: catalog.Column, item: exprs.Literal) -> sql.ColumnElement:
|
|
152
138
|
"""Create a ColumnElement that represents '<val_column> <op> <item>'"""
|
|
153
|
-
assert
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
embedding = self.string_embed.exec([item], {})
|
|
157
|
-
if isinstance(item, PIL.Image.Image):
|
|
158
|
-
assert self.image_embed is not None
|
|
159
|
-
embedding = self.image_embed.exec([item], {})
|
|
139
|
+
assert item.col_type._type in self.embeddings
|
|
140
|
+
embedding = self.embeddings[item.col_type._type].exec([item.val], {})
|
|
141
|
+
assert isinstance(embedding, np.ndarray)
|
|
160
142
|
|
|
161
143
|
if self.metric == self.Metric.COSINE:
|
|
162
144
|
return val_column.sa_col.cosine_distance(embedding) * -1 + 1
|
|
@@ -166,17 +148,11 @@ class EmbeddingIndex(IndexBase):
|
|
|
166
148
|
assert self.metric == self.Metric.L2
|
|
167
149
|
return val_column.sa_col.l2_distance(embedding)
|
|
168
150
|
|
|
169
|
-
def order_by_clause(self, val_column: catalog.Column, item:
|
|
151
|
+
def order_by_clause(self, val_column: catalog.Column, item: exprs.Literal, is_asc: bool) -> sql.ColumnElement:
|
|
170
152
|
"""Create a ColumnElement that is used in an ORDER BY clause"""
|
|
171
|
-
assert
|
|
172
|
-
embedding
|
|
173
|
-
|
|
174
|
-
assert self.string_embed is not None
|
|
175
|
-
embedding = self.string_embed.exec([item], {})
|
|
176
|
-
if isinstance(item, PIL.Image.Image):
|
|
177
|
-
assert self.image_embed is not None
|
|
178
|
-
embedding = self.image_embed.exec([item], {})
|
|
179
|
-
assert embedding is not None
|
|
153
|
+
assert item.col_type._type in self.embeddings
|
|
154
|
+
embedding = self.embeddings[item.col_type._type].exec([item.val], {})
|
|
155
|
+
assert isinstance(embedding, np.ndarray)
|
|
180
156
|
|
|
181
157
|
if self.metric == self.Metric.COSINE:
|
|
182
158
|
result = val_column.sa_col.cosine_distance(embedding)
|
|
@@ -194,9 +170,7 @@ class EmbeddingIndex(IndexBase):
|
|
|
194
170
|
return 'embedding'
|
|
195
171
|
|
|
196
172
|
@classmethod
|
|
197
|
-
def _resolve_embedding_fn(
|
|
198
|
-
cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type
|
|
199
|
-
) -> Optional[func.Function]:
|
|
173
|
+
def _resolve_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> func.Function | None:
|
|
200
174
|
"""Find an overload resolution for `embed_fn` that matches the given type."""
|
|
201
175
|
assert isinstance(embed_fn, func.Function)
|
|
202
176
|
for resolved_fn in embed_fn._resolved_fns:
|
|
@@ -243,14 +217,22 @@ class EmbeddingIndex(IndexBase):
|
|
|
243
217
|
)
|
|
244
218
|
|
|
245
219
|
def as_dict(self) -> dict:
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
220
|
+
d: dict[str, Any] = {'metric': self.metric.name.lower()}
|
|
221
|
+
for embed_type, embed_fn in self.embeddings.items():
|
|
222
|
+
key = f'{embed_type.name.lower()}_embed'
|
|
223
|
+
d[key] = embed_fn.as_dict()
|
|
224
|
+
return d
|
|
251
225
|
|
|
252
226
|
@classmethod
|
|
253
|
-
def from_dict(cls,
|
|
254
|
-
string_embed = func.Function.from_dict(d['string_embed']) if d
|
|
255
|
-
image_embed = func.Function.from_dict(d['image_embed']) if d
|
|
256
|
-
|
|
227
|
+
def from_dict(cls, d: dict) -> EmbeddingIndex:
|
|
228
|
+
string_embed = func.Function.from_dict(d['string_embed']) if d.get('string_embed') is not None else None
|
|
229
|
+
image_embed = func.Function.from_dict(d['image_embed']) if d.get('image_embed') is not None else None
|
|
230
|
+
audio_embed = func.Function.from_dict(d['audio_embed']) if d.get('audio_embed') is not None else None
|
|
231
|
+
video_embed = func.Function.from_dict(d['video_embed']) if d.get('video_embed') is not None else None
|
|
232
|
+
return cls(
|
|
233
|
+
metric=d['metric'],
|
|
234
|
+
string_embed=string_embed,
|
|
235
|
+
image_embed=image_embed,
|
|
236
|
+
audio_embed=audio_embed,
|
|
237
|
+
video_embed=video_embed,
|
|
238
|
+
)
|
pixeltable/io/__init__.py
CHANGED
|
@@ -1,14 +1,16 @@
|
|
|
1
|
+
"""Functions for importing and exporting Pixeltable data."""
|
|
1
2
|
# ruff: noqa: F401
|
|
2
3
|
|
|
3
4
|
from .datarows import import_json, import_rows
|
|
4
|
-
from .external_store import ExternalStore
|
|
5
|
+
from .external_store import ExternalStore
|
|
5
6
|
from .globals import create_label_studio_project, export_images_as_fo_dataset
|
|
6
7
|
from .hf_datasets import import_huggingface_dataset
|
|
8
|
+
from .lancedb import export_lancedb
|
|
7
9
|
from .pandas import import_csv, import_excel, import_pandas
|
|
8
10
|
from .parquet import export_parquet, import_parquet
|
|
9
11
|
|
|
10
12
|
__default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
|
|
11
|
-
__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows'}
|
|
13
|
+
__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows', 'lancedb'}
|
|
12
14
|
__all__ = sorted(__default_dir - __removed_symbols)
|
|
13
15
|
|
|
14
16
|
|
pixeltable/io/datarows.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Any, Iterable
|
|
3
|
+
from typing import Any, Iterable
|
|
4
4
|
|
|
5
5
|
import pixeltable as pxt
|
|
6
6
|
import pixeltable.type_system as ts
|
|
@@ -8,7 +8,7 @@ from pixeltable import exceptions as excs
|
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
def _infer_schema_from_rows(
|
|
11
|
-
rows: Iterable[dict[str, Any]], schema_overrides: dict[str,
|
|
11
|
+
rows: Iterable[dict[str, Any]], schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
|
|
12
12
|
) -> dict[str, ts.ColumnType]:
|
|
13
13
|
schema: dict[str, ts.ColumnType] = {}
|
|
14
14
|
cols_with_nones: set[str] = set()
|
|
@@ -20,6 +20,7 @@ def _infer_schema_from_rows(
|
|
|
20
20
|
# in which the column names are encountered in the input data, even if `schema_overrides`
|
|
21
21
|
# is specified.
|
|
22
22
|
if col_name not in schema:
|
|
23
|
+
assert isinstance(schema_overrides[col_name], ts.ColumnType)
|
|
23
24
|
schema[col_name] = schema_overrides[col_name]
|
|
24
25
|
elif value is not None:
|
|
25
26
|
# If `key` is not in `schema_overrides`, then we infer its type from the data.
|
|
@@ -33,7 +34,7 @@ def _infer_schema_from_rows(
|
|
|
33
34
|
if col_name not in schema:
|
|
34
35
|
schema[col_name] = col_type
|
|
35
36
|
else:
|
|
36
|
-
supertype = schema[col_name].supertype(col_type)
|
|
37
|
+
supertype = schema[col_name].supertype(col_type, for_inference=True)
|
|
37
38
|
if supertype is None:
|
|
38
39
|
raise excs.Error(
|
|
39
40
|
f'Could not infer type of column `{col_name}`; the value in row {n} '
|
|
@@ -59,8 +60,8 @@ def import_rows(
|
|
|
59
60
|
tbl_path: str,
|
|
60
61
|
rows: list[dict[str, Any]],
|
|
61
62
|
*,
|
|
62
|
-
schema_overrides:
|
|
63
|
-
primary_key:
|
|
63
|
+
schema_overrides: dict[str, Any] | None = None,
|
|
64
|
+
primary_key: str | list[str] | None = None,
|
|
64
65
|
num_retained_versions: int = 10,
|
|
65
66
|
comment: str = '',
|
|
66
67
|
) -> pxt.Table:
|
|
@@ -103,8 +104,8 @@ def import_json(
|
|
|
103
104
|
tbl_path: str,
|
|
104
105
|
filepath_or_url: str,
|
|
105
106
|
*,
|
|
106
|
-
schema_overrides:
|
|
107
|
-
primary_key:
|
|
107
|
+
schema_overrides: dict[str, Any] | None = None,
|
|
108
|
+
primary_key: str | list[str] | None = None,
|
|
108
109
|
num_retained_versions: int = 10,
|
|
109
110
|
comment: str = '',
|
|
110
111
|
**kwargs: Any,
|