pixeltable 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff shows the content of publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
- pixeltable/__init__.py +1 -1
- pixeltable/_version.py +1 -0
- pixeltable/catalog/catalog.py +144 -118
- pixeltable/catalog/column.py +104 -115
- pixeltable/catalog/globals.py +1 -2
- pixeltable/catalog/insertable_table.py +44 -49
- pixeltable/catalog/path.py +3 -4
- pixeltable/catalog/schema_object.py +4 -4
- pixeltable/catalog/table.py +139 -124
- pixeltable/catalog/table_metadata.py +6 -6
- pixeltable/catalog/table_version.py +315 -246
- pixeltable/catalog/table_version_handle.py +4 -4
- pixeltable/catalog/table_version_path.py +9 -10
- pixeltable/catalog/tbl_ops.py +9 -3
- pixeltable/catalog/view.py +34 -28
- pixeltable/config.py +14 -10
- pixeltable/dataframe.py +69 -78
- pixeltable/env.py +78 -64
- pixeltable/exec/aggregation_node.py +6 -6
- pixeltable/exec/cache_prefetch_node.py +10 -10
- pixeltable/exec/data_row_batch.py +3 -3
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +5 -5
- pixeltable/exec/expr_eval/evaluators.py +6 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
- pixeltable/exec/expr_eval/globals.py +6 -6
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +11 -11
- pixeltable/exec/in_memory_data_node.py +2 -2
- pixeltable/exec/object_store_save_node.py +14 -17
- pixeltable/exec/sql_node.py +28 -27
- pixeltable/exprs/arithmetic_expr.py +4 -4
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/column_property_ref.py +3 -3
- pixeltable/exprs/column_ref.py +61 -74
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +3 -3
- pixeltable/exprs/data_row.py +12 -12
- pixeltable/exprs/expr.py +41 -31
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +3 -3
- pixeltable/exprs/function_call.py +14 -14
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +8 -8
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +6 -6
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +14 -14
- pixeltable/exprs/rowid_ref.py +8 -8
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +2 -2
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +3 -3
- pixeltable/func/function.py +15 -17
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +2 -2
- pixeltable/func/query_template_function.py +16 -16
- pixeltable/func/signature.py +14 -14
- pixeltable/func/tools.py +11 -11
- pixeltable/func/udf.py +16 -18
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +7 -7
- pixeltable/functions/audio.py +76 -0
- pixeltable/functions/bedrock.py +6 -6
- pixeltable/functions/deepseek.py +4 -4
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +6 -6
- pixeltable/functions/globals.py +12 -12
- pixeltable/functions/groq.py +4 -4
- pixeltable/functions/huggingface.py +1033 -6
- pixeltable/functions/image.py +7 -10
- pixeltable/functions/llama_cpp.py +7 -7
- pixeltable/functions/math.py +2 -3
- pixeltable/functions/mistralai.py +3 -3
- pixeltable/functions/ollama.py +9 -9
- pixeltable/functions/openai.py +21 -21
- pixeltable/functions/openrouter.py +7 -7
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +7 -8
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/video.py +36 -31
- pixeltable/functions/vision.py +6 -6
- pixeltable/functions/whisper.py +7 -7
- pixeltable/functions/whisperx.py +16 -16
- pixeltable/globals.py +75 -40
- pixeltable/index/base.py +12 -8
- pixeltable/index/btree.py +19 -22
- pixeltable/index/embedding_index.py +30 -39
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/external_store.py +13 -16
- pixeltable/io/fiftyone.py +5 -5
- pixeltable/io/globals.py +5 -5
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +12 -12
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +12 -12
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/audio.py +2 -2
- pixeltable/iterators/document.py +88 -57
- pixeltable/iterators/video.py +66 -37
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_38.py +2 -2
- pixeltable/metadata/converters/convert_39.py +1 -2
- pixeltable/metadata/converters/util.py +11 -13
- pixeltable/metadata/schema.py +22 -21
- pixeltable/metadata/utils.py +2 -6
- pixeltable/mypy/mypy_plugin.py +5 -5
- pixeltable/plan.py +32 -34
- pixeltable/share/packager.py +7 -7
- pixeltable/share/publish.py +3 -3
- pixeltable/store.py +126 -41
- pixeltable/type_system.py +43 -46
- pixeltable/utils/__init__.py +1 -2
- pixeltable/utils/arrow.py +4 -4
- pixeltable/utils/av.py +74 -38
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +1 -2
- pixeltable/utils/dbms.py +15 -19
- pixeltable/utils/description_helper.py +2 -3
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +2 -2
- pixeltable/utils/filecache.py +5 -5
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +9 -9
- pixeltable/utils/local_store.py +17 -17
- pixeltable/utils/object_stores.py +59 -43
- pixeltable/utils/s3_store.py +35 -30
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/METADATA +4 -4
- pixeltable-0.4.19.dist-info/RECORD +213 -0
- pixeltable/__version__.py +0 -3
- pixeltable-0.4.17.dist-info/RECORD +0 -211
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
pixeltable/index/base.py
CHANGED

@@ -5,7 +5,9 @@ from typing import Any

 import sqlalchemy as sql

-
+import pixeltable.catalog as catalog
+import pixeltable.exprs as exprs
+import pixeltable.type_system as ts


 class IndexBase(abc.ABC):
@@ -18,12 +20,14 @@ class IndexBase(abc.ABC):
     """

     @abc.abstractmethod
-    def __init__(self,
+    def __init__(self, **kwargs: Any):
         pass

     @abc.abstractmethod
-    def
-    """
+    def create_value_expr(self, c: catalog.Column) -> exprs.Expr:
+        """
+        Validates that the index can be created on column c and returns an expression that computes the index value.
+        """
         pass

     @abc.abstractmethod
@@ -32,13 +36,13 @@ class IndexBase(abc.ABC):
         pass

     @abc.abstractmethod
-    def
+    def get_index_sa_type(self, value_col_type: ts.ColumnType) -> sql.types.TypeEngine:
         """Return the sqlalchemy type of the index value column"""
         pass

     @abc.abstractmethod
-    def
-    """
+    def sa_index(self, index_name: str, index_value_col: catalog.Column) -> sql.Index:
+        """Return a sqlalchemy Index instance"""
         pass

     @abc.abstractmethod
@@ -57,5 +61,5 @@ class IndexBase(abc.ABC):

     @classmethod
     @abc.abstractmethod
-    def from_dict(cls,
+    def from_dict(cls, d: dict) -> IndexBase:
         pass
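The net effect of these hunks is a new abstract contract: the column is supplied per call rather than at construction time, create_value_expr both validates the column and builds the index value expression, and sa_index now returns a sqlalchemy Index rather than creating one. A minimal, self-contained paraphrase of that contract, not the pixeltable class itself; internal pixeltable types are stood in by Any so the sketch runs on its own:

import abc
from typing import Any


class IndexContractSketch(abc.ABC):
    """Paraphrase of the revised IndexBase contract as it appears in this diff."""

    @abc.abstractmethod
    def __init__(self, **kwargs: Any): ...

    @abc.abstractmethod
    def create_value_expr(self, c: Any) -> Any:
        """Validate that the index can be created on column c; return the index value expression."""

    @abc.abstractmethod
    def get_index_sa_type(self, value_col_type: Any) -> Any:
        """Return the sqlalchemy type of the index value column."""

    @abc.abstractmethod
    def sa_index(self, index_name: str, index_value_col: Any) -> Any:
        """Return a sqlalchemy Index instance; creating it is now left to the caller."""

    @classmethod
    @abc.abstractmethod
    def from_dict(cls, d: dict) -> 'IndexContractSketch': ...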
pixeltable/index/btree.py
CHANGED

@@ -1,18 +1,18 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING

 import sqlalchemy as sql

 # TODO: why does this import result in a circular import, but the one im embedding_index.py doesn't?
 # import pixeltable.catalog as catalog
 import pixeltable.exceptions as excs
-
-
+import pixeltable.exprs as exprs
+import pixeltable.type_system as ts
 from pixeltable.func.udf import udf

 from .base import IndexBase

 if TYPE_CHECKING:
-    import pixeltable.
+    import pixeltable.catalog as catalog


 class BtreeIndex(IndexBase):
@@ -22,42 +22,39 @@ class BtreeIndex(IndexBase):

     MAX_STRING_LEN = 256

-    value_expr: 'pixeltable.exprs.Expr'
-
     @staticmethod
     @udf
-    def str_filter(s:
+    def str_filter(s: str | None) -> str | None:
         if s is None:
             return None
         return s[: BtreeIndex.MAX_STRING_LEN]

-    def __init__(self
+    def __init__(self) -> None:
+        pass
+
+    def create_value_expr(self, c: 'catalog.Column') -> 'exprs.Expr':
         if not c.col_type.is_scalar_type() and not c.col_type.is_media_type():
             raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')
+        value_expr: exprs.Expr
         if c.col_type.is_media_type():
             # an index on a media column is an index on the file url
             # no validation for media columns: we're only interested in the string value
-
+            value_expr = exprs.ColumnRef(c, perform_validation=False)
         else:
-
+            value_expr = (
                 BtreeIndex.str_filter(exprs.ColumnRef(c)) if c.col_type.is_string_type() else exprs.ColumnRef(c)
             )
-
-    def index_value_expr(self) -> 'exprs.Expr':
-        return self.value_expr
+        return value_expr

     def records_value_errors(self) -> bool:
         return False

-    def
+    def get_index_sa_type(self, val_col_type: ts.ColumnType) -> sql.types.TypeEngine:
         """Return the sqlalchemy type of the index value column"""
-        return
+        return val_col_type.to_sa_type()

-    def
-
-        idx = sql.Index(index_name, index_value_col.sa_col, postgresql_using='btree')
-        conn = Env.get().conn
-        idx.create(bind=conn)
+    def sa_index(self, store_index_name: str, index_value_col: 'catalog.Column') -> sql.Index:
+        return sql.Index(store_index_name, index_value_col.sa_col, postgresql_using='btree')

     def drop_index(self, index_name: str, index_value_col: 'catalog.Column') -> None:
         """Drop the index on the index value column"""
@@ -72,5 +69,5 @@ class BtreeIndex(IndexBase):
         return {}

     @classmethod
-    def from_dict(cls,
-        return cls(
+    def from_dict(cls, d: dict) -> 'BtreeIndex':
+        return cls()
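BtreeIndex follows that contract: the cached value_expr attribute and the index_value_expr() accessor are gone, and sa_index returns the sql.Index instead of creating it against Env.get().conn. A rough sketch of how the pieces compose under the new interface, based only on the signatures visible in this diff; col, index_value_col, and the index name are hypothetical stand-ins for pixeltable catalog objects:

import sqlalchemy as sql

from pixeltable.index.btree import BtreeIndex


def build_btree_index(col, index_value_col, store_index_name: str) -> sql.Index:
    # The column is no longer passed to __init__; it is supplied per call instead.
    idx = BtreeIndex()
    # Validates the column type; returns ColumnRef(col), wrapped in str_filter() for string columns.
    value_expr = idx.create_value_expr(col)
    # The SQL type is now derived from the value column's type rather than from stored state.
    sa_type = idx.get_index_sa_type(value_expr.col_type)
    # Returns the Index object; actually creating it in the store is left to the caller.
    return idx.sa_index(store_index_name, index_value_col)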
pixeltable/index/embedding_index.py
CHANGED

@@ -1,16 +1,18 @@
 from __future__ import annotations

 import enum
-from typing import Any, ClassVar
+from typing import Any, ClassVar

 import numpy as np
 import pgvector.sqlalchemy  # type: ignore[import-untyped]
 import PIL.Image
 import sqlalchemy as sql

+import pixeltable.catalog as catalog
 import pixeltable.exceptions as excs
+import pixeltable.exprs as exprs
+import pixeltable.func as func
 import pixeltable.type_system as ts
-from pixeltable import catalog, exprs, func
 from pixeltable.env import Env

 from .base import IndexBase
@@ -39,28 +41,23 @@ class EmbeddingIndex(IndexBase):
     }

     metric: Metric
-
-
-    image_embed: Optional[func.Function]
+    string_embed: func.Function | None
+    image_embed: func.Function | None
     string_embed_signature_idx: int
     image_embed_signature_idx: int
-    index_col_type: pgvector.sqlalchemy.Vector

     def __init__(
         self,
-        c: catalog.Column,
         metric: str,
-        embed:
-        string_embed:
-        image_embed:
+        embed: func.Function | None = None,
+        string_embed: func.Function | None = None,
+        image_embed: func.Function | None = None,
     ):
         if embed is None and string_embed is None and image_embed is None:
             raise excs.Error('At least one of `embed`, `string_embed`, or `image_embed` must be specified')
         metric_names = [m.name.lower() for m in self.Metric]
         if metric.lower() not in metric_names:
             raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
-        if not c.col_type.is_string_type() and not c.col_type.is_image_type():
-            raise excs.Error('Embedding index requires string or image column')

         self.string_embed = None
         self.image_embed = None
@@ -102,47 +99,42 @@ class EmbeddingIndex(IndexBase):
         )

         # Now validate the return types of the embedding functions.
-
         if self.string_embed is not None:
             self._validate_embedding_fn(self.string_embed)
-
         if self.image_embed is not None:
             self._validate_embedding_fn(self.image_embed)

+        self.metric = self.Metric[metric.upper()]
+
+    def create_value_expr(self, c: catalog.Column) -> exprs.Expr:
+        if not c.col_type.is_string_type() and not c.col_type.is_image_type():
+            raise excs.Error(
+                f'Embedding index requires string or image column, column {c.name!r} has type {c.col_type}'
+            )
         if c.col_type.is_string_type() and self.string_embed is None:
             raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
         if c.col_type.is_image_type() and self.image_embed is None:
             raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")

-
-        self.value_expr = (
+        return (
             self.string_embed(exprs.ColumnRef(c))
             if c.col_type.is_string_type()
             else self.image_embed(exprs.ColumnRef(c))
         )
-        assert isinstance(self.value_expr.col_type, ts.ArrayType)
-        vector_size = self.value_expr.col_type.shape[0]
-        assert vector_size is not None
-        self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
-
-    def index_value_expr(self) -> exprs.Expr:
-        """Return expression that computes the value that goes into the index"""
-        return self.value_expr

     def records_value_errors(self) -> bool:
         return True

-    def
-
-
+    def get_index_sa_type(self, val_col_type: ts.ColumnType) -> sql.types.TypeEngine:
+        assert isinstance(val_col_type, ts.ArrayType) and val_col_type.shape is not None
+        vector_size = val_col_type.shape[0]
+        assert vector_size is not None
+        return pgvector.sqlalchemy.Vector(vector_size)

-    def
+    def sa_index(self, store_index_name: str, index_value_col: 'catalog.Column') -> sql.Index:
         """Create the index on the index value column"""
-        Env.get().dbms.
-
-            index_value_sa_col=index_value_col.sa_col,
-            conn=Env.get().conn,
-            metric=self.PGVECTOR_OPS[self.metric],
+        return Env.get().dbms.sa_vector_index(
+            store_index_name, index_value_col.sa_col, metric=self.PGVECTOR_OPS[self.metric]
        )

     def drop_index(self, index_name: str, index_value_col: catalog.Column) -> None:
@@ -153,6 +145,7 @@ class EmbeddingIndex(IndexBase):
     def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ColumnElement:
         """Create a ColumnElement that represents '<val_column> <op> <item>'"""
         assert isinstance(item, (str, PIL.Image.Image))
+        embedding: np.ndarray
         if isinstance(item, str):
             assert self.string_embed is not None
             embedding = self.string_embed.exec([item], {})
@@ -171,7 +164,7 @@ class EmbeddingIndex(IndexBase):
     def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.ColumnElement:
         """Create a ColumnElement that is used in an ORDER BY clause"""
         assert isinstance(item, (str, PIL.Image.Image))
-        embedding:
+        embedding: np.ndarray | None = None
         if isinstance(item, str):
             assert self.string_embed is not None
             embedding = self.string_embed.exec([item], {})
@@ -196,9 +189,7 @@ class EmbeddingIndex(IndexBase):
         return 'embedding'

     @classmethod
-    def _resolve_embedding_fn(
-        cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type
-    ) -> Optional[func.Function]:
+    def _resolve_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> func.Function | None:
         """Find an overload resolution for `embed_fn` that matches the given type."""
         assert isinstance(embed_fn, func.Function)
         for resolved_fn in embed_fn._resolved_fns:
@@ -252,7 +243,7 @@ class EmbeddingIndex(IndexBase):
     }

     @classmethod
-    def from_dict(cls,
+    def from_dict(cls, d: dict) -> EmbeddingIndex:
         string_embed = func.Function.from_dict(d['string_embed']) if d['string_embed'] is not None else None
         image_embed = func.Function.from_dict(d['image_embed']) if d['image_embed'] is not None else None
-        return cls(
+        return cls(metric=d['metric'], string_embed=string_embed, image_embed=image_embed)
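The same pattern applies here: the column argument is removed from __init__ (along with the cached value_expr and index_col_type), create_value_expr picks the string or image embedding based on the column type, and get_index_sa_type builds the pgvector Vector from the value column's array shape. A hedged sketch of how these pieces compose, derived only from the signatures in this diff; col, index_value_col, embed_fn, and the 'cosine' metric name are assumptions rather than excerpts from pixeltable:

import sqlalchemy as sql

from pixeltable.index.embedding_index import EmbeddingIndex


def build_embedding_index(col, index_value_col, embed_fn, store_index_name: str) -> sql.Index:
    # Only the metric and embedding functions are configured up front now
    # (assuming 'cosine' is one of the supported Metric names).
    idx = EmbeddingIndex(metric='cosine', string_embed=embed_fn)
    # Validates that col is a string/image column with a matching embedding fn and
    # returns string_embed(ColumnRef(col)) or image_embed(ColumnRef(col)).
    value_expr = idx.create_value_expr(col)
    # pgvector.sqlalchemy.Vector sized from the value column's array shape.
    vector_type = idx.get_index_sa_type(value_expr.col_type)
    # Delegates to Env.get().dbms.sa_vector_index(...) and returns the sql.Index.
    return idx.sa_index(store_index_name, index_value_col)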
pixeltable/io/datarows.py
CHANGED

@@ -1,6 +1,6 @@
 from __future__ import annotations

-from typing import Any, Iterable
+from typing import Any, Iterable

 import pixeltable as pxt
 import pixeltable.type_system as ts
@@ -60,7 +60,7 @@ def import_rows(
     tbl_path: str,
     rows: list[dict[str, Any]],
     *,
-    schema_overrides:
+    schema_overrides: dict[str, Any] | None = None,
     primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
     comment: str = '',
@@ -104,7 +104,7 @@ def import_json(
     tbl_path: str,
     filepath_or_url: str,
     *,
-    schema_overrides:
+    schema_overrides: dict[str, Any] | None = None,
     primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
     comment: str = '',
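Beyond the typing cleanup (Optional[...] becomes ... | None), the signature is unchanged. A hedged usage example against the import_rows signature shown above; the table path, column names, and the use of pxt.Float as an override value are assumptions:

import pixeltable as pxt
from pixeltable.io.datarows import import_rows

rows = [
    {'sku': 'A-1', 'price': 9.99},
    {'sku': 'B-2', 'price': None},  # nullable column
]
tbl = import_rows(
    'demo_rows',
    rows,
    schema_overrides={'price': pxt.Float},  # assumed override form: column name -> pixeltable type
    primary_key='sku',
    comment='imported via import_rows',
)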
pixeltable/io/external_store.py
CHANGED

@@ -3,7 +3,7 @@ from __future__ import annotations
 import abc
 import itertools
 import logging
-from typing import Any
+from typing import Any

 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
@@ -68,10 +68,7 @@ class Project(ExternalStore, abc.ABC):
     stored_proxies: dict[ColumnHandle, ColumnHandle]  # original col -> proxy col

     def __init__(
-        self,
-        name: str,
-        col_mapping: dict[ColumnHandle, str],
-        stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]],
+        self, name: str, col_mapping: dict[ColumnHandle, str], stored_proxies: dict[ColumnHandle, ColumnHandle] | None
     ):
         super().__init__(name)
         self._col_mapping = col_mapping
@@ -190,7 +187,7 @@ class Project(ExternalStore, abc.ABC):
         table: Table,
         export_cols: dict[str, ts.ColumnType],
         import_cols: dict[str, ts.ColumnType],
-        col_mapping:
+        col_mapping: dict[str, str] | None,
     ) -> dict[ColumnHandle, str]:
         """
         Verifies that the specified `col_mapping` is valid. In particular, checks that:
@@ -217,19 +214,19 @@ class Project(ExternalStore, abc.ABC):
             if t_col not in t_cols:
                 if is_user_specified_col_mapping:
                     raise excs.Error(
-                        f'Column name
+                        f'Column name {t_col!r} appears as a key in `col_mapping`, but {table._display_str()} '
                         'contains no such column.'
                     )
                 else:
                     raise excs.Error(
-                        f'Column
+                        f'Column {t_col!r} does not exist in {table._display_str()}. Either add a column {t_col!r}, '
                         f'or specify a `col_mapping` to associate a different column with '
-                        f'the external field
+                        f'the external field {ext_col!r}.'
                     )
             if ext_col not in export_cols and ext_col not in import_cols:
                 raise excs.Error(
-                    f'Column name
-                    f'configuration has no column
+                    f'Column name {ext_col!r} appears as a value in `col_mapping`, but the external store '
+                    f'configuration has no column {ext_col!r}.'
                 )
             col_ref = table[t_col]
             assert isinstance(col_ref, exprs.ColumnRef)
@@ -244,19 +241,19 @@ class Project(ExternalStore, abc.ABC):
                 ext_col_type = export_cols[ext_col]
                 if not ext_col_type.is_supertype_of(t_col_type, ignore_nullable=True):
                     raise excs.Error(
-                        f'Column
+                        f'Column {t_col!r} cannot be exported to external column {ext_col!r} '
                         f'(incompatible types; expecting `{ext_col_type}`)'
                     )
             if ext_col in import_cols:
                 # Validate that the external column can be assigned to the table column
                 if table._tbl_version_path.get_column(t_col).is_computed:
                     raise excs.Error(
-                        f'Column
+                        f'Column {t_col!r} is a computed column, which cannot be populated from an external column'
                     )
                 ext_col_type = import_cols[ext_col]
                 if not t_col_type.is_supertype_of(ext_col_type, ignore_nullable=True):
                     raise excs.Error(
-                        f'Column
+                        f'Column {t_col!r} cannot be imported from external column {ext_col!r} '
                         f'(incompatible types; expecting `{ext_col_type}`)'
                     )
         return resolved_col_mapping
@@ -271,7 +268,7 @@ class MockProject(Project):
         export_cols: dict[str, ts.ColumnType],
         import_cols: dict[str, ts.ColumnType],
         col_mapping: dict[ColumnHandle, str],
-        stored_proxies:
+        stored_proxies: dict[ColumnHandle, ColumnHandle] | None = None,
     ):
         super().__init__(name, col_mapping, stored_proxies)
         self.export_cols = export_cols
@@ -285,7 +282,7 @@ class MockProject(Project):
         name: str,
         export_cols: dict[str, ts.ColumnType],
         import_cols: dict[str, ts.ColumnType],
-        col_mapping:
+        col_mapping: dict[str, str] | None = None,
     ) -> 'MockProject':
         col_mapping = cls.validate_columns(t, export_cols, import_cols, col_mapping)
         return cls(name, export_cols, import_cols, col_mapping)
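The reworked error messages spell out the col_mapping convention that validate_columns enforces: keys are table column names, values are external-store field names; exported columns need a compatible (supertype) external type, and computed columns cannot be the target of an import. A tiny illustration with made-up names:

# Made-up column and field names, purely to show the shape of a col_mapping dict.
col_mapping = {
    'frame': 'image',        # table column 'frame' is exported to the external field 'image'
    'annotations': 'label',  # external field 'label' is imported into 'annotations' (must not be computed)
}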
pixeltable/io/fiftyone.py
CHANGED

@@ -1,5 +1,5 @@
 import os
-from typing import Any, Iterator
+from typing import Any, Iterator

 import fiftyone as fo  # type: ignore[import-untyped]
 import fiftyone.utils.data as foud  # type: ignore[import-untyped]
@@ -20,7 +20,7 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
     __image_format: str  # format to use for any exported images that are not already stored on disk
     __labels: dict[str, tuple[exprs.Expr, type[fo.Label]]]  # label_name -> (expr, label_cls)
     __image_idx: int  # index of the image expr in the select list
-    __localpath_idx:
+    __localpath_idx: int | None  # index of the image localpath in the select list, if present
     __row_iter: Iterator[list]  # iterator over the table rows, to be convered to FiftyOne samples

     def __init__(
@@ -30,10 +30,10 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
         image_format: str,
         classifications: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
         detections: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
-        dataset_dir:
+        dataset_dir: os.PathLike | None = None,
         shuffle: bool = False,
         seed: int | float | str | bytes | bytearray | None = None,
-        max_samples:
+        max_samples: int | None = None,
     ):
         super().__init__(dataset_dir=dataset_dir, shuffle=shuffle, seed=seed, max_samples=max_samples)

@@ -90,7 +90,7 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
         df = tbl.select(*selection)
         self.__row_iter = df._output_row_iterator()

-    def __next__(self) -> tuple[str,
+    def __next__(self) -> tuple[str, fo.ImageMetadata | None, dict[str, fo.Label] | None]:
         row = next(self.__row_iter)
         img = row[self.__image_idx]
         assert isinstance(img, PIL.Image.Image)
pixeltable/io/globals.py
CHANGED

@@ -1,6 +1,6 @@
 from __future__ import annotations

-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, Literal

 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -15,12 +15,12 @@ if TYPE_CHECKING:
 def create_label_studio_project(
     t: Table,
     label_config: str,
-    name:
-    title:
+    name: str | None = None,
+    title: str | None = None,
     media_import_method: Literal['post', 'file', 'url'] = 'post',
-    col_mapping:
+    col_mapping: dict[str, str] | None = None,
     sync_immediately: bool = True,
-    s3_configuration:
+    s3_configuration: dict[str, Any] | None = None,
     **kwargs: Any,
 ) -> UpdateStatus:
     """
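A hedged usage sketch matching the signature above; the table name, label_config XML, and column names are invented for illustration, and the function may also be re-exported under pixeltable.io:

import pixeltable as pxt
from pixeltable.io.globals import create_label_studio_project

t = pxt.get_table('demo_images')  # assumes this table exists and has an 'image' column
label_config = """
<View>
  <Image name="image" value="$image"/>
  <Choices name="class" toName="image">
    <Choice value="cat"/>
    <Choice value="dog"/>
  </Choices>
</View>
"""
create_label_studio_project(
    t,
    label_config,
    media_import_method='url',
    col_mapping={'image': 'image'},  # table column -> Label Studio data field
    sync_immediately=False,
)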
pixeltable/io/hf_datasets.py
CHANGED

@@ -1,7 +1,7 @@
 from __future__ import annotations

 import typing
-from typing import Any
+from typing import Any

 import pixeltable as pxt
 import pixeltable.type_system as ts
@@ -36,7 +36,7 @@ _hf_to_pxt: dict[str, ts.ColumnType] = {
 }


-def _to_pixeltable_type(feature_type: Any, nullable: bool) ->
+def _to_pixeltable_type(feature_type: Any, nullable: bool) -> ts.ColumnType | None:
     """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
     import datasets

@@ -76,7 +76,7 @@ def _get_hf_schema(dataset: datasets.Dataset | datasets.DatasetDict) -> datasets

 def huggingface_schema_to_pxt_schema(
     hf_schema: datasets.Features, schema_overrides: dict[str, Any], primary_key: list[str]
-) -> dict[str,
+) -> dict[str, ts.ColumnType | None]:
     """Generate a pixeltable schema from a huggingface dataset schema.
     Columns without a known mapping are mapped to None
     """
@@ -93,7 +93,7 @@ def import_huggingface_dataset(
     table_path: str,
     dataset: datasets.Dataset | datasets.DatasetDict,
     *,
-    schema_overrides:
+    schema_overrides: dict[str, Any] | None = None,
     primary_key: str | list[str] | None = None,
     **kwargs: Any,
 ) -> pxt.Table:
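A hedged example against the import_huggingface_dataset signature above; it assumes the datasets package is installed, that any small Hugging Face dataset will do, and that schema_overrides maps column names to pixeltable types:

import datasets
import pixeltable as pxt
from pixeltable.io.hf_datasets import import_huggingface_dataset

# 'rotten_tomatoes' is just an example of a small public dataset with 'text' and 'label' columns.
ds = datasets.load_dataset('rotten_tomatoes', split='train')
tbl = import_huggingface_dataset(
    'demo_hf',
    ds,
    schema_overrides={'text': pxt.String},  # assumed override form: column name -> pixeltable type
)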
pixeltable/io/label_studio.py
CHANGED

@@ -4,7 +4,7 @@ import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Iterator, Literal
+from typing import Any, Iterator, Literal
 from xml.etree import ElementTree as ET

 import label_studio_sdk
@@ -53,7 +53,7 @@ class LabelStudioProject(Project):

     project_id: int  # Label Studio project ID
     media_import_method: Literal['post', 'file', 'url']
-    _project:
+    _project: ls_project.Project | None

     def __init__(
         self,
@@ -61,7 +61,7 @@ class LabelStudioProject(Project):
         project_id: int,
         media_import_method: Literal['post', 'file', 'url'],
         col_mapping: dict[ColumnHandle, str],
-        stored_proxies:
+        stored_proxies: dict[ColumnHandle, ColumnHandle] | None = None,
     ):
         self.project_id = project_id
         self.media_import_method = media_import_method
@@ -278,8 +278,8 @@ class LabelStudioProject(Project):
         # columns. `rl_col_idxs` holds the indices for the columns that map to RectangleLabels
         # preannotations; `data_col_idxs` holds the indices for the columns that map to data fields.
         # We have to wait until we begin iterating to populate them, so they're initially `None`.
-        rl_col_idxs:
-        data_col_idxs:
+        rl_col_idxs: list[int] | None = None
+        data_col_idxs: list[int] | None = None

         row_ids_in_pxt: set[tuple] = set()
         tasks_created = 0
@@ -349,7 +349,7 @@ class LabelStudioProject(Project):
         return sync_status

     @classmethod
-    def __validate_fileurl(cls, col: Column, url: str) ->
+    def __validate_fileurl(cls, col: Column, url: str) -> str | None:
         # Check that the URL is one that will be visible to Label Studio. If it isn't, log an info message
         # to help users debug the issue.
         if not (url.startswith('http://') or url.startswith('https://')):
@@ -497,7 +497,7 @@ class LabelStudioProject(Project):

     @classmethod
     def __coco_to_predictions(
-        cls, coco_annotations: dict[str, Any], from_name: str, rl_info: '_RectangleLabel', task_id:
+        cls, coco_annotations: dict[str, Any], from_name: str, rl_info: '_RectangleLabel', task_id: int | None = None
     ) -> dict[str, Any]:
         width = coco_annotations['image']['width']
         height = coco_annotations['image']['height']
@@ -549,11 +549,11 @@ class LabelStudioProject(Project):
         cls,
         t: Table,
         label_config: str,
-        name:
-        title:
+        name: str | None,
+        title: str | None,
         media_import_method: Literal['post', 'file', 'url'],
-        col_mapping:
-        s3_configuration:
+        col_mapping: dict[str, str] | None,
+        s3_configuration: dict[str, Any] | None,
         **kwargs: Any,
     ) -> 'LabelStudioProject':
         """
@@ -652,7 +652,7 @@ class LabelStudioProject(Project):

 @dataclass(frozen=True)
 class _DataKey:
-    name:
+    name: str | None  # The 'name' attribute of the data key; may differ from the field name
     column_type: ts.ColumnType


pixeltable/io/pandas.py
CHANGED

@@ -1,5 +1,5 @@
 import os
-from typing import Any
+from typing import Any

 import numpy as np
 import pandas as pd
@@ -16,7 +16,7 @@ def import_pandas(
     tbl_name: str,
     df: pd.DataFrame,
     *,
-    schema_overrides:
+    schema_overrides: dict[str, Any] | None = None,
     primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
     comment: str = '',
@@ -56,7 +56,7 @@ def import_pandas(
 def import_csv(
     tbl_name: str,
     filepath_or_buffer: str | os.PathLike,
-    schema_overrides:
+    schema_overrides: dict[str, Any] | None = None,
     primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
     comment: str = '',
@@ -86,7 +86,7 @@ def import_excel(
     tbl_name: str,
     io: str | os.PathLike,
     *,
-    schema_overrides:
+    schema_overrides: dict[str, Any] | None = None,
     primary_key: str | list[str] | None = None,
     num_retained_versions: int = 10,
     comment: str = '',
@@ -141,7 +141,7 @@ def df_infer_schema(
     return pd_schema


-def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) ->
+def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> ts.ColumnType | None:
     """
     Determines a pixeltable ColumnType from a pandas dtype

@@ -192,7 +192,7 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:


 def _df_row_to_pxt_row(
-    row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping:
+    row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping: dict[str, str] | None
 ) -> dict[str, Any]:
     """Convert a row to insertable format"""
     pxt_row: dict[str, Any] = {}
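A hedged example against the import_pandas signature shown above; the table name, columns, and the use of pxt.Float as an override value are assumptions:

import pandas as pd
import pixeltable as pxt
from pixeltable.io.pandas import import_pandas

df = pd.DataFrame({'name': ['a', 'b'], 'value': [1.0, None]})
tbl = import_pandas(
    'demo_pandas',
    df,
    schema_overrides={'value': pxt.Float},  # assumed override form: column name -> pixeltable type
    primary_key='name',
)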
pixeltable/io/parquet.py
CHANGED

@@ -4,7 +4,7 @@ import json
 import logging
 import typing
 from pathlib import Path
-from typing import Any
+from typing import Any

 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -71,7 +71,7 @@ def import_parquet(
     table: str,
     *,
     parquet_path: str,
-    schema_overrides:
+    schema_overrides: dict[str, Any] | None = None,
     primary_key: str | list[str] | None = None,
     **kwargs: Any,
 ) -> pxt.Table:
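A short hedged example against the import_parquet signature above; note that parquet_path is keyword-only, and the file path here is hypothetical:

from pixeltable.io.parquet import import_parquet

tbl = import_parquet('demo_parquet', parquet_path='/tmp/events.parquet')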