pixeltable 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +64 -11
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +50 -27
- pixeltable/catalog/column.py +27 -11
- pixeltable/catalog/dir.py +6 -4
- pixeltable/catalog/globals.py +8 -1
- pixeltable/catalog/insertable_table.py +25 -15
- pixeltable/catalog/named_function.py +10 -6
- pixeltable/catalog/path.py +3 -2
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +123 -103
- pixeltable/catalog/table_version.py +292 -143
- pixeltable/catalog/table_version_path.py +8 -5
- pixeltable/catalog/view.py +68 -27
- pixeltable/dataframe.py +102 -72
- pixeltable/env.py +39 -23
- pixeltable/exec/__init__.py +2 -2
- pixeltable/exec/aggregation_node.py +10 -4
- pixeltable/exec/cache_prefetch_node.py +5 -3
- pixeltable/exec/component_iteration_node.py +9 -8
- pixeltable/exec/data_row_batch.py +21 -10
- pixeltable/exec/exec_context.py +10 -3
- pixeltable/exec/exec_node.py +23 -12
- pixeltable/exec/expr_eval/evaluators.py +18 -17
- pixeltable/exec/expr_eval/expr_eval_node.py +29 -16
- pixeltable/exec/expr_eval/globals.py +33 -11
- pixeltable/exec/expr_eval/row_buffer.py +5 -6
- pixeltable/exec/expr_eval/schedulers.py +170 -42
- pixeltable/exec/in_memory_data_node.py +8 -7
- pixeltable/exec/row_update_node.py +15 -5
- pixeltable/exec/sql_node.py +56 -27
- pixeltable/exprs/__init__.py +2 -2
- pixeltable/exprs/arithmetic_expr.py +57 -26
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +2 -1
- pixeltable/exprs/column_ref.py +20 -15
- pixeltable/exprs/comparison.py +6 -2
- pixeltable/exprs/compound_predicate.py +1 -3
- pixeltable/exprs/data_row.py +2 -2
- pixeltable/exprs/expr.py +101 -72
- pixeltable/exprs/expr_dict.py +2 -1
- pixeltable/exprs/expr_set.py +3 -1
- pixeltable/exprs/function_call.py +39 -41
- pixeltable/exprs/globals.py +1 -0
- pixeltable/exprs/in_predicate.py +2 -2
- pixeltable/exprs/inline_expr.py +20 -17
- pixeltable/exprs/json_mapper.py +4 -2
- pixeltable/exprs/json_path.py +12 -18
- pixeltable/exprs/literal.py +5 -9
- pixeltable/exprs/method_ref.py +1 -0
- pixeltable/exprs/object_ref.py +1 -1
- pixeltable/exprs/row_builder.py +31 -16
- pixeltable/exprs/rowid_ref.py +14 -5
- pixeltable/exprs/similarity_expr.py +11 -6
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/type_cast.py +24 -9
- pixeltable/ext/__init__.py +1 -0
- pixeltable/ext/functions/__init__.py +1 -0
- pixeltable/ext/functions/whisperx.py +2 -2
- pixeltable/ext/functions/yolox.py +11 -11
- pixeltable/func/aggregate_function.py +17 -13
- pixeltable/func/callable_function.py +6 -6
- pixeltable/func/expr_template_function.py +15 -14
- pixeltable/func/function.py +16 -16
- pixeltable/func/function_registry.py +11 -8
- pixeltable/func/globals.py +4 -2
- pixeltable/func/query_template_function.py +12 -13
- pixeltable/func/signature.py +18 -9
- pixeltable/func/tools.py +10 -17
- pixeltable/func/udf.py +106 -11
- pixeltable/functions/__init__.py +21 -2
- pixeltable/functions/anthropic.py +21 -15
- pixeltable/functions/fireworks.py +63 -5
- pixeltable/functions/gemini.py +13 -3
- pixeltable/functions/globals.py +18 -6
- pixeltable/functions/huggingface.py +20 -38
- pixeltable/functions/image.py +7 -3
- pixeltable/functions/json.py +1 -0
- pixeltable/functions/llama_cpp.py +1 -4
- pixeltable/functions/mistralai.py +31 -20
- pixeltable/functions/ollama.py +4 -18
- pixeltable/functions/openai.py +214 -109
- pixeltable/functions/replicate.py +11 -10
- pixeltable/functions/string.py +70 -7
- pixeltable/functions/timestamp.py +21 -8
- pixeltable/functions/together.py +66 -52
- pixeltable/functions/video.py +1 -0
- pixeltable/functions/vision.py +14 -11
- pixeltable/functions/whisper.py +2 -1
- pixeltable/globals.py +61 -28
- pixeltable/index/__init__.py +1 -1
- pixeltable/index/btree.py +5 -3
- pixeltable/index/embedding_index.py +15 -14
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +30 -25
- pixeltable/io/fiftyone.py +6 -14
- pixeltable/io/globals.py +33 -27
- pixeltable/io/hf_datasets.py +3 -2
- pixeltable/io/label_studio.py +80 -71
- pixeltable/io/pandas.py +33 -9
- pixeltable/io/parquet.py +10 -13
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +205 -0
- pixeltable/iterators/document.py +19 -8
- pixeltable/iterators/image.py +6 -24
- pixeltable/iterators/string.py +3 -6
- pixeltable/iterators/video.py +1 -7
- pixeltable/metadata/__init__.py +9 -2
- pixeltable/metadata/converters/convert_10.py +2 -2
- pixeltable/metadata/converters/convert_15.py +1 -5
- pixeltable/metadata/converters/convert_16.py +2 -4
- pixeltable/metadata/converters/convert_17.py +2 -4
- pixeltable/metadata/converters/convert_18.py +2 -4
- pixeltable/metadata/converters/convert_19.py +2 -5
- pixeltable/metadata/converters/convert_20.py +1 -4
- pixeltable/metadata/converters/convert_21.py +4 -6
- pixeltable/metadata/converters/convert_22.py +1 -0
- pixeltable/metadata/converters/convert_23.py +5 -5
- pixeltable/metadata/converters/convert_24.py +12 -13
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/util.py +3 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +13 -2
- pixeltable/plan.py +173 -98
- pixeltable/store.py +42 -26
- pixeltable/type_system.py +130 -85
- pixeltable/utils/arrow.py +1 -7
- pixeltable/utils/coco.py +16 -17
- pixeltable/utils/code.py +1 -1
- pixeltable/utils/console_output.py +44 -0
- pixeltable/utils/description_helper.py +7 -7
- pixeltable/utils/documents.py +3 -1
- pixeltable/utils/filecache.py +13 -8
- pixeltable/utils/http_server.py +9 -8
- pixeltable/utils/media_store.py +2 -1
- pixeltable/utils/pytorch.py +11 -14
- pixeltable/utils/s3.py +1 -0
- pixeltable/utils/sql.py +1 -0
- pixeltable/utils/transactional_directory.py +2 -2
- {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/METADATA +7 -8
- pixeltable-0.3.3.dist-info/RECORD +163 -0
- pixeltable-0.3.1.dist-info/RECORD +0 -160
- {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.1.dist-info → pixeltable-0.3.3.dist-info}/entry_points.txt +0 -0
pixeltable/exprs/in_predicate.py
CHANGED
|
@@ -26,7 +26,8 @@ class InPredicate(Expr):
|
|
|
26
26
|
if value_set_expr is not None:
|
|
27
27
|
if not value_set_expr.col_type.is_json_type():
|
|
28
28
|
raise excs.Error(
|
|
29
|
-
f'isin(): argument must have a JSON type, but {value_set_expr} has type {value_set_expr.col_type}'
|
|
29
|
+
f'isin(): argument must have a JSON type, but {value_set_expr} has type {value_set_expr.col_type}'
|
|
30
|
+
)
|
|
30
31
|
self.components = [lhs.copy(), value_set_expr.copy()]
|
|
31
32
|
else:
|
|
32
33
|
assert value_set_literal is not None
|
|
@@ -95,4 +96,3 @@ class InPredicate(Expr):
|
|
|
95
96
|
assert 'value_list' in d
|
|
96
97
|
assert len(components) <= 2
|
|
97
98
|
return cls(components[0], d['value_list'], components[1] if len(components) == 2 else None)
|
|
98
|
-
|
pixeltable/exprs/inline_expr.py
CHANGED
|
@@ -45,8 +45,7 @@ class InlineArray(Expr):
|
|
|
45
45
|
elif inferred_element_type.is_array_type():
|
|
46
46
|
assert isinstance(inferred_element_type, ts.ArrayType)
|
|
47
47
|
col_type = ts.ArrayType(
|
|
48
|
-
(len(exprs), *inferred_element_type.shape),
|
|
49
|
-
ts.ColumnType.make_type(inferred_element_type.dtype)
|
|
48
|
+
(len(exprs), *inferred_element_type.shape), ts.ColumnType.make_type(inferred_element_type.dtype)
|
|
50
49
|
)
|
|
51
50
|
else:
|
|
52
51
|
raise excs.Error(f'Element type is not a valid dtype for an array: {inferred_element_type}')
|
|
@@ -82,9 +81,14 @@ class InlineArray(Expr):
|
|
|
82
81
|
# loaded and their types are known.
|
|
83
82
|
return InlineList(components) # type: ignore[return-value]
|
|
84
83
|
|
|
85
|
-
def
|
|
84
|
+
def as_literal(self) -> Optional[Literal]:
|
|
86
85
|
assert isinstance(self.col_type, ts.ArrayType)
|
|
87
|
-
|
|
86
|
+
if not all(isinstance(comp, Literal) for comp in self.components):
|
|
87
|
+
return None
|
|
88
|
+
return Literal(
|
|
89
|
+
np.array([c.as_literal().val for c in self.components], dtype=self.col_type.numpy_dtype()), self.col_type
|
|
90
|
+
)
|
|
91
|
+
|
|
88
92
|
|
|
89
93
|
class InlineList(Expr):
|
|
90
94
|
"""
|
|
@@ -97,7 +101,7 @@ class InlineList(Expr):
|
|
|
97
101
|
json_schema = {
|
|
98
102
|
'type': 'array',
|
|
99
103
|
'prefixItems': [expr.col_type.to_json_schema() for expr in exprs],
|
|
100
|
-
'items': False # No additional items (fixed length)
|
|
104
|
+
'items': False, # No additional items (fixed length)
|
|
101
105
|
}
|
|
102
106
|
|
|
103
107
|
super().__init__(ts.JsonType(json_schema))
|
|
@@ -124,8 +128,11 @@ class InlineList(Expr):
|
|
|
124
128
|
def _from_dict(cls, _: dict, components: list[Expr]) -> InlineList:
|
|
125
129
|
return cls(components)
|
|
126
130
|
|
|
127
|
-
def
|
|
128
|
-
|
|
131
|
+
def as_literal(self) -> Optional[Literal]:
|
|
132
|
+
if not all(isinstance(comp, Literal) for comp in self.components):
|
|
133
|
+
return None
|
|
134
|
+
return Literal(list(c.as_literal().val for c in self.components), self.col_type)
|
|
135
|
+
|
|
129
136
|
|
|
130
137
|
class InlineDict(Expr):
|
|
131
138
|
"""
|
|
@@ -147,10 +154,7 @@ class InlineDict(Expr):
|
|
|
147
154
|
try:
|
|
148
155
|
json_schema = {
|
|
149
156
|
'type': 'object',
|
|
150
|
-
'properties': {
|
|
151
|
-
key: expr.col_type.to_json_schema()
|
|
152
|
-
for key, expr in zip(self.keys, exprs)
|
|
153
|
-
},
|
|
157
|
+
'properties': {key: expr.col_type.to_json_schema() for key, expr in zip(self.keys, exprs)},
|
|
154
158
|
}
|
|
155
159
|
except excs.Error:
|
|
156
160
|
# InlineDicts are used to store iterator arguments, which are not required to be valid JSON types,
|
|
@@ -177,10 +181,7 @@ class InlineDict(Expr):
|
|
|
177
181
|
|
|
178
182
|
def eval(self, data_row: DataRow, _: RowBuilder) -> None:
|
|
179
183
|
assert len(self.keys) == len(self.components)
|
|
180
|
-
data_row[self.slot_idx] = {
|
|
181
|
-
key: data_row[expr.slot_idx]
|
|
182
|
-
for key, expr in zip(self.keys, self.components)
|
|
183
|
-
}
|
|
184
|
+
data_row[self.slot_idx] = {key: data_row[expr.slot_idx] for key, expr in zip(self.keys, self.components)}
|
|
184
185
|
|
|
185
186
|
def to_kwargs(self) -> dict[str, Any]:
|
|
186
187
|
"""Deconstructs this expression into a dictionary by recursively unwrapping all Literals,
|
|
@@ -207,5 +208,7 @@ class InlineDict(Expr):
|
|
|
207
208
|
arg = dict(zip(d['keys'], components))
|
|
208
209
|
return InlineDict(arg)
|
|
209
210
|
|
|
210
|
-
def
|
|
211
|
-
|
|
211
|
+
def as_literal(self) -> Optional[Literal]:
|
|
212
|
+
if not all(isinstance(comp, Literal) for comp in self.components):
|
|
213
|
+
return None
|
|
214
|
+
return Literal(dict(zip(self.keys, (c.as_literal().val for c in self.components))), self.col_type)
|
pixeltable/exprs/json_mapper.py
CHANGED
|
@@ -5,8 +5,9 @@ from typing import Optional
|
|
|
5
5
|
import sqlalchemy as sql
|
|
6
6
|
|
|
7
7
|
import pixeltable.type_system as ts
|
|
8
|
+
|
|
8
9
|
from .data_row import DataRow
|
|
9
|
-
from .expr import Expr, ExprScope
|
|
10
|
+
from .expr import _GLOBAL_SCOPE, Expr, ExprScope
|
|
10
11
|
from .row_builder import RowBuilder
|
|
11
12
|
from .sql_element_cache import SqlElementCache
|
|
12
13
|
|
|
@@ -17,6 +18,7 @@ class JsonMapper(Expr):
|
|
|
17
18
|
The target expr would typically contain relative JsonPaths, which are bound to an ObjectRef, which in turn
|
|
18
19
|
is populated by JsonMapper.eval(). The JsonMapper effectively creates a new scope for its target expr.
|
|
19
20
|
"""
|
|
21
|
+
|
|
20
22
|
def __init__(self, src_expr: Expr, target_expr: Expr):
|
|
21
23
|
# TODO: type spec should be list[target_expr.col_type]
|
|
22
24
|
super().__init__(ts.JsonType())
|
|
@@ -26,6 +28,7 @@ class JsonMapper(Expr):
|
|
|
26
28
|
self.target_expr_scope = ExprScope(_GLOBAL_SCOPE)
|
|
27
29
|
|
|
28
30
|
from .object_ref import ObjectRef
|
|
31
|
+
|
|
29
32
|
scope_anchor = ObjectRef(self.target_expr_scope, self)
|
|
30
33
|
self.components = [src_expr, target_expr, scope_anchor]
|
|
31
34
|
self.parent_mapper: Optional[JsonMapper] = None
|
|
@@ -118,4 +121,3 @@ class JsonMapper(Expr):
|
|
|
118
121
|
def _from_dict(cls, d: dict, components: list[Expr]) -> JsonMapper:
|
|
119
122
|
assert len(components) == 2
|
|
120
123
|
return cls(components[0], components[1])
|
|
121
|
-
|
pixeltable/exprs/json_path.py
CHANGED
|
@@ -23,7 +23,7 @@ class JsonPath(Expr):
|
|
|
23
23
|
self,
|
|
24
24
|
anchor: Optional['pxt.exprs.Expr'],
|
|
25
25
|
path_elements: Optional[list[Union[str, int, slice]]] = None,
|
|
26
|
-
scope_idx: int = 0
|
|
26
|
+
scope_idx: int = 0,
|
|
27
27
|
) -> None:
|
|
28
28
|
"""
|
|
29
29
|
anchor can be None, in which case this is a relative JsonPath and the anchor is set later via set_anchor().
|
|
@@ -44,15 +44,13 @@ class JsonPath(Expr):
|
|
|
44
44
|
|
|
45
45
|
def __repr__(self) -> str:
|
|
46
46
|
# else "R": the anchor is RELATIVE_PATH_ROOT
|
|
47
|
-
return (
|
|
48
|
-
f'{
|
|
47
|
+
return (
|
|
48
|
+
f'{str(self._anchor) if self._anchor is not None else "R"}'
|
|
49
|
+
f'{"." if isinstance(self.path_elements[0], str) else ""}{self._json_path()}'
|
|
50
|
+
)
|
|
49
51
|
|
|
50
52
|
def _as_dict(self) -> dict:
|
|
51
|
-
path_elements = [
|
|
52
|
-
[el.start, el.stop, el.step] if isinstance(el, slice)
|
|
53
|
-
else el
|
|
54
|
-
for el in self.path_elements
|
|
55
|
-
]
|
|
53
|
+
path_elements = [[el.start, el.stop, el.step] if isinstance(el, slice) else el for el in self.path_elements]
|
|
56
54
|
return {'path_elements': path_elements, 'scope_idx': self.scope_idx, **super()._as_dict()}
|
|
57
55
|
|
|
58
56
|
@classmethod
|
|
@@ -61,11 +59,7 @@ class JsonPath(Expr):
|
|
|
61
59
|
assert 'scope_idx' in d
|
|
62
60
|
assert len(components) <= 1
|
|
63
61
|
anchor = components[0] if len(components) == 1 else None
|
|
64
|
-
path_elements = [
|
|
65
|
-
slice(el[0], el[1], el[2]) if isinstance(el, list)
|
|
66
|
-
else el
|
|
67
|
-
for el in d['path_elements']
|
|
68
|
-
]
|
|
62
|
+
path_elements = [slice(el[0], el[1], el[2]) if isinstance(el, list) else el for el in d['path_elements']]
|
|
69
63
|
return cls(anchor, path_elements, d['scope_idx'])
|
|
70
64
|
|
|
71
65
|
@property
|
|
@@ -114,7 +108,7 @@ class JsonPath(Expr):
|
|
|
114
108
|
anchor_name = self._anchor.default_column_name() if self._anchor is not None else ''
|
|
115
109
|
ret_name = f'{anchor_name}.{self._json_path()}'
|
|
116
110
|
|
|
117
|
-
def cleanup_char(s
|
|
111
|
+
def cleanup_char(s: str) -> str:
|
|
118
112
|
if s == '.':
|
|
119
113
|
return '_'
|
|
120
114
|
elif s == '*':
|
|
@@ -125,7 +119,7 @@ class JsonPath(Expr):
|
|
|
125
119
|
return ''
|
|
126
120
|
|
|
127
121
|
clean_name = ''.join(map(cleanup_char, ret_name))
|
|
128
|
-
clean_name = clean_name.lstrip('_')
|
|
122
|
+
clean_name = clean_name.lstrip('_') # remove leading underscore
|
|
129
123
|
if clean_name == '':
|
|
130
124
|
clean_name = None
|
|
131
125
|
|
|
@@ -144,9 +138,9 @@ class JsonPath(Expr):
|
|
|
144
138
|
*two* rows (each containing col val 0), not a single row with [0, 0].
|
|
145
139
|
We need to use a workaround: retrieve the entire dict, then use jmespath to extract the path correctly.
|
|
146
140
|
"""
|
|
147
|
-
#path_str = '$.' + '.'.join(self.path_elements)
|
|
148
|
-
#assert isinstance(self._anchor(), ColumnRef)
|
|
149
|
-
#return sql.func.jsonb_path_query(self._anchor().col.sa_col, path_str)
|
|
141
|
+
# path_str = '$.' + '.'.join(self.path_elements)
|
|
142
|
+
# assert isinstance(self._anchor(), ColumnRef)
|
|
143
|
+
# return sql.func.jsonb_path_query(self._anchor().col.sa_col, path_str)
|
|
150
144
|
return None
|
|
151
145
|
|
|
152
146
|
def _json_path(self) -> str:
|
pixeltable/exprs/literal.py
CHANGED
|
@@ -3,8 +3,8 @@ from __future__ import annotations
|
|
|
3
3
|
import datetime
|
|
4
4
|
from typing import Any, Optional
|
|
5
5
|
|
|
6
|
-
import sqlalchemy as sql
|
|
7
6
|
import numpy as np
|
|
7
|
+
import sqlalchemy as sql
|
|
8
8
|
|
|
9
9
|
import pixeltable.type_system as ts
|
|
10
10
|
from pixeltable.env import Env
|
|
@@ -65,9 +65,8 @@ class Literal(Expr):
|
|
|
65
65
|
return super()._id_attrs() + [('val', self.val)]
|
|
66
66
|
|
|
67
67
|
def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
|
|
68
|
-
#
|
|
69
|
-
|
|
70
|
-
return sql.sql.expression.literal(self.val)
|
|
68
|
+
# Return a sql object so that constants can participate in SQL expressions
|
|
69
|
+
return sql.sql.expression.literal(self.val, type_=self.col_type.to_sa_type())
|
|
71
70
|
|
|
72
71
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
|
73
72
|
# this will be called, even though sql_expr() does not return None
|
|
@@ -89,11 +88,8 @@ class Literal(Expr):
|
|
|
89
88
|
else:
|
|
90
89
|
return {'val': self.val, **super()._as_dict()}
|
|
91
90
|
|
|
92
|
-
def
|
|
93
|
-
return self
|
|
94
|
-
|
|
95
|
-
def is_constant(self) -> bool:
|
|
96
|
-
return True
|
|
91
|
+
def as_literal(self) -> Optional[Literal]:
|
|
92
|
+
return self
|
|
97
93
|
|
|
98
94
|
@classmethod
|
|
99
95
|
def _from_dict(cls, d: dict, components: list[Expr]) -> Literal:
|
pixeltable/exprs/method_ref.py
CHANGED
|
@@ -19,6 +19,7 @@ class MethodRef(Expr):
|
|
|
19
19
|
When a `MethodRef` is called, it returns a `FunctionCall` with the base expression as the first argument.
|
|
20
20
|
The effective arity of a `MethodRef` is one less than the arity of the underlying `Function`.
|
|
21
21
|
"""
|
|
22
|
+
|
|
22
23
|
# TODO: Should this even be an `Expr`? It can't actually be evaluated directly (it has to be first
|
|
23
24
|
# converted to a `FunctionCall` by binding any remaining parameters).
|
|
24
25
|
|
pixeltable/exprs/object_ref.py
CHANGED
|
@@ -18,6 +18,7 @@ class ObjectRef(Expr):
|
|
|
18
18
|
Reference to an intermediate result, such as the "scope variable" produced by a JsonMapper.
|
|
19
19
|
The object is generated/materialized elsewhere and establishes a new scope.
|
|
20
20
|
"""
|
|
21
|
+
|
|
21
22
|
def __init__(self, scope: ExprScope, owner: JsonMapper):
|
|
22
23
|
# TODO: do we need an Unknown type after all?
|
|
23
24
|
super().__init__(ts.JsonType()) # JsonType: this could be anything
|
|
@@ -40,4 +41,3 @@ class ObjectRef(Expr):
|
|
|
40
41
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
|
41
42
|
# this will be called, but the value has already been materialized elsewhere
|
|
42
43
|
pass
|
|
43
|
-
|
pixeltable/exprs/row_builder.py
CHANGED
|
@@ -13,7 +13,9 @@ import pixeltable.catalog as catalog
|
|
|
13
13
|
import pixeltable.exceptions as excs
|
|
14
14
|
import pixeltable.func as func
|
|
15
15
|
import pixeltable.utils as utils
|
|
16
|
+
from pixeltable.env import Env
|
|
16
17
|
from pixeltable.utils.media_store import MediaStore
|
|
18
|
+
|
|
17
19
|
from .data_row import DataRow
|
|
18
20
|
from .expr import Expr
|
|
19
21
|
from .expr_set import ExprSet
|
|
@@ -32,7 +34,9 @@ class ExecProfile:
|
|
|
32
34
|
per_call_time = self.eval_time[i] / self.eval_count[i]
|
|
33
35
|
calls_per_row = self.eval_count[i] / num_rows
|
|
34
36
|
multiple_str = f'({calls_per_row}x)' if calls_per_row > 1 else ''
|
|
35
|
-
|
|
37
|
+
Env.get().console_logger.info(
|
|
38
|
+
f'{self.row_builder.unique_exprs[i]}: {utils.print_perf_counter_delta(per_call_time)} {multiple_str}'
|
|
39
|
+
)
|
|
36
40
|
|
|
37
41
|
|
|
38
42
|
@dataclass
|
|
@@ -40,6 +44,7 @@ class ColumnSlotIdx:
|
|
|
40
44
|
"""Info for how to locate materialized column in DataRow
|
|
41
45
|
TODO: can this be integrated into RowBuilder directly?
|
|
42
46
|
"""
|
|
47
|
+
|
|
43
48
|
col: catalog.Column
|
|
44
49
|
slot_idx: int
|
|
45
50
|
|
|
@@ -50,6 +55,7 @@ class RowBuilder:
|
|
|
50
55
|
For ColumnRefs to unstored iterator columns:
|
|
51
56
|
- in order for them to be executable, we also record the iterator args and pass them to the ColumnRef
|
|
52
57
|
"""
|
|
58
|
+
|
|
53
59
|
unique_exprs: ExprSet
|
|
54
60
|
next_slot_idx: int
|
|
55
61
|
input_expr_slot_idxs: set[int]
|
|
@@ -83,14 +89,13 @@ class RowBuilder:
|
|
|
83
89
|
@dataclass
|
|
84
90
|
class EvalCtx:
|
|
85
91
|
"""Context for evaluating a set of target exprs"""
|
|
92
|
+
|
|
86
93
|
slot_idxs: list[int] # slot idxs of exprs needed to evaluate target exprs; does not contain duplicates
|
|
87
94
|
exprs: list[Expr] # exprs corresponding to slot_idxs
|
|
88
95
|
target_slot_idxs: list[int] # slot idxs of target exprs; might contain duplicates
|
|
89
96
|
target_exprs: list[Expr] # exprs corresponding to target_slot_idxs
|
|
90
97
|
|
|
91
|
-
def __init__(
|
|
92
|
-
self, output_exprs: Sequence[Expr], columns: Sequence[catalog.Column], input_exprs: Iterable[Expr]
|
|
93
|
-
):
|
|
98
|
+
def __init__(self, output_exprs: Sequence[Expr], columns: Sequence[catalog.Column], input_exprs: Iterable[Expr]):
|
|
94
99
|
"""
|
|
95
100
|
Args:
|
|
96
101
|
output_exprs: list of Exprs to be evaluated
|
|
@@ -106,10 +111,12 @@ class RowBuilder:
|
|
|
106
111
|
self.input_expr_slot_idxs = {e.slot_idx for e in unique_input_exprs}
|
|
107
112
|
|
|
108
113
|
resolve_cols = set(columns)
|
|
109
|
-
self.output_exprs = ExprSet(
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
114
|
+
self.output_exprs = ExprSet(
|
|
115
|
+
[
|
|
116
|
+
self._record_unique_expr(e.copy().resolve_computed_cols(resolve_cols=resolve_cols), recursive=True)
|
|
117
|
+
for e in output_exprs
|
|
118
|
+
]
|
|
119
|
+
)
|
|
113
120
|
|
|
114
121
|
# if init(columns):
|
|
115
122
|
# - we are creating table rows and need to record columns for create_table_row()
|
|
@@ -119,6 +126,7 @@ class RowBuilder:
|
|
|
119
126
|
# * for write-validated columns, we need to create validating ColumnRefs
|
|
120
127
|
# * further references to that column (eg, computed cols) need to resolve to the validating ColumnRef
|
|
121
128
|
from .column_ref import ColumnRef
|
|
129
|
+
|
|
122
130
|
self.table_columns: list[ColumnSlotIdx] = []
|
|
123
131
|
self.input_exprs = ExprSet()
|
|
124
132
|
validating_colrefs: dict[Expr, Expr] = {} # key: non-validating colref, value: corresp. validating colref
|
|
@@ -133,7 +141,8 @@ class RowBuilder:
|
|
|
133
141
|
else:
|
|
134
142
|
# record a ColumnRef so that references to this column resolve to the same slot idx
|
|
135
143
|
perform_validation = (
|
|
136
|
-
None
|
|
144
|
+
None
|
|
145
|
+
if not col.col_type.is_media_type()
|
|
137
146
|
else col.media_validation == catalog.MediaValidation.ON_WRITE
|
|
138
147
|
)
|
|
139
148
|
expr = ColumnRef(col, perform_validation=perform_validation)
|
|
@@ -184,10 +193,11 @@ class RowBuilder:
|
|
|
184
193
|
|
|
185
194
|
# determine transitive dependencies for the purpose of exception propagation
|
|
186
195
|
# (list of set of slot_idxs, indexed by slot_idx)
|
|
187
|
-
#self.dependents = np.zeros((self.num_materialized, self.num_materialized), dtype=bool)
|
|
196
|
+
# self.dependents = np.zeros((self.num_materialized, self.num_materialized), dtype=bool)
|
|
188
197
|
self.dependencies = np.zeros((self.num_materialized, self.num_materialized), dtype=bool)
|
|
189
198
|
exc_dependencies: list[set[int]] = [set() for _ in range(self.num_materialized)]
|
|
190
199
|
from .column_property_ref import ColumnPropertyRef
|
|
200
|
+
|
|
191
201
|
for expr in self.unique_exprs:
|
|
192
202
|
if expr.slot_idx in self.input_expr_slot_idxs:
|
|
193
203
|
# this is input and therefore doesn't depend on other exprs
|
|
@@ -204,8 +214,8 @@ class RowBuilder:
|
|
|
204
214
|
self.dependents = self.dependencies.T
|
|
205
215
|
self.transitive_dependents = np.zeros((self.num_materialized, self.num_materialized), dtype=bool)
|
|
206
216
|
for i in reversed(range(self.num_materialized)):
|
|
207
|
-
self.transitive_dependents[i] = (
|
|
208
|
-
self.
|
|
217
|
+
self.transitive_dependents[i] = self.dependents[i] | np.any(
|
|
218
|
+
self.transitive_dependents[self.dependents[i]], axis=0
|
|
209
219
|
)
|
|
210
220
|
|
|
211
221
|
self._exc_dependents = [set() for _ in range(self.num_materialized)]
|
|
@@ -228,6 +238,7 @@ class RowBuilder:
|
|
|
228
238
|
|
|
229
239
|
def set_conn(self, conn: sql.engine.Connection) -> None:
|
|
230
240
|
from .function_call import FunctionCall
|
|
241
|
+
|
|
231
242
|
for expr in self.unique_exprs:
|
|
232
243
|
if isinstance(expr, FunctionCall) and isinstance(expr.fn, func.QueryTemplateFunction):
|
|
233
244
|
expr.fn.set_conn(conn)
|
|
@@ -352,8 +363,11 @@ class RowBuilder:
|
|
|
352
363
|
target_slot_idxs = [e.slot_idx for e in targets]
|
|
353
364
|
ctx_slot_idxs = [e.slot_idx for e in dependencies]
|
|
354
365
|
return self.EvalCtx(
|
|
355
|
-
slot_idxs=ctx_slot_idxs,
|
|
356
|
-
|
|
366
|
+
slot_idxs=ctx_slot_idxs,
|
|
367
|
+
exprs=[self.unique_exprs[slot_idx] for slot_idx in ctx_slot_idxs],
|
|
368
|
+
target_slot_idxs=target_slot_idxs,
|
|
369
|
+
target_exprs=targets,
|
|
370
|
+
)
|
|
357
371
|
|
|
358
372
|
def set_exc(self, data_row: DataRow, slot_idx: int, exc: Exception) -> None:
|
|
359
373
|
"""Record an exception in data_row and propagate it to dependents"""
|
|
@@ -362,7 +376,7 @@ class RowBuilder:
|
|
|
362
376
|
data_row.set_exc(slot_idx, exc)
|
|
363
377
|
|
|
364
378
|
def eval(
|
|
365
|
-
|
|
379
|
+
self, data_row: DataRow, ctx: EvalCtx, profile: Optional[ExecProfile] = None, ignore_errors: bool = False
|
|
366
380
|
) -> None:
|
|
367
381
|
"""
|
|
368
382
|
Populates the slots in data_row given in ctx.
|
|
@@ -387,7 +401,8 @@ class RowBuilder:
|
|
|
387
401
|
if not ignore_errors:
|
|
388
402
|
input_vals = [data_row[d.slot_idx] for d in expr.dependencies()]
|
|
389
403
|
raise excs.ExprEvalError(
|
|
390
|
-
expr, f'expression {expr}', data_row.get_exc(expr.slot_idx), exc_tb, input_vals, 0
|
|
404
|
+
expr, f'expression {expr}', data_row.get_exc(expr.slot_idx), exc_tb, input_vals, 0
|
|
405
|
+
) from exc
|
|
391
406
|
|
|
392
407
|
def create_table_row(self, data_row: DataRow, exc_col_ids: set[int]) -> tuple[dict[str, Any], int]:
|
|
393
408
|
"""Create a table row from the slots that have an output column assigned
|
pixeltable/exprs/rowid_ref.py
CHANGED
|
@@ -22,9 +22,14 @@ class RowidRef(Expr):
|
|
|
22
22
|
_from_dict()/init() is called, which is why this class effectively has two separate paths for construction
|
|
23
23
|
(with and without a TableVersion).
|
|
24
24
|
"""
|
|
25
|
+
|
|
25
26
|
def __init__(
|
|
26
|
-
|
|
27
|
-
|
|
27
|
+
self,
|
|
28
|
+
tbl: catalog.TableVersion,
|
|
29
|
+
idx: int,
|
|
30
|
+
tbl_id: Optional[UUID] = None,
|
|
31
|
+
normalized_base_id: Optional[UUID] = None,
|
|
32
|
+
):
|
|
28
33
|
super().__init__(ts.IntType(nullable=False))
|
|
29
34
|
self.tbl = tbl
|
|
30
35
|
if tbl is not None:
|
|
@@ -48,12 +53,16 @@ class RowidRef(Expr):
|
|
|
48
53
|
return str(self)
|
|
49
54
|
|
|
50
55
|
def _equals(self, other: RowidRef) -> bool:
|
|
51
|
-
return
|
|
56
|
+
return (
|
|
57
|
+
self.normalized_base_id == other.normalized_base_id
|
|
52
58
|
and self.rowid_component_idx == other.rowid_component_idx
|
|
59
|
+
)
|
|
53
60
|
|
|
54
61
|
def _id_attrs(self) -> list[tuple[str, Any]]:
|
|
55
|
-
return super()._id_attrs()
|
|
56
|
-
|
|
62
|
+
return super()._id_attrs() + [
|
|
63
|
+
('normalized_base_id', self.normalized_base_id),
|
|
64
|
+
('idx', self.rowid_component_idx),
|
|
65
|
+
]
|
|
57
66
|
|
|
58
67
|
def __repr__(self) -> str:
|
|
59
68
|
# check if this is the pos column of a component view
|
|
pixeltable/exprs/similarity_expr.py
CHANGED
|
@@ -14,11 +14,10 @@ from .sql_element_cache import SqlElementCache
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class SimilarityExpr(Expr):
|
|
17
|
-
|
|
18
17
|
def __init__(self, col_ref: ColumnRef, item: Any, idx_name: Optional[str] = None):
|
|
19
18
|
super().__init__(ts.FloatType())
|
|
20
19
|
item_expr = Expr.from_object(item)
|
|
21
|
-
if item_expr is None or not(item_expr.col_type.is_string_type() or item_expr.col_type.is_image_type()):
|
|
20
|
+
if item_expr is None or not (item_expr.col_type.is_string_type() or item_expr.col_type.is_image_type()):
|
|
22
21
|
raise excs.Error(f'similarity(): requires a string or a PIL.Image.Image object, not a {type(item)}')
|
|
23
22
|
assert item_expr.col_type.is_string_type() or item_expr.col_type.is_image_type()
|
|
24
23
|
|
|
@@ -27,6 +26,7 @@ class SimilarityExpr(Expr):
|
|
|
27
26
|
# determine index to use
|
|
28
27
|
idx_info = col_ref.col.get_idx_info()
|
|
29
28
|
from pixeltable import index
|
|
29
|
+
|
|
30
30
|
embedding_idx_info = {
|
|
31
31
|
info.name: info for info in idx_info.values() if isinstance(info.idx, index.EmbeddingIndex)
|
|
32
32
|
}
|
|
@@ -38,7 +38,8 @@ class SimilarityExpr(Expr):
|
|
|
38
38
|
if idx_name is None:
|
|
39
39
|
raise excs.Error(
|
|
40
40
|
f'Column {col_ref.col.name!r} has multiple indices; use the index name to disambiguate: '
|
|
41
|
-
f'`{col_ref.col.name}.similarity(..., idx=<name>)`'
|
|
41
|
+
f'`{col_ref.col.name}.similarity(..., idx=<name>)`'
|
|
42
|
+
)
|
|
42
43
|
self.idx_info = embedding_idx_info[idx_name]
|
|
43
44
|
else:
|
|
44
45
|
self.idx_info = next(iter(embedding_idx_info.values()))
|
|
@@ -48,11 +49,13 @@ class SimilarityExpr(Expr):
|
|
|
48
49
|
if item_expr.col_type.is_string_type() and idx.string_embed is None:
|
|
49
50
|
raise excs.Error(
|
|
50
51
|
f'Embedding index {self.idx_info.name!r} on column {self.idx_info.col.name!r} does not have a '
|
|
51
|
-
f
|
|
52
|
+
f'string embedding and does not support string queries'
|
|
53
|
+
)
|
|
52
54
|
if item_expr.col_type.is_image_type() and idx.image_embed is None:
|
|
53
55
|
raise excs.Error(
|
|
54
56
|
f'Embedding index {self.idx_info.name!r} on column {self.idx_info.col.name!r} does not have an '
|
|
55
|
-
f
|
|
57
|
+
f'image embedding and does not support image queries'
|
|
58
|
+
)
|
|
56
59
|
self.id = self._create_id()
|
|
57
60
|
|
|
58
61
|
def __repr__(self) -> str:
|
|
@@ -66,9 +69,10 @@ class SimilarityExpr(Expr):
|
|
|
66
69
|
|
|
67
70
|
def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
|
|
68
71
|
if not isinstance(self.components[1], Literal):
|
|
69
|
-
|
|
72
|
+
raise excs.Error(f'similarity(): requires a string or a PIL.Image.Image object, not an expression')
|
|
70
73
|
item = self.components[1].val
|
|
71
74
|
from pixeltable import index
|
|
75
|
+
|
|
72
76
|
assert isinstance(self.idx_info.idx, index.EmbeddingIndex)
|
|
73
77
|
return self.idx_info.idx.similarity_clause(self.idx_info.val_col, item)
|
|
74
78
|
|
|
@@ -77,6 +81,7 @@ class SimilarityExpr(Expr):
|
|
|
77
81
|
raise excs.Error(f'similarity(): requires a string or a PIL.Image.Image object, not an expression')
|
|
78
82
|
item = self.components[1].val
|
|
79
83
|
from pixeltable import index
|
|
84
|
+
|
|
80
85
|
assert isinstance(self.idx_info.idx, index.EmbeddingIndex)
|
|
81
86
|
return self.idx_info.idx.order_by_clause(self.idx_info.val_col, item, is_asc)
|
|
82
87
|
|
pixeltable/exprs/type_cast.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
|
-
from typing import Any, Optional
|
|
1
|
+
from typing import Any, Optional, Union
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
5
|
+
import pixeltable.exprs as exprs
|
|
5
6
|
import pixeltable.type_system as ts
|
|
6
7
|
|
|
7
8
|
from .expr import DataRow, Expr
|
|
9
|
+
from .literal import Literal
|
|
8
10
|
from .row_builder import RowBuilder
|
|
9
11
|
from .sql_element_cache import SqlElementCache
|
|
10
12
|
|
|
@@ -14,21 +16,19 @@ class TypeCast(Expr):
|
|
|
14
16
|
An `Expr` that represents a type conversion from an underlying `Expr` to
|
|
15
17
|
a specified `ColumnType`.
|
|
16
18
|
"""
|
|
19
|
+
|
|
17
20
|
def __init__(self, underlying: Expr, new_type: ts.ColumnType):
|
|
18
21
|
super().__init__(new_type)
|
|
19
22
|
self.components: list[Expr] = [underlying]
|
|
20
23
|
self.id: Optional[int] = self._create_id()
|
|
21
24
|
|
|
22
|
-
@property
|
|
23
|
-
def _underlying(self):
|
|
24
|
-
return self.components[0]
|
|
25
|
-
|
|
26
25
|
def _equals(self, other: 'TypeCast') -> bool:
|
|
27
26
|
# `TypeCast` has no properties beyond those captured by `Expr`.
|
|
28
27
|
return True
|
|
29
28
|
|
|
30
|
-
|
|
31
|
-
|
|
29
|
+
@property
|
|
30
|
+
def _op1(self) -> Expr:
|
|
31
|
+
return self.components[0]
|
|
32
32
|
|
|
33
33
|
def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
|
|
34
34
|
"""
|
|
@@ -38,9 +38,24 @@ class TypeCast(Expr):
|
|
|
38
38
|
return None
|
|
39
39
|
|
|
40
40
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
|
41
|
-
original_val = data_row[self._underlying.slot_idx]
|
|
41
|
+
original_val = data_row[self._op1.slot_idx]
|
|
42
42
|
data_row[self.slot_idx] = self.col_type.create_literal(original_val)
|
|
43
43
|
|
|
44
|
+
def as_literal(self) -> Optional[Literal]:
|
|
45
|
+
op1_lit = self._op1.as_literal()
|
|
46
|
+
if op1_lit is None:
|
|
47
|
+
return None
|
|
48
|
+
if not (
|
|
49
|
+
self.col_type.is_numeric_type() and (op1_lit.col_type.is_numeric_type() or op1_lit.col_type.is_bool_type())
|
|
50
|
+
):
|
|
51
|
+
return None
|
|
52
|
+
|
|
53
|
+
op1_val = op1_lit.val
|
|
54
|
+
if self.col_type.is_int_type():
|
|
55
|
+
return Literal(int(op1_val), self.col_type)
|
|
56
|
+
elif self.col_type.is_float_type():
|
|
57
|
+
return Literal(float(op1_val), self.col_type)
|
|
58
|
+
return None
|
|
44
59
|
|
|
45
60
|
def _as_dict(self) -> dict:
|
|
46
61
|
return {'new_type': self.col_type.as_dict(), **super()._as_dict()}
|
|
@@ -52,4 +67,4 @@ class TypeCast(Expr):
|
|
|
52
67
|
return cls(components[0], ts.ColumnType.from_dict(d['new_type']))
|
|
53
68
|
|
|
54
69
|
def __repr__(self) -> str:
|
|
55
|
-
return f'{self.
|
|
70
|
+
return f'{self._op1}.astype({self.col_type._to_str(as_schema=True)})'
|
pixeltable/ext/__init__.py
CHANGED
|
pixeltable/ext/functions/whisperx.py
CHANGED
|
@@ -15,7 +15,7 @@ def transcribe(
|
|
|
15
15
|
model: str,
|
|
16
16
|
compute_type: Optional[str] = None,
|
|
17
17
|
language: Optional[str] = None,
|
|
18
|
-
chunk_size: int = 30
|
|
18
|
+
chunk_size: int = 30,
|
|
19
19
|
) -> dict:
|
|
20
20
|
"""
|
|
21
21
|
Transcribe an audio file using WhisperX.
|
|
@@ -44,7 +44,7 @@ def transcribe(
|
|
|
44
44
|
Add a computed column that applies the model `tiny.en` to an existing Pixeltable column `tbl.audio`
|
|
45
45
|
of the table `tbl`:
|
|
46
46
|
|
|
47
|
-
>>> tbl
|
|
47
|
+
>>> tbl.add_computed_column(result=transcribe(tbl.audio, model='tiny.en'))
|
|
48
48
|
"""
|
|
49
49
|
import torch
|
|
50
50
|
import whisperx # type: ignore[import-untyped]
|