pixeltable 0.2.21__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/column.py +37 -11
- pixeltable/catalog/globals.py +18 -0
- pixeltable/catalog/insertable_table.py +6 -4
- pixeltable/catalog/table.py +19 -3
- pixeltable/catalog/table_version.py +34 -14
- pixeltable/catalog/view.py +16 -17
- pixeltable/dataframe.py +7 -8
- pixeltable/env.py +5 -0
- pixeltable/exec/__init__.py +0 -1
- pixeltable/exec/aggregation_node.py +6 -3
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/data_row_batch.py +2 -19
- pixeltable/exec/exec_node.py +2 -1
- pixeltable/exec/expr_eval_node.py +17 -10
- pixeltable/exec/in_memory_data_node.py +6 -3
- pixeltable/exec/sql_node.py +24 -25
- pixeltable/exprs/arithmetic_expr.py +3 -1
- pixeltable/exprs/array_slice.py +7 -7
- pixeltable/exprs/column_property_ref.py +37 -10
- pixeltable/exprs/column_ref.py +93 -14
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +8 -7
- pixeltable/exprs/data_row.py +27 -18
- pixeltable/exprs/expr.py +53 -52
- pixeltable/exprs/expr_set.py +5 -0
- pixeltable/exprs/function_call.py +32 -16
- pixeltable/exprs/globals.py +4 -1
- pixeltable/exprs/in_predicate.py +8 -7
- pixeltable/exprs/inline_expr.py +4 -4
- pixeltable/exprs/is_null.py +4 -4
- pixeltable/exprs/json_mapper.py +11 -12
- pixeltable/exprs/json_path.py +5 -10
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +5 -4
- pixeltable/exprs/object_ref.py +2 -1
- pixeltable/exprs/row_builder.py +88 -36
- pixeltable/exprs/rowid_ref.py +12 -11
- pixeltable/exprs/similarity_expr.py +12 -7
- pixeltable/exprs/sql_element_cache.py +7 -5
- pixeltable/exprs/type_cast.py +8 -6
- pixeltable/exprs/variable.py +5 -4
- pixeltable/func/aggregate_function.py +1 -1
- pixeltable/func/function.py +11 -10
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/globals.py +5 -7
- pixeltable/functions/huggingface.py +19 -20
- pixeltable/functions/llama_cpp.py +106 -0
- pixeltable/functions/ollama.py +147 -0
- pixeltable/functions/replicate.py +72 -0
- pixeltable/functions/string.py +9 -0
- pixeltable/globals.py +12 -20
- pixeltable/index/btree.py +16 -3
- pixeltable/index/embedding_index.py +4 -4
- pixeltable/io/__init__.py +1 -2
- pixeltable/io/fiftyone.py +178 -0
- pixeltable/io/globals.py +96 -2
- pixeltable/iterators/base.py +3 -2
- pixeltable/iterators/document.py +1 -1
- pixeltable/iterators/video.py +120 -63
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_21.py +34 -0
- pixeltable/metadata/converters/util.py +45 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +8 -0
- pixeltable/plan.py +16 -14
- pixeltable/py.typed +0 -0
- pixeltable/store.py +7 -2
- pixeltable/tool/create_test_video.py +1 -1
- pixeltable/tool/embed_udf.py +1 -1
- pixeltable/tool/mypy_plugin.py +28 -5
- pixeltable/type_system.py +17 -1
- pixeltable/utils/documents.py +15 -1
- pixeltable/utils/formatter.py +9 -10
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/METADATA +46 -10
- pixeltable-0.2.22.dist-info/RECORD +153 -0
- pixeltable/exec/media_validation_node.py +0 -43
- pixeltable-0.2.21.dist-info/RECORD +0 -148
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
|
@@ -7,7 +7,7 @@ from typing import Iterable, List, Optional
|
|
|
7
7
|
|
|
8
8
|
from tqdm import TqdmWarning, tqdm
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
from pixeltable import exprs
|
|
11
11
|
from pixeltable.func import CallableFunction
|
|
12
12
|
|
|
13
13
|
from .data_row_batch import DataRowBatch
|
|
@@ -22,7 +22,7 @@ class ExprEvalNode(ExecNode):
|
|
|
22
22
|
@dataclass
|
|
23
23
|
class Cohort:
|
|
24
24
|
"""List of exprs that form an evaluation context and contain calls to at most one external function"""
|
|
25
|
-
|
|
25
|
+
exprs_: List[exprs.Expr]
|
|
26
26
|
batched_fn: Optional[CallableFunction]
|
|
27
27
|
segment_ctxs: List['exprs.RowBuilder.EvalCtx']
|
|
28
28
|
target_slot_idxs: List[int]
|
|
@@ -38,7 +38,7 @@ class ExprEvalNode(ExecNode):
|
|
|
38
38
|
# we're only materializing exprs that are not already in the input
|
|
39
39
|
self.target_exprs = [e for e in output_exprs if e.slot_idx not in input_slot_idxs]
|
|
40
40
|
self.pbar: Optional[tqdm] = None
|
|
41
|
-
self.cohorts: List[
|
|
41
|
+
self.cohorts: List[ExprEvalNode.Cohort] = []
|
|
42
42
|
self._create_cohorts()
|
|
43
43
|
|
|
44
44
|
def __next__(self) -> DataRowBatch:
|
|
@@ -88,6 +88,8 @@ class ExprEvalNode(ExecNode):
|
|
|
88
88
|
for e in all_exprs:
|
|
89
89
|
if not self._is_batched_fn_call(e):
|
|
90
90
|
continue
|
|
91
|
+
assert isinstance(e, exprs.FunctionCall)
|
|
92
|
+
assert isinstance(e.fn, CallableFunction)
|
|
91
93
|
if current_batched_fn is None or current_batched_fn != e.fn:
|
|
92
94
|
# create a new cohort
|
|
93
95
|
cohorts.append([])
|
|
@@ -96,8 +98,8 @@ class ExprEvalNode(ExecNode):
|
|
|
96
98
|
|
|
97
99
|
# expand the cohorts to include all exprs that are in the same evaluation context as the external calls;
|
|
98
100
|
# cohorts are evaluated in order, so we can exclude the target slots from preceding cohorts and input slots
|
|
99
|
-
exclude = set(
|
|
100
|
-
all_target_slot_idxs = set(
|
|
101
|
+
exclude = set(e.slot_idx for e in self.input_exprs)
|
|
102
|
+
all_target_slot_idxs = set(e.slot_idx for e in self.target_exprs)
|
|
101
103
|
target_slot_idxs: List[List[int]] = [] # the ones materialized by each cohort
|
|
102
104
|
for i in range(len(cohorts)):
|
|
103
105
|
cohorts[i] = self.row_builder.get_dependencies(
|
|
@@ -106,7 +108,7 @@ class ExprEvalNode(ExecNode):
|
|
|
106
108
|
[e.slot_idx for e in cohorts[i] if e.slot_idx in all_target_slot_idxs])
|
|
107
109
|
exclude.update(target_slot_idxs[-1])
|
|
108
110
|
|
|
109
|
-
all_cohort_slot_idxs = set(
|
|
111
|
+
all_cohort_slot_idxs = set(e.slot_idx for cohort in cohorts for e in cohort)
|
|
110
112
|
remaining_slot_idxs = set(all_target_slot_idxs) - all_cohort_slot_idxs
|
|
111
113
|
if len(remaining_slot_idxs) > 0:
|
|
112
114
|
cohorts.append(self.row_builder.get_dependencies(
|
|
@@ -164,9 +166,10 @@ class ExprEvalNode(ExecNode):
|
|
|
164
166
|
rows[row_idx], segment_ctx, self.ctx.profile, ignore_errors=self.ctx.ignore_errors)
|
|
165
167
|
else:
|
|
166
168
|
fn_call = segment_ctx.exprs[0]
|
|
169
|
+
assert isinstance(fn_call, exprs.FunctionCall)
|
|
167
170
|
# make a batched external function call
|
|
168
|
-
arg_batches = [[] for _ in range(len(fn_call.args))]
|
|
169
|
-
kwarg_batches = {k: [] for k in fn_call.kwargs.keys()}
|
|
171
|
+
arg_batches: list[list[exprs.Expr]] = [[] for _ in range(len(fn_call.args))]
|
|
172
|
+
kwarg_batches: dict[str, list[exprs.Expr]] = {k: [] for k in fn_call.kwargs.keys()}
|
|
170
173
|
|
|
171
174
|
valid_batch_idxs: List[int] = [] # rows with exceptions are not valid
|
|
172
175
|
for row_idx in range(batch_start_idx, batch_start_idx + num_batch_rows):
|
|
@@ -176,12 +179,15 @@ class ExprEvalNode(ExecNode):
|
|
|
176
179
|
continue
|
|
177
180
|
valid_batch_idxs.append(row_idx)
|
|
178
181
|
args, kwargs = fn_call._make_args(row)
|
|
179
|
-
|
|
180
|
-
|
|
182
|
+
for i in range(len(args)):
|
|
183
|
+
arg_batches[i].append(args[i])
|
|
184
|
+
for k in kwargs.keys():
|
|
185
|
+
kwarg_batches[k].append(kwargs[k])
|
|
181
186
|
num_valid_batch_rows = len(valid_batch_idxs)
|
|
182
187
|
|
|
183
188
|
if ext_batch_size is None:
|
|
184
189
|
# we need to choose a batch size based on the args
|
|
190
|
+
assert isinstance(fn_call.fn, CallableFunction)
|
|
185
191
|
sample_args = [arg_batches[i][0] for i in range(len(arg_batches))]
|
|
186
192
|
ext_batch_size = fn_call.fn.get_batch_size(*sample_args)
|
|
187
193
|
|
|
@@ -201,6 +207,7 @@ class ExprEvalNode(ExecNode):
|
|
|
201
207
|
for k in kwarg_batches.keys()
|
|
202
208
|
}
|
|
203
209
|
start_ts = time.perf_counter()
|
|
210
|
+
assert isinstance(fn_call.fn, CallableFunction)
|
|
204
211
|
result_batch = fn_call.fn.exec_batch(*call_args, **call_kwargs)
|
|
205
212
|
self.ctx.profile.eval_time[fn_call.slot_idx] += time.perf_counter() - start_ts
|
|
206
213
|
self.ctx.profile.eval_count[fn_call.slot_idx] += num_ext_batch_rows
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any,
|
|
2
|
+
from typing import Any, Iterator, Optional
|
|
3
3
|
|
|
4
4
|
import pixeltable.catalog as catalog
|
|
5
5
|
import pixeltable.exprs as exprs
|
|
@@ -23,12 +23,15 @@ class InMemoryDataNode(ExecNode):
|
|
|
23
23
|
start_row_id: int
|
|
24
24
|
output_rows: Optional[DataRowBatch]
|
|
25
25
|
|
|
26
|
+
# output_exprs is declared in the superclass, but we redeclare it here with a more specific type
|
|
27
|
+
output_exprs: list[exprs.ColumnRef]
|
|
28
|
+
|
|
26
29
|
def __init__(
|
|
27
30
|
self, tbl: catalog.TableVersion, rows: list[dict[str, Any]],
|
|
28
31
|
row_builder: exprs.RowBuilder, start_row_id: int,
|
|
29
32
|
):
|
|
30
|
-
# we materialize
|
|
31
|
-
output_exprs =
|
|
33
|
+
# we materialize the input slots
|
|
34
|
+
output_exprs = list(row_builder.input_exprs)
|
|
32
35
|
super().__init__(row_builder, output_exprs, [], None)
|
|
33
36
|
assert tbl.is_insertable()
|
|
34
37
|
self.tbl = tbl
|
pixeltable/exec/sql_node.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import warnings
|
|
3
3
|
from decimal import Decimal
|
|
4
|
-
from typing import
|
|
4
|
+
from typing import Iterable, Iterator, NamedTuple, Optional
|
|
5
5
|
from uuid import UUID
|
|
6
6
|
|
|
7
7
|
import sqlalchemy as sql
|
|
8
8
|
|
|
9
9
|
import pixeltable.catalog as catalog
|
|
10
10
|
import pixeltable.exprs as exprs
|
|
11
|
+
|
|
11
12
|
from .data_row_batch import DataRowBatch
|
|
12
13
|
from .exec_node import ExecNode
|
|
13
14
|
|
|
@@ -100,7 +101,7 @@ class SqlNode(ExecNode):
|
|
|
100
101
|
# minimize the number of tables that need to be joined to the target table
|
|
101
102
|
self.retarget_rowid_refs(tbl, self.select_list)
|
|
102
103
|
|
|
103
|
-
assert self.sql_elements.
|
|
104
|
+
assert self.sql_elements.contains_all(self.select_list)
|
|
104
105
|
self.set_pk = set_pk
|
|
105
106
|
self.num_pk_cols = 0
|
|
106
107
|
if set_pk:
|
|
@@ -120,13 +121,13 @@ class SqlNode(ExecNode):
|
|
|
120
121
|
def _create_stmt(self) -> sql.Select:
|
|
121
122
|
"""Create Select from local state"""
|
|
122
123
|
|
|
123
|
-
assert self.sql_elements.
|
|
124
|
+
assert self.sql_elements.contains_all(self.select_list)
|
|
124
125
|
sql_select_list = [self.sql_elements.get(e) for e in self.select_list]
|
|
125
126
|
if self.set_pk:
|
|
126
127
|
sql_select_list += self.tbl.tbl_version.store_tbl.pk_columns()
|
|
127
128
|
stmt = sql.select(*sql_select_list)
|
|
128
129
|
|
|
129
|
-
order_by_clause: list[sql.
|
|
130
|
+
order_by_clause: list[sql.ColumnElement] = []
|
|
130
131
|
for e, asc in self.order_by_clause:
|
|
131
132
|
if isinstance(e, exprs.SimilarityExpr):
|
|
132
133
|
order_by_clause.append(e.as_order_by_clause(asc))
|
|
@@ -141,7 +142,7 @@ class SqlNode(ExecNode):
|
|
|
141
142
|
return stmt
|
|
142
143
|
|
|
143
144
|
def _ordering_tbl_ids(self) -> set[UUID]:
|
|
144
|
-
return exprs.Expr.
|
|
145
|
+
return exprs.Expr.all_tbl_ids(e for e, _ in self.order_by_clause)
|
|
145
146
|
|
|
146
147
|
def to_cte(self) -> Optional[tuple[sql.CTE, exprs.ExprDict[sql.ColumnElement]]]:
|
|
147
148
|
"""
|
|
@@ -182,9 +183,9 @@ class SqlNode(ExecNode):
|
|
|
182
183
|
"""
|
|
183
184
|
# we need to include at least the root
|
|
184
185
|
if refd_tbl_ids is None:
|
|
185
|
-
refd_tbl_ids =
|
|
186
|
+
refd_tbl_ids = set()
|
|
186
187
|
if exact_version_only is None:
|
|
187
|
-
exact_version_only =
|
|
188
|
+
exact_version_only = set()
|
|
188
189
|
candidates = tbl.get_tbl_versions()
|
|
189
190
|
assert len(candidates) > 0
|
|
190
191
|
joined_tbls: list[catalog.TableVersion] = [candidates[0]]
|
|
@@ -193,6 +194,7 @@ class SqlNode(ExecNode):
|
|
|
193
194
|
joined_tbls.append(tbl)
|
|
194
195
|
|
|
195
196
|
first = True
|
|
197
|
+
prev_tbl: catalog.TableVersion
|
|
196
198
|
for tbl in joined_tbls[::-1]:
|
|
197
199
|
if first:
|
|
198
200
|
stmt = stmt.select_from(tbl.store_tbl.sa_tbl)
|
|
@@ -239,22 +241,19 @@ class SqlNode(ExecNode):
|
|
|
239
241
|
def __iter__(self) -> Iterator[DataRowBatch]:
|
|
240
242
|
# run the query; do this here rather than in _open(), exceptions are only expected during iteration
|
|
241
243
|
assert self.ctx.conn is not None
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
pass
|
|
256
|
-
except Exception as e:
|
|
257
|
-
raise e
|
|
244
|
+
with warnings.catch_warnings(record=True) as w:
|
|
245
|
+
stmt = self._create_stmt()
|
|
246
|
+
try:
|
|
247
|
+
# log stmt, if possible
|
|
248
|
+
stmt_str = str(stmt.compile(compile_kwargs={'literal_binds': True}))
|
|
249
|
+
_logger.debug(f'SqlLookupNode stmt:\n{stmt_str}')
|
|
250
|
+
except Exception:
|
|
251
|
+
pass
|
|
252
|
+
self._log_explain(stmt)
|
|
253
|
+
|
|
254
|
+
result_cursor = self.ctx.conn.execute(stmt)
|
|
255
|
+
for warning in w:
|
|
256
|
+
pass
|
|
258
257
|
|
|
259
258
|
tbl_version = self.tbl.tbl_version if self.tbl is not None else None
|
|
260
259
|
output_batch = DataRowBatch(tbl_version, self.row_builder)
|
|
@@ -350,7 +349,7 @@ class SqlScanNode(SqlNode):
|
|
|
350
349
|
def _create_stmt(self) -> sql.Select:
|
|
351
350
|
stmt = super()._create_stmt()
|
|
352
351
|
where_clause_tbl_ids = self.where_clause.tbl_ids() if self.where_clause is not None else set()
|
|
353
|
-
refd_tbl_ids = exprs.Expr.
|
|
352
|
+
refd_tbl_ids = exprs.Expr.all_tbl_ids(self.select_list) | where_clause_tbl_ids | self._ordering_tbl_ids()
|
|
354
353
|
stmt = self.create_from_clause(
|
|
355
354
|
self.tbl, stmt, refd_tbl_ids, exact_version_only={t.id for t in self.exact_version_only})
|
|
356
355
|
|
|
@@ -386,7 +385,7 @@ class SqlLookupNode(SqlNode):
|
|
|
386
385
|
|
|
387
386
|
def _create_stmt(self) -> sql.Select:
|
|
388
387
|
stmt = super()._create_stmt()
|
|
389
|
-
refd_tbl_ids = exprs.Expr.
|
|
388
|
+
refd_tbl_ids = exprs.Expr.all_tbl_ids(self.select_list) | self._ordering_tbl_ids()
|
|
390
389
|
stmt = self.create_from_clause(self.tbl, stmt, refd_tbl_ids)
|
|
391
390
|
stmt = stmt.where(self.where_clause)
|
|
392
391
|
return stmt
|
|
@@ -6,6 +6,7 @@ import sqlalchemy as sql
|
|
|
6
6
|
|
|
7
7
|
import pixeltable.exceptions as excs
|
|
8
8
|
import pixeltable.type_system as ts
|
|
9
|
+
|
|
9
10
|
from .data_row import DataRow
|
|
10
11
|
from .expr import Expr
|
|
11
12
|
from .globals import ArithmeticOperator
|
|
@@ -86,6 +87,7 @@ class ArithmeticExpr(Expr):
|
|
|
86
87
|
return sql.sql.expression.cast(sql.func.floor(left / right), sql.Integer)
|
|
87
88
|
if self.col_type.is_float_type():
|
|
88
89
|
return sql.sql.expression.cast(sql.func.floor(left / right), sql.Float)
|
|
90
|
+
assert False
|
|
89
91
|
|
|
90
92
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
|
91
93
|
op1_val = data_row[self._op1.slot_idx]
|
|
@@ -121,7 +123,7 @@ class ArithmeticExpr(Expr):
|
|
|
121
123
|
return {'operator': self.operator.value, **super()._as_dict()}
|
|
122
124
|
|
|
123
125
|
@classmethod
|
|
124
|
-
def _from_dict(cls, d: dict, components: list[Expr]) ->
|
|
126
|
+
def _from_dict(cls, d: dict, components: list[Expr]) -> ArithmeticExpr:
|
|
125
127
|
assert 'operator' in d
|
|
126
128
|
assert len(components) == 2
|
|
127
129
|
return cls(ArithmeticOperator(d['operator']), components[0], components[1])
|
pixeltable/exprs/array_slice.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Any,
|
|
3
|
+
from typing import Any, Optional, Union
|
|
4
4
|
|
|
5
5
|
import sqlalchemy as sql
|
|
6
6
|
|
|
@@ -15,7 +15,7 @@ class ArraySlice(Expr):
|
|
|
15
15
|
"""
|
|
16
16
|
Slice operation on an array, eg, t.array_col[:, 1:2].
|
|
17
17
|
"""
|
|
18
|
-
def __init__(self, arr: Expr, index:
|
|
18
|
+
def __init__(self, arr: Expr, index: tuple[Union[int, slice], ...]):
|
|
19
19
|
assert arr.col_type.is_array_type()
|
|
20
20
|
# determine result type
|
|
21
21
|
super().__init__(arr.col_type)
|
|
@@ -24,7 +24,7 @@ class ArraySlice(Expr):
|
|
|
24
24
|
self.id = self._create_id()
|
|
25
25
|
|
|
26
26
|
def __str__(self) -> str:
|
|
27
|
-
index_strs:
|
|
27
|
+
index_strs: list[str] = []
|
|
28
28
|
for el in self.index:
|
|
29
29
|
if isinstance(el, int):
|
|
30
30
|
index_strs.append(str(el))
|
|
@@ -39,7 +39,7 @@ class ArraySlice(Expr):
|
|
|
39
39
|
def _equals(self, other: ArraySlice) -> bool:
|
|
40
40
|
return self.index == other.index
|
|
41
41
|
|
|
42
|
-
def _id_attrs(self) ->
|
|
42
|
+
def _id_attrs(self) -> list[tuple[str, Any]]:
|
|
43
43
|
return super()._id_attrs() + [('index', self.index)]
|
|
44
44
|
|
|
45
45
|
def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
|
|
@@ -49,8 +49,8 @@ class ArraySlice(Expr):
|
|
|
49
49
|
val = data_row[self._array.slot_idx]
|
|
50
50
|
data_row[self.slot_idx] = val[self.index]
|
|
51
51
|
|
|
52
|
-
def _as_dict(self) ->
|
|
53
|
-
index = []
|
|
52
|
+
def _as_dict(self) -> dict:
|
|
53
|
+
index: list[Any] = []
|
|
54
54
|
for el in self.index:
|
|
55
55
|
if isinstance(el, slice):
|
|
56
56
|
index.append([el.start, el.stop, el.step])
|
|
@@ -59,7 +59,7 @@ class ArraySlice(Expr):
|
|
|
59
59
|
return {'index': index, **super()._as_dict()}
|
|
60
60
|
|
|
61
61
|
@classmethod
|
|
62
|
-
def _from_dict(cls, d:
|
|
62
|
+
def _from_dict(cls, d: dict, components: list[Expr]) -> ArraySlice:
|
|
63
63
|
assert 'index' in d
|
|
64
64
|
index = []
|
|
65
65
|
for el in d['index']:
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import enum
|
|
4
|
-
from typing import
|
|
4
|
+
from typing import Any, Optional
|
|
5
5
|
|
|
6
6
|
import sqlalchemy as sql
|
|
7
7
|
|
|
8
8
|
import pixeltable.type_system as ts
|
|
9
|
+
from pixeltable import catalog
|
|
9
10
|
from .column_ref import ColumnRef
|
|
10
11
|
from .data_row import DataRow
|
|
11
12
|
from .expr import Expr
|
|
@@ -33,22 +34,36 @@ class ColumnPropertyRef(Expr):
|
|
|
33
34
|
def default_column_name(self) -> Optional[str]:
|
|
34
35
|
return str(self).replace('.', '_')
|
|
35
36
|
|
|
36
|
-
def _equals(self, other:
|
|
37
|
+
def _equals(self, other: ColumnPropertyRef) -> bool:
|
|
37
38
|
return self.prop == other.prop
|
|
38
39
|
|
|
39
|
-
def _id_attrs(self) ->
|
|
40
|
+
def _id_attrs(self) -> list[tuple[str, Any]]:
|
|
40
41
|
return super()._id_attrs() + [('prop', self.prop.value)]
|
|
41
42
|
|
|
42
43
|
@property
|
|
43
44
|
def _col_ref(self) -> ColumnRef:
|
|
44
|
-
|
|
45
|
+
col_ref = self.components[0]
|
|
46
|
+
assert isinstance(col_ref, ColumnRef)
|
|
47
|
+
return col_ref
|
|
45
48
|
|
|
46
49
|
def __str__(self) -> str:
|
|
47
50
|
return f'{self._col_ref}.{self.prop.name.lower()}'
|
|
48
51
|
|
|
52
|
+
def is_error_prop(self) -> bool:
|
|
53
|
+
return self.prop == self.Property.ERRORTYPE or self.prop == self.Property.ERRORMSG
|
|
54
|
+
|
|
49
55
|
def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
|
|
50
56
|
if not self._col_ref.col.is_stored:
|
|
51
57
|
return None
|
|
58
|
+
|
|
59
|
+
# the errortype/-msg properties of a read-validated media column need to be extracted from the DataRow
|
|
60
|
+
if (
|
|
61
|
+
self._col_ref.col.col_type.is_media_type()
|
|
62
|
+
and self._col_ref.col.media_validation == catalog.MediaValidation.ON_READ
|
|
63
|
+
and self.is_error_prop()
|
|
64
|
+
):
|
|
65
|
+
return None
|
|
66
|
+
|
|
52
67
|
if self.prop == self.Property.ERRORTYPE:
|
|
53
68
|
assert self._col_ref.col.sa_errortype_col is not None
|
|
54
69
|
return self._col_ref.col.sa_errortype_col
|
|
@@ -61,18 +76,30 @@ class ColumnPropertyRef(Expr):
|
|
|
61
76
|
return None
|
|
62
77
|
|
|
63
78
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
|
64
|
-
assert self.prop == self.Property.FILEURL or self.prop == self.Property.LOCALPATH
|
|
65
|
-
assert data_row.has_val[self._col_ref.slot_idx]
|
|
66
79
|
if self.prop == self.Property.FILEURL:
|
|
80
|
+
assert data_row.has_val[self._col_ref.slot_idx]
|
|
67
81
|
data_row[self.slot_idx] = data_row.file_urls[self._col_ref.slot_idx]
|
|
68
|
-
|
|
82
|
+
return
|
|
83
|
+
elif self.prop == self.Property.LOCALPATH:
|
|
84
|
+
assert data_row.has_val[self._col_ref.slot_idx]
|
|
69
85
|
data_row[self.slot_idx] = data_row.file_paths[self._col_ref.slot_idx]
|
|
70
|
-
|
|
71
|
-
|
|
86
|
+
return
|
|
87
|
+
elif self.is_error_prop():
|
|
88
|
+
exc = data_row.get_exc(self._col_ref.slot_idx)
|
|
89
|
+
if exc is None:
|
|
90
|
+
data_row[self.slot_idx] = None
|
|
91
|
+
elif self.prop == self.Property.ERRORTYPE:
|
|
92
|
+
data_row[self.slot_idx] = type(exc).__name__
|
|
93
|
+
else:
|
|
94
|
+
data_row[self.slot_idx] = str(exc)
|
|
95
|
+
else:
|
|
96
|
+
assert False
|
|
97
|
+
|
|
98
|
+
def _as_dict(self) -> dict:
|
|
72
99
|
return {'prop': self.prop.value, **super()._as_dict()}
|
|
73
100
|
|
|
74
101
|
@classmethod
|
|
75
|
-
def _from_dict(cls, d:
|
|
102
|
+
def _from_dict(cls, d: dict, components: list[Expr]) -> ColumnPropertyRef:
|
|
76
103
|
assert 'prop' in d
|
|
77
104
|
assert isinstance(components[0], ColumnRef)
|
|
78
105
|
return cls(components[0], cls.Property(d['prop']))
|
pixeltable/exprs/column_ref.py
CHANGED
|
@@ -1,16 +1,18 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
-
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Sequence
|
|
3
4
|
from uuid import UUID
|
|
4
5
|
|
|
5
6
|
import sqlalchemy as sql
|
|
6
7
|
|
|
7
|
-
|
|
8
|
+
import pixeltable.catalog as catalog
|
|
9
|
+
import pixeltable.exceptions as excs
|
|
10
|
+
import pixeltable.iterators as iters
|
|
11
|
+
|
|
8
12
|
from .data_row import DataRow
|
|
13
|
+
from .expr import Expr
|
|
9
14
|
from .row_builder import RowBuilder
|
|
10
15
|
from .sql_element_cache import SqlElementCache
|
|
11
|
-
import pixeltable.iterators as iters
|
|
12
|
-
import pixeltable.exceptions as excs
|
|
13
|
-
import pixeltable.catalog as catalog
|
|
14
16
|
|
|
15
17
|
|
|
16
18
|
class ColumnRef(Expr):
|
|
@@ -19,18 +21,31 @@ class ColumnRef(Expr):
|
|
|
19
21
|
When this reference is created in the context of a view, it can also refer to a column of the view base.
|
|
20
22
|
For that reason, a ColumnRef needs to be serialized with the qualifying table id (column ids are only
|
|
21
23
|
unique in the context of a particular table).
|
|
24
|
+
|
|
25
|
+
Media validation:
|
|
26
|
+
- media validation is potentially cpu-intensive, and it's desirable to schedule and parallelize it during
|
|
27
|
+
general expr evaluation
|
|
28
|
+
- media validation on read is done in ColumnRef.eval()
|
|
29
|
+
- a validating ColumnRef cannot be translated to SQL (because the validation is done in Python)
|
|
30
|
+
- in that case, the ColumnRef also instantiates a second non-validating ColumnRef as a component (= dependency)
|
|
31
|
+
- the non-validating ColumnRef is used for SQL translation
|
|
32
|
+
|
|
33
|
+
TODO:
|
|
34
|
+
separate Exprs (like validating ColumnRefs) from the logical expression tree and instead have RowBuilder
|
|
35
|
+
insert them into the EvalCtxs as needed
|
|
22
36
|
"""
|
|
23
37
|
|
|
24
38
|
col: catalog.Column
|
|
25
39
|
is_unstored_iter_col: bool
|
|
26
40
|
iter_arg_ctx: Optional[RowBuilder.EvalCtx]
|
|
27
41
|
base_rowid_len: int
|
|
28
|
-
base_rowid:
|
|
42
|
+
base_rowid: Sequence[Optional[Any]]
|
|
29
43
|
iterator: Optional[iters.ComponentIterator]
|
|
30
44
|
pos_idx: Optional[int]
|
|
31
45
|
id: int
|
|
46
|
+
perform_validation: bool # if True, performs media validation
|
|
32
47
|
|
|
33
|
-
def __init__(self, col: catalog.Column):
|
|
48
|
+
def __init__(self, col: catalog.Column, perform_validation: Optional[bool] = None):
|
|
34
49
|
super().__init__(col.col_type)
|
|
35
50
|
assert col.tbl is not None
|
|
36
51
|
self.col = col
|
|
@@ -43,17 +58,44 @@ class ColumnRef(Expr):
|
|
|
43
58
|
self.iterator = None
|
|
44
59
|
# index of the position column in the view's primary key; don't try to reference tbl.store_tbl here
|
|
45
60
|
self.pos_idx = col.tbl.num_rowid_columns() - 1 if self.is_unstored_iter_col else None
|
|
61
|
+
|
|
62
|
+
self.perform_validation = False
|
|
63
|
+
if col.col_type.is_media_type():
|
|
64
|
+
# we perform media validation if the column is a media type and the validation is set to ON_READ,
|
|
65
|
+
# unless we're told not to
|
|
66
|
+
if perform_validation is not None:
|
|
67
|
+
self.perform_validation = perform_validation
|
|
68
|
+
else:
|
|
69
|
+
self.perform_validation = (
|
|
70
|
+
col.col_type.is_media_type() and col.media_validation == catalog.MediaValidation.ON_READ
|
|
71
|
+
)
|
|
72
|
+
else:
|
|
73
|
+
assert perform_validation is None or not perform_validation
|
|
74
|
+
if self.perform_validation:
|
|
75
|
+
non_validating_col_ref = ColumnRef(col, perform_validation=False)
|
|
76
|
+
self.components = [non_validating_col_ref]
|
|
46
77
|
self.id = self._create_id()
|
|
47
78
|
|
|
48
79
|
def set_iter_arg_ctx(self, iter_arg_ctx: RowBuilder.EvalCtx) -> None:
|
|
49
80
|
self.iter_arg_ctx = iter_arg_ctx
|
|
50
81
|
assert len(self.iter_arg_ctx.target_slot_idxs) == 1 # a single inline dict
|
|
51
82
|
|
|
52
|
-
def _id_attrs(self) -> list[
|
|
53
|
-
return
|
|
83
|
+
def _id_attrs(self) -> list[tuple[str, Any]]:
|
|
84
|
+
return (
|
|
85
|
+
super()._id_attrs()
|
|
86
|
+
+ [('tbl_id', self.col.tbl.id), ('col_id', self.col.id), ('perform_validation', self.perform_validation)]
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
# override
|
|
90
|
+
def _retarget(self, tbl_versions: dict[UUID, catalog.TableVersion]) -> ColumnRef:
|
|
91
|
+
target = tbl_versions[self.col.tbl.id]
|
|
92
|
+
assert self.col.id in target.cols_by_id
|
|
93
|
+
col = target.cols_by_id[self.col.id]
|
|
94
|
+
return ColumnRef(col)
|
|
54
95
|
|
|
55
96
|
def __getattr__(self, name: str) -> Expr:
|
|
56
97
|
from .column_property_ref import ColumnPropertyRef
|
|
98
|
+
|
|
57
99
|
# resolve column properties
|
|
58
100
|
if name == ColumnPropertyRef.Property.ERRORTYPE.name.lower() \
|
|
59
101
|
or name == ColumnPropertyRef.Property.ERRORMSG.name.lower():
|
|
@@ -82,7 +124,7 @@ class ColumnRef(Expr):
|
|
|
82
124
|
return str(self)
|
|
83
125
|
|
|
84
126
|
def _equals(self, other: ColumnRef) -> bool:
|
|
85
|
-
return self.col == other.col
|
|
127
|
+
return self.col == other.col and self.perform_validation == other.perform_validation
|
|
86
128
|
|
|
87
129
|
def __str__(self) -> str:
|
|
88
130
|
if self.col.name is None:
|
|
@@ -94,9 +136,38 @@ class ColumnRef(Expr):
|
|
|
94
136
|
return f'ColumnRef({self.col!r})'
|
|
95
137
|
|
|
96
138
|
def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
|
|
97
|
-
return self.col.sa_col
|
|
139
|
+
return None if self.perform_validation else self.col.sa_col
|
|
98
140
|
|
|
99
141
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
|
142
|
+
if self.perform_validation:
|
|
143
|
+
# validate media file of our input ColumnRef and if successful, replicate the state of that slot
|
|
144
|
+
# to our slot
|
|
145
|
+
unvalidated_slot_idx = self.components[0].slot_idx
|
|
146
|
+
if data_row.file_paths[unvalidated_slot_idx] is None:
|
|
147
|
+
# no media file to validate, we still need to replicate the value
|
|
148
|
+
assert data_row.file_urls[unvalidated_slot_idx] is None
|
|
149
|
+
val = data_row.vals[unvalidated_slot_idx]
|
|
150
|
+
data_row.vals[self.slot_idx] = val
|
|
151
|
+
data_row.has_val[self.slot_idx] = True
|
|
152
|
+
return
|
|
153
|
+
|
|
154
|
+
try:
|
|
155
|
+
self.col.col_type.validate_media(data_row.file_paths[unvalidated_slot_idx])
|
|
156
|
+
# access the value only after successful validation
|
|
157
|
+
val = data_row[unvalidated_slot_idx]
|
|
158
|
+
data_row.vals[self.slot_idx] = val
|
|
159
|
+
data_row.has_val[self.slot_idx] = True
|
|
160
|
+
# make sure that the validated slot points to the same file as the unvalidated slot
|
|
161
|
+
data_row.file_paths[self.slot_idx] = data_row.file_paths[unvalidated_slot_idx]
|
|
162
|
+
data_row.file_urls[self.slot_idx] = data_row.file_urls[unvalidated_slot_idx]
|
|
163
|
+
return
|
|
164
|
+
except excs.Error as exc:
|
|
165
|
+
# propagate the exception, but ignore it otherwise;
|
|
166
|
+
# media validation errors don't cause exceptions during query execution
|
|
167
|
+
# TODO: allow for different error-handling behavior
|
|
168
|
+
row_builder.set_exc(data_row, self.slot_idx, exc)
|
|
169
|
+
return
|
|
170
|
+
|
|
100
171
|
if not self.is_unstored_iter_col:
|
|
101
172
|
# supply default
|
|
102
173
|
data_row[self.slot_idx] = None
|
|
@@ -115,7 +186,14 @@ class ColumnRef(Expr):
|
|
|
115
186
|
def _as_dict(self) -> dict:
|
|
116
187
|
tbl = self.col.tbl
|
|
117
188
|
version = tbl.version if tbl.is_snapshot else None
|
|
118
|
-
|
|
189
|
+
# we omit self.components, even if this is a validating ColumnRef, because init() will recreate the
|
|
190
|
+
# non-validating component ColumnRef
|
|
191
|
+
return {
|
|
192
|
+
'tbl_id': str(tbl.id),
|
|
193
|
+
'tbl_version': version,
|
|
194
|
+
'col_id': self.col.id,
|
|
195
|
+
'perform_validation': self.perform_validation
|
|
196
|
+
}
|
|
119
197
|
|
|
120
198
|
@classmethod
|
|
121
199
|
def get_column(cls, d: dict) -> catalog.Column:
|
|
@@ -126,6 +204,7 @@ class ColumnRef(Expr):
|
|
|
126
204
|
return col
|
|
127
205
|
|
|
128
206
|
@classmethod
|
|
129
|
-
def _from_dict(cls, d: dict, _: list[Expr]) ->
|
|
207
|
+
def _from_dict(cls, d: dict, _: list[Expr]) -> ColumnRef:
|
|
130
208
|
col = cls.get_column(d)
|
|
131
|
-
|
|
209
|
+
perform_validation = d['perform_validation']
|
|
210
|
+
return cls(col, perform_validation=perform_validation)
|
pixeltable/exprs/comparison.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Any, Optional
|
|
4
4
|
|
|
5
5
|
import sqlalchemy as sql
|
|
6
6
|
|
|
7
7
|
import pixeltable.exceptions as excs
|
|
8
8
|
import pixeltable.index as index
|
|
9
9
|
import pixeltable.type_system as ts
|
|
10
|
+
|
|
10
11
|
from .column_ref import ColumnRef
|
|
11
12
|
from .data_row import DataRow
|
|
12
13
|
from .expr import Expr
|
|
@@ -65,7 +66,7 @@ class Comparison(Expr):
|
|
|
65
66
|
def _op2(self) -> Expr:
|
|
66
67
|
return self.components[1]
|
|
67
68
|
|
|
68
|
-
def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.
|
|
69
|
+
def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
|
|
69
70
|
left = sql_elements.get(self._op1)
|
|
70
71
|
if self.is_search_arg_comparison:
|
|
71
72
|
# reference the index value column if there is an index and this is not a snapshot
|
|
@@ -113,11 +114,10 @@ class Comparison(Expr):
|
|
|
113
114
|
elif self.operator == ComparisonOperator.GE:
|
|
114
115
|
data_row[self.slot_idx] = left >= right
|
|
115
116
|
|
|
116
|
-
def _as_dict(self) ->
|
|
117
|
+
def _as_dict(self) -> dict:
|
|
117
118
|
return {'operator': self.operator.value, **super()._as_dict()}
|
|
118
119
|
|
|
119
120
|
@classmethod
|
|
120
|
-
def _from_dict(cls, d:
|
|
121
|
+
def _from_dict(cls, d: dict, components: list[Expr]) -> Comparison:
|
|
121
122
|
assert 'operator' in d
|
|
122
123
|
return cls(ComparisonOperator(d['operator']), components[0], components[1])
|
|
123
|
-
|