pixeltable-0.2.17-py3-none-any.whl → pixeltable-0.2.18-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pixeltable might be problematic.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +8 -7
- pixeltable/catalog/column.py +11 -8
- pixeltable/catalog/insertable_table.py +1 -1
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/table.py +20 -13
- pixeltable/catalog/table_version.py +91 -54
- pixeltable/catalog/table_version_path.py +7 -9
- pixeltable/catalog/view.py +2 -1
- pixeltable/dataframe.py +1 -1
- pixeltable/env.py +173 -83
- pixeltable/exec/aggregation_node.py +2 -1
- pixeltable/exec/component_iteration_node.py +1 -1
- pixeltable/exec/sql_node.py +11 -8
- pixeltable/exprs/__init__.py +1 -0
- pixeltable/exprs/arithmetic_expr.py +4 -4
- pixeltable/exprs/array_slice.py +2 -1
- pixeltable/exprs/column_property_ref.py +9 -7
- pixeltable/exprs/column_ref.py +2 -1
- pixeltable/exprs/comparison.py +10 -7
- pixeltable/exprs/compound_predicate.py +3 -2
- pixeltable/exprs/data_row.py +19 -4
- pixeltable/exprs/expr.py +46 -35
- pixeltable/exprs/expr_set.py +32 -9
- pixeltable/exprs/function_call.py +56 -32
- pixeltable/exprs/in_predicate.py +3 -2
- pixeltable/exprs/inline_array.py +2 -1
- pixeltable/exprs/inline_dict.py +2 -1
- pixeltable/exprs/is_null.py +3 -2
- pixeltable/exprs/json_mapper.py +5 -4
- pixeltable/exprs/json_path.py +7 -1
- pixeltable/exprs/literal.py +34 -7
- pixeltable/exprs/method_ref.py +3 -3
- pixeltable/exprs/object_ref.py +6 -5
- pixeltable/exprs/row_builder.py +25 -17
- pixeltable/exprs/rowid_ref.py +2 -1
- pixeltable/exprs/similarity_expr.py +2 -1
- pixeltable/exprs/sql_element_cache.py +30 -0
- pixeltable/exprs/type_cast.py +3 -3
- pixeltable/exprs/variable.py +2 -1
- pixeltable/ext/functions/whisperx.py +4 -4
- pixeltable/ext/functions/yolox.py +6 -6
- pixeltable/func/aggregate_function.py +1 -0
- pixeltable/func/function.py +28 -4
- pixeltable/functions/__init__.py +4 -2
- pixeltable/functions/anthropic.py +15 -5
- pixeltable/functions/fireworks.py +1 -1
- pixeltable/functions/globals.py +6 -1
- pixeltable/functions/huggingface.py +2 -2
- pixeltable/functions/image.py +17 -2
- pixeltable/functions/json.py +5 -5
- pixeltable/functions/mistralai.py +188 -0
- pixeltable/functions/openai.py +6 -10
- pixeltable/functions/string.py +3 -2
- pixeltable/functions/timestamp.py +95 -7
- pixeltable/functions/together.py +4 -4
- pixeltable/functions/video.py +2 -2
- pixeltable/functions/vision.py +27 -17
- pixeltable/functions/whisper.py +1 -1
- pixeltable/io/hf_datasets.py +17 -15
- pixeltable/io/pandas.py +0 -2
- pixeltable/io/parquet.py +15 -14
- pixeltable/iterators/document.py +16 -15
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_19.py +46 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +5 -4
- pixeltable/plan.py +100 -78
- pixeltable/store.py +5 -1
- pixeltable/tool/create_test_db_dump.py +4 -3
- pixeltable/type_system.py +12 -14
- pixeltable/utils/documents.py +45 -42
- pixeltable/utils/formatter.py +2 -2
- {pixeltable-0.2.17.dist-info → pixeltable-0.2.18.dist-info}/METADATA +79 -21
- pixeltable-0.2.18.dist-info/RECORD +147 -0
- pixeltable-0.2.17.dist-info/RECORD +0 -144
- {pixeltable-0.2.17.dist-info → pixeltable-0.2.18.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.17.dist-info → pixeltable-0.2.18.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.17.dist-info → pixeltable-0.2.18.dist-info}/entry_points.txt +0 -0
pixeltable/plan.py
CHANGED
@@ -1,11 +1,11 @@
-
+import itertools
+from typing import Any, Iterable, Optional, Sequence
 from uuid import UUID

 import sqlalchemy as sql

 import pixeltable as pxt
 import pixeltable.exec as exec
-import pixeltable.func as func
 from pixeltable import catalog
 from pixeltable import exceptions as excs
 from pixeltable import exprs
@@ -14,11 +14,12 @@ from pixeltable import exprs
 def _is_agg_fn_call(e: exprs.Expr) -> bool:
     return isinstance(e, exprs.FunctionCall) and e.is_agg_fn_call and not e.is_window_fn_call

+
 def _get_combined_ordering(
-    o1:
-) ->
+    o1: list[tuple[exprs.Expr, bool]], o2: list[tuple[exprs.Expr, bool]]
+) -> list[tuple[exprs.Expr, bool]]:
     """Returns an ordering that's compatible with both o1 and o2, or an empty list if no such ordering exists"""
-    result:
+    result: list[tuple[exprs.Expr, bool]] = []
     # determine combined ordering
     for (e1, asc1), (e2, asc2) in zip(o1, o2):
         if e1.id != e2.id:
@@ -36,18 +37,42 @@ def _get_combined_ordering(
     result.extend(o2[prefix_len:])
     return result

+
 class Analyzer:
-    """
+    """
+    Performs semantic analysis of a query and stores the analysis state.
+    """
+
+    tbl: catalog.TableVersionPath
+    all_exprs: list[exprs.Expr]
+    select_list: list[exprs.Expr]
+    group_by_clause: list[exprs.Expr]
+    order_by_clause: list[tuple[exprs.Expr, bool]]
+
+    # exprs that can be expressed in SQL and are retrieved directly from the store
+    #sql_exprs: list[exprs.Expr]
+
+    sql_elements: exprs.SqlElementCache
+
+    # Where clause of the Select stmt of the SQL scan
+    sql_where_clause: Optional[exprs.Expr]
+
+    # filter predicate applied to output rows of the SQL scan
+    filter: Optional[exprs.Expr]
+
+    agg_fn_calls: list[exprs.FunctionCall]
+    agg_order_by: list[exprs.Expr]

     def __init__(
             self, tbl: catalog.TableVersionPath, select_list: Sequence[exprs.Expr],
-            where_clause: Optional[exprs.Expr] = None, group_by_clause: Optional[
-            order_by_clause: Optional[
+            where_clause: Optional[exprs.Expr] = None, group_by_clause: Optional[list[exprs.Expr]] = None,
+            order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None):
         if group_by_clause is None:
             group_by_clause = []
         if order_by_clause is None:
             order_by_clause = []
         self.tbl = tbl
+        self.sql_elements = exprs.SqlElementCache()

         # remove references to unstored computed cols
         self.select_list = [e.resolve_computed_cols() for e in select_list]
@@ -56,14 +81,10 @@ class Analyzer:
         self.group_by_clause = [e.resolve_computed_cols() for e in group_by_clause]
         self.order_by_clause = [(e.resolve_computed_cols(), asc) for e, asc in order_by_clause]

-
-        self.
-        # filter predicate applied to output rows of the SQL scan
-        self.filter: Optional[exprs.Expr] = None
-        # not executable
-        #self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
+        self.sql_where_clause = None
+        self.filter = None
         if where_clause is not None:
-            where_clause_conjuncts, self.filter = where_clause.split_conjuncts(
+            where_clause_conjuncts, self.filter = where_clause.split_conjuncts(self.sql_elements.contains)
             self.sql_where_clause = exprs.CompoundPredicate.make_conjunction(where_clause_conjuncts)

         # all exprs that are evaluated in Python; not executable
@@ -72,15 +93,8 @@ class Analyzer:
         self.all_exprs.extend(e for e, _ in self.order_by_clause)
         if self.filter is not None:
             self.all_exprs.append(self.filter)
-        self.sql_exprs = list(exprs.Expr.list_subexprs(
-            self.all_exprs, filter=lambda e: e.sql_expr() is not None, traverse_matches=False))
-
-        # sql_exprs: exprs that can be expressed via SQL and are retrieved directly from the store
-        # (we don't want to materialize literals via SQL, so we remove them here)
-        self.sql_exprs = [e for e in self.sql_exprs if not isinstance(e, exprs.Literal)]

-        self.
-        self.agg_order_by: List[exprs.Expr] = []
+        self.agg_order_by = []
         self._analyze_agg()

     def _analyze_agg(self) -> None:
@@ -106,7 +120,7 @@ class Analyzer:
         # check that grouping exprs don't contain aggregates and can be expressed as SQL (we perform sort-based
         # aggregation and rely on the SqlScanNode returning data in the correct order)
         for e in self.group_by_clause:
-            if
+            if not self.sql_elements.contains(e):
                 raise excs.Error(f'Invalid grouping expression, needs to be expressible in SQL: {e}')
             if e._contains(filter=lambda e: _is_agg_fn_call(e)):
                 raise excs.Error(f'Grouping expression contains aggregate function: {e}')
@@ -132,7 +146,7 @@ class Analyzer:
         ))
         self.agg_order_by = order_by

-    def _determine_agg_status(self, e: exprs.Expr, grouping_expr_ids:
+    def _determine_agg_status(self, e: exprs.Expr, grouping_expr_ids: set[int]) -> tuple[bool, bool]:
         """Determine whether expr is the input to or output of an aggregate function.
         Returns:
             (<is output>, <is input>)
@@ -167,17 +181,15 @@ class Analyzer:
         TODO: add EvalCtx for each expr list?
         """
         # maintain original composition of select list
-        row_builder.
-        row_builder.
+        row_builder.set_slot_idxs(self.select_list, remove_duplicates=False)
+        row_builder.set_slot_idxs(self.group_by_clause)
         order_by_exprs = [e for e, _ in self.order_by_clause]
-        row_builder.
-
-        row_builder.substitute_exprs(self.all_exprs)
-        row_builder.substitute_exprs(self.sql_exprs)
+        row_builder.set_slot_idxs(order_by_exprs)
+        row_builder.set_slot_idxs(self.all_exprs)
         if self.filter is not None:
-
-        row_builder.
-        row_builder.
+            row_builder.set_slot_idxs([self.filter])
+        row_builder.set_slot_idxs(self.agg_fn_calls)
+        row_builder.set_slot_idxs(self.agg_order_by)


 class Planner:
@@ -187,12 +199,12 @@ class Planner:
         cls, tbl: catalog.TableVersionPath, where_clause: Optional[exprs.Expr] = None
     ) -> sql.Select:
         stmt = sql.select(sql.func.count())
-        refd_tbl_ids:
+        refd_tbl_ids: set[UUID] = set()
         if where_clause is not None:
             analyzer = cls.analyze(tbl, where_clause)
             if analyzer.filter is not None:
                 raise excs.Error(f'Filter {analyzer.filter} not expressible in SQL')
-            clause_element = analyzer.sql_where_clause.sql_expr()
+            clause_element = analyzer.sql_where_clause.sql_expr(analyzer.sql_elements)
             assert clause_element is not None
             stmt = stmt.where(clause_element)
             refd_tbl_ids = where_clause.tbl_ids()
@@ -267,9 +279,9 @@ class Planner:
     def create_update_plan(
         cls, tbl: catalog.TableVersionPath,
         update_targets: dict[catalog.Column, exprs.Expr],
-        recompute_targets:
+        recompute_targets: list[catalog.Column],
         where_clause: Optional[exprs.Expr], cascade: bool
-    ) ->
+    ) -> tuple[exec.ExecNode, list[str], list[catalog.Column]]:
         """Creates a plan to materialize updated rows.
         The plan:
         - retrieves rows that are visible at the current version of the table
@@ -310,7 +322,7 @@ class Planner:
         select_list.extend(recomputed_exprs)

         # we need to retrieve the PK columns of the existing rows
-        plan = cls.create_query_plan(tbl, select_list, where_clause=where_clause,
+        plan = cls.create_query_plan(tbl, select_list, where_clause=where_clause, ignore_errors=True)
         all_base_cols = copied_cols + updated_cols + list(recomputed_base_cols)  # same order as select_list
         # update row builder with column information
         for i, col in enumerate(all_base_cols):
@@ -356,7 +368,7 @@ class Planner:
         copied_cols = [
             col for col in target.cols if col.is_stored and not col in updated_cols and not col in recomputed_base_cols
         ]
-        select_list = [exprs.ColumnRef(col) for col in copied_cols]
+        select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
         select_list.extend(exprs.ColumnRef(col) for col in updated_cols)

         recomputed_exprs = \
@@ -369,19 +381,21 @@ class Planner:
         # - RowUpdateNode to update the retrieved rows
         # - ExprEvalNode to evaluate the remaining output exprs
         analyzer = Analyzer(tbl, select_list)
-
+        sql_exprs = list(exprs.Expr.list_subexprs(
+            analyzer.all_exprs, filter=analyzer.sql_elements.contains, traverse_matches=False))
+        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], sql_exprs)
         analyzer.finalize(row_builder)
-        sql_lookup_node = exec.SqlLookupNode(tbl, row_builder,
+        sql_lookup_node = exec.SqlLookupNode(tbl, row_builder, sql_exprs, sa_key_cols, key_vals)
         delete_where_clause = sql_lookup_node.where_clause
         col_vals = [{col: row[col].val for col in updated_cols} for row in batch]
         row_update_node = exec.RowUpdateNode(tbl, key_vals, len(rowids) > 0, col_vals, row_builder, sql_lookup_node)
         plan: exec.ExecNode = row_update_node
-        if not cls._is_contained_in(analyzer.select_list,
+        if not cls._is_contained_in(analyzer.select_list, sql_exprs):
             # we need an ExprEvalNode to evaluate the remaining output exprs
-            plan = exec.ExprEvalNode(row_builder, analyzer.select_list,
+            plan = exec.ExprEvalNode(row_builder, analyzer.select_list, sql_exprs, input=plan)
         # update row builder with column information
         all_base_cols = copied_cols + list(updated_cols) + list(recomputed_base_cols)  # same order as select_list
-        row_builder.
+        row_builder.set_slot_idxs(select_list, remove_duplicates=False)
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
@@ -396,7 +410,7 @@ class Planner:

     @classmethod
     def create_view_update_plan(
-        cls, view: catalog.TableVersionPath, recompute_targets:
+        cls, view: catalog.TableVersionPath, recompute_targets: list[catalog.Column]
     ) -> exec.ExecNode:
         """Creates a plan to materialize updated rows for a view, given that the base table has been updated.
         The plan:
@@ -427,8 +441,7 @@ class Planner:

         # we need to retrieve the PK columns of the existing rows
         plan = cls.create_query_plan(
-            view, select_list, where_clause=target.predicate,
-            exact_version_only=view.get_bases())
+            view, select_list, where_clause=target.predicate, ignore_errors=True, exact_version_only=view.get_bases())
         for i, col in enumerate(copied_cols + list(recomputed_cols)):  # same order as select_list
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         # TODO: avoid duplication with view_load_plan() logic (where does this belong?)
@@ -440,7 +453,7 @@ class Planner:
     @classmethod
     def create_view_load_plan(
         cls, view: catalog.TableVersionPath, propagates_insert: bool = False
-    ) ->
+    ) -> tuple[exec.ExecNode, int]:
         """Creates a query plan for populating a view.

         Args:
@@ -459,7 +472,6 @@ class Planner:
         # - we can ignore stored non-computed columns because they have a default value that is supplied directly by
         # the store
         target = view.tbl_version  # the one we need to populate
-        #stored_cols = [c for c in target.cols if c.is_stored and (c.is_computed or target.is_iterator_column(c))]
         stored_cols = [c for c in target.cols if c.is_stored]
         # 2. for component views: iterator args
         iterator_args = [target.iterator_args] if target.iterator_args is not None else []
@@ -477,8 +489,9 @@ class Planner:
         ]
         # if we're propagating an insert, we only want to see those base rows that were created for the current version
         base_analyzer = Analyzer(view, base_output_exprs, where_clause=target.predicate)
+        base_eval_ctx = row_builder.create_eval_ctx(base_analyzer.all_exprs)
         plan = cls._create_query_plan(
-            view.base, row_builder=row_builder, analyzer=base_analyzer, with_pk=True,
+            view.base, row_builder=row_builder, analyzer=base_analyzer, eval_ctx=base_eval_ctx, with_pk=True,
             exact_version_only=view.get_bases() if propagates_insert else [])
         exec_ctx = plan.ctx
         if target.is_component_view():
@@ -494,9 +507,9 @@ class Planner:
         return plan, len(row_builder.default_eval_ctx.target_exprs)

     @classmethod
-    def _determine_ordering(cls, analyzer: Analyzer) ->
+    def _determine_ordering(cls, analyzer: Analyzer) -> list[tuple[exprs.Expr, bool]]:
         """Returns the exprs for the ORDER BY clause of the SqlScanNode"""
-        order_by_items:
+        order_by_items: list[tuple[exprs.Expr, Optional[bool]]] = []
         order_by_origin: Optional[exprs.Expr] = None  # the expr that determines the ordering


@@ -576,7 +589,7 @@ class Planner:
             order_by_origin = unstored_iter_col_refs[0]

         for e in [e for e, _ in order_by_items]:
-            if
+            if not analyzer.sql_elements.contains(e):
                 raise excs.Error(f'order_by element cannot be expressed in SQL: {e}')
         # we do ascending ordering by default, if not specified otherwise
         order_by_items = [(e, True) if asc is None else (e, asc) for e, asc in order_by_items]
@@ -590,7 +603,7 @@ class Planner:

     @classmethod
     def _insert_prefetch_node(
-        cls, tbl_id: UUID, output_exprs:
+        cls, tbl_id: UUID, output_exprs: list[exprs.Expr], row_builder: exprs.RowBuilder, input: exec.ExecNode
     ) -> exec.ExecNode:
         """Returns a CachePrefetchNode into the plan if needed, otherwise returns input"""
         # we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
@@ -608,10 +621,10 @@ class Planner:

     @classmethod
     def create_query_plan(
-        cls, tbl: catalog.TableVersionPath, select_list: Optional[
-        where_clause: Optional[exprs.Expr] = None, group_by_clause: Optional[
-        order_by_clause: Optional[
-
+        cls, tbl: catalog.TableVersionPath, select_list: Optional[list[exprs.Expr]] = None,
+        where_clause: Optional[exprs.Expr] = None, group_by_clause: Optional[list[exprs.Expr]] = None,
+        order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None, limit: Optional[int] = None,
+        ignore_errors: bool = False, exact_version_only: Optional[list[catalog.TableVersion]] = None
     ) -> exec.ExecNode:
         """Return plan for executing a query.
         Updates 'select_list' in place to make it executable.
@@ -628,13 +641,19 @@ class Planner:
         analyzer = Analyzer(
             tbl, select_list, where_clause=where_clause, group_by_clause=group_by_clause,
             order_by_clause=order_by_clause)
-
+        input_exprs = exprs.ExprSet(exprs.Expr.list_subexprs(
+            analyzer.all_exprs, filter=analyzer.sql_elements.contains, traverse_matches=False))
+        # remove Literals from sql_exprs, we don't want to materialize them via a Select
+        input_exprs = exprs.ExprSet(e for e in input_exprs if not isinstance(e, exprs.Literal))
+        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], input_exprs)

         analyzer.finalize(row_builder)
         # select_list: we need to materialize everything that's been collected
         # with_pk: for now, we always retrieve the PK, because we need it for the file cache
+        eval_ctx = row_builder.create_eval_ctx(analyzer.all_exprs)
         plan = cls._create_query_plan(
-            tbl, row_builder, analyzer=analyzer, limit=limit, with_pk=True,
+            tbl, row_builder, analyzer=analyzer, eval_ctx=eval_ctx, limit=limit, with_pk=True,
+            exact_version_only=exact_version_only)
         plan.ctx.ignore_errors = ignore_errors
         select_list.clear()
         select_list.extend(analyzer.select_list)
@@ -643,9 +662,13 @@ class Planner:
     @classmethod
     def _create_query_plan(
         cls, tbl: catalog.TableVersionPath, row_builder: exprs.RowBuilder, analyzer: Analyzer,
-
+        eval_ctx: exprs.RowBuilder.EvalCtx,
+        limit: Optional[int] = None, with_pk: bool = False,
+        exact_version_only: Optional[list[catalog.TableVersion]] = None
    ) -> exec.ExecNode:
        """
+        Create plan to materialize eval_ctx.
+
        Args:
            plan_target: if not None, generate a plan that materializes only expression that can be evaluted
                in the context of that table version (eg, if 'tbl' is a view, 'plan_target' might be the base)
@@ -659,9 +682,11 @@ class Planner:

         order_by_items = cls._determine_ordering(analyzer)
         sql_limit = 0 if is_agg_query else limit  # if we're aggregating, the limit applies to the agg output
-
+        sql_exprs = [
+            e for e in eval_ctx.exprs if analyzer.sql_elements.contains(e) and not isinstance(e, exprs.Literal)
+        ]
         plan = exec.SqlScanNode(
-            tbl, row_builder, select_list=
+            tbl, row_builder, select_list=sql_exprs, where_clause=analyzer.sql_where_clause,
             filter=analyzer.filter, order_by_items=order_by_items,
             limit=sql_limit, set_pk=with_pk, exact_version_only=exact_version_only)
         plan = cls._insert_prefetch_node(tbl.tbl_version.id, analyzer.select_list, row_builder, plan)
@@ -671,29 +696,26 @@ class Planner:
             # args of the agg fn calls
             agg_input = exprs.ExprSet(analyzer.group_by_clause.copy())
             for fn_call in analyzer.agg_fn_calls:
-                agg_input.
-            if not
+                agg_input.update(fn_call.components)
+            if not exprs.ExprSet(sql_exprs).issuperset(agg_input):
                 # we need an ExprEvalNode
-                plan = exec.ExprEvalNode(row_builder, agg_input,
+                plan = exec.ExprEvalNode(row_builder, agg_input, sql_exprs, input=plan)

                 # batch size for aggregation input: this could be the entire table, so we need to divide it into
                 # smaller batches; at the same time, we need to make the batches large enough to amortize the
                 # function call overhead
-                # TODO: increase this if we have NOS calls in order to reduce the cost of switching models, but take
-                # into account the amount of memory needed for intermediate images
                 ctx.batch_size = 16

             plan = exec.AggregationNode(
                 tbl.tbl_version, row_builder, analyzer.group_by_clause, analyzer.agg_fn_calls, agg_input, input=plan)
-            agg_output = analyzer.group_by_clause
-            if not
+            agg_output = exprs.ExprSet(itertools.chain(analyzer.group_by_clause, analyzer.agg_fn_calls))
+            if not agg_output.issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
                 # we need an ExprEvalNode to evaluate the remaining output exprs
-                plan = exec.ExprEvalNode(
-                    row_builder, analyzer.select_list, agg_output, input=plan)
+                plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, agg_output, input=plan)
         else:
-            if not
+            if not exprs.ExprSet(sql_exprs).issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
                 # we need an ExprEvalNode to evaluate the remaining output exprs
-                plan = exec.ExprEvalNode(row_builder,
+                plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, sql_exprs, input=plan)
             # we're returning everything to the user, so we might as well do it in a single batch
             ctx.batch_size = 0
@@ -707,17 +729,17 @@ class Planner:
     @classmethod
     def create_add_column_plan(
         cls, tbl: catalog.TableVersionPath, col: catalog.Column
-    ) ->
+    ) -> tuple[exec.ExecNode, Optional[int]]:
         """Creates a plan for InsertableTable.add_column()
         Returns:
             plan: the plan to execute
             value_expr slot idx for the plan output (for computed cols)
         """
         assert isinstance(tbl, catalog.TableVersionPath)
-        index_info: List[Tuple[catalog.Column, func.Function]] = []
         row_builder = exprs.RowBuilder(output_exprs=[], columns=[col], input_exprs=[])
         analyzer = Analyzer(tbl, row_builder.default_eval_ctx.target_exprs)
-        plan = cls._create_query_plan(
+        plan = cls._create_query_plan(
+            tbl, row_builder=row_builder, analyzer=analyzer, eval_ctx=row_builder.default_eval_ctx, with_pk=True)
         plan.ctx.batch_size = 16
         plan.ctx.show_pbar = True
         plan.ctx.ignore_errors = True
pixeltable/store.py
CHANGED
@@ -53,7 +53,6 @@ class StoreBase:
     def _create_rowid_columns(self) -> List[sql.Column]:
         """Create and return rowid columns"""

-    @abc.abstractmethod
     def _create_system_columns(self) -> List[sql.Column]:
         """Create and return system columns"""
         rowid_cols = self._create_rowid_columns()
@@ -432,6 +431,11 @@ class StoreComponentView(StoreView):

     PK: now also includes pos, the position returned by the ComponentIterator for the base row identified by base_rowid
     """
+
+    rowid_cols: list[sql.Column]
+    pos_col: sql.Column
+    pos_col_idx: int
+
     def __init__(self, catalog_view: catalog.TableVersion):
         super().__init__(catalog_view)

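The store.py additions (`rowid_cols`, `pos_col`, `pos_col_idx`) are PEP 526 class-level annotations: they declare instance-attribute types for static checkers without assigning anything at runtime. A small illustration of the pattern; the class and constructor below are invented for the example:

import sqlalchemy as sql

class ComponentStoreSketch:
    # annotations only: nothing is assigned at class level, so these lines
    # create no runtime attributes; they document what __init__ will set
    rowid_cols: list[sql.Column]
    pos_col: sql.Column
    pos_col_idx: int

    def __init__(self, num_rowid_cols: int) -> None:
        self.rowid_cols = [sql.Column(f'rowid_{i}', sql.BigInteger) for i in range(num_rowid_cols)]
        self.pos_col = self.rowid_cols[-1]  # last rowid component: the iterator position
        self.pos_col_idx = num_rowid_cols - 1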
pixeltable/tool/create_test_db_dump.py
CHANGED
@@ -5,6 +5,7 @@ import os
 import pathlib
 import subprocess
 from typing import Any
+from zoneinfo import ZoneInfo

 import pixeltable_pgserver
 import toml
@@ -15,8 +16,7 @@ from pixeltable.env import Env
 from pixeltable.func import Batch
 from pixeltable.io.external_store import Project
 from pixeltable.tool import embed_udf
-from pixeltable.type_system import
-    StringType, IntType, FloatType, BoolType, TimestampType, JsonType, ImageType
+from pixeltable.type_system import BoolType, FloatType, ImageType, IntType, JsonType, StringType, TimestampType

 _logger = logging.getLogger('pixeltable')

@@ -248,7 +248,8 @@ class Dumper:
         add_column('str_const', 'str')
         add_column('int_const', 5)
         add_column('float_const', 5.0)
-        add_column('timestamp_const_1', datetime.datetime.now(
+        add_column('timestamp_const_1', datetime.datetime.now())
+        add_column('timestamp_const_2', datetime.datetime.now().astimezone(ZoneInfo('America/Anchorage')))

         # type_cast
         add_column('astype', t.c2.astype(FloatType()))
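The dump script now records both a naive and a timezone-aware timestamp column, exercising the timezone-aware storage introduced in type_system.py below. For reference, the standard-library behavior the new column relies on (nothing pixeltable-specific):

import datetime
from zoneinfo import ZoneInfo  # stdlib since Python 3.9

naive = datetime.datetime.now()  # wall-clock time, tzinfo is None
aware = datetime.datetime.now().astimezone(ZoneInfo('America/Anchorage'))

print(naive.tzinfo)  # None
print(aware.tzinfo)  # America/Anchorage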
pixeltable/type_system.py
CHANGED
@@ -15,7 +15,8 @@ import numpy as np
 import PIL.Image
 import sqlalchemy as sql

-
+import pixeltable.exceptions as excs
+from pixeltable.env import Env


 class ColumnType:
@@ -99,7 +100,7 @@ class ColumnType:
         if nullable == self.nullable:
             return self
         else:
-            return self.__class__(nullable=nullable)
+            return self.__class__(nullable=nullable)  # type: ignore[call-arg]

     @classmethod
     def serialize_list(cls, type_list: List[ColumnType]) -> str:
@@ -474,7 +475,7 @@ class TimestampType(ColumnType):
         super().__init__(self.Type.TIMESTAMP, nullable=nullable)

     def to_sa_type(self) -> sql.types.TypeEngine:
-        return sql.TIMESTAMP()
+        return sql.TIMESTAMP(timezone=True)

     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, datetime.datetime):
@@ -496,7 +497,7 @@ class JsonType(ColumnType):
         return JsonType(self.type_spec, nullable=nullable)

     def matches(self, other: ColumnType) -> bool:
-        return other
+        return isinstance(other, JsonType) and self.type_spec == other.type_spec

     def supertype(self, other: ColumnType) -> Optional[JsonType]:
         if not isinstance(other, JsonType):
@@ -558,7 +559,7 @@ class JsonType(ColumnType):
             raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')

     @classmethod
-    def __is_valid_literal(cls, val: Any) ->
+    def __is_valid_literal(cls, val: Any) -> bool:
         if val is None or isinstance(val, (str, int, float, bool)):
             return True
         if isinstance(val, (list, tuple)):
@@ -585,7 +586,7 @@ class ArrayType(ColumnType):
         return ArrayType(self.shape, self.pxt_dtype, nullable=nullable)

     def matches(self, other: ColumnType) -> bool:
-        return other
+        return isinstance(other, ArrayType) and self.shape == other.shape and self.dtype == other.dtype

     def supertype(self, other: ColumnType) -> Optional[ArrayType]:
         if not isinstance(other, ArrayType):
@@ -718,7 +719,7 @@ class ImageType(ColumnType):

     def matches(self, other: ColumnType) -> bool:
         return (
-            other
+            isinstance(other, ImageType)
             and self.width == other.width
             and self.height == other.height
             and self.mode == other.mode
@@ -848,7 +849,7 @@ class DocumentType(ColumnType):
         return DocumentType(doc_formats=self.doc_formats, nullable=nullable)

     def matches(self, other: ColumnType) -> bool:
-        return other
+        return isinstance(other, DocumentType) and self._doc_formats == other._doc_formats

     def to_sa_type(self) -> sql.types.TypeEngine:
         # stored as a file path
@@ -860,9 +861,6 @@ class DocumentType(ColumnType):
     def validate_media(self, val: Any) -> None:
         assert isinstance(val, str)
         from pixeltable.utils.documents import get_document_handle
-
-
-
-            raise excs.Error(f'Not a recognized document format: {val}')
-        except Exception as e:
-            raise excs.Error(f'Not a recognized document format: {val}') from None
+        dh = get_document_handle(val)
+        if dh is None:
+            raise excs.Error(f'Not a recognized document format: {val}')