pixeltable 0.4.15__py3-none-any.whl → 0.4.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +4 -0
- pixeltable/catalog/catalog.py +105 -51
- pixeltable/catalog/column.py +7 -2
- pixeltable/catalog/table.py +1 -0
- pixeltable/catalog/table_metadata.py +4 -0
- pixeltable/catalog/table_version.py +99 -78
- pixeltable/catalog/table_version_handle.py +4 -1
- pixeltable/config.py +6 -0
- pixeltable/dataframe.py +10 -5
- pixeltable/env.py +48 -19
- pixeltable/exec/__init__.py +2 -0
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/exec_node.py +1 -1
- pixeltable/exec/expr_eval/evaluators.py +1 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
- pixeltable/exec/expr_eval/globals.py +2 -0
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/object_store_save_node.py +1 -4
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +107 -14
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +10 -11
- pixeltable/exprs/column_property_ref.py +10 -10
- pixeltable/exprs/column_ref.py +2 -2
- pixeltable/exprs/data_row.py +106 -37
- pixeltable/exprs/expr.py +9 -0
- pixeltable/exprs/expr_set.py +14 -7
- pixeltable/exprs/inline_expr.py +2 -19
- pixeltable/exprs/json_path.py +45 -12
- pixeltable/exprs/row_builder.py +54 -22
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/bedrock.py +7 -0
- pixeltable/functions/deepseek.py +11 -4
- pixeltable/functions/llama_cpp.py +7 -0
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/ollama.py +7 -0
- pixeltable/functions/openai.py +4 -4
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/globals.py +10 -4
- pixeltable/io/globals.py +16 -15
- pixeltable/io/table_data_conduit.py +46 -21
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/plan.py +175 -46
- pixeltable/store.py +1 -1
- pixeltable/type_system.py +5 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/exception_handler.py +5 -28
- pixeltable/utils/image.py +7 -0
- pixeltable/utils/misc.py +5 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/METADATA +2 -1
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/RECORD +57 -50
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/licenses/LICENSE +0 -0
pixeltable/exec/sql_node.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import datetime
|
|
1
2
|
import logging
|
|
2
3
|
import warnings
|
|
3
4
|
from decimal import Decimal
|
|
@@ -65,7 +66,7 @@ def print_order_by_clause(clause: OrderByClause) -> str:
|
|
|
65
66
|
|
|
66
67
|
class SqlNode(ExecNode):
|
|
67
68
|
"""
|
|
68
|
-
Materializes data from the store via
|
|
69
|
+
Materializes data from the store via a SQL statement.
|
|
69
70
|
This only provides the select list. The subclasses are responsible for the From clause and any additional clauses.
|
|
70
71
|
The pk columns are not included in the select list.
|
|
71
72
|
If set_pk is True, they are added to the end of the result set when creating the SQL statement
|
|
@@ -82,6 +83,8 @@ class SqlNode(ExecNode):
|
|
|
82
83
|
|
|
83
84
|
tbl: Optional[catalog.TableVersionPath]
|
|
84
85
|
select_list: exprs.ExprSet
|
|
86
|
+
columns: list[catalog.Column] # for which columns to populate DataRow.cell_vals/cell_md
|
|
87
|
+
cell_md_refs: list[exprs.ColumnPropertyRef] # of ColumnRefs which also need DataRow.slot_cellmd for evaluation
|
|
85
88
|
set_pk: bool
|
|
86
89
|
num_pk_cols: int
|
|
87
90
|
py_filter: Optional[exprs.Expr] # a predicate that can only be run in Python
|
|
@@ -89,6 +92,12 @@ class SqlNode(ExecNode):
|
|
|
89
92
|
cte: Optional[sql.CTE]
|
|
90
93
|
sql_elements: exprs.SqlElementCache
|
|
91
94
|
|
|
95
|
+
# execution state
|
|
96
|
+
cellmd_item_idxs: exprs.ExprDict[int] # cellmd expr -> idx in sql select list
|
|
97
|
+
column_item_idxs: dict[catalog.Column, int] # column -> idx in sql select list
|
|
98
|
+
column_cellmd_item_idxs: dict[catalog.Column, int] # column -> idx in sql select list
|
|
99
|
+
result_cursor: sql.engine.CursorResult | None
|
|
100
|
+
|
|
92
101
|
# where_clause/-_element: allow subclass to set one or the other (but not both)
|
|
93
102
|
where_clause: Optional[exprs.Expr]
|
|
94
103
|
where_clause_element: Optional[sql.ColumnElement]
|
|
@@ -101,12 +110,22 @@ class SqlNode(ExecNode):
|
|
|
101
110
|
tbl: Optional[catalog.TableVersionPath],
|
|
102
111
|
row_builder: exprs.RowBuilder,
|
|
103
112
|
select_list: Iterable[exprs.Expr],
|
|
113
|
+
columns: list[catalog.Column],
|
|
104
114
|
sql_elements: exprs.SqlElementCache,
|
|
115
|
+
cell_md_col_refs: list[exprs.ColumnRef] | None = None,
|
|
105
116
|
set_pk: bool = False,
|
|
106
117
|
):
|
|
107
118
|
# create Select stmt
|
|
108
119
|
self.sql_elements = sql_elements
|
|
109
120
|
self.tbl = tbl
|
|
121
|
+
self.columns = columns
|
|
122
|
+
if cell_md_col_refs is not None:
|
|
123
|
+
assert all(ref.col.stores_cellmd for ref in cell_md_col_refs)
|
|
124
|
+
self.cell_md_refs = [
|
|
125
|
+
exprs.ColumnPropertyRef(ref, exprs.ColumnPropertyRef.Property.CELLMD) for ref in cell_md_col_refs
|
|
126
|
+
]
|
|
127
|
+
else:
|
|
128
|
+
self.cell_md_refs = []
|
|
110
129
|
self.select_list = exprs.ExprSet(select_list)
|
|
111
130
|
# unstored iter columns: we also need to retrieve whatever is needed to materialize the iter args
|
|
112
131
|
for iter_arg in row_builder.unstored_iter_args.values():
|
|
@@ -129,6 +148,9 @@ class SqlNode(ExecNode):
|
|
|
129
148
|
assert self.num_pk_cols > 1
|
|
130
149
|
|
|
131
150
|
# additional state
|
|
151
|
+
self.cellmd_item_idxs = exprs.ExprDict()
|
|
152
|
+
self.column_item_idxs = {}
|
|
153
|
+
self.column_cellmd_item_idxs = {}
|
|
132
154
|
self.result_cursor = None
|
|
133
155
|
# the filter is provided by the subclass
|
|
134
156
|
self.py_filter = None
|
|
@@ -144,10 +166,9 @@ class SqlNode(ExecNode):
|
|
|
144
166
|
if tv is not None:
|
|
145
167
|
assert tv.is_validated
|
|
146
168
|
|
|
147
|
-
def
|
|
148
|
-
"""Create a list of pk columns"""
|
|
149
|
-
# we need to retrieve the pk columns
|
|
169
|
+
def _pk_col_items(self) -> list[sql.Column]:
|
|
150
170
|
if self.set_pk:
|
|
171
|
+
# we need to retrieve the pk columns
|
|
151
172
|
assert self.tbl is not None
|
|
152
173
|
assert self.tbl.tbl_version.get().is_validated
|
|
153
174
|
return self.tbl.tbl_version.get().store_tbl.pk_columns()
|
|
@@ -157,7 +178,19 @@ class SqlNode(ExecNode):
|
|
|
157
178
|
"""Create Select from local state"""
|
|
158
179
|
|
|
159
180
|
assert self.sql_elements.contains_all(self.select_list)
|
|
160
|
-
|
|
181
|
+
sql_select_list_exprs = exprs.ExprSet(self.select_list)
|
|
182
|
+
self.cellmd_item_idxs = exprs.ExprDict((ref, sql_select_list_exprs.add(ref)) for ref in self.cell_md_refs)
|
|
183
|
+
column_refs = [exprs.ColumnRef(col) for col in self.columns]
|
|
184
|
+
self.column_item_idxs = {col_ref.col: sql_select_list_exprs.add(col_ref) for col_ref in column_refs}
|
|
185
|
+
column_cellmd_refs = [
|
|
186
|
+
exprs.ColumnPropertyRef(col_ref, exprs.ColumnPropertyRef.Property.CELLMD)
|
|
187
|
+
for col_ref in column_refs
|
|
188
|
+
if col_ref.col.stores_cellmd
|
|
189
|
+
]
|
|
190
|
+
self.column_cellmd_item_idxs = {
|
|
191
|
+
cellmd_ref.col_ref.col: sql_select_list_exprs.add(cellmd_ref) for cellmd_ref in column_cellmd_refs
|
|
192
|
+
}
|
|
193
|
+
sql_select_list = [self.sql_elements.get(e) for e in sql_select_list_exprs] + self._pk_col_items()
|
|
161
194
|
stmt = sql.select(*sql_select_list)
|
|
162
195
|
|
|
163
196
|
where_clause_element = (
|
|
@@ -198,9 +231,7 @@ class SqlNode(ExecNode):
|
|
|
198
231
|
if not keep_pk:
|
|
199
232
|
self.set_pk = False # we don't need the PK if we use this SqlNode as a CTE
|
|
200
233
|
self.cte = self._create_stmt().cte()
|
|
201
|
-
|
|
202
|
-
assert len(self.select_list) + pk_count == len(self.cte.c)
|
|
203
|
-
return self.cte, exprs.ExprDict(zip(self.select_list, self.cte.c)) # skip pk cols
|
|
234
|
+
return self.cte, exprs.ExprDict(zip(list(self.select_list) + self.cell_md_refs, self.cte.c)) # skip pk cols
|
|
204
235
|
|
|
205
236
|
@classmethod
|
|
206
237
|
def retarget_rowid_refs(cls, target: catalog.TableVersionPath, expr_seq: Iterable[exprs.Expr]) -> None:
|
|
@@ -318,24 +349,53 @@ class SqlNode(ExecNode):
|
|
|
318
349
|
output_batch = DataRowBatch(self.row_builder)
|
|
319
350
|
output_row: Optional[exprs.DataRow] = None
|
|
320
351
|
num_rows_returned = 0
|
|
352
|
+
is_using_cockroachdb = Env.get().is_using_cockroachdb
|
|
353
|
+
tzinfo = Env.get().default_time_zone
|
|
321
354
|
|
|
322
355
|
for sql_row in result_cursor:
|
|
323
356
|
output_row = output_batch.add_row(output_row)
|
|
324
357
|
|
|
325
358
|
# populate output_row
|
|
359
|
+
|
|
326
360
|
if self.num_pk_cols > 0:
|
|
327
361
|
output_row.set_pk(tuple(sql_row[-self.num_pk_cols :]))
|
|
362
|
+
|
|
363
|
+
# column copies
|
|
364
|
+
for col, item_idx in self.column_item_idxs.items():
|
|
365
|
+
output_row.cell_vals[col.id] = sql_row[item_idx]
|
|
366
|
+
for col, item_idx in self.column_cellmd_item_idxs.items():
|
|
367
|
+
cell_md_dict = sql_row[item_idx]
|
|
368
|
+
output_row.cell_md[col.id] = exprs.CellMd(**cell_md_dict) if cell_md_dict is not None else None
|
|
369
|
+
|
|
370
|
+
# populate DataRow.slot_md, where requested
|
|
371
|
+
for cellmd_ref, item_idx in self.cellmd_item_idxs.items():
|
|
372
|
+
cell_md_dict = sql_row[item_idx]
|
|
373
|
+
output_row.slot_md[cellmd_ref.col_ref.slot_idx] = (
|
|
374
|
+
exprs.CellMd.from_dict(cell_md_dict) if cell_md_dict is not None else None
|
|
375
|
+
)
|
|
376
|
+
|
|
328
377
|
# copy the output of the SQL query into the output row
|
|
329
378
|
for i, e in enumerate(self.select_list):
|
|
330
379
|
slot_idx = e.slot_idx
|
|
331
|
-
# certain numerical operations can produce Decimals (eg, SUM(<int column>)); we need to convert them
|
|
332
380
|
if isinstance(sql_row[i], Decimal):
|
|
381
|
+
# certain numerical operations can produce Decimals (eg, SUM(<int column>)); we need to convert them
|
|
333
382
|
if e.col_type.is_int_type():
|
|
334
383
|
output_row[slot_idx] = int(sql_row[i])
|
|
335
384
|
elif e.col_type.is_float_type():
|
|
336
385
|
output_row[slot_idx] = float(sql_row[i])
|
|
337
386
|
else:
|
|
338
387
|
raise RuntimeError(f'Unexpected Decimal value for {e}')
|
|
388
|
+
elif is_using_cockroachdb and isinstance(sql_row[i], datetime.datetime):
|
|
389
|
+
# Ensure that the datetime is timezone-aware and in the session time zone
|
|
390
|
+
# cockroachDB returns timestamps in the session time zone, with numeric offset,
|
|
391
|
+
# convert to the session time zone with the requested tzinfo for DST handling
|
|
392
|
+
if e.col_type.is_timestamp_type():
|
|
393
|
+
if isinstance(sql_row[i].tzinfo, datetime.timezone):
|
|
394
|
+
output_row[slot_idx] = sql_row[i].astimezone(tz=tzinfo)
|
|
395
|
+
else:
|
|
396
|
+
output_row[slot_idx] = sql_row[i]
|
|
397
|
+
else:
|
|
398
|
+
raise RuntimeError(f'Unexpected datetime value for {e}')
|
|
339
399
|
else:
|
|
340
400
|
output_row[slot_idx] = sql_row[i]
|
|
341
401
|
|
|
@@ -387,11 +447,21 @@ class SqlScanNode(SqlNode):
|
|
|
387
447
|
tbl: catalog.TableVersionPath,
|
|
388
448
|
row_builder: exprs.RowBuilder,
|
|
389
449
|
select_list: Iterable[exprs.Expr],
|
|
450
|
+
columns: list[catalog.Column],
|
|
451
|
+
cell_md_col_refs: list[exprs.ColumnRef] | None = None,
|
|
390
452
|
set_pk: bool = False,
|
|
391
453
|
exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
|
|
392
454
|
):
|
|
393
455
|
sql_elements = exprs.SqlElementCache()
|
|
394
|
-
super().__init__(
|
|
456
|
+
super().__init__(
|
|
457
|
+
tbl,
|
|
458
|
+
row_builder,
|
|
459
|
+
select_list,
|
|
460
|
+
columns=columns,
|
|
461
|
+
sql_elements=sql_elements,
|
|
462
|
+
set_pk=set_pk,
|
|
463
|
+
cell_md_col_refs=cell_md_col_refs,
|
|
464
|
+
)
|
|
395
465
|
# create Select stmt
|
|
396
466
|
if exact_version_only is None:
|
|
397
467
|
exact_version_only = []
|
|
@@ -423,11 +493,21 @@ class SqlLookupNode(SqlNode):
|
|
|
423
493
|
tbl: catalog.TableVersionPath,
|
|
424
494
|
row_builder: exprs.RowBuilder,
|
|
425
495
|
select_list: Iterable[exprs.Expr],
|
|
496
|
+
columns: list[catalog.Column],
|
|
426
497
|
sa_key_cols: list[sql.Column],
|
|
427
498
|
key_vals: list[tuple],
|
|
499
|
+
cell_md_col_refs: list[exprs.ColumnRef] | None = None,
|
|
428
500
|
):
|
|
429
501
|
sql_elements = exprs.SqlElementCache()
|
|
430
|
-
super().__init__(
|
|
502
|
+
super().__init__(
|
|
503
|
+
tbl,
|
|
504
|
+
row_builder,
|
|
505
|
+
select_list,
|
|
506
|
+
columns=columns,
|
|
507
|
+
sql_elements=sql_elements,
|
|
508
|
+
set_pk=True,
|
|
509
|
+
cell_md_col_refs=cell_md_col_refs,
|
|
510
|
+
)
|
|
431
511
|
# Where clause: (key-col-1, key-col-2, ...) IN ((val-1, val-2, ...), ...)
|
|
432
512
|
self.where_clause_element = sql.tuple_(*sa_key_cols).in_(key_vals)
|
|
433
513
|
|
|
@@ -460,9 +540,10 @@ class SqlAggregationNode(SqlNode):
|
|
|
460
540
|
limit: Optional[int] = None,
|
|
461
541
|
exact_version_only: Optional[list[catalog.TableVersion]] = None,
|
|
462
542
|
):
|
|
543
|
+
assert len(input.cell_md_refs) == 0 # there's no aggregation over json or arrays in SQL
|
|
463
544
|
self.input_cte, input_col_map = input.to_cte()
|
|
464
545
|
sql_elements = exprs.SqlElementCache(input_col_map)
|
|
465
|
-
super().__init__(None, row_builder, select_list, sql_elements)
|
|
546
|
+
super().__init__(None, row_builder, select_list, columns=[], sql_elements=sql_elements)
|
|
466
547
|
self.group_by_items = group_by_items
|
|
467
548
|
|
|
468
549
|
def _create_stmt(self) -> sql.Select:
|
|
@@ -498,7 +579,10 @@ class SqlJoinNode(SqlNode):
|
|
|
498
579
|
input_cte, input_col_map = input_node.to_cte()
|
|
499
580
|
self.input_ctes.append(input_cte)
|
|
500
581
|
sql_elements.extend(input_col_map)
|
|
501
|
-
|
|
582
|
+
cell_md_col_refs = [cell_md_ref.col_ref for input in inputs for cell_md_ref in input.cell_md_refs]
|
|
583
|
+
super().__init__(
|
|
584
|
+
None, row_builder, select_list, columns=[], sql_elements=sql_elements, cell_md_col_refs=cell_md_col_refs
|
|
585
|
+
)
|
|
502
586
|
|
|
503
587
|
def _create_stmt(self) -> sql.Select:
|
|
504
588
|
from pixeltable import plan
|
|
@@ -552,7 +636,16 @@ class SqlSampleNode(SqlNode):
|
|
|
552
636
|
assert self.pk_count > 1
|
|
553
637
|
sql_elements = exprs.SqlElementCache(input_col_map)
|
|
554
638
|
assert sql_elements.contains_all(stratify_exprs)
|
|
555
|
-
|
|
639
|
+
cell_md_col_refs = [cell_md_ref.col_ref for cell_md_ref in input.cell_md_refs]
|
|
640
|
+
super().__init__(
|
|
641
|
+
input.tbl,
|
|
642
|
+
row_builder,
|
|
643
|
+
select_list,
|
|
644
|
+
columns=[],
|
|
645
|
+
sql_elements=sql_elements,
|
|
646
|
+
cell_md_col_refs=cell_md_col_refs,
|
|
647
|
+
set_pk=True,
|
|
648
|
+
)
|
|
556
649
|
self.stratify_exprs = stratify_exprs
|
|
557
650
|
self.sample_clause = sample_clause
|
|
558
651
|
assert isinstance(self.sample_clause.seed, int)
|
pixeltable/exprs/__init__.py
CHANGED
|
@@ -6,7 +6,7 @@ from .column_property_ref import ColumnPropertyRef
|
|
|
6
6
|
from .column_ref import ColumnRef
|
|
7
7
|
from .comparison import Comparison
|
|
8
8
|
from .compound_predicate import CompoundPredicate
|
|
9
|
-
from .data_row import DataRow
|
|
9
|
+
from .data_row import ArrayMd, CellMd, DataRow
|
|
10
10
|
from .expr import Expr
|
|
11
11
|
from .expr_dict import ExprDict
|
|
12
12
|
from .expr_set import ExprSet
|
|
@@ -72,15 +72,16 @@ class ArithmeticExpr(Expr):
|
|
|
72
72
|
return left * right
|
|
73
73
|
if self.operator == ArithmeticOperator.DIV:
|
|
74
74
|
assert self.col_type.is_float_type()
|
|
75
|
-
# Avoid
|
|
75
|
+
# Avoid division by zero errors by converting any zero divisor to NULL.
|
|
76
76
|
# TODO: Should we cast the NULLs to NaNs when they are retrieved back into Python?
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
return sql.
|
|
77
|
+
# These casts cause the computation to take place in float units, rather than DECIMAL.
|
|
78
|
+
nullif = sql.cast(sql.func.nullif(right, 0), self.col_type.to_sa_type())
|
|
79
|
+
return sql.cast(left, self.col_type.to_sa_type()) / nullif
|
|
80
80
|
if self.operator == ArithmeticOperator.MOD:
|
|
81
81
|
if self.col_type.is_int_type():
|
|
82
|
-
|
|
83
|
-
|
|
82
|
+
# Avoid division by zero errors by converting any zero divisor to NULL.
|
|
83
|
+
nullif1 = sql.cast(sql.func.nullif(right, 0), self.col_type.to_sa_type())
|
|
84
|
+
return left % nullif1
|
|
84
85
|
if self.col_type.is_float_type():
|
|
85
86
|
# Postgres does not support modulus for floats
|
|
86
87
|
return None
|
|
@@ -90,11 +91,9 @@ class ArithmeticExpr(Expr):
|
|
|
90
91
|
# We need the behavior to be consistent, so that expressions will evaluate the same way
|
|
91
92
|
# whether or not their operands can be translated to SQL. These SQL clauses should
|
|
92
93
|
# mimic the behavior of Python's // operator.
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
if self.col_type.is_float_type():
|
|
97
|
-
return sql.sql.expression.cast(sql.func.floor(left / nullif), self.col_type.to_sa_type())
|
|
94
|
+
# Avoid division by zero errors by converting any zero divisor to NULL.
|
|
95
|
+
nullif = sql.cast(sql.func.nullif(right, 0), self.col_type.to_sa_type())
|
|
96
|
+
return sql.func.floor(sql.cast(left, self.col_type.to_sa_type()) / nullif)
|
|
98
97
|
raise AssertionError()
|
|
99
98
|
|
|
100
99
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
|
@@ -44,21 +44,21 @@ class ColumnPropertyRef(Expr):
|
|
|
44
44
|
return [*super()._id_attrs(), ('prop', self.prop.value)]
|
|
45
45
|
|
|
46
46
|
@property
|
|
47
|
-
def
|
|
47
|
+
def col_ref(self) -> ColumnRef:
|
|
48
48
|
col_ref = self.components[0]
|
|
49
49
|
assert isinstance(col_ref, ColumnRef)
|
|
50
50
|
return col_ref
|
|
51
51
|
|
|
52
52
|
def __repr__(self) -> str:
|
|
53
|
-
return f'{self.
|
|
53
|
+
return f'{self.col_ref}.{self.prop.name.lower()}'
|
|
54
54
|
|
|
55
55
|
def is_cellmd_prop(self) -> bool:
|
|
56
56
|
return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG, self.Property.CELLMD)
|
|
57
57
|
|
|
58
58
|
def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
|
|
59
|
-
if not self.
|
|
59
|
+
if not self.col_ref.col_handle.get().is_stored:
|
|
60
60
|
return None
|
|
61
|
-
col = self.
|
|
61
|
+
col = self.col_ref.col_handle.get()
|
|
62
62
|
|
|
63
63
|
# the errortype/-msg properties of a read-validated media column need to be extracted from the DataRow
|
|
64
64
|
if (
|
|
@@ -77,7 +77,7 @@ class ColumnPropertyRef(Expr):
|
|
|
77
77
|
return col.sa_cellmd_col
|
|
78
78
|
if self.prop == self.Property.FILEURL:
|
|
79
79
|
# the file url is stored as the column value
|
|
80
|
-
return sql_elements.get(self.
|
|
80
|
+
return sql_elements.get(self.col_ref)
|
|
81
81
|
return None
|
|
82
82
|
|
|
83
83
|
@classmethod
|
|
@@ -87,15 +87,15 @@ class ColumnPropertyRef(Expr):
|
|
|
87
87
|
|
|
88
88
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
|
89
89
|
if self.prop == self.Property.FILEURL:
|
|
90
|
-
assert data_row.has_val[self.
|
|
91
|
-
data_row[self.slot_idx] = data_row.file_urls[self.
|
|
90
|
+
assert data_row.has_val[self.col_ref.slot_idx]
|
|
91
|
+
data_row[self.slot_idx] = data_row.file_urls[self.col_ref.slot_idx]
|
|
92
92
|
return
|
|
93
93
|
elif self.prop == self.Property.LOCALPATH:
|
|
94
|
-
assert data_row.has_val[self.
|
|
95
|
-
data_row[self.slot_idx] = data_row.file_paths[self.
|
|
94
|
+
assert data_row.has_val[self.col_ref.slot_idx]
|
|
95
|
+
data_row[self.slot_idx] = data_row.file_paths[self.col_ref.slot_idx]
|
|
96
96
|
return
|
|
97
97
|
elif self.is_cellmd_prop():
|
|
98
|
-
exc = data_row.get_exc(self.
|
|
98
|
+
exc = data_row.get_exc(self.col_ref.slot_idx)
|
|
99
99
|
if exc is None:
|
|
100
100
|
data_row[self.slot_idx] = None
|
|
101
101
|
elif self.prop == self.Property.ERRORTYPE:
|
pixeltable/exprs/column_ref.py
CHANGED
|
@@ -123,8 +123,8 @@ class ColumnRef(Expr):
|
|
|
123
123
|
name == ColumnPropertyRef.Property.ERRORTYPE.name.lower()
|
|
124
124
|
or name == ColumnPropertyRef.Property.ERRORMSG.name.lower()
|
|
125
125
|
):
|
|
126
|
-
|
|
127
|
-
if not
|
|
126
|
+
is_valid = (self.col.is_computed or self.col.col_type.is_media_type()) and self.col.is_stored
|
|
127
|
+
if not is_valid:
|
|
128
128
|
raise excs.Error(f'{name} only valid for a stored computed or media column: {self}')
|
|
129
129
|
return ColumnPropertyRef(self, ColumnPropertyRef.Property[name.upper()])
|
|
130
130
|
if (
|
pixeltable/exprs/data_row.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import dataclasses
|
|
3
4
|
import datetime
|
|
4
5
|
import io
|
|
5
6
|
import urllib.parse
|
|
@@ -13,15 +14,72 @@ import PIL
|
|
|
13
14
|
import PIL.Image
|
|
14
15
|
import sqlalchemy as sql
|
|
15
16
|
|
|
17
|
+
import pixeltable.utils.image as image_utils
|
|
16
18
|
from pixeltable import catalog, env
|
|
17
19
|
from pixeltable.utils.local_store import TempStore
|
|
20
|
+
from pixeltable.utils.misc import non_none_dict_factory
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclasses.dataclass
|
|
24
|
+
class ArrayMd:
|
|
25
|
+
"""
|
|
26
|
+
Metadata for array cells that are stored externally.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
start: int
|
|
30
|
+
end: int
|
|
31
|
+
|
|
32
|
+
# we store bool arrays as packed bits (uint8 arrays), and need to record the shape to reconstruct the array
|
|
33
|
+
is_bool: bool = False
|
|
34
|
+
shape: tuple[int, ...] | None = None
|
|
35
|
+
|
|
36
|
+
def as_dict(self) -> dict:
|
|
37
|
+
# dict_factory: suppress Nones
|
|
38
|
+
x = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
|
|
39
|
+
return x
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclasses.dataclass
|
|
43
|
+
class CellMd:
|
|
44
|
+
"""
|
|
45
|
+
Content of the cellmd column.
|
|
46
|
+
|
|
47
|
+
All fields are optional, to minimize storage.
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
errortype: str | None = None
|
|
51
|
+
errormsg: str | None = None
|
|
52
|
+
|
|
53
|
+
# a list of file urls that are used to store images and arrays; only set for json and array columns
|
|
54
|
+
# for json columns: a list of all urls referenced in the column value
|
|
55
|
+
# for array columns: a single url
|
|
56
|
+
file_urls: list[str] | None = None
|
|
57
|
+
|
|
58
|
+
array_md: ArrayMd | None = None
|
|
59
|
+
|
|
60
|
+
@classmethod
|
|
61
|
+
def from_dict(cls, d: dict) -> CellMd:
|
|
62
|
+
x: CellMd
|
|
63
|
+
if 'array_md' in d:
|
|
64
|
+
d2 = d.copy()
|
|
65
|
+
del d2['array_md']
|
|
66
|
+
x = cls(**d2, array_md=ArrayMd(**d['array_md']))
|
|
67
|
+
else:
|
|
68
|
+
x = cls(**d)
|
|
69
|
+
return x
|
|
70
|
+
|
|
71
|
+
def as_dict(self) -> dict:
|
|
72
|
+
x = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
|
|
73
|
+
return x
|
|
18
74
|
|
|
19
75
|
|
|
20
76
|
class DataRow:
|
|
21
77
|
"""
|
|
22
78
|
Encapsulates all data and execution state needed by RowBuilder and DataRowBatch:
|
|
23
79
|
- state for in-memory computation
|
|
24
|
-
- state for
|
|
80
|
+
- state needed for expression evaluation
|
|
81
|
+
- containers for output column values
|
|
82
|
+
|
|
25
83
|
This is not meant to be a black-box abstraction.
|
|
26
84
|
|
|
27
85
|
In-memory representations by column type:
|
|
@@ -39,79 +97,92 @@ class DataRow:
|
|
|
39
97
|
- DocumentType: local path if available, otherwise url
|
|
40
98
|
"""
|
|
41
99
|
|
|
100
|
+
# expr evaluation state; indexed by slot idx
|
|
42
101
|
vals: np.ndarray # of object
|
|
43
102
|
has_val: np.ndarray # of bool
|
|
44
103
|
excs: np.ndarray # of object
|
|
45
|
-
|
|
46
|
-
# If `may_have_exc` is False, then we guarantee that no slot has an exception set. This is used to optimize
|
|
47
|
-
# exception handling under normal operation.
|
|
48
|
-
_may_have_exc: bool
|
|
49
|
-
|
|
50
|
-
# expr evaluation state; indexed by slot idx
|
|
51
104
|
missing_slots: np.ndarray # of bool; number of missing dependencies
|
|
52
105
|
missing_dependents: np.ndarray # of int16; number of missing dependents
|
|
53
106
|
is_scheduled: np.ndarray # of bool; True if this slot is scheduled for evaluation
|
|
54
107
|
|
|
55
|
-
#
|
|
56
|
-
|
|
57
|
-
media_slot_idxs: list[int]
|
|
58
|
-
array_slot_idxs: list[int]
|
|
59
|
-
|
|
60
|
-
# the primary key of a store row is a sequence of ints (the number is different for table vs view)
|
|
61
|
-
pk: Optional[tuple[int, ...]]
|
|
108
|
+
# CellMd needed for query execution; needs to be indexed by slot idx, not column id, to work for joins
|
|
109
|
+
slot_md: dict[int, CellMd]
|
|
62
110
|
|
|
63
111
|
# file_urls:
|
|
64
112
|
# - stored url of file for media in vals[i]
|
|
65
113
|
# - None if vals[i] is not media type
|
|
66
114
|
# - not None if file_paths[i] is not None
|
|
115
|
+
# TODO: this is a sparse vector; should it be a dict[int, str]?
|
|
67
116
|
file_urls: np.ndarray # of str
|
|
68
117
|
|
|
69
118
|
# file_paths:
|
|
70
119
|
# - local path of media file in vals[i]; points to the file cache if file_urls[i] is remote
|
|
71
120
|
# - None if vals[i] is not a media type or if there is no local file yet for file_urls[i]
|
|
121
|
+
# TODO: this is a sparse vector; should it be a dict[int, str]?
|
|
72
122
|
file_paths: np.ndarray # of str
|
|
73
123
|
|
|
124
|
+
# If `may_have_exc` is False, then we guarantee that no slot has an exception set. This is used to optimize
|
|
125
|
+
# exception handling under normal operation.
|
|
126
|
+
_may_have_exc: bool
|
|
127
|
+
|
|
128
|
+
# the primary key of a store row is a sequence of ints (the number is different for table vs view)
|
|
129
|
+
pk: Optional[tuple[int, ...]]
|
|
74
130
|
# for nested rows (ie, those produced by JsonMapperDispatcher)
|
|
75
131
|
parent_row: Optional[DataRow]
|
|
76
132
|
parent_slot_idx: Optional[int]
|
|
77
133
|
|
|
134
|
+
# state for table output (insert()/update()); key: column id
|
|
135
|
+
cell_vals: dict[int, Any] # materialized values of output columns, in the format required for the column
|
|
136
|
+
cell_md: dict[int, CellMd]
|
|
137
|
+
|
|
138
|
+
# control structures that are shared across all DataRows in a batch
|
|
139
|
+
img_slot_idxs: list[int]
|
|
140
|
+
media_slot_idxs: list[int]
|
|
141
|
+
array_slot_idxs: list[int]
|
|
142
|
+
json_slot_idxs: list[int]
|
|
143
|
+
|
|
78
144
|
def __init__(
|
|
79
145
|
self,
|
|
80
146
|
size: int,
|
|
81
147
|
img_slot_idxs: list[int],
|
|
82
148
|
media_slot_idxs: list[int],
|
|
83
149
|
array_slot_idxs: list[int],
|
|
150
|
+
json_slot_idxs: list[int],
|
|
84
151
|
parent_row: Optional[DataRow] = None,
|
|
85
152
|
parent_slot_idx: Optional[int] = None,
|
|
86
153
|
):
|
|
87
|
-
self.img_slot_idxs = img_slot_idxs
|
|
88
|
-
self.media_slot_idxs = media_slot_idxs
|
|
89
|
-
self.array_slot_idxs = array_slot_idxs
|
|
90
154
|
self.init(size)
|
|
91
155
|
self.parent_row = parent_row
|
|
92
156
|
self.parent_slot_idx = parent_slot_idx
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
self.
|
|
96
|
-
self.
|
|
97
|
-
|
|
157
|
+
self.img_slot_idxs = img_slot_idxs
|
|
158
|
+
self.media_slot_idxs = media_slot_idxs
|
|
159
|
+
self.array_slot_idxs = array_slot_idxs
|
|
160
|
+
self.json_slot_idxs = json_slot_idxs
|
|
161
|
+
|
|
162
|
+
def init(self, size: int) -> None:
|
|
163
|
+
self.vals = np.full(size, None, dtype=object)
|
|
164
|
+
self.has_val = np.zeros(size, dtype=bool)
|
|
165
|
+
self.excs = np.full(size, None, dtype=object)
|
|
166
|
+
self.missing_slots = np.zeros(size, dtype=bool)
|
|
167
|
+
self.missing_dependents = np.zeros(size, dtype=np.int16)
|
|
168
|
+
self.is_scheduled = np.zeros(size, dtype=bool)
|
|
169
|
+
self.slot_md = {}
|
|
170
|
+
self.file_urls = np.full(size, None, dtype=object)
|
|
171
|
+
self.file_paths = np.full(size, None, dtype=object)
|
|
98
172
|
self._may_have_exc = False
|
|
99
|
-
self.
|
|
100
|
-
self.
|
|
101
|
-
self.is_scheduled = np.zeros(num_slots, dtype=bool)
|
|
173
|
+
self.cell_vals = {}
|
|
174
|
+
self.cell_md = {}
|
|
102
175
|
self.pk = None
|
|
103
|
-
self.file_urls = np.full(num_slots, None, dtype=object)
|
|
104
|
-
self.file_paths = np.full(num_slots, None, dtype=object)
|
|
105
176
|
self.parent_row = None
|
|
106
177
|
self.parent_slot_idx = None
|
|
107
178
|
|
|
108
|
-
def clear(self,
|
|
109
|
-
if
|
|
110
|
-
self.has_val[
|
|
111
|
-
self.vals[
|
|
112
|
-
self.excs[
|
|
113
|
-
self.file_urls[
|
|
114
|
-
self.file_paths[
|
|
179
|
+
def clear(self, slot_idxs: Optional[np.ndarray] = None) -> None:
|
|
180
|
+
if slot_idxs is not None:
|
|
181
|
+
self.has_val[slot_idxs] = False
|
|
182
|
+
self.vals[slot_idxs] = None
|
|
183
|
+
self.excs[slot_idxs] = None
|
|
184
|
+
self.file_urls[slot_idxs] = None
|
|
185
|
+
self.file_paths[slot_idxs] = None
|
|
115
186
|
else:
|
|
116
187
|
self.init(len(self.vals))
|
|
117
188
|
|
|
@@ -292,9 +363,7 @@ class DataRow:
|
|
|
292
363
|
val = self.vals[index]
|
|
293
364
|
format = None
|
|
294
365
|
if isinstance(val, PIL.Image.Image):
|
|
295
|
-
|
|
296
|
-
# In that case, use WebP instead.
|
|
297
|
-
format = 'webp' if val.has_transparency_data else 'jpeg'
|
|
366
|
+
format = image_utils.default_format(val)
|
|
298
367
|
filepath, url = TempStore.save_media_object(val, col, format=format)
|
|
299
368
|
self.file_paths[index] = str(filepath) if filepath is not None else None
|
|
300
369
|
self.vals[index] = None
|
pixeltable/exprs/expr.py
CHANGED
|
@@ -368,6 +368,15 @@ class Expr(abc.ABC):
|
|
|
368
368
|
for e in expr_list:
|
|
369
369
|
yield from e.subexprs(expr_class=expr_class, filter=filter, traverse_matches=traverse_matches)
|
|
370
370
|
|
|
371
|
+
@classmethod
|
|
372
|
+
def list_contains(
|
|
373
|
+
cls,
|
|
374
|
+
expr_list: Iterable[Expr],
|
|
375
|
+
expr_class: type[Expr] | None = None,
|
|
376
|
+
filter: Callable[[Expr], bool] | None = None,
|
|
377
|
+
) -> bool:
|
|
378
|
+
return any(e._contains(expr_class, filter) for e in expr_list)
|
|
379
|
+
|
|
371
380
|
def _contains(self, cls: Optional[type[Expr]] = None, filter: Optional[Callable[[Expr], bool]] = None) -> bool:
|
|
372
381
|
"""
|
|
373
382
|
Returns True if any subexpr is an instance of cls and/or matches filter.
|
pixeltable/exprs/expr_set.py
CHANGED
|
@@ -9,26 +9,33 @@ T = TypeVar('T', bound='Expr')
|
|
|
9
9
|
|
|
10
10
|
class ExprSet(Generic[T]):
|
|
11
11
|
"""
|
|
12
|
-
|
|
12
|
+
An ordered set that also supports indexed lookup (by slot_idx and Expr.id). Exprs are uniquely identified by
|
|
13
|
+
Expr.id.
|
|
13
14
|
"""
|
|
14
15
|
|
|
15
16
|
exprs: dict[int, T] # key: Expr.id
|
|
17
|
+
expr_offsets: dict[int, int] # key: Expr.id, value: offset into self.exprs.keys()
|
|
16
18
|
exprs_by_idx: dict[int, T] # key: slot_idx
|
|
17
19
|
|
|
18
20
|
def __init__(self, elements: Optional[Iterable[T]] = None):
|
|
19
21
|
self.exprs = {}
|
|
22
|
+
self.expr_offsets = {}
|
|
20
23
|
self.exprs_by_idx = {}
|
|
21
24
|
if elements is not None:
|
|
22
25
|
for e in elements:
|
|
23
26
|
self.add(e)
|
|
24
27
|
|
|
25
|
-
def add(self, expr: T) ->
|
|
26
|
-
|
|
27
|
-
|
|
28
|
+
def add(self, expr: T) -> int:
|
|
29
|
+
"""Returns offset corresponding to iteration order"""
|
|
30
|
+
offset = self.expr_offsets.get(expr.id)
|
|
31
|
+
if offset is not None:
|
|
32
|
+
return offset
|
|
33
|
+
offset = len(self.exprs)
|
|
28
34
|
self.exprs[expr.id] = expr
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
35
|
+
self.expr_offsets[expr.id] = offset
|
|
36
|
+
if expr.slot_idx is not None:
|
|
37
|
+
self.exprs_by_idx[expr.slot_idx] = expr
|
|
38
|
+
return offset
|
|
32
39
|
|
|
33
40
|
def update(self, *others: Iterable[T]) -> None:
|
|
34
41
|
for other in others:
|