pixeltable 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry they were published to. It is provided for informational purposes only.
Potentially problematic release: this version of pixeltable might be problematic.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/column.py +6 -3
- pixeltable/catalog/dir.py +1 -1
- pixeltable/catalog/globals.py +15 -6
- pixeltable/catalog/insertable_table.py +23 -8
- pixeltable/catalog/named_function.py +1 -1
- pixeltable/catalog/path_dict.py +4 -4
- pixeltable/catalog/schema_object.py +30 -18
- pixeltable/catalog/table.py +87 -104
- pixeltable/catalog/table_version.py +35 -24
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +15 -8
- pixeltable/dataframe.py +56 -56
- pixeltable/env.py +10 -9
- pixeltable/exec/__init__.py +3 -3
- pixeltable/exec/aggregation_node.py +3 -3
- pixeltable/exec/expr_eval_node.py +3 -3
- pixeltable/exec/in_memory_data_node.py +4 -4
- pixeltable/exec/sql_node.py +4 -1
- pixeltable/exprs/arithmetic_expr.py +41 -16
- pixeltable/exprs/array_slice.py +3 -4
- pixeltable/exprs/column_ref.py +20 -4
- pixeltable/exprs/comparison.py +11 -6
- pixeltable/exprs/data_row.py +3 -0
- pixeltable/exprs/expr.py +88 -23
- pixeltable/exprs/function_call.py +12 -1
- pixeltable/exprs/globals.py +3 -1
- pixeltable/exprs/inline_array.py +4 -4
- pixeltable/exprs/json_path.py +36 -20
- pixeltable/exprs/row_builder.py +4 -4
- pixeltable/exprs/rowid_ref.py +1 -1
- pixeltable/functions/__init__.py +1 -2
- pixeltable/functions/audio.py +32 -0
- pixeltable/functions/huggingface.py +4 -4
- pixeltable/functions/image.py +1 -1
- pixeltable/functions/json.py +46 -0
- pixeltable/functions/video.py +5 -1
- pixeltable/functions/{eval.py → vision.py} +166 -27
- pixeltable/globals.py +57 -28
- pixeltable/io/external_store.py +6 -6
- pixeltable/io/globals.py +13 -14
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/pandas.py +60 -19
- pixeltable/io/parquet.py +14 -14
- pixeltable/iterators/document.py +7 -7
- pixeltable/iterators/video.py +55 -23
- pixeltable/plan.py +58 -29
- pixeltable/store.py +97 -59
- pixeltable/tool/create_test_db_dump.py +17 -11
- pixeltable/type_system.py +155 -143
- pixeltable/utils/pytorch.py +12 -10
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/METADATA +10 -10
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/RECORD +56 -54
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/entry_points.txt +0 -0
pixeltable/plan.py
CHANGED
@@ -1,8 +1,9 @@
-from typing import
+from typing import Any, Iterable, List, Optional, Sequence, Set, Tuple
 from uuid import UUID

 import sqlalchemy as sql

+import pixeltable as pxt
 import pixeltable.exec as exec
 import pixeltable.func as func
 from pixeltable import catalog
@@ -39,7 +40,7 @@ class Analyzer:
     """Class to perform semantic analysis of a query and to store the analysis state"""

     def __init__(
-            self, tbl: catalog.TableVersionPath, select_list:
+            self, tbl: catalog.TableVersionPath, select_list: Sequence[exprs.Expr],
             where_clause: Optional[exprs.Expr] = None, group_by_clause: Optional[List[exprs.Expr]] = None,
             order_by_clause: Optional[List[Tuple[exprs.Expr, bool]]] = None):
         if group_by_clause is None:
@@ -68,7 +69,7 @@ class Analyzer:
         # all exprs that are evaluated in Python; not executable
         self.all_exprs = self.select_list.copy()
         self.all_exprs.extend(self.group_by_clause)
-        self.all_exprs.extend(
+        self.all_exprs.extend(e for e, _ in self.order_by_clause)
         if self.filter is not None:
             self.all_exprs.append(self.filter)
         self.sql_exprs = list(exprs.Expr.list_subexprs(
@@ -84,7 +85,7 @@ class Analyzer:

     def _analyze_agg(self) -> None:
         """Check semantic correctness of aggregation and fill in agg-specific fields of Analyzer"""
-        self.agg_fn_calls = [e for e in self.all_exprs if _is_agg_fn_call(e)]
+        self.agg_fn_calls = [e for e in self.all_exprs if isinstance(e, exprs.FunctionCall) and _is_agg_fn_call(e)]
         if len(self.agg_fn_calls) == 0:
             # nothing to do
             return
@@ -98,7 +99,7 @@ class Analyzer:

         # check that filter doesn't contain aggregates
         if self.filter is not None:
-            agg_fn_calls = [e for e in self.filter.subexprs(filter=lambda e: _is_agg_fn_call(e))]
+            agg_fn_calls = [e for e in self.filter.subexprs(expr_class=exprs.FunctionCall, filter=lambda e: _is_agg_fn_call(e))]
             if len(agg_fn_calls) > 0:
                 raise excs.Error(f'Filter cannot contain aggregate functions: {self.filter}')

@@ -111,7 +112,7 @@ class Analyzer:
                 raise excs.Error(f'Grouping expression contains aggregate function: {e}')

         # check that agg fn calls don't have contradicting ordering requirements
-        order_by:
+        order_by: list[exprs.Expr] = []
         order_by_origin: Optional[exprs.Expr] = None  # the expr that determines the ordering
         for agg_fn_call in self.agg_fn_calls:
             fn_call_order_by = agg_fn_call.get_agg_order_by()
@@ -185,7 +186,7 @@ class Planner:
     def create_count_stmt(
             cls, tbl: catalog.TableVersionPath, where_clause: Optional[exprs.Expr] = None
     ) -> sql.Select:
-        stmt = sql.select(sql.func.count(
+        stmt = sql.select(sql.func.count())
         refd_tbl_ids: Set[UUID] = set()
         if where_clause is not None:
             analyzer = cls.analyze(tbl, where_clause)
@@ -200,7 +201,7 @@ class Planner:

     @classmethod
     def create_insert_plan(
-
+            cls, tbl: catalog.TableVersion, rows: list[dict[str, Any]], ignore_errors: bool
     ) -> exec.ExecNode:
         """Creates a plan for TableVersion.insert()"""
         assert not tbl.is_view()
@@ -214,12 +215,12 @@ class Planner:
         stored_col_info = row_builder.output_slot_idxs()
         stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
         input_col_info = [info for info in stored_col_info if not info.col.is_computed]
-        plan = exec.InMemoryDataNode(tbl, rows, row_builder, tbl.next_rowid)
+        plan: exec.ExecNode = exec.InMemoryDataNode(tbl, rows, row_builder, tbl.next_rowid)

         media_input_cols = [info for info in input_col_info if info.col.col_type.is_media_type()]
         if len(media_input_cols) > 0:
             # prefetch external files for all input column refs for validation
-            plan = exec.CachePrefetchNode(tbl.id, media_input_cols, plan)
+            plan = exec.CachePrefetchNode(tbl.id, media_input_cols, input=plan)
             plan = exec.MediaValidationNode(row_builder, media_input_cols, input=plan)

         computed_exprs = [e for e in row_builder.default_eval_ctx.target_exprs if not isinstance(e, exprs.ColumnRef)]
@@ -234,6 +235,34 @@ class Planner:
                 ignore_errors=ignore_errors))
         return plan

+    @classmethod
+    def create_df_insert_plan(
+        cls,
+        tbl: catalog.TableVersion,
+        df: 'pxt.DataFrame',
+        ignore_errors: bool
+    ) -> exec.ExecNode:
+        assert not tbl.is_view()
+        plan = df._create_query_plan()  # ExecNode constructed by the DataFrame
+
+        # Modify the plan RowBuilder to register the output columns
+        for col_name, expr in zip(df.schema.keys(), df._select_list_exprs):
+            assert col_name in tbl.cols_by_name
+            col = tbl.cols_by_name[col_name]
+            plan.row_builder.add_table_column(col, expr.slot_idx)
+
+        stored_col_info = plan.row_builder.output_slot_idxs()
+        stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
+        plan.set_stored_img_cols(stored_img_col_info)
+
+        plan.set_ctx(
+            exec.ExecContext(
+                plan.row_builder, batch_size=0, show_pbar=True, num_computed_exprs=0,
+                ignore_errors=ignore_errors))
+        plan.ctx.num_rows = 0  # Unknown
+
+        return plan
+
     @classmethod
     def create_update_plan(
             cls, tbl: catalog.TableVersionPath,
@@ -258,7 +287,7 @@ class Planner:
         target = tbl.tbl_version  # the one we need to update
         updated_cols = list(update_targets.keys())
         if len(recompute_targets) > 0:
-            recomputed_cols = recompute_targets
+            recomputed_cols = set(recompute_targets)
         else:
             recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else set()
         # regardless of cascade, we need to update all indices on any updated column
@@ -270,13 +299,13 @@ class Planner:
         copied_cols = [
             col for col in target.cols if col.is_stored and not col in updated_cols and not col in recomputed_base_cols
         ]
-        select_list = [exprs.ColumnRef(col) for col in copied_cols]
+        select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
         select_list.extend(update_targets.values())

         recomputed_exprs = \
             [c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols]
         # recomputed cols reference the new values of the updated cols
-        spec = {exprs.ColumnRef(col): e for col, e in update_targets.items()}
+        spec: dict[exprs.Expr, exprs.Expr] = {exprs.ColumnRef(col): e for col, e in update_targets.items()}
         exprs.Expr.list_substitute(recomputed_exprs, spec)
         select_list.extend(recomputed_exprs)

@@ -284,16 +313,17 @@ class Planner:
         plan = cls.create_query_plan(tbl, select_list, where_clause=where_clause, with_pk=True, ignore_errors=True)
         all_base_cols = copied_cols + updated_cols + list(recomputed_base_cols)  # same order as select_list
         # update row builder with column information
-
+        for i, col in enumerate(all_base_cols):
+            plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols

     @classmethod
     def create_batch_update_plan(
-
-
-
-    ) ->
+            cls, tbl: catalog.TableVersionPath,
+            batch: list[dict[catalog.Column, exprs.Expr]], rowids: list[tuple[int, ...]],
+            cascade: bool
+    ) -> tuple[exec.ExecNode, exec.RowUpdateNode, sql.ColumnElement[bool], list[catalog.Column], list[catalog.Column]]:
         """
         Returns:
         - root node of the plan to produce the updated rows
@@ -327,7 +357,7 @@ class Planner:
             col for col in target.cols if col.is_stored and not col in updated_cols and not col in recomputed_base_cols
         ]
         select_list = [exprs.ColumnRef(col) for col in copied_cols]
-        select_list.extend(
+        select_list.extend(exprs.ColumnRef(col) for col in updated_cols)

         recomputed_exprs = \
             [c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols]
@@ -341,10 +371,11 @@ class Planner:
         analyzer = Analyzer(tbl, select_list)
         row_builder = exprs.RowBuilder(analyzer.all_exprs, [], analyzer.sql_exprs)
         analyzer.finalize(row_builder)
-
-        delete_where_clause =
+        sql_lookup_node = exec.SqlLookupNode(tbl, row_builder, analyzer.sql_exprs, sa_key_cols, key_vals)
+        delete_where_clause = sql_lookup_node.where_clause
         col_vals = [{col: row[col].val for col in updated_cols} for row in batch]
-
+        row_update_node = exec.RowUpdateNode(tbl, key_vals, len(rowids) > 0, col_vals, row_builder, sql_lookup_node)
+        plan: exec.ExecNode = row_update_node
         if not cls._is_contained_in(analyzer.select_list, analyzer.sql_exprs):
             # we need an ExprEvalNode to evaluate the remaining output exprs
             plan = exec.ExprEvalNode(row_builder, analyzer.select_list, analyzer.sql_exprs, input=plan)
@@ -388,7 +419,7 @@ class Planner:
         # retrieve all stored cols and all target exprs
         recomputed_cols = set(recompute_targets.copy())
         copied_cols = [col for col in target.cols if col.is_stored and not col in recomputed_cols]
-        select_list = [exprs.ColumnRef(col) for col in copied_cols]
+        select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
         # resolve recomputed exprs to stored columns in the base
         recomputed_exprs = \
             [c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_cols) for c in recomputed_cols]
@@ -398,10 +429,8 @@ class Planner:
         plan = cls.create_query_plan(
             view, select_list, where_clause=target.predicate, with_pk=True, ignore_errors=True,
             exact_version_only=view.get_bases())
-
+        for i, col in enumerate(copied_cols + list(recomputed_cols)):  # same order as select_list
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
-            for i, col in enumerate(copied_cols + list(recomputed_cols))  # same order as select_list
-        ]
         # TODO: avoid duplication with view_load_plan() logic (where does this belong?)
         stored_img_col_info = \
             [info for info in plan.row_builder.output_slot_idxs() if info.col.col_type.is_image_type()]
@@ -532,7 +561,7 @@ class Planner:
                 return False
             tbl = e.col.tbl
             return tbl.is_component_view() and tbl.is_iterator_column(e.col) and not e.col.is_stored
-        unstored_iter_col_refs = list(exprs.Expr.list_subexprs(analyzer.all_exprs, filter=refs_unstored_iter_col))
+        unstored_iter_col_refs = list(exprs.Expr.list_subexprs(analyzer.all_exprs, expr_class=exprs.ColumnRef, filter=refs_unstored_iter_col))
         if len(unstored_iter_col_refs) > 0 and len(order_by_items) == 0:
             # we don't already have a user-requested ordering and we access unstored iterator columns:
             # order by the primary key of the component view, which minimizes the number of iterator instantiations
@@ -554,9 +583,9 @@ class Planner:
         return order_by_items

     @classmethod
-    def _is_contained_in(cls, l1:
+    def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
         """Returns True if l1 is contained in l2"""
-        s1, s2 = set(
+        s1, s2 = set(e.id for e in l1), set(e.id for e in l2)
         return s1 <= s2

     @classmethod
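Note: the new create_df_insert_plan builds an insert pipeline directly from a DataFrame's own query plan, which suggests that query results can be written into a table without first being materialized as Python row dicts. A minimal sketch of what that might look like at the API level (hypothetical usage; the table names and the DataFrame-accepting insert() overload are assumptions, not shown in this diff):

import pixeltable as pxt

# Hypothetical sketch: insert the result of a query into another table.
# Assumes Table.insert() accepts a pxt.DataFrame; this diff only shows the planner side.
src = pxt.get_table('films_raw')    # assumed pre-existing table
dst = pxt.create_table('films_clean', {'title': pxt.StringType(), 'year': pxt.IntType()})
dst.insert(src.where(src.year >= 2000).select(title=src.title, year=src.year))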
pixeltable/store.py
CHANGED
@@ -7,7 +7,7 @@ import sys
 import urllib.parse
 import urllib.request
 import warnings
-from typing import Optional, Dict, Any, List, Tuple, Set
+from typing import Optional, Dict, Any, List, Tuple, Set, Union

 import sqlalchemy as sql
 from tqdm import tqdm, TqdmWarning
@@ -15,10 +15,8 @@ from tqdm import tqdm, TqdmWarning
 import pixeltable.catalog as catalog
 import pixeltable.env as env
 from pixeltable import exprs
-import pixeltable.exceptions as excs
 from pixeltable.exec import ExecNode
 from pixeltable.metadata import schema
-from pixeltable.type_system import StringType
 from pixeltable.utils.media_store import MediaStore
 from pixeltable.utils.sql import log_stmt, log_explain

@@ -34,10 +32,15 @@ class StoreBase:
     - v_max: version at which the row was deleted (or MAX_VERSION if it's still live)
     """

+    __INSERT_BATCH_SIZE = 1000
+
     def __init__(self, tbl_version: catalog.TableVersion):
         self.tbl_version = tbl_version
         self.sa_md = sql.MetaData()
         self.sa_tbl: Optional[sql.Table] = None
+        # We need to declare a `base` variable here, even though it's only defined for instances of `StoreView`,
+        # since it's referenced by various methods of `StoreBase`
+        self.base = None if tbl_version.base is None else tbl_version.base.store_tbl
         self.create_sa_tbl()

     def pk_columns(self) -> List[sql.Column]:
@@ -49,7 +52,6 @@ class StoreBase:
     @abc.abstractmethod
     def _create_rowid_columns(self) -> List[sql.Column]:
         """Create and return rowid columns"""
-        pass

     @abc.abstractmethod
     def _create_system_columns(self) -> List[sql.Column]:
@@ -61,7 +63,6 @@ class StoreBase:
         self._pk_columns = [*rowid_cols, self.v_min_col]
         return [*rowid_cols, self.v_min_col, self.v_max_col]

-
     def create_sa_tbl(self) -> None:
         """Create self.sa_tbl from self.tbl_version."""
         system_cols = self._create_system_columns()
@@ -96,14 +97,12 @@ class StoreBase:
         self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)

     @abc.abstractmethod
-    def _rowid_join_predicate(self) -> sql.
+    def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
         """Return predicate for rowid joins to all bases"""
-        pass

     @abc.abstractmethod
     def _storage_name(self) -> str:
         """Return the name of the data store table"""
-        pass

     def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column, v_min: int) -> str:
         """Move tmp media file with given url to Env.media_dir and return new url, or given url if not a tmp_dir file"""
@@ -158,10 +157,12 @@ class StoreBase:

     def count(self, conn: Optional[sql.engine.Connection] = None) -> int:
         """Return the number of rows visible in self.tbl_version"""
-        stmt =
-            .
-            .
+        stmt = (
+            sql.select(sql.func.count('*'))  # type: ignore
+            .select_from(self.sa_tbl)
+            .where(self.v_min_col <= self.tbl_version.version)
             .where(self.v_max_col > self.tbl_version.version)
+        )
         if conn is None:
             with env.Env.get().engine.connect() as conn:
                 result = conn.execute(stmt).scalar_one()
@@ -191,12 +192,12 @@ class StoreBase:
         added_storage_cols = [col.store_name()]
         if col.records_errors:
             # we also need to create the errormsg and errortype storage cols
-            stmt = (f'ALTER TABLE {self._storage_name()} '
-
-            conn.execute(
-            stmt = (f'ALTER TABLE {self._storage_name()} '
-
-            conn.execute(
+            stmt = sql.text(f'ALTER TABLE {self._storage_name()} '
+                            f'ADD COLUMN {col.errormsg_store_name()} VARCHAR DEFAULT NULL')
+            conn.execute(stmt)
+            stmt = sql.text(f'ALTER TABLE {self._storage_name()} '
+                            f'ADD COLUMN {col.errortype_store_name()} VARCHAR DEFAULT NULL')
+            conn.execute(stmt)
             added_storage_cols.extend([col.errormsg_store_name(), col.errortype_store_name()])
         self.create_sa_tbl()
         _logger.info(f'Added columns {added_storage_cols} to storage table {self._storage_name()}')
@@ -223,35 +224,70 @@ class StoreBase:
         """
         num_excs = 0
         num_rows = 0
-        for row_batch in exec_plan:
-            num_rows += len(row_batch)
-            for result_row in row_batch:
-                values_dict: Dict[sql.Column, Any] = {}
-
-                if col.is_computed:
-                    if result_row.has_exc(value_expr_slot_idx):
-                        num_excs += 1
-                        value_exc = result_row.get_exc(value_expr_slot_idx)
-                        # we store a NULL value and record the exception/exc type
-                        error_type = type(value_exc).__name__
-                        error_msg = str(value_exc)
-                        values_dict = {
-                            col.sa_col: None,
-                            col.sa_errortype_col: error_type,
-                            col.sa_errormsg_col: error_msg
-                        }
-                    else:
-                        val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
-                        if col.col_type.is_media_type():
-                            val = self._move_tmp_media_file(val, col, result_row.pk[-1])
-                        values_dict = {col.sa_col: val}
-
-                update_stmt = sql.update(self.sa_tbl).values(values_dict)
-                for pk_col, pk_val in zip(self.pk_columns(), result_row.pk):
-                    update_stmt = update_stmt.where(pk_col == pk_val)
-                log_stmt(_logger, update_stmt)
-                conn.execute(update_stmt)

+        # create temp table to store output of exec_plan, with the same primary key as the store table
+        tmp_name = f'temp_{self._storage_name()}'
+        tmp_pk_cols = [sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns()]
+        tmp_cols = tmp_pk_cols.copy()
+        tmp_val_col = sql.Column(col.sa_col.name, col.sa_col.type)
+        tmp_cols.append(tmp_val_col)
+        # add error columns if the store column records errors
+        if col.records_errors:
+            tmp_errortype_col = sql.Column(col.sa_errortype_col.name, col.sa_errortype_col.type)
+            tmp_cols.append(tmp_errortype_col)
+            tmp_errormsg_col = sql.Column(col.sa_errormsg_col.name, col.sa_errormsg_col.type)
+            tmp_cols.append(tmp_errormsg_col)
+        tmp_tbl = sql.Table(tmp_name, self.sa_md, *tmp_cols, prefixes=['TEMPORARY'])
+        tmp_tbl.create(bind=conn)
+
+        try:
+            # insert rows from exec_plan into temp table
+            for row_batch in exec_plan:
+                num_rows += len(row_batch)
+                tbl_rows: list[dict[str, Any]] = []
+                for result_row in row_batch:
+                    tbl_row: dict[str, Any] = {}
+                    for pk_col, pk_val in zip(self.pk_columns(), result_row.pk):
+                        tbl_row[pk_col.name] = pk_val
+
+                    if col.is_computed:
+                        if result_row.has_exc(value_expr_slot_idx):
+                            num_excs += 1
+                            value_exc = result_row.get_exc(value_expr_slot_idx)
+                            # we store a NULL value and record the exception/exc type
+                            error_type = type(value_exc).__name__
+                            error_msg = str(value_exc)
+                            tbl_row[col.sa_col.name] = None
+                            tbl_row[col.sa_errortype_col.name] = error_type
+                            tbl_row[col.sa_errormsg_col.name] = error_msg
+                        else:
+                            val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
+                            if col.col_type.is_media_type():
+                                val = self._move_tmp_media_file(val, col, result_row.pk[-1])
+                            tbl_row[col.sa_col.name] = val
+                            if col.records_errors:
+                                tbl_row[col.sa_errortype_col.name] = None
+                                tbl_row[col.sa_errormsg_col.name] = None
+
+                    tbl_rows.append(tbl_row)
+                conn.execute(sql.insert(tmp_tbl), tbl_rows)
+
+            # update store table with values from temp table
+            update_stmt = sql.update(self.sa_tbl)
+            for pk_col, tmp_pk_col in zip(self.pk_columns(), tmp_pk_cols):
+                update_stmt = update_stmt.where(pk_col == tmp_pk_col)
+            update_stmt = update_stmt.values({col.sa_col: tmp_val_col})
+            if col.records_errors:
+                update_stmt = update_stmt.values({
+                    col.sa_errortype_col: tmp_errortype_col,
+                    col.sa_errormsg_col: tmp_errormsg_col
+                })
+            log_explain(_logger, update_stmt, conn)
+            conn.execute(update_stmt)
+
+        finally:
+            tmp_tbl.drop(bind=conn)
+            self.sa_md.remove(tmp_tbl)
         return num_excs

     def insert_rows(
@@ -264,7 +300,6 @@ class StoreBase:
         """
         assert v_min is not None
         exec_plan.ctx.set_conn(conn)
-        batch_size = 16  # TODO: is this a good batch size?
         # TODO: total?
         num_excs = 0
         num_rows = 0
@@ -276,10 +311,10 @@ class StoreBase:
             exec_plan.open()
             for row_batch in exec_plan:
                 num_rows += len(row_batch)
-                for batch_start_idx in range(0, len(row_batch),
+                for batch_start_idx in range(0, len(row_batch), self.__INSERT_BATCH_SIZE):
                     # compute batch of rows and convert them into table rows
                     table_rows: List[Dict[str, Any]] = []
-                    for row_idx in range(batch_start_idx, min(batch_start_idx +
+                    for row_idx in range(batch_start_idx, min(batch_start_idx + self.__INSERT_BATCH_SIZE, len(row_batch))):
                         row = row_batch[row_idx]
                         table_row, num_row_exc = \
                             self._create_table_row(row, row_builder, media_cols, cols_with_excs, v_min=v_min)
@@ -295,6 +330,8 @@ class StoreBase:
                                 file=sys.stdout
                             )
                         progress_bar.update(1)
+
+                    # insert batch of rows
                     self._move_tmp_media_files(table_rows, media_cols, v_min)
                     conn.execute(sql.insert(self.sa_tbl), table_rows)
                     if progress_bar is not None:
@@ -303,7 +340,7 @@ class StoreBase:
         finally:
             exec_plan.close()

-    def _versions_clause(self, versions:
+    def _versions_clause(self, versions: list[Optional[int]], match_on_vmin: bool) -> sql.ColumnElement[bool]:
         """Return filter for base versions"""
         v = versions[0]
         if v is None:
@@ -318,7 +355,7 @@ class StoreBase:

     def delete_rows(
             self, current_version: int, base_versions: List[Optional[int]], match_on_vmin: bool,
-            where_clause: Optional[sql.
+            where_clause: Optional[sql.ColumnElement[bool]], conn: sql.engine.Connection) -> int:
         """Mark rows as deleted that are live and were created prior to current_version.
         Also: populate the undo columns
         Args:
@@ -338,17 +375,19 @@ class StoreBase:
         rowid_join_clause = self._rowid_join_predicate()
         base_versions_clause = sql.true() if len(base_versions) == 0 \
             else self.base._versions_clause(base_versions, match_on_vmin)
-        set_clause = {self.v_max_col: current_version}
+        set_clause: dict[sql.Column, Union[int, sql.Column]] = {self.v_max_col: current_version}
         for index_info in self.tbl_version.idxs_by_name.values():
             # copy value column to undo column
             set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
             # set value column to NULL
             set_clause[index_info.val_col.sa_col] = None
-        stmt =
-            .
-            .
-            .where(
+        stmt = (
+            sql.update(self.sa_tbl)
+            .values(set_clause)
+            .where(where_clause)
+            .where(rowid_join_clause)
             .where(base_versions_clause)
+        )
         log_explain(_logger, stmt, conn)
         status = conn.execute(stmt)
         return status.rowcount
@@ -366,14 +405,13 @@ class StoreTable(StoreBase):
     def _storage_name(self) -> str:
         return f'tbl_{self.tbl_version.id.hex}'

-    def _rowid_join_predicate(self) -> sql.
+    def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
         return sql.true()


 class StoreView(StoreBase):
     def __init__(self, catalog_view: catalog.TableVersion):
         assert catalog_view.is_view()
-        self.base = catalog_view.base.store_tbl
         super().__init__(catalog_view)

     def _create_rowid_columns(self) -> List[sql.Column]:
@@ -384,7 +422,7 @@ class StoreView(StoreBase):
     def _storage_name(self) -> str:
         return f'view_{self.tbl_version.id.hex}'

-    def _rowid_join_predicate(self) -> sql.
+    def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
         return sql.and_(
             self.base._rowid_join_predicate(),
             *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), self.base.rowid_columns())])
@@ -411,7 +449,7 @@ class StoreComponentView(StoreView):
         # we need to fix up the 'pos' column in TableVersion
         self.tbl_version.cols_by_name['pos'].sa_col = self.pos_col

-    def _rowid_join_predicate(self) -> sql.
+    def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
         return sql.and_(
             self.base._rowid_join_predicate(),
             *[c1 == c2 for c1, c2 in zip(self.rowid_columns()[:-1], self.base.rowid_columns())])
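Note: the rewritten update path above replaces one UPDATE per row with a staged bulk update: the new values are first inserted into a TEMPORARY table that shares the store table's primary key, and a single correlated UPDATE then applies them. A condensed, self-contained sketch of that pattern in plain SQLAlchemy (table and column names are placeholders, not pixeltable's):

import sqlalchemy as sql

md = sql.MetaData()
store = sql.Table(
    'store_tbl', md,
    sql.Column('rowid', sql.BigInteger, primary_key=True),
    sql.Column('val', sql.String))
tmp = sql.Table(
    'temp_store_tbl', md,
    sql.Column('rowid', sql.BigInteger, primary_key=True),
    sql.Column('val', sql.String),
    prefixes=['TEMPORARY'])

def bulk_update(conn: sql.engine.Connection, rows: list[dict]) -> None:
    tmp.create(bind=conn)
    try:
        # stage all new values with one executemany insert
        conn.execute(sql.insert(tmp), rows)
        # one correlated UPDATE (UPDATE ... FROM on Postgres) instead of one UPDATE per row
        stmt = (
            sql.update(store)
            .where(store.c.rowid == tmp.c.rowid)
            .values({store.c.val: tmp.c.val})
        )
        conn.execute(stmt)
    finally:
        tmp.drop(bind=conn)
        md.remove(tmp)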
pixeltable/tool/create_test_db_dump.py
CHANGED
@@ -6,7 +6,7 @@ import pathlib
 import subprocess
 from typing import Any

-import
+import pixeltable_pgserver
 import toml

 import pixeltable as pxt
@@ -41,22 +41,27 @@ class Dumper:
         md_version = metadata.VERSION
         dump_file = self.output_dir / f'pixeltable-v{md_version:03d}-test.dump.gz'
         _logger.info(f'Creating database dump at: {dump_file}')
-        pg_package_dir = os.path.dirname(
+        pg_package_dir = os.path.dirname(pixeltable_pgserver.__file__)
         pg_dump_binary = f'{pg_package_dir}/pginstall/bin/pg_dump'
         _logger.info(f'Using pg_dump binary at: {pg_dump_binary}')
+        # We need the raw DB URL, without a driver qualifier. (The driver qualifier is needed by
+        # SQLAlchemy, but command-line Postgres won't know how to interpret it.)
+        db_url = Env.get()._db_server.get_uri(Env.get()._db_name)
         with open(dump_file, 'wb') as dump:
             pg_dump_process = subprocess.Popen(
-
+                (pg_dump_binary, db_url, '-U', 'postgres', '-Fc'),
                 stdout=subprocess.PIPE
             )
             subprocess.run(
-
+                ('gzip', '-9'),
                 stdin=pg_dump_process.stdout,
                 stdout=dump,
                 check=True
             )
+        if pg_dump_process.poll() != 0:
+            raise RuntimeError(f'pg_dump failed with return code {pg_dump_process.returncode}')
         info_file = self.output_dir / f'pixeltable-v{md_version:03d}-test-info.toml'
-        git_sha = subprocess.check_output(
+        git_sha = subprocess.check_output(('git', 'rev-parse', 'HEAD')).decode('ascii').strip()
         user = os.environ.get('USER', os.environ.get('USERNAME'))
         info_dict = {'pixeltable-dump': {
             'metadata-version': md_version,
@@ -177,8 +182,8 @@ class Dumper:
         assert t.base_table_image_rot.col in project.stored_proxies

     def __add_expr_columns(self, t: pxt.Table, col_prefix: str, include_expensive_functions=False) -> None:
-        def add_column(col_name: str, col_expr: Any) -> None:
-            t.add_column(**{f'{col_prefix}_{col_name}': col_expr})
+        def add_column(col_name: str, col_expr: Any, stored: bool = True) -> None:
+            t.add_column(**{f'{col_prefix}_{col_name}': col_expr}, stored=stored)

         # arithmetic_expr
         add_column('plus', t.c2 + 6)
@@ -187,9 +192,6 @@ class Dumper:
         add_column('div', t.c3 / 1.7)
         add_column('mod', t.c2 % 11)

-        # array_slice
-        add_column('array_slice_1', t.c6[5])
-
         # column_property_ref
         add_column('fileurl', t.c8.fileurl)
         add_column('localpath', t.c8.localpath)
@@ -217,7 +219,7 @@ class Dumper:

         # image_member_access
         add_column('image_mode', t.c8.mode)
-        add_column('image_rot', t.c8.rotate(180))
+        add_column('image_rot', t.c8.rotate(180), stored=False)

         # in_predicate
         add_column('isin_1', t.c1.isin(['test string 1', 'test string 2', 'test string 3']))
@@ -237,6 +239,10 @@ class Dumper:
         # json_mapper and json_path
         add_column('json_mapper', t.c6[3])
         add_column('json_path', t.c6.f1)
+        add_column('json_path_nested', t.c6.f6.f7)
+        add_column('json_path_star', t.c6.f5['*'])
+        add_column('json_path_idx', t.c6.f5[3])
+        add_column('json_path_slice', t.c6.f5[1:3:2])

         # literal
         add_column('str_const', 'str')
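Note: the dump logic above boils down to piping pg_dump's custom-format output through gzip and then verifying that pg_dump itself exited cleanly (check=True only covers the gzip process). A standalone sketch with placeholder paths and URL:

import subprocess

pg_dump_binary = '/path/to/pginstall/bin/pg_dump'   # placeholder path
db_url = 'postgresql://localhost:5432/pixeltable'   # placeholder: raw URL, no SQLAlchemy driver qualifier
with open('pixeltable-test.dump.gz', 'wb') as dump:
    pg_dump_process = subprocess.Popen(
        (pg_dump_binary, db_url, '-U', 'postgres', '-Fc'),  # custom-format dump
        stdout=subprocess.PIPE,
    )
    # gzip consumes pg_dump's stdout and writes the compressed dump file
    subprocess.run(('gzip', '-9'), stdin=pg_dump_process.stdout, stdout=dump, check=True)
if pg_dump_process.wait() != 0:  # wait() (rather than poll()) guarantees the exit code is set
    raise RuntimeError(f'pg_dump failed with return code {pg_dump_process.returncode}')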