pixeltable 0.2.15__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/column.py +3 -0
- pixeltable/catalog/dir.py +1 -1
- pixeltable/catalog/globals.py +15 -6
- pixeltable/catalog/insertable_table.py +23 -8
- pixeltable/catalog/named_function.py +1 -1
- pixeltable/catalog/path_dict.py +4 -4
- pixeltable/catalog/schema_object.py +30 -18
- pixeltable/catalog/table.py +84 -99
- pixeltable/catalog/table_version.py +35 -24
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +15 -8
- pixeltable/dataframe.py +56 -56
- pixeltable/env.py +6 -5
- pixeltable/exec/__init__.py +3 -3
- pixeltable/exec/aggregation_node.py +3 -3
- pixeltable/exec/expr_eval_node.py +3 -3
- pixeltable/exec/in_memory_data_node.py +4 -4
- pixeltable/exec/sql_node.py +4 -1
- pixeltable/exprs/array_slice.py +3 -4
- pixeltable/exprs/column_ref.py +20 -4
- pixeltable/exprs/comparison.py +11 -6
- pixeltable/exprs/data_row.py +3 -0
- pixeltable/exprs/expr.py +51 -23
- pixeltable/exprs/function_call.py +8 -1
- pixeltable/exprs/inline_array.py +2 -2
- pixeltable/exprs/json_path.py +36 -20
- pixeltable/exprs/row_builder.py +4 -4
- pixeltable/exprs/rowid_ref.py +1 -1
- pixeltable/functions/__init__.py +1 -2
- pixeltable/functions/audio.py +32 -0
- pixeltable/functions/huggingface.py +4 -4
- pixeltable/functions/image.py +1 -1
- pixeltable/functions/video.py +5 -1
- pixeltable/functions/vision.py +2 -6
- pixeltable/globals.py +57 -28
- pixeltable/io/external_store.py +4 -4
- pixeltable/io/globals.py +12 -13
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/pandas.py +27 -12
- pixeltable/io/parquet.py +14 -14
- pixeltable/iterators/document.py +7 -7
- pixeltable/plan.py +58 -29
- pixeltable/store.py +32 -31
- pixeltable/tool/create_test_db_dump.py +12 -6
- pixeltable/type_system.py +89 -97
- pixeltable/utils/pytorch.py +12 -10
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/METADATA +10 -10
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/RECORD +52 -51
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/entry_points.txt +0 -0
pixeltable/plan.py
CHANGED
@@ -1,8 +1,9 @@
-from typing import ...
+from typing import Any, Iterable, List, Optional, Sequence, Set, Tuple
from uuid import UUID

import sqlalchemy as sql

+import pixeltable as pxt
import pixeltable.exec as exec
import pixeltable.func as func
from pixeltable import catalog
@@ -39,7 +40,7 @@ class Analyzer:
    """Class to perform semantic analysis of a query and to store the analysis state"""

    def __init__(
-        self, tbl: catalog.TableVersionPath, select_list: ...
+        self, tbl: catalog.TableVersionPath, select_list: Sequence[exprs.Expr],
        where_clause: Optional[exprs.Expr] = None, group_by_clause: Optional[List[exprs.Expr]] = None,
        order_by_clause: Optional[List[Tuple[exprs.Expr, bool]]] = None):
        if group_by_clause is None:
@@ -68,7 +69,7 @@ class Analyzer:
        # all exprs that are evaluated in Python; not executable
        self.all_exprs = self.select_list.copy()
        self.all_exprs.extend(self.group_by_clause)
-        self.all_exprs.extend(...
+        self.all_exprs.extend(e for e, _ in self.order_by_clause)
        if self.filter is not None:
            self.all_exprs.append(self.filter)
        self.sql_exprs = list(exprs.Expr.list_subexprs(
@@ -84,7 +85,7 @@ class Analyzer:

    def _analyze_agg(self) -> None:
        """Check semantic correctness of aggregation and fill in agg-specific fields of Analyzer"""
-        self.agg_fn_calls = [e for e in self.all_exprs if _is_agg_fn_call(e)]
+        self.agg_fn_calls = [e for e in self.all_exprs if isinstance(e, exprs.FunctionCall) and _is_agg_fn_call(e)]
        if len(self.agg_fn_calls) == 0:
            # nothing to do
            return
@@ -98,7 +99,7 @@ class Analyzer:

        # check that filter doesn't contain aggregates
        if self.filter is not None:
-            agg_fn_calls = [e for e in self.filter.subexprs(filter=lambda e: _is_agg_fn_call(e))]
+            agg_fn_calls = [e for e in self.filter.subexprs(expr_class=exprs.FunctionCall, filter=lambda e: _is_agg_fn_call(e))]
            if len(agg_fn_calls) > 0:
                raise excs.Error(f'Filter cannot contain aggregate functions: {self.filter}')

@@ -111,7 +112,7 @@ class Analyzer:
                raise excs.Error(f'Grouping expression contains aggregate function: {e}')

        # check that agg fn calls don't have contradicting ordering requirements
-        order_by: ...
+        order_by: list[exprs.Expr] = []
        order_by_origin: Optional[exprs.Expr] = None  # the expr that determines the ordering
        for agg_fn_call in self.agg_fn_calls:
            fn_call_order_by = agg_fn_call.get_agg_order_by()
@@ -185,7 +186,7 @@ class Planner:
    def create_count_stmt(
        cls, tbl: catalog.TableVersionPath, where_clause: Optional[exprs.Expr] = None
    ) -> sql.Select:
-        stmt = sql.select(sql.func.count(...
+        stmt = sql.select(sql.func.count())
        refd_tbl_ids: Set[UUID] = set()
        if where_clause is not None:
            analyzer = cls.analyze(tbl, where_clause)
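As an aside, `sql.func.count()` with no argument compiles to `COUNT(*)` in SQLAlchemy; the toy table below exists only to make the snippet self-contained and has nothing to do with pixeltable's schema.

import sqlalchemy as sql

md = sql.MetaData()
t = sql.Table('example', md, sql.Column('id', sql.Integer, primary_key=True))
# count() without arguments renders as COUNT(*)
stmt = sql.select(sql.func.count()).select_from(t)
print(stmt)  # SELECT count(*) AS count_1 FROM example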
@@ -200,7 +201,7 @@ class Planner:

    @classmethod
    def create_insert_plan(
-        ...
+        cls, tbl: catalog.TableVersion, rows: list[dict[str, Any]], ignore_errors: bool
    ) -> exec.ExecNode:
        """Creates a plan for TableVersion.insert()"""
        assert not tbl.is_view()
@@ -214,12 +215,12 @@ class Planner:
        stored_col_info = row_builder.output_slot_idxs()
        stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
        input_col_info = [info for info in stored_col_info if not info.col.is_computed]
-        plan = exec.InMemoryDataNode(tbl, rows, row_builder, tbl.next_rowid)
+        plan: exec.ExecNode = exec.InMemoryDataNode(tbl, rows, row_builder, tbl.next_rowid)

        media_input_cols = [info for info in input_col_info if info.col.col_type.is_media_type()]
        if len(media_input_cols) > 0:
            # prefetch external files for all input column refs for validation
-            plan = exec.CachePrefetchNode(tbl.id, media_input_cols, plan)
+            plan = exec.CachePrefetchNode(tbl.id, media_input_cols, input=plan)
            plan = exec.MediaValidationNode(row_builder, media_input_cols, input=plan)

        computed_exprs = [e for e in row_builder.default_eval_ctx.target_exprs if not isinstance(e, exprs.ColumnRef)]
@@ -234,6 +235,34 @@ class Planner:
                ignore_errors=ignore_errors))
        return plan

+    @classmethod
+    def create_df_insert_plan(
+        cls,
+        tbl: catalog.TableVersion,
+        df: 'pxt.DataFrame',
+        ignore_errors: bool
+    ) -> exec.ExecNode:
+        assert not tbl.is_view()
+        plan = df._create_query_plan()  # ExecNode constructed by the DataFrame
+
+        # Modify the plan RowBuilder to register the output columns
+        for col_name, expr in zip(df.schema.keys(), df._select_list_exprs):
+            assert col_name in tbl.cols_by_name
+            col = tbl.cols_by_name[col_name]
+            plan.row_builder.add_table_column(col, expr.slot_idx)
+
+        stored_col_info = plan.row_builder.output_slot_idxs()
+        stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
+        plan.set_stored_img_cols(stored_img_col_info)
+
+        plan.set_ctx(
+            exec.ExecContext(
+                plan.row_builder, batch_size=0, show_pbar=True, num_computed_exprs=0,
+                ignore_errors=ignore_errors))
+        plan.ctx.num_rows = 0  # Unknown
+
+        return plan
+
    @classmethod
    def create_update_plan(
        cls, tbl: catalog.TableVersionPath,
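The new `create_df_insert_plan` wires a DataFrame's own query plan into the target table's columns, which suggests that a table can now be populated directly from a query result. A hypothetical usage sketch (the table names, and the assumption that `insert()` accepts a DataFrame, are not taken from this diff):

import pixeltable as pxt

src = pxt.get_table('films')         # hypothetical source table
dst = pxt.get_table('films_subset')  # hypothetical target with a matching schema
# Assumption: InsertableTable.insert() routes a DataFrame argument through
# Planner.create_df_insert_plan() instead of the row-based create_insert_plan().
dst.insert(src.where(src.year >= 2020).select(title=src.title, year=src.year))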
@@ -258,7 +287,7 @@ class Planner:
        target = tbl.tbl_version  # the one we need to update
        updated_cols = list(update_targets.keys())
        if len(recompute_targets) > 0:
-            recomputed_cols = recompute_targets
+            recomputed_cols = set(recompute_targets)
        else:
            recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else set()
        # regardless of cascade, we need to update all indices on any updated column
@@ -270,13 +299,13 @@ class Planner:
        copied_cols = [
            col for col in target.cols if col.is_stored and not col in updated_cols and not col in recomputed_base_cols
        ]
-        select_list = [exprs.ColumnRef(col) for col in copied_cols]
+        select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
        select_list.extend(update_targets.values())

        recomputed_exprs = \
            [c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols]
        # recomputed cols reference the new values of the updated cols
-        spec = {exprs.ColumnRef(col): e for col, e in update_targets.items()}
+        spec: dict[exprs.Expr, exprs.Expr] = {exprs.ColumnRef(col): e for col, e in update_targets.items()}
        exprs.Expr.list_substitute(recomputed_exprs, spec)
        select_list.extend(recomputed_exprs)
@@ -284,16 +313,17 @@ class Planner:
        plan = cls.create_query_plan(tbl, select_list, where_clause=where_clause, with_pk=True, ignore_errors=True)
        all_base_cols = copied_cols + updated_cols + list(recomputed_base_cols)  # same order as select_list
        # update row builder with column information
-        ...
+        for i, col in enumerate(all_base_cols):
+            plan.row_builder.add_table_column(col, select_list[i].slot_idx)
        recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
        return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols

    @classmethod
    def create_batch_update_plan(
-        ...
-        ...
-        ...
-    ) -> ...
+        cls, tbl: catalog.TableVersionPath,
+        batch: list[dict[catalog.Column, exprs.Expr]], rowids: list[tuple[int, ...]],
+        cascade: bool
+    ) -> tuple[exec.ExecNode, exec.RowUpdateNode, sql.ColumnElement[bool], list[catalog.Column], list[catalog.Column]]:
        """
        Returns:
        - root node of the plan to produce the updated rows
@@ -327,7 +357,7 @@ class Planner:
            col for col in target.cols if col.is_stored and not col in updated_cols and not col in recomputed_base_cols
        ]
        select_list = [exprs.ColumnRef(col) for col in copied_cols]
-        select_list.extend(...
+        select_list.extend(exprs.ColumnRef(col) for col in updated_cols)

        recomputed_exprs = \
            [c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols]
@@ -341,10 +371,11 @@ class Planner:
        analyzer = Analyzer(tbl, select_list)
        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], analyzer.sql_exprs)
        analyzer.finalize(row_builder)
-        ...
-        delete_where_clause = ...
+        sql_lookup_node = exec.SqlLookupNode(tbl, row_builder, analyzer.sql_exprs, sa_key_cols, key_vals)
+        delete_where_clause = sql_lookup_node.where_clause
        col_vals = [{col: row[col].val for col in updated_cols} for row in batch]
-        ...
+        row_update_node = exec.RowUpdateNode(tbl, key_vals, len(rowids) > 0, col_vals, row_builder, sql_lookup_node)
+        plan: exec.ExecNode = row_update_node
        if not cls._is_contained_in(analyzer.select_list, analyzer.sql_exprs):
            # we need an ExprEvalNode to evaluate the remaining output exprs
            plan = exec.ExprEvalNode(row_builder, analyzer.select_list, analyzer.sql_exprs, input=plan)
@@ -388,7 +419,7 @@ class Planner:
        # retrieve all stored cols and all target exprs
        recomputed_cols = set(recompute_targets.copy())
        copied_cols = [col for col in target.cols if col.is_stored and not col in recomputed_cols]
-        select_list = [exprs.ColumnRef(col) for col in copied_cols]
+        select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
        # resolve recomputed exprs to stored columns in the base
        recomputed_exprs = \
            [c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_cols) for c in recomputed_cols]
@@ -398,10 +429,8 @@ class Planner:
        plan = cls.create_query_plan(
            view, select_list, where_clause=target.predicate, with_pk=True, ignore_errors=True,
            exact_version_only=view.get_bases())
-        ...
+        for i, col in enumerate(copied_cols + list(recomputed_cols)):  # same order as select_list
            plan.row_builder.add_table_column(col, select_list[i].slot_idx)
-            for i, col in enumerate(copied_cols + list(recomputed_cols))  # same order as select_list
-        ]
        # TODO: avoid duplication with view_load_plan() logic (where does this belong?)
        stored_img_col_info = \
            [info for info in plan.row_builder.output_slot_idxs() if info.col.col_type.is_image_type()]
@@ -532,7 +561,7 @@ class Planner:
                return False
            tbl = e.col.tbl
            return tbl.is_component_view() and tbl.is_iterator_column(e.col) and not e.col.is_stored
-        unstored_iter_col_refs = list(exprs.Expr.list_subexprs(analyzer.all_exprs, filter=refs_unstored_iter_col))
+        unstored_iter_col_refs = list(exprs.Expr.list_subexprs(analyzer.all_exprs, expr_class=exprs.ColumnRef, filter=refs_unstored_iter_col))
        if len(unstored_iter_col_refs) > 0 and len(order_by_items) == 0:
            # we don't already have a user-requested ordering and we access unstored iterator columns:
            # order by the primary key of the component view, which minimizes the number of iterator instantiations
@@ -554,9 +583,9 @@ class Planner:
        return order_by_items

    @classmethod
-    def _is_contained_in(cls, l1: ...
+    def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
        """Returns True if l1 is contained in l2"""
-        s1, s2 = set(...
+        s1, s2 = set(e.id for e in l1), set(e.id for e in l2)
        return s1 <= s2

    @classmethod
pixeltable/store.py
CHANGED
@@ -7,7 +7,7 @@ import sys
import urllib.parse
import urllib.request
import warnings
-from typing import Optional, Dict, Any, List, Tuple, Set
+from typing import Optional, Dict, Any, List, Tuple, Set, Union

import sqlalchemy as sql
from tqdm import tqdm, TqdmWarning
@@ -15,10 +15,8 @@ from tqdm import tqdm, TqdmWarning
import pixeltable.catalog as catalog
import pixeltable.env as env
from pixeltable import exprs
-import pixeltable.exceptions as excs
from pixeltable.exec import ExecNode
from pixeltable.metadata import schema
-from pixeltable.type_system import StringType
from pixeltable.utils.media_store import MediaStore
from pixeltable.utils.sql import log_stmt, log_explain
@@ -34,10 +32,15 @@ class StoreBase:
    - v_max: version at which the row was deleted (or MAX_VERSION if it's still live)
    """

+    __INSERT_BATCH_SIZE = 1000
+
    def __init__(self, tbl_version: catalog.TableVersion):
        self.tbl_version = tbl_version
        self.sa_md = sql.MetaData()
        self.sa_tbl: Optional[sql.Table] = None
+        # We need to declare a `base` variable here, even though it's only defined for instances of `StoreView`,
+        # since it's referenced by various methods of `StoreBase`
+        self.base = None if tbl_version.base is None else tbl_version.base.store_tbl
        self.create_sa_tbl()

    def pk_columns(self) -> List[sql.Column]:
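The comment above reflects a general pattern: an attribute that only carries a value for subclasses still has to exist on every instance when base-class methods reference it. A generic sketch, unrelated to pixeltable's classes:

from typing import Optional

class Node:
    def __init__(self, base: Optional['Node'] = None):
        # always defined, so base-class methods never hit an AttributeError
        self.base = base

    def depth(self) -> int:
        return 0 if self.base is None else 1 + self.base.depth()

class DerivedNode(Node):
    def __init__(self, base: Node):
        super().__init__(base)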
@@ -49,7 +52,6 @@ class StoreBase:
    @abc.abstractmethod
    def _create_rowid_columns(self) -> List[sql.Column]:
        """Create and return rowid columns"""
-        pass

    @abc.abstractmethod
    def _create_system_columns(self) -> List[sql.Column]:
@@ -61,7 +63,6 @@ class StoreBase:
        self._pk_columns = [*rowid_cols, self.v_min_col]
        return [*rowid_cols, self.v_min_col, self.v_max_col]

-
    def create_sa_tbl(self) -> None:
        """Create self.sa_tbl from self.tbl_version."""
        system_cols = self._create_system_columns()
@@ -96,14 +97,12 @@ class StoreBase:
        self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)

    @abc.abstractmethod
-    def _rowid_join_predicate(self) -> sql....
+    def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
        """Return predicate for rowid joins to all bases"""
-        pass

    @abc.abstractmethod
    def _storage_name(self) -> str:
        """Return the name of the data store table"""
-        pass

    def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column, v_min: int) -> str:
        """Move tmp media file with given url to Env.media_dir and return new url, or given url if not a tmp_dir file"""
@@ -158,10 +157,12 @@ class StoreBase:

    def count(self, conn: Optional[sql.engine.Connection] = None) -> int:
        """Return the number of rows visible in self.tbl_version"""
-        stmt = ...
-            . ...
-            . ...
+        stmt = (
+            sql.select(sql.func.count('*'))  # type: ignore
+            .select_from(self.sa_tbl)
+            .where(self.v_min_col <= self.tbl_version.version)
            .where(self.v_max_col > self.tbl_version.version)
+        )
        if conn is None:
            with env.Env.get().engine.connect() as conn:
                result = conn.execute(stmt).scalar_one()
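The chained select above can be reproduced on a standalone SQLAlchemy table to inspect the generated SQL; the table and columns below are placeholders, not pixeltable's actual store schema.

import sqlalchemy as sql

md = sql.MetaData()
t = sql.Table('rows', md, sql.Column('v_min', sql.BigInteger), sql.Column('v_max', sql.BigInteger))
version = 7
stmt = (
    sql.select(sql.func.count('*'))
    .select_from(t)
    .where(t.c.v_min <= version)  # created at or before this version
    .where(t.c.v_max > version)   # not yet deleted at this version
)
print(stmt)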
@@ -191,12 +192,12 @@ class StoreBase:
        added_storage_cols = [col.store_name()]
        if col.records_errors:
            # we also need to create the errormsg and errortype storage cols
-            stmt = (f'ALTER TABLE {self._storage_name()} '
-                    ...
-            conn.execute(...
-            stmt = (f'ALTER TABLE {self._storage_name()} '
-                    ...
-            conn.execute(...
+            stmt = sql.text(f'ALTER TABLE {self._storage_name()} '
+                            f'ADD COLUMN {col.errormsg_store_name()} VARCHAR DEFAULT NULL')
+            conn.execute(stmt)
+            stmt = sql.text(f'ALTER TABLE {self._storage_name()} '
+                            f'ADD COLUMN {col.errortype_store_name()} VARCHAR DEFAULT NULL')
+            conn.execute(stmt)
            added_storage_cols.extend([col.errormsg_store_name(), col.errortype_store_name()])
        self.create_sa_tbl()
        _logger.info(f'Added columns {added_storage_cols} to storage table {self._storage_name()}')
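Wrapping the DDL in `sql.text()` matches SQLAlchemy 2.x, where `Connection.execute()` no longer accepts a bare string. A minimal, self-contained illustration using SQLite and a made-up column name (not pixeltable's store table):

import sqlalchemy as sql

engine = sql.create_engine('sqlite:///:memory:')
with engine.begin() as conn:
    conn.execute(sql.text('CREATE TABLE demo (id INTEGER)'))
    # passing the string directly (without sql.text) raises an error in SQLAlchemy 2.x
    conn.execute(sql.text('ALTER TABLE demo ADD COLUMN errormsg VARCHAR DEFAULT NULL'))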
@@ -299,7 +300,6 @@ class StoreBase:
        """
        assert v_min is not None
        exec_plan.ctx.set_conn(conn)
-        batch_size = 16  # TODO: is this a good batch size?
        # TODO: total?
        num_excs = 0
        num_rows = 0
@@ -311,10 +311,10 @@ class StoreBase:
        exec_plan.open()
        for row_batch in exec_plan:
            num_rows += len(row_batch)
-            for batch_start_idx in range(0, len(row_batch), ...
+            for batch_start_idx in range(0, len(row_batch), self.__INSERT_BATCH_SIZE):
                # compute batch of rows and convert them into table rows
                table_rows: List[Dict[str, Any]] = []
-                for row_idx in range(batch_start_idx, min(batch_start_idx + ...
+                for row_idx in range(batch_start_idx, min(batch_start_idx + self.__INSERT_BATCH_SIZE, len(row_batch))):
                    row = row_batch[row_idx]
                    table_row, num_row_exc = \
                        self._create_table_row(row, row_builder, media_cols, cols_with_excs, v_min=v_min)
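Stripped of the pixeltable specifics, the loop above is the usual fixed-size chunking pattern over a batch of rows:

INSERT_BATCH_SIZE = 1000        # mirrors the class constant introduced above
row_batch = list(range(2500))   # stand-in for a batch of computed rows
for batch_start_idx in range(0, len(row_batch), INSERT_BATCH_SIZE):
    chunk = row_batch[batch_start_idx:batch_start_idx + INSERT_BATCH_SIZE]
    # convert `chunk` into table rows and issue a single INSERT for it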
@@ -340,7 +340,7 @@ class StoreBase:
        finally:
            exec_plan.close()

-    def _versions_clause(self, versions: ...
+    def _versions_clause(self, versions: list[Optional[int]], match_on_vmin: bool) -> sql.ColumnElement[bool]:
        """Return filter for base versions"""
        v = versions[0]
        if v is None:
@@ -355,7 +355,7 @@ class StoreBase:

    def delete_rows(
        self, current_version: int, base_versions: List[Optional[int]], match_on_vmin: bool,
-        where_clause: Optional[sql....
+        where_clause: Optional[sql.ColumnElement[bool]], conn: sql.engine.Connection) -> int:
        """Mark rows as deleted that are live and were created prior to current_version.
        Also: populate the undo columns
        Args:
@@ -375,17 +375,19 @@ class StoreBase:
        rowid_join_clause = self._rowid_join_predicate()
        base_versions_clause = sql.true() if len(base_versions) == 0 \
            else self.base._versions_clause(base_versions, match_on_vmin)
-        set_clause = {self.v_max_col: current_version}
+        set_clause: dict[sql.Column, Union[int, sql.Column]] = {self.v_max_col: current_version}
        for index_info in self.tbl_version.idxs_by_name.values():
            # copy value column to undo column
            set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
            # set value column to NULL
            set_clause[index_info.val_col.sa_col] = None
-        stmt = ...
-            . ...
-            . ...
-            .where(...
+        stmt = (
+            sql.update(self.sa_tbl)
+            .values(set_clause)
+            .where(where_clause)
+            .where(rowid_join_clause)
            .where(base_versions_clause)
+        )
        log_explain(_logger, stmt, conn)
        status = conn.execute(stmt)
        return status.rowcount
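The statement above uses SQLAlchemy's chained `update().values().where()` form, in which successive `.where()` calls are ANDed together. A minimal stand-alone version on a toy table (not the store schema):

import sqlalchemy as sql

md = sql.MetaData()
t = sql.Table('rows', md, sql.Column('id', sql.Integer), sql.Column('v_max', sql.BigInteger))
current_version = 42
stmt = (
    sql.update(t)
    .values({t.c.v_max: current_version})
    .where(t.c.v_max > current_version)  # additional .where() calls would be ANDed in
)
print(stmt)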
@@ -403,14 +405,13 @@ class StoreTable(StoreBase):
    def _storage_name(self) -> str:
        return f'tbl_{self.tbl_version.id.hex}'

-    def _rowid_join_predicate(self) -> sql....
+    def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
        return sql.true()


class StoreView(StoreBase):
    def __init__(self, catalog_view: catalog.TableVersion):
        assert catalog_view.is_view()
-        self.base = catalog_view.base.store_tbl
        super().__init__(catalog_view)

    def _create_rowid_columns(self) -> List[sql.Column]:
@@ -421,7 +422,7 @@ class StoreView(StoreBase):
    def _storage_name(self) -> str:
        return f'view_{self.tbl_version.id.hex}'

-    def _rowid_join_predicate(self) -> sql....
+    def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
        return sql.and_(
            self.base._rowid_join_predicate(),
            *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), self.base.rowid_columns())])
@@ -448,7 +449,7 @@ class StoreComponentView(StoreView):
        # we need to fix up the 'pos' column in TableVersion
        self.tbl_version.cols_by_name['pos'].sa_col = self.pos_col

-    def _rowid_join_predicate(self) -> sql....
+    def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
        return sql.and_(
            self.base._rowid_join_predicate(),
            *[c1 == c2 for c1, c2 in zip(self.rowid_columns()[:-1], self.base.rowid_columns())])
pixeltable/tool/create_test_db_dump.py
CHANGED
@@ -44,19 +44,24 @@ class Dumper:
        pg_package_dir = os.path.dirname(pixeltable_pgserver.__file__)
        pg_dump_binary = f'{pg_package_dir}/pginstall/bin/pg_dump'
        _logger.info(f'Using pg_dump binary at: {pg_dump_binary}')
+        # We need the raw DB URL, without a driver qualifier. (The driver qualifier is needed by
+        # SQLAlchemy, but command-line Postgres won't know how to interpret it.)
+        db_url = Env.get()._db_server.get_uri(Env.get()._db_name)
        with open(dump_file, 'wb') as dump:
            pg_dump_process = subprocess.Popen(
-                ...
+                (pg_dump_binary, db_url, '-U', 'postgres', '-Fc'),
                stdout=subprocess.PIPE
            )
            subprocess.run(
-                ...
+                ('gzip', '-9'),
                stdin=pg_dump_process.stdout,
                stdout=dump,
                check=True
            )
+        if pg_dump_process.poll() != 0:
+            raise RuntimeError(f'pg_dump failed with return code {pg_dump_process.returncode}')
        info_file = self.output_dir / f'pixeltable-v{md_version:03d}-test-info.toml'
-        git_sha = subprocess.check_output(...
+        git_sha = subprocess.check_output(('git', 'rev-parse', 'HEAD')).decode('ascii').strip()
        user = os.environ.get('USER', os.environ.get('USERNAME'))
        info_dict = {'pixeltable-dump': {
            'metadata-version': md_version,
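The dump logic pipes `pg_dump` output through `gzip` and now also verifies that the producer exited cleanly. A generic sketch of that pipe-and-check pattern, with placeholder commands:

import subprocess

with open('out.gz', 'wb') as dump:
    producer = subprocess.Popen(('echo', 'hello'), stdout=subprocess.PIPE)
    subprocess.run(('gzip', '-9'), stdin=producer.stdout, stdout=dump, check=True)
    # wait() (rather than poll()) guarantees the return code is available
    if producer.wait() != 0:
        raise RuntimeError(f'producer failed with return code {producer.returncode}')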
@@ -187,9 +192,6 @@ class Dumper:
        add_column('div', t.c3 / 1.7)
        add_column('mod', t.c2 % 11)

-        # array_slice
-        add_column('array_slice_1', t.c6[5])
-
        # column_property_ref
        add_column('fileurl', t.c8.fileurl)
        add_column('localpath', t.c8.localpath)
@@ -237,6 +239,10 @@ class Dumper:
        # json_mapper and json_path
        add_column('json_mapper', t.c6[3])
        add_column('json_path', t.c6.f1)
+        add_column('json_path_nested', t.c6.f6.f7)
+        add_column('json_path_star', t.c6.f5['*'])
+        add_column('json_path_idx', t.c6.f5[3])
+        add_column('json_path_slice', t.c6.f5[1:3:2])

        # literal
        add_column('str_const', 'str')