pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +83 -19
- pixeltable/_query.py +1444 -0
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +7 -4
- pixeltable/catalog/catalog.py +2394 -119
- pixeltable/catalog/column.py +225 -104
- pixeltable/catalog/dir.py +38 -9
- pixeltable/catalog/globals.py +53 -34
- pixeltable/catalog/insertable_table.py +265 -115
- pixeltable/catalog/path.py +80 -17
- pixeltable/catalog/schema_object.py +28 -43
- pixeltable/catalog/table.py +1270 -677
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +1270 -751
- pixeltable/catalog/table_version_handle.py +109 -0
- pixeltable/catalog/table_version_path.py +137 -42
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +251 -134
- pixeltable/config.py +215 -0
- pixeltable/env.py +736 -285
- pixeltable/exceptions.py +26 -2
- pixeltable/exec/__init__.py +7 -2
- pixeltable/exec/aggregation_node.py +39 -21
- pixeltable/exec/cache_prefetch_node.py +87 -109
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +25 -28
- pixeltable/exec/data_row_batch.py +11 -46
- pixeltable/exec/exec_context.py +26 -11
- pixeltable/exec/exec_node.py +35 -27
- pixeltable/exec/expr_eval/__init__.py +3 -0
- pixeltable/exec/expr_eval/evaluators.py +365 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
- pixeltable/exec/expr_eval/globals.py +200 -0
- pixeltable/exec/expr_eval/row_buffer.py +74 -0
- pixeltable/exec/expr_eval/schedulers.py +413 -0
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +35 -27
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +44 -29
- pixeltable/exec/sql_node.py +414 -115
- pixeltable/exprs/__init__.py +8 -5
- pixeltable/exprs/arithmetic_expr.py +79 -45
- pixeltable/exprs/array_slice.py +5 -5
- pixeltable/exprs/column_property_ref.py +40 -26
- pixeltable/exprs/column_ref.py +254 -61
- pixeltable/exprs/comparison.py +14 -9
- pixeltable/exprs/compound_predicate.py +9 -10
- pixeltable/exprs/data_row.py +213 -72
- pixeltable/exprs/expr.py +270 -104
- pixeltable/exprs/expr_dict.py +6 -5
- pixeltable/exprs/expr_set.py +20 -11
- pixeltable/exprs/function_call.py +383 -284
- pixeltable/exprs/globals.py +18 -5
- pixeltable/exprs/in_predicate.py +7 -7
- pixeltable/exprs/inline_expr.py +37 -37
- pixeltable/exprs/is_null.py +8 -4
- pixeltable/exprs/json_mapper.py +120 -54
- pixeltable/exprs/json_path.py +90 -60
- pixeltable/exprs/literal.py +61 -16
- pixeltable/exprs/method_ref.py +7 -6
- pixeltable/exprs/object_ref.py +19 -8
- pixeltable/exprs/row_builder.py +238 -75
- pixeltable/exprs/rowid_ref.py +53 -15
- pixeltable/exprs/similarity_expr.py +65 -50
- pixeltable/exprs/sql_element_cache.py +5 -5
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/exprs/type_cast.py +25 -13
- pixeltable/exprs/variable.py +2 -2
- pixeltable/func/__init__.py +9 -5
- pixeltable/func/aggregate_function.py +197 -92
- pixeltable/func/callable_function.py +119 -35
- pixeltable/func/expr_template_function.py +101 -48
- pixeltable/func/function.py +375 -62
- pixeltable/func/function_registry.py +20 -19
- pixeltable/func/globals.py +6 -5
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +151 -35
- pixeltable/func/signature.py +178 -49
- pixeltable/func/tools.py +164 -0
- pixeltable/func/udf.py +176 -53
- pixeltable/functions/__init__.py +44 -4
- pixeltable/functions/anthropic.py +226 -47
- pixeltable/functions/audio.py +148 -11
- pixeltable/functions/bedrock.py +137 -0
- pixeltable/functions/date.py +188 -0
- pixeltable/functions/deepseek.py +113 -0
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +72 -20
- pixeltable/functions/gemini.py +249 -0
- pixeltable/functions/globals.py +208 -53
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1088 -95
- pixeltable/functions/image.py +155 -84
- pixeltable/functions/json.py +8 -11
- pixeltable/functions/llama_cpp.py +31 -19
- pixeltable/functions/math.py +169 -0
- pixeltable/functions/mistralai.py +50 -75
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +29 -36
- pixeltable/functions/openai.py +548 -160
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +15 -14
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +310 -85
- pixeltable/functions/timestamp.py +37 -19
- pixeltable/functions/together.py +77 -120
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +7 -2
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1528 -117
- pixeltable/functions/vision.py +26 -26
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +19 -10
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/functions/yolox.py +112 -0
- pixeltable/globals.py +716 -236
- pixeltable/index/__init__.py +3 -1
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +32 -22
- pixeltable/index/embedding_index.py +155 -92
- pixeltable/io/__init__.py +12 -7
- pixeltable/io/datarows.py +140 -0
- pixeltable/io/external_store.py +83 -125
- pixeltable/io/fiftyone.py +24 -33
- pixeltable/io/globals.py +47 -182
- pixeltable/io/hf_datasets.py +96 -127
- pixeltable/io/label_studio.py +171 -156
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +136 -115
- pixeltable/io/parquet.py +40 -153
- pixeltable/io/table_data_conduit.py +702 -0
- pixeltable/io/utils.py +100 -0
- pixeltable/iterators/__init__.py +8 -4
- pixeltable/iterators/audio.py +207 -0
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +144 -87
- pixeltable/iterators/image.py +17 -38
- pixeltable/iterators/string.py +15 -12
- pixeltable/iterators/video.py +523 -127
- pixeltable/metadata/__init__.py +33 -8
- pixeltable/metadata/converters/convert_10.py +2 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_15.py +15 -11
- pixeltable/metadata/converters/convert_16.py +4 -5
- pixeltable/metadata/converters/convert_17.py +4 -5
- pixeltable/metadata/converters/convert_18.py +4 -6
- pixeltable/metadata/converters/convert_19.py +6 -9
- pixeltable/metadata/converters/convert_20.py +3 -6
- pixeltable/metadata/converters/convert_21.py +6 -8
- pixeltable/metadata/converters/convert_22.py +3 -2
- pixeltable/metadata/converters/convert_23.py +33 -0
- pixeltable/metadata/converters/convert_24.py +55 -0
- pixeltable/metadata/converters/convert_25.py +19 -0
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/convert_27.py +29 -0
- pixeltable/metadata/converters/convert_28.py +13 -0
- pixeltable/metadata/converters/convert_29.py +110 -0
- pixeltable/metadata/converters/convert_30.py +63 -0
- pixeltable/metadata/converters/convert_31.py +11 -0
- pixeltable/metadata/converters/convert_32.py +15 -0
- pixeltable/metadata/converters/convert_33.py +17 -0
- pixeltable/metadata/converters/convert_34.py +21 -0
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +44 -18
- pixeltable/metadata/notes.py +21 -0
- pixeltable/metadata/schema.py +185 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +616 -225
- pixeltable/share/__init__.py +3 -0
- pixeltable/share/packager.py +797 -0
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +349 -0
- pixeltable/store.py +398 -232
- pixeltable/type_system.py +730 -267
- pixeltable/utils/__init__.py +40 -0
- pixeltable/utils/arrow.py +201 -29
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +26 -27
- pixeltable/utils/code.py +4 -4
- pixeltable/utils/console_output.py +46 -0
- pixeltable/utils/coroutine.py +24 -0
- pixeltable/utils/dbms.py +92 -0
- pixeltable/utils/description_helper.py +11 -12
- pixeltable/utils/documents.py +60 -61
- pixeltable/utils/exception_handler.py +36 -0
- pixeltable/utils/filecache.py +38 -22
- pixeltable/utils/formatter.py +88 -51
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +14 -13
- pixeltable/utils/iceberg.py +13 -0
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +20 -20
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +32 -5
- pixeltable/utils/system.py +30 -0
- pixeltable/utils/transactional_directory.py +4 -3
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -36
- pixeltable/catalog/path_dict.py +0 -141
- pixeltable/dataframe.py +0 -894
- pixeltable/exec/expr_eval_node.py +0 -232
- pixeltable/ext/__init__.py +0 -14
- pixeltable/ext/functions/__init__.py +0 -8
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/ext/functions/yolox.py +0 -157
- pixeltable/tool/create_test_db_dump.py +0 -311
- pixeltable/tool/create_test_video.py +0 -81
- pixeltable/tool/doc_plugins/griffe.py +0 -50
- pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
- pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
- pixeltable/tool/embed_udf.py +0 -9
- pixeltable/tool/mypy_plugin.py +0 -55
- pixeltable/utils/media_store.py +0 -76
- pixeltable/utils/s3.py +0 -16
- pixeltable-0.2.26.dist-info/METADATA +0 -400
- pixeltable-0.2.26.dist-info/RECORD +0 -156
- pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/store.py
CHANGED
(… marks text that the diff viewer elided)

@@ -2,23 +2,23 @@ from __future__ import annotations
 
 import abc
 import logging
-import os
 import sys
-import urllib.parse
-import urllib.request
+import time
 import warnings
-from typing import Any, Literal, Optional
+from typing import Any, Iterable, Iterator
+from uuid import UUID
 
+import more_itertools
+import psycopg
 import sqlalchemy as sql
 from tqdm import TqdmWarning, tqdm
 
-import pixeltable.catalog as catalog
-import pixeltable.env as env
-import pixeltable.exceptions as excs
-from pixeltable import exprs
+from pixeltable import catalog, exceptions as excs
+from pixeltable.catalog.update_status import RowCountStats
+from pixeltable.env import Env
 from pixeltable.exec import ExecNode
 from pixeltable.metadata import schema
-from pixeltable.utils.media_store import MediaStore
+from pixeltable.utils.exception_handler import run_cleanup
 from pixeltable.utils.sql import log_explain, log_stmt
 
 _logger = logging.getLogger('pixeltable')
@@ -32,24 +32,49 @@ class StoreBase:
     - v_min: version at which the row was created
     - v_max: version at which the row was deleted (or MAX_VERSION if it's still live)
     """
-    tbl_version: catalog.TableVersion
+
+    tbl_version: catalog.TableVersionHandle
     sa_md: sql.MetaData
-    sa_tbl: Optional[sql.Table]
+    sa_tbl: sql.Table | None
     _pk_cols: list[sql.Column]
     v_min_col: sql.Column
     v_max_col: sql.Column
-    base: Optional[StoreBase]
 
-
+    # We need to declare a `base` variable here, even though it's only defined for instances of `StoreView`,
+    # since it's referenced by various methods of `StoreBase`
+    _base: StoreBase | None
+
+    # In my cursory experiments this was the optimal batch size: it was an improvement over 5_000 and there was no real
+    # benefit to going higher.
+    # TODO: Perform more rigorous experiments with different table structures and OS environments to refine this.
+    __INSERT_BATCH_SIZE = 10_000
 
     def __init__(self, tbl_version: catalog.TableVersion):
-        self.tbl_version = tbl_version
+        self.tbl_version = tbl_version.handle
         self.sa_md = sql.MetaData()
         self.sa_tbl = None
-        …
+        self._pk_cols = []
+
+        # we initialize _base lazily, because the base may not exist anymore at this point
+        # (but we might still need sa_table to access our store table); do this before create_sa_tbl()
+        self._base = None
+
+        # we're passing in tbl_version to avoid a circular call to TableVersionHandle.get()
+        self.create_sa_tbl(tbl_version)
+
+    @property
+    def base(self) -> StoreBase | None:
+        if self._base is None:
+            tv = self.tbl_version.get()
+            self._base = tv.base.get().store_tbl if tv.base is not None else None
+        return self._base
+
+    @classmethod
+    def storage_name(cls, tbl_id: UUID, is_view: bool) -> str:
+        return f'{"view" if is_view else "tbl"}_{tbl_id.hex}'
+
+    def system_columns(self) -> list[sql.Column]:
+        return [*self._pk_cols, self.v_max_col]
 
     def pk_columns(self) -> list[sql.Column]:
         return self._pk_cols
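Note on the hunk above: `base` changes from an attribute computed in `__init__` to a property cached in `_base`, because the base's `TableVersion` may already be gone when this store object is constructed (e.g., when a view and its base are dropped in the same transaction). A minimal sketch of the lazy-resolution pattern in plain Python; `resolve()` is a hypothetical stand-in for `TableVersionHandle.get()`, and the class is illustrative, not Pixeltable's API:

from __future__ import annotations


class LazyBase:
    # Sketch only: mirrors the _base/base pattern from the hunk above.
    def __init__(self, handle) -> None:
        self._handle = handle
        self._base: LazyBase | None = None  # resolved on first access, not in __init__

    @property
    def base(self) -> LazyBase | None:
        if self._base is None:
            tv = self._handle.resolve()  # hypothetical; may only succeed once the catalog is loaded
            self._base = tv.base.store_tbl if tv.base is not None else None
        return self._base

Caching in `_base` means resolution happens at most once per object; a base-less table keeps `_base` as None and simply re-checks on each access, which mirrors the behavior of the code above.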
@@ -63,25 +88,44 @@ class StoreBase:
 
     def _create_system_columns(self) -> list[sql.Column]:
         """Create and return system columns"""
-        rowid_cols = self._create_rowid_columns()
+        rowid_cols: list[sql.Column]
+        if self._store_tbl_exists():
+            # derive our rowid Columns from the existing table, without having to access self.base.store_tbl:
+            # self.base may not exist anymore (both this table and our base got dropped in the same transaction, and
+            # the base was finalized before this table)
+            with Env.get().begin_xact(for_write=False) as conn:
+                q = (
+                    f'SELECT column_name FROM information_schema.columns WHERE table_name = {self._storage_name()!r} '
+                    'ORDER BY ordinal_position'
+                )
+                col_names = [row[0] for row in conn.execute(sql.text(q)).fetchall()]
+                num_rowid_cols = col_names.index('v_min')
+                rowid_cols = [
+                    sql.Column(col_name, sql.BigInteger, nullable=False) for col_name in col_names[:num_rowid_cols]
+                ]
+        else:
+            rowid_cols = self._create_rowid_columns()
         self.v_min_col = sql.Column('v_min', sql.BigInteger, nullable=False)
-        self.v_max_col = …
-        …
+        self.v_max_col = sql.Column(
+            'v_max', sql.BigInteger, nullable=False, server_default=str(schema.Table.MAX_VERSION)
+        )
         self._pk_cols = [*rowid_cols, self.v_min_col]
         return [*rowid_cols, self.v_min_col, self.v_max_col]
 
-    def create_sa_tbl(self) -> None:
+    def create_sa_tbl(self, tbl_version: catalog.TableVersion | None = None) -> None:
         """Create self.sa_tbl from self.tbl_version."""
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
         system_cols = self._create_system_columns()
         all_cols = system_cols.copy()
-        for col in [c for c in self.tbl_version.cols if c.is_stored]:
+        # we captured all columns, including dropped ones: they're still part of the physical table
+        for col in [c for c in tbl_version.cols if c.is_stored]:
             # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
             # to the last sql.Table version we created and cannot be reused
             col.create_sa_cols()
             all_cols.append(col.sa_col)
-            if col.records_errors:
-                all_cols.append(col.sa_errormsg_col)
-                all_cols.append(col.sa_errortype_col)
+            if col.stores_cellmd:
+                all_cols.append(col.sa_cellmd_col)
 
         if self.sa_tbl is not None:
             # if we're called in response to a schema change, we need to remove the old table first
@@ -92,16 +136,17 @@ class StoreBase:
         # - base x view joins can be executed as merge joins
         # - speeds up ORDER BY rowid DESC
         # - allows filtering for a particular table version in index scan
-        idx_name = f'sys_cols_idx_{self.tbl_version.id.hex}'
+        idx_name = f'sys_cols_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, *system_cols))
 
         # v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
-        idx_name = f'vmin_idx_{self.tbl_version.id.hex}'
-        idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using=…))
-        idx_name = f'vmax_idx_{self.tbl_version.id.hex}'
-        idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=…))
+        idx_name = f'vmin_idx_{tbl_version.id.hex}'
+        idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using=Env.get().dbms.version_index_type))
+        idx_name = f'vmax_idx_{tbl_version.id.hex}'
+        idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=Env.get().dbms.version_index_type))
 
         self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)
+        # _logger.debug(f'created sa tbl for {tbl_version.id!s} (sa_tbl={id(self.sa_tbl):x}, tv={id(tbl_version):x})')
 
     @abc.abstractmethod
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
@@ -111,116 +156,169 @@ class StoreBase:
     def _storage_name(self) -> str:
         """Return the name of the data store table"""
 
-    def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column, v_min: int) -> Optional[str]:
-        """Move tmp media file with given url to Env.media_dir and return new url, or given url if not a tmp_dir file"""
-        pxt_tmp_dir = str(env.Env.get().tmp_dir)
-        if file_url is None:
-            return None
-        parsed = urllib.parse.urlparse(file_url)
-        # We should never be passed a local file path here. The "len > 1" ensures that Windows
-        # file paths aren't mistaken for URLs with a single-character scheme.
-        assert len(parsed.scheme) > 1
-        if parsed.scheme != 'file':
-            # remote url
-            return file_url
-        file_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
-        if not file_path.startswith(pxt_tmp_dir):
-            # not a tmp file
-            return file_url
-        _, ext = os.path.splitext(file_path)
-        new_path = str(MediaStore.prepare_media_path(self.tbl_version.id, col.id, v_min, ext=ext))
-        os.rename(file_path, new_path)
-        new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(new_path))
-        return new_file_url
-
-    def _move_tmp_media_files(
-        self, table_rows: list[dict[str, Any]], media_cols: list[catalog.Column], v_min: int
-    ) -> None:
-        """Move tmp media files that we generated to a permanent location"""
-        for c in media_cols:
-            for table_row in table_rows:
-                file_url = table_row[c.store_name()]
-                table_row[c.store_name()] = self._move_tmp_media_file(file_url, c, v_min)
-
-    def _create_table_row(
-        self, input_row: exprs.DataRow, row_builder: exprs.RowBuilder, exc_col_ids: set[int], pk: tuple[int, ...]
-    ) -> tuple[dict[str, Any], int]:
-        """Return Tuple[complete table row, # of exceptions] for insert()
-        Creates a row that includes the PK columns, with the values from input_row.pk.
-        Returns:
-            Tuple[complete table row, # of exceptions]
-        """
-        table_row, num_excs = row_builder.create_table_row(input_row, exc_col_ids)
-        assert len(pk) == len(self._pk_cols)
-        for pk_col, pk_val in zip(self._pk_cols, pk):
-            table_row[pk_col.name] = pk_val
-        return table_row, num_excs
-
-    def count(self, conn: Optional[sql.engine.Connection] = None) -> int:
+    def count(self) -> int:
         """Return the number of rows visible in self.tbl_version"""
         stmt = (
             sql.select(sql.func.count('*'))
             .select_from(self.sa_tbl)
-            .where(self.v_min_col <= self.tbl_version.version)
-            .where(self.v_max_col > self.tbl_version.version)
+            .where(self.v_min_col <= self.tbl_version.get().version)
+            .where(self.v_max_col > self.tbl_version.get().version)
         )
-        …
-                result = conn.execute(stmt).scalar_one()
-        else:
-            result = conn.execute(stmt).scalar_one()
+        conn = Env.get().conn
+        result = conn.execute(stmt).scalar_one()
         assert isinstance(result, int)
         return result
 
-    def …
-        …
+    def _exec_if_not_exists(self, stmt: str, wait_for_table: bool) -> None:
+        """
+        Execute a statement containing 'IF NOT EXISTS' and ignore any duplicate object-related errors.
+
+        The statement needs to run in a separate transaction, because the expected error conditions will abort the
+        enclosing transaction (and the ability to run additional statements in that same transaction).
+        """
+        while True:
+            with Env.get().begin_xact(for_write=True) as conn:
+                try:
+                    if wait_for_table and not Env.get().is_using_cockroachdb:
+                        # Try to lock the table to make sure that it exists. This needs to run in the same transaction
+                        # as 'stmt' to avoid a race condition.
+                        # TODO: adapt this for CockroachDB
+                        lock_stmt = f'LOCK TABLE {self._storage_name()} IN ACCESS EXCLUSIVE MODE'
+                        conn.execute(sql.text(lock_stmt))
+                    conn.execute(sql.text(stmt))
+                    return
+                except (sql.exc.IntegrityError, sql.exc.ProgrammingError) as e:
+                    Env.get().console_logger.info(f'{stmt} failed with: {e}')
+                    if (
+                        isinstance(e.orig, psycopg.errors.UniqueViolation)
+                        and 'duplicate key value violates unique constraint' in str(e.orig)
+                    ) or (
+                        isinstance(e.orig, (psycopg.errors.DuplicateObject, psycopg.errors.DuplicateTable))
+                        and 'already exists' in str(e.orig)
+                    ):
+                        # table already exists
+                        return
+                    elif isinstance(e.orig, psycopg.errors.UndefinedTable):
+                        # the Lock Table failed because the table doesn't exist yet; try again
+                        time.sleep(1)
+                        continue
+                    else:
+                        raise
+
+    def _store_tbl_exists(self) -> bool:
+        """Returns True if the store table exists, False otherwise."""
+        with Env.get().begin_xact(for_write=False) as conn:
+            q = (
+                'SELECT COUNT(*) FROM pg_catalog.pg_tables '
+                f"WHERE schemaname = 'public' AND tablename = {self._storage_name()!r}"
+            )
+            res = conn.execute(sql.text(q)).scalar_one()
+            return res == 1
+
+    def create(self) -> None:
+        """
+        Create or update store table to bring it in sync with self.sa_tbl. Idempotent.
 
-    …
+        This runs a sequence of DDL statements (Create Table, Alter Table Add Column, Create Index), each of which
+        is run in its own transaction.
+
+        The exception to that are local replicas, for which TableRestorer creates an enclosing transaction. In theory,
+        this should avoid the potential for race conditions that motivate the error handling present in
+        _exec_if_not_exists() (meaning: we shouldn't see those errors when creating local replicas).
+        TODO: remove the special case for local replicas in order to make the logic easier to reason about.
+        """
+        postgres_dialect = sql.dialects.postgresql.dialect()
+
+        if not self._store_tbl_exists():
+            # run Create Table If Not Exists; we always need If Not Exists to avoid race conditions between concurrent
+            # Pixeltable processes
+            create_stmt = sql.schema.CreateTable(self.sa_tbl, if_not_exists=True).compile(dialect=postgres_dialect)
+            self._exec_if_not_exists(str(create_stmt), wait_for_table=False)
+        else:
+            # ensure that all columns exist by running Alter Table Add Column If Not Exists for all columns
+            for col in self.sa_tbl.columns:
+                stmt = self._add_column_stmt(col)
+                self._exec_if_not_exists(stmt, wait_for_table=True)
+            # TODO: do we also need to ensure that these columns are now visible (ie, is there another potential race
+            # condition here?)
+
+        # ensure that all system indices exist by running Create Index If Not Exists
+        for idx in self.sa_tbl.indexes:
+            create_idx_stmt = sql.schema.CreateIndex(idx, if_not_exists=True).compile(dialect=postgres_dialect)
+            self._exec_if_not_exists(str(create_idx_stmt), wait_for_table=True)
+
+        # ensure that all visible non-system indices exist by running appropriate create statements
+        for id in self.tbl_version.get().idxs:
+            self.create_index(id)
+
+    def create_index(self, idx_id: int) -> None:
+        """Create If Not Exists for this index"""
+        idx_info = self.tbl_version.get().idxs[idx_id]
+        stmt = idx_info.idx.sa_create_stmt(self.tbl_version.get()._store_idx_name(idx_id), idx_info.val_col.sa_col)
+        self._exec_if_not_exists(str(stmt), wait_for_table=True)
+
+    def validate(self) -> None:
+        """Validate store table against self.table_version"""
+        with Env.get().begin_xact() as conn:
+            # check that all columns are present
+            q = f'SELECT column_name FROM information_schema.columns WHERE table_name = {self._storage_name()!r}'
+            store_col_info = {row[0] for row in conn.execute(sql.text(q)).fetchall()}
+            tbl_col_info = {col.store_name() for col in self.tbl_version.get().cols if col.is_stored}
+            assert tbl_col_info.issubset(store_col_info)
+
+            # check that all visible indices are present
+            q = f'SELECT indexname FROM pg_indexes WHERE tablename = {self._storage_name()!r}'
+            store_idx_names = {row[0] for row in conn.execute(sql.text(q)).fetchall()}
+            tbl_index_names = {
+                self.tbl_version.get()._store_idx_name(info.id) for info in self.tbl_version.get().idxs.values()
+            }
+            assert tbl_index_names.issubset(store_idx_names)
+
+    def drop(self) -> None:
         """Drop store table"""
-        …
+        conn = Env.get().conn
+        drop_stmt = f'DROP TABLE IF EXISTS {self._storage_name()}'
+        conn.execute(sql.text(drop_stmt))
+
+    def _add_column_stmt(self, sa_col: sql.Column) -> str:
+        col_type_str = sa_col.type.compile(dialect=sql.dialects.postgresql.dialect())
+        return (
+            f'ALTER TABLE {self._storage_name()} ADD COLUMN IF NOT EXISTS '
+            f'{sa_col.name} {col_type_str} {"NOT " if not sa_col.nullable else ""} NULL'
+        )
 
-    def add_column(self, col: catalog.Column, …) -> None:
+    def add_column(self, col: catalog.Column) -> None:
         """Add column(s) to the store-resident table based on a catalog column
 
         Note that a computed catalog column will require two extra columns (for the computed value and for the error
         message).
         """
         assert col.is_stored
-        …
+        conn = Env.get().conn
+        col_type_str = col.sa_col_type.compile(dialect=conn.dialect)
+        s_txt = f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.store_name()} {col_type_str} NULL'
+        added_storage_cols = [col.store_name()]
+        if col.stores_cellmd:
+            cellmd_type_str = col.sa_cellmd_type().compile(dialect=conn.dialect)
+            s_txt += f' , ADD COLUMN {col.cellmd_store_name()} {cellmd_type_str} DEFAULT NULL'
+            added_storage_cols.append(col.cellmd_store_name())
+
+        stmt = sql.text(s_txt)
         log_stmt(_logger, stmt)
         conn.execute(stmt)
-        added_storage_cols = [col.store_name()]
-        if col.records_errors:
-            # we also need to create the errormsg and errortype storage cols
-            stmt = sql.text(f'ALTER TABLE {self._storage_name()} '
-                f'ADD COLUMN {col.errormsg_store_name()} VARCHAR DEFAULT NULL')
-            conn.execute(stmt)
-            stmt = sql.text(f'ALTER TABLE {self._storage_name()} '
-                f'ADD COLUMN {col.errortype_store_name()} VARCHAR DEFAULT NULL')
-            conn.execute(stmt)
-            added_storage_cols.extend([col.errormsg_store_name(), col.errortype_store_name()])
         self.create_sa_tbl()
         _logger.info(f'Added columns {added_storage_cols} to storage table {self._storage_name()}')
 
-    def drop_column(self, col: catalog.Column, …) -> None:
+    def drop_column(self, col: catalog.Column) -> None:
         """Execute Alter Table Drop Column statement"""
-        …
-    def load_column(
-        self,
-        col: catalog.Column,
-        exec_plan: ExecNode,
-        value_expr_slot_idx: int,
-        conn: sql.engine.Connection,
-        on_error: Literal['abort', 'ignore']
-    ) -> int:
+        s_txt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.store_name()}'
+        if col.stores_cellmd:
+            s_txt += f' , DROP COLUMN {col.cellmd_store_name()}'
+        stmt = sql.text(s_txt)
+        log_stmt(_logger, stmt)
+        Env.get().conn.execute(stmt)
+
+    def load_column(self, col: catalog.Column, exec_plan: ExecNode, abort_on_exc: bool) -> int:
         """Update store column of a computed column with values produced by an execution plan
 
         Returns:
@@ -229,142 +327,169 @@ class StoreBase:
             sql.exc.DBAPIError if there was a SQL error during execution
             excs.Error if on_error='abort' and there was an exception during row evaluation
         """
+        assert col.get_tbl().id == self.tbl_version.id
         num_excs = 0
         num_rows = 0
-
         # create temp table to store output of exec_plan, with the same primary key as the store table
         tmp_name = f'temp_{self._storage_name()}'
-        tmp_pk_cols = …
-        tmp_cols = tmp_pk_cols.copy()
+        tmp_pk_cols = tuple(sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns())
         tmp_val_col = sql.Column(col.sa_col.name, col.sa_col.type)
-        tmp_cols.append(tmp_val_col)
+        tmp_cols = [*tmp_pk_cols, tmp_val_col]
         # add error columns if the store column records errors
-        if col.records_errors:
-            tmp_errortype_col = sql.Column(col.sa_errortype_col.name, col.sa_errortype_col.type)
-            tmp_cols.append(tmp_errortype_col)
-            tmp_errormsg_col = sql.Column(col.sa_errormsg_col.name, col.sa_errormsg_col.type)
-            tmp_cols.append(tmp_errormsg_col)
+        if col.stores_cellmd:
+            tmp_cellmd_col = sql.Column(col.sa_cellmd_col.name, col.sa_cellmd_col.type)
+            tmp_cols.append(tmp_cellmd_col)
+        tmp_col_names = [col.name for col in tmp_cols]
+
         tmp_tbl = sql.Table(tmp_name, self.sa_md, *tmp_cols, prefixes=['TEMPORARY'])
+        conn = Env.get().conn
         tmp_tbl.create(bind=conn)
 
+        row_builder = exec_plan.row_builder
+
        try:
+            table_rows: list[tuple[Any]] = []
+
             # insert rows from exec_plan into temp table
             for row_batch in exec_plan:
                 num_rows += len(row_batch)
-                …
-                        tbl_row[col.sa_errortype_col.name] = error_type
-                        tbl_row[col.sa_errormsg_col.name] = error_msg
-                    else:
-                        val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
-                        if col.col_type.is_media_type():
-                            val = self._move_tmp_media_file(val, col, result_row.pk[-1])
-                        tbl_row[col.sa_col.name] = val
-                        if col.records_errors:
-                            tbl_row[col.sa_errortype_col.name] = None
-                            tbl_row[col.sa_errormsg_col.name] = None
-
-                    tbl_rows.append(tbl_row)
-                conn.execute(sql.insert(tmp_tbl), tbl_rows)
+                batch_table_rows: list[tuple[Any]] = []
+
+                for row in row_batch:
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise excs.Error(f'Error while evaluating computed column {col.name!r}:\n{exc}') from exc
+                    table_row, num_row_exc = row_builder.create_store_table_row(row, None, row.pk)
+                    num_excs += num_row_exc
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
+                    table_rows.clear()
+
+            if len(table_rows) > 0:
+                self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
 
             # update store table with values from temp table
             update_stmt = sql.update(self.sa_tbl)
             for pk_col, tmp_pk_col in zip(self.pk_columns(), tmp_pk_cols):
                 update_stmt = update_stmt.where(pk_col == tmp_pk_col)
             update_stmt = update_stmt.values({col.sa_col: tmp_val_col})
-            if col.records_errors:
-                update_stmt = update_stmt.values({
-                    col.sa_errortype_col: tmp_errortype_col,
-                    col.sa_errormsg_col: tmp_errormsg_col
-                })
+            if col.stores_cellmd:
+                update_stmt = update_stmt.values({col.sa_cellmd_col: tmp_cellmd_col})
             log_explain(_logger, update_stmt, conn)
             conn.execute(update_stmt)
 
         finally:
-            …
+
+            def remove_tmp_tbl() -> None:
+                self.sa_md.remove(tmp_tbl)
+                tmp_tbl.drop(bind=conn)
+
+            run_cleanup(remove_tmp_tbl, raise_error=False)
+
         return num_excs
 
     def insert_rows(
-        …
+        self,
+        exec_plan: ExecNode,
+        v_min: int,
+        show_progress: bool = True,
+        rowids: Iterator[int] | None = None,
+        abort_on_exc: bool = False,
+    ) -> tuple[set[int], RowCountStats]:
         """Insert rows into the store table and update the catalog table's md
         Returns:
             number of inserted rows, number of exceptions, set of column ids that have exceptions
         """
         assert v_min is not None
-        exec_plan.ctx.set_conn(conn)
         # TODO: total?
         num_excs = 0
         num_rows = 0
         cols_with_excs: set[int] = set()
-        progress_bar: Optional[tqdm] = None  # create this only after we started executing
+        progress_bar: tqdm | None = None  # create this only after we started executing
         row_builder = exec_plan.row_builder
-        …
+
+        store_col_names = row_builder.store_column_names()
+
         try:
+            table_rows: list[tuple[Any]] = []
             exec_plan.open()
+
             for row_batch in exec_plan:
                 num_rows += len(row_batch)
-                …
+                batch_table_rows: list[tuple[Any]] = []
+
+                # compute batch of rows and convert them into table rows
+                for row in row_batch:
+                    # if abort_on_exc == True, we need to check for media validation exceptions
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise exc
+
+                    rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
+                    pk = (*rowid, v_min)
+                    assert len(pk) == len(self._pk_cols)
+                    table_row, num_row_exc = row_builder.create_store_table_row(row, cols_with_excs, pk)
+                    num_excs += num_row_exc
+
+                    if show_progress and Env.get().verbosity >= 1:
+                        if progress_bar is None:
+                            warnings.simplefilter('ignore', category=TqdmWarning)
+                            progress_bar = tqdm(
+                                desc=f'Inserting rows into `{self.tbl_version.get().name}`',
+                                unit=' rows',
+                                ncols=100,
+                                file=sys.stdout,
+                            )
+                        progress_bar.update(1)
+
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                # if a batch is ready for insertion into the database, insert it
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+                    table_rows.clear()
+
+            # insert any remaining rows
+            if len(table_rows) > 0:
+                self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+
             if progress_bar is not None:
                 progress_bar.close()
-            …
+            computed_values = exec_plan.ctx.num_computed_exprs * num_rows
+            row_counts = RowCountStats(ins_rows=num_rows, num_excs=num_excs, computed_values=computed_values)
+
+            return cols_with_excs, row_counts
         finally:
             exec_plan.close()
 
-    …
+    @classmethod
+    def sql_insert(cls, sa_tbl: sql.Table, store_col_names: list[str], table_rows: list[tuple[Any]]) -> None:
+        assert len(table_rows) > 0
+        conn = Env.get().conn
+        conn.execute(sql.insert(sa_tbl), [dict(zip(store_col_names, table_row)) for table_row in table_rows])
+
+        # TODO: Inserting directly via psycopg delivers a small performance benefit, but is somewhat fraught due to
+        # differences in the data representation that SQLAlchemy/psycopg expect. The below code will do the
+        # insertion in psycopg and can be used if/when we decide to pursue that optimization.
+        # col_names_str = ", ".join(store_col_names)
+        # placeholders_str = ", ".join('%s' for _ in store_col_names)
+        # stmt_text = f'INSERT INTO {self.sa_tbl.name} ({col_names_str}) VALUES ({placeholders_str})'
+        # conn.exec_driver_sql(stmt_text, table_rows)
+
+    def _versions_clause(self, versions: list[int | None], match_on_vmin: bool) -> sql.ColumnElement[bool]:
         """Return filter for base versions"""
         v = versions[0]
         if v is None:
             # we're looking at live rows
-            clause = sql.and_(…)
+            clause = sql.and_(
+                self.v_min_col <= self.tbl_version.get().version, self.v_max_col == schema.Table.MAX_VERSION
+            )
         else:
             # we're looking at a specific version
             clause = self.v_min_col == v if match_on_vmin else self.v_max_col == v
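Note on `_exec_if_not_exists()` above: each DDL statement is issued with IF NOT EXISTS in its own short transaction; a duplicate-object error raised by a racing Pixeltable process counts as success, and an UndefinedTable error from the preceding LOCK TABLE means the table isn't visible yet, so the call sleeps and retries. A stripped-down sketch of that control flow against a plain SQLAlchemy engine — the psycopg error classes are the ones the diff checks, while the function itself is illustrative rather than Pixeltable's API:

import time

import psycopg
import sqlalchemy as sql


def exec_if_not_exists(engine: sql.Engine, stmt: str, lock_table: str | None = None) -> None:
    while True:
        try:
            # each attempt runs in its own transaction: a failed statement aborts the
            # transaction it runs in, so the retry can't share one with other work
            with engine.begin() as conn:
                if lock_table is not None:
                    # locking the table proves it exists, in the same transaction as stmt
                    conn.execute(sql.text(f'LOCK TABLE {lock_table} IN ACCESS EXCLUSIVE MODE'))
                conn.execute(sql.text(stmt))
            return
        except (sql.exc.IntegrityError, sql.exc.ProgrammingError) as e:
            if isinstance(e.orig, (psycopg.errors.DuplicateObject, psycopg.errors.DuplicateTable)):
                return  # a concurrent process created the object first; nothing left to do
            if isinstance(e.orig, psycopg.errors.UndefinedTable):
                time.sleep(1)  # the table isn't visible yet; wait and retry
                continue
            raise

Running each attempt under `engine.begin()` is the load-bearing detail: once Postgres reports an error, every later statement in the same transaction fails with "current transaction is aborted", which is why the docstring above insists the statement runs in a separate transaction.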
@@ -373,8 +498,12 @@ class StoreBase:
         return sql.and_(clause, self.base._versions_clause(versions[1:], match_on_vmin))
 
     def delete_rows(
-        …
+        self,
+        current_version: int,
+        base_versions: list[int | None],
+        match_on_vmin: bool,
+        where_clause: sql.ColumnElement[bool] | None,
+    ) -> int:
         """Mark rows as deleted that are live and were created prior to current_version.
         Also: populate the undo columns
         Args:
@@ -387,34 +516,63 @@
             number of deleted rows
         """
         where_clause = sql.true() if where_clause is None else where_clause
-        where_clause = sql.and_(
-            self.v_min_col < current_version,
-            self.v_max_col == schema.Table.MAX_VERSION,
-            where_clause)
+        version_clause = sql.and_(self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION)
         rowid_join_clause = self._rowid_join_predicate()
-        base_versions_clause = sql.true() if len(base_versions) == 0 \
-            else self.base._versions_clause(base_versions, match_on_vmin)
-        …
+        base_versions_clause = (
+            sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)
+        )
+        set_clause: dict[sql.Column, int | sql.Column] = {self.v_max_col: current_version}
+        for index_info in self.tbl_version.get().idxs_by_name.values():
             # copy value column to undo column
             set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
             # set value column to NULL
             set_clause[index_info.val_col.sa_col] = None
+
         stmt = (
             sql.update(self.sa_tbl)
             .values(set_clause)
             .where(where_clause)
+            .where(version_clause)
             .where(rowid_join_clause)
             .where(base_versions_clause)
         )
+        conn = Env.get().conn
         log_explain(_logger, stmt, conn)
         status = conn.execute(stmt)
         return status.rowcount
 
+    def dump_rows(self, version: int, filter_view: StoreBase, filter_view_version: int) -> Iterator[dict[str, Any]]:
+        filter_predicate = sql.and_(
+            filter_view.v_min_col <= filter_view_version,
+            filter_view.v_max_col > filter_view_version,
+            *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), filter_view.rowid_columns())],
+        )
+        stmt = (
+            sql.select(self.sa_tbl)
+            .where(self.v_min_col <= version)
+            .where(self.v_max_col > version)
+            .where(sql.exists().where(filter_predicate))
+        )
+        conn = Env.get().conn
+        _logger.debug(stmt)
+        log_explain(_logger, stmt, conn)
+        result = conn.execute(stmt)
+        for row in result:
+            yield dict(zip(result.keys(), row))
+
+    def load_rows(self, rows: Iterable[dict[str, Any]], batch_size: int = 10_000) -> None:
+        """
+        When instantiating a replica, we can't rely on the usual insertion code path, which contains error handling
+        and other logic that doesn't apply.
+        """
+        conn = Env.get().conn
+        for batch in more_itertools.batched(rows, batch_size):
+            conn.execute(sql.insert(self.sa_tbl), batch)
+
 
 class StoreTable(StoreBase):
     def __init__(self, tbl_version: catalog.TableVersion):
-        assert not tbl_version.is_view
+        assert not tbl_version.is_view
         super().__init__(tbl_version)
 
     def _create_rowid_columns(self) -> list[sql.Column]:
@@ -430,7 +588,7 @@ class StoreTable(StoreBase):
 
 class StoreView(StoreBase):
     def __init__(self, catalog_view: catalog.TableVersion):
-        assert catalog_view.is_view
+        assert catalog_view.is_view
         super().__init__(catalog_view)
 
     def _create_rowid_columns(self) -> list[sql.Column]:
@@ -444,7 +602,9 @@ class StoreView(StoreBase):
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
         return sql.and_(
             self.base._rowid_join_predicate(),
-            *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), self.base.rowid_columns())])
+            *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), self.base.rowid_columns())],
+        )
+
 
 class StoreComponentView(StoreView):
     """A view that stores components of its base, as produced by a ComponentIterator
@@ -452,28 +612,34 @@ class StoreComponentView(StoreView):
     PK: now also includes pos, the position returned by the ComponentIterator for the base row identified by base_rowid
     """
 
-    rowid_cols: list[sql.Column]
-    pos_col: sql.Column
-    pos_col_idx: int
-
     def __init__(self, catalog_view: catalog.TableVersion):
         super().__init__(catalog_view)
 
     def _create_rowid_columns(self) -> list[sql.Column]:
         # each base row is expanded into n view rows
-        …
+        rowid_cols = [sql.Column(c.name, c.type) for c in self.base.rowid_columns()]
         # name of pos column: avoid collisions with bases' pos columns
-        …
-    def …
-        …
+        pos_col = sql.Column(f'pos_{len(rowid_cols) - 1}', sql.BigInteger, nullable=False)
+        rowid_cols.append(pos_col)
+        return rowid_cols
+
+    @property
+    def pos_col(self) -> sql.Column:
+        return self.rowid_columns()[-1]
+
+    @property
+    def pos_col_idx(self) -> int:
+        return len(self.rowid_columns()) - 1
+
+    def create_sa_tbl(self, tbl_version: catalog.TableVersion | None = None) -> None:
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
+        super().create_sa_tbl(tbl_version)
         # we need to fix up the 'pos' column in TableVersion
-        …
+        tbl_version.cols_by_name['pos'].sa_col = self.pos_col
 
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
         return sql.and_(
             self.base._rowid_join_predicate(),
-            *[c1 == c2 for c1, c2 in zip(self.rowid_columns()[:-1], self.base.rowid_columns())])
+            *[c1 == c2 for c1, c2 in zip(self.rowid_columns()[:-1], self.base.rowid_columns())],
+        )