pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- pixeltable/__init__.py +23 -5
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -3
- pixeltable/catalog/catalog.py +1318 -404
- pixeltable/catalog/column.py +186 -115
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +11 -43
- pixeltable/catalog/insertable_table.py +167 -79
- pixeltable/catalog/path.py +61 -23
- pixeltable/catalog/schema_object.py +9 -10
- pixeltable/catalog/table.py +626 -308
- pixeltable/catalog/table_metadata.py +101 -0
- pixeltable/catalog/table_version.py +713 -569
- pixeltable/catalog/table_version_handle.py +37 -6
- pixeltable/catalog/table_version_path.py +42 -29
- pixeltable/catalog/tbl_ops.py +50 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +108 -94
- pixeltable/config.py +128 -22
- pixeltable/dataframe.py +188 -100
- pixeltable/env.py +407 -136
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +3 -0
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +7 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +190 -30
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/in_memory_data_node.py +18 -18
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +206 -101
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +34 -30
- pixeltable/exprs/column_ref.py +92 -96
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +152 -55
- pixeltable/exprs/expr.py +62 -43
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +75 -37
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +10 -27
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +127 -53
- pixeltable/exprs/rowid_ref.py +8 -12
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +10 -10
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +20 -18
- pixeltable/func/signature.py +43 -16
- pixeltable/func/tools.py +23 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +6 -0
- pixeltable/functions/anthropic.py +93 -33
- pixeltable/functions/audio.py +114 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/deepseek.py +20 -9
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +28 -11
- pixeltable/functions/globals.py +13 -13
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1046 -23
- pixeltable/functions/image.py +9 -18
- pixeltable/functions/llama_cpp.py +23 -8
- pixeltable/functions/math.py +3 -4
- pixeltable/functions/mistralai.py +4 -15
- pixeltable/functions/ollama.py +16 -9
- pixeltable/functions/openai.py +104 -82
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +2 -2
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +13 -14
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/video.py +1388 -106
- pixeltable/functions/vision.py +7 -7
- pixeltable/functions/whisper.py +15 -7
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +332 -105
- pixeltable/index/base.py +13 -22
- pixeltable/index/btree.py +23 -22
- pixeltable/index/embedding_index.py +32 -44
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +7 -6
- pixeltable/io/external_store.py +49 -77
- pixeltable/io/fiftyone.py +11 -11
- pixeltable/io/globals.py +29 -28
- pixeltable/io/hf_datasets.py +17 -9
- pixeltable/io/label_studio.py +70 -66
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +12 -11
- pixeltable/io/parquet.py +13 -93
- pixeltable/io/table_data_conduit.py +71 -47
- pixeltable/io/utils.py +3 -3
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +21 -11
- pixeltable/iterators/document.py +116 -55
- pixeltable/iterators/image.py +5 -2
- pixeltable/iterators/video.py +293 -13
- pixeltable/metadata/__init__.py +4 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/util.py +13 -12
- pixeltable/metadata/notes.py +4 -0
- pixeltable/metadata/schema.py +79 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +274 -223
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +259 -129
- pixeltable/share/protocol/__init__.py +34 -0
- pixeltable/share/protocol/common.py +170 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +109 -0
- pixeltable/share/publish.py +213 -57
- pixeltable/store.py +238 -175
- pixeltable/type_system.py +104 -63
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +108 -13
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +528 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +392 -0
- pixeltable-0.4.20.dist-info/METADATA +587 -0
- pixeltable-0.4.20.dist-info/RECORD +218 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
- pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable/utils/sample.py +0 -25
- pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
- pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
- pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/store.py
CHANGED
```diff
@@ -2,23 +2,22 @@ from __future__ import annotations
 
 import abc
 import logging
-import os
 import sys
-import urllib.parse
-import urllib.request
+import time
 import warnings
-from typing import Any, Iterable, Iterator, Literal, Optional
+from typing import Any, Iterable, Iterator
 
 import more_itertools
+import psycopg
 import sqlalchemy as sql
 from tqdm import TqdmWarning, tqdm
 
-from pixeltable import catalog, exceptions as excs, exprs
+from pixeltable import catalog, exceptions as excs
+from pixeltable.catalog.update_status import RowCountStats
 from pixeltable.env import Env
 from pixeltable.exec import ExecNode
 from pixeltable.metadata import schema
 from pixeltable.utils.exception_handler import run_cleanup
-from pixeltable.utils.media_store import MediaStore
 from pixeltable.utils.sql import log_explain, log_stmt
 
 _logger = logging.getLogger('pixeltable')
@@ -35,13 +34,16 @@ class StoreBase:
 
     tbl_version: catalog.TableVersionHandle
     sa_md: sql.MetaData
-    sa_tbl: Optional[sql.Table]
+    sa_tbl: sql.Table | None
     _pk_cols: list[sql.Column]
     v_min_col: sql.Column
     v_max_col: sql.Column
-    base: Optional[StoreBase]
+    base: StoreBase | None
 
-
+    # In my cursory experiments this was the optimal batch size: it was an improvement over 5_000 and there was no real
+    # benefit to going higher.
+    # TODO: Perform more rigorous experiments with different table structures and OS environments to refine this.
+    __INSERT_BATCH_SIZE = 10_000
 
     def __init__(self, tbl_version: catalog.TableVersion):
         self.tbl_version = catalog.TableVersionHandle(
@@ -78,20 +80,20 @@ class StoreBase:
         self._pk_cols = [*rowid_cols, self.v_min_col]
         return [*rowid_cols, self.v_min_col, self.v_max_col]
 
-    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
+    def create_sa_tbl(self, tbl_version: catalog.TableVersion | None = None) -> None:
         """Create self.sa_tbl from self.tbl_version."""
         if tbl_version is None:
             tbl_version = self.tbl_version.get()
         system_cols = self._create_system_columns()
         all_cols = system_cols.copy()
+        # we captured all columns, including dropped ones: they're still part of the physical table
         for col in [c for c in tbl_version.cols if c.is_stored]:
             # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
             # to the last sql.Table version we created and cannot be reused
             col.create_sa_cols()
             all_cols.append(col.sa_col)
-            if col.records_errors:
-                all_cols.append(col.sa_errormsg_col)
-                all_cols.append(col.sa_errortype_col)
+            if col.stores_cellmd:
+                all_cols.append(col.sa_cellmd_col)
 
         if self.sa_tbl is not None:
             # if we're called in response to a schema change, we need to remove the old table first
@@ -122,51 +124,6 @@ class StoreBase:
     def _storage_name(self) -> str:
         """Return the name of the data store table"""
 
-    def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column, v_min: int) -> str:
-        """Move tmp media file with given url to Env.media_dir and return new url, or given url if not a tmp_dir file"""
-        pxt_tmp_dir = str(Env.get().tmp_dir)
-        if file_url is None:
-            return None
-        parsed = urllib.parse.urlparse(file_url)
-        # We should never be passed a local file path here. The "len > 1" ensures that Windows
-        # file paths aren't mistaken for URLs with a single-character scheme.
-        assert len(parsed.scheme) > 1
-        if parsed.scheme != 'file':
-            # remote url
-            return file_url
-        file_path = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
-        if not file_path.startswith(pxt_tmp_dir):
-            # not a tmp file
-            return file_url
-        _, ext = os.path.splitext(file_path)
-        new_path = str(MediaStore.prepare_media_path(self.tbl_version.id, col.id, v_min, ext=ext))
-        os.rename(file_path, new_path)
-        new_file_url = urllib.parse.urljoin('file:', urllib.request.pathname2url(new_path))
-        return new_file_url
-
-    def _move_tmp_media_files(
-        self, table_rows: list[dict[str, Any]], media_cols: list[catalog.Column], v_min: int
-    ) -> None:
-        """Move tmp media files that we generated to a permanent location"""
-        for c in media_cols:
-            for table_row in table_rows:
-                file_url = table_row[c.store_name()]
-                table_row[c.store_name()] = self._move_tmp_media_file(file_url, c, v_min)
-
-    def _create_table_row(
-        self, input_row: exprs.DataRow, row_builder: exprs.RowBuilder, exc_col_ids: set[int], pk: tuple[int, ...]
-    ) -> tuple[dict[str, Any], int]:
-        """Return Tuple[complete table row, # of exceptions] for insert()
-        Creates a row that includes the PK columns, with the values from input_row.pk.
-        Returns:
-            Tuple[complete table row, # of exceptions]
-        """
-        table_row, num_excs = row_builder.create_table_row(input_row, exc_col_ids)
-        assert len(pk) == len(self._pk_cols)
-        for pk_col, pk_val in zip(self._pk_cols, pk):
-            table_row[pk_col.name] = pk_val
-        return table_row, num_excs
-
     def count(self) -> int:
         """Return the number of rows visible in self.tbl_version"""
         stmt = (
@@ -180,15 +137,123 @@ class StoreBase:
         assert isinstance(result, int)
         return result
 
+    def _exec_if_not_exists(self, stmt: str, wait_for_table: bool) -> None:
+        """
+        Execute a statement containing 'IF NOT EXISTS' and ignore any duplicate object-related errors.
+
+        The statement needs to run in a separate transaction, because the expected error conditions will abort the
+        enclosing transaction (and the ability to run additional statements in that same transaction).
+        """
+        while True:
+            with Env.get().begin_xact(for_write=True) as conn:
+                try:
+                    if wait_for_table and not Env.get().is_using_cockroachdb:
+                        # Try to lock the table to make sure that it exists. This needs to run in the same transaction
+                        # as 'stmt' to avoid a race condition.
+                        # TODO: adapt this for CockroachDB
+                        lock_stmt = f'LOCK TABLE {self._storage_name()} IN ACCESS EXCLUSIVE MODE'
+                        conn.execute(sql.text(lock_stmt))
+                    conn.execute(sql.text(stmt))
+                    return
+                except (sql.exc.IntegrityError, sql.exc.ProgrammingError) as e:
+                    Env.get().console_logger.info(f'{stmt} failed with: {e}')
+                    if (
+                        isinstance(e.orig, psycopg.errors.UniqueViolation)
+                        and 'duplicate key value violates unique constraint' in str(e.orig)
+                    ) or (
+                        isinstance(e.orig, (psycopg.errors.DuplicateObject, psycopg.errors.DuplicateTable))
+                        and 'already exists' in str(e.orig)
+                    ):
+                        # table already exists
+                        return
+                    elif isinstance(e.orig, psycopg.errors.UndefinedTable):
+                        # the Lock Table failed because the table doesn't exist yet; try again
+                        time.sleep(1)
+                        continue
+                    else:
+                        raise
+
+    def _store_tbl_exists(self) -> bool:
+        """Returns True if the store table exists, False otherwise."""
+        with Env.get().begin_xact(for_write=False) as conn:
+            q = (
+                'SELECT COUNT(*) FROM pg_catalog.pg_tables '
+                f"WHERE schemaname = 'public' AND tablename = {self._storage_name()!r}"
+            )
+            res = conn.execute(sql.text(q)).scalar_one()
+            return res == 1
+
     def create(self) -> None:
-
-        self.
+        """
+        Create or update store table to bring it in sync with self.sa_tbl. Idempotent.
+
+        This runs a sequence of DDL statements (Create Table, Alter Table Add Column, Create Index), each of which
+        is run in its own transaction.
+
+        The exception to that are local replicas, for which TableRestorer creates an enclosing transaction. In theory,
+        this should avoid the potential for race conditions that motivate the error handling present in
+        _exec_if_not_exists() (meaning: we shouldn't see those errors when creating local replicas).
+        TODO: remove the special case for local replicas in order to make the logic easier to reason about.
+        """
+        postgres_dialect = sql.dialects.postgresql.dialect()
+
+        if not self._store_tbl_exists():
+            # run Create Table If Not Exists; we always need If Not Exists to avoid race conditions between concurrent
+            # Pixeltable processes
+            create_stmt = sql.schema.CreateTable(self.sa_tbl, if_not_exists=True).compile(dialect=postgres_dialect)
+            self._exec_if_not_exists(str(create_stmt), wait_for_table=False)
+        else:
+            # ensure that all columns exist by running Alter Table Add Column If Not Exists for all columns
+            for col in self.sa_tbl.columns:
+                stmt = self._add_column_stmt(col)
+                self._exec_if_not_exists(stmt, wait_for_table=True)
+            # TODO: do we also need to ensure that these columns are now visible (ie, is there another potential race
+            # condition here?)
+
+        # ensure that all system indices exist by running Create Index If Not Exists
+        for idx in self.sa_tbl.indexes:
+            create_idx_stmt = sql.schema.CreateIndex(idx, if_not_exists=True).compile(dialect=postgres_dialect)
+            self._exec_if_not_exists(str(create_idx_stmt), wait_for_table=True)
+
+        # ensure that all visible non-system indices exist by running appropriate create statements
+        for id in self.tbl_version.get().idxs:
+            self.create_index(id)
+
+    def create_index(self, idx_id: int) -> None:
+        """Create If Not Exists for this index"""
+        idx_info = self.tbl_version.get().idxs[idx_id]
+        stmt = idx_info.idx.sa_create_stmt(self.tbl_version.get()._store_idx_name(idx_id), idx_info.val_col.sa_col)
+        self._exec_if_not_exists(str(stmt), wait_for_table=True)
+
+    def validate(self) -> None:
+        """Validate store table against self.table_version"""
+        with Env.get().begin_xact() as conn:
+            # check that all columns are present
+            q = f'SELECT column_name FROM information_schema.columns WHERE table_name = {self._storage_name()!r}'
+            store_col_info = {row[0] for row in conn.execute(sql.text(q)).fetchall()}
+            tbl_col_info = {col.store_name() for col in self.tbl_version.get().cols if col.is_stored}
+            assert tbl_col_info.issubset(store_col_info)
+
+            # check that all visible indices are present
+            q = f'SELECT indexname FROM pg_indexes WHERE tablename = {self._storage_name()!r}'
+            store_idx_names = {row[0] for row in conn.execute(sql.text(q)).fetchall()}
+            tbl_index_names = {
+                self.tbl_version.get()._store_idx_name(info.id) for info in self.tbl_version.get().idxs.values()
+            }
+            assert tbl_index_names.issubset(store_idx_names)
 
     def drop(self) -> None:
         """Drop store table"""
         conn = Env.get().conn
         self.sa_md.drop_all(bind=conn)
 
+    def _add_column_stmt(self, sa_col: sql.Column) -> str:
+        col_type_str = sa_col.type.compile(dialect=sql.dialects.postgresql.dialect())
+        return (
+            f'ALTER TABLE {self._storage_name()} ADD COLUMN IF NOT EXISTS '
+            f'{sa_col.name} {col_type_str} {"NOT " if not sa_col.nullable else ""} NULL'
+        )
+
     def add_column(self, col: catalog.Column) -> None:
         """Add column(s) to the store-resident table based on a catalog column
 
```
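The `_exec_if_not_exists()` / `create()` pair above is the heart of the new idempotent DDL path: every CREATE TABLE, ADD COLUMN, and CREATE INDEX carries IF NOT EXISTS, duplicate-object errors from concurrent Pixeltable processes are treated as success, and a failed LOCK TABLE (the table isn't visible yet) triggers a retry. A minimal standalone sketch of the same retry pattern, written directly against psycopg; the DSN and table name here are placeholders, not Pixeltable API:

```python
import time

import psycopg


def exec_if_not_exists(dsn: str, table: str, ddl: str, wait_for_table: bool) -> None:
    """Run a single 'IF NOT EXISTS' DDL statement, tolerating concurrent creators."""
    while True:
        with psycopg.connect(dsn) as conn:
            try:
                # each attempt runs in its own transaction: the errors caught below would
                # abort an enclosing transaction and poison any later statements in it
                with conn.transaction():
                    if wait_for_table:
                        # locking the table in the same transaction as the DDL proves it exists
                        conn.execute(f'LOCK TABLE {table} IN ACCESS EXCLUSIVE MODE')
                    conn.execute(ddl)
                return
            except (psycopg.errors.DuplicateTable, psycopg.errors.DuplicateObject, psycopg.errors.UniqueViolation):
                # another process won the race; the object exists, which is all we wanted
                return
            except psycopg.errors.UndefinedTable:
                # LOCK TABLE ran before the creating process committed; wait and retry
                time.sleep(1)
```

Calling it with, e.g., `exec_if_not_exists(dsn, 'tbl_120', 'CREATE INDEX IF NOT EXISTS idx_0 ON tbl_120 (col_0)', wait_for_table=True)` mirrors what `create_index()` does through SQLAlchemy (all names in that call are hypothetical).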
```diff
@@ -197,14 +262,13 @@ class StoreBase:
         """
         assert col.is_stored
         conn = Env.get().conn
-        col_type_str = col.
+        col_type_str = col.sa_col_type.compile(dialect=conn.dialect)
         s_txt = f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.store_name()} {col_type_str} NULL'
         added_storage_cols = [col.store_name()]
-        if col.records_errors:
-
-            s_txt += f' , ADD COLUMN {col.
-
-            added_storage_cols.extend([col.errormsg_store_name(), col.errortype_store_name()])
+        if col.stores_cellmd:
+            cellmd_type_str = col.sa_cellmd_type().compile(dialect=conn.dialect)
+            s_txt += f' , ADD COLUMN {col.cellmd_store_name()} {cellmd_type_str} DEFAULT NULL'
+            added_storage_cols.append(col.cellmd_store_name())
 
         stmt = sql.text(s_txt)
         log_stmt(_logger, stmt)
```
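Both the new `_add_column_stmt()` and the reworked `add_column()` lean on the same SQLAlchemy facility: compiling a column's type object into its Postgres DDL string. A small illustration of that API; the table name, column names, and the JSONB type chosen for the cellmd companion column are assumptions for the example, not Pixeltable's actual schema:

```python
import sqlalchemy as sql
from sqlalchemy.dialects import postgresql

# stand-ins for a value column and its cell-metadata companion column
val_col = sql.Column('col_7', sql.String)
cellmd_col = sql.Column('col_7_cellmd', postgresql.JSONB)

dialect = postgresql.dialect()
s_txt = (
    f'ALTER TABLE tbl_120 ADD COLUMN {val_col.name} {val_col.type.compile(dialect=dialect)} NULL'
    f' , ADD COLUMN {cellmd_col.name} {cellmd_col.type.compile(dialect=dialect)} DEFAULT NULL'
)
print(s_txt)
# ALTER TABLE tbl_120 ADD COLUMN col_7 VARCHAR NULL , ADD COLUMN col_7_cellmd JSONB DEFAULT NULL
```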
```diff
@@ -215,25 +279,13 @@ class StoreBase:
     def drop_column(self, col: catalog.Column) -> None:
         """Execute Alter Table Drop Column statement"""
         s_txt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.store_name()}'
-        if col.records_errors:
-            s_txt += f' , DROP COLUMN {col.errormsg_store_name()}'
-            s_txt += f' , DROP COLUMN {col.errortype_store_name()}'
+        if col.stores_cellmd:
+            s_txt += f' , DROP COLUMN {col.cellmd_store_name()}'
         stmt = sql.text(s_txt)
         log_stmt(_logger, stmt)
         Env.get().conn.execute(stmt)
 
-    def
-        conn = Env.get().conn
-        sql_text = f'SELECT column_name FROM information_schema.columns WHERE table_name = {self._storage_name()!r}'
-        result = conn.execute(sql.text(sql_text))
-        existing_cols = {row[0] for row in result}
-        for col in cols:
-            if col.store_name() not in existing_cols:
-                self.add_column(col)
-
-    def load_column(
-        self, col: catalog.Column, exec_plan: ExecNode, value_expr_slot_idx: int, on_error: Literal['abort', 'ignore']
-    ) -> int:
+    def load_column(self, col: catalog.Column, exec_plan: ExecNode, abort_on_exc: bool) -> int:
         """Update store column of a computed column with values produced by an execution plan
 
         Returns:
@@ -242,84 +294,69 @@ class StoreBase:
             sql.exc.DBAPIError if there was a SQL error during execution
             excs.Error if on_error='abort' and there was an exception during row evaluation
         """
-        assert col.tbl.id == self.tbl_version.id
+        assert col.get_tbl().id == self.tbl_version.id
         num_excs = 0
         num_rows = 0
         # create temp table to store output of exec_plan, with the same primary key as the store table
        tmp_name = f'temp_{self._storage_name()}'
-        tmp_pk_cols =
-        tmp_cols = tmp_pk_cols.copy()
+        tmp_pk_cols = tuple(sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns())
         tmp_val_col = sql.Column(col.sa_col.name, col.sa_col.type)
-        tmp_cols.append(tmp_val_col)
+        tmp_cols = [*tmp_pk_cols, tmp_val_col]
         # add error columns if the store column records errors
-        if col.records_errors:
-
-            tmp_cols.append(
-
-
+        if col.stores_cellmd:
+            tmp_cellmd_col = sql.Column(col.sa_cellmd_col.name, col.sa_cellmd_col.type)
+            tmp_cols.append(tmp_cellmd_col)
+        tmp_col_names = [col.name for col in tmp_cols]
+
         tmp_tbl = sql.Table(tmp_name, self.sa_md, *tmp_cols, prefixes=['TEMPORARY'])
         conn = Env.get().conn
         tmp_tbl.create(bind=conn)
 
+        row_builder = exec_plan.row_builder
+
         try:
+            table_rows: list[tuple[Any]] = []
+
             # insert rows from exec_plan into temp table
-            # TODO: unify the table row construction logic with RowBuilder.create_table_row()
             for row_batch in exec_plan:
                 num_rows += len(row_batch)
-                ... (18 removed lines not preserved in the source rendering)
-                        tbl_row[col.sa_errortype_col.name] = error_type
-                        tbl_row[col.sa_errormsg_col.name] = error_msg
-                    else:
-                        if col.col_type.is_image_type() and result_row.file_urls[value_expr_slot_idx] is None:
-                            # we have yet to store this image
-                            filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
-                            result_row.flush_img(value_expr_slot_idx, filepath)
-                        val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
-                        if col.col_type.is_media_type():
-                            val = self._move_tmp_media_file(val, col, result_row.pk[-1])
-                        tbl_row[col.sa_col.name] = val
-                        if col.records_errors:
-                            tbl_row[col.sa_errortype_col.name] = None
-                            tbl_row[col.sa_errormsg_col.name] = None
-
-                    tbl_rows.append(tbl_row)
-            conn.execute(sql.insert(tmp_tbl), tbl_rows)
+                batch_table_rows: list[tuple[Any]] = []
+
+                for row in row_batch:
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise excs.Error(f'Error while evaluating computed column {col.name!r}:\n{exc}') from exc
+                    table_row, num_row_exc = row_builder.create_store_table_row(row, None, row.pk)
+                    num_excs += num_row_exc
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
+                    table_rows.clear()
+
+            if len(table_rows) > 0:
+                self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
 
             # update store table with values from temp table
             update_stmt = sql.update(self.sa_tbl)
             for pk_col, tmp_pk_col in zip(self.pk_columns(), tmp_pk_cols):
                 update_stmt = update_stmt.where(pk_col == tmp_pk_col)
             update_stmt = update_stmt.values({col.sa_col: tmp_val_col})
-            if col.records_errors:
-                update_stmt = update_stmt.values(
-                    {col.sa_errortype_col: tmp_errortype_col, col.sa_errormsg_col: tmp_errormsg_col}
-                )
+            if col.stores_cellmd:
+                update_stmt = update_stmt.values({col.sa_cellmd_col: tmp_cellmd_col})
             log_explain(_logger, update_stmt, conn)
             conn.execute(update_stmt)
+
         finally:
 
             def remove_tmp_tbl() -> None:
                 self.sa_md.remove(tmp_tbl)
                 tmp_tbl.drop(bind=conn)
 
-            run_cleanup(remove_tmp_tbl, raise_error=True)
+            run_cleanup(remove_tmp_tbl, raise_error=False)
+
         return num_excs
 
     def insert_rows(
```
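`load_column()`'s backfill strategy survives the rewrite: stage computed values in a TEMPORARY table keyed by the store table's primary key, then apply them with a single join-based UPDATE rather than per-row updates. What changed is the staging step, which now flushes in `__INSERT_BATCH_SIZE` chunks through `sql_insert()`. A reduced SQLAlchemy sketch of the overall shape, with hypothetical table and column names:

```python
import sqlalchemy as sql

md = sql.MetaData()
store = sql.Table(
    'tbl_120',
    md,
    sql.Column('rowid', sql.BigInteger, primary_key=True),
    sql.Column('v_min', sql.BigInteger, primary_key=True),
    sql.Column('col_7', sql.String),
)
tmp = sql.Table(
    'temp_tbl_120',
    md,
    sql.Column('rowid', sql.BigInteger, primary_key=True),
    sql.Column('v_min', sql.BigInteger, primary_key=True),
    sql.Column('col_7', sql.String),
    prefixes=['TEMPORARY'],
)


def backfill(conn: sql.Connection, rows: list[dict]) -> None:
    """Stage computed values in the temp table, then update the store table in one statement."""
    tmp.create(bind=conn)
    conn.execute(sql.insert(tmp), rows)  # in the real code, flushed every __INSERT_BATCH_SIZE rows
    update_stmt = (
        sql.update(store)
        .where(store.c.rowid == tmp.c.rowid)
        .where(store.c.v_min == tmp.c.v_min)
        .values({store.c.col_7: tmp.c.col_7})
    )
    conn.execute(update_stmt)  # on Postgres this compiles to UPDATE ... FROM temp_tbl_120
```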
```diff
@@ -327,9 +364,9 @@ class StoreBase:
         exec_plan: ExecNode,
         v_min: int,
         show_progress: bool = True,
-        rowids: Optional[Iterator[int]] = None,
+        rowids: Iterator[int] | None = None,
         abort_on_exc: bool = False,
-    ) -> tuple[
+    ) -> tuple[set[int], RowCountStats]:
         """Insert rows into the store table and update the catalog table's md
         Returns:
             number of inserted rows, number of exceptions, set of column ids that have exceptions
@@ -339,53 +376,80 @@ class StoreBase:
         num_excs = 0
         num_rows = 0
         cols_with_excs: set[int] = set()
-        progress_bar: Optional[tqdm] = None
+        progress_bar: tqdm | None = None  # create this only after we started executing
         row_builder = exec_plan.row_builder
-
-
+
+        store_col_names = row_builder.store_column_names()
 
         try:
+            table_rows: list[tuple[Any]] = []
             exec_plan.open()
+
             for row_batch in exec_plan:
                 num_rows += len(row_batch)
-                ... (31 removed lines not preserved in the source rendering)
+                batch_table_rows: list[tuple[Any]] = []
+
+                # compute batch of rows and convert them into table rows
+                for row in row_batch:
+                    # if abort_on_exc == True, we need to check for media validation exceptions
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise exc
+
+                    rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
+                    pk = (*rowid, v_min)
+                    assert len(pk) == len(self._pk_cols)
+                    table_row, num_row_exc = row_builder.create_store_table_row(row, cols_with_excs, pk)
+                    num_excs += num_row_exc
+
+                    if show_progress and Env.get().verbosity >= 1:
+                        if progress_bar is None:
+                            warnings.simplefilter('ignore', category=TqdmWarning)
+                            progress_bar = tqdm(
+                                desc=f'Inserting rows into `{self.tbl_version.get().name}`',
+                                unit=' rows',
+                                ncols=100,
+                                file=sys.stdout,
+                            )
+                        progress_bar.update(1)
+
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                # if a batch is ready for insertion into the database, insert it
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+                    table_rows.clear()
+
+            # insert any remaining rows
+            if len(table_rows) > 0:
+                self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+
             if progress_bar is not None:
                 progress_bar.close()
-
+            computed_values = exec_plan.ctx.num_computed_exprs * num_rows
+            row_counts = RowCountStats(ins_rows=num_rows, num_excs=num_excs, computed_values=computed_values)
+
+            return cols_with_excs, row_counts
         finally:
             exec_plan.close()
 
-
+    @classmethod
+    def sql_insert(cls, sa_tbl: sql.Table, store_col_names: list[str], table_rows: list[tuple[Any]]) -> None:
+        assert len(table_rows) > 0
+        conn = Env.get().conn
+        conn.execute(sql.insert(sa_tbl), [dict(zip(store_col_names, table_row)) for table_row in table_rows])
+
+        # TODO: Inserting directly via psycopg delivers a small performance benefit, but is somewhat fraught due to
+        #   differences in the data representation that SQLAlchemy/psycopg expect. The below code will do the
+        #   insertion in psycopg and can be used if/when we decide to pursue that optimization.
+        # col_names_str = ", ".join(store_col_names)
+        # placeholders_str = ", ".join('%s' for _ in store_col_names)
+        # stmt_text = f'INSERT INTO {self.sa_tbl.name} ({col_names_str}) VALUES ({placeholders_str})'
+        # conn.exec_driver_sql(stmt_text, table_rows)
+
+    def _versions_clause(self, versions: list[int | None], match_on_vmin: bool) -> sql.ColumnElement[bool]:
         """Return filter for base versions"""
         v = versions[0]
         if v is None:
```
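The rewritten `insert_rows()` no longer materializes one dict per row up front; it accumulates tuples and hands them to `sql_insert()` every `__INSERT_BATCH_SIZE` (10,000) rows, bounding memory on large inserts while amortizing executemany's per-statement overhead. The control flow, reduced to a standalone sketch (the connection, table, and column names are placeholders):

```python
from typing import Any, Iterable

import sqlalchemy as sql

INSERT_BATCH_SIZE = 10_000  # mirrors StoreBase.__INSERT_BATCH_SIZE


def insert_batched(
    conn: sql.Connection, sa_tbl: sql.Table, col_names: list[str], rows: Iterable[tuple[Any, ...]]
) -> int:
    """Insert rows in fixed-size executemany batches; returns the number of rows inserted."""
    buf: list[tuple[Any, ...]] = []
    num_rows = 0
    for row in rows:
        buf.append(row)
        num_rows += 1
        if len(buf) >= INSERT_BATCH_SIZE:
            conn.execute(sql.insert(sa_tbl), [dict(zip(col_names, r)) for r in buf])
            buf.clear()
    if buf:  # flush the remainder
        conn.execute(sql.insert(sa_tbl), [dict(zip(col_names, r)) for r in buf])
    return num_rows
```

The commented-out `exec_driver_sql()` variant in `sql_insert()` would skip the dict conversion and go through psycopg directly, which is where the "small performance benefit" noted in the TODO would come from.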
```diff
@@ -403,9 +467,9 @@ class StoreBase:
     def delete_rows(
         self,
         current_version: int,
-        base_versions: list[Optional[int]],
+        base_versions: list[int | None],
         match_on_vmin: bool,
-        where_clause: Optional[sql.ColumnElement[bool]],
+        where_clause: sql.ColumnElement[bool] | None,
     ) -> int:
         """Mark rows as deleted that are live and were created prior to current_version.
         Also: populate the undo columns
@@ -424,7 +488,7 @@ class StoreBase:
         base_versions_clause = (
             sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)
         )
-        set_clause: dict[sql.Column,
+        set_clause: dict[sql.Column, int | sql.Column] = {self.v_max_col: current_version}
         for index_info in self.tbl_version.get().idxs_by_name.values():
             # copy value column to undo column
             set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
@@ -451,8 +515,7 @@ class StoreBase:
                 *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), filter_view.rowid_columns())],
             )
             stmt = (
-                sql.select(
-                .select_from(self.sa_tbl)
+                sql.select(self.sa_tbl)
                 .where(self.v_min_col <= version)
                 .where(self.v_max_col > version)
                 .where(sql.exists().where(filter_predicate))
@@ -532,7 +595,7 @@ class StoreComponentView(StoreView):
         self.rowid_cols.append(self.pos_col)
         return self.rowid_cols
 
-    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
+    def create_sa_tbl(self, tbl_version: catalog.TableVersion | None = None) -> None:
         if tbl_version is None:
             tbl_version = self.tbl_version.get()
         super().create_sa_tbl(tbl_version)
```