pixeltable 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/__init__.py +64 -11
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +50 -27
- pixeltable/catalog/column.py +27 -11
- pixeltable/catalog/dir.py +6 -4
- pixeltable/catalog/globals.py +8 -1
- pixeltable/catalog/insertable_table.py +22 -12
- pixeltable/catalog/named_function.py +10 -6
- pixeltable/catalog/path.py +3 -2
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +121 -101
- pixeltable/catalog/table_version.py +291 -142
- pixeltable/catalog/table_version_path.py +8 -5
- pixeltable/catalog/view.py +67 -26
- pixeltable/dataframe.py +106 -81
- pixeltable/env.py +28 -24
- pixeltable/exec/__init__.py +2 -2
- pixeltable/exec/aggregation_node.py +10 -4
- pixeltable/exec/cache_prefetch_node.py +5 -3
- pixeltable/exec/component_iteration_node.py +9 -9
- pixeltable/exec/data_row_batch.py +21 -10
- pixeltable/exec/exec_context.py +10 -3
- pixeltable/exec/exec_node.py +23 -12
- pixeltable/exec/expr_eval/evaluators.py +13 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
- pixeltable/exec/expr_eval/globals.py +30 -7
- pixeltable/exec/expr_eval/row_buffer.py +5 -6
- pixeltable/exec/expr_eval/schedulers.py +151 -31
- pixeltable/exec/in_memory_data_node.py +8 -7
- pixeltable/exec/row_update_node.py +15 -5
- pixeltable/exec/sql_node.py +56 -27
- pixeltable/exprs/__init__.py +2 -2
- pixeltable/exprs/arithmetic_expr.py +57 -26
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +2 -1
- pixeltable/exprs/column_ref.py +20 -15
- pixeltable/exprs/comparison.py +6 -2
- pixeltable/exprs/compound_predicate.py +1 -3
- pixeltable/exprs/data_row.py +2 -2
- pixeltable/exprs/expr.py +108 -72
- pixeltable/exprs/expr_dict.py +2 -1
- pixeltable/exprs/expr_set.py +3 -1
- pixeltable/exprs/function_call.py +39 -41
- pixeltable/exprs/globals.py +1 -0
- pixeltable/exprs/in_predicate.py +2 -2
- pixeltable/exprs/inline_expr.py +20 -17
- pixeltable/exprs/json_mapper.py +4 -2
- pixeltable/exprs/json_path.py +12 -18
- pixeltable/exprs/literal.py +5 -9
- pixeltable/exprs/method_ref.py +1 -0
- pixeltable/exprs/object_ref.py +1 -1
- pixeltable/exprs/row_builder.py +32 -17
- pixeltable/exprs/rowid_ref.py +14 -5
- pixeltable/exprs/similarity_expr.py +11 -6
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/type_cast.py +24 -9
- pixeltable/ext/__init__.py +1 -0
- pixeltable/ext/functions/__init__.py +1 -0
- pixeltable/ext/functions/whisperx.py +2 -2
- pixeltable/ext/functions/yolox.py +11 -11
- pixeltable/func/aggregate_function.py +17 -13
- pixeltable/func/callable_function.py +6 -6
- pixeltable/func/expr_template_function.py +15 -14
- pixeltable/func/function.py +16 -16
- pixeltable/func/function_registry.py +11 -8
- pixeltable/func/globals.py +4 -2
- pixeltable/func/query_template_function.py +12 -13
- pixeltable/func/signature.py +18 -9
- pixeltable/func/tools.py +10 -17
- pixeltable/func/udf.py +106 -11
- pixeltable/functions/__init__.py +21 -2
- pixeltable/functions/anthropic.py +16 -12
- pixeltable/functions/fireworks.py +63 -5
- pixeltable/functions/gemini.py +13 -3
- pixeltable/functions/globals.py +18 -6
- pixeltable/functions/huggingface.py +20 -38
- pixeltable/functions/image.py +7 -3
- pixeltable/functions/json.py +1 -0
- pixeltable/functions/llama_cpp.py +1 -4
- pixeltable/functions/mistralai.py +31 -20
- pixeltable/functions/ollama.py +4 -18
- pixeltable/functions/openai.py +231 -113
- pixeltable/functions/replicate.py +11 -10
- pixeltable/functions/string.py +70 -7
- pixeltable/functions/timestamp.py +21 -8
- pixeltable/functions/together.py +66 -52
- pixeltable/functions/video.py +1 -0
- pixeltable/functions/vision.py +14 -11
- pixeltable/functions/whisper.py +2 -1
- pixeltable/globals.py +60 -26
- pixeltable/index/__init__.py +1 -1
- pixeltable/index/btree.py +5 -3
- pixeltable/index/embedding_index.py +15 -14
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +30 -25
- pixeltable/io/fiftyone.py +6 -14
- pixeltable/io/globals.py +33 -27
- pixeltable/io/hf_datasets.py +2 -1
- pixeltable/io/label_studio.py +77 -68
- pixeltable/io/pandas.py +36 -23
- pixeltable/io/parquet.py +9 -12
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +205 -0
- pixeltable/iterators/document.py +19 -8
- pixeltable/iterators/image.py +6 -24
- pixeltable/iterators/string.py +3 -6
- pixeltable/iterators/video.py +1 -7
- pixeltable/metadata/__init__.py +7 -1
- pixeltable/metadata/converters/convert_10.py +2 -2
- pixeltable/metadata/converters/convert_15.py +1 -5
- pixeltable/metadata/converters/convert_16.py +2 -4
- pixeltable/metadata/converters/convert_17.py +2 -4
- pixeltable/metadata/converters/convert_18.py +2 -4
- pixeltable/metadata/converters/convert_19.py +2 -5
- pixeltable/metadata/converters/convert_20.py +1 -4
- pixeltable/metadata/converters/convert_21.py +4 -6
- pixeltable/metadata/converters/convert_22.py +1 -0
- pixeltable/metadata/converters/convert_23.py +5 -5
- pixeltable/metadata/converters/convert_24.py +12 -13
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/util.py +3 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +13 -2
- pixeltable/plan.py +173 -98
- pixeltable/share/__init__.py +0 -0
- pixeltable/share/packager.py +218 -0
- pixeltable/store.py +42 -26
- pixeltable/type_system.py +102 -75
- pixeltable/utils/arrow.py +7 -8
- pixeltable/utils/coco.py +16 -17
- pixeltable/utils/code.py +1 -1
- pixeltable/utils/console_output.py +6 -3
- pixeltable/utils/description_helper.py +7 -7
- pixeltable/utils/documents.py +3 -1
- pixeltable/utils/filecache.py +12 -7
- pixeltable/utils/http_server.py +9 -8
- pixeltable/utils/iceberg.py +14 -0
- pixeltable/utils/media_store.py +3 -2
- pixeltable/utils/pytorch.py +11 -14
- pixeltable/utils/s3.py +1 -0
- pixeltable/utils/sql.py +1 -0
- pixeltable/utils/transactional_directory.py +2 -2
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/METADATA +9 -9
- pixeltable-0.3.4.dist-info/RECORD +166 -0
- pixeltable-0.3.2.dist-info/RECORD +0 -161
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/entry_points.txt +0 -0
pixeltable/share/packager.py
ADDED
@@ -0,0 +1,218 @@
+import io
+import json
+import logging
+import tarfile
+import urllib.parse
+import urllib.request
+import uuid
+from pathlib import Path
+from typing import Any, Iterator
+
+import more_itertools
+import numpy as np
+import pyarrow as pa
+import pyiceberg.catalog
+
+import pixeltable as pxt
+import pixeltable.type_system as ts
+from pixeltable import exprs
+from pixeltable.env import Env
+from pixeltable.utils.arrow import PXT_TO_PA_TYPES
+from pixeltable.utils.iceberg import sqlite_catalog
+
+_logger = logging.getLogger('pixeltable')
+
+
+class TablePackager:
+    """
+    Packages a pixeltable Table into a tarball containing Iceberg tables and media files. The structure of the tarball
+    is as follows:
+
+    warehouse/catalog.db  # sqlite Iceberg catalog
+    warehouse/pxt.db/**   # Iceberg metadata and data files (parquet/avro/json)
+    media/**              # Local media files
+
+    If the table being archived is a view, then the Iceberg catalog will contain separate tables for the view and each
+    of its ancestors. All rows will be exported with additional _rowid and _v_min columns. Currently, only the most
+    recent version of the table can be exported, and only the full table contents.
+
+    If the table contains media columns, they are handled as follows:
+    - If a media file has an external URL (any URL scheme other than file://), then the URL will be preserved as-is and
+      stored in the Iceberg table.
+    - If a media file is a local file, then it will be copied into the tarball as a file of the form
+      'media/{uuid}{extension}', and the Iceberg table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
+    """
+
+    table: pxt.Table  # The table to be packaged
+    tmp_dir: Path  # Temporary directory where the package will reside
+    iceberg_catalog: pyiceberg.catalog.Catalog
+    media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
+
+    def __init__(self, table: pxt.Table) -> None:
+        self.table = table
+        self.tmp_dir = Path(Env.get().create_tmp_path())
+        self.media_files = {}
+
+    def package(self) -> Path:
+        """
+        Export the table to a tarball containing Iceberg tables and media files.
+        """
+        assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
+        _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
+        self.tmp_dir.mkdir()
+        self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
+        ancestors = [self.table] + self.table._bases
+        for t in ancestors:
+            _logger.info(f"Exporting table '{t._path}'.")
+            self.__export_table(t)
+        _logger.info(f'Building archive.')
+        bundle_path = self.__build_tarball()
+        _logger.info(f'Packaging complete: {bundle_path}')
+        return bundle_path
+
+    def __export_table(self, t: pxt.Table) -> None:
+        """
+        Exports the data from `t` into an Iceberg table.
+        """
+        # First generate a select list for the data we want to extract from `t`. This includes:
+        # - all stored columns, including computed columns;
+        # - errortype and errormsg fields whenever they're defined.
+        # We select only those columns that are defined in this table (columns inherited from ancestor tables will be
+        # handled separately).
+        # For media columns, we substitute `col.fileurl` so that we always get the URL (which may be a file:// URL;
+        # these will be specially handled later)
+        select_exprs: dict[str, exprs.Expr] = {}
+
+        # As we generate the select list, we construct a separate list of column types. We can't rely on df._schema
+        # to get the column types, since we'll be substituting `fileurl`s for media columns.
+        actual_col_types: list[ts.ColumnType] = []
+
+        for col_name, col in t._tbl_version.cols_by_name.items():
+            if not col.is_stored:
+                continue
+            if col.col_type.is_media_type():
+                select_exprs[col_name] = t[col_name].fileurl
+            else:
+                select_exprs[col_name] = t[col_name]
+            actual_col_types.append(col.col_type)
+            if col.records_errors:
+                select_exprs[f'{col_name}_errortype'] = t[col_name].errortype
+                actual_col_types.append(ts.StringType())
+                select_exprs[f'{col_name}_errormsg'] = t[col_name].errormsg
+                actual_col_types.append(ts.StringType())
+
+        # Run the select() on `self.table`, not `t`, so that we export only those rows that are actually present in
+        # `self.table`.
+        df = self.table.select(**select_exprs)
+        namespace = self.__iceberg_namespace(t)
+        self.iceberg_catalog.create_namespace_if_not_exists(namespace)
+        iceberg_schema = self.__to_iceberg_schema(df._schema)
+        iceberg_tbl = self.iceberg_catalog.create_table(f'{namespace}.{t._name}', schema=iceberg_schema)
+
+        # Populate the Iceberg table with data.
+        # The data is first loaded from the DataFrame into a sequence of pyarrow tables, batched in order to avoid
+        # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Iceberg table on disk.
+        for pa_table in self.__to_pa_tables(df, actual_col_types, iceberg_schema):
+            iceberg_tbl.append(pa_table)
+
+    @classmethod
+    def __iceberg_namespace(cls, table: pxt.Table) -> str:
+        """
+        Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
+        """
+        parent_path = table._parent._path
+        if len(parent_path) == 0:
+            return 'pxt'
+        else:
+            return f'pxt.{parent_path}'
+
+    # The following methods are responsible for schema and data conversion from Pixeltable to Iceberg. Some of this
+    # logic might be consolidated into arrow.py and unified with general Parquet export, but there are several
+    # major differences:
+    # - Iceberg has no array type; we export all arrays as binary blobs
+    # - We include _rowid and _v_min columns in the Iceberg table
+    # - Media columns are handled specially as indicated above
+
+    @classmethod
+    def __to_iceberg_schema(cls, pxt_schema: dict[str, ts.ColumnType]) -> pa.Schema:
+        entries = [(name, cls.__to_iceberg_type(col_type)) for name, col_type in pxt_schema.items()]
+        entries.append(('_rowid', pa.list_(pa.int64())))
+        entries.append(('_v_min', pa.int64()))
+        return pa.schema(entries)  # type: ignore[arg-type]
+
+    @classmethod
+    def __to_iceberg_type(cls, col_type: ts.ColumnType) -> pa.DataType:
+        if col_type.is_array_type():
+            return pa.binary()
+        if col_type.is_media_type():
+            return pa.string()
+        return PXT_TO_PA_TYPES.get(col_type.__class__)
+
+    def __to_pa_tables(
+        self,
+        df: pxt.DataFrame,
+        actual_col_types: list[pxt.ColumnType],
+        arrow_schema: pa.Schema,
+        batch_size: int = 1_000,
+    ) -> Iterator[pa.Table]:
+        """
+        Load a DataFrame as a sequence of pyarrow tables. The pyarrow tables are batched into smaller chunks
+        to avoid excessive memory usage.
+        """
+        for rows in more_itertools.batched(self.__to_pa_rows(df, actual_col_types), batch_size):
+            cols = {col_name: [row[idx] for row in rows] for idx, col_name in enumerate(df._schema.keys())}
+            cols['_rowid'] = [row[-2] for row in rows]
+            cols['_v_min'] = [row[-1] for row in rows]
+            yield pa.Table.from_pydict(cols, schema=arrow_schema)
+
+    def __to_pa_rows(self, df: pxt.DataFrame, actual_col_types: list[pxt.ColumnType]) -> Iterator[list]:
+        for row in df._exec():
+            vals = [row[e.slot_idx] for e in df._select_list_exprs]
+            result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
+            result.append(row.rowid)
+            result.append(row.v_min)
+            yield result
+
+    def __to_pa_value(self, val: Any, col_type: ts.ColumnType) -> Any:
+        if val is None:
+            return None
+        if col_type.is_array_type():
+            # Export arrays as binary
+            assert isinstance(val, np.ndarray)
+            arr = io.BytesIO()
+            np.save(arr, val)
+            return arr.getvalue()
+        if col_type.is_json_type():
+            # Export JSON as strings
+            return json.dumps(val)
+        if col_type.is_media_type():
+            # Handle media files as described above
+            assert isinstance(val, str)  # Media columns are always referenced by `fileurl`
+            return self.__process_media_url(val)
+        return val
+
+    def __process_media_url(self, url: str) -> str:
+        parsed_url = urllib.parse.urlparse(url)
+        if parsed_url.scheme == 'file':
+            # It's the URL of a local file. Replace it with a pxtmedia:// URI.
+            # (We can't use an actual pxt:// URI, because the eventual pxt:// table name might not be known at this
+            # time. The pxtmedia:// URI serves as a relative reference into the tarball that can be replaced with an
+            # actual URL when the table is reconstituted.)
+            path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_url.path)))
+            if path not in self.media_files:
+                # Create a new entry in the `media_files` dict so that we can copy the file into the tarball later.
+                dest_name = f'{uuid.uuid4().hex}{path.suffix}'
+                self.media_files[path] = dest_name
+            return f'pxtmedia://{self.media_files[path]}'
+        # For any type of URL other than a local file, just return the URL as-is.
+        return url
+
+    def __build_tarball(self) -> Path:
+        bundle_path = self.tmp_dir / 'bundle.tar.bz2'
+        with tarfile.open(bundle_path, 'w:bz2') as tf:
+            # Add the Iceberg warehouse dir (including the catalog)
+            tf.add(self.tmp_dir / 'warehouse', arcname='warehouse', recursive=True)
+            # Add the media files
+            for src_file, dest_name in self.media_files.items():
+                tf.add(src_file, arcname=f'media/{dest_name}')
+        return bundle_path
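For orientation, a minimal usage sketch of the new packager follows, together with the decoding a consumer of the bundle would need in order to mirror __to_pa_value. The table name 'films' and the decode helpers are hypothetical stand-ins, and TablePackager is an internal class in this release rather than documented public API.

# Hypothetical usage sketch; 'films' is a placeholder table name.
import io
import json

import numpy as np

import pixeltable as pxt
from pixeltable.share.packager import TablePackager

tbl = pxt.get_table('films')                  # any existing table or view
bundle_path = TablePackager(tbl).package()    # path to bundle.tar.bz2 in a tmp dir

# Reading values back out of the exported Iceberg tables mirrors __to_pa_value:
def decode_array(blob: bytes) -> np.ndarray:
    # array columns were serialized with np.save into an in-memory buffer
    return np.load(io.BytesIO(blob))

def decode_json(s: str):
    # JSON columns were exported as strings via json.dumps
    return json.loads(s)

Media columns come back either as external URLs, preserved as-is, or as pxtmedia://{uuid}{extension} references that resolve to entries under media/ inside the tarball.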
pixeltable/store.py
CHANGED
@@ -32,6 +32,7 @@ class StoreBase:
     - v_min: version at which the row was created
     - v_max: version at which the row was deleted (or MAX_VERSION if it's still live)
     """
+
     tbl_version: catalog.TableVersion
     sa_md: sql.MetaData
     sa_tbl: Optional[sql.Table]
@@ -65,8 +66,9 @@
         """Create and return system columns"""
         rowid_cols = self._create_rowid_columns()
         self.v_min_col = sql.Column('v_min', sql.BigInteger, nullable=False)
-        self.v_max_col =
-
+        self.v_max_col = sql.Column(
+            'v_max', sql.BigInteger, nullable=False, server_default=str(schema.Table.MAX_VERSION)
+        )
         self._pk_cols = [*rowid_cols, self.v_min_col]
         return [*rowid_cols, self.v_min_col, self.v_max_col]
@@ -134,7 +136,7 @@
         return new_file_url

     def _move_tmp_media_files(
-
+        self, table_rows: list[dict[str, Any]], media_cols: list[catalog.Column], v_min: int
     ) -> None:
         """Move tmp media files that we generated to a permanent location"""
         for c in media_cols:
@@ -143,7 +145,7 @@
             table_row[c.store_name()] = self._move_tmp_media_file(file_url, c, v_min)

     def _create_table_row(
-
+        self, input_row: exprs.DataRow, row_builder: exprs.RowBuilder, exc_col_ids: set[int], pk: tuple[int, ...]
     ) -> tuple[dict[str, Any], int]:
         """Return Tuple[complete table row, # of exceptions] for insert()
         Creates a row that includes the PK columns, with the values from input_row.pk.
@@ -193,11 +195,13 @@
            added_storage_cols = [col.store_name()]
            if col.records_errors:
                # we also need to create the errormsg and errortype storage cols
-               stmt = sql.text(
-
+               stmt = sql.text(
+                   f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.errormsg_store_name()} VARCHAR DEFAULT NULL'
+               )
                conn.execute(stmt)
-               stmt = sql.text(
-
+               stmt = sql.text(
+                   f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.errortype_store_name()} VARCHAR DEFAULT NULL'
+               )
                conn.execute(stmt)
                added_storage_cols.extend([col.errormsg_store_name(), col.errortype_store_name()])
            self.create_sa_tbl()
@@ -219,7 +223,7 @@
        exec_plan: ExecNode,
        value_expr_slot_idx: int,
        conn: sql.engine.Connection,
-       on_error: Literal['abort', 'ignore']
+       on_error: Literal['abort', 'ignore'],
    ) -> int:
        """Update store column of a computed column with values produced by an execution plan

@@ -295,10 +299,9 @@
        update_stmt = update_stmt.where(pk_col == tmp_pk_col)
        update_stmt = update_stmt.values({col.sa_col: tmp_val_col})
        if col.records_errors:
-           update_stmt = update_stmt.values(
-               col.sa_errortype_col: tmp_errortype_col,
-
-           })
+           update_stmt = update_stmt.values(
+               {col.sa_errortype_col: tmp_errortype_col, col.sa_errormsg_col: tmp_errormsg_col}
+           )
        log_explain(_logger, update_stmt, conn)
        conn.execute(update_stmt)

@@ -308,8 +311,13 @@
        return num_excs

    def insert_rows(
-
-
+       self,
+       exec_plan: ExecNode,
+       conn: sql.engine.Connection,
+       v_min: Optional[int] = None,
+       show_progress: bool = True,
+       rowids: Optional[Iterator[int]] = None,
+       abort_on_exc: bool = False,
    ) -> tuple[int, int, set[int]]:
        """Insert rows into the store table and update the catalog table's md
        Returns:
@@ -347,12 +355,12 @@

        if show_progress:
            if progress_bar is None:
-               warnings.simplefilter(
+               warnings.simplefilter('ignore', category=TqdmWarning)
                progress_bar = tqdm(
                    desc=f'Inserting rows into `{self.tbl_version.name}`',
                    unit=' rows',
                    ncols=100,
-                   file=sys.stdout
+                   file=sys.stdout,
                )
            progress_bar.update(1)

@@ -379,8 +387,13 @@
        return sql.and_(clause, self.base._versions_clause(versions[1:], match_on_vmin))

    def delete_rows(
-
-
+       self,
+       current_version: int,
+       base_versions: list[Optional[int]],
+       match_on_vmin: bool,
+       where_clause: Optional[sql.ColumnElement[bool]],
+       conn: sql.engine.Connection,
+   ) -> int:
        """Mark rows as deleted that are live and were created prior to current_version.
        Also: populate the undo columns
        Args:
@@ -394,12 +407,12 @@
        """
        where_clause = sql.true() if where_clause is None else where_clause
        where_clause = sql.and_(
-           self.v_min_col < current_version,
-
-           where_clause)
+           self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION, where_clause
+       )
        rowid_join_clause = self._rowid_join_predicate()
-       base_versions_clause =
-       else self.base._versions_clause(base_versions, match_on_vmin)
+       base_versions_clause = (
+           sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)
+       )
        set_clause: dict[sql.Column, Union[int, sql.Column]] = {self.v_max_col: current_version}
        for index_info in self.tbl_version.idxs_by_name.values():
            # copy value column to undo column
@@ -450,7 +463,9 @@ class StoreView(StoreBase):
    def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
        return sql.and_(
            self.base._rowid_join_predicate(),
-           *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), self.base.rowid_columns())]
+           *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), self.base.rowid_columns())],
+       )
+

class StoreComponentView(StoreView):
    """A view that stores components of its base, as produced by a ComponentIterator
@@ -482,4 +497,5 @@ class StoreComponentView(StoreView):
    def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
        return sql.and_(
            self.base._rowid_join_predicate(),
-           *[c1 == c2 for c1, c2 in zip(self.rowid_columns()[:-1], self.base.rowid_columns())]
+           *[c1 == c2 for c1, c2 in zip(self.rowid_columns()[:-1], self.base.rowid_columns())],
+       )