pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +83 -19
- pixeltable/_query.py +1444 -0
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +7 -4
- pixeltable/catalog/catalog.py +2394 -119
- pixeltable/catalog/column.py +225 -104
- pixeltable/catalog/dir.py +38 -9
- pixeltable/catalog/globals.py +53 -34
- pixeltable/catalog/insertable_table.py +265 -115
- pixeltable/catalog/path.py +80 -17
- pixeltable/catalog/schema_object.py +28 -43
- pixeltable/catalog/table.py +1270 -677
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +1270 -751
- pixeltable/catalog/table_version_handle.py +109 -0
- pixeltable/catalog/table_version_path.py +137 -42
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +251 -134
- pixeltable/config.py +215 -0
- pixeltable/env.py +736 -285
- pixeltable/exceptions.py +26 -2
- pixeltable/exec/__init__.py +7 -2
- pixeltable/exec/aggregation_node.py +39 -21
- pixeltable/exec/cache_prefetch_node.py +87 -109
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +25 -28
- pixeltable/exec/data_row_batch.py +11 -46
- pixeltable/exec/exec_context.py +26 -11
- pixeltable/exec/exec_node.py +35 -27
- pixeltable/exec/expr_eval/__init__.py +3 -0
- pixeltable/exec/expr_eval/evaluators.py +365 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
- pixeltable/exec/expr_eval/globals.py +200 -0
- pixeltable/exec/expr_eval/row_buffer.py +74 -0
- pixeltable/exec/expr_eval/schedulers.py +413 -0
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +35 -27
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +44 -29
- pixeltable/exec/sql_node.py +414 -115
- pixeltable/exprs/__init__.py +8 -5
- pixeltable/exprs/arithmetic_expr.py +79 -45
- pixeltable/exprs/array_slice.py +5 -5
- pixeltable/exprs/column_property_ref.py +40 -26
- pixeltable/exprs/column_ref.py +254 -61
- pixeltable/exprs/comparison.py +14 -9
- pixeltable/exprs/compound_predicate.py +9 -10
- pixeltable/exprs/data_row.py +213 -72
- pixeltable/exprs/expr.py +270 -104
- pixeltable/exprs/expr_dict.py +6 -5
- pixeltable/exprs/expr_set.py +20 -11
- pixeltable/exprs/function_call.py +383 -284
- pixeltable/exprs/globals.py +18 -5
- pixeltable/exprs/in_predicate.py +7 -7
- pixeltable/exprs/inline_expr.py +37 -37
- pixeltable/exprs/is_null.py +8 -4
- pixeltable/exprs/json_mapper.py +120 -54
- pixeltable/exprs/json_path.py +90 -60
- pixeltable/exprs/literal.py +61 -16
- pixeltable/exprs/method_ref.py +7 -6
- pixeltable/exprs/object_ref.py +19 -8
- pixeltable/exprs/row_builder.py +238 -75
- pixeltable/exprs/rowid_ref.py +53 -15
- pixeltable/exprs/similarity_expr.py +65 -50
- pixeltable/exprs/sql_element_cache.py +5 -5
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/exprs/type_cast.py +25 -13
- pixeltable/exprs/variable.py +2 -2
- pixeltable/func/__init__.py +9 -5
- pixeltable/func/aggregate_function.py +197 -92
- pixeltable/func/callable_function.py +119 -35
- pixeltable/func/expr_template_function.py +101 -48
- pixeltable/func/function.py +375 -62
- pixeltable/func/function_registry.py +20 -19
- pixeltable/func/globals.py +6 -5
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +151 -35
- pixeltable/func/signature.py +178 -49
- pixeltable/func/tools.py +164 -0
- pixeltable/func/udf.py +176 -53
- pixeltable/functions/__init__.py +44 -4
- pixeltable/functions/anthropic.py +226 -47
- pixeltable/functions/audio.py +148 -11
- pixeltable/functions/bedrock.py +137 -0
- pixeltable/functions/date.py +188 -0
- pixeltable/functions/deepseek.py +113 -0
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +72 -20
- pixeltable/functions/gemini.py +249 -0
- pixeltable/functions/globals.py +208 -53
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1088 -95
- pixeltable/functions/image.py +155 -84
- pixeltable/functions/json.py +8 -11
- pixeltable/functions/llama_cpp.py +31 -19
- pixeltable/functions/math.py +169 -0
- pixeltable/functions/mistralai.py +50 -75
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +29 -36
- pixeltable/functions/openai.py +548 -160
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +15 -14
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +310 -85
- pixeltable/functions/timestamp.py +37 -19
- pixeltable/functions/together.py +77 -120
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +7 -2
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1528 -117
- pixeltable/functions/vision.py +26 -26
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +19 -10
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/functions/yolox.py +112 -0
- pixeltable/globals.py +716 -236
- pixeltable/index/__init__.py +3 -1
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +32 -22
- pixeltable/index/embedding_index.py +155 -92
- pixeltable/io/__init__.py +12 -7
- pixeltable/io/datarows.py +140 -0
- pixeltable/io/external_store.py +83 -125
- pixeltable/io/fiftyone.py +24 -33
- pixeltable/io/globals.py +47 -182
- pixeltable/io/hf_datasets.py +96 -127
- pixeltable/io/label_studio.py +171 -156
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +136 -115
- pixeltable/io/parquet.py +40 -153
- pixeltable/io/table_data_conduit.py +702 -0
- pixeltable/io/utils.py +100 -0
- pixeltable/iterators/__init__.py +8 -4
- pixeltable/iterators/audio.py +207 -0
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +144 -87
- pixeltable/iterators/image.py +17 -38
- pixeltable/iterators/string.py +15 -12
- pixeltable/iterators/video.py +523 -127
- pixeltable/metadata/__init__.py +33 -8
- pixeltable/metadata/converters/convert_10.py +2 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_15.py +15 -11
- pixeltable/metadata/converters/convert_16.py +4 -5
- pixeltable/metadata/converters/convert_17.py +4 -5
- pixeltable/metadata/converters/convert_18.py +4 -6
- pixeltable/metadata/converters/convert_19.py +6 -9
- pixeltable/metadata/converters/convert_20.py +3 -6
- pixeltable/metadata/converters/convert_21.py +6 -8
- pixeltable/metadata/converters/convert_22.py +3 -2
- pixeltable/metadata/converters/convert_23.py +33 -0
- pixeltable/metadata/converters/convert_24.py +55 -0
- pixeltable/metadata/converters/convert_25.py +19 -0
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/convert_27.py +29 -0
- pixeltable/metadata/converters/convert_28.py +13 -0
- pixeltable/metadata/converters/convert_29.py +110 -0
- pixeltable/metadata/converters/convert_30.py +63 -0
- pixeltable/metadata/converters/convert_31.py +11 -0
- pixeltable/metadata/converters/convert_32.py +15 -0
- pixeltable/metadata/converters/convert_33.py +17 -0
- pixeltable/metadata/converters/convert_34.py +21 -0
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +44 -18
- pixeltable/metadata/notes.py +21 -0
- pixeltable/metadata/schema.py +185 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +616 -225
- pixeltable/share/__init__.py +3 -0
- pixeltable/share/packager.py +797 -0
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +349 -0
- pixeltable/store.py +398 -232
- pixeltable/type_system.py +730 -267
- pixeltable/utils/__init__.py +40 -0
- pixeltable/utils/arrow.py +201 -29
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +26 -27
- pixeltable/utils/code.py +4 -4
- pixeltable/utils/console_output.py +46 -0
- pixeltable/utils/coroutine.py +24 -0
- pixeltable/utils/dbms.py +92 -0
- pixeltable/utils/description_helper.py +11 -12
- pixeltable/utils/documents.py +60 -61
- pixeltable/utils/exception_handler.py +36 -0
- pixeltable/utils/filecache.py +38 -22
- pixeltable/utils/formatter.py +88 -51
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +14 -13
- pixeltable/utils/iceberg.py +13 -0
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +20 -20
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +32 -5
- pixeltable/utils/system.py +30 -0
- pixeltable/utils/transactional_directory.py +4 -3
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -36
- pixeltable/catalog/path_dict.py +0 -141
- pixeltable/dataframe.py +0 -894
- pixeltable/exec/expr_eval_node.py +0 -232
- pixeltable/ext/__init__.py +0 -14
- pixeltable/ext/functions/__init__.py +0 -8
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/ext/functions/yolox.py +0 -157
- pixeltable/tool/create_test_db_dump.py +0 -311
- pixeltable/tool/create_test_video.py +0 -81
- pixeltable/tool/doc_plugins/griffe.py +0 -50
- pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
- pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
- pixeltable/tool/embed_udf.py +0 -9
- pixeltable/tool/mypy_plugin.py +0 -55
- pixeltable/utils/media_store.py +0 -76
- pixeltable/utils/s3.py +0 -16
- pixeltable-0.2.26.dist-info/METADATA +0 -400
- pixeltable-0.2.26.dist-info/RECORD +0 -156
- pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,797 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import io
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import tarfile
|
|
6
|
+
import urllib.parse
|
|
7
|
+
import urllib.request
|
|
8
|
+
import uuid
|
|
9
|
+
from datetime import timedelta
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any, Iterator
|
|
12
|
+
from uuid import UUID
|
|
13
|
+
|
|
14
|
+
import more_itertools
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pgvector.sqlalchemy as sql_vector # type: ignore[import-untyped]
|
|
17
|
+
import PIL.Image
|
|
18
|
+
import pyarrow as pa
|
|
19
|
+
import pyarrow.parquet as pq
|
|
20
|
+
import sqlalchemy as sql
|
|
21
|
+
|
|
22
|
+
import pixeltable as pxt
|
|
23
|
+
import pixeltable.utils.av as av_utils
|
|
24
|
+
from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
|
|
25
|
+
from pixeltable.catalog.table_version import TableVersionKey, TableVersionMd
|
|
26
|
+
from pixeltable.env import Env
|
|
27
|
+
from pixeltable.exprs.data_row import CellMd
|
|
28
|
+
from pixeltable.metadata import schema
|
|
29
|
+
from pixeltable.utils import sha256sum
|
|
30
|
+
from pixeltable.utils.formatter import Formatter
|
|
31
|
+
from pixeltable.utils.local_store import TempStore
|
|
32
|
+
from pixeltable.utils.object_stores import ObjectOps
|
|
33
|
+
|
|
34
|
+
# Module-level logger; obtained under the logger name 'pixeltable'.
_logger = logging.getLogger('pixeltable')
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class TablePackager:
|
|
38
|
+
"""
|
|
39
|
+
Packages a pixeltable Table into a tarball containing Parquet tables and media files. The structure of the tarball
|
|
40
|
+
is as follows:
|
|
41
|
+
|
|
42
|
+
metadata.json # Pixeltable metadata for the packaged table and its ancestors
|
|
43
|
+
tables/** # Parquet tables for the packaged table and its ancestors, each table in a directory 'tbl_{tbl_id.hex}'
|
|
44
|
+
media/** # Local media files
|
|
45
|
+
|
|
46
|
+
If the table contains media columns, they are handled as follows:
|
|
47
|
+
- If a media file has an external URL (any URL scheme other than file://), then the URL will be preserved as-is and
|
|
48
|
+
stored in the Parquet table.
|
|
49
|
+
- If a media file is a local file, then it will be copied into the tarball as a file of the form
|
|
50
|
+
'media/{uuid}{extension}', and the Parquet table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
table: catalog.Table # The table to be packaged
|
|
54
|
+
tmp_dir: Path # Temporary directory where the package will reside
|
|
55
|
+
tables_dir: Path # Directory where the Parquet tables will be written
|
|
56
|
+
media_files: dict[Path, str] # Mapping from local media file paths to their tarball names
|
|
57
|
+
bundle_md: dict[str, Any]
|
|
58
|
+
|
|
59
|
+
bundle_path: Path
|
|
60
|
+
preview_header: dict[str, str]
|
|
61
|
+
preview: list[list[Any]]
|
|
62
|
+
|
|
63
|
+
def __init__(self, table: catalog.Table, additional_md: dict[str, Any] | None = None) -> None:
|
|
64
|
+
self.table = table
|
|
65
|
+
self.tmp_dir = TempStore.create_path()
|
|
66
|
+
self.media_files = {}
|
|
67
|
+
|
|
68
|
+
# Load metadata and convert to JSON immediately
|
|
69
|
+
with catalog.Catalog.get().begin_xact(for_write=False):
|
|
70
|
+
tbl_md = catalog.Catalog.get().load_replica_md(table)
|
|
71
|
+
self.bundle_md = {
|
|
72
|
+
'pxt_version': pxt.__version__,
|
|
73
|
+
'pxt_md_version': metadata.VERSION,
|
|
74
|
+
'md': [md.as_dict() for md in tbl_md],
|
|
75
|
+
}
|
|
76
|
+
if additional_md is not None:
|
|
77
|
+
self.bundle_md.update(additional_md)
|
|
78
|
+
|
|
79
|
+
def package(self) -> Path:
|
|
80
|
+
"""
|
|
81
|
+
Export the table to a tarball containing Parquet tables and media files.
|
|
82
|
+
"""
|
|
83
|
+
assert not self.tmp_dir.exists() # Packaging can only be done once per TablePackager instance
|
|
84
|
+
|
|
85
|
+
_logger.info(f'Packaging table {self.table._path()!r} and its ancestors in: {self.tmp_dir}')
|
|
86
|
+
self.tmp_dir.mkdir()
|
|
87
|
+
with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
|
|
88
|
+
json.dump(self.bundle_md, fp)
|
|
89
|
+
self.tables_dir = self.tmp_dir / 'tables'
|
|
90
|
+
self.tables_dir.mkdir()
|
|
91
|
+
with catalog.Catalog.get().begin_xact(for_write=False):
|
|
92
|
+
for tv in self.table._tbl_version_path.get_tbl_versions():
|
|
93
|
+
_logger.info(f'Exporting table {tv.get().versioned_name!r}.')
|
|
94
|
+
self.__export_table(tv.get())
|
|
95
|
+
|
|
96
|
+
_logger.info('Building archive.')
|
|
97
|
+
self.bundle_path = self.__build_tarball()
|
|
98
|
+
|
|
99
|
+
_logger.info('Extracting preview data.')
|
|
100
|
+
self.bundle_md['row_count'] = self.table.count()
|
|
101
|
+
preview_header, preview = self.__extract_preview_data()
|
|
102
|
+
self.bundle_md['preview_header'] = preview_header
|
|
103
|
+
self.bundle_md['preview_data'] = preview
|
|
104
|
+
|
|
105
|
+
_logger.info(f'Packaging complete: {self.bundle_path}')
|
|
106
|
+
return self.bundle_path
|
|
107
|
+
|
|
108
|
+
def __export_table(self, tv: catalog.TableVersion) -> None:
|
|
109
|
+
"""
|
|
110
|
+
Exports the data from `t` into a Parquet table.
|
|
111
|
+
"""
|
|
112
|
+
# `tv` must be an ancestor of the primary table
|
|
113
|
+
assert any(tv.id == base.id for base in self.table._tbl_version_path.get_tbl_versions())
|
|
114
|
+
sql_types = {col.name: col.type for col in tv.store_tbl.sa_tbl.columns}
|
|
115
|
+
media_cols: set[str] = set()
|
|
116
|
+
cellmd_cols: set[str] = set()
|
|
117
|
+
for col in tv.cols:
|
|
118
|
+
if col.is_stored and col.col_type.is_media_type():
|
|
119
|
+
media_cols.add(col.store_name())
|
|
120
|
+
if col.stores_cellmd:
|
|
121
|
+
cellmd_cols.add(col.cellmd_store_name())
|
|
122
|
+
|
|
123
|
+
parquet_schema = self.__to_parquet_schema(tv.store_tbl.sa_tbl)
|
|
124
|
+
# TODO: Partition larger tables into multiple parquet files. (The parquet file naming scheme anticipates
|
|
125
|
+
# future support for this.)
|
|
126
|
+
parquet_dir = self.tables_dir / f'tbl_{tv.id.hex}'
|
|
127
|
+
parquet_dir.mkdir()
|
|
128
|
+
parquet_file = parquet_dir / f'tbl_{tv.id.hex}.00000.parquet'
|
|
129
|
+
_logger.info(f'Creating parquet table: {parquet_file}')
|
|
130
|
+
|
|
131
|
+
# Populate the Parquet table with data.
|
|
132
|
+
# The data is first loaded from the Query into a sequence of pyarrow tables, batched in order to avoid
|
|
133
|
+
# excessive memory usage. The pyarrow tables are then amalgamated into the (single) Parquet table on disk.
|
|
134
|
+
# We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
|
|
135
|
+
# faster compression should provide good performance while still reducing temporary storage utilization.
|
|
136
|
+
parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='snappy')
|
|
137
|
+
filter_tv = self.table._tbl_version_path.tbl_version.get()
|
|
138
|
+
row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
|
|
139
|
+
for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, cellmd_cols, parquet_schema):
|
|
140
|
+
parquet_writer.write_table(pa_table)
|
|
141
|
+
parquet_writer.close()
|
|
142
|
+
|
|
143
|
+
# The following methods are responsible for schema and data conversion from Pixeltable to Parquet.
|
|
144
|
+
|
|
145
|
+
@classmethod
|
|
146
|
+
def __to_parquet_schema(cls, store_tbl: sql.Table) -> pa.Schema:
|
|
147
|
+
entries = [(col_name, cls.__to_parquet_type(col.type)) for col_name, col in store_tbl.columns.items()]
|
|
148
|
+
return pa.schema(entries)
|
|
149
|
+
|
|
150
|
+
@classmethod
|
|
151
|
+
def __to_parquet_type(cls, col_type: sql.types.TypeEngine[Any]) -> pa.DataType:
|
|
152
|
+
if isinstance(col_type, sql.String):
|
|
153
|
+
return pa.string()
|
|
154
|
+
if isinstance(col_type, sql.Boolean):
|
|
155
|
+
return pa.bool_()
|
|
156
|
+
if isinstance(col_type, sql.BigInteger):
|
|
157
|
+
return pa.int64()
|
|
158
|
+
if isinstance(col_type, sql.Float):
|
|
159
|
+
return pa.float32()
|
|
160
|
+
if isinstance(col_type, sql.TIMESTAMP):
|
|
161
|
+
return pa.timestamp('us', tz='UTC')
|
|
162
|
+
if isinstance(col_type, sql.Date):
|
|
163
|
+
return pa.date32()
|
|
164
|
+
if isinstance(col_type, sql.JSON):
|
|
165
|
+
return pa.string() # JSON will be exported as strings
|
|
166
|
+
if isinstance(col_type, sql.LargeBinary):
|
|
167
|
+
return pa.binary()
|
|
168
|
+
if isinstance(col_type, sql.UUID):
|
|
169
|
+
return pa.uuid()
|
|
170
|
+
if isinstance(col_type, sql_vector.Vector):
|
|
171
|
+
# Parquet/pyarrow do not handle null values properly for fixed_shape_tensor(), so we have to use list_()
|
|
172
|
+
# here instead.
|
|
173
|
+
return pa.list_(pa.float32())
|
|
174
|
+
raise AssertionError(f'Unrecognized SQL type: {col_type} (type {type(col_type)})')
|
|
175
|
+
|
|
176
|
+
def __to_pa_tables(
|
|
177
|
+
self,
|
|
178
|
+
row_iter: Iterator[dict[str, Any]],
|
|
179
|
+
sql_types: dict[str, sql.types.TypeEngine[Any]],
|
|
180
|
+
media_cols: set[str],
|
|
181
|
+
cellmd_cols: set[str],
|
|
182
|
+
arrow_schema: pa.Schema,
|
|
183
|
+
batch_size: int = 1_000,
|
|
184
|
+
) -> Iterator[pa.Table]:
|
|
185
|
+
"""
|
|
186
|
+
Group rows into a sequence of pyarrow tables, batched into smaller chunks to minimize memory utilization.
|
|
187
|
+
The row dictionaries have the format {store_col_name: value}, where the values reflect the unprocessed contents
|
|
188
|
+
of the store database (as returned by `StoreTable.dump_rows()`).
|
|
189
|
+
"""
|
|
190
|
+
for rows in more_itertools.batched(row_iter, batch_size):
|
|
191
|
+
cols = {}
|
|
192
|
+
for name, sql_type in sql_types.items():
|
|
193
|
+
values = [
|
|
194
|
+
self.__to_pa_value(row.get(name), sql_type, name in media_cols, name in cellmd_cols) for row in rows
|
|
195
|
+
]
|
|
196
|
+
cols[name] = values
|
|
197
|
+
yield pa.Table.from_pydict(cols, schema=arrow_schema)
|
|
198
|
+
|
|
199
|
+
def __to_pa_value(
|
|
200
|
+
self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool, is_cellmd_col: bool
|
|
201
|
+
) -> Any:
|
|
202
|
+
if val is None:
|
|
203
|
+
return None
|
|
204
|
+
if is_cellmd_col:
|
|
205
|
+
assert isinstance(val, dict)
|
|
206
|
+
# Export JSON as strings
|
|
207
|
+
return json.dumps(self.__process_cellmd(val))
|
|
208
|
+
if isinstance(sql_type, sql.JSON):
|
|
209
|
+
# Export JSON as strings
|
|
210
|
+
return json.dumps(val)
|
|
211
|
+
if isinstance(sql_type, sql.UUID):
|
|
212
|
+
# PyArrow's pa.uuid() expects bytes
|
|
213
|
+
assert isinstance(val, uuid.UUID)
|
|
214
|
+
return val.bytes
|
|
215
|
+
if is_media_col:
|
|
216
|
+
# Handle media files as described above
|
|
217
|
+
assert isinstance(val, str)
|
|
218
|
+
return self.__process_media_url(val)
|
|
219
|
+
return val
|
|
220
|
+
|
|
221
|
+
def __process_media_url(self, url: str) -> str:
|
|
222
|
+
"""
|
|
223
|
+
Process a media URL for export. If it's a local file URL (file://), then replace it with a pxtmedia:// URI,
|
|
224
|
+
copying the file into the tarball if necessary. If it's any other type of URL, return it unchanged.
|
|
225
|
+
"""
|
|
226
|
+
parsed_url = urllib.parse.urlparse(url)
|
|
227
|
+
if parsed_url.scheme == 'file':
|
|
228
|
+
# It's the URL of a local file. Replace it with a pxtmedia:// URI.
|
|
229
|
+
# (We can't use an actual pxt:// URI, because the eventual pxt:// table name might not be known at this
|
|
230
|
+
# time. The pxtmedia:// URI serves as a relative reference into the tarball that can be replaced with an
|
|
231
|
+
# actual URL when the table is reconstituted.)
|
|
232
|
+
path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_url.path)))
|
|
233
|
+
if path not in self.media_files:
|
|
234
|
+
# Create a new entry in the `media_files` dict so that we can copy the file into the tarball later.
|
|
235
|
+
# We name the media files in the archive by their SHA256 hash. This ensures that we can properly
|
|
236
|
+
# deduplicate and validate them later.
|
|
237
|
+
# If we get a collision, it's not a problem; it just means we have two identical files (which will
|
|
238
|
+
# be conveniently deduplicated in the bundle).
|
|
239
|
+
sha = sha256sum(path)
|
|
240
|
+
dest_name = f'{sha}{path.suffix}'
|
|
241
|
+
self.media_files[path] = dest_name
|
|
242
|
+
return f'pxtmedia://{self.media_files[path]}'
|
|
243
|
+
# For any type of URL other than a local file, just return the URL as-is.
|
|
244
|
+
return url
|
|
245
|
+
|
|
246
|
+
def __process_cellmd(self, cellmd: dict[str, Any]) -> dict[str, Any]:
|
|
247
|
+
"""
|
|
248
|
+
Process a cellmd dictionary for export. This involves replacing any local file references
|
|
249
|
+
with pxtmedia:// URIs, as described above.
|
|
250
|
+
"""
|
|
251
|
+
cellmd_ = CellMd.from_dict(cellmd)
|
|
252
|
+
if cellmd_.file_urls is None:
|
|
253
|
+
return cellmd # No changes
|
|
254
|
+
|
|
255
|
+
updated_urls: list[str] = []
|
|
256
|
+
for url in cellmd_.file_urls:
|
|
257
|
+
updated_urls.append(self.__process_media_url(url))
|
|
258
|
+
cellmd_.file_urls = updated_urls
|
|
259
|
+
return cellmd_.as_dict()
|
|
260
|
+
|
|
261
|
+
def __build_tarball(self) -> Path:
|
|
262
|
+
bundle_path = self.tmp_dir / 'bundle.tar.bz2'
|
|
263
|
+
with tarfile.open(bundle_path, 'w:bz2') as tf:
|
|
264
|
+
# Add metadata json
|
|
265
|
+
tf.add(self.tmp_dir / 'metadata.json', arcname='metadata.json')
|
|
266
|
+
# Add the dir containing Parquet tables
|
|
267
|
+
tf.add(self.tables_dir, arcname='tables')
|
|
268
|
+
# Add the media files
|
|
269
|
+
for src_file, dest_name in self.media_files.items():
|
|
270
|
+
tf.add(src_file, arcname=f'media/{dest_name}')
|
|
271
|
+
return bundle_path
|
|
272
|
+
|
|
273
|
+
def __extract_preview_data(self) -> tuple[dict[str, str], list[list[Any]]]:
|
|
274
|
+
"""
|
|
275
|
+
Extract a preview of the table data for display in the UI.
|
|
276
|
+
|
|
277
|
+
In order to bound the size of the output data, all "unbounded" data types are resized:
|
|
278
|
+
- Strings are abbreviated as per Formatter.abbreviate()
|
|
279
|
+
- Arrays and JSON are shortened and formatted as strings
|
|
280
|
+
- Images are resized to thumbnail size as a base64-encoded webp
|
|
281
|
+
- Videos are replaced by their first frame and resized as above
|
|
282
|
+
- Documents are replaced by a thumbnail as a base64-encoded webp
|
|
283
|
+
"""
|
|
284
|
+
preview_cols = self.table._get_schema()
|
|
285
|
+
select_list = [self.table[col_name] for col_name in preview_cols]
|
|
286
|
+
# First 5 rows
|
|
287
|
+
rows = list(self.table.select(*select_list).head(n=5))
|
|
288
|
+
|
|
289
|
+
preview_header = {col_name: str(col_type._type) for col_name, col_type in preview_cols.items()}
|
|
290
|
+
preview = [
|
|
291
|
+
[self.__encode_preview_data(val, col_type)]
|
|
292
|
+
for row in rows
|
|
293
|
+
for val, col_type in zip(row.values(), preview_cols.values(), strict=True)
|
|
294
|
+
]
|
|
295
|
+
|
|
296
|
+
return preview_header, preview
|
|
297
|
+
|
|
298
|
+
def __encode_preview_data(self, val: Any, col_type: ts.ColumnType) -> Any:
|
|
299
|
+
if val is None:
|
|
300
|
+
return None
|
|
301
|
+
|
|
302
|
+
match col_type._type:
|
|
303
|
+
case ts.ColumnType.Type.STRING:
|
|
304
|
+
assert isinstance(val, str)
|
|
305
|
+
return Formatter.abbreviate(val)
|
|
306
|
+
|
|
307
|
+
case ts.ColumnType.Type.INT | ts.ColumnType.Type.FLOAT | ts.ColumnType.Type.BOOL:
|
|
308
|
+
return val
|
|
309
|
+
|
|
310
|
+
case ts.ColumnType.Type.TIMESTAMP | ts.ColumnType.Type.DATE:
|
|
311
|
+
return str(val)
|
|
312
|
+
|
|
313
|
+
case ts.ColumnType.Type.UUID:
|
|
314
|
+
assert isinstance(val, uuid.UUID)
|
|
315
|
+
return str(val)
|
|
316
|
+
|
|
317
|
+
case ts.ColumnType.Type.BINARY:
|
|
318
|
+
assert isinstance(val, bytes)
|
|
319
|
+
return Formatter.format_binary(val)
|
|
320
|
+
|
|
321
|
+
case ts.ColumnType.Type.ARRAY:
|
|
322
|
+
assert isinstance(val, np.ndarray)
|
|
323
|
+
return Formatter.format_array(val)
|
|
324
|
+
|
|
325
|
+
case ts.ColumnType.Type.JSON:
|
|
326
|
+
# We need to escape the JSON string server-side for security reasons.
|
|
327
|
+
# Therefore we don't escape it here, in order to avoid double-escaping.
|
|
328
|
+
return Formatter.format_json(val, escape_strings=False)
|
|
329
|
+
|
|
330
|
+
case ts.ColumnType.Type.IMAGE:
|
|
331
|
+
# Rescale the image to minimize data transfer size
|
|
332
|
+
assert isinstance(val, PIL.Image.Image)
|
|
333
|
+
return self.__encode_image(val)
|
|
334
|
+
|
|
335
|
+
case ts.ColumnType.Type.AUDIO:
|
|
336
|
+
assert isinstance(val, str)
|
|
337
|
+
return self.__encode_audio(val)
|
|
338
|
+
|
|
339
|
+
case ts.ColumnType.Type.VIDEO:
|
|
340
|
+
assert isinstance(val, str)
|
|
341
|
+
return self.__encode_video(val)
|
|
342
|
+
|
|
343
|
+
case ts.ColumnType.Type.DOCUMENT:
|
|
344
|
+
assert isinstance(val, str)
|
|
345
|
+
return self.__encode_document(val)
|
|
346
|
+
|
|
347
|
+
case _:
|
|
348
|
+
raise AssertionError(f'Unrecognized column type: {col_type._type}')
|
|
349
|
+
|
|
350
|
+
def __encode_image(self, img: PIL.Image.Image) -> str:
|
|
351
|
+
# Heuristic for thumbnail sizing:
|
|
352
|
+
# Standardize on a width of 240 pixels (to most efficiently utilize the columnar display).
|
|
353
|
+
# But, if the aspect ratio is below 2:3, bound the height at 360 pixels (to avoid unboundedly tall thumbnails
|
|
354
|
+
# in the case of highly oblong images).
|
|
355
|
+
if img.height > img.width * 1.5:
|
|
356
|
+
scaled_img = img.resize((img.width * 360 // img.height, 360))
|
|
357
|
+
else:
|
|
358
|
+
scaled_img = img.resize((240, img.height * 240 // img.width))
|
|
359
|
+
with io.BytesIO() as buffer:
|
|
360
|
+
scaled_img.save(buffer, 'webp')
|
|
361
|
+
return base64.b64encode(buffer.getvalue()).decode()
|
|
362
|
+
|
|
363
|
+
def __encode_audio(self, audio_path: str) -> str | None:
|
|
364
|
+
try:
|
|
365
|
+
audio_md = av_utils.get_metadata(audio_path)
|
|
366
|
+
if 'streams' in audio_md:
|
|
367
|
+
duration = audio_md['streams'][0]['duration_seconds']
|
|
368
|
+
assert isinstance(duration, float)
|
|
369
|
+
return f'{timedelta(seconds=round(duration))} audio clip'
|
|
370
|
+
return None
|
|
371
|
+
except Exception:
|
|
372
|
+
_logger.info(f'Could not extract audio metadata from file for data preview: {audio_path}', exc_info=True)
|
|
373
|
+
return None
|
|
374
|
+
|
|
375
|
+
def __encode_video(self, video_path: str) -> str | None:
|
|
376
|
+
thumb = Formatter.extract_first_video_frame(video_path)
|
|
377
|
+
return self.__encode_image(thumb) if thumb is not None else None
|
|
378
|
+
|
|
379
|
+
def __encode_document(self, doc_path: str) -> str | None:
|
|
380
|
+
thumb = Formatter.make_document_thumbnail(doc_path)
|
|
381
|
+
return self.__encode_image(thumb) if thumb is not None else None
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
class TableRestorer:
|
|
385
|
+
"""
|
|
386
|
+
Creates a replica table from a tarball containing Parquet tables and media files. See the `TablePackager` docs for
|
|
387
|
+
details on the tarball structure.
|
|
388
|
+
|
|
389
|
+
Args:
|
|
390
|
+
tbl_path: Pixeltable path (such as 'my_dir.my_table') where the materialized table will be made visible.
|
|
391
|
+
bundle_md: Optional metadata dictionary.
|
|
392
|
+
If not provided, metadata will be read from the tarball's `metadata.json`.
|
|
393
|
+
The metadata contains table_md, table_version_md, and table_schema_version_md entries for each ancestor
|
|
394
|
+
of the table being restored, as written out by `TablePackager`.
|
|
395
|
+
"""
|
|
396
|
+
|
|
397
|
+
tbl_path: str
|
|
398
|
+
bundle_md: dict[str, Any] | None
|
|
399
|
+
tmp_dir: Path
|
|
400
|
+
media_files: dict[str, str] # Mapping from pxtmedia:// URLs to local file:// URLs
|
|
401
|
+
|
|
402
|
+
def __init__(self, tbl_path: str, bundle_md: dict[str, Any] | None = None) -> None:
|
|
403
|
+
self.tbl_path = tbl_path
|
|
404
|
+
self.bundle_md = bundle_md
|
|
405
|
+
self.tmp_dir = TempStore.create_path()
|
|
406
|
+
self.media_files = {}
|
|
407
|
+
|
|
408
|
+
def restore(self, bundle_path: Path, pxt_uri: str | None = None, explicit_version: int | None = None) -> pxt.Table:
    """
    Extract the bundle at `bundle_path` and materialize it as a replica table at `self.tbl_path`.

    Args:
        bundle_path: Location of the bz2-compressed tarball to restore from.
        pxt_uri: If given, stored under the `pxt_uri` key of the restored table's additional metadata.
        explicit_version: Forwarded as the `version` argument when the restored table is looked up.

    Returns:
        The table obtained from the catalog for the first (top-level) entry of the bundle metadata.

    Raises:
        excs.Error: If the bundle's `pxt_md_version` differs from `metadata.VERSION`.
        tarfile.TarError: If the archive cannot be read, or (when extraction filters are available) if it
            contains a member that the 'data' filter refuses to extract.
    """
    # Extract tarball
    print(f'Extracting table data into: {self.tmp_dir}')
    with tarfile.open(bundle_path, 'r:bz2') as tf:
        # A bundle may have been obtained from a remote location, so its member names cannot be trusted.
        # The 'data' extraction filter refuses absolute paths, '..' traversal and links that point outside the
        # destination directory. Interpreters that predate extraction filters keep the legacy behavior.
        if hasattr(tarfile, 'data_filter'):
            tf.extractall(path=self.tmp_dir, filter='data')
        else:
            tf.extractall(path=self.tmp_dir)

    if self.bundle_md is None:
        # No metadata supplied; read it from the archive
        with open(self.tmp_dir / 'metadata.json', 'r', encoding='utf8') as fp:
            self.bundle_md = json.load(fp)

    pxt_md_version = self.bundle_md['pxt_md_version']
    assert isinstance(pxt_md_version, int)

    if pxt_md_version != metadata.VERSION:
        raise excs.Error(
            f'Pixeltable metadata version mismatch: {pxt_md_version} != {metadata.VERSION}.\n'
            'Please upgrade Pixeltable to use this dataset: pip install -U pixeltable'
        )
    # Convert tables metadata from dict to list of TableVersionMd
    tbl_md = [schema.md_from_dict(TableVersionMd, t) for t in self.bundle_md['md']]

    # Everything restored from a bundle is flagged as a replica.
    for md in tbl_md:
        md.tbl_md.is_replica = True

    assert not tbl_md[0].version_md.is_fragment  # Top-level table cannot be a version fragment

    cat = catalog.Catalog.get()

    with cat.begin_xact(for_write=True):
        # Create (or update) the replica table and its ancestors, along with TableVersion instances for any
        # versions that have not been seen before.
        cat.create_replica(catalog.Path.parse(self.tbl_path), tbl_md)

        _logger.debug(f'Now will import data for {len(tbl_md)} table(s):')
        _logger.debug(repr([md.tbl_md.tbl_id for md in tbl_md[::-1]]))

        # Now we need to load data for replica_tbl and its ancestors, except that we skip
        # replica_tbl itself if it's a pure snapshot.
        for md in tbl_md[::-1]:  # Base table first
            if not md.is_pure_snapshot:
                tv = cat.get_tbl_version(TableVersionKey(UUID(md.tbl_md.tbl_id), md.version_md.version, None))
                # Import data from Parquet.
                _logger.info(f'Importing table {tv.name!r}.')
                self.__import_table(self.tmp_dir, tv, md)

        tbl = cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id), version=explicit_version)
        if pxt_uri is not None:
            # Set pxt_uri for the newly created table
            cat.update_additional_md(tbl._id, {'pxt_uri': pxt_uri})
            tbl._tbl_version_path.clear_cached_md()  # TODO: Clear cached md for ancestors too?
        return tbl
|
|
460
|
+
|
|
461
|
+
def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: TableVersionMd) -> None:
    """
    Import the Parquet data for one table of the bundle into that table's store table.

    The Parquet files are read from `bundle_path / 'tables' / f'tbl_{tbl_id.hex}'`, loaded into a temporary SQL
    table, reconciled with any rows already present in the store table, and then merged in. Afterwards the
    val/undo columns of embedding indices are adjusted to match the most recent replicated version.

    Args:
        bundle_path: Directory that contains the extracted bundle (the `tables` subdirectory is read from it).
        tv: Table version whose store table receives the data.
        tbl_md: Metadata of the table version being imported; only its table id is read here.

    Raises:
        excs.Error: If a row with a matching primary key already exists in the store table but differs in at
            least one non-system, non-media value column.
    """
    tbl_id = UUID(tbl_md.tbl_md.tbl_id)
    parquet_dir = bundle_path / 'tables' / f'tbl_{tbl_id.hex}'
    parquet_table = pq.read_table(str(parquet_dir))
    replica_version = tv.version

    conn = Env.get().conn
    store_sa_tbl = tv.store_tbl.sa_tbl
    store_sa_tbl_name = tv.store_tbl._storage_name()

    # Sometimes we are importing a table that has never been seen before. Other times, however, we are importing
    # an existing replica table, and the table version and/or row selection differs from what was imported
    # previously. Care must be taken to ensure that the new data is merged with existing data in a way that
    # yields an internally consistent version history for each row.

    # The overall strategy is this:
    # 1. Import the parquet data into a temporary table;
    # 2. "rectify" the v_max values in both the temporary table and the existing table (more on this below);
    # 3. Delete any row instances from the temporary table that are already present in the existing table;
    # 4. Copy the remaining rows from the temporary table into the existing table.
    # 5. Rectify any index columns.

    # STEP 1: Import the parquet data into a temporary table.

    # Create a temporary table for the initial data load, containing columns for all columns present in the
    # parquet table. The parquet columns have identical names to those in the store table, so we can use the
    # store table schema to get their SQL types (which are not necessarily derivable from their Parquet types,
    # e.g., pa.string() may hold either VARCHAR or serialized JSONB).
    temp_cols: dict[str, sql.Column] = {}
    for field in parquet_table.schema:
        assert field.name in store_sa_tbl.columns, f'{field.name} not in {list(store_sa_tbl.columns)}'
        col_type = store_sa_tbl.columns[field.name].type
        temp_cols[field.name] = sql.Column(field.name, col_type)
    temp_sa_tbl_name = f'temp_{uuid.uuid4().hex}'
    _logger.debug(f'Creating temporary table: {temp_sa_tbl_name}')
    temp_md = sql.MetaData()
    # The table is created with the TEMPORARY prefix and is not dropped explicitly anywhere in this method.
    temp_sa_tbl = sql.Table(temp_sa_tbl_name, temp_md, *temp_cols.values(), prefixes=['TEMPORARY'])
    temp_sa_tbl.create(conn)

    # Populate the temporary table with data from the Parquet file.
    _logger.debug(f'Loading {parquet_table.num_rows} row(s) into temporary table: {temp_sa_tbl_name}')
    for batch in parquet_table.to_batches(max_chunksize=10_000):
        pydict = batch.to_pydict()
        rows = self.__from_pa_pydict(tv, pydict)
        conn.execute(sql.insert(temp_sa_tbl), rows)

    # STEP 2: Rectify v_max values.

    # Each row version is identified uniquely by its pk, a tuple (row_id, pos_0, pos_1, ..., pos_k, v_min).
    # Conversely, v_max is not part of the primary key, but is simply a bookkeeping device.
    # In an original table, v_max is always equal to the v_min of the succeeding row instance with the same
    # row id, or MAX_VERSION if no such row instance exists. But in the replica, we need to be careful, since
    # we might see only a subset of the original table's versions, and we might see them out of order.

    # We'll adjust the v_max values according to the principle of "latest provable v_max":
    # they will always correspond to the latest version for which we can prove the row instance was alive. This
    # will enable us to maintain consistency of the v_max values if additional table versions are later imported,
    # regardless of the order in which they are seen. It also means that replica tables (unlike original tables)
    # may have gaps in their row version histories, but this is fine; the gaps simply correspond to table versions
    # that have never been observed.

    # Join condition between the store table and the temporary table: equality on every primary key column.
    pk_predicates = [col == temp_cols[col.name] for col in tv.store_tbl.pk_columns()]
    pk_clause = sql.and_(*pk_predicates)

    # If the same pk exists in both the temporary table and the existing table, then the corresponding row data
    # must be identical; the rows can differ only in their v_max value. As a sanity check, we go through the
    # motion of verifying this; a failure implies data corruption in either the replica being imported or in a
    # previously imported replica.

    # "Value" columns are the imported columns that are neither system columns nor stored media columns; the two
    # lists below are built with the same filter over `temp_cols`, so they line up position by position.
    system_col_names = {col.name for col in tv.store_tbl.system_columns()}
    media_col_names = {col.store_name() for col in tv.cols if col.col_type.is_media_type() and col.is_stored}
    value_store_cols = [
        store_sa_tbl.c[col_name]
        for col_name in temp_cols
        if col_name not in system_col_names and col_name not in media_col_names
    ]
    value_temp_cols = [
        col
        for col_name, col in temp_cols.items()
        if col_name not in system_col_names and col_name not in media_col_names
    ]

    q: sql.Executable

    assert len(value_store_cols) == len(value_temp_cols)
    if len(value_store_cols) > 0:
        mismatch_predicates = [
            store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)
        ]
        mismatch_clause = sql.or_(*mismatch_predicates)

        # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
        # one value column. Pseudo-SQL:
        #
        # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
        # FROM store_tbl, temp_tbl
        # WHERE store_tbl.rowid = temp_tbl.rowid
        # AND store_tbl.pos_0 = temp_tbl.pos_0
        # AND ... AND store_tbl.pos_k = temp_tbl.pos_k
        # AND store_tbl.v_min = temp_tbl.v_min
        # AND (
        # store_tbl.col_0 != temp_tbl.col_0
        # OR store_tbl.col_1 != temp_tbl.col_1
        # OR ... OR store_tbl.col_n != temp_tbl.col_n
        # )
        #
        # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
        # either column is NULL; this is what we want, since it may indicate a column that is present in one version
        # but not the other.
        q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
        _logger.debug(q.compile())
        result = conn.execute(q)
        # NOTE(review): this relies on `rowcount` being meaningful for a SELECT, which SQLAlchemy documents as
        # driver-dependent -- confirm against the DBAPI driver in use.
        if result.rowcount > 0:
            _logger.debug(
                f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
                f'{result.rowcount} inconsistent row(s).'
            )
            row = result.first()
            _logger.debug('Example mismatch:')
            # The first len(value_store_cols) entries of the row come from the store table, the rest from the
            # temporary table (matching the column order of the SELECT above).
            _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
            _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
            raise excs.Error(
                'Data corruption error: '
                'the replica data are inconsistent with data retrieved from a previous replica.'
            )

    _logger.debug(f'Verified data integrity between {store_sa_tbl_name!r} and {temp_sa_tbl_name!r}.')

    # Now rectify the v_max values in the temporary table.
    # If a row instance has a concrete v_max value, then we know it's genuine: it's the unique and immutable
    # version when the row was deleted. (This can only happen if later versions of the base table already
    # existed at the time this replica was published.)
    # But if a row instance has a v_max value of MAX_VERSION, then we don't know anything about its future.
    # It might live indefinitely, or it might be deleted as early as version `n + 1`. Following the principle
    # of "latest provable v_max", we simply set v_max equal to `n + 1`.
    q = (
        temp_sa_tbl.update()
        .values(v_max=(replica_version + 1))
        .where(temp_sa_tbl.c.v_max == schema.Table.MAX_VERSION)
    )
    _logger.debug(q.compile())
    result = conn.execute(q)
    _logger.debug(f'Rectified {result.rowcount} row(s) in {temp_sa_tbl_name!r}.')

    # Now rectify the v_max values in the existing table. This is done by simply taking the later of the two v_max
    # values (the existing one and the new one) for each row instance, following the "latest provable v_max"
    # principle. Obviously we only need to do this for rows that exist in both tables (it's a simple join).
    q = (
        store_sa_tbl.update()
        .values(v_max=sql.func.greatest(store_sa_tbl.c.v_max, temp_sa_tbl.c.v_max))
        .where(pk_clause)
    )
    _logger.debug(q.compile())
    result = conn.execute(q)
    _logger.debug(f'Rectified {result.rowcount} row(s) in {store_sa_tbl_name!r}.')

    # STEP 3: Delete any row instances from the temporary table that are already present in the existing table.

    # Now we need to update rows in the existing table that are also present in the temporary table. This is to
    # account for the scenario where the temporary table has columns that are not present in the existing table.
    # (We can't simply replace the rows with their versions in the temporary table, because the converse scenario
    # might also occur; there may be columns in the existing table that are not present in the temporary table.)
    value_update_clauses: dict[str, sql.ColumnElement] = {}
    for temp_col in temp_cols.values():
        if temp_col.name not in system_col_names:
            store_col = store_sa_tbl.c[temp_col.name]
            # Prefer the value from the existing table, substituting the value from the temporary table if it's
            # NULL. This works in all cases (including media columns, where we prefer the existing media file).
            # (`== None` on a SQLAlchemy column is deliberate here: it builds an IS NULL test, not a Python
            # comparison.)
            clause = sql.case((store_col == None, temp_col), else_=store_col)
            value_update_clauses[temp_col.name] = clause
    if len(value_update_clauses) > 0:
        q = store_sa_tbl.update().values(**value_update_clauses).where(pk_clause)
        _logger.debug(q.compile())
        result = conn.execute(q)
        _logger.debug(
            f'Merged values from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r} for {result.rowcount} row(s).'
        )

    # Now drop any rows from the temporary table that are also present in the existing table.
    # The v_max values have been rectified, data has been merged into NULL cells, and all other row values have
    # been verified identical.
    # TODO: Delete any media files that were orphaned by this operation (they're necessarily duplicates of media
    # files that are already present in the existing table).
    q = temp_sa_tbl.delete().where(pk_clause)
    _logger.debug(q.compile())
    result = conn.execute(q)
    _logger.debug(f'Deleted {result.rowcount} row(s) from {temp_sa_tbl_name!r}.')

    # STEP 4: Copy the remaining rows from the temporary table into the existing table.

    # Now copy the remaining data (consisting entirely of new row instances) from the temporary table into
    # the actual table.
    q = store_sa_tbl.insert().from_select(
        [store_sa_tbl.c[col_name] for col_name in temp_cols], sql.select(*temp_cols.values())
    )
    _logger.debug(q.compile())
    result = conn.execute(q)
    _logger.debug(f'Inserted {result.rowcount} row(s) from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r}.')

    # STEP 5: Rectify any index columns.

    # Finally, rectify any index columns in the table. This involves shuffling data between the index's val and
    # undo columns to ensure they appropriately reflect the most recent replicated version of the table.

    # Get the most recent replicated version of the table. This might be the version we're currently importing,
    # but it might be a different version of the table that was previously imported.
    head_version_md = catalog.Catalog.get()._collect_tbl_history(tv.id, n=1)[0]
    head_version = head_version_md.version_md.version
    _logger.debug(f'Head version for index rectification is {head_version}.')

    # Get the index info from the table metadata. Here we use the tbl_md that we just collected from the DB.
    # This is to ensure we pick up ALL indices, including dropped indices and indices that are present in
    # a previously replicated version of the table, but not in the one currently being imported.
    index_md = head_version_md.tbl_md.index_md

    # Now update the table. We can do this for all indices together with just two SQL queries. For each index,
    # at most one of the val or undo columns will be non-NULL in any given row.
    # For rows where v_min <= head_version < v_max, we set, for all indices:
    # val_col = whichever of (val_col, undo_col) is non-NULL (or NULL if both are, e.g., for a dropped index)
    # undo_col = NULL
    # For rows where head_version < v_min or v_max <= head_version, vice versa.
    val_sql_clauses: dict[str, sql.ColumnElement] = {}
    undo_sql_clauses: dict[str, sql.ColumnElement] = {}
    for index in index_md.values():
        # Only indices whose class name ends in '.EmbeddingIndex' are rectified.
        if index.class_fqn.endswith('.EmbeddingIndex'):
            val_col_name = f'col_{index.index_val_col_id}'
            undo_col_name = f'col_{index.index_val_undo_col_id}'
            # Check that the val column for the index is actually present in the store table. We need to do this
            # to properly handle the case where the replica represents a table version that was *not* the most
            # recent version at the time it was published. In that case, it is possible for tbl_md to contain
            # metadata for indices not known to any version that has been replicated. (However, the converse
            # *does* hold: all replicated indices must have metadata in tbl_md; and that's what's important.)
            if val_col_name in store_sa_tbl.c:
                assert undo_col_name in store_sa_tbl.c
                coalesce = sql.func.coalesce(store_sa_tbl.c[val_col_name], store_sa_tbl.c[undo_col_name])
                val_sql_clauses[val_col_name] = coalesce
                val_sql_clauses[undo_col_name] = sql.null()
                undo_sql_clauses[undo_col_name] = coalesce
                undo_sql_clauses[val_col_name] = sql.null()

    if len(val_sql_clauses) > 0:
        # Rows visible at head_version: move the index value into the val column.
        q2 = (
            store_sa_tbl.update()
            .values(**val_sql_clauses)
            .where(sql.and_(tv.store_tbl.v_min_col <= head_version, tv.store_tbl.v_max_col > head_version))
        )
        _logger.debug(q2.compile())
        _ = conn.execute(q2)
        # Rows not visible at head_version: move the index value into the undo column.
        q2 = (
            store_sa_tbl.update()
            .values(**undo_sql_clauses)
            .where(sql.or_(tv.store_tbl.v_min_col > head_version, tv.store_tbl.v_max_col <= head_version))
        )
        _logger.debug(q2.compile())
        _ = conn.execute(q2)
        _logger.debug(f'Rectified index columns in {store_sa_tbl_name!r}.')
    else:
        _logger.debug(f'No index columns to rectify in {store_sa_tbl_name!r}.')
|
|
722
|
+
|
|
723
|
+
def __from_pa_pydict(self, tv: catalog.TableVersion, pydict: dict[str, Any]) -> list[dict[str, Any]]:
    """
    Turn a column-oriented pyarrow dict into a list of row dicts with values converted for the store table.

    Args:
        tv: Table version whose store table supplies the SQL type of every column.
        pydict: Mapping from store column name to the list of values of that column.

    Returns:
        One dict per row, keyed by store column name, with every value passed through `__from_pa_value()`.
    """
    # Data conversions from pyarrow to Pixeltable
    sql_types: dict[str, sql.types.TypeEngine[Any]] = {}
    for name in pydict:
        assert name in tv.store_tbl.sa_tbl.columns
        sql_types[name] = tv.store_tbl.sa_tbl.columns[name].type

    # Look up Pixeltable columns by store name first, then by cellmd store name (later entries win).
    stored_cols: dict[str, catalog.Column] = {}
    for candidate in tv.cols:
        if candidate.is_stored:
            stored_cols[candidate.store_name()] = candidate
    for candidate in tv.cols:
        if candidate.stores_cellmd:
            stored_cols[candidate.cellmd_store_name()] = candidate

    num_rows = len(next(iter(pydict.values())))
    rows: list[dict[str, Any]] = [{} for _ in range(num_rows)]
    for name, values in pydict.items():
        assert len(values) == num_rows
        col = stored_cols.get(name)  # Will be None for system columns
        if col is None:
            is_media_col = False
            is_cellmd_col = False
        else:
            is_media_col = col.is_stored and col.col_type.is_media_type()
            is_cellmd_col = col.stores_cellmd and name == col.cellmd_store_name()
            assert is_cellmd_col or name == col.store_name()

        sql_type = sql_types[name]
        for idx, raw in enumerate(values):
            rows[idx][name] = self.__from_pa_value(raw, sql_type, col, is_media_col, is_cellmd_col)

    return rows
|
|
745
|
+
|
|
746
|
+
def __from_pa_value(
    self,
    val: Any,
    sql_type: sql.types.TypeEngine[Any],
    col: catalog.Column | None,
    is_media_col: bool,
    is_cellmd_col: bool,
) -> Any:
    """
    Convert a single value read from Parquet into the form inserted into the store table.

    Args:
        val: The raw value.
        sql_type: SQL type of the destination column.
        col: The Pixeltable column the value belongs to, if any.
        is_media_col: Whether the value is a media URL that may need relocating.
        is_cellmd_col: Whether the value is serialized cell metadata.

    Returns:
        None for None; a 1-d float32 array for vector columns; decoded (and, for cell metadata, URL-relocated)
        JSON for cellmd and JSON columns; a relocated URL for media columns; otherwise `val` unchanged.
    """
    # NULLs are passed through regardless of the column kind.
    if val is None:
        return None

    if isinstance(sql_type, sql_vector.Vector):
        vec = np.array(val, dtype=np.float32) if isinstance(val, list) else val
        assert isinstance(vec, np.ndarray) and vec.dtype == np.float32 and vec.ndim == 1
        return vec

    # Cell metadata is checked before the generic JSON case so that its file URLs get relocated.
    if is_cellmd_col:
        assert col is not None
        assert isinstance(val, str)
        decoded_md = json.loads(val)
        return self.__restore_cellmd(col, decoded_md)

    if isinstance(sql_type, sql.JSON):
        return json.loads(val)

    if not is_media_col:
        return val

    assert col is not None
    return self.__relocate_media_file(col, val)
|
|
771
|
+
|
|
772
|
+
def __relocate_media_file(self, media_col: catalog.Column, url: str) -> str:
    """
    Map a media URL from the bundle to the URL that should be stored in the restored table.

    A `pxtmedia://` URL is resolved to the file `self.tmp_dir / 'media' / <netloc>`, handed to
    `ObjectOps.put_file()`, and the resulting URL is cached in `self.media_files` so each file is handled once.
    Any other URL is returned unchanged.

    Args:
        media_col: Column passed to `ObjectOps.put_file()` as the destination of the file.
        url: The URL as found in the bundle; must not use the `file` scheme.
    """
    assert isinstance(url, str)
    url_parts = urllib.parse.urlparse(url)
    assert url_parts.scheme != 'file'  # These should all have been converted to pxtmedia:// URLs

    # For any type of URL other than a pxtmedia:// one, just return the URL as-is.
    if url_parts.scheme != 'pxtmedia':
        return url

    if url not in self.media_files:
        # First time seeing this pxtmedia:// URL: hand the extracted file to the media store and remember
        # the URL it was given.
        extracted_file = self.tmp_dir / 'media' / url_parts.netloc
        self.media_files[url] = ObjectOps.put_file(media_col, extracted_file, relocate_or_delete=True)
    return self.media_files[url]
|
|
787
|
+
|
|
788
|
+
def __restore_cellmd(self, col: catalog.Column, cellmd: dict[str, Any]) -> dict[str, Any]:
    """
    Rewrite the file URLs inside a cell-metadata dict so that they point at relocated media files.

    Args:
        col: Column the metadata belongs to; forwarded to `__relocate_media_file()`.
        cellmd: Cell metadata in dict form.

    Returns:
        `cellmd` itself when it carries no file URLs; otherwise a new dict produced from the parsed metadata
        with every URL replaced by its relocated counterpart.
    """
    parsed_md = CellMd.from_dict(cellmd)
    if parsed_md.file_urls is None:
        return cellmd  # No changes

    parsed_md.file_urls = [self.__relocate_media_file(col, file_url) for file_url in parsed_md.file_urls]
    return parsed_md.as_dict()
|