pixeltable-0.4.0rc3-py3-none-any.whl → pixeltable-0.4.20-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- pixeltable/__init__.py +23 -5
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -3
- pixeltable/catalog/catalog.py +1318 -404
- pixeltable/catalog/column.py +186 -115
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +11 -43
- pixeltable/catalog/insertable_table.py +167 -79
- pixeltable/catalog/path.py +61 -23
- pixeltable/catalog/schema_object.py +9 -10
- pixeltable/catalog/table.py +626 -308
- pixeltable/catalog/table_metadata.py +101 -0
- pixeltable/catalog/table_version.py +713 -569
- pixeltable/catalog/table_version_handle.py +37 -6
- pixeltable/catalog/table_version_path.py +42 -29
- pixeltable/catalog/tbl_ops.py +50 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +108 -94
- pixeltable/config.py +128 -22
- pixeltable/dataframe.py +188 -100
- pixeltable/env.py +407 -136
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +3 -0
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +7 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +190 -30
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/in_memory_data_node.py +18 -18
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +206 -101
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +34 -30
- pixeltable/exprs/column_ref.py +92 -96
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +152 -55
- pixeltable/exprs/expr.py +62 -43
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +75 -37
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +10 -27
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +127 -53
- pixeltable/exprs/rowid_ref.py +8 -12
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +10 -10
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +20 -18
- pixeltable/func/signature.py +43 -16
- pixeltable/func/tools.py +23 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +6 -0
- pixeltable/functions/anthropic.py +93 -33
- pixeltable/functions/audio.py +114 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/deepseek.py +20 -9
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +28 -11
- pixeltable/functions/globals.py +13 -13
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1046 -23
- pixeltable/functions/image.py +9 -18
- pixeltable/functions/llama_cpp.py +23 -8
- pixeltable/functions/math.py +3 -4
- pixeltable/functions/mistralai.py +4 -15
- pixeltable/functions/ollama.py +16 -9
- pixeltable/functions/openai.py +104 -82
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +2 -2
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +13 -14
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/video.py +1388 -106
- pixeltable/functions/vision.py +7 -7
- pixeltable/functions/whisper.py +15 -7
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +332 -105
- pixeltable/index/base.py +13 -22
- pixeltable/index/btree.py +23 -22
- pixeltable/index/embedding_index.py +32 -44
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +7 -6
- pixeltable/io/external_store.py +49 -77
- pixeltable/io/fiftyone.py +11 -11
- pixeltable/io/globals.py +29 -28
- pixeltable/io/hf_datasets.py +17 -9
- pixeltable/io/label_studio.py +70 -66
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +12 -11
- pixeltable/io/parquet.py +13 -93
- pixeltable/io/table_data_conduit.py +71 -47
- pixeltable/io/utils.py +3 -3
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +21 -11
- pixeltable/iterators/document.py +116 -55
- pixeltable/iterators/image.py +5 -2
- pixeltable/iterators/video.py +293 -13
- pixeltable/metadata/__init__.py +4 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/util.py +13 -12
- pixeltable/metadata/notes.py +4 -0
- pixeltable/metadata/schema.py +79 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +274 -223
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +259 -129
- pixeltable/share/protocol/__init__.py +34 -0
- pixeltable/share/protocol/common.py +170 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +109 -0
- pixeltable/share/publish.py +213 -57
- pixeltable/store.py +238 -175
- pixeltable/type_system.py +104 -63
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +108 -13
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +528 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +392 -0
- pixeltable-0.4.20.dist-info/METADATA +587 -0
- pixeltable-0.4.20.dist-info/RECORD +218 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
- pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable/utils/sample.py +0 -25
- pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
- pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
- pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/share/__init__.py
CHANGED
pixeltable/share/packager.py
CHANGED
@@ -1,7 +1,6 @@
 import base64
-import
+import dataclasses
 import io
-import itertools
 import json
 import logging
 import tarfile
@@ -9,11 +8,12 @@ import urllib.parse
 import urllib.request
 import uuid
 from pathlib import Path
-from typing import Any, Iterator
+from typing import Any, Iterator
 from uuid import UUID
 
 import more_itertools
 import numpy as np
+import pgvector.sqlalchemy as sql_vector  # type: ignore[import-untyped]
 import PIL.Image
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -21,11 +21,14 @@ import sqlalchemy as sql
 
 import pixeltable as pxt
 from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
+from pixeltable.catalog.table_version import TableVersionCompleteMd
 from pixeltable.env import Env
+from pixeltable.exprs.data_row import CellMd
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
 from pixeltable.utils.formatter import Formatter
-from pixeltable.utils.
+from pixeltable.utils.local_store import TempStore
+from pixeltable.utils.object_stores import ObjectOps
 
 _logger = logging.getLogger('pixeltable')
 
@@ -50,27 +53,27 @@ class TablePackager:
     tmp_dir: Path  # Temporary directory where the package will reside
     tables_dir: Path  # Directory where the Parquet tables will be written
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
-
+    bundle_md: dict[str, Any]
 
     bundle_path: Path
     preview_header: dict[str, str]
     preview: list[list[Any]]
 
-    def __init__(self, table: catalog.Table, additional_md:
+    def __init__(self, table: catalog.Table, additional_md: dict[str, Any] | None = None) -> None:
         self.table = table
-        self.tmp_dir =
+        self.tmp_dir = TempStore.create_path()
         self.media_files = {}
 
-        # Load metadata
+        # Load metadata and convert to JSON immediately
         with catalog.Catalog.get().begin_xact(for_write=False):
             tbl_md = catalog.Catalog.get().load_replica_md(table)
-            self.
+            self.bundle_md = {
                 'pxt_version': pxt.__version__,
                 'pxt_md_version': metadata.VERSION,
-                'md':
+                'md': [dataclasses.asdict(md) for md in tbl_md],
             }
         if additional_md is not None:
-            self.
+            self.bundle_md.update(additional_md)
 
     def package(self) -> Path:
         """
@@ -81,7 +84,7 @@ class TablePackager:
         _logger.info(f'Packaging table {self.table._path()!r} and its ancestors in: {self.tmp_dir}')
         self.tmp_dir.mkdir()
         with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
-            json.dump(self.
+            json.dump(self.bundle_md, fp)
         self.tables_dir = self.tmp_dir / 'tables'
         self.tables_dir.mkdir()
         with catalog.Catalog.get().begin_xact(for_write=False):
@@ -93,10 +96,10 @@ class TablePackager:
         self.bundle_path = self.__build_tarball()
 
         _logger.info('Extracting preview data.')
-        self.
+        self.bundle_md['row_count'] = self.table.count()
         preview_header, preview = self.__extract_preview_data()
-        self.
-        self.
+        self.bundle_md['preview_header'] = preview_header
+        self.bundle_md['preview_data'] = preview
 
         _logger.info(f'Packaging complete: {self.bundle_path}')
         return self.bundle_path
@@ -109,9 +112,12 @@ class TablePackager:
         assert any(tv.id == base.id for base in self.table._tbl_version_path.get_tbl_versions())
         sql_types = {col.name: col.type for col in tv.store_tbl.sa_tbl.columns}
         media_cols: set[str] = set()
+        cellmd_cols: set[str] = set()
         for col in tv.cols:
            if col.is_stored and col.col_type.is_media_type():
                media_cols.add(col.store_name())
+            if col.stores_cellmd:
+                cellmd_cols.add(col.cellmd_store_name())
 
         parquet_schema = self.__to_parquet_schema(tv.store_tbl.sa_tbl)
         # TODO: Partition larger tables into multiple parquet files. (The parquet file naming scheme anticipates
@@ -126,10 +132,10 @@ class TablePackager:
         # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Parquet table on disk.
         # We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
         # faster compression should provide good performance while still reducing temporary storage utilization.
-        parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='
-        filter_tv = self.table.
+        parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='snappy')
+        filter_tv = self.table._tbl_version_path.tbl_version.get()
         row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
-        for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
+        for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, cellmd_cols, parquet_schema):
             parquet_writer.write_table(pa_table)
         parquet_writer.close()
 
@@ -138,7 +144,7 @@ class TablePackager:
     @classmethod
     def __to_parquet_schema(cls, store_tbl: sql.Table) -> pa.Schema:
         entries = [(col_name, cls.__to_parquet_type(col.type)) for col_name, col in store_tbl.columns.items()]
-        return pa.schema(entries)
+        return pa.schema(entries)
 
     @classmethod
     def __to_parquet_type(cls, col_type: sql.types.TypeEngine[Any]) -> pa.DataType:
@@ -151,13 +157,17 @@ class TablePackager:
         if isinstance(col_type, sql.Float):
             return pa.float32()
         if isinstance(col_type, sql.TIMESTAMP):
-            return pa.timestamp('us', tz=
+            return pa.timestamp('us', tz='UTC')
         if isinstance(col_type, sql.Date):
             return pa.date32()
         if isinstance(col_type, sql.JSON):
             return pa.string()  # JSON will be exported as strings
         if isinstance(col_type, sql.LargeBinary):
             return pa.binary()
+        if isinstance(col_type, sql_vector.Vector):
+            # Parquet/pyarrow do not handle null values properly for fixed_shape_tensor(), so we have to use list_()
+            # here instead.
+            return pa.list_(pa.float32())
         raise AssertionError(f'Unrecognized SQL type: {col_type} (type {type(col_type)})')
 
     def __to_pa_tables(
@@ -165,6 +175,7 @@ class TablePackager:
         row_iter: Iterator[dict[str, Any]],
         sql_types: dict[str, sql.types.TypeEngine[Any]],
         media_cols: set[str],
+        cellmd_cols: set[str],
         arrow_schema: pa.Schema,
         batch_size: int = 1_000,
     ) -> Iterator[pa.Table]:
@@ -176,14 +187,21 @@ class TablePackager:
         for rows in more_itertools.batched(row_iter, batch_size):
             cols = {}
             for name, sql_type in sql_types.items():
-
-
+                values = [
+                    self.__to_pa_value(row.get(name), sql_type, name in media_cols, name in cellmd_cols) for row in rows
+                ]
                 cols[name] = values
             yield pa.Table.from_pydict(cols, schema=arrow_schema)
 
-    def __to_pa_value(
+    def __to_pa_value(
+        self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool, is_cellmd_col: bool
+    ) -> Any:
         if val is None:
             return None
+        if is_cellmd_col:
+            assert isinstance(val, dict)
+            # Export JSON as strings
+            return json.dumps(self.__process_cellmd(val))
         if isinstance(sql_type, sql.JSON):
             # Export JSON as strings
             return json.dumps(val)
@@ -194,6 +212,10 @@ class TablePackager:
         return val
 
     def __process_media_url(self, url: str) -> str:
+        """
+        Process a media URL for export. If it's a local file URL (file://), then replace it with a pxtmedia:// URI,
+        copying the file into the tarball if necessary. If it's any other type of URL, return it unchanged.
+        """
         parsed_url = urllib.parse.urlparse(url)
         if parsed_url.scheme == 'file':
             # It's the URL of a local file. Replace it with a pxtmedia:// URI.
@@ -214,6 +236,21 @@ class TablePackager:
         # For any type of URL other than a local file, just return the URL as-is.
         return url
 
+    def __process_cellmd(self, cellmd: dict[str, Any]) -> dict[str, Any]:
+        """
+        Process a cellmd dictionary for export. This involves replacing any local file references
+        with pxtmedia:// URIs, as described above.
+        """
+        cellmd_ = CellMd.from_dict(cellmd)
+        if cellmd_.file_urls is None:
+            return cellmd  # No changes
+
+        updated_urls: list[str] = []
+        for url in cellmd_.file_urls:
+            updated_urls.append(self.__process_media_url(url))
+        cellmd_.file_urls = updated_urls
+        return cellmd_.as_dict()
+
     def __build_tarball(self) -> Path:
         bundle_path = self.tmp_dir / 'bundle.tar.bz2'
         with tarfile.open(bundle_path, 'w:bz2') as tf:
@@ -237,8 +274,7 @@ class TablePackager:
         - Videos are replaced by their first frame and resized as above
         - Documents are replaced by a thumbnail as a base64-encoded webp
         """
-
-        preview_cols = dict(itertools.islice(self.table._schema.items(), 0, 8))
+        preview_cols = self.table._get_schema()
         select_list = [self.table[col_name] for col_name in preview_cols]
         # First 5 rows
         rows = list(self.table.select(*select_list).head(n=5))
@@ -308,11 +344,11 @@ class TablePackager:
         scaled_img.save(buffer, 'webp')
         return base64.b64encode(buffer.getvalue()).decode()
 
-    def __encode_video(self, video_path: str) ->
+    def __encode_video(self, video_path: str) -> str | None:
         thumb = Formatter.extract_first_video_frame(video_path)
         return self.__encode_image(thumb) if thumb is not None else None
 
-    def __encode_document(self, doc_path: str) ->
+    def __encode_document(self, doc_path: str) -> str | None:
         thumb = Formatter.make_document_thumbnail(doc_path)
         return self.__encode_image(thumb) if thumb is not None else None
 
@@ -324,20 +360,21 @@ class TableRestorer:
 
     Args:
         tbl_path: Pixeltable path (such as 'my_dir.my_table') where the materialized table will be made visible.
-
+        bundle_md: Optional metadata dictionary.
+            If not provided, metadata will be read from the tarball's `metadata.json`.
            The metadata contains table_md, table_version_md, and table_schema_version_md entries for each ancestor
            of the table being restored, as written out by `TablePackager`.
    """
 
    tbl_path: str
-
+    bundle_md: dict[str, Any] | None
    tmp_dir: Path
    media_files: dict[str, str]  # Mapping from pxtmedia:// URLs to local file:// URLs
 
-    def __init__(self, tbl_path: str,
+    def __init__(self, tbl_path: str, bundle_md: dict[str, Any] | None = None) -> None:
        self.tbl_path = tbl_path
-        self.
-        self.tmp_dir =
+        self.bundle_md = bundle_md
+        self.tmp_dir = TempStore.create_path()
        self.media_files = {}
 
    def restore(self, bundle_path: Path) -> pxt.Table:
@@ -346,12 +383,12 @@ class TableRestorer:
        with tarfile.open(bundle_path, 'r:bz2') as tf:
            tf.extractall(path=self.tmp_dir)
 
-        if self.
+        if self.bundle_md is None:
            # No metadata supplied; read it from the archive
            with open(self.tmp_dir / 'metadata.json', 'r', encoding='utf8') as fp:
-                self.
+                self.bundle_md = json.load(fp)
 
-        pxt_md_version = self.
+        pxt_md_version = self.bundle_md['pxt_md_version']
        assert isinstance(pxt_md_version, int)
 
        if pxt_md_version != metadata.VERSION:
@@ -359,51 +396,40 @@ class TableRestorer:
                f'Pixeltable metadata version mismatch: {pxt_md_version} != {metadata.VERSION}.\n'
                'Please upgrade Pixeltable to use this dataset: pip install -U pixeltable'
            )
+        # Convert tables metadata from dict to list of TableVersionCompleteMd
+        tbl_md = [schema.md_from_dict(TableVersionCompleteMd, t) for t in self.bundle_md['md']]
+
+        for md in tbl_md:
+            md.tbl_md.is_replica = True
 
-
+        assert not tbl_md[0].version_md.is_fragment  # Top-level table cannot be a version fragment
 
-        # Create the replica table
-        # The logic here needs to be completely restructured in order to make it concurrency-safe.
-        # - Catalog.create_replica() needs to write the metadata and also create the physical store tables
-        # and populate them, otherwise concurrent readers will see an inconsistent state (table metadata w/o
-        # an actual table)
-        # - this could be done one replica at a time (instead of the entire hierarchy)
        cat = catalog.Catalog.get()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # Instantiate data from the Parquet tables.
-        with Env.get().begin_xact():
-            for md in ancestor_md[::-1]:  # Base table first
-                # Create a TableVersion instance (and a store table) for this ancestor.
-                tv = catalog.TableVersion.create_replica(md)
-                # Now import data from Parquet.
-                _logger.info(f'Importing table {tv.name!r}.')
-                self.__import_table(self.tmp_dir, tv, md)
-
-        with cat.begin_xact(for_write=False):
+
+        with cat.begin_xact(for_write=True):
+            # Create (or update) the replica table and its ancestors, along with TableVersion instances for any
+            # versions that have not been seen before.
+            cat.create_replica(catalog.Path.parse(self.tbl_path), tbl_md)
+
+            _logger.debug(f'Now will import data for {len(tbl_md)} table(s):')
+            _logger.debug(repr([md.tbl_md.tbl_id for md in tbl_md[::-1]]))
+
+            # Now we need to load data for replica_tbl and its ancestors, except that we skip
+            # replica_tbl itself if it's a pure snapshot.
+            for md in tbl_md[::-1]:  # Base table first
+                if not md.is_pure_snapshot:
+                    tv = cat.get_tbl_version(UUID(md.tbl_md.tbl_id), md.version_md.version)
+                    # Import data from Parquet.
+                    _logger.info(f'Importing table {tv.name!r}.')
+                    self.__import_table(self.tmp_dir, tv, md)
+
        return cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id))
 
-    def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md:
+    def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: TableVersionCompleteMd) -> None:
        """
        Import the Parquet table into the Pixeltable catalog.
        """
-        tbl_id =
+        tbl_id = UUID(tbl_md.tbl_md.tbl_id)
        parquet_dir = bundle_path / 'tables' / f'tbl_{tbl_id.hex}'
        parquet_table = pq.read_table(str(parquet_dir))
        replica_version = tv.version
@@ -422,6 +448,9 @@ class TableRestorer:
        # 2. "rectify" the v_max values in both the temporary table and the existing table (more on this below);
        # 3. Delete any row instances from the temporary table that are already present in the existing table;
        # 4. Copy the remaining rows from the temporary table into the existing table.
+        # 5. Rectify any index columns.
+
+        # STEP 1: Import the parquet data into a temporary table.
 
        # Create a temporary table for the initial data load, containing columns for all columns present in the
        # parquet table. The parquet columns have identical names to those in the store table, so we can use the
@@ -429,7 +458,7 @@ class TableRestorer:
        # e.g., pa.string() may hold either VARCHAR or serialized JSONB).
        temp_cols: dict[str, sql.Column] = {}
        for field in parquet_table.schema:
-            assert field.name in store_sa_tbl.columns
+            assert field.name in store_sa_tbl.columns, f'{field.name} not in {list(store_sa_tbl.columns)}'
            col_type = store_sa_tbl.columns[field.name].type
            temp_cols[field.name] = sql.Column(field.name, col_type)
        temp_sa_tbl_name = f'temp_{uuid.uuid4().hex}'
@@ -445,6 +474,8 @@ class TableRestorer:
                rows = self.__from_pa_pydict(tv, pydict)
                conn.execute(sql.insert(temp_sa_tbl), rows)
 
+            # STEP 2: Rectify v_max values.
+
            # Each row version is identified uniquely by its pk, a tuple (row_id, pos_0, pos_1, ..., pos_k, v_min).
            # Conversely, v_max is not part of the primary key, but is simply a bookkeeping device.
            # In an original table, v_max is always equal to the v_min of the succeeding row instance with the same
@@ -478,42 +509,51 @@ class TableRestorer:
                for col_name, col in temp_cols.items()
                if col_name not in system_col_names and col_name not in media_col_names
            ]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-
-
-
-
-
-
-
+
+            q: sql.Executable
+
+            assert len(value_store_cols) == len(value_temp_cols)
+            if len(value_store_cols) > 0:
+                mismatch_predicates = [
+                    store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)
+                ]
+                mismatch_clause = sql.or_(*mismatch_predicates)
+
+                # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
+                # one value column. Pseudo-SQL:
+                #
+                # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
+                # FROM store_tbl, temp_tbl
+                # WHERE store_tbl.rowid = temp_tbl.rowid
+                # AND store_tbl.pos_0 = temp_tbl.pos_0
+                # AND ... AND store_tbl.pos_k = temp_tbl.pos_k
+                # AND store_tbl.v_min = temp_tbl.v_min
+                # AND (
+                # store_tbl.col_0 != temp_tbl.col_0
+                # OR store_tbl.col_1 != temp_tbl.col_1
+                # OR ... OR store_tbl.col_n != temp_tbl.col_n
+                # )
+                #
+                # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
+                # either column is NULL; this is what we want, since it may indicate a column that is present in one version
+                # but not the other.
+                q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
+                _logger.debug(q.compile())
+                result = conn.execute(q)
+                if result.rowcount > 0:
+                    _logger.debug(
+                        f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
+                        f'{result.rowcount} inconsistent row(s).'
+                    )
+                    row = result.first()
+                    _logger.debug('Example mismatch:')
+                    _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
+                    _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
+                    raise excs.Error(
+                        'Data corruption error: '
+                        'the replica data are inconsistent with data retrieved from a previous replica.'
+                    )
+
            _logger.debug(f'Verified data integrity between {store_sa_tbl_name!r} and {temp_sa_tbl_name!r}.')
 
            # Now rectify the v_max values in the temporary table.
@@ -544,6 +584,8 @@ class TableRestorer:
            result = conn.execute(q)
            _logger.debug(f'Rectified {result.rowcount} row(s) in {store_sa_tbl_name!r}.')
 
+            # STEP 3: Delete any row instances from the temporary table that are already present in the existing table.
+
            # Now we need to update rows in the existing table that are also present in the temporary table. This is to
            # account for the scenario where the temporary table has columns that are not present in the existing table.
            # (We can't simply replace the rows with their versions in the temporary table, because the converse scenario
@@ -574,7 +616,9 @@ class TableRestorer:
            result = conn.execute(q)
            _logger.debug(f'Deleted {result.rowcount} row(s) from {temp_sa_tbl_name!r}.')
 
-            #
+            # STEP 4: Copy the remaining rows from the temporary table into the existing table.
+
+            # Now copy the remaining data (consisting entirely of new row instances) from the temporary table into
            # the actual table.
            q = store_sa_tbl.insert().from_select(
                [store_sa_tbl.c[col_name] for col_name in temp_cols], sql.select(*temp_cols.values())
@@ -583,42 +627,118 @@ class TableRestorer:
            result = conn.execute(q)
            _logger.debug(f'Inserted {result.rowcount} row(s) from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r}.')
 
+            # STEP 5: Rectify any index columns.
+
+            # Finally, rectify any index columns in the table. This involves shuffling data between the index's val and
+            # undo columns to ensure they appropriately reflect the most recent replicated version of the table.
+
+            # Get the most recent replicated version of the table. This might be the version we're currently importing,
+            # but it might be a different version of the table that was previously imported.
+            head_version_md = catalog.Catalog.get()._collect_tbl_history(tv.id, n=1)[0]
+            head_version = head_version_md.version_md.version
+            _logger.debug(f'Head version for index rectification is {head_version}.')
+
+            # Get the index info from the table metadata. Here we use the tbl_md that we just collected from the DB.
+            # This is to ensure we pick up ALL indices, including dropped indices and indices that are present in
+            # a previously replicated version of the table, but not in the one currently being imported.
+            index_md = head_version_md.tbl_md.index_md
+
+            # Now update the table. We can do this for all indices together with just two SQL queries. For each index,
+            # at most one of the val or undo columns will be non-NULL in any given row.
+            # For rows where v_min <= head_version < v_max, we set, for all indices:
+            # val_col = whichever of (val_col, undo_col) is non-NULL (or NULL if both are, e.g., for a dropped index)
+            # undo_col = NULL
+            # For rows where head_version < v_min or v_max <= head_version, vice versa.
+            val_sql_clauses: dict[str, sql.ColumnElement] = {}
+            undo_sql_clauses: dict[str, sql.ColumnElement] = {}
+            for index in index_md.values():
+                if index.class_fqn.endswith('.EmbeddingIndex'):
+                    val_col_name = f'col_{index.index_val_col_id}'
+                    undo_col_name = f'col_{index.index_val_undo_col_id}'
+                    # Check that the val column for the index is actually present in the store table. We need to do this
+                    # to properly handle the case where the replica represents a table version that was *not* the most
+                    # recent version at the time it was published. In that case, it is possible for tbl_md to contain
+                    # metadata for indices not known to any version that has been replicated. (However, the converse
+                    # *does* hold: all replicated indices must have metadata in tbl_md; and that's what's important.)
+                    if val_col_name in store_sa_tbl.c:
+                        assert undo_col_name in store_sa_tbl.c
+                        coalesce = sql.func.coalesce(store_sa_tbl.c[val_col_name], store_sa_tbl.c[undo_col_name])
+                        val_sql_clauses[val_col_name] = coalesce
+                        val_sql_clauses[undo_col_name] = sql.null()
+                        undo_sql_clauses[undo_col_name] = coalesce
+                        undo_sql_clauses[val_col_name] = sql.null()
+
+            if len(val_sql_clauses) > 0:
+                q2 = (
+                    store_sa_tbl.update()
+                    .values(**val_sql_clauses)
+                    .where(sql.and_(tv.store_tbl.v_min_col <= head_version, tv.store_tbl.v_max_col > head_version))
+                )
+                _logger.debug(q2.compile())
+                _ = conn.execute(q2)
+                q2 = (
+                    store_sa_tbl.update()
+                    .values(**undo_sql_clauses)
+                    .where(sql.or_(tv.store_tbl.v_min_col > head_version, tv.store_tbl.v_max_col <= head_version))
+                )
+                _logger.debug(q2.compile())
+                _ = conn.execute(q2)
+                _logger.debug(f'Rectified index columns in {store_sa_tbl_name!r}.')
+            else:
+                _logger.debug(f'No index columns to rectify in {store_sa_tbl_name!r}.')
+
    def __from_pa_pydict(self, tv: catalog.TableVersion, pydict: dict[str, Any]) -> list[dict[str, Any]]:
        # Data conversions from pyarrow to Pixeltable
        sql_types: dict[str, sql.types.TypeEngine[Any]] = {}
        for col_name in pydict:
            assert col_name in tv.store_tbl.sa_tbl.columns
            sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
-
-        for col in tv.cols
-            if col.is_stored and col.col_type.is_media_type():
-                media_col_ids[col.store_name()] = col.id
+        stored_cols: dict[str, catalog.Column] = {col.store_name(): col for col in tv.cols if col.is_stored}
+        stored_cols |= {col.cellmd_store_name(): col for col in tv.cols if col.stores_cellmd}
 
        row_count = len(next(iter(pydict.values())))
-        rows: list[dict[str, Any]] = []
-        for
-
-
-
-
-
+        rows: list[dict[str, Any]] = [{} for _ in range(row_count)]
+        for col_name, col_vals in pydict.items():
+            assert len(col_vals) == row_count
+            col = stored_cols.get(col_name)  # Will be None for system columns
+            is_media_col = col is not None and col.is_stored and col.col_type.is_media_type()
+            is_cellmd_col = col is not None and col.stores_cellmd and col_name == col.cellmd_store_name()
+            assert col is None or is_cellmd_col or col_name == col.store_name()
+
+            for i, val in enumerate(col_vals):
+                rows[i][col_name] = self.__from_pa_value(val, sql_types[col_name], col, is_media_col, is_cellmd_col)
 
        return rows
 
    def __from_pa_value(
-        self,
+        self,
+        val: Any,
+        sql_type: sql.types.TypeEngine[Any],
+        col: catalog.Column | None,
+        is_media_col: bool,
+        is_cellmd_col: bool,
    ) -> Any:
        if val is None:
            return None
+        if isinstance(sql_type, sql_vector.Vector):
+            if isinstance(val, list):
+                val = np.array(val, dtype=np.float32)
+            assert isinstance(val, np.ndarray) and val.dtype == np.float32 and val.ndim == 1
+            return val
+        if is_cellmd_col:
+            assert col is not None
+            assert isinstance(val, str)
+            return self.__restore_cellmd(col, json.loads(val))
        if isinstance(sql_type, sql.JSON):
            return json.loads(val)
-        if
-            assert
-            return self.__relocate_media_file(
+        if is_media_col:
+            assert col is not None
+            return self.__relocate_media_file(col, val)
        return val
 
-    def __relocate_media_file(self,
+    def __relocate_media_file(self, media_col: catalog.Column, url: str) -> str:
        # If this is a pxtmedia:// URL, relocate it
+        assert isinstance(url, str)
        parsed_url = urllib.parse.urlparse(url)
        assert parsed_url.scheme != 'file'  # These should all have been converted to pxtmedia:// URLs
        if parsed_url.scheme == 'pxtmedia':
@@ -626,9 +746,19 @@ class TableRestorer:
                # First time seeing this pxtmedia:// URL. Relocate the file to the media store and record the mapping
                # in self.media_files.
                src_path = self.tmp_dir / 'media' / parsed_url.netloc
-
-
-                self.media_files[url] = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+                # Move the file to the media store and update the URL.
+                self.media_files[url] = ObjectOps.put_file(media_col, src_path, relocate_or_delete=True)
            return self.media_files[url]
        # For any type of URL other than a local file, just return the URL as-is.
        return url
+
+    def __restore_cellmd(self, col: catalog.Column, cellmd: dict[str, Any]) -> dict[str, Any]:
+        cellmd_ = CellMd.from_dict(cellmd)
+        if cellmd_.file_urls is None:
+            return cellmd  # No changes
+
+        updated_urls: list[str] = []
+        for url in cellmd_.file_urls:
+            updated_urls.append(self.__relocate_media_file(col, url))
+        cellmd_.file_urls = updated_urls
+        return cellmd_.as_dict()