pixeltable 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +2 -27
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +9 -7
- pixeltable/catalog/column.py +6 -2
- pixeltable/catalog/dir.py +2 -1
- pixeltable/catalog/insertable_table.py +11 -0
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +27 -38
- pixeltable/catalog/table_version.py +19 -0
- pixeltable/catalog/table_version_path.py +7 -0
- pixeltable/catalog/view.py +31 -0
- pixeltable/dataframe.py +50 -7
- pixeltable/env.py +1 -1
- pixeltable/exceptions.py +20 -2
- pixeltable/exec/aggregation_node.py +14 -0
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/expr_eval/evaluators.py +0 -4
- pixeltable/exec/expr_eval/expr_eval_node.py +1 -2
- pixeltable/exec/sql_node.py +3 -2
- pixeltable/exprs/column_ref.py +42 -17
- pixeltable/exprs/data_row.py +3 -0
- pixeltable/exprs/globals.py +1 -1
- pixeltable/exprs/literal.py +11 -1
- pixeltable/exprs/rowid_ref.py +4 -1
- pixeltable/exprs/similarity_expr.py +1 -1
- pixeltable/func/function.py +1 -1
- pixeltable/func/udf.py +1 -1
- pixeltable/functions/__init__.py +2 -0
- pixeltable/functions/anthropic.py +1 -1
- pixeltable/functions/bedrock.py +130 -0
- pixeltable/functions/date.py +185 -0
- pixeltable/functions/gemini.py +22 -20
- pixeltable/functions/globals.py +1 -16
- pixeltable/functions/huggingface.py +7 -6
- pixeltable/functions/image.py +15 -16
- pixeltable/functions/json.py +2 -1
- pixeltable/functions/math.py +40 -0
- pixeltable/functions/mistralai.py +3 -2
- pixeltable/functions/openai.py +9 -8
- pixeltable/functions/string.py +1 -2
- pixeltable/functions/together.py +4 -3
- pixeltable/functions/video.py +2 -2
- pixeltable/globals.py +26 -9
- pixeltable/io/datarows.py +4 -3
- pixeltable/io/hf_datasets.py +2 -2
- pixeltable/io/label_studio.py +17 -17
- pixeltable/io/pandas.py +29 -16
- pixeltable/io/parquet.py +2 -0
- pixeltable/io/table_data_conduit.py +8 -2
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_34.py +21 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/plan.py +12 -5
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +219 -119
- pixeltable/share/publish.py +61 -16
- pixeltable/store.py +45 -20
- pixeltable/type_system.py +46 -2
- pixeltable/utils/arrow.py +8 -2
- pixeltable/utils/pytorch.py +4 -0
- {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/METADATA +2 -4
- {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/RECORD +66 -63
- {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/WHEEL +1 -1
- {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/entry_points.txt +0 -0
pixeltable/share/packager.py
CHANGED
@@ -1,4 +1,4 @@
-import
+import datetime
 import json
 import logging
 import tarfile
@@ -9,45 +9,38 @@ from pathlib import Path
 from typing import Any, Iterator, Optional

 import more_itertools
-import numpy as np
 import pyarrow as pa
-import
+import pyarrow.parquet as pq
+import sqlalchemy as sql

 import pixeltable as pxt
-
-from pixeltable import catalog, exprs, metadata
-from pixeltable.dataframe import DataFrame
+from pixeltable import catalog, exceptions as excs, metadata
 from pixeltable.env import Env
-from pixeltable.
-from pixeltable.utils.
+from pixeltable.metadata import schema
+from pixeltable.utils.media_store import MediaStore

 _logger = logging.getLogger('pixeltable')


 class TablePackager:
     """
-    Packages a pixeltable Table into a tarball containing
+    Packages a pixeltable Table into a tarball containing Parquet tables and media files. The structure of the tarball
     is as follows:

-        metadata.json  # Pixeltable metadata for the packaged table
-
-        warehouse/pxt.db/**  # Iceberg metadata and data files (parquet/avro/json)
+        metadata.json  # Pixeltable metadata for the packaged table and its ancestors
+        tables/**  # Parquet tables for the packaged table and its ancestors, each table in a directory 'tbl_{tbl_id.hex}'
         media/**  # Local media files

-    If the table being archived is a view, then the Iceberg catalog will contain separate tables for the view and each
-    of its ancestors. All rows will be exported with additional _rowid and _v_min columns. Currently, only the most
-    recent version of the table can be exported, and only the full table contents.
-
     If the table contains media columns, they are handled as follows:
     - If a media file has an external URL (any URL scheme other than file://), then the URL will be preserved as-is and
-      stored in the
+      stored in the Parquet table.
     - If a media file is a local file, then it will be copied into the tarball as a file of the form
-      'media/{uuid}{extension}', and the
+      'media/{uuid}{extension}', and the Parquet table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
     """

     table: catalog.Table  # The table to be packaged
     tmp_dir: Path  # Temporary directory where the package will reside
-
+    tables_dir: Path  # Directory where the Parquet tables will be written
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
     md: dict[str, Any]

@@ -69,138 +62,113 @@ class TablePackager:

     def package(self) -> Path:
         """
-        Export the table to a tarball containing
+        Export the table to a tarball containing Parquet tables and media files.
         """
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
         _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
         self.tmp_dir.mkdir()
         with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
             json.dump(self.md, fp)
-        self.
+        self.tables_dir = self.tmp_dir / 'tables'
+        self.tables_dir.mkdir()
         with Env.get().begin_xact():
-
-
-
-                self.__export_table(t)
+            for tv in self.table._tbl_version_path.get_tbl_versions():
+                _logger.info(f"Exporting table '{tv.get().name}:{tv.get().version}'.")
+                self.__export_table(tv.get())
         _logger.info('Building archive.')
         bundle_path = self.__build_tarball()
         _logger.info(f'Packaging complete: {bundle_path}')
         return bundle_path

-    def __export_table(self,
+    def __export_table(self, tv: catalog.TableVersion) -> None:
         """
-        Exports the data from `t` into
+        Exports the data from `t` into a Parquet table.
         """
-        #
-
-
-
-
-
-
-        select_exprs: dict[str, exprs.Expr] = {}
-
-        # As we generate the select list, we construct a separate list of column types. We can't rely on df._schema
-        # to get the column types, since we'll be substituting `fileurl`s for media columns.
-        actual_col_types: list[ts.ColumnType] = []
-
-        for col_name, col in t._tbl_version.get().cols_by_name.items():
-            if not col.is_stored:
-                continue
-            if col.col_type.is_media_type():
-                select_exprs[col_name] = t[col_name].fileurl
-            else:
-                select_exprs[col_name] = t[col_name]
-            actual_col_types.append(col.col_type)
-            if col.records_errors:
-                select_exprs[f'{col_name}_errortype'] = t[col_name].errortype
-                actual_col_types.append(ts.StringType())
-                select_exprs[f'{col_name}_errormsg'] = t[col_name].errormsg
-                actual_col_types.append(ts.StringType())
-
-        # Run the select() on `self.table`, not `t`, so that we export only those rows that are actually present in
-        # `self.table`.
-        df = self.table.select(**select_exprs)
-        namespace = self.__iceberg_namespace(t)
-        self.iceberg_catalog.create_namespace_if_not_exists(namespace)
-        iceberg_schema = self.__to_iceberg_schema(df._schema)
-        iceberg_tbl = self.iceberg_catalog.create_table(f'{namespace}.{t._name}', schema=iceberg_schema)
-
-        # Populate the Iceberg table with data.
-        # The data is first loaded from the DataFrame into a sequence of pyarrow tables, batched in order to avoid
-        # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Iceberg table on disk.
-        for pa_table in self.__to_pa_tables(df, actual_col_types, iceberg_schema):
-            iceberg_tbl.append(pa_table)
+        # `tv` must be an ancestor of the primary table
+        assert any(tv.id == base.id for base in self.table._tbl_version_path.get_tbl_versions())
+        sql_types = {col.name: col.type for col in tv.store_tbl.sa_tbl.columns}
+        media_cols: set[str] = set()
+        for col in tv.cols_by_name.values():
+            if col.is_stored and col.col_type.is_media_type():
+                media_cols.add(col.store_name())

-
-
-
-
-
-
-
-            return 'pxt'
-        else:
-            return f'pxt.{parent_path}'
+        parquet_schema = self.__to_parquet_schema(tv.store_tbl.sa_tbl)
+        # TODO: Partition larger tables into multiple parquet files. (The parquet file naming scheme anticipates
+        # future support for this.)
+        parquet_dir = self.tables_dir / f'tbl_{tv.id.hex}'
+        parquet_dir.mkdir()
+        parquet_file = parquet_dir / f'tbl_{tv.id.hex}.00000.parquet'
+        _logger.info(f'Creating parquet table: {parquet_file}')

-
-
-
-
-
-
+        # Populate the Parquet table with data.
+        # The data is first loaded from the DataFrame into a sequence of pyarrow tables, batched in order to avoid
+        # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Parquet table on disk.
+        # We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
+        # faster compression should provide good performance while still reducing temporary storage utilization.
+        parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='SNAPPY')
+        filter_tv = self.table._tbl_version.get()
+        row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
+        for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
+            parquet_writer.write_table(pa_table)
+        parquet_writer.close()
+
+    # The following methods are responsible for schema and data conversion from Pixeltable to Parquet.

     @classmethod
-    def
-        entries = [(
-        entries.append(('_rowid', pa.list_(pa.int64())))
-        entries.append(('_v_min', pa.int64()))
+    def __to_parquet_schema(cls, store_tbl: sql.Table) -> pa.Schema:
+        entries = [(col_name, cls.__to_parquet_type(col.type)) for col_name, col in store_tbl.columns.items()]
         return pa.schema(entries)  # type: ignore[arg-type]

     @classmethod
-    def
-        if col_type.
-            return pa.binary()
-        if col_type.is_media_type():
+    def __to_parquet_type(cls, col_type: sql.types.TypeEngine[Any]) -> pa.DataType:
+        if isinstance(col_type, sql.String):
             return pa.string()
-
+        if isinstance(col_type, sql.Boolean):
+            return pa.bool_()
+        if isinstance(col_type, sql.BigInteger):
+            return pa.int64()
+        if isinstance(col_type, sql.Float):
+            return pa.float32()
+        if isinstance(col_type, sql.TIMESTAMP):
+            return pa.timestamp('us', tz=datetime.timezone.utc)
+        if isinstance(col_type, sql.Date):
+            return pa.date32()
+        if isinstance(col_type, sql.JSON):
+            return pa.string()  # JSON will be exported as strings
+        if isinstance(col_type, sql.LargeBinary):
+            return pa.binary()
+        raise AssertionError(f'Unrecognized SQL type: {col_type} (type {type(col_type)})')

     def __to_pa_tables(
-        self,
+        self,
+        row_iter: Iterator[dict[str, Any]],
+        sql_types: dict[str, sql.types.TypeEngine[Any]],
+        media_cols: set[str],
+        arrow_schema: pa.Schema,
+        batch_size: int = 1_000,
     ) -> Iterator[pa.Table]:
         """
-
-
+        Group rows into a sequence of pyarrow tables, batched into smaller chunks to minimize memory utilization.
+        The row dictionaries have the format {store_col_name: value}, where the values reflect the unprocessed contents
+        of the store database (as returned by `StoreTable.dump_rows()`).
         """
-        for rows in more_itertools.batched(
-            cols = {
-
-
+        for rows in more_itertools.batched(row_iter, batch_size):
+            cols = {}
+            for name, sql_type in sql_types.items():
+                is_media_col = name in media_cols
+                values = [self.__to_pa_value(row.get(name), sql_type, is_media_col) for row in rows]
+                cols[name] = values
             yield pa.Table.from_pydict(cols, schema=arrow_schema)

-    def
-        for row in df._exec():
-            vals = [row[e.slot_idx] for e in df._select_list_exprs]
-            result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
-            result.append(row.rowid)
-            result.append(row.v_min)
-            yield result
-
-    def __to_pa_value(self, val: Any, col_type: ts.ColumnType) -> Any:
+    def __to_pa_value(self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool) -> Any:
         if val is None:
             return None
-        if
-            # Export arrays as binary
-            assert isinstance(val, np.ndarray)
-            arr = io.BytesIO()
-            np.save(arr, val)
-            return arr.getvalue()
-        if col_type.is_json_type():
+        if isinstance(sql_type, sql.JSON):
             # Export JSON as strings
             return json.dumps(val)
-        if
+        if is_media_col:
             # Handle media files as described above
-            assert isinstance(val, str)
+            assert isinstance(val, str)
             return self.__process_media_url(val)
         return val

@@ -225,9 +193,141 @@ class TablePackager:
         with tarfile.open(bundle_path, 'w:bz2') as tf:
             # Add metadata json
             tf.add(self.tmp_dir / 'metadata.json', arcname='metadata.json')
-            # Add the
-            tf.add(self.
+            # Add the dir containing Parquet tables
+            tf.add(self.tables_dir, arcname='tables')
             # Add the media files
             for src_file, dest_name in self.media_files.items():
                 tf.add(src_file, arcname=f'media/{dest_name}')
             return bundle_path
+
+
+class TableRestorer:
+    """
+    Creates a replica table from a tarball containing Parquet tables and media files. See the `TablePackager` docs for
+    details on the tarball structure.
+
+    Args:
+        tbl_path: Pixeltable path (such as 'my_dir.my_table') where the materialized table will be made visible.
+        md: Optional metadata dictionary. If not provided, metadata will be read from the tarball's `metadata.json`.
+            The metadata contains table_md, table_version_md, and table_schema_version_md entries for each ancestor
+            of the table being restored, as written out by `TablePackager`.
+    """
+
+    tbl_path: str
+    md: Optional[dict[str, Any]]
+    tmp_dir: Path
+    media_files: dict[str, str]  # Mapping from pxtmedia:// URLs to local file:// URLs
+
+    def __init__(self, tbl_path: str, md: Optional[dict[str, Any]] = None) -> None:
+        self.tbl_path = tbl_path
+        self.md = md
+        self.tmp_dir = Path(Env.get().create_tmp_path())
+        self.media_files = {}
+
+    def restore(self, bundle_path: Path) -> pxt.Table:
+        # Extract tarball
+        print(f'Extracting table data into: {self.tmp_dir}')
+        with tarfile.open(bundle_path, 'r:bz2') as tf:
+            tf.extractall(path=self.tmp_dir)
+
+        if self.md is None:
+            # No metadata supplied; read it from the archive
+            with open(self.tmp_dir / 'metadata.json', 'r', encoding='utf8') as fp:
+                self.md = json.load(fp)
+
+        pxt_md_version = self.md['pxt_md_version']
+        assert isinstance(pxt_md_version, int)
+
+        if pxt_md_version != metadata.VERSION:
+            raise excs.Error(
+                f'Pixeltable metadata version mismatch: {pxt_md_version} != {metadata.VERSION}.\n'
+                'Please upgrade Pixeltable to use this dataset: pip install -U pixeltable'
+            )
+
+        tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]
+
+        # Create the replica table
+        # TODO: This needs to be made concurrency-safe.
+        replica_tbl = catalog.Catalog.get().create_replica(catalog.Path(self.tbl_path), tbl_md)
+        assert replica_tbl._tbl_version.get().is_snapshot
+
+        # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
+        # replica_tbl itself if it's a pure snapshot.
+        if replica_tbl._id != replica_tbl._tbl_version.id:
+            ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
+        else:
+            ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
+
+        # Instantiate data from the Parquet tables.
+        with Env.get().begin_xact():
+            for md in ancestor_md[::-1]:  # Base table first
+                # Create a TableVersion instance (and a store table) for this ancestor.
+                tv = catalog.TableVersion.create_replica(md)
+                # Now import data from Parquet.
+                _logger.info(f'Importing table {tv.name!r}.')
+                self.__import_table(self.tmp_dir, tv, md)
+
+        return replica_tbl
+
+    def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
+        """
+        Import the Parquet table into the Pixeltable catalog.
+        """
+        tbl_id = uuid.UUID(tbl_md.tbl_md.tbl_id)
+        parquet_dir = bundle_path / 'tables' / f'tbl_{tbl_id.hex}'
+        parquet_table = pq.read_table(str(parquet_dir))
+
+        for batch in parquet_table.to_batches():
+            pydict = batch.to_pydict()
+            rows = self.__from_pa_pydict(tv, pydict)
+            tv.store_tbl.load_rows(rows)
+
+    def __from_pa_pydict(self, tv: catalog.TableVersion, pydict: dict[str, Any]) -> list[dict[str, Any]]:
+        # Data conversions from pyarrow to Pixeltable
+        sql_types: dict[str, sql.types.TypeEngine[Any]] = {}
+        for col_name in pydict:
+            assert col_name in tv.store_tbl.sa_tbl.columns
+            sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
+        media_col_ids: dict[str, int] = {}
+        for col in tv.cols_by_name.values():
+            if col.is_stored and col.col_type.is_media_type():
+                media_col_ids[col.store_name()] = col.id
+
+        row_count = len(next(iter(pydict.values())))
+        rows: list[dict[str, Any]] = []
+        for i in range(row_count):
+            row = {
+                col_name: self.__from_pa_value(tv, col_vals[i], sql_types[col_name], media_col_ids.get(col_name))
+                for col_name, col_vals in pydict.items()
+            }
+            rows.append(row)
+
+        return rows
+
+    def __from_pa_value(
+        self, tv: catalog.TableVersion, val: Any, sql_type: sql.types.TypeEngine[Any], media_col_id: Optional[int]
+    ) -> Any:
+        if val is None:
+            return None
+        if isinstance(sql_type, sql.JSON):
+            return json.loads(val)
+        if media_col_id is not None:
+            assert isinstance(val, str)
+            return self.__relocate_media_file(tv, media_col_id, val)
+        return val
+
+    def __relocate_media_file(self, tv: catalog.TableVersion, media_col_id: int, url: str) -> str:
+        # If this is a pxtmedia:// URL, relocate it
+        parsed_url = urllib.parse.urlparse(url)
+        assert parsed_url.scheme != 'file'  # These should all have been converted to pxtmedia:// URLs
+        if parsed_url.scheme == 'pxtmedia':
+            if url not in self.media_files:
+                # First time seeing this pxtmedia:// URL. Relocate the file to the media store and record the mapping
+                # in self.media_files.
+                src_path = self.tmp_dir / 'media' / parsed_url.netloc
+                dest_path = MediaStore.prepare_media_path(tv.id, media_col_id, tv.version, ext=src_path.suffix)
+                src_path.rename(dest_path)
+                self.media_files[url] = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+            return self.media_files[url]
+        # For any type of URL other than a local file, just return the URL as-is.
+        return url
pixeltable/share/publish.py
CHANGED
@@ -9,10 +9,9 @@ from tqdm import tqdm
 import pixeltable as pxt
 from pixeltable import exceptions as excs
 from pixeltable.env import Env
-from pixeltable.metadata.schema import FullTableMd
 from pixeltable.utils import sha256sum

-from .packager import TablePackager
+from .packager import TablePackager, TableRestorer

 # These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
 # pixeltable.com URLs are available.
@@ -20,7 +19,10 @@ from .packager import TablePackager
 PIXELTABLE_API_URL = 'https://internal-api.pixeltable.com'


-def
+def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
+    if not src_tbl._tbl_version.get().is_snapshot:
+        raise excs.Error('Only snapshots may be published.')
+
     packager = TablePackager(src_tbl, additional_md={'table_uri': dest_tbl_uri})
     request_json = packager.md | {'operation_type': 'publish_snapshot'}
     headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
@@ -65,18 +67,6 @@ def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     return confirmed_tbl_uri


-def clone_snapshot(dest_tbl_uri: str) -> list[FullTableMd]:
-    headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
-    clone_request_json = {'operation_type': 'clone_snapshot', 'table_uri': dest_tbl_uri}
-    response = requests.post(PIXELTABLE_API_URL, json=clone_request_json, headers=headers_json)
-    if response.status_code != 200:
-        raise excs.Error(f'Error cloning snapshot: {response.text}')
-    response_json = response.json()
-    if not isinstance(response_json, dict) or 'table_uri' not in response_json:
-        raise excs.Error(f'Unexpected response from server.\n{response_json}')
-    return [FullTableMd.from_dict(t) for t in response_json['md']['tables']]
-
-
 def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
     from pixeltable.utils.s3 import get_client
@@ -102,5 +92,60 @@ def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult
         file=sys.stdout,
     )
     s3_client.upload_file(
-        Filename=str(bundle), Bucket=bucket, Key=
+        Filename=str(bundle), Bucket=bucket, Key=remote_path, ExtraArgs=upload_args, Callback=progress_bar.update
+    )
+
+
+def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
+    headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
+    clone_request_json = {'operation_type': 'clone_snapshot', 'table_uri': src_tbl_uri}
+    response = requests.post(PIXELTABLE_API_URL, json=clone_request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error cloning snapshot: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or 'table_uri' not in response_json:
+        raise excs.Error(f'Error cloning shapshot: unexpected response from server.\n{response_json}')
+
+    primary_tbl_additional_md = response_json['md']['tables'][0]['table_md']['additional_md']
+    bundle_uri = primary_tbl_additional_md['destination_uri']
+    bundle_filename = primary_tbl_additional_md['datafile']
+    parsed_location = urllib.parse.urlparse(bundle_uri)
+    if parsed_location.scheme == 's3':
+        bundle_path = _download_bundle_from_s3(parsed_location, bundle_filename)
+    else:
+        raise excs.Error(f'Unexpected response from server: unsupported bundle uri: {bundle_uri}')
+
+    restorer = TableRestorer(dest_path, response_json)
+    tbl = restorer.restore(bundle_path)
+    Env.get().console_logger.info(f'Created local replica {tbl._path!r} from URI: {src_tbl_uri}')
+    return tbl
+
+
+def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_filename: str) -> Path:
+    from pixeltable.utils.s3 import get_client
+
+    bucket = parsed_location.netloc
+    remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
+    remote_path = str(remote_dir / bundle_filename)[1:]  # Remove initial /
+
+    Env.get().console_logger.info(f'Downloading snapshot from: {bucket}:{remote_path}')
+
+    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
+    s3_client = get_client(**boto_config)
+
+    obj = s3_client.head_object(Bucket=bucket, Key=remote_path)  # Check if the object exists
+    bundle_size = obj['ContentLength']
+
+    bundle_path = Path(Env.get().create_tmp_path())
+    progress_bar = tqdm(
+        desc='Downloading',
+        total=bundle_size,
+        unit='B',
+        unit_scale=True,
+        unit_divisor=1024,
+        miniters=1,
+        ncols=100,
+        file=sys.stdout,
     )
+    s3_client.download_file(Bucket=bucket, Key=remote_path, Filename=str(bundle_path), Callback=progress_bar.update)
+    return bundle_path
pixeltable/store.py
CHANGED
@@ -7,8 +7,9 @@ import sys
 import urllib.parse
 import urllib.request
 import warnings
-from typing import Any, Iterator, Literal, Optional, Union
+from typing import Any, Iterable, Iterator, Literal, Optional, Union

+import more_itertools
 import sqlalchemy as sql
 from tqdm import TqdmWarning, tqdm

@@ -190,34 +191,29 @@ class StoreBase:
         assert col.is_stored
         conn = Env.get().conn
         col_type_str = col.get_sa_col_type().compile(dialect=conn.dialect)
-
-        log_stmt(_logger, stmt)
-        conn.execute(stmt)
+        s_txt = f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.store_name()} {col_type_str} NULL'
         added_storage_cols = [col.store_name()]
         if col.records_errors:
             # we also need to create the errormsg and errortype storage cols
-
-
-            )
-            conn.execute(stmt)
-            stmt = sql.text(
-                f'ALTER TABLE {self._storage_name()} ADD COLUMN {col.errortype_store_name()} VARCHAR DEFAULT NULL'
-            )
-            conn.execute(stmt)
+            s_txt += f' , ADD COLUMN {col.errormsg_store_name()} VARCHAR DEFAULT NULL'
+            s_txt += f' , ADD COLUMN {col.errortype_store_name()} VARCHAR DEFAULT NULL'
             added_storage_cols.extend([col.errormsg_store_name(), col.errortype_store_name()])
+
+        stmt = sql.text(s_txt)
+        log_stmt(_logger, stmt)
+        conn.execute(stmt)
         self.create_sa_tbl()
         _logger.info(f'Added columns {added_storage_cols} to storage table {self._storage_name()}')

     def drop_column(self, col: catalog.Column) -> None:
         """Execute Alter Table Drop Column statement"""
-
-        stmt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.store_name()}'
-        conn.execute(sql.text(stmt))
+        s_txt = f'ALTER TABLE {self._storage_name()} DROP COLUMN {col.store_name()}'
         if col.records_errors:
-
-
-
-
+            s_txt += f' , DROP COLUMN {col.errormsg_store_name()}'
+            s_txt += f' , DROP COLUMN {col.errortype_store_name()}'
+        stmt = sql.text(s_txt)
+        log_stmt(_logger, stmt)
+        Env.get().conn.execute(stmt)

     def load_column(
         self, col: catalog.Column, exec_plan: ExecNode, value_expr_slot_idx: int, on_error: Literal['abort', 'ignore']
@@ -313,7 +309,7 @@ class StoreBase:
     def insert_rows(
         self,
         exec_plan: ExecNode,
-        v_min:
+        v_min: int,
         show_progress: bool = True,
         rowids: Optional[Iterator[int]] = None,
         abort_on_exc: bool = False,
@@ -432,6 +428,35 @@ class StoreBase:
         status = conn.execute(stmt)
         return status.rowcount

+    def dump_rows(self, version: int, filter_view: StoreBase, filter_view_version: int) -> Iterator[dict[str, Any]]:
+        filter_predicate = sql.and_(
+            filter_view.v_min_col <= filter_view_version,
+            filter_view.v_max_col > filter_view_version,
+            *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), filter_view.rowid_columns())],
+        )
+        stmt = (
+            sql.select('*')  # TODO: Use a more specific list of columns?
+            .select_from(self.sa_tbl)
+            .where(self.v_min_col <= version)
+            .where(self.v_max_col > version)
+            .where(sql.exists().where(filter_predicate))
+        )
+        conn = Env.get().conn
+        _logger.debug(stmt)
+        log_explain(_logger, stmt, conn)
+        result = conn.execute(stmt)
+        for row in result:
+            yield dict(zip(result.keys(), row))
+
+    def load_rows(self, rows: Iterable[dict[str, Any]], batch_size: int = 10_000) -> None:
+        """
+        When instantiating a replica, we can't rely on the usual insertion code path, which contains error handling
+        and other logic that doesn't apply.
+        """
+        conn = Env.get().conn
+        for batch in more_itertools.batched(rows, batch_size):
+            conn.execute(sql.insert(self.sa_tbl), batch)
+

 class StoreTable(StoreBase):
     def __init__(self, tbl_version: catalog.TableVersion):