pixeltable 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +9 -1
- pixeltable/catalog/catalog.py +559 -134
- pixeltable/catalog/column.py +36 -32
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +12 -0
- pixeltable/catalog/insertable_table.py +30 -25
- pixeltable/catalog/schema_object.py +9 -6
- pixeltable/catalog/table.py +334 -267
- pixeltable/catalog/table_version.py +360 -241
- pixeltable/catalog/table_version_handle.py +18 -2
- pixeltable/catalog/table_version_path.py +86 -23
- pixeltable/catalog/view.py +47 -23
- pixeltable/dataframe.py +198 -19
- pixeltable/env.py +6 -4
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/exec_node.py +2 -0
- pixeltable/exec/expr_eval/evaluators.py +4 -1
- pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/sql_node.py +188 -22
- pixeltable/exprs/column_property_ref.py +16 -6
- pixeltable/exprs/column_ref.py +33 -11
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/data_row.py +5 -3
- pixeltable/exprs/expr.py +11 -4
- pixeltable/exprs/literal.py +2 -0
- pixeltable/exprs/row_builder.py +4 -6
- pixeltable/exprs/rowid_ref.py +8 -0
- pixeltable/exprs/similarity_expr.py +1 -0
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +5 -3
- pixeltable/func/tools.py +12 -2
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +19 -45
- pixeltable/functions/deepseek.py +19 -38
- pixeltable/functions/fireworks.py +9 -18
- pixeltable/functions/gemini.py +165 -33
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/llama_cpp.py +6 -6
- pixeltable/functions/math.py +63 -0
- pixeltable/functions/mistralai.py +16 -53
- pixeltable/functions/ollama.py +1 -1
- pixeltable/functions/openai.py +82 -165
- pixeltable/functions/string.py +212 -58
- pixeltable/functions/together.py +22 -80
- pixeltable/globals.py +10 -4
- pixeltable/index/base.py +5 -0
- pixeltable/index/btree.py +5 -0
- pixeltable/index/embedding_index.py +5 -0
- pixeltable/io/external_store.py +10 -31
- pixeltable/io/label_studio.py +5 -5
- pixeltable/io/parquet.py +4 -4
- pixeltable/io/table_data_conduit.py +1 -32
- pixeltable/metadata/__init__.py +11 -2
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_30.py +6 -11
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/util.py +3 -9
- pixeltable/metadata/notes.py +3 -0
- pixeltable/metadata/schema.py +13 -1
- pixeltable/plan.py +135 -12
- pixeltable/share/packager.py +321 -20
- pixeltable/share/publish.py +2 -2
- pixeltable/store.py +31 -13
- pixeltable/type_system.py +30 -0
- pixeltable/utils/dbms.py +1 -1
- pixeltable/utils/formatter.py +64 -42
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/METADATA +2 -1
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/RECORD +79 -74
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/entry_points.txt +0 -0
pixeltable/share/packager.py
CHANGED
@@ -1,4 +1,7 @@
+import base64
 import datetime
+import io
+import itertools
 import json
 import logging
 import tarfile
@@ -7,16 +10,21 @@ import urllib.request
 import uuid
 from pathlib import Path
 from typing import Any, Iterator, Optional
+from uuid import UUID

 import more_itertools
+import numpy as np
+import PIL.Image
 import pyarrow as pa
 import pyarrow.parquet as pq
 import sqlalchemy as sql

 import pixeltable as pxt
-from pixeltable import catalog, exceptions as excs, metadata
+from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
 from pixeltable.env import Env
 from pixeltable.metadata import schema
+from pixeltable.utils import sha256sum
+from pixeltable.utils.formatter import Formatter
 from pixeltable.utils.media_store import MediaStore

 _logger = logging.getLogger('pixeltable')
@@ -44,13 +52,17 @@ class TablePackager:
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
     md: dict[str, Any]

+    bundle_path: Path
+    preview_header: dict[str, str]
+    preview: list[list[Any]]
+
     def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
         self.table = table
         self.tmp_dir = Path(Env.get().create_tmp_path())
         self.media_files = {}

         # Load metadata
-        with
+        with catalog.Catalog.get().begin_xact(for_write=False):
             tbl_md = catalog.Catalog.get().load_replica_md(table)
         self.md = {
             'pxt_version': pxt.__version__,
@@ -65,20 +77,29 @@ class TablePackager:
         Export the table to a tarball containing Parquet tables and media files.
         """
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
-
+
+        _logger.info(f'Packaging table {self.table._path()!r} and its ancestors in: {self.tmp_dir}')
         self.tmp_dir.mkdir()
         with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
             json.dump(self.md, fp)
         self.tables_dir = self.tmp_dir / 'tables'
         self.tables_dir.mkdir()
-        with
+        with catalog.Catalog.get().begin_xact(for_write=False):
             for tv in self.table._tbl_version_path.get_tbl_versions():
-                _logger.info(f
+                _logger.info(f'Exporting table {tv.get().versioned_name!r}.')
                 self.__export_table(tv.get())
+
         _logger.info('Building archive.')
-        bundle_path = self.__build_tarball()
-
-
+        self.bundle_path = self.__build_tarball()
+
+        _logger.info('Extracting preview data.')
+        self.md['count'] = self.table.count()
+        preview_header, preview = self.__extract_preview_data()
+        self.md['preview_header'] = preview_header
+        self.md['preview'] = preview
+
+        _logger.info(f'Packaging complete: {self.bundle_path}')
+        return self.bundle_path

     def __export_table(self, tv: catalog.TableVersion) -> None:
         """
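The reworked package() method now records a row count and a small data preview in the bundle metadata before returning the tarball path. As a rough sketch, the resulting metadata.json gains entries along these lines; the field names 'pxt_version', 'count', 'preview_header', and 'preview' come from the diff, while the example values, column names, and base64 payload are invented for illustration:

```python
# Hypothetical contents of the preview-related fields written by package().
example_md = {
    'pxt_version': '0.4.0',
    'count': 1284,  # total row count of the packaged table
    'preview_header': {'id': 'Int', 'prompt': 'String', 'thumbnail': 'Image'},  # first 8 columns
    'preview': [
        [42],                    # ints/floats/bools pass through unchanged
        ['A short prompt ...'],  # strings are abbreviated
        ['UklGRl4AAABXRUJQ'],    # images become base64-encoded webp thumbnails
    ],
}
```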
@@ -88,7 +109,7 @@ class TablePackager:
         assert any(tv.id == base.id for base in self.table._tbl_version_path.get_tbl_versions())
         sql_types = {col.name: col.type for col in tv.store_tbl.sa_tbl.columns}
         media_cols: set[str] = set()
-        for col in tv.
+        for col in tv.cols:
             if col.is_stored and col.col_type.is_media_type():
                 media_cols.add(col.store_name())

@@ -106,7 +127,7 @@ class TablePackager:
         # We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
         # faster compression should provide good performance while still reducing temporary storage utilization.
         parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='SNAPPY')
-        filter_tv = self.table.
+        filter_tv = self.table._tbl_version_path.tbl_version.get()
         row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
         for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
             parquet_writer.write_table(pa_table)
@@ -182,7 +203,12 @@ class TablePackager:
         path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_url.path)))
         if path not in self.media_files:
             # Create a new entry in the `media_files` dict so that we can copy the file into the tarball later.
-
+            # We name the media files in the archive by their SHA256 hash. This ensures that we can properly
+            # deduplicate and validate them later.
+            # If we get a collision, it's not a problem; it just means we have two identical files (which will
+            # be conveniently deduplicated in the bundle).
+            sha = sha256sum(path)
+            dest_name = f'{sha}{path.suffix}'
             self.media_files[path] = dest_name
         return f'pxtmedia://{self.media_files[path]}'
         # For any type of URL other than a local file, just return the URL as-is.
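The media-handling hunk above switches the archive to content-addressed file names: each local media file is renamed to its SHA256 digest plus the original suffix, so identical files collapse to a single tarball entry. A minimal standalone sketch of the same idea, using hashlib in place of pixeltable's sha256sum helper (whose exact signature is assumed):

```python
import hashlib
from pathlib import Path


def content_addressed_name(path: Path, chunk_size: int = 1 << 20) -> str:
    """Illustrative stand-in for the SHA256-based naming used in the hunk above."""
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        # Hash in chunks so large media files never need to fit in memory.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    # Identical files map to the same name, so duplicates are stored only once in the bundle.
    return f'{h.hexdigest()}{path.suffix}'
```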
@@ -200,6 +226,96 @@ class TablePackager:
             tf.add(src_file, arcname=f'media/{dest_name}')
         return bundle_path

+    def __extract_preview_data(self) -> tuple[dict[str, str], list[list[Any]]]:
+        """
+        Extract a preview of the table data for display in the UI.
+
+        In order to bound the size of the output data, all "unbounded" data types are resized:
+        - Strings are abbreviated as per Formatter.abbreviate()
+        - Arrays and JSON are shortened and formatted as strings
+        - Images are resized to thumbnail size as a base64-encoded webp
+        - Videos are replaced by their first frame and resized as above
+        - Documents are replaced by a thumbnail as a base64-encoded webp
+        """
+        # First 8 columns
+        preview_cols = dict(itertools.islice(self.table._get_schema().items(), 0, 8))
+        select_list = [self.table[col_name] for col_name in preview_cols]
+        # First 5 rows
+        rows = list(self.table.select(*select_list).head(n=5))
+
+        preview_header = {col_name: str(col_type._type) for col_name, col_type in preview_cols.items()}
+        preview = [
+            [self.__encode_preview_data(val, col_type)]
+            for row in rows
+            for val, col_type in zip(row.values(), preview_cols.values())
+        ]
+
+        return preview_header, preview
+
+    def __encode_preview_data(self, val: Any, col_type: ts.ColumnType) -> Any:
+        if val is None:
+            return None
+
+        match col_type._type:
+            case ts.ColumnType.Type.STRING:
+                assert isinstance(val, str)
+                return Formatter.abbreviate(val)
+
+            case ts.ColumnType.Type.INT | ts.ColumnType.Type.FLOAT | ts.ColumnType.Type.BOOL:
+                return val
+
+            case ts.ColumnType.Type.TIMESTAMP | ts.ColumnType.Type.DATE:
+                return str(val)
+
+            case ts.ColumnType.Type.ARRAY:
+                assert isinstance(val, np.ndarray)
+                return Formatter.format_array(val)
+
+            case ts.ColumnType.Type.JSON:
+                # We need to escape the JSON string server-side for security reasons.
+                # Therefore we don't escape it here, in order to avoid double-escaping.
+                return Formatter.format_json(val, escape_strings=False)
+
+            case ts.ColumnType.Type.IMAGE:
+                # Rescale the image to minimize data transfer size
+                assert isinstance(val, PIL.Image.Image)
+                return self.__encode_image(val)
+
+            case ts.ColumnType.Type.VIDEO:
+                assert isinstance(val, str)
+                return self.__encode_video(val)
+
+            case ts.ColumnType.Type.AUDIO:
+                return None
+
+            case ts.ColumnType.Type.DOCUMENT:
+                assert isinstance(val, str)
+                return self.__encode_document(val)
+
+            case _:
+                raise AssertionError(f'Unrecognized column type: {col_type._type}')
+
+    def __encode_image(self, img: PIL.Image.Image) -> str:
+        # Heuristic for thumbnail sizing:
+        # Standardize on a width of 240 pixels (to most efficiently utilize the columnar display).
+        # But, if the aspect ratio is below 2:3, bound the height at 360 pixels (to avoid unboundedly tall thumbnails
+        # in the case of highly oblong images).
+        if img.height > img.width * 1.5:
+            scaled_img = img.resize((img.width * 360 // img.height, 360))
+        else:
+            scaled_img = img.resize((240, img.height * 240 // img.width))
+        with io.BytesIO() as buffer:
+            scaled_img.save(buffer, 'webp')
+            return base64.b64encode(buffer.getvalue()).decode()
+
+    def __encode_video(self, video_path: str) -> Optional[str]:
+        thumb = Formatter.extract_first_video_frame(video_path)
+        return self.__encode_image(thumb) if thumb is not None else None
+
+    def __encode_document(self, doc_path: str) -> Optional[str]:
+        thumb = Formatter.make_document_thumbnail(doc_path)
+        return self.__encode_image(thumb) if thumb is not None else None
+

 class TableRestorer:
     """
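The thumbnail heuristic in __encode_image pins the width at 240 pixels unless the image is taller than 3:2, in which case the height is pinned at 360 pixels instead. A small sketch of that arithmetic on two made-up image sizes (pure PIL, no Pixeltable dependencies):

```python
import PIL.Image


def thumbnail_size(width: int, height: int) -> tuple[int, int]:
    # Same arithmetic as __encode_image above.
    if height > width * 1.5:
        return (width * 360 // height, 360)  # very tall image: cap height at 360px
    return (240, height * 240 // width)      # default: standardize on a 240px width


print(thumbnail_size(4000, 3000))  # landscape photo -> (240, 180)
print(thumbnail_size(1080, 1920))  # portrait frame  -> (202, 360)

thumb = PIL.Image.new('RGB', (4000, 3000)).resize(thumbnail_size(4000, 3000))
```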
@@ -247,13 +363,26 @@ class TableRestorer:
         tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]

         # Create the replica table
-        #
-
-
+        # The logic here needs to be completely restructured in order to make it concurrency-safe.
+        # - Catalog.create_replica() needs to write the metadata and also create the physical store tables
+        #   and populate them, otherwise concurrent readers will see an inconsistent state (table metadata w/o
+        #   an actual table)
+        # - this could be done one replica at a time (instead of the entire hierarchy)
+        cat = catalog.Catalog.get()
+        cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+        # don't call get_table() until after the calls to create_replica() and __import_table() below;
+        # the TV instances created by get_table() would be replaced by create_replica(), which creates duplicate
+        # TV instances for the same replica version, which then leads to failures when constructing queries

         # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
         # replica_tbl itself if it's a pure snapshot.
-
+        target_md = tbl_md[0]
+        is_pure_snapshot = (
+            target_md.tbl_md.view_md is not None
+            and target_md.tbl_md.view_md.predicate is None
+            and len(target_md.schema_version_md.columns) == 0
+        )
+        if is_pure_snapshot:
             ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
         else:
             ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
@@ -267,7 +396,8 @@ class TableRestorer:
             _logger.info(f'Importing table {tv.name!r}.')
             self.__import_table(self.tmp_dir, tv, md)

-
+        with cat.begin_xact(for_write=False):
+            return cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id))

     def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
         """
@@ -276,11 +406,182 @@ class TableRestorer:
         tbl_id = uuid.UUID(tbl_md.tbl_md.tbl_id)
         parquet_dir = bundle_path / 'tables' / f'tbl_{tbl_id.hex}'
         parquet_table = pq.read_table(str(parquet_dir))
-
-
+        replica_version = tv.version
+
+        conn = Env.get().conn
+        store_sa_tbl = tv.store_tbl.sa_tbl
+        store_sa_tbl_name = tv.store_tbl._storage_name()
+
+        # Sometimes we are importing a table that has never been seen before. Other times, however, we are importing
+        # an existing replica table, and the table version and/or row selection differs from what was imported
+        # previously. Care must be taken to ensure that the new data is merged with existing data in a way that
+        # yields an internally consistent version history for each row.
+
+        # The overall strategy is this:
+        # 1. Import the parquet data into a temporary table;
+        # 2. "rectify" the v_max values in both the temporary table and the existing table (more on this below);
+        # 3. Delete any row instances from the temporary table that are already present in the existing table;
+        # 4. Copy the remaining rows from the temporary table into the existing table.
+
+        # Create a temporary table for the initial data load, containing columns for all columns present in the
+        # parquet table. The parquet columns have identical names to those in the store table, so we can use the
+        # store table schema to get their SQL types (which are not necessarily derivable from their Parquet types,
+        # e.g., pa.string() may hold either VARCHAR or serialized JSONB).
+        temp_cols: dict[str, sql.Column] = {}
+        for field in parquet_table.schema:
+            assert field.name in store_sa_tbl.columns
+            col_type = store_sa_tbl.columns[field.name].type
+            temp_cols[field.name] = sql.Column(field.name, col_type)
+        temp_sa_tbl_name = f'temp_{uuid.uuid4().hex}'
+        _logger.debug(f'Creating temporary table: {temp_sa_tbl_name}')
+        temp_md = sql.MetaData()
+        temp_sa_tbl = sql.Table(temp_sa_tbl_name, temp_md, *temp_cols.values(), prefixes=['TEMPORARY'])
+        temp_sa_tbl.create(conn)
+
+        # Populate the temporary table with data from the Parquet file.
+        _logger.debug(f'Loading {parquet_table.num_rows} row(s) into temporary table: {temp_sa_tbl_name}')
+        for batch in parquet_table.to_batches(max_chunksize=10_000):
             pydict = batch.to_pydict()
             rows = self.__from_pa_pydict(tv, pydict)
-
+            conn.execute(sql.insert(temp_sa_tbl), rows)
+
+        # Each row version is identified uniquely by its pk, a tuple (row_id, pos_0, pos_1, ..., pos_k, v_min).
+        # Conversely, v_max is not part of the primary key, but is simply a bookkeeping device.
+        # In an original table, v_max is always equal to the v_min of the succeeding row instance with the same
+        # row id, or MAX_VERSION if no such row instance exists. But in the replica, we need to be careful, since
+        # we might see only a subset of the original table's versions, and we might see them out of order.
+
+        # We'll adjust the v_max values according to the principle of "latest provable v_max":
+        # they will always correspond to the latest version for which we can prove the row instance was alive. This
+        # will enable us to maintain consistency of the v_max values if additional table versions are later imported,
+        # regardless of the order in which they are seen. It also means that replica tables (unlike original tables)
+        # may have gaps in their row version histories, but this is fine; the gaps simply correspond to table versions
+        # that have never been observed.
+
+        pk_predicates = [col == temp_cols[col.name] for col in tv.store_tbl.pk_columns()]
+        pk_clause = sql.and_(*pk_predicates)
+
+        # If the same pk exists in both the temporary table and the existing table, then the corresponding row data
+        # must be identical; the rows can differ only in their v_max value. As a sanity check, we go through the
+        # motion of verifying this; a failure implies data corruption in either the replica being imported or in a
+        # previously imported replica.
+
+        system_col_names = {col.name for col in tv.store_tbl.system_columns()}
+        media_col_names = {col.store_name() for col in tv.cols if col.col_type.is_media_type() and col.is_stored}
+        value_store_cols = [
+            store_sa_tbl.c[col_name]
+            for col_name in temp_cols
+            if col_name not in system_col_names and col_name not in media_col_names
+        ]
+        value_temp_cols = [
+            col
+            for col_name, col in temp_cols.items()
+            if col_name not in system_col_names and col_name not in media_col_names
+        ]
+        mismatch_predicates = [store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)]
+        mismatch_clause = sql.or_(*mismatch_predicates)
+
+        # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
+        # one value column. Pseudo-SQL:
+        #
+        # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
+        # FROM store_tbl, temp_tbl
+        # WHERE store_tbl.rowid = temp_tbl.rowid
+        #   AND store_tbl.pos_0 = temp_tbl.pos_0
+        #   AND ... AND store_tbl.pos_k = temp_tbl.pos_k
+        #   AND store_tbl.v_min = temp_tbl.v_min
+        #   AND (
+        #     store_tbl.col_0 != temp_tbl.col_0
+        #     OR store_tbl.col_1 != temp_tbl.col_1
+        #     OR ... OR store_tbl.col_n != temp_tbl.col_n
+        #   )
+        #
+        # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
+        # either column is NULL; this is what we want, since it may indicate a column that is present in one version
+        # but not the other.
+        q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
+        _logger.debug(q.compile())
+        result = conn.execute(q)
+        if result.rowcount > 0:
+            _logger.debug(
+                f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
+                f'{result.rowcount} inconsistent row(s).'
+            )
+            row = result.first()
+            _logger.debug('Example mismatch:')
+            _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
+            _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
+            raise excs.Error(
+                'Data corruption error: the replica data are inconsistent with data retrieved from a previous replica.'
+            )
+        _logger.debug(f'Verified data integrity between {store_sa_tbl_name!r} and {temp_sa_tbl_name!r}.')
+
+        # Now rectify the v_max values in the temporary table.
+        # If a row instance has a concrete v_max value, then we know it's genuine: it's the unique and immutable
+        # version when the row was deleted. (This can only happen if later versions of the base table already
+        # existed at the time this replica was published.)
+        # But if a row instance has a v_max value of MAX_VERSION, then we don't know anything about its future.
+        # It might live indefinitely, or it might be deleted as early as version `n + 1`. Following the principle
+        # of "latest provable v_max", we simply set v_max equal to `n + 1`.
+        q = (
+            temp_sa_tbl.update()
+            .values(v_max=(replica_version + 1))
+            .where(temp_sa_tbl.c.v_max == schema.Table.MAX_VERSION)
+        )
+        _logger.debug(q.compile())
+        result = conn.execute(q)
+        _logger.debug(f'Rectified {result.rowcount} row(s) in {temp_sa_tbl_name!r}.')
+
+        # Now rectify the v_max values in the existing table. This is done by simply taking the later of the two v_max
+        # values (the existing one and the new one) for each row instance, following the "latest provable v_max"
+        # principle. Obviously we only need to do this for rows that exist in both tables (it's a simple join).
+        q = (
+            store_sa_tbl.update()
+            .values(v_max=sql.func.greatest(store_sa_tbl.c.v_max, temp_sa_tbl.c.v_max))
+            .where(pk_clause)
+        )
+        _logger.debug(q.compile())
+        result = conn.execute(q)
+        _logger.debug(f'Rectified {result.rowcount} row(s) in {store_sa_tbl_name!r}.')
+
+        # Now we need to update rows in the existing table that are also present in the temporary table. This is to
+        # account for the scenario where the temporary table has columns that are not present in the existing table.
+        # (We can't simply replace the rows with their versions in the temporary table, because the converse scenario
+        # might also occur; there may be columns in the existing table that are not present in the temporary table.)
+        value_update_clauses: dict[str, sql.ColumnElement] = {}
+        for temp_col in temp_cols.values():
+            if temp_col.name not in system_col_names:
+                store_col = store_sa_tbl.c[temp_col.name]
+                # Prefer the value from the existing table, substituting the value from the temporary table if it's
+                # NULL. This works in all cases (including media columns, where we prefer the existing media file).
+                clause = sql.case((store_col == None, temp_col), else_=store_col)
+                value_update_clauses[temp_col.name] = clause
+        if len(value_update_clauses) > 0:
+            q = store_sa_tbl.update().values(**value_update_clauses).where(pk_clause)
+            _logger.debug(q.compile())
+            result = conn.execute(q)
+            _logger.debug(
+                f'Merged values from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r} for {result.rowcount} row(s).'
+            )
+
+        # Now drop any rows from the temporary table that are also present in the existing table.
+        # The v_max values have been rectified, data has been merged into NULL cells, and all other row values have
+        # been verified identical.
+        # TODO: Delete any media files that were orphaned by this operation (they're necessarily duplicates of media
+        # files that are already present in the existing table).
+        q = temp_sa_tbl.delete().where(pk_clause)
+        _logger.debug(q.compile())
+        result = conn.execute(q)
+        _logger.debug(f'Deleted {result.rowcount} row(s) from {temp_sa_tbl_name!r}.')
+
+        # Finally, copy the remaining data (consisting entirely of new row instances) from the temporary table into
+        # the actual table.
+        q = store_sa_tbl.insert().from_select(
+            [store_sa_tbl.c[col_name] for col_name in temp_cols], sql.select(*temp_cols.values())
+        )
+        _logger.debug(q.compile())
+        result = conn.execute(q)
+        _logger.debug(f'Inserted {result.rowcount} row(s) from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r}.')

     def __from_pa_pydict(self, tv: catalog.TableVersion, pydict: dict[str, Any]) -> list[dict[str, Any]]:
         # Data conversions from pyarrow to Pixeltable
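The long __import_table hunk above merges imported rows under the "latest provable v_max" rule: an open-ended v_max (MAX_VERSION) in the incoming data is clamped to the replica version plus one, and rows already present locally keep the later of the two v_max values. A toy illustration of that rule on plain dicts; the MAX_VERSION constant and the row contents are made up:

```python
MAX_VERSION = 2**63 - 1   # placeholder; not Pixeltable's actual constant
replica_version = 7       # version of the replica being imported

# Step 1: clamp open-ended v_max values in the incoming rows to replica_version + 1.
incoming = [
    {'rowid': 1, 'v_min': 5, 'v_max': MAX_VERSION},  # still alive as of this replica
    {'rowid': 2, 'v_min': 6, 'v_max': 7},            # genuinely deleted at version 7
]
for r in incoming:
    if r['v_max'] == MAX_VERSION:
        r['v_max'] = replica_version + 1  # provably alive through version 7, unknown beyond

# Step 2: for pks that already exist locally, keep the later of the two v_max values.
existing = {(1, 5): {'rowid': 1, 'v_min': 5, 'v_max': 6}}  # from an earlier, older import
for r in incoming:
    pk = (r['rowid'], r['v_min'])
    if pk in existing:
        existing[pk]['v_max'] = max(existing[pk]['v_max'], r['v_max'])

print(existing[(1, 5)]['v_max'])  # -> 8
```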
@@ -289,7 +590,7 @@ class TableRestorer:
         assert col_name in tv.store_tbl.sa_tbl.columns
         sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
         media_col_ids: dict[str, int] = {}
-        for col in tv.
+        for col in tv.cols:
             if col.is_stored and col.col_type.is_media_type():
                 media_col_ids[col.store_name()] = col.id

pixeltable/share/publish.py
CHANGED
@@ -35,7 +35,7 @@ def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     upload_id = response_json['upload_id']
     destination_uri = response_json['destination_uri']

-    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path()}' at: {dest_tbl_uri}")

    bundle = packager.package()

@@ -117,7 +117,7 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:

     restorer = TableRestorer(dest_path, response_json)
     tbl = restorer.restore(bundle_path)
-    Env.get().console_logger.info(f'Created local replica {tbl._path!r} from URI: {src_tbl_uri}')
+    Env.get().console_logger.info(f'Created local replica {tbl._path()!r} from URI: {src_tbl_uri}')
     return tbl

pixeltable/store.py
CHANGED
@@ -52,7 +52,11 @@ class StoreBase:
         # We need to declare a `base` variable here, even though it's only defined for instances of `StoreView`,
         # since it's referenced by various methods of `StoreBase`
         self.base = tbl_version.base.get().store_tbl if tbl_version.base is not None else None
-
+        # we're passing in tbl_version to avoid a circular call to TableVersionHandle.get()
+        self.create_sa_tbl(tbl_version)
+
+    def system_columns(self) -> list[sql.Column]:
+        return [*self._pk_cols, self.v_max_col]

     def pk_columns(self) -> list[sql.Column]:
         return self._pk_cols
@@ -74,11 +78,13 @@ class StoreBase:
         self._pk_cols = [*rowid_cols, self.v_min_col]
         return [*rowid_cols, self.v_min_col, self.v_max_col]

-    def create_sa_tbl(self) -> None:
+    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
         """Create self.sa_tbl from self.tbl_version."""
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
         system_cols = self._create_system_columns()
         all_cols = system_cols.copy()
-        for col in [c for c in
+        for col in [c for c in tbl_version.cols if c.is_stored]:
             # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
             # to the last sql.Table version we created and cannot be reused
             col.create_sa_cols()
@@ -96,16 +102,17 @@ class StoreBase:
         # - base x view joins can be executed as merge joins
         # - speeds up ORDER BY rowid DESC
         # - allows filtering for a particular table version in index scan
-        idx_name = f'sys_cols_idx_{
+        idx_name = f'sys_cols_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, *system_cols))

         # v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
-        idx_name = f'vmin_idx_{
+        idx_name = f'vmin_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using=Env.get().dbms.version_index_type))
-        idx_name = f'vmax_idx_{
+        idx_name = f'vmax_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=Env.get().dbms.version_index_type))

         self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)
+        # _logger.debug(f'created sa tbl for {tbl_version.id!s} (sa_tbl={id(self.sa_tbl):x}, tv={id(tbl_version):x})')

     @abc.abstractmethod
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
@@ -215,6 +222,15 @@ class StoreBase:
         log_stmt(_logger, stmt)
         Env.get().conn.execute(stmt)

+    def ensure_columns_exist(self, cols: Iterable[catalog.Column]) -> None:
+        conn = Env.get().conn
+        sql_text = f'SELECT column_name FROM information_schema.columns WHERE table_name = {self._storage_name()!r}'
+        result = conn.execute(sql.text(sql_text))
+        existing_cols = {row[0] for row in result}
+        for col in cols:
+            if col.store_name() not in existing_cols:
+                self.add_column(col)
+
     def load_column(
         self, col: catalog.Column, exec_plan: ExecNode, value_expr_slot_idx: int, on_error: Literal['abort', 'ignore']
     ) -> int:
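The new ensure_columns_exist() helper reconciles the physical store table with the catalog: it lists the columns the table already has via information_schema and adds whatever is missing. A minimal sketch of the same reconcile pattern in plain SQLAlchemy, against a hypothetical table name and column map rather than Pixeltable's Column/add_column machinery:

```python
import sqlalchemy as sql


def ensure_columns(conn: sql.Connection, table_name: str, wanted: dict[str, str]) -> None:
    """Add any columns from `wanted` (name -> SQL type) that the table does not have yet."""
    result = conn.execute(
        sql.text('SELECT column_name FROM information_schema.columns WHERE table_name = :t'),
        {'t': table_name},
    )
    existing = {row[0] for row in result}
    for name, sql_type in wanted.items():
        if name not in existing:
            # Only missing columns are added, so re-running this is a no-op.
            conn.execute(sql.text(f'ALTER TABLE {table_name} ADD COLUMN {name} {sql_type}'))
```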
@@ -273,7 +289,7 @@ class StoreBase:
         else:
             if col.col_type.is_image_type() and result_row.file_urls[value_expr_slot_idx] is None:
                 # we have yet to store this image
-                filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.
+                filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
                 result_row.flush_img(value_expr_slot_idx, filepath)
             val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
             if col.col_type.is_media_type():
@@ -403,9 +419,7 @@ class StoreBase:
             number of deleted rows
         """
         where_clause = sql.true() if where_clause is None else where_clause
-
-            self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION, where_clause
-        )
+        version_clause = sql.and_(self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION)
         rowid_join_clause = self._rowid_join_predicate()
         base_versions_clause = (
             sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)
@@ -416,10 +430,12 @@ class StoreBase:
             set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
             # set value column to NULL
             set_clause[index_info.val_col.sa_col] = None
+
         stmt = (
             sql.update(self.sa_tbl)
             .values(set_clause)
             .where(where_clause)
+            .where(version_clause)
             .where(rowid_join_clause)
             .where(base_versions_clause)
         )
@@ -516,10 +532,12 @@ class StoreComponentView(StoreView):
         self.rowid_cols.append(self.pos_col)
         return self.rowid_cols

-    def create_sa_tbl(self) -> None:
-
+    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
+        super().create_sa_tbl(tbl_version)
         # we need to fix up the 'pos' column in TableVersion
-
+        tbl_version.cols_by_name['pos'].sa_col = self.pos_col

     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
         return sql.and_(
pixeltable/type_system.py
CHANGED
@@ -395,6 +395,36 @@ class ColumnType:
             raise excs.Error(f'Standard Python type `{name}` cannot be used here; use `{suggestion}` instead')
         raise excs.Error(f'Unknown type: {t}')

+    @classmethod
+    def from_json_schema(cls, schema: dict[str, Any]) -> Optional[ColumnType]:
+        # We first express the JSON schema as a Python type, and then convert it to a Pixeltable type.
+        # TODO: Is there a meaningful fallback if one of these operations fails? (Maybe another use case for a pxt Any
+        # type?)
+        py_type = cls.__json_schema_to_py_type(schema)
+        return cls.from_python_type(py_type) if py_type is not None else None
+
+    @classmethod
+    def __json_schema_to_py_type(cls, schema: dict[str, Any]) -> Union[type, _GenericAlias, None]:
+        if 'type' in schema:
+            if schema['type'] == 'null':
+                return type(None)
+            if schema['type'] == 'string':
+                return str
+            if schema['type'] == 'integer':
+                return int
+            if schema['type'] == 'number':
+                return float
+            if schema['type'] == 'boolean':
+                return bool
+            if schema['type'] in ('array', 'object'):
+                return list
+        elif 'anyOf' in schema:
+            subscripts = tuple(cls.__json_schema_to_py_type(subschema) for subschema in schema['anyOf'])
+            if all(subscript is not None for subscript in subscripts):
+                return Union[subscripts]
+
+        return None
+
     def validate_literal(self, val: Any) -> None:
         """Raise TypeError if val is not a valid literal for this type"""
         if val is None:
pixeltable/utils/dbms.py
CHANGED
@@ -35,7 +35,7 @@ class PostgresqlDbms(Dbms):
     """

     def __init__(self, db_url: URL):
-        super().__init__('postgresql', '
+        super().__init__('postgresql', 'SERIALIZABLE', 'brin', db_url)

     def drop_db_stmt(self, database: str) -> str:
         return f'DROP DATABASE {database}'