pixeltable 0.3.14__py3-none-any.whl → 0.4.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +292 -105
- pixeltable/catalog/column.py +10 -8
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/insertable_table.py +25 -20
- pixeltable/catalog/schema_object.py +3 -6
- pixeltable/catalog/table.py +245 -189
- pixeltable/catalog/table_version.py +319 -201
- pixeltable/catalog/table_version_handle.py +15 -2
- pixeltable/catalog/table_version_path.py +60 -21
- pixeltable/catalog/view.py +14 -5
- pixeltable/dataframe.py +11 -9
- pixeltable/env.py +2 -4
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/sql_node.py +20 -11
- pixeltable/exprs/column_property_ref.py +15 -6
- pixeltable/exprs/column_ref.py +32 -11
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/row_builder.py +4 -6
- pixeltable/exprs/rowid_ref.py +8 -0
- pixeltable/exprs/similarity_expr.py +1 -0
- pixeltable/func/query_template_function.py +1 -1
- pixeltable/functions/gemini.py +166 -33
- pixeltable/functions/math.py +63 -0
- pixeltable/functions/string.py +212 -58
- pixeltable/globals.py +7 -4
- pixeltable/index/base.py +5 -0
- pixeltable/index/btree.py +5 -0
- pixeltable/index/embedding_index.py +5 -0
- pixeltable/io/external_store.py +8 -29
- pixeltable/io/label_studio.py +1 -1
- pixeltable/io/parquet.py +4 -4
- pixeltable/io/table_data_conduit.py +0 -31
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_30.py +6 -11
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/util.py +3 -9
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +5 -1
- pixeltable/plan.py +4 -4
- pixeltable/share/packager.py +207 -15
- pixeltable/share/publish.py +2 -2
- pixeltable/store.py +31 -13
- pixeltable/utils/dbms.py +1 -1
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0rc1.dist-info}/METADATA +1 -1
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0rc1.dist-info}/RECORD +50 -49
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0rc1.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0rc1.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.4.0rc1.dist-info}/entry_points.txt +0 -0
pixeltable/io/table_data_conduit.py
CHANGED

@@ -23,7 +23,6 @@ from .utils import normalize_schema_names
 
 _logger = logging.getLogger('pixeltable')
 
-# ---------------------------------------------------------------------------------------------------------
 
 if TYPE_CHECKING:
     import datasets  # type: ignore[import-untyped]
@@ -46,9 +45,6 @@ class TableDataConduitFormat(str, enum.Enum):
         return False
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 @dataclass
 class TableDataConduit:
     source: TableDataSource
@@ -129,9 +125,6 @@ class TableDataConduit:
         raise excs.Error(f'Missing required column(s) ({", ".join(missing_cols)})')
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class DFTableDataConduit(TableDataConduit):
     pxt_df: pxt.DataFrame = None
 
@@ -155,9 +148,6 @@ class DFTableDataConduit(TableDataConduit):
         self.check_source_columns_are_insertable(self.pxt_df.schema.keys())
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class RowDataTableDataConduit(TableDataConduit):
     raw_rows: Optional[RowData] = None
     disable_mapping: bool = True
@@ -235,9 +225,6 @@ class RowDataTableDataConduit(TableDataConduit):
         yield self.valid_rows
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class PandasTableDataConduit(TableDataConduit):
     pd_df: pd.DataFrame = None
     batch_count: int = 0
@@ -293,9 +280,6 @@ class PandasTableDataConduit(TableDataConduit):
         yield self.valid_rows
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class CSVTableDataConduit(TableDataConduit):
     @classmethod
     def from_tds(cls, tds: TableDataConduit) -> 'PandasTableDataConduit':
@@ -307,9 +291,6 @@ class CSVTableDataConduit(TableDataConduit):
         return PandasTableDataConduit.from_tds(t)
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class ExcelTableDataConduit(TableDataConduit):
     @classmethod
     def from_tds(cls, tds: TableDataConduit) -> 'PandasTableDataConduit':
@@ -321,9 +302,6 @@ class ExcelTableDataConduit(TableDataConduit):
         return PandasTableDataConduit.from_tds(t)
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class JsonTableDataConduit(TableDataConduit):
     @classmethod
     def from_tds(cls, tds: TableDataConduit) -> RowDataTableDataConduit:
@@ -346,9 +324,6 @@ class JsonTableDataConduit(TableDataConduit):
         return t2
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class HFTableDataConduit(TableDataConduit):
     hf_ds: Optional[Union[datasets.Dataset, datasets.DatasetDict]] = None
     column_name_for_split: Optional[str] = None
@@ -478,9 +453,6 @@ class HFTableDataConduit(TableDataConduit):
         yield batch
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class ParquetTableDataConduit(TableDataConduit):
     pq_ds: Optional[ParquetDataset] = None
 
@@ -542,9 +514,6 @@ class ParquetTableDataConduit(TableDataConduit):
         raise e
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class UnkTableDataConduit(TableDataConduit):
     """Source type is not known at the time of creation"""
 
pixeltable/metadata/__init__.py
CHANGED

@@ -16,7 +16,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
 
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION =
+VERSION = 36
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
pixeltable/metadata/converters/convert_13.py
CHANGED

@@ -12,9 +12,9 @@ _logger = logging.getLogger('pixeltable')
 @register_converter(version=13)
 def _(engine: sql.engine.Engine) -> None:
     with engine.begin() as conn:
-        for row in conn.execute(sql.select(Table)):
+        for row in conn.execute(sql.select(Table.id, Table.md)):
             id = row[0]
-            md = row[
+            md = row[1]
             updated_md = __update_md(md)
             if updated_md != md:
                 _logger.info(f'Updating schema for table: {id}')
pixeltable/metadata/converters/convert_30.py
CHANGED

@@ -1,33 +1,28 @@
 import copy
+from uuid import UUID
 
 import sqlalchemy as sql
 
 from pixeltable.metadata import register_converter
 from pixeltable.metadata.converters.util import (
-
+    convert_table_md,
     convert_table_schema_version_record,
     convert_table_version_record,
 )
-from pixeltable.metadata.schema import
+from pixeltable.metadata.schema import TableSchemaVersion, TableVersion
 
 
 @register_converter(version=30)
 def _(engine: sql.engine.Engine) -> None:
-
+    convert_table_md(engine, table_md_updater=__update_table_md)
     convert_table_version_record(engine, table_version_record_updater=__update_table_version_record)
     convert_table_schema_version_record(
         engine, table_schema_version_record_updater=__update_table_schema_version_record
     )
 
 
-def
-
-    Update TableMd with table_id
-    """
-    assert isinstance(record.md, dict)
-    md = copy.copy(record.md)
-    md['tbl_id'] = str(record.id)
-    record.md = md
+def __update_table_md(md: dict, tbl_id: UUID) -> None:
+    md['tbl_id'] = str(tbl_id)
 
 
 def __update_table_version_record(record: TableVersion) -> None:
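Note: converter 30 now delegates to convert_table_md() with a table_md_updater callback that mutates each metadata dict in place, instead of rewriting ORM records directly. A minimal sketch of that callback pattern follows; the in-memory `tables` dict stands in for the real `tables` store table, and the real convert_table_md also writes the updated JSON back via SQL.

    from typing import Callable, Optional
    from uuid import UUID, uuid4

    def convert_table_md(tables: dict[UUID, dict], table_md_updater: Optional[Callable[[dict, UUID], None]]) -> None:
        # Simplified stand-in for pixeltable.metadata.converters.util.convert_table_md.
        for tbl_id, md in tables.items():
            if table_md_updater is not None:
                table_md_updater(md, tbl_id)  # mutate the metadata dict in place

    def __update_table_md(md: dict, tbl_id: UUID) -> None:
        # The version-30 change: record the owning table's id inside its own metadata.
        md['tbl_id'] = str(tbl_id)

    tables = {uuid4(): {'name': 'movies'}}
    convert_table_md(tables, table_md_updater=__update_table_md)
    print(tables)  # each md dict now carries a 'tbl_id' entry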
pixeltable/metadata/converters/convert_35.py
ADDED

@@ -0,0 +1,9 @@
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+
+
+@register_converter(version=35)
+def _(engine: sql.engine.Engine) -> None:
+    with engine.begin() as conn:
+        conn.execute(sql.text('ALTER TABLE tables ADD COLUMN lock_dummy int8'))
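For context, converters like the one above are looked up by version number and applied in sequence until the stored metadata reaches the current VERSION (now 36). A rough sketch of that registry pattern, assuming a simplified in-memory registry; the names _converters and upgrade are illustrative, not the actual pixeltable internals.

    from typing import Callable, Optional

    VERSION = 36  # mirrors pixeltable.metadata.VERSION after this release
    _converters: dict[int, Callable] = {}

    def register_converter(version: int) -> Callable:
        # Register a function that migrates metadata *from* `version` to `version + 1`.
        def decorator(fn: Callable) -> Callable:
            _converters[version] = fn
            return fn
        return decorator

    @register_converter(version=35)
    def _(engine: Optional[object]) -> None:
        # In the real converter this issues: ALTER TABLE tables ADD COLUMN lock_dummy int8
        print('migrating metadata 35 -> 36')

    def upgrade(engine: Optional[object], db_version: int) -> None:
        # Apply one converter per version step until the on-disk metadata is current.
        for v in range(db_version, VERSION):
            _converters[v](engine)

    upgrade(engine=None, db_version=35)  # prints: migrating metadata 35 -> 36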
pixeltable/metadata/converters/util.py
CHANGED

@@ -33,9 +33,10 @@ def convert_table_md(
     the original entry will be replaced, and the traversal will continue with `v'`.
     """
     with engine.begin() as conn:
-
+        # avoid a SELECT * here, which breaks when we add new columns to Table
+        for row in conn.execute(sql.select(Table.id, Table.md)):
             tbl_id = row[0]
-            table_md = row[
+            table_md = row[1]
             assert isinstance(table_md, dict)
             updated_table_md = copy.deepcopy(table_md)
             if table_md_updater is not None:
@@ -145,13 +146,6 @@ def __update_schema_column(table_schema_version_md: dict, schema_column_updater:
         schema_column_updater(schema_col)
 
 
-def convert_table_record(engine: sql.engine.Engine, table_record_updater: Optional[Callable[[Table], None]]) -> None:
-    with sql.orm.Session(engine, future=True) as session:
-        for record in session.query(Table).all():
-            table_record_updater(record)
-        session.commit()
-
-
 def convert_table_version_record(
     engine: sql.engine.Engine, table_version_record_updater: Optional[Callable[[TableVersion], None]]
 ) -> None:
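The convert_table_md change is what makes migrations like converter 35 safe: selecting only Table.id and Table.md keeps older converters working even when the ORM model gains columns (such as lock_dummy) that do not yet exist in the database being upgraded. A minimal sketch of the failure mode, assuming SQLAlchemy 2.x, an in-memory SQLite database, and a simplified Table model:

    import sqlalchemy as sql
    from sqlalchemy import orm

    class Base(orm.DeclarativeBase):
        pass

    class Table(Base):
        __tablename__ = 'tables'
        id: orm.Mapped[int] = orm.mapped_column(primary_key=True)
        md: orm.Mapped[dict] = orm.mapped_column(sql.JSON, nullable=False)
        # Column that exists in the new ORM model but not yet in an old database:
        lock_dummy: orm.Mapped[int] = orm.mapped_column(sql.BigInteger, nullable=True)

    engine = sql.create_engine('sqlite://')
    # Simulate an *old* database that predates lock_dummy.
    with engine.begin() as conn:
        conn.execute(sql.text('CREATE TABLE tables (id INTEGER PRIMARY KEY, md JSON NOT NULL)'))
        conn.execute(sql.text('''INSERT INTO tables (id, md) VALUES (1, '{"name": "t"}')'''))

    with engine.begin() as conn:
        # Selecting only the columns the converter needs works fine:
        for row in conn.execute(sql.select(Table.id, Table.md)):
            print(row[0], row[1])
        # sql.select(Table) would expand to SELECT id, md, lock_dummy ... and fail with
        # an OperationalError ("no such column: tables.lock_dummy") on the old schema.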
pixeltable/metadata/notes.py
CHANGED

@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    36: 'Added Table.lock_dummy',
     35: 'Track reference_tbl in ColumnRef',
     34: 'Set default value for is_pk field in column metadata to False',
     33: 'Add is_replica field to table metadata',
pixeltable/metadata/schema.py
CHANGED

@@ -84,7 +84,8 @@ class Dir(Base):
     )
     parent_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
     md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # DirMd
-
+
+    # used to force acquisition of an X-lock via an Update stmt
     lock_dummy: orm.Mapped[int] = orm.mapped_column(BigInteger, nullable=True)
 
 
@@ -200,6 +201,9 @@ class Table(Base):
     dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
     md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # TableMd
 
+    # used to force acquisition of an X-lock via an Update stmt
+    lock_dummy: orm.Mapped[int] = orm.mapped_column(BigInteger, nullable=True)
+
 
 @dataclasses.dataclass
 class TableVersionMd:
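The new lock_dummy columns carry no data; they exist so the catalog can take a row-level exclusive lock on a specific Dir or Table record with an ordinary UPDATE inside a transaction. A minimal sketch of the idea against Postgres; the helper name lock_table_record is illustrative, not pixeltable's API.

    import sqlalchemy as sql

    def lock_table_record(conn: sql.engine.Connection, tbl_id) -> None:
        # The UPDATE forces Postgres to take a row-level exclusive (X) lock on the matching
        # `tables` row; lock_dummy is never read, so the write itself is harmless.
        conn.execute(
            sql.text('UPDATE tables SET lock_dummy = 1 WHERE id = :tbl_id'),
            {'tbl_id': tbl_id},
        )
        # The lock is held until the enclosing transaction commits or rolls back, which
        # serializes concurrent writers that try to lock the same catalog record.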
pixeltable/plan.py
CHANGED

@@ -289,7 +289,7 @@ class Planner:
 
         # create InMemoryDataNode for 'rows'
         plan: exec.ExecNode = exec.InMemoryDataNode(
-            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.
+            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
         )
 
         media_input_col_info = [
@@ -385,7 +385,7 @@ class Planner:
 
         cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
 
-        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == tbl.tbl_version}
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == tbl.tbl_version.id}
         copied_cols = [
             col
             for col in target.cols_by_id.values()
@@ -409,7 +409,7 @@ class Planner:
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
-        return plan, [f'{c.tbl.
+        return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
     @classmethod
     def __check_valid_columns(
@@ -465,7 +465,7 @@ class Planner:
         recomputed_cols.update(idx_val_cols)
         # we only need to recompute stored columns (unstored ones are substituted away)
         recomputed_cols = {c for c in recomputed_cols if c.is_stored}
-        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == target.id}
         copied_cols = [
             col
             for col in target.cols_by_id.values()
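The Planner changes above replace object comparisons (col.tbl == target) with id comparisons (col.tbl.id == target.id): with the handle indirection, two distinct handle objects can refer to the same underlying table, so comparing the handles themselves is no longer reliable. A toy illustration; this TableVersionHandle is a stand-in, not the pixeltable class.

    from typing import Optional
    from uuid import UUID, uuid4

    class TableVersionHandle:
        # Stand-in: no custom __eq__, so equality falls back to object identity.
        def __init__(self, id: UUID, effective_version: Optional[int] = None) -> None:
            self.id = id
            self.effective_version = effective_version

    tbl_id = uuid4()
    col_tbl = TableVersionHandle(tbl_id)   # handle stored on a Column
    target = TableVersionHandle(tbl_id)    # handle the planner is working with

    print(col_tbl == target)          # False: two distinct handle objects
    print(col_tbl.id == target.id)    # True: both refer to the same table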
pixeltable/share/packager.py
CHANGED

@@ -7,6 +7,7 @@ import urllib.request
 import uuid
 from pathlib import Path
 from typing import Any, Iterator, Optional
+from uuid import UUID
 
 import more_itertools
 import pyarrow as pa
@@ -17,6 +18,7 @@ import pixeltable as pxt
 from pixeltable import catalog, exceptions as excs, metadata
 from pixeltable.env import Env
 from pixeltable.metadata import schema
+from pixeltable.utils import sha256sum
 from pixeltable.utils.media_store import MediaStore
 
 _logger = logging.getLogger('pixeltable')
@@ -50,7 +52,7 @@ class TablePackager:
         self.media_files = {}
 
         # Load metadata
-        with
+        with catalog.Catalog.get().begin_xact(for_write=False):
             tbl_md = catalog.Catalog.get().load_replica_md(table)
         self.md = {
             'pxt_version': pxt.__version__,
@@ -65,15 +67,15 @@ class TablePackager:
         Export the table to a tarball containing Parquet tables and media files.
         """
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
-        _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
+        _logger.info(f"Packaging table '{self.table._path()}' and its ancestors in: {self.tmp_dir}")
         self.tmp_dir.mkdir()
         with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
             json.dump(self.md, fp)
         self.tables_dir = self.tmp_dir / 'tables'
         self.tables_dir.mkdir()
-        with
+        with catalog.Catalog.get().begin_xact(for_write=False):
             for tv in self.table._tbl_version_path.get_tbl_versions():
-                _logger.info(f"Exporting table '{tv.get().
+                _logger.info(f"Exporting table '{tv.get().versioned_name}'.")
                 self.__export_table(tv.get())
         _logger.info('Building archive.')
         bundle_path = self.__build_tarball()
@@ -88,7 +90,7 @@ class TablePackager:
         assert any(tv.id == base.id for base in self.table._tbl_version_path.get_tbl_versions())
         sql_types = {col.name: col.type for col in tv.store_tbl.sa_tbl.columns}
         media_cols: set[str] = set()
-        for col in tv.
+        for col in tv.cols:
             if col.is_stored and col.col_type.is_media_type():
                 media_cols.add(col.store_name())
 
@@ -182,7 +184,12 @@ class TablePackager:
             path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_url.path)))
             if path not in self.media_files:
                 # Create a new entry in the `media_files` dict so that we can copy the file into the tarball later.
-
+                # We name the media files in the archive by their SHA256 hash. This ensures that we can properly
+                # deduplicate and validate them later.
+                # If we get a collision, it's not a problem; it just means we have two identical files (which will
+                # be conveniently deduplicated in the bundle).
+                sha = sha256sum(path)
+                dest_name = f'{sha}{path.suffix}'
                 self.media_files[path] = dest_name
             return f'pxtmedia://{self.media_files[path]}'
         # For any type of URL other than a local file, just return the URL as-is.
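TablePackager now names archived media files after their SHA256 digest (via the new pixeltable.utils.sha256sum import), so identical files collapse to a single bundle entry and can be validated after transfer. A small sketch of what that naming amounts to, assuming sha256sum is a straightforward streaming hash; the helper below is an assumption, not the actual pixeltable.utils implementation.

    import hashlib
    import tempfile
    from pathlib import Path

    def sha256sum(path: Path) -> str:
        # Plausible equivalent of pixeltable.utils.sha256sum (an assumption):
        # stream the file in chunks and return the hex digest.
        h = hashlib.sha256()
        with open(path, 'rb') as fp:
            for chunk in iter(lambda: fp.read(1 << 20), b''):
                h.update(chunk)
        return h.hexdigest()

    with tempfile.TemporaryDirectory() as d:
        path = Path(d) / 'frame.jpg'
        path.write_bytes(b'example media bytes')
        dest_name = f'{sha256sum(path)}{path.suffix}'
        print(dest_name)  # '<64 hex chars>.jpg'; identical files map to the same archive name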
@@ -247,13 +254,26 @@ class TableRestorer:
         tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]
 
         # Create the replica table
-        #
-
-
+        # The logic here needs to be completely restructured in order to make it concurrency-safe.
+        # - Catalog.create_replica() needs to write the metadata and also create the physical store tables
+        #   and populate them, otherwise concurrent readers will see an inconsistent state (table metadata w/o
+        #   an actual table)
+        # - this could be done one replica at a time (instead of the entire hierarchy)
+        cat = catalog.Catalog.get()
+        cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+        # don't call get_table() until after the calls to create_replica() and __import_table() below;
+        # the TV instances created by get_table() would be replaced by create_replica(), which creates duplicate
+        # TV instances for the same replica version, which then leads to failures when constructing queries
 
         # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
         # replica_tbl itself if it's a pure snapshot.
-
+        target_md = tbl_md[0]
+        is_pure_snapshot = (
+            target_md.tbl_md.view_md is not None
+            and target_md.tbl_md.view_md.predicate is None
+            and len(target_md.schema_version_md.columns) == 0
+        )
+        if is_pure_snapshot:
             ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
         else:
             ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
@@ -267,7 +287,8 @@ class TableRestorer:
             _logger.info(f'Importing table {tv.name!r}.')
             self.__import_table(self.tmp_dir, tv, md)
 
-
+        with cat.begin_xact(for_write=False):
+            return cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id))
 
     def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
         """
@@ -276,11 +297,182 @@ class TableRestorer:
         tbl_id = uuid.UUID(tbl_md.tbl_md.tbl_id)
         parquet_dir = bundle_path / 'tables' / f'tbl_{tbl_id.hex}'
         parquet_table = pq.read_table(str(parquet_dir))
-
-
+        replica_version = tv.version
+
+        conn = Env.get().conn
+        store_sa_tbl = tv.store_tbl.sa_tbl
+        store_sa_tbl_name = tv.store_tbl._storage_name()
+
+        # Sometimes we are importing a table that has never been seen before. Other times, however, we are importing
+        # an existing replica table, and the table version and/or row selection differs from what was imported
+        # previously. Care must be taken to ensure that the new data is merged with existing data in a way that
+        # yields an internally consistent version history for each row.
+
+        # The overall strategy is this:
+        # 1. Import the parquet data into a temporary table;
+        # 2. "rectify" the v_max values in both the temporary table and the existing table (more on this below);
+        # 3. Delete any row instances from the temporary table that are already present in the existing table;
+        # 4. Copy the remaining rows from the temporary table into the existing table.
+
+        # Create a temporary table for the initial data load, containing columns for all columns present in the
+        # parquet table. The parquet columns have identical names to those in the store table, so we can use the
+        # store table schema to get their SQL types (which are not necessarily derivable from their Parquet types,
+        # e.g., pa.string() may hold either VARCHAR or serialized JSONB).
+        temp_cols: dict[str, sql.Column] = {}
+        for field in parquet_table.schema:
+            assert field.name in store_sa_tbl.columns
+            col_type = store_sa_tbl.columns[field.name].type
+            temp_cols[field.name] = sql.Column(field.name, col_type)
+        temp_sa_tbl_name = f'temp_{uuid.uuid4().hex}'
+        _logger.debug(f'Creating temporary table: {temp_sa_tbl_name}')
+        temp_md = sql.MetaData()
+        temp_sa_tbl = sql.Table(temp_sa_tbl_name, temp_md, *temp_cols.values(), prefixes=['TEMPORARY'])
+        temp_sa_tbl.create(conn)
+
+        # Populate the temporary table with data from the Parquet file.
+        _logger.debug(f'Loading {parquet_table.num_rows} row(s) into temporary table: {temp_sa_tbl_name}')
+        for batch in parquet_table.to_batches(max_chunksize=10_000):
             pydict = batch.to_pydict()
             rows = self.__from_pa_pydict(tv, pydict)
-
+            conn.execute(sql.insert(temp_sa_tbl), rows)
+
+        # Each row version is identified uniquely by its pk, a tuple (row_id, pos_0, pos_1, ..., pos_k, v_min).
+        # Conversely, v_max is not part of the primary key, but is simply a bookkeeping device.
+        # In an original table, v_max is always equal to the v_min of the succeeding row instance with the same
+        # row id, or MAX_VERSION if no such row instance exists. But in the replica, we need to be careful, since
+        # we might see only a subset of the original table's versions, and we might see them out of order.
+
+        # We'll adjust the v_max values according to the principle of "latest provable v_max":
+        # they will always correspond to the latest version for which we can prove the row instance was alive. This
+        # will enable us to maintain consistency of the v_max values if additional table versions are later imported,
+        # regardless of the order in which they are seen. It also means that replica tables (unlike original tables)
+        # may have gaps in their row version histories, but this is fine; the gaps simply correspond to table versions
+        # that have never been observed.
+
+        pk_predicates = [col == temp_cols[col.name] for col in tv.store_tbl.pk_columns()]
+        pk_clause = sql.and_(*pk_predicates)
+
+        # If the same pk exists in both the temporary table and the existing table, then the corresponding row data
+        # must be identical; the rows can differ only in their v_max value. As a sanity check, we go through the
+        # motion of verifying this; a failure implies data corruption in either the replica being imported or in a
+        # previously imported replica.
+
+        system_col_names = {col.name for col in tv.store_tbl.system_columns()}
+        media_col_names = {col.store_name() for col in tv.cols if col.col_type.is_media_type() and col.is_stored}
+        value_store_cols = [
+            store_sa_tbl.c[col_name]
+            for col_name in temp_cols
+            if col_name not in system_col_names and col_name not in media_col_names
+        ]
+        value_temp_cols = [
+            col
+            for col_name, col in temp_cols.items()
+            if col_name not in system_col_names and col_name not in media_col_names
+        ]
+        mismatch_predicates = [store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)]
+        mismatch_clause = sql.or_(*mismatch_predicates)
+
+        # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
+        # one value column. Pseudo-SQL:
+        #
+        # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
+        # FROM store_tbl, temp_tbl
+        # WHERE store_tbl.rowid = temp_tbl.rowid
+        #     AND store_tbl.pos_0 = temp_tbl.pos_0
+        #     AND ... AND store_tbl.pos_k = temp_tbl.pos_k
+        #     AND store_tbl.v_min = temp_tbl.v_min
+        #     AND (
+        #         store_tbl.col_0 != temp_tbl.col_0
+        #         OR store_tbl.col_1 != temp_tbl.col_1
+        #         OR ... OR store_tbl.col_n != temp_tbl.col_n
+        #     )
+        #
+        # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
+        # either column is NULL; this is what we want, since it may indicate a column that is present in one version
+        # but not the other.
+        q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
+        _logger.debug(q.compile())
+        result = conn.execute(q)
+        if result.rowcount > 0:
+            _logger.debug(
+                f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
+                f'{result.rowcount} inconsistent row(s).'
+            )
+            row = result.first()
+            _logger.debug('Example mismatch:')
+            _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
+            _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
+            raise excs.Error(
+                'Data corruption error: the replica data are inconsistent with data retrieved from a previous replica.'
+            )
+        _logger.debug(f'Verified data integrity between {store_sa_tbl_name!r} and {temp_sa_tbl_name!r}.')
+
+        # Now rectify the v_max values in the temporary table.
+        # If a row instance has a concrete v_max value, then we know it's genuine: it's the unique and immutable
+        # version when the row was deleted. (This can only happen if later versions of the base table already
+        # existed at the time this replica was published.)
+        # But if a row instance has a v_max value of MAX_VERSION, then we don't know anything about its future.
+        # It might live indefinitely, or it might be deleted as early as version `n + 1`. Following the principle
+        # of "latest provable v_max", we simply set v_max equal to `n + 1`.
+        q = (
+            temp_sa_tbl.update()
+            .values(v_max=(replica_version + 1))
+            .where(temp_sa_tbl.c.v_max == schema.Table.MAX_VERSION)
+        )
+        _logger.debug(q.compile())
+        result = conn.execute(q)
+        _logger.debug(f'Rectified {result.rowcount} row(s) in {temp_sa_tbl_name!r}.')
+
+        # Now rectify the v_max values in the existing table. This is done by simply taking the later of the two v_max
+        # values (the existing one and the new one) for each row instance, following the "latest provable v_max"
+        # principle. Obviously we only need to do this for rows that exist in both tables (it's a simple join).
+        q = (
+            store_sa_tbl.update()
+            .values(v_max=sql.func.greatest(store_sa_tbl.c.v_max, temp_sa_tbl.c.v_max))
+            .where(pk_clause)
+        )
+        _logger.debug(q.compile())
+        result = conn.execute(q)
+        _logger.debug(f'Rectified {result.rowcount} row(s) in {store_sa_tbl_name!r}.')
+
+        # Now we need to update rows in the existing table that are also present in the temporary table. This is to
+        # account for the scenario where the temporary table has columns that are not present in the existing table.
+        # (We can't simply replace the rows with their versions in the temporary table, because the converse scenario
+        # might also occur; there may be columns in the existing table that are not present in the temporary table.)
+        value_update_clauses: dict[str, sql.ColumnElement] = {}
+        for temp_col in temp_cols.values():
+            if temp_col.name not in system_col_names:
+                store_col = store_sa_tbl.c[temp_col.name]
+                # Prefer the value from the existing table, substituting the value from the temporary table if it's
+                # NULL. This works in all cases (including media columns, where we prefer the existing media file).
+                clause = sql.case((store_col == None, temp_col), else_=store_col)
+                value_update_clauses[temp_col.name] = clause
+        if len(value_update_clauses) > 0:
+            q = store_sa_tbl.update().values(**value_update_clauses).where(pk_clause)
+            _logger.debug(q.compile())
+            result = conn.execute(q)
+            _logger.debug(
+                f'Merged values from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r} for {result.rowcount} row(s).'
+            )
+
+        # Now drop any rows from the temporary table that are also present in the existing table.
+        # The v_max values have been rectified, data has been merged into NULL cells, and all other row values have
+        # been verified identical.
+        # TODO: Delete any media files that were orphaned by this operation (they're necessarily duplicates of media
+        # files that are already present in the existing table).
+        q = temp_sa_tbl.delete().where(pk_clause)
+        _logger.debug(q.compile())
+        result = conn.execute(q)
+        _logger.debug(f'Deleted {result.rowcount} row(s) from {temp_sa_tbl_name!r}.')
+
+        # Finally, copy the remaining data (consisting entirely of new row instances) from the temporary table into
+        # the actual table.
+        q = store_sa_tbl.insert().from_select(
+            [store_sa_tbl.c[col_name] for col_name in temp_cols], sql.select(*temp_cols.values())
+        )
+        _logger.debug(q.compile())
+        result = conn.execute(q)
+        _logger.debug(f'Inserted {result.rowcount} row(s) from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r}.')
 
     def __from_pa_pydict(self, tv: catalog.TableVersion, pydict: dict[str, Any]) -> list[dict[str, Any]]:
         # Data conversions from pyarrow to Pixeltable
@@ -289,7 +481,7 @@ class TableRestorer:
             assert col_name in tv.store_tbl.sa_tbl.columns
             sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
         media_col_ids: dict[str, int] = {}
-        for col in tv.
+        for col in tv.cols:
             if col.is_stored and col.col_type.is_media_type():
                 media_col_ids[col.store_name()] = col.id
 
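The merge logic above rests on the "latest provable v_max" rule: a replica records, for each row instance, only the latest version at which that instance can be proven to have been alive, so bundles published at different versions can be imported in any order and only ever push that bound forward. A toy illustration of the rectify-and-merge steps, independent of the store schema; MAX_VERSION and the row layout here are simplified stand-ins.

    MAX_VERSION = 2**62  # stand-in for the sentinel used in the store schema

    def rectify_v_max(rows: list[dict], replica_version: int) -> None:
        # Rows that were still alive when this replica version was published carry the
        # MAX_VERSION sentinel; replace it with the latest provable bound (published
        # version + 1), so later imports can only ever raise it.
        for row in rows:
            if row['v_max'] == MAX_VERSION:
                row['v_max'] = replica_version + 1

    def merge_v_max(existing: dict, incoming: dict) -> None:
        # When the same (rowid, v_min) instance arrives again from another bundle,
        # keep the later of the two provable v_max bounds.
        existing['v_max'] = max(existing['v_max'], incoming['v_max'])

    # Replica published at version 7: a live row gets v_max = 8.
    rows_v7 = [{'rowid': 0, 'v_min': 3, 'v_max': MAX_VERSION}]
    rectify_v_max(rows_v7, replica_version=7)

    # The same row later arrives from a bundle published at version 12: the bound moves to 13.
    rows_v12 = [{'rowid': 0, 'v_min': 3, 'v_max': MAX_VERSION}]
    rectify_v_max(rows_v12, replica_version=12)
    merge_v_max(rows_v7[0], rows_v12[0])
    print(rows_v7[0])  # {'rowid': 0, 'v_min': 3, 'v_max': 13}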
pixeltable/share/publish.py
CHANGED

@@ -35,7 +35,7 @@ def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     upload_id = response_json['upload_id']
     destination_uri = response_json['destination_uri']
 
-    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path()}' at: {dest_tbl_uri}")
 
     bundle = packager.package()
 
@@ -117,7 +117,7 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
 
     restorer = TableRestorer(dest_path, response_json)
     tbl = restorer.restore(bundle_path)
-    Env.get().console_logger.info(f'Created local replica {tbl._path!r} from URI: {src_tbl_uri}')
+    Env.get().console_logger.info(f'Created local replica {tbl._path()!r} from URI: {src_tbl_uri}')
     return tbl
 
 