pixeltable 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pixeltable might be problematic.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +3 -11
- pixeltable/catalog/catalog.py +575 -220
- pixeltable/catalog/column.py +22 -23
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +2 -148
- pixeltable/catalog/insertable_table.py +15 -13
- pixeltable/catalog/path.py +6 -0
- pixeltable/catalog/schema_object.py +9 -4
- pixeltable/catalog/table.py +96 -85
- pixeltable/catalog/table_version.py +257 -174
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/tbl_ops.py +44 -0
- pixeltable/catalog/update_status.py +179 -0
- pixeltable/catalog/view.py +50 -56
- pixeltable/config.py +76 -12
- pixeltable/dataframe.py +19 -6
- pixeltable/env.py +50 -4
- pixeltable/exec/data_row_batch.py +3 -1
- pixeltable/exec/exec_node.py +7 -24
- pixeltable/exec/expr_eval/schedulers.py +134 -7
- pixeltable/exec/in_memory_data_node.py +6 -7
- pixeltable/exprs/column_property_ref.py +21 -9
- pixeltable/exprs/column_ref.py +7 -2
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/row_builder.py +10 -9
- pixeltable/exprs/rowid_ref.py +0 -4
- pixeltable/func/function.py +3 -3
- pixeltable/functions/audio.py +36 -9
- pixeltable/functions/gemini.py +4 -4
- pixeltable/functions/openai.py +1 -2
- pixeltable/functions/video.py +59 -16
- pixeltable/globals.py +109 -24
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/datarows.py +2 -1
- pixeltable/io/external_store.py +3 -55
- pixeltable/io/globals.py +4 -4
- pixeltable/io/hf_datasets.py +10 -2
- pixeltable/io/label_studio.py +16 -16
- pixeltable/io/pandas.py +1 -0
- pixeltable/io/table_data_conduit.py +12 -13
- pixeltable/iterators/audio.py +17 -8
- pixeltable/iterators/image.py +5 -2
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_39.py +125 -0
- pixeltable/metadata/converters/util.py +3 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +50 -1
- pixeltable/plan.py +4 -0
- pixeltable/share/packager.py +20 -38
- pixeltable/store.py +40 -51
- pixeltable/type_system.py +2 -2
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/media_store.py +50 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/METADATA +1 -1
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/RECORD +60 -57
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/entry_points.txt +0 -0
pixeltable/io/table_data_conduit.py CHANGED

@@ -47,13 +47,13 @@ class TableDataConduitFormat(str, enum.Enum):
 
 @dataclass
 class TableDataConduit:
-    source: TableDataSource
+    source: 'TableDataSource'
     source_format: Optional[str] = None
     source_column_map: Optional[dict[str, str]] = None
     if_row_exists: Literal['update', 'ignore', 'error'] = 'error'
-    pxt_schema: Optional[dict[str,
-    src_schema_overrides: Optional[dict[str,
-    src_schema: Optional[dict[str,
+    pxt_schema: Optional[dict[str, ts.ColumnType]] = None
+    src_schema_overrides: Optional[dict[str, ts.ColumnType]] = None
+    src_schema: Optional[dict[str, ts.ColumnType]] = None
     pxt_pk: Optional[list[str]] = None
     src_pk: Optional[list[str]] = None
     valid_rows: Optional[RowData] = None

@@ -87,7 +87,7 @@ class TableDataConduit:
         for name, coltype in self.pxt_schema.items():
             self.pxt_schema[name] = ts.ColumnType.normalize_type(coltype)
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         raise NotImplementedError
 
     def valid_row_batch(self) -> Iterator[RowData]:

@@ -137,7 +137,7 @@ class DFTableDataConduit(TableDataConduit):
         t.pxt_df = tds.source
         return t
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         self.pxt_schema = self.pxt_df.schema
         self.pxt_pk = self.src_pk
         return self.pxt_schema

@@ -168,7 +168,7 @@ class RowDataTableDataConduit(TableDataConduit):
         t.batch_count = 0
         return t
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         from .datarows import _infer_schema_from_rows
 
         if self.source_column_map is None:

@@ -239,7 +239,7 @@ class PandasTableDataConduit(TableDataConduit):
         t.batch_count = 0
         return t
 
-    def infer_schema_part1(self) -> tuple[dict[str,
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
         """Return inferred schema, inferred primary key, and source column map"""
         if self.source_column_map is None:
             if self.src_schema_overrides is None:

@@ -252,7 +252,7 @@ class PandasTableDataConduit(TableDataConduit):
         else:
             raise NotImplementedError()
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
         self.normalize_pxt_schema_types()
         _df_check_primary_key_values(self.pd_df, self.src_pk)

@@ -328,7 +328,6 @@ class HFTableDataConduit(TableDataConduit):
     hf_ds: Optional[Union[datasets.Dataset, datasets.DatasetDict]] = None
     column_name_for_split: Optional[str] = None
     categorical_features: dict[str, dict[int, str]]
-    hf_schema: dict[str, Any] = None
     dataset_dict: dict[str, datasets.Dataset] = None
     hf_schema_source: dict[str, Any] = None

@@ -356,7 +355,7 @@ class HFTableDataConduit(TableDataConduit):
         except ImportError:
             return False
 
-    def infer_schema_part1(self) -> tuple[dict[str,
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
         from pixeltable.io.hf_datasets import _get_hf_schema, huggingface_schema_to_pxt_schema
 
         if self.source_column_map is None:

@@ -469,7 +468,7 @@ class ParquetTableDataConduit(TableDataConduit):
         t.pq_ds = parquet.ParquetDataset(str(input_path))
         return t
 
-    def infer_schema_part1(self) -> tuple[dict[str,
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
         from pixeltable.utils.arrow import ar_infer_schema
 
         if self.source_column_map is None:

@@ -483,7 +482,7 @@ class ParquetTableDataConduit(TableDataConduit):
         else:
             raise NotImplementedError()
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
         self.normalize_pxt_schema_types()
         self.prepare_insert()
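The recurring change across these hunks is that the previously truncated `dict[str, ...]` return annotations are now spelled out as `dict[str, ts.ColumnType]`, and `source` becomes a string forward reference so the annotation need not be resolvable at class-definition time. A minimal sketch of the same pattern, with hypothetical stand-in names rather than Pixeltable's actual classes:

from dataclasses import dataclass
from typing import Optional

# Hypothetical stand-ins for pixeltable's ColumnType / TableDataSource.
class ColumnType: ...
class TableDataSource: ...

@dataclass
class Conduit:
    # A string annotation defers evaluation, so TableDataSource can be
    # imported late (or only under TYPE_CHECKING) without a NameError.
    source: 'TableDataSource'
    pxt_schema: Optional[dict[str, ColumnType]] = None

    def infer_schema(self) -> dict[str, ColumnType]:
        # Subclasses (pandas, Parquet, HF datasets, ...) override this.
        raise NotImplementedError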
pixeltable/iterators/audio.py CHANGED

@@ -1,5 +1,4 @@
 import logging
-import uuid
 from fractions import Fraction
 from pathlib import Path
 from typing import Any, ClassVar, Optional

@@ -55,12 +54,9 @@ class AudioSplitter(ComponentIterator):
     def __init__(
         self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
     ):
-        if chunk_duration_sec <= 0.0:
-            raise excs.Error('chunk_duration_sec must be a positive number')
-        if chunk_duration_sec < min_chunk_duration_sec:
-            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
-        if overlap_sec >= chunk_duration_sec:
-            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
+        assert chunk_duration_sec > 0.0
+        assert chunk_duration_sec >= min_chunk_duration_sec
+        assert overlap_sec < chunk_duration_sec
         audio_path = Path(audio)
         assert audio_path.exists() and audio_path.is_file()
         self.audio_path = audio_path

@@ -128,6 +124,19 @@ class AudioSplitter(ComponentIterator):
 
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        param_names = ['chunk_duration_sec', 'min_chunk_duration_sec', 'overlap_sec']
+        params = dict(zip(param_names, args))
+        params.update(kwargs)
+
+        chunk_duration_sec = params['chunk_duration_sec']
+        min_chunk_duration_sec = params.get('min_chunk_duration_sec', 0.0)
+        overlap_sec = params.get('overlap_sec', 0.0)
+        if chunk_duration_sec <= 0.0:
+            raise excs.Error('chunk_duration_sec must be a positive number')
+        if chunk_duration_sec < min_chunk_duration_sec:
+            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
+        if overlap_sec >= chunk_duration_sec:
+            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
         return {
             'start_time_sec': ts.FloatType(),
             'end_time_sec': ts.FloatType(),

@@ -140,7 +149,7 @@ class AudioSplitter(ComponentIterator):
         target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
         chunk_start_pts = 0
         chunk_end_pts = 0
-        chunk_file = str(env.Env.get().
+        chunk_file = str(env.Env.get().create_tmp_path(self.audio_path.suffix))
         output_container = av.open(chunk_file, mode='w')
         input_stream = self.container.streams.audio[0]
         codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
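The user-facing validation moves from `__init__` (now plain asserts) into `output_schema`, which re-maps positional args onto parameter names so the checks can run when the iterator's schema is declared. A sketch of that args/kwargs normalization, standalone and outside Pixeltable:

from typing import Any

def validate_chunk_params(*args: Any, **kwargs: Any) -> None:
    # Positional args are matched to their names via dict(zip(...)),
    # then keyword args override; mirrors the diff's output_schema logic.
    param_names = ['chunk_duration_sec', 'min_chunk_duration_sec', 'overlap_sec']
    params = dict(zip(param_names, args))
    params.update(kwargs)

    chunk = params['chunk_duration_sec']
    min_chunk = params.get('min_chunk_duration_sec', 0.0)
    overlap = params.get('overlap_sec', 0.0)
    if chunk <= 0.0:
        raise ValueError('chunk_duration_sec must be a positive number')
    if chunk < min_chunk:
        raise ValueError('chunk_duration_sec must be at least min_chunk_duration_sec')
    if overlap >= chunk:
        raise ValueError('overlap_sec must be less than chunk_duration_sec')

validate_chunk_params(5.0, overlap_sec=1.0)   # ok
# validate_chunk_params(0.0)                  # would raise ValueError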
pixeltable/iterators/image.py CHANGED

@@ -31,8 +31,7 @@ class TileIterator(ComponentIterator):
     __j: int
 
     def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
-        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
-            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
+        assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
 
         self.__image = image
         self.__image.load()

@@ -79,4 +78,8 @@ class TileIterator(ComponentIterator):
 
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        tile_size = kwargs.get('tile_size')
+        overlap = kwargs.get('overlap', (0, 0))
+        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
+            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
        return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
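Same pattern as the audio iterator: the strict-inequality check between overlap and tile size moves into `output_schema`. A toy version of just the check, with illustrative names:

def check_tile_params(tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)) -> None:
    # Each overlap dimension must be strictly smaller than the tile dimension;
    # otherwise the tiling window would never advance across the image.
    if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
        raise ValueError(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')

check_tile_params((224, 224), (32, 32))    # ok
# check_tile_params((224, 224), (224, 0))  # would raise ValueError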
pixeltable/metadata/__init__.py CHANGED

@@ -18,7 +18,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
 _logger = logging.getLogger('pixeltable')
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 39
+VERSION = 40
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
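The bump to VERSION = 40 only works because a converter is registered for the previous version (convert_39.py below). The registry pattern presumably looks roughly like this sketch; it is not the actual pixeltable.metadata internals:

from typing import Any, Callable

CONVERTERS: dict[int, Callable[..., None]] = {}

def register_converter(version: int) -> Callable[[Callable[..., None]], Callable[..., None]]:
    # Each converter upgrades metadata from `version` to `version + 1`;
    # converters are applied in sequence until the current VERSION is reached.
    def decorator(fn: Callable[..., None]) -> Callable[..., None]:
        CONVERTERS[version] = fn
        return fn
    return decorator

@register_converter(version=39)
def _(engine: Any) -> None:
    ...  # migrate v39 metadata to v40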
pixeltable/metadata/converters/convert_39.py ADDED

@@ -0,0 +1,125 @@
+import logging
+from typing import Optional
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=39)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_modifier=__table_modifier)
+
+
+def __table_modifier(conn: sql.Connection, tbl_id: UUID, orig_table_md: dict, updated_table_md: dict) -> None:
+    store_prefix = 'view' if orig_table_md['view_md'] is not None else 'tbl'
+    store_name = f'{store_prefix}_{tbl_id.hex}'
+
+    # Get the list of column names that need to be migrated
+    col_names = find_error_columns(conn=conn, store_name=store_name)
+    if len(col_names) == 0:
+        _logger.info(f'No error columns found in table {store_name}. Skipping migration.')
+        return
+
+    # Check if the table exists, outside of the metadata we were given.
+    # There seem to be cases where the metadata is present in the catalog,
+    # but the table itself is not in the database.
+    check_table_sql = sql.text(f"""
+        SELECT EXISTS (
+            SELECT 1
+            FROM information_schema.tables
+            WHERE table_name = '{store_name}'
+        )
+    """)
+    table_exists = conn.execute(check_table_sql).scalar()
+    if not table_exists:
+        _logger.warning(f'Table {store_name} does not exist. Skipping migration.')
+        return
+
+    return migrate_error_to_cellmd_columns(conn, store_name, col_names)
+
+
+def find_error_columns(conn: sql.Connection, store_name: str) -> list[str]:
+    """
+    Return any errormsg or errortype columns in the given table.
+
+    Args:
+        conn: SQLAlchemy connection
+        store_name: Name of the table to check
+
+    Returns:
+        List of column name roots (root_errormsg, root_errortype)
+    """
+    check_columns_sql = sql.text(f"""
+        SELECT column_name
+        FROM information_schema.columns
+        WHERE table_name = '{store_name}'
+    """)
+    found_columns = [
+        row[0]
+        for row in conn.execute(check_columns_sql)
+        if row[0].endswith('_errormsg') or row[0].endswith('_errortype')
+    ]
+    column_roots = {s.removesuffix('_errormsg').removesuffix('_errortype') for s in found_columns}
+    return [*column_roots]
+
+
+def migrate_error_to_cellmd_columns(
+    conn: sql.Connection, store_name: str, col_names: list[str], backup_table: Optional[str] = None
+) -> None:
+    """
+    Safe version with error handling and optional backup.
+
+    Args:
+        conn: SQLAlchemy connection
+        store_name: Name of the table to modify
+        col_names: List of column name prefixes
+        backup_table: Optional name for backup table
+
+    Usage:
+        migrate_error_to_cellmd_columns(conn, 'my_table', ['columnname'], 'my_table_backup')
+    """
+    try:
+        # Optional: Create backup
+        if backup_table:
+            backup_sql = sql.text(f"""
+                CREATE TABLE {backup_table} AS SELECT * FROM {store_name}
+            """)
+            conn.execute(backup_sql)
+            _logger.info(f'Backup created: {backup_table}')
+
+        # Step 1: Add new columns
+        add_column_str = ', '.join(f'ADD COLUMN {col}_cellmd JSONB DEFAULT NULL' for col in col_names)
+        add_column_sql = sql.text(f'ALTER TABLE {store_name} {add_column_str}')
+        conn.execute(add_column_sql)
+        _logger.info(f'Added columns: {", ".join(f"{col}_cellmd" for col in col_names)}')
+
+        # Step 2: Populate new columns
+        set_column_str = ', '.join(
+            [
+                f'{col}_cellmd = CASE WHEN {col}_errormsg IS NULL OR {col}_errortype IS NULL '
+                f"THEN NULL ELSE jsonb_build_object('errormsg', {col}_errormsg, 'errortype', {col}_errortype) END"
+                for col in col_names
+            ]
+        )
+        populate_sql = sql.text(f'UPDATE {store_name} SET {set_column_str}')
+        result = conn.execute(populate_sql)
+        _logger.info(f'Updated {result.rowcount} rows')
+
+        # Step 3: Drop old columns
+        drop_columns_str = ', '.join(
+            [f'DROP COLUMN IF EXISTS {col}_errormsg, DROP COLUMN IF EXISTS {col}_errortype' for col in col_names]
+        )
+        drop_columns_sql = sql.text(f'ALTER TABLE {store_name} {drop_columns_str}')
+        conn.execute(drop_columns_sql)
+        _logger.info(f'Dropped columns: {", ".join(f"{col}_errormsg, {col}_errortype" for col in col_names)}')
+        _logger.info(f'Migration completed successfully for table: {store_name}')
+
+    except sql.exc.SQLAlchemyError as e:
+        _logger.error(f'Migration for table {store_name} failed: {e}')
+        raise
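The net effect of the migration SQL above, per cell: a pair of `<col>_errormsg` / `<col>_errortype` values collapses into a single JSONB `<col>_cellmd` object that is NULL unless both parts are present. In Python terms, an illustration of the CASE expression (not code from the package):

from typing import Optional

def to_cellmd(errormsg: Optional[str], errortype: Optional[str]) -> Optional[dict]:
    # Mirrors: CASE WHEN errormsg IS NULL OR errortype IS NULL THEN NULL
    #          ELSE jsonb_build_object('errormsg', ..., 'errortype', ...) END
    if errormsg is None or errortype is None:
        return None
    return {'errormsg': errormsg, 'errortype': errortype}

assert to_cellmd(None, None) is None
assert to_cellmd('division by zero', 'ZeroDivisionError') == {
    'errormsg': 'division by zero',
    'errortype': 'ZeroDivisionError',
}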
pixeltable/metadata/converters/util.py CHANGED

@@ -16,6 +16,7 @@ def convert_table_md(
     column_md_updater: Optional[Callable[[dict], None]] = None,
     external_store_md_updater: Optional[Callable[[dict], None]] = None,
     substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None,
+    table_modifier: Optional[Callable[[sql.Connection, UUID, dict, dict], None]] = None,
 ) -> None:
     """
     Converts schema.TableMd dicts based on the specified conversion functions.

@@ -50,6 +51,8 @@ def convert_table_md(
         if updated_table_md != table_md:
             __logger.info(f'Updating schema for table: {tbl_id}')
             conn.execute(sql.update(Table).where(Table.id == tbl_id).values(md=updated_table_md))
+        if table_modifier is not None:
+            table_modifier(conn, tbl_id, table_md, updated_table_md)
 
     for row in conn.execute(sql.select(Function)):
         fn_id = row[0]
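The new `table_modifier` hook runs once per table, after the metadata row has (potentially) been rewritten, and receives both the original and the updated dicts. A hedged usage sketch — the callback signature comes from the diff, the body is purely illustrative:

import sqlalchemy as sql
from uuid import UUID

def my_modifier(conn: sql.Connection, tbl_id: UUID, orig_md: dict, updated_md: dict) -> None:
    # Runs on the same connection/transaction as the metadata update, so
    # physical store-table changes stay consistent with the metadata rewrite.
    if orig_md != updated_md:
        ...  # e.g. ALTER the store table to match the new metadata

# convert_table_md(engine, table_modifier=my_modifier)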
pixeltable/metadata/notes.py CHANGED

@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    40: 'Convert error property columns to cellmd columns',
     39: 'ColumnHandles in external stores',
     38: 'Added TableMd.view_sn',
     37: 'Add support for the sample() method on DataFrames',
pixeltable/metadata/schema.py CHANGED

@@ -8,6 +8,8 @@ from sqlalchemy import BigInteger, ForeignKey, Integer, LargeBinary, orm
 from sqlalchemy.dialects.postgresql import JSONB, UUID
 from sqlalchemy.orm.decl_api import DeclarativeMeta
 
+from ..catalog.update_status import UpdateStatus
+
 # Base has to be marked explicitly as a type, in order to be used elsewhere as a type hint. But in addition to being
 # a type, it's also a `DeclarativeMeta`. The following pattern enables us to expose both `Base` and `Base.metadata`
 # outside of the module in a typesafe way.

@@ -180,6 +182,7 @@ class TableMd:
     # sequence number to track changes in the set of mutable views of this table (ie, this table = the view base)
     # - incremented for each add/drop of a mutable view
     # - only maintained for mutable tables
+    # TODO: replace with mutable_views: list[UUID] to help with debugging
     view_sn: int
 
     # Metadata format for external stores:

@@ -191,6 +194,26 @@ class TableMd:
     view_md: Optional[ViewMd]
     additional_md: dict[str, Any]
 
+    has_pending_ops: bool = False
+
+    @property
+    def is_snapshot(self) -> bool:
+        return self.view_md is not None and self.view_md.is_snapshot
+
+    @property
+    def is_mutable(self) -> bool:
+        return not self.is_snapshot and not self.is_replica
+
+    @property
+    def is_pure_snapshot(self) -> bool:
+        return (
+            self.view_md is not None
+            and self.view_md.is_snapshot
+            and self.view_md.sample_clause is None
+            and self.view_md.predicate is None
+            and len(self.column_md) == 0
+        )
+
 
 class Table(Base):
     """

@@ -219,7 +242,9 @@ class TableVersionMd:
     created_at: float  # time.time()
     version: int
     schema_version: int
-
+    user: Optional[str] = None  # User that created this version
+    update_status: Optional[UpdateStatus] = None  # UpdateStatus of the change that created this version
+    additional_md: dict[str, Any] = dataclasses.field(default_factory=dict)
 
 
 class TableVersion(Base):

@@ -275,6 +300,22 @@ class TableSchemaVersion(Base):
     md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # TableSchemaVersionMd
 
 
+class PendingTableOp(Base):
+    """
+    Table operation that needs to be completed before the table can be used.
+
+    Operations need to be completed in order of increasing seq_num.
+    """
+
+    __tablename__ = 'pendingtableops'
+
+    tbl_id: orm.Mapped[uuid.UUID] = orm.mapped_column(
+        UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False
+    )
+    op_sn: orm.Mapped[int] = orm.mapped_column(Integer, primary_key=True, nullable=False)  # catalog.TableOp.op_sn
+    op: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # catalog.TableOp
+
+
 @dataclasses.dataclass
 class FunctionMd:
     name: str

@@ -308,6 +349,14 @@ class FullTableMd(NamedTuple):
     version_md: TableVersionMd
     schema_version_md: TableSchemaVersionMd
 
+    @property
+    def is_pure_snapshot(self) -> bool:
+        return (
+            self.tbl_md.view_md is not None
+            and self.tbl_md.view_md.predicate is None
+            and len(self.schema_version_md.columns) == 0
+        )
+
     def as_dict(self) -> dict[str, Any]:
         return {
             'table_id': self.tbl_md.tbl_id,
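PendingTableOp gives crash recovery a durable work queue: a table with has_pending_ops must have its ops applied in increasing op_sn order before it can be used. A sketch of how such a queue might be drained, assuming the PendingTableOp model above and standard SQLAlchemy session handling (this is not Catalog's actual code):

import uuid

import sqlalchemy as sql

def pending_ops(conn: sql.Connection, tbl_id: uuid.UUID) -> list[dict]:
    # Fetch this table's pending ops in the order they must be completed.
    stmt = (
        sql.select(PendingTableOp.op)
        .where(PendingTableOp.tbl_id == tbl_id)
        .order_by(PendingTableOp.op_sn)
    )
    return [row[0] for row in conn.execute(stmt)]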
pixeltable/plan.py CHANGED

@@ -512,6 +512,7 @@ class Planner:
         # update row builder with column information
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
+        plan.ctx.num_computed_exprs = len(recomputed_exprs)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 

@@ -659,6 +660,7 @@
             ignore_errors=True,
             exact_version_only=view.get_bases(),
         )
+        plan.ctx.num_computed_exprs = len(recomputed_exprs)
         for i, col in enumerate(copied_cols + list(recomputed_cols)):  # same order as select_list
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         # TODO: avoid duplication with view_load_plan() logic (where does this belong?)

@@ -1057,6 +1059,8 @@
         plan.ctx.batch_size = 16
         plan.ctx.show_pbar = True
         plan.ctx.ignore_errors = True
+        computed_exprs = row_builder.output_exprs - row_builder.input_exprs
+        plan.ctx.num_computed_exprs = len(computed_exprs)  # we are adding a computed column, so we need to evaluate it
 
         # we want to flush images
         if col.is_computed and col.is_stored and col.col_type.is_image_type():
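In each of these plans, `num_computed_exprs` counts only the expressions the row builder must actually evaluate; in the add-column case that is the outputs minus the inputs. A toy illustration of the set-difference semantics (the real operands are expression collections, not strings):

# Toy: exprs that must be computed are outputs not already supplied as inputs.
input_exprs = {'c1', 'c2'}
output_exprs = {'c1', 'c2', 'c3_computed'}
num_computed_exprs = len(output_exprs - input_exprs)
assert num_computed_exprs == 1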
pixeltable/share/packager.py CHANGED

@@ -361,49 +361,32 @@ class TableRestorer:
         )
 
         tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]
+        for md in tbl_md:
+            md.tbl_md.is_replica = True
 
-        # Create the replica table
-        # The logic here needs to be completely restructured in order to make it concurrency-safe.
-        # - Catalog.create_replica() needs to write the metadata and also create the physical store tables
-        #   and populate them, otherwise concurrent readers will see an inconsistent state (table metadata w/o
-        #   an actual table)
-        # - this could be done one replica at a time (instead of the entire hierarchy)
         cat = catalog.Catalog.get()
-        [15 removed lines truncated in the source diff]
-        else:
-            ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
-
-        # Instantiate data from the Parquet tables.
-        with Env.get().begin_xact():
-            for md in ancestor_md[::-1]:  # Base table first
-                # Create a TableVersion instance (and a store table) for this ancestor.
-                tv = catalog.TableVersion.create_replica(md)
-                # Now import data from Parquet.
-                _logger.info(f'Importing table {tv.name!r}.')
-                self.__import_table(self.tmp_dir, tv, md)
-
-        with cat.begin_xact(for_write=False):
+
+        with cat.begin_xact(for_write=True):
+            # Create (or update) the replica table and its ancestors, along with TableVersion instances for any
+            # versions that have not been seen before.
+            cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+
+            # Now we need to load data for replica_tbl and its ancestors, except that we skip
+            # replica_tbl itself if it's a pure snapshot.
+            for md in tbl_md[::-1]:  # Base table first
+                if not md.is_pure_snapshot:
+                    tv = cat.get_tbl_version(UUID(md.tbl_md.tbl_id), md.version_md.version)
+                    # Import data from Parquet.
+                    _logger.info(f'Importing table {tv.name!r}.')
+                    self.__import_table(self.tmp_dir, tv, md)
 
         return cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id))
 
     def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
         """
         Import the Parquet table into the Pixeltable catalog.
         """
-        tbl_id =
+        tbl_id = UUID(tbl_md.tbl_md.tbl_id)
         parquet_dir = bundle_path / 'tables' / f'tbl_{tbl_id.hex}'
         parquet_table = pq.read_table(str(parquet_dir))
         replica_version = tv.version

@@ -626,9 +609,8 @@ class TableRestorer:
             # First time seeing this pxtmedia:// URL. Relocate the file to the media store and record the mapping
             # in self.media_files.
             src_path = self.tmp_dir / 'media' / parsed_url.netloc
-
-            src_path.
-            self.media_files[url] = urllib.parse.urljoin('file:', urllib.request.pathname2url(str(dest_path)))
+            # Move the file to the media store and update the URL.
+            self.media_files[url] = MediaStore.relocate_local_media_file(src_path, tv.id, media_col_id, tv.version)
             return self.media_files[url]
         # For any type of URL other than a local file, just return the URL as-is.
         return url
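The pxtmedia:// scheme decouples bundled media from absolute paths: the URL's netloc names a file under tmp_dir/media, which is relocated exactly once and memoized in self.media_files. A sketch of that memoization, with a hypothetical `relocate` callable standing in for MediaStore.relocate_local_media_file:

import urllib.parse
from pathlib import Path
from typing import Callable

media_files: dict[str, str] = {}

def resolve(url: str, tmp_dir: Path, relocate: Callable[[Path], str]) -> str:
    parsed = urllib.parse.urlparse(url)
    if parsed.scheme != 'pxtmedia':
        return url  # non-bundled URLs pass through unchanged
    if url not in media_files:
        src_path = tmp_dir / 'media' / parsed.netloc
        media_files[url] = relocate(src_path)  # move into the media store, record new URL
    return media_files[url]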