pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +23 -5
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -3
- pixeltable/catalog/catalog.py +1318 -404
- pixeltable/catalog/column.py +186 -115
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +11 -43
- pixeltable/catalog/insertable_table.py +167 -79
- pixeltable/catalog/path.py +61 -23
- pixeltable/catalog/schema_object.py +9 -10
- pixeltable/catalog/table.py +626 -308
- pixeltable/catalog/table_metadata.py +101 -0
- pixeltable/catalog/table_version.py +713 -569
- pixeltable/catalog/table_version_handle.py +37 -6
- pixeltable/catalog/table_version_path.py +42 -29
- pixeltable/catalog/tbl_ops.py +50 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +108 -94
- pixeltable/config.py +128 -22
- pixeltable/dataframe.py +188 -100
- pixeltable/env.py +407 -136
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +3 -0
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +7 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +190 -30
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/in_memory_data_node.py +18 -18
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +206 -101
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +34 -30
- pixeltable/exprs/column_ref.py +92 -96
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +152 -55
- pixeltable/exprs/expr.py +62 -43
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +75 -37
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +10 -27
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +127 -53
- pixeltable/exprs/rowid_ref.py +8 -12
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +10 -10
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +20 -18
- pixeltable/func/signature.py +43 -16
- pixeltable/func/tools.py +23 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +6 -0
- pixeltable/functions/anthropic.py +93 -33
- pixeltable/functions/audio.py +114 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/deepseek.py +20 -9
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +28 -11
- pixeltable/functions/globals.py +13 -13
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1046 -23
- pixeltable/functions/image.py +9 -18
- pixeltable/functions/llama_cpp.py +23 -8
- pixeltable/functions/math.py +3 -4
- pixeltable/functions/mistralai.py +4 -15
- pixeltable/functions/ollama.py +16 -9
- pixeltable/functions/openai.py +104 -82
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +2 -2
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +13 -14
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/video.py +1388 -106
- pixeltable/functions/vision.py +7 -7
- pixeltable/functions/whisper.py +15 -7
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +332 -105
- pixeltable/index/base.py +13 -22
- pixeltable/index/btree.py +23 -22
- pixeltable/index/embedding_index.py +32 -44
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +7 -6
- pixeltable/io/external_store.py +49 -77
- pixeltable/io/fiftyone.py +11 -11
- pixeltable/io/globals.py +29 -28
- pixeltable/io/hf_datasets.py +17 -9
- pixeltable/io/label_studio.py +70 -66
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +12 -11
- pixeltable/io/parquet.py +13 -93
- pixeltable/io/table_data_conduit.py +71 -47
- pixeltable/io/utils.py +3 -3
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +21 -11
- pixeltable/iterators/document.py +116 -55
- pixeltable/iterators/image.py +5 -2
- pixeltable/iterators/video.py +293 -13
- pixeltable/metadata/__init__.py +4 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/util.py +13 -12
- pixeltable/metadata/notes.py +4 -0
- pixeltable/metadata/schema.py +79 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +274 -223
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +259 -129
- pixeltable/share/protocol/__init__.py +34 -0
- pixeltable/share/protocol/common.py +170 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +109 -0
- pixeltable/share/publish.py +213 -57
- pixeltable/store.py +238 -175
- pixeltable/type_system.py +104 -63
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +108 -13
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +528 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +392 -0
- pixeltable-0.4.20.dist-info/METADATA +587 -0
- pixeltable-0.4.20.dist-info/RECORD +218 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
- pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable/utils/sample.py +0 -25
- pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
- pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
- pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0

pixeltable/metadata/converters/convert_39.py
ADDED
@@ -0,0 +1,124 @@
+import logging
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=39)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_modifier=__table_modifier)
+
+
+def __table_modifier(conn: sql.Connection, tbl_id: UUID, orig_table_md: dict, updated_table_md: dict) -> None:
+    store_prefix = 'view' if orig_table_md['view_md'] is not None else 'tbl'
+    store_name = f'{store_prefix}_{tbl_id.hex}'
+
+    # Get the list of column names that need to be migrated
+    col_names = find_error_columns(conn=conn, store_name=store_name)
+    if len(col_names) == 0:
+        _logger.info(f'No error columns found in table {store_name}. Skipping migration.')
+        return
+
+    # Check if the table exists, outside of the metadata we were given
+    # There seem to be cases where the metadata is present in the catalog,
+    # but the table itself is not in the database.
+    check_table_sql = sql.text(f"""
+        SELECT EXISTS (
+            SELECT 1
+            FROM information_schema.tables
+            WHERE table_name = '{store_name}'
+        )
+    """)
+    table_exists = conn.execute(check_table_sql).scalar()
+    if not table_exists:
+        _logger.warning(f'Table {store_name} does not exist. Skipping migration.')
+        return
+
+    return migrate_error_to_cellmd_columns(conn, store_name, col_names)
+
+
+def find_error_columns(conn: sql.Connection, store_name: str) -> list[str]:
+    """
+    Return and errormsg or errortype columns in the given table
+
+    Args:
+        conn: SQLAlchemy connection
+        store_name: Name of the table to check
+
+    Returns:
+        List of column name roots (root_errormsg, root_errortype)
+    """
+    check_columns_sql = sql.text(f"""
+        SELECT column_name
+        FROM information_schema.columns
+        WHERE table_name = '{store_name}'
+    """)
+    found_columns = [
+        row[0]
+        for row in conn.execute(check_columns_sql)
+        if row[0].endswith('_errormsg') or row[0].endswith('_errortype')
+    ]
+    column_roots = {s.removesuffix('_errormsg').removesuffix('_errortype') for s in found_columns}
+    return [*column_roots]
+
+
+def migrate_error_to_cellmd_columns(
+    conn: sql.Connection, store_name: str, col_names: list[str], backup_table: str | None = None
+) -> None:
+    """
+    Safe version with error handling and optional backup.
+
+    Args:
+        engine: SQLAlchemy engine
+        store_name: Name of the table to modify
+        col_names: List of column name prefixes
+        backup_table: Optional name for backup table
+
+    Usage:
+        migrate_error_to_cellmd_columns(engine, 'my_table', ['columnname'], 'my_table_backup')
+    """
+
+    try:
+        # Optional: Create backup
+        if backup_table:
+            backup_sql = sql.text(f"""
+                CREATE TABLE {backup_table} AS SELECT * FROM {store_name}
+            """)
+            conn.execute(backup_sql)
+            _logger.info(f'Backup created: {backup_table}')
+
+        # Step 1: Add new columns
+        add_column_str = ', '.join(f'ADD COLUMN {col}_cellmd JSONB DEFAULT NULL' for col in col_names)
+        add_column_sql = sql.text(f'ALTER TABLE {store_name} {add_column_str}')
+        conn.execute(add_column_sql)
+        _logger.info(f'Added columns: {", ".join(f"{col}_cellmd" for col in col_names)}')
+
+        # Step 2: Populate new columns
+        set_column_str = ', '.join(
+            [
+                f'{col}_cellmd = CASE WHEN {col}_errormsg IS NULL OR {col}_errortype IS NULL '
+                f"THEN NULL ELSE jsonb_build_object('errormsg', {col}_errormsg, 'errortype', {col}_errortype) END"
+                for col in col_names
+            ]
+        )
+        populate_sql = sql.text(f'UPDATE {store_name} SET {set_column_str}')
+        result = conn.execute(populate_sql)
+        _logger.info(f'Updated {result.rowcount} rows')
+
+        # Step 3: Drop old columns
+        drop_columns_str = ', '.join(
+            [f'DROP COLUMN IF EXISTS {col}_errormsg, DROP COLUMN IF EXISTS {col}_errortype' for col in col_names]
+        )
+        drop_columns_sql = sql.text(f'ALTER TABLE {store_name} {drop_columns_str}')
+        conn.execute(drop_columns_sql)
+        _logger.info(f'Dropped columns: {", ".join(f"{col}_errormsg, {col}_errortype" for col in col_names)}')
+        _logger.info(f'Migration completed successfully for table: {store_name}')
+
+    except sql.exc.SQLAlchemyError as e:
+        _logger.error(f'Migration for table {store_name} failed: {e}')
+        raise
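
Note: the version-39 converter above folds each pair of per-column error columns (root_errormsg, root_errortype) into a single root_cellmd JSONB column. A minimal sketch of driving the helpers directly against one store table; the DSN and table name below are illustrative, not taken from the diff:

    import sqlalchemy as sql

    from pixeltable.metadata.converters.convert_39 import find_error_columns, migrate_error_to_cellmd_columns

    engine = sql.create_engine('postgresql+psycopg2://localhost/pixeltable')  # assumed DSN
    with engine.begin() as conn:
        # e.g. ['col_2'] if col_2_errormsg/col_2_errortype exist in the store table
        roots = find_error_columns(conn=conn, store_name='tbl_0123abcd')  # hypothetical store table name
        if roots:
            # adds col_2_cellmd JSONB, populates it via jsonb_build_object, then drops the old columns
            migrate_error_to_cellmd_columns(conn, 'tbl_0123abcd', roots, backup_table='tbl_0123abcd_backup')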

pixeltable/metadata/converters/convert_40.py
ADDED
@@ -0,0 +1,73 @@
+import logging
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=40)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_modifier=__table_modifier)
+
+
+def __table_modifier(conn: sql.Connection, tbl_id: UUID, orig_table_md: dict, updated_table_md: dict) -> None:
+    store_prefix = 'view' if orig_table_md['view_md'] is not None else 'tbl'
+    store_name = f'{store_prefix}_{tbl_id.hex}'
+
+    # Get the list of column names that need _cellmd columns
+    _logger.info(f'Checking table {orig_table_md["name"]} ({store_name})')
+    col_ids = find_target_columns(orig_table_md)
+    if len(col_ids) == 0:
+        _logger.info(f'No Array or Json columns found in table {orig_table_md["name"]}. Skipping migration.')
+        return
+
+    # Check which columns already exist in the table
+    check_columns_sql = sql.text(f"""
+        SELECT column_name
+        FROM information_schema.columns
+        WHERE table_name = '{store_name}'
+    """)
+    existing_columns = {row[0] for row in conn.execute(check_columns_sql)}
+
+    # Filter out columns that already have _cellmd
+    col_ids_to_add: list[int] = []
+    for col_id in col_ids:
+        cellmd_col = f'col_{col_id}_cellmd'
+        if cellmd_col not in existing_columns:
+            col_ids_to_add.append(col_id)
+        else:
+            _logger.info(f'Column {cellmd_col} already exists in table {orig_table_md["name"]}. Skipping.')
+
+    if len(col_ids_to_add) == 0:
+        _logger.info(f'All _cellmd columns already exist in table {orig_table_md["name"]}. Skipping migration.')
+        return
+
+    return add_cellmd_columns(conn, store_name, col_ids_to_add)
+
+
+def find_target_columns(table_md: dict) -> list[int]:
+    """Returns ids of stored array and json columns"""
+    result: list[int] = []
+    for col_id, col_md in table_md['column_md'].items():
+        col_type = col_md['col_type']
+        classname = col_type.get('_classname')
+        if classname in ['ArrayType', 'JsonType'] and col_md.get('stored', False):
+            result.append(col_id)
+            _logger.info(f'Found {classname} column: {col_id}')
+    return result
+
+
+def add_cellmd_columns(conn: sql.Connection, store_name: str, col_ids: list[int]) -> None:
+    try:
+        # Add new columns
+        add_column_str = ', '.join(f'ADD COLUMN col_{col_id}_cellmd JSONB DEFAULT NULL' for col_id in col_ids)
+        add_column_sql = sql.text(f'ALTER TABLE {store_name} {add_column_str}')
+        conn.execute(add_column_sql)
+        _logger.info(f'Added columns to {store_name}: {", ".join(f"col_{col_id}_cellmd" for col_id in col_ids)}')
+    except sql.exc.SQLAlchemyError as e:
+        _logger.error(f'Migration for table {store_name} failed: {e}')
+        raise
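
Note: find_target_columns() above walks TableMd.column_md and selects stored Array/Json columns. A hedged sketch of its behavior on hand-written metadata; the dict shape below is illustrative and reduced to the keys the function reads:

    from pixeltable.metadata.converters.convert_40 import find_target_columns

    table_md = {
        'column_md': {
            0: {'col_type': {'_classname': 'ArrayType'}, 'stored': True},   # selected
            1: {'col_type': {'_classname': 'StringType'}, 'stored': True},  # wrong type: ignored
            2: {'col_type': {'_classname': 'JsonType'}, 'stored': False},   # not stored: ignored
        }
    }
    assert find_target_columns(table_md) == [0]  # add_cellmd_columns() would then add col_0_cellmd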

pixeltable/metadata/converters/util.py
CHANGED
@@ -1,6 +1,6 @@
 import copy
 import logging
-from typing import Any, Callable, Optional
+from typing import Any, Callable
 from uuid import UUID

 import sqlalchemy as sql
@@ -12,10 +12,11 @@ __logger = logging.getLogger('pixeltable')

 def convert_table_md(
     engine: sql.engine.Engine,
-    table_md_updater: Optional[Callable[[dict, UUID], None]] = None,
-    column_md_updater: Optional[Callable[[dict], None]] = None,
-    external_store_md_updater: Optional[Callable[[dict], None]] = None,
-    substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None,
+    table_md_updater: Callable[[dict, UUID], None] | None = None,
+    column_md_updater: Callable[[dict], None] | None = None,
+    external_store_md_updater: Callable[[dict], None] | None = None,
+    substitution_fn: Callable[[str | None, Any], tuple[str | None, Any] | None] | None = None,
+    table_modifier: Callable[[sql.Connection, UUID, dict, dict], None] | None = None,
 ) -> None:
     """
     Converts schema.TableMd dicts based on the specified conversion functions.
@@ -50,6 +51,8 @@ def convert_table_md(
             if updated_table_md != table_md:
                 __logger.info(f'Updating schema for table: {tbl_id}')
                 conn.execute(sql.update(Table).where(Table.id == tbl_id).values(md=updated_table_md))
+            if table_modifier is not None:
+                table_modifier(conn, tbl_id, table_md, updated_table_md)

         for row in conn.execute(sql.select(Function)):
             fn_id = row[0]
@@ -77,9 +80,7 @@ def __update_external_store_md(table_md: dict, external_store_md_updater: Callab
         external_store_md_updater(store_md)


-def __substitute_md_rec(
-    md: Any, substitution_fn: Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]
-) -> Any:
+def __substitute_md_rec(md: Any, substitution_fn: Callable[[str | None, Any], tuple[str | None, Any] | None]) -> Any:
     if isinstance(md, dict):
         updated_dict: dict[str, Any] = {}
         for k, v in md.items():
@@ -107,8 +108,8 @@ def __substitute_md_rec(

 def convert_table_schema_version_md(
     engine: sql.engine.Engine,
-    table_schema_version_md_updater: Optional[Callable[[dict], None]] = None,
-    schema_column_updater: Optional[Callable[[dict], None]] = None,
+    table_schema_version_md_updater: Callable[[dict], None] | None = None,
+    schema_column_updater: Callable[[dict], None] | None = None,
 ) -> None:
     """
     Converts schema.TableSchemaVersionMd dicts based on the specified conversion functions.
@@ -147,7 +148,7 @@ def __update_schema_column(table_schema_version_md: dict, schema_column_updater:


 def convert_table_version_record(
-    engine: sql.engine.Engine, table_version_record_updater: Optional[Callable[[TableVersion], None]]
+    engine: sql.engine.Engine, table_version_record_updater: Callable[[TableVersion], None] | None
 ) -> None:
     with sql.orm.Session(engine, future=True) as session:
         for record in session.query(TableVersion).all():
@@ -156,7 +157,7 @@ def convert_table_version_record(


 def convert_table_schema_version_record(
-    engine: sql.engine.Engine, table_schema_version_record_updater: Optional[Callable[[TableSchemaVersion], None]]
+    engine: sql.engine.Engine, table_schema_version_record_updater: Callable[[TableSchemaVersion], None] | None
 ) -> None:
     with sql.orm.Session(engine, future=True) as session:
         for record in session.query(TableSchemaVersion).all():
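
Note: the new table_modifier parameter is the hook that both converters above rely on to run per-table DDL inside the same transaction that rewrites TableMd. A minimal hedged sketch of a converter built on it; the version number and the modifier body are illustrative only:

    from uuid import UUID

    import sqlalchemy as sql

    from pixeltable.metadata import register_converter
    from pixeltable.metadata.converters.util import convert_table_md


    def _touch_store_table(conn: sql.Connection, tbl_id: UUID, orig_md: dict, updated_md: dict) -> None:
        # called once per table, after the (possibly updated) TableMd row has been handled
        store_prefix = 'view' if orig_md['view_md'] is not None else 'tbl'
        conn.execute(sql.text(f'ANALYZE {store_prefix}_{tbl_id.hex}'))  # illustrative side effect


    @register_converter(version=99)  # hypothetical version number
    def _(engine: sql.engine.Engine) -> None:
        convert_table_md(engine, table_modifier=_touch_store_table)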
pixeltable/metadata/notes.py
CHANGED
@@ -2,6 +2,10 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    41: 'Cellmd columns for array and json columns',
+    40: 'Convert error property columns to cellmd columns',
+    39: 'ColumnHandles in external stores',
+    38: 'Added TableMd.view_sn',
     37: 'Add support for the sample() method on DataFrames',
     36: 'Added Table.lock_dummy',
     35: 'Track reference_tbl in ColumnRef',
pixeltable/metadata/schema.py
CHANGED
@@ -1,13 +1,16 @@
 import dataclasses
+import types
 import typing
 import uuid
-from typing import Any, NamedTuple, Optional, TypeVar, Union, get_type_hints
+from typing import Any, TypeVar, Union, get_type_hints

 import sqlalchemy as sql
 from sqlalchemy import BigInteger, ForeignKey, Integer, LargeBinary, orm
 from sqlalchemy.dialects.postgresql import JSONB, UUID
 from sqlalchemy.orm.decl_api import DeclarativeMeta

+from ..catalog.update_status import UpdateStatus
+
 # Base has to be marked explicitly as a type, in order to be used elsewhere as a type hint. But in addition to being
 # a type, it's also a `DeclarativeMeta`. The following pattern enables us to expose both `Base` and `Base.metadata`
 # outside of the module in a typesafe way.
@@ -22,13 +25,13 @@ def md_from_dict(data_class_type: type[T], data: Any) -> T:
     """Re-instantiate a dataclass instance that contains nested dataclasses from a dict."""
     if dataclasses.is_dataclass(data_class_type):
         fieldtypes = get_type_hints(data_class_type)
-        return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})
+        return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})

     origin = typing.get_origin(data_class_type)
     if origin is not None:
         type_args = typing.get_args(data_class_type)
-        if origin is Union and type(None) in type_args:
-            #
+        if (origin is Union or origin is types.UnionType) and type(None) in type_args:
+            # handling T | None, T | None
             non_none_args = [arg for arg in type_args if arg is not type(None)]
             assert len(non_none_args) == 1
             return md_from_dict(non_none_args[0], data) if data is not None else None
@@ -72,7 +75,7 @@ class SystemInfo(Base):
 @dataclasses.dataclass
 class DirMd:
     name: str
-    user: Optional[str]
+    user: str | None
     additional_md: dict[str, Any]


@@ -101,17 +104,20 @@ class ColumnMd:

     id: int
     schema_version_add: int
-    schema_version_drop: Optional[int]
+    schema_version_drop: int | None
     col_type: dict

     # if True, is part of the primary key
     is_pk: bool

     # if set, this is a computed column
-    value_expr: Optional[dict]
+    value_expr: dict | None

     # if True, the column is present in the stored table
-    stored: Optional[bool]
+    stored: bool | None
+
+    # If present, the URI for the destination for column values
+    destination: str | None = None


 @dataclasses.dataclass
@@ -127,13 +133,13 @@ class IndexMd:
     index_val_col_id: int  # column holding the values to be indexed
     index_val_undo_col_id: int  # column holding index values for deleted rows
     schema_version_add: int
-    schema_version_drop: Optional[int]
+    schema_version_drop: int | None
     class_fqn: str
     init_args: dict[str, Any]


 # a stored table version path is a list of (table id as str, effective table version)
-TableVersionPath = list[tuple[str, Optional[int]]]
+TableVersionPath = list[tuple[str, int | None]]


 @dataclasses.dataclass
@@ -145,16 +151,16 @@ class ViewMd:
     base_versions: TableVersionPath

     # filter predicate applied to the base table; view-only
-    predicate: Optional[dict[str, Any]]
+    predicate: dict[str, Any] | None

     # sampling predicate applied to the base table; view-only
-    sample_clause: Optional[dict[str, Any]]
+    sample_clause: dict[str, Any] | None

     # ComponentIterator subclass; only for component views
-    iterator_class_fqn: Optional[str]
+    iterator_class_fqn: str | None

     # args to pass to the iterator class constructor; only for component views
-    iterator_args: Optional[dict[str, Any]]
+    iterator_args: dict[str, Any] | None


 @dataclasses.dataclass
@@ -163,7 +169,7 @@ class TableMd:
     name: str
     is_replica: bool

-    user: Optional[str]
+    user: str | None

     # monotonically increasing w/in Table for both data and schema changes, starting at 0
     current_version: int
@@ -177,15 +183,47 @@ class TableMd:
     # - every row is assigned a unique and immutable rowid on insertion
     next_row_id: int

+    # sequence number to track changes in the set of mutable views of this table (ie, this table = the view base)
+    # - incremented for each add/drop of a mutable view
+    # - only maintained for mutable tables
+    # TODO: replace with mutable_views: list[UUID] to help with debugging
+    view_sn: int
+
     # Metadata format for external stores:
     # {'class': 'pixeltable.io.label_studio.LabelStudioProject', 'md': {'project_id': 3}}
     external_stores: list[dict[str, Any]]

     column_md: dict[int, ColumnMd]  # col_id -> ColumnMd
     index_md: dict[int, IndexMd]  # index_id -> IndexMd
-    view_md: Optional[ViewMd]
+    view_md: ViewMd | None
     additional_md: dict[str, Any]

+    has_pending_ops: bool = False
+
+    @property
+    def is_snapshot(self) -> bool:
+        return self.view_md is not None and self.view_md.is_snapshot
+
+    @property
+    def is_mutable(self) -> bool:
+        return not self.is_snapshot and not self.is_replica
+
+    @property
+    def is_pure_snapshot(self) -> bool:
+        return (
+            self.view_md is not None
+            and self.view_md.is_snapshot
+            and self.view_md.sample_clause is None
+            and self.view_md.predicate is None
+            and len(self.column_md) == 0
+        )
+
+    @property
+    def ancestor_ids(self) -> list[str]:
+        if self.view_md is None:
+            return []
+        return [id for id, _ in self.view_md.base_versions]
+

 class Table(Base):
     """
@@ -214,7 +252,12 @@ class TableVersionMd:
     created_at: float  # time.time()
     version: int
     schema_version: int
-    additional_md: dict[str, Any]
+    user: str | None = None  # User that created this version
+    update_status: UpdateStatus | None = None  # UpdateStatus of the change that created this version
+    # A version fragment cannot be queried or instantiated via get_table(). A fragment represents a version of a
+    # replica table that has incomplete data, and exists only to provide base table support for a dependent view.
+    is_fragment: bool = False
+    additional_md: dict[str, Any] = dataclasses.field(default_factory=dict)


 class TableVersion(Base):
@@ -237,7 +280,7 @@ class SchemaColumn:

     # media validation strategy of this particular media column; if not set, TableMd.media_validation applies
     # stores column.MediaValiation.name.lower()
-    media_validation: Optional[str]
+    media_validation: str | None


 @dataclasses.dataclass
@@ -248,7 +291,7 @@ class TableSchemaVersionMd:

     tbl_id: str  # uuid.UUID
     schema_version: int
-    preceding_schema_version: Optional[int]
+    preceding_schema_version: int | None
     columns: dict[int, SchemaColumn]  # col_id -> SchemaColumn
     num_retained_versions: int
     comment: str
@@ -270,6 +313,22 @@ class TableSchemaVersion(Base):
     md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # TableSchemaVersionMd


+class PendingTableOp(Base):
+    """
+    Table operation that needs to be completed before the table can be used.
+
+    Operations need to be completed in order of increasing seq_num.
+    """
+
+    __tablename__ = 'pendingtableops'
+
+    tbl_id: orm.Mapped[uuid.UUID] = orm.mapped_column(
+        UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False
+    )
+    op_sn: orm.Mapped[int] = orm.mapped_column(Integer, primary_key=True, nullable=False)  # catalog.TableOp.op_sn
+    op: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # catalog.TableOp
+
+
 @dataclasses.dataclass
 class FunctionMd:
     name: str
@@ -295,26 +354,4 @@ class Function(Base):
     )
     dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
     md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # FunctionMd
-    binary_obj: orm.Mapped[Optional[bytes]] = orm.mapped_column(LargeBinary, nullable=True)
-
-
-class FullTableMd(NamedTuple):
-    tbl_md: TableMd
-    version_md: TableVersionMd
-    schema_version_md: TableSchemaVersionMd
-
-    def as_dict(self) -> dict[str, Any]:
-        return {
-            'table_id': self.tbl_md.tbl_id,
-            'table_md': dataclasses.asdict(self.tbl_md),
-            'table_version_md': dataclasses.asdict(self.version_md),
-            'table_schema_version_md': dataclasses.asdict(self.schema_version_md),
-        }
-
-    @classmethod
-    def from_dict(cls, data_dict: dict[str, Any]) -> 'FullTableMd':
-        return FullTableMd(
-            tbl_md=md_from_dict(TableMd, data_dict['table_md']),
-            version_md=md_from_dict(TableVersionMd, data_dict['table_version_md']),
-            schema_version_md=md_from_dict(TableSchemaVersionMd, data_dict['table_schema_version_md']),
-        )
+    binary_obj: orm.Mapped[bytes | None] = orm.mapped_column(LargeBinary, nullable=True)
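
Note: the md_from_dict() change above makes deserialization accept PEP 604 unions (types.UnionType) in addition to typing.Union, which is what allows the dataclass annotations in this file to move from Optional[T] to T | None. A hedged sketch with a stand-in dataclass, not one of the real schema classes:

    import dataclasses

    from pixeltable.metadata.schema import md_from_dict


    @dataclasses.dataclass
    class _DemoMd:  # illustrative stand-in; real callers pass TableMd, TableVersionMd, etc.
        name: str
        user: str | None


    # 'user' is annotated as a PEP 604 union, so the types.UnionType branch handles the None value
    assert md_from_dict(_DemoMd, {'name': 'tbl', 'user': None}) == _DemoMd(name='tbl', user=None)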
pixeltable/metadata/utils.py
ADDED
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+from pixeltable.metadata import schema
+
+
+class MetadataUtils:
+    @classmethod
+    def _diff_md(
+        cls, old_md: dict[int, schema.SchemaColumn] | None, new_md: dict[int, schema.SchemaColumn] | None
+    ) -> str:
+        """Return a string reporting the differences in a specific entry in two dictionaries
+
+        Results are formatted as follows:
+        - If `old_md` is `None`, returns 'Initial Version'.
+        - If `old_md` and `new_md` are the same, returns an empty string.
+        - If there are additions, changes, or deletions, returns a string summarizing the changes.
+        """
+        assert new_md is not None
+        if old_md is None:
+            return 'Initial Version'
+        if old_md == new_md:
+            return ''
+        added = {k: v.name for k, v in new_md.items() if k not in old_md}
+        changed = {
+            k: f'{old_md[k].name!r} to {v.name!r}'
+            for k, v in new_md.items()
+            if k in old_md and old_md[k].name != v.name
+        }
+        deleted = {k: v.name for k, v in old_md.items() if k not in new_md}
+        if len(added) == 0 and len(changed) == 0 and len(deleted) == 0:
+            return ''
+        # Format the result
+        t = []
+        if len(added) > 0:
+            t.append('Added: ' + ', '.join(added.values()))
+        if len(changed) > 0:
+            t.append('Renamed: ' + ', '.join(changed.values()))
+        if len(deleted) > 0:
+            t.append('Deleted: ' + ', '.join(deleted.values()))
+        r = ', '.join(t)
+        return r
+
+    @classmethod
+    def _create_md_change_dict(cls, md_list: list[tuple[int, dict[int, schema.SchemaColumn]]] | None) -> dict[int, str]:
+        """Return a dictionary of schema changes by version
+        Args:
+            md_list: a list of tuples, each containing a version number and a metadata dictionary.
+        """
+        r: dict[int, str] = {}
+        if md_list is None or len(md_list) == 0:
+            return r
+
+        # Sort the list in place by version number
+        md_list.sort()
+
+        first_retrieved_version = md_list[0][0]
+        if first_retrieved_version == 0:
+            prev_md = None
+            prev_ver = -1
+            start = 0
+        else:
+            prev_md = md_list[0][1]
+            prev_ver = first_retrieved_version
+            start = 1
+
+        for ver, curr_md in md_list[start:]:
+            if ver == prev_ver:
+                continue
+            assert ver > prev_ver
+            tf = cls._diff_md(prev_md, curr_md)
+            if tf != '':
+                r[ver] = tf
+            prev_md = curr_md
+        return r