metaxy 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of metaxy has been flagged as potentially problematic.
- metaxy/__init__.py +61 -0
- metaxy/_testing.py +542 -0
- metaxy/_utils.py +16 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +76 -0
- metaxy/cli/context.py +71 -0
- metaxy/cli/graph.py +576 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +42 -0
- metaxy/cli/metadata.py +271 -0
- metaxy/cli/migrations.py +862 -0
- metaxy/cli/push.py +55 -0
- metaxy/config.py +450 -0
- metaxy/data_versioning/__init__.py +24 -0
- metaxy/data_versioning/calculators/__init__.py +13 -0
- metaxy/data_versioning/calculators/base.py +97 -0
- metaxy/data_versioning/calculators/duckdb.py +186 -0
- metaxy/data_versioning/calculators/ibis.py +225 -0
- metaxy/data_versioning/calculators/polars.py +135 -0
- metaxy/data_versioning/diff/__init__.py +15 -0
- metaxy/data_versioning/diff/base.py +150 -0
- metaxy/data_versioning/diff/narwhals.py +108 -0
- metaxy/data_versioning/hash_algorithms.py +19 -0
- metaxy/data_versioning/joiners/__init__.py +9 -0
- metaxy/data_versioning/joiners/base.py +70 -0
- metaxy/data_versioning/joiners/narwhals.py +235 -0
- metaxy/entrypoints.py +309 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/alembic.py +326 -0
- metaxy/ext/sqlmodel.py +172 -0
- metaxy/ext/sqlmodel_system_tables.py +139 -0
- metaxy/graph/__init__.py +21 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +399 -0
- metaxy/graph/diff/differ.py +740 -0
- metaxy/graph/diff/models.py +418 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +274 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +320 -0
- metaxy/graph/diff/rendering/rich.py +165 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +31 -0
- metaxy/metadata_store/_protocols.py +38 -0
- metaxy/metadata_store/base.py +1676 -0
- metaxy/metadata_store/clickhouse.py +161 -0
- metaxy/metadata_store/duckdb.py +167 -0
- metaxy/metadata_store/exceptions.py +43 -0
- metaxy/metadata_store/ibis.py +451 -0
- metaxy/metadata_store/memory.py +228 -0
- metaxy/metadata_store/sqlite.py +187 -0
- metaxy/metadata_store/system_tables.py +257 -0
- metaxy/migrations/__init__.py +34 -0
- metaxy/migrations/detector.py +153 -0
- metaxy/migrations/executor.py +208 -0
- metaxy/migrations/loader.py +260 -0
- metaxy/migrations/models.py +718 -0
- metaxy/migrations/ops.py +390 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +6 -0
- metaxy/models/constants.py +24 -0
- metaxy/models/feature.py +665 -0
- metaxy/models/feature_spec.py +105 -0
- metaxy/models/field.py +25 -0
- metaxy/models/plan.py +155 -0
- metaxy/models/types.py +157 -0
- metaxy/py.typed +0 -0
- metaxy-0.0.0.dist-info/METADATA +247 -0
- metaxy-0.0.0.dist-info/RECORD +75 -0
- metaxy-0.0.0.dist-info/WHEEL +4 -0
- metaxy-0.0.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,187 @@ metaxy/metadata_store/sqlite.py

```python
"""SQLite metadata store - thin wrapper around IbisMetadataStore."""

from pathlib import Path
from typing import TYPE_CHECKING

import polars as pl

if TYPE_CHECKING:
    from metaxy.metadata_store.base import MetadataStore

from metaxy.data_versioning.hash_algorithms import HashAlgorithm
from metaxy.metadata_store.ibis import IbisMetadataStore


class SQLiteMetadataStore(IbisMetadataStore):
    """
    SQLite metadata store using Ibis backend.

    Convenience wrapper that configures IbisMetadataStore for SQLite.

    Hash algorithm support:
    - MD5: Available (built-in SQLite function via extension)

    Components:
    - joiner: NarwhalsJoiner (works with any backend)
    - calculator: PolarsDataVersionCalculator (SQLite always uses Polars, no native compute)
    - diff_resolver: NarwhalsDiffResolver

    Examples:
        >>> # Local file database
        >>> with SQLiteMetadataStore("metadata.db") as store:
        ...     store.write_metadata(MyFeature, df)

        >>> # In-memory database
        >>> with SQLiteMetadataStore(":memory:") as store:
        ...     store.write_metadata(MyFeature, df)

        >>> # Explicit path
        >>> store = SQLiteMetadataStore(Path("/path/to/metadata.db"))
        >>> with store:
        ...     store.write_metadata(MyFeature, df)
    """

    def __init__(
        self,
        database: str | Path,
        *,
        fallback_stores: list["MetadataStore"] | None = None,
        **kwargs,
    ):
        """
        Initialize SQLite metadata store.

        Args:
            database: Database connection string or path.
                - File path: "metadata.db" or Path("metadata.db")
                - In-memory: ":memory:"

                Note: Parent directories are NOT created automatically. Ensure paths exist
                before initializing the store.
            fallback_stores: Ordered list of read-only fallback stores.
            **kwargs: Passed to IbisMetadataStore (e.g., hash_algorithm, prefer_native)
        """
        database_str = str(database)

        # Build connection params for Ibis SQLite backend
        connection_params = {"database": database_str}

        self.database = database_str

        # Initialize Ibis store with SQLite backend
        super().__init__(
            backend="sqlite",
            connection_params=connection_params,
            fallback_stores=fallback_stores,
            **kwargs,
        )

    def _get_default_hash_algorithm(self) -> HashAlgorithm:
        """Get default hash algorithm for SQLite stores.

        Uses MD5 which is universally supported in SQLite.
        """
        return HashAlgorithm.MD5

    @classmethod
    def supports_structs(cls) -> bool:
        """SQLite does not support struct types natively.

        Returns:
            False - SQLite stores structs as JSON strings
        """
        return False

    def _supports_native_components(self) -> bool:
        """SQLite stores do not support native data version calculations.

        SQLite doesn't have built-in hash functions (MD5, SHA256, etc.),
        so we always use Polars components for data versioning.
        """
        return False

    def _serialize_for_storage(self, df: pl.DataFrame) -> pl.DataFrame:
        """Serialize structs and arrays to JSON strings for SQLite storage.

        SQLite doesn't support struct or array types, so we convert them to JSON strings.

        Args:
            df: DataFrame with potential struct/array columns

        Returns:
            DataFrame with struct/array columns converted to JSON strings
        """
        # Convert struct and array columns to JSON strings
        for col_name in df.columns:
            dtype = df.schema[col_name]
            if isinstance(dtype, pl.Struct):
                # Convert struct to JSON string
                df = df.with_columns(
                    pl.col(col_name).struct.json_encode().alias(col_name)
                )
            elif isinstance(dtype, pl.List):
                # Convert array/list to JSON string
                # Note: Polars doesn't have native list.json_encode(), so we use map_elements
                import json

                df = df.with_columns(
                    pl.col(col_name)
                    .map_elements(
                        lambda x: None if x is None else json.dumps(x),
                        return_dtype=pl.Utf8,
                    )
                    .alias(col_name)
                )

        return df

    def _deserialize_from_storage(self, df: pl.DataFrame) -> pl.DataFrame:
        """Deserialize JSON strings back to structs and arrays.

        Converts JSON string columns back to their original struct/array types.

        Args:
            df: DataFrame with JSON string columns

        Returns:
            DataFrame with JSON strings converted back to structs/arrays
        """
        # Known struct and array columns with their expected dtypes
        # data_version is a struct, fields is a list of structs
        # Migration system columns: operation_ids, expected_steps (list of strings),
        # migration_yaml (struct), affected_features (list of strings)

        # Columns that need JSON deserialization with specific dtypes
        json_columns = {
            "data_version": None,  # Infer from data
            "migration_yaml": None,  # Infer from data
            # "feature_spec": Leave as JSON string - contains enum values that can't be parsed
            "operation_ids": pl.List(pl.Utf8),  # List of strings
            "expected_steps": pl.List(pl.Utf8),  # List of strings
            "affected_features": pl.List(pl.Utf8),  # List of strings
        }

        # Deserialize JSON columns
        for col_name, dtype in json_columns.items():
            if col_name in df.columns and df.schema[col_name] == pl.Utf8:
                if len(df) > 0:
                    if dtype is None:
                        # Infer dtype from sample value
                        sample_value = df[col_name].drop_nulls().head(1)
                        if len(sample_value) > 0:
                            inferred_series = sample_value.str.json_decode()
                            inferred_dtype = inferred_series.dtype
                            df = df.with_columns(
                                pl.col(col_name)
                                .str.json_decode(dtype=inferred_dtype)
                                .alias(col_name)
                            )
                    else:
                        # Use provided dtype
                        df = df.with_columns(
                            pl.col(col_name)
                            .str.json_decode(dtype=dtype)
                            .alias(col_name)
                        )

        return df
```
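Taken together, `_serialize_for_storage` and `_deserialize_from_storage` round-trip struct and list columns through JSON strings so they survive SQLite's flat column types. A minimal standalone sketch of that round-trip in plain Polars, with no metaxy imports (the column names here are illustrative, not required by the store):

```python
# Round-trip sketch: struct -> JSON string -> struct, list -> JSON string -> list.
import json

import polars as pl

df = pl.DataFrame(
    {
        "data_version": [{"algo": "md5", "value": "abc"}],  # struct column
        "affected_features": [["a/b", "c/d"]],              # list column
    }
)

# Serialize: structs via struct.json_encode, lists via map_elements + json.dumps
stored = df.with_columns(
    pl.col("data_version").struct.json_encode(),
    pl.col("affected_features").map_elements(
        lambda x: None if x is None else json.dumps(list(x)),
        return_dtype=pl.Utf8,
    ),
)
assert stored.schema["data_version"] == pl.Utf8

# Deserialize: infer the struct dtype from the data, give the list dtype explicitly
restored = stored.with_columns(
    pl.col("data_version").str.json_decode(),
    pl.col("affected_features").str.json_decode(pl.List(pl.Utf8)),
)
assert restored.schema["affected_features"] == pl.List(pl.Utf8)
```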
@@ -0,0 +1,257 @@ metaxy/metadata_store/system_tables.py

```python
"""System table storage layer for metadata store.

Provides type-safe access to migration system tables using struct-based storage.
"""

from __future__ import annotations

from collections.abc import Iterator
from contextlib import contextmanager
from contextvars import ContextVar
from datetime import datetime, timezone
from typing import Any

import narwhals as nw
import polars as pl

from metaxy.metadata_store._protocols import MetadataStoreProtocol
from metaxy.models.types import FeatureKey

# System namespace
SYSTEM_NAMESPACE = "metaxy-system"

# System table keys
FEATURE_VERSIONS_KEY = FeatureKey([SYSTEM_NAMESPACE, "feature_versions"])
MIGRATION_EVENTS_KEY = FeatureKey([SYSTEM_NAMESPACE, "migration_events"])
# Note: No migrations table - definitions live in YAML files, only events are stored

# Context variable for suppressing feature_version warning in migrations
_suppress_feature_version_warning: ContextVar[bool] = ContextVar(
    "_suppress_feature_version_warning", default=False
)


@contextmanager
def allow_feature_version_override() -> Iterator[None]:
    """
    Context manager to suppress warnings when writing metadata with pre-existing feature_version.

    This should only be used in migration code where writing historical feature versions
    is intentional and necessary.

    Example:
        >>> with allow_feature_version_override():
        ...     # DataFrame already has feature_version column from migration
        ...     store.write_metadata(MyFeature, df_with_feature_version)
    """
    token = _suppress_feature_version_warning.set(True)
    try:
        yield
    finally:
        _suppress_feature_version_warning.reset(token)


# Common Polars schemas for system tables
# TODO: Migrate to use METAXY_*_COL constants instead of plain names
FEATURE_VERSIONS_SCHEMA = {
    "feature_key": pl.String,
    "feature_version": pl.String,  # TODO: Use METAXY_FEATURE_VERSION_COL
    "recorded_at": pl.Datetime("us"),
    "feature_spec": pl.String,
    "feature_class_path": pl.String,
    "snapshot_version": pl.String,  # TODO: Use METAXY_SNAPSHOT_ID_COL
}

MIGRATION_EVENTS_SCHEMA = {
    "migration_id": pl.String,
    "event_type": pl.String,  # "started", "feature_started", "feature_completed", "completed", "failed"
    "timestamp": pl.Datetime("us"),
    "feature_key": pl.String,  # Empty for migration-level events
    "rows_affected": pl.Int64,
    "error_message": pl.String,  # Empty if no error
}


class SystemTableStorage:
    """Storage layer for migration system tables.

    Provides type-safe access to migration snapshots, migrations, and events.
    Uses struct-based storage (not JSON/bytes) for efficient queries.

    Status is computed at query-time from events (append-only).
    """

    def __init__(self, store: MetadataStoreProtocol):
        """Initialize storage layer.

        Args:
            store: Metadata store to use for system tables
        """
        self.store = store

    # ========== Migrations ==========
    # Note: Migration definitions are stored in YAML files (git), not in the database.
    # Only execution events are stored in DB for tracking progress and state.

    def list_executed_migrations(self) -> list[str]:
        """List all migration IDs that have execution events.

        Returns:
            List of migration IDs that have been started/executed
        """
        lazy = self.store._read_metadata_native(MIGRATION_EVENTS_KEY)

        if lazy is None:
            return []

        df = lazy.select("migration_id").unique().collect().to_polars()
        return df["migration_id"].to_list()

    # ========== Events ==========

    def write_event(
        self,
        migration_id: str,
        event_type: str,
        feature_key: str = "",
        rows_affected: int = 0,
        error_message: str = "",
    ) -> None:
        """Write migration event to system table (append-only).

        Args:
            migration_id: Migration this event belongs to
            event_type: Event type ("started", "feature_started", "feature_completed", "completed", "failed")
            feature_key: Feature key (empty for migration-level events)
            rows_affected: Number of rows affected (for feature events)
            error_message: Error message (empty if no error)
        """
        record = pl.DataFrame(
            {
                "migration_id": [migration_id],
                "event_type": [event_type],
                "timestamp": [datetime.now(timezone.utc)],
                "feature_key": [feature_key],
                "rows_affected": [rows_affected],
                "error_message": [error_message],
            },
            schema=MIGRATION_EVENTS_SCHEMA,
        )
        self.store._write_metadata_impl(MIGRATION_EVENTS_KEY, record)

    def get_migration_events(self, migration_id: str) -> nw.LazyFrame[Any]:
        """Get all events for a migration.

        Args:
            migration_id: Migration ID

        Returns:
            Lazy frame with events sorted by timestamp
        """
        lazy = self.store._read_metadata_native(
            MIGRATION_EVENTS_KEY,
            filters=[nw.col("migration_id") == migration_id],
        )

        if lazy is None:
            # No events yet
            return nw.from_native(pl.DataFrame(schema=MIGRATION_EVENTS_SCHEMA).lazy())

        return lazy.sort("timestamp", descending=False)

    def get_migration_status(self, migration_id: str) -> str:
        """Compute migration status from events at query-time.

        Args:
            migration_id: Migration ID

        Returns:
            Status: "not_started", "in_progress", "completed", "failed"
        """
        events_lazy = self.get_migration_events(migration_id)
        events_df = events_lazy.collect().to_polars()

        if events_df.height == 0:
            return "not_started"

        # Get latest event
        latest_event = events_df.sort("timestamp", descending=True).head(1)
        latest_event_type = latest_event["event_type"][0]

        if latest_event_type == "completed":
            return "completed"
        elif latest_event_type == "failed":
            return "failed"
        elif latest_event_type in ("started", "feature_started", "feature_completed"):
            return "in_progress"

        return "not_started"

    def is_feature_completed(self, migration_id: str, feature_key: str) -> bool:
        """Check if a specific feature completed successfully in a migration.

        Args:
            migration_id: Migration ID
            feature_key: Feature key to check

        Returns:
            True if feature completed without errors
        """
        events_lazy = self.get_migration_events(migration_id)
        events_df = (
            events_lazy.filter(
                (nw.col("feature_key") == feature_key)
                & (nw.col("event_type") == "feature_completed")
                & (nw.col("error_message") == "")
            )
            .collect()
            .to_polars()
        )

        return events_df.height > 0

    def get_completed_features(self, migration_id: str) -> list[str]:
        """Get list of features that completed successfully in a migration.

        Args:
            migration_id: Migration ID

        Returns:
            List of feature keys
        """
        events_lazy = self.get_migration_events(migration_id)
        events_df = (
            events_lazy.filter(
                (nw.col("event_type") == "feature_completed")
                & (nw.col("error_message") == "")
            )
            .collect()
            .to_polars()
        )

        return events_df["feature_key"].unique().to_list()

    def get_failed_features(self, migration_id: str) -> dict[str, str]:
        """Get features that failed in a migration with error messages.

        Args:
            migration_id: Migration ID

        Returns:
            Dict mapping feature key to error message
        """
        events_lazy = self.get_migration_events(migration_id)
        events_df = (
            events_lazy.filter(
                (nw.col("event_type") == "feature_completed")
                & (nw.col("error_message") != "")
            )
            .collect()
            .to_polars()
        )

        result = {}
        for row in events_df.iter_rows(named=True):
            result[row["feature_key"]] = row["error_message"]

        return result
```
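Because the event log is append-only, status is a pure fold over events: the newest event decides the state. A minimal sketch of the same logic as `get_migration_status` in plain Polars, with a hand-built events frame (illustrative values):

```python
# Query-time status: sort the append-only log by timestamp and inspect the latest event.
from datetime import datetime, timezone

import polars as pl

events = pl.DataFrame(
    {
        "migration_id": ["20240101_000000"] * 3,
        "event_type": ["started", "feature_started", "feature_completed"],
        "timestamp": [
            datetime(2024, 1, 1, 0, 0, i, tzinfo=timezone.utc) for i in range(3)
        ],
    }
)

if events.height == 0:
    status = "not_started"
else:
    latest = events.sort("timestamp", descending=True).head(1)["event_type"][0]
    if latest in ("completed", "failed"):
        status = latest
    else:
        status = "in_progress"  # started / feature_started / feature_completed

print(status)  # -> "in_progress"
```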
@@ -0,0 +1,34 @@ metaxy/migrations/__init__.py

```python
"""Migration system for metadata version updates."""

from metaxy.metadata_store.system_tables import SystemTableStorage
from metaxy.migrations.detector import detect_migration
from metaxy.migrations.executor import MigrationExecutor
from metaxy.migrations.models import (
    CustomMigration,
    DiffMigration,
    FullGraphMigration,
    Migration,
    MigrationResult,
)
from metaxy.migrations.ops import (
    BaseOperation,
    DataVersionReconciliation,
    MetadataBackfill,
)

__all__ = [
    # Core migration types
    "Migration",
    "DiffMigration",
    "FullGraphMigration",
    "CustomMigration",
    "MigrationResult",
    # Operations (for custom migrations)
    "BaseOperation",
    "DataVersionReconciliation",
    "MetadataBackfill",
    # Migration workflow
    "detect_migration",
    "MigrationExecutor",
    "SystemTableStorage",
]
```
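For reference, a typical import of this public surface; the commented call sketches the workflow entry point (store and graph setup are elided, and the `ops` value is just the built-in reconciliation operation shipped above):

```python
# Public API as re-exported by metaxy.migrations.
from metaxy.migrations import (
    MigrationExecutor,
    SystemTableStorage,
    detect_migration,
)

# Detect changes against the latest stored snapshot and write a migration YAML:
# migration = detect_migration(
#     store, ops=[{"type": "metaxy.migrations.ops.DataVersionReconciliation"}]
# )
```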
@@ -0,0 +1,153 @@ metaxy/migrations/detector.py

```python
"""Feature change detection for automatic migration generation."""

from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING, Any

from metaxy.graph.diff.differ import GraphDiffer
from metaxy.models.feature import FeatureGraph

if TYPE_CHECKING:
    from metaxy.metadata_store.base import MetadataStore
    from metaxy.migrations.models import DiffMigration


def detect_migration(
    store: "MetadataStore",
    from_snapshot_version: str | None = None,
    ops: list[dict[str, Any]] | None = None,
    migrations_dir: Path | None = None,
    name: str | None = None,
) -> "DiffMigration | None":
    """Detect migration needed between snapshots and write YAML file.

    Compares the latest snapshot in the store (or the specified from_snapshot_version)
    with the current active graph to detect changes and generate a migration YAML file.

    Args:
        store: Metadata store containing snapshot metadata
        from_snapshot_version: Source snapshot version (defaults to latest in store)
        ops: List of operation dicts with a "type" field. Required; for example
            [{"type": "metaxy.migrations.ops.DataVersionReconciliation"}]
        migrations_dir: Directory to write migration YAML (defaults to .metaxy/migrations/)
        name: Migration name (creates {timestamp}_{name} ID and filename)

    Returns:
        DiffMigration if changes detected and written, None otherwise

    Example:
        >>> # Compare latest snapshot in store vs current graph
        >>> with store:
        ...     migration = detect_migration(
        ...         store, ops=[{"type": "metaxy.migrations.ops.DataVersionReconciliation"}]
        ...     )
        ...     if migration:
        ...         print(f"Migration written to {migration.yaml_path}")

        >>> # Use custom operation
        >>> migration = detect_migration(store, ops=[{"type": "myproject.ops.CustomOp"}])

        >>> # Use custom name
        >>> migration = detect_migration(store, name="example_migration")
    """
    from metaxy.migrations.models import DiffMigration

    differ = GraphDiffer()

    # Get from_snapshot_version (use latest if not specified)
    if from_snapshot_version is None:
        snapshots = store.read_graph_snapshots()
        if snapshots.height == 0:
            # No snapshots in store - nothing to migrate from
            return None
        from_snapshot_version = snapshots["snapshot_version"][0]

    # At this point, from_snapshot_version is guaranteed to be a str
    assert from_snapshot_version is not None  # Type narrowing for type checker

    # Get to_snapshot_version from current active graph
    active_graph = FeatureGraph.get_active()
    if len(active_graph.features_by_key) == 0:
        # No features in active graph - nothing to migrate to
        return None

    to_snapshot_version = active_graph.snapshot_version

    # Check if versions are the same (no changes)
    if from_snapshot_version == to_snapshot_version:
        return None

    # Load snapshot data using GraphDiffer
    try:
        from_snapshot_data = differ.load_snapshot_data(store, from_snapshot_version)
    except ValueError:
        # Snapshot not found - nothing to migrate from
        return None

    # Build snapshot data for to_snapshot (current graph)
    to_snapshot_data = active_graph.to_snapshot()

    # Compute GraphDiff using GraphDiffer
    graph_diff = differ.diff(
        from_snapshot_data,
        to_snapshot_data,
        from_snapshot_version,
        to_snapshot_version,
    )

    # Check if there are any changes
    if not graph_diff.has_changes:
        return None

    # Generate migration ID (timestamp first for sorting)
    timestamp = datetime.now(timezone.utc)
    timestamp_str = timestamp.strftime("%Y%m%d_%H%M%S")
    if name is not None:
        migration_id = f"{timestamp_str}_{name}"
    else:
        migration_id = f"{timestamp_str}"

    # ops is required - caller must specify
    if ops is None:
        raise ValueError(
            "ops parameter is required - must explicitly specify migration operations. "
            "Example: ops=[{'type': 'metaxy.migrations.ops.DataVersionReconciliation'}]"
        )

    # Default migrations directory
    if migrations_dir is None:
        migrations_dir = Path(".metaxy/migrations")

    migrations_dir.mkdir(parents=True, exist_ok=True)

    # Find parent migration (latest migration in chain)
    from metaxy.migrations.loader import find_latest_migration

    parent = find_latest_migration(migrations_dir)
    if parent is None:
        parent = "initial"

    # Create minimal DiffMigration - affected_features and description are computed on-demand
    migration = DiffMigration(
        migration_id=migration_id,
        created_at=timestamp,
        parent=parent,
        from_snapshot_version=from_snapshot_version,
        to_snapshot_version=to_snapshot_version,
        ops=ops,
    )

    # Write migration YAML file
    import yaml

    yaml_path = migrations_dir / f"{migration_id}.yaml"
    migration_yaml = {
        "id": migration.migration_id,
        "created_at": migration.created_at.isoformat(),
        "parent": migration.parent,
        "from_snapshot_version": migration.from_snapshot_version,
        "to_snapshot_version": migration.to_snapshot_version,
        "ops": migration.ops,
    }

    with open(yaml_path, "w") as f:
        yaml.safe_dump(migration_yaml, f, sort_keys=False, default_flow_style=False)

    return migration
```
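A sketch of the on-disk result, reproducing the same `yaml.safe_dump` call with made-up id and snapshot versions (the keys match the `migration_yaml` dict above):

```python
# Shape of the YAML file that detect_migration() writes; values are illustrative.
import yaml

migration_yaml = {
    "id": "20240101_000000_example_migration",
    "created_at": "2024-01-01T00:00:00+00:00",
    "parent": "initial",
    "from_snapshot_version": "<old snapshot version>",
    "to_snapshot_version": "<new snapshot version>",
    "ops": [{"type": "metaxy.migrations.ops.DataVersionReconciliation"}],
}
print(yaml.safe_dump(migration_yaml, sort_keys=False, default_flow_style=False))
```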