metaxy 0.0.1.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,957 @@
"""System table storage layer for metadata store.

Provides type-safe access to migration system tables using struct-based storage.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

import narwhals as nw
import polars as pl

from metaxy.metadata_store.exceptions import SystemDataNotFoundError
from metaxy.metadata_store.system import (
    FEATURE_VERSIONS_KEY,
)
from metaxy.metadata_store.system.events import (
    COL_EVENT_TYPE,
    COL_EXECUTION_ID,
    COL_FEATURE_KEY,
    COL_PAYLOAD,
    COL_PROJECT,
    COL_TIMESTAMP,
    Event,
    EventType,
    MigrationStatus,
)
from metaxy.metadata_store.system.keys import EVENTS_KEY
from metaxy.metadata_store.system.models import POLARS_SCHEMAS, FeatureVersionsModel
from metaxy.models.constants import (
    METAXY_FULL_DEFINITION_VERSION,
    METAXY_SNAPSHOT_VERSION,
)
from metaxy.models.feature import FeatureGraph
from metaxy.models.types import FeatureKey, SnapshotPushResult

if TYPE_CHECKING:
    from metaxy.metadata_store import MetadataStore


class SystemTableStorage:
    """Storage layer for migration system tables.

    Provides type-safe access to migration snapshots, migrations, and events.
    Uses struct-based storage (not JSON/bytes) for efficient queries.

    Status is computed at query-time from events (append-only).

    Usage:
        ```python
        with store:
            storage = SystemTableStorage(store)
            storage.write_event(Event.migration_started(...))
        ```
    """

    def __init__(self, store: MetadataStore):
        """Initialize storage layer.

        Args:
            store: Metadata store to use for system tables
        """
        self.store = store

    # ========== Migrations ==========
    # Note: Migration definitions are stored in YAML files (git), not in the database.
    # Only execution events are stored in DB for tracking progress and state.

    def list_executed_migrations(self, project: str | None = None) -> list[str]:
        """List all migration IDs that have execution events.

        Args:
            project: Optional project name to filter by. If None, returns migrations for all projects.

        Returns:
            List of migration IDs that have been started/executed

        Note:
            The store must already be open when calling this method.
        """
        events = self._read_system_metadata(EVENTS_KEY)

        # Apply project filter only if specified
        if project is not None:
            events = events.filter(nw.col(COL_PROJECT) == project)

        return (
            events.select(COL_EXECUTION_ID)
            .unique()
            .collect()
            .to_polars()[COL_EXECUTION_ID]
            .to_list()
        )

    def write_event(self, event: Event) -> None:
        """Write migration event to system table using typed event models.

        This is the preferred way to write events with full type safety.

        Args:
            event: A typed migration event created via Event classmethods

        Note:
            The store must already be open when calling this method.

        Example:
            ```python
            storage.write_event(
                Event.migration_started(project="my_project", migration_id="m001")
            )

            storage.write_event(
                Event.feature_completed(
                    project="my_project",
                    migration_id="m001",
                    feature_key="feature/a",
                    rows_affected=100,
                )
            )
            ```
        """
        record = event.to_polars()
        self.store.write_metadata(EVENTS_KEY, record)

    def get_migration_events(
        self, migration_id: str, project: str | None = None
    ) -> pl.DataFrame:
        """Get all events for a migration.

        Args:
            migration_id: Migration ID
            project: Optional project name to filter by. If None, returns events for all projects.

        Returns:
            Polars DataFrame with events sorted by timestamp

        Note:
            The store must already be open when calling this method.
        """
        # _read_system_metadata returns an empty LazyFrame with the correct
        # schema if the table doesn't exist yet.
        events = self._read_system_metadata(EVENTS_KEY)

        events = events.filter(nw.col(COL_EXECUTION_ID) == migration_id)

        # Apply project filter only if specified; comparing the column against
        # None would drop every row.
        if project is not None:
            events = events.filter(nw.col(COL_PROJECT) == project)

        return events.sort(COL_TIMESTAMP, descending=False).collect().to_polars()

    def get_migration_status(
        self,
        migration_id: str,
        project: str | None = None,
        expected_features: list[str] | None = None,
    ) -> MigrationStatus:
        """Compute migration status from events at query-time.

        Args:
            migration_id: Migration ID
            project: Optional project name to filter by. If None, returns status across all projects.
            expected_features: Optional list of feature keys that should be completed.
                If provided, will check that ALL expected features have completed successfully
                before returning COMPLETED status, even if a migration_completed event exists.
                This allows detecting when a migration YAML has been modified after completion.

        Returns:
            MigrationStatus enum value
        """
        events_df = self.get_migration_events(migration_id, project=project)

        if events_df.height == 0:
            return MigrationStatus.NOT_STARTED

        # Get latest event
        latest_event = events_df.sort(COL_TIMESTAMP, descending=True).head(1)
        latest_event_type = latest_event[COL_EVENT_TYPE][0]

        # If expected_features is provided, verify ALL features have completed.
        # This ensures we detect when operations are added to an already-completed migration.
        if expected_features is not None and len(expected_features) > 0:
            completed_features = set(self.get_completed_features(migration_id, project))
            expected_features_set = set(expected_features)

            # Check if all expected features have been completed
            all_features_completed = expected_features_set.issubset(completed_features)

            if not all_features_completed:
                # Some features are missing - migration is not complete
                if latest_event_type in (
                    EventType.MIGRATION_STARTED.value,
                    EventType.FEATURE_MIGRATION_STARTED.value,
                    EventType.FEATURE_MIGRATION_COMPLETED.value,
                    EventType.FEATURE_MIGRATION_FAILED.value,
                ):
                    return MigrationStatus.IN_PROGRESS
                elif latest_event_type == EventType.MIGRATION_FAILED.value:
                    return MigrationStatus.FAILED
                else:
                    # Migration was marked complete but features are missing (YAML was modified)
                    return MigrationStatus.IN_PROGRESS
            # If all features completed, continue with normal status logic below

        if latest_event_type == EventType.MIGRATION_COMPLETED.value:
            return MigrationStatus.COMPLETED
        elif latest_event_type == EventType.MIGRATION_FAILED.value:
            return MigrationStatus.FAILED
        elif latest_event_type in (
            EventType.MIGRATION_STARTED.value,
            EventType.FEATURE_MIGRATION_STARTED.value,
            EventType.FEATURE_MIGRATION_COMPLETED.value,
            EventType.FEATURE_MIGRATION_FAILED.value,
        ):
            return MigrationStatus.IN_PROGRESS

        return MigrationStatus.NOT_STARTED

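    # Walking the status logic above with a concrete event log -- an
    # illustrative sketch (migration and feature ids here are hypothetical):
    #
    #     storage.write_event(Event.migration_started(project="p", migration_id="m001"))
    #     storage.get_migration_status("m001", project="p")
    #     # -> MigrationStatus.IN_PROGRESS: the latest event is migration_started
    #
    #     # With expected_features, a migration marked completed whose YAML later
    #     # gained a new feature operation is reported as IN_PROGRESS again:
    #     storage.get_migration_status(
    #         "m001", project="p", expected_features=["feature/a", "feature/new"]
    #     )
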
    def is_feature_completed(
        self, migration_id: str, feature_key: str, project: str | None = None
    ) -> bool:
        """Check if a specific feature completed successfully in a migration.

        Args:
            migration_id: Migration ID
            feature_key: Feature key to check
            project: Optional project name to filter by. If None, checks across all projects.

        Returns:
            True if feature completed without errors
        """
        events_df = self.get_migration_events(migration_id, project)

        # Filter and check for completed events without errors
        events_df = (
            events_df.filter(
                (pl.col(COL_FEATURE_KEY) == feature_key)
                & (
                    pl.col(COL_EVENT_TYPE)
                    == EventType.FEATURE_MIGRATION_COMPLETED.value
                )
            )
            .with_columns(
                pl.col(COL_PAYLOAD)
                .str.json_path_match("$.error_message")
                .alias("error_message")
            )
            .filter(pl.col("error_message").is_null() | (pl.col("error_message") == ""))
        )

        # Check if any completed event has no error
        return events_df.height > 0

    def get_completed_features(
        self, migration_id: str, project: str | None = None
    ) -> list[str]:
        """Get list of features that completed successfully in a migration.

        Args:
            migration_id: Migration ID
            project: Optional project name to filter by. If None, returns features for all projects.

        Returns:
            List of feature keys
        """
        events_df = self.get_migration_events(migration_id, project=project)

        # Filter and extract completed features
        events_df = (
            events_df.filter(
                pl.col(COL_EVENT_TYPE) == EventType.FEATURE_MIGRATION_COMPLETED.value
            )
            .with_columns(
                pl.col(COL_PAYLOAD)
                .str.json_path_match("$.error_message")
                .alias("error_message")
            )
            .filter(pl.col("error_message").is_null() | (pl.col("error_message") == ""))
            .select(COL_FEATURE_KEY)
            .unique()
        )

        return events_df[COL_FEATURE_KEY].to_list()

    def get_failed_features(
        self, migration_id: str, project: str | None = None
    ) -> dict[str, str]:
        """Get features that failed in a migration with error messages.

        Only returns features whose LATEST event is a failure. If a feature
        failed and then succeeded on retry, it won't be included here.

        Args:
            migration_id: Migration ID
            project: Optional project name to filter by. If None, returns features for all projects.

        Returns:
            Dict mapping feature key to error message
        """
        events_df = self.get_migration_events(migration_id, project=project)

        if events_df.height == 0:
            return {}

        # Get completed features (these succeeded, even if they failed before)
        completed_features = set(self.get_completed_features(migration_id, project))

        # Filter for failed events, excluding features that later completed
        failed_events = (
            events_df.filter(
                pl.col(COL_EVENT_TYPE) == EventType.FEATURE_MIGRATION_FAILED.value
            )
            .with_columns(
                pl.col(COL_PAYLOAD)
                .str.json_path_match("$.error_message")
                .alias("error_message")
            )
            # Get latest failed event per feature
            .sort(COL_TIMESTAMP, descending=True)
            .group_by(COL_FEATURE_KEY, maintain_order=True)
            .agg([pl.col("error_message").first().alias("error_message")])
            # Exclude features that eventually completed
            .filter(~pl.col(COL_FEATURE_KEY).is_in(list(completed_features)))
            .select([COL_FEATURE_KEY, "error_message"])
        )

        # Convert to dict
        return dict(
            zip(
                failed_events[COL_FEATURE_KEY].to_list(),
                failed_events["error_message"].to_list(),
            )
        )

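    # A retry loop sketched on top of the two queries above (illustrative;
    # `rerun_feature` is a hypothetical callable, not provided by this module):
    #
    #     failed = storage.get_failed_features("m001", project="p")
    #     for feature_key, error in failed.items():
    #         print(f"retrying {feature_key}: last error was {error!r}")
    #         rerun_feature(feature_key)
    #
    # A feature that succeeds on retry emits a newer feature_migration_completed
    # event, so it drops out of get_failed_features() on the next call.
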
    def get_migration_summary(
        self,
        migration_id: str,
        project: str | None = None,
        expected_features: list[str] | None = None,
    ) -> dict[str, Any]:
        """Get a comprehensive summary of migration execution status.

        This is a convenience method that returns all migration information
        in a single call, avoiding multiple queries.

        Args:
            migration_id: Migration ID
            project: Optional project name to filter by. If None, returns summary across all projects.
            expected_features: Optional list of feature keys that should be completed.
                If provided, will be used to determine if migration is truly complete.

        Returns:
            Dict containing:
            - status: MigrationStatus enum value
            - completed_features: List of completed feature keys
            - failed_features: Dict mapping failed feature keys to error messages
            - total_features_processed: Count of completed + failed features
        """
        status = self.get_migration_status(migration_id, project, expected_features)
        completed = self.get_completed_features(migration_id, project)
        failed = self.get_failed_features(migration_id, project)

        return {
            "status": status,
            "completed_features": completed,
            "failed_features": failed,
            "total_features_processed": len(completed) + len(failed),
        }

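    # Shape of the returned summary (illustrative values):
    #
    #     storage.get_migration_summary("m001", project="p")
    #     # {
    #     #     "status": MigrationStatus.COMPLETED,
    #     #     "completed_features": ["feature/a", "feature/b"],
    #     #     "failed_features": {},
    #     #     "total_features_processed": 2,
    #     # }
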
    # ========== Convenience Methods for Reading Migration Data ==========

    def read_migration_events(
        self, project: str | None = None, migration_id: str | None = None
    ) -> pl.DataFrame:
        """Read all migration events, optionally filtered by project and/or migration ID.

        Args:
            project: Optional project name to filter by. If None, returns events for all projects.
            migration_id: Optional migration ID to filter by. If None, returns events for all migrations.

        Returns:
            Polars DataFrame with migration events
        """
        lazy = self._read_system_metadata(EVENTS_KEY)

        # Apply filters if specified
        if migration_id is not None:
            lazy = lazy.filter(nw.col(COL_EXECUTION_ID) == migration_id)

        if project is not None:
            lazy = lazy.filter(nw.col(COL_PROJECT) == project)

        # Convert to Polars DataFrame
        return lazy.sort(COL_TIMESTAMP, descending=False).collect().to_polars()

    def read_migration_progress(
        self, project: str | None = None
    ) -> dict[str, dict[str, Any]]:
        """Read migration progress across all migrations.

        Args:
            project: Optional project name to filter by. If None, returns progress for all projects.

        Returns:
            Dict mapping migration_id to progress information including:
            - status: "not_started", "in_progress", "completed", "failed"
            - completed_features: List of completed feature keys
            - failed_features: Dict of failed feature keys to error messages
            - total_rows_affected: Total rows affected across all features
        """
        # Get all migration IDs
        migration_ids = self.list_executed_migrations(project)

        progress = {}
        for mid in migration_ids:
            events_df = self.read_migration_events(project=project, migration_id=mid)

            if events_df.height == 0:
                continue

            # Get latest event for status (compared against EventType values,
            # consistent with get_migration_status)
            latest_event = events_df.sort(COL_TIMESTAMP, descending=True).head(1)
            latest_event_type = latest_event[COL_EVENT_TYPE][0]

            if latest_event_type == EventType.MIGRATION_COMPLETED.value:
                status = "completed"
            elif latest_event_type == EventType.MIGRATION_FAILED.value:
                status = "failed"
            elif latest_event_type in (
                EventType.MIGRATION_STARTED.value,
                EventType.FEATURE_MIGRATION_STARTED.value,
                EventType.FEATURE_MIGRATION_COMPLETED.value,
            ):
                status = "in_progress"
            else:
                status = "not_started"

            # Get completed and failed features using JSON path (polars operations on collected data)
            feature_events = events_df.filter(
                pl.col(COL_EVENT_TYPE) == EventType.FEATURE_MIGRATION_COMPLETED.value
            ).with_columns(
                [
                    pl.col(COL_PAYLOAD)
                    .str.json_path_match("$.error_message")
                    .alias("error_message"),
                    pl.col(COL_PAYLOAD)
                    .str.json_path_match("$.rows_affected")
                    .cast(pl.Int64)
                    .fill_null(0)
                    .alias("rows_affected"),
                ]
            )

            # Split into completed and failed
            completed_df = feature_events.filter(
                pl.col("error_message").is_null() | (pl.col("error_message") == "")
            )
            failed_df = feature_events.filter(
                pl.col("error_message").is_not_null() & (pl.col("error_message") != "")
            )

            completed_features = completed_df[COL_FEATURE_KEY].unique().to_list()
            failed_features = dict(
                zip(
                    failed_df[COL_FEATURE_KEY].to_list(),
                    failed_df["error_message"].to_list(),
                )
            )
            total_rows = int(feature_events["rows_affected"].sum() or 0)

            progress[mid] = {
                "status": status,
                "completed_features": completed_features,
                "failed_features": failed_features,
                "total_rows_affected": total_rows,
            }

        return progress

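    # Rendering the progress mapping returned above (illustrative sketch):
    #
    #     for mid, info in storage.read_migration_progress(project="p").items():
    #         done, failed = info["completed_features"], info["failed_features"]
    #         print(
    #             f"{mid}: {info['status']} ({len(done)} ok, {len(failed)} failed, "
    #             f"{info['total_rows_affected']} rows)"
    #         )
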
    def read_applied_migrations(
        self, project: str | None = None
    ) -> list[dict[str, Any]]:
        """Read all applied (completed) migrations with their details.

        Args:
            project: Optional project name to filter by. If None, returns migrations for all projects.

        Returns:
            List of dicts containing migration details for completed migrations:
            - migration_id: Migration ID
            - project: Project name (if available)
            - completed_at: Timestamp when migration completed
            - features_count: Number of features affected
            - rows_affected: Total rows affected

        Note:
            The store must already be open when calling this method.
        """
        lazy = self._read_system_metadata(EVENTS_KEY)

        # Filter to only completed migrations using narwhals (EventType value,
        # consistent with the rest of the module)
        completed_events = lazy.filter(
            nw.col(COL_EVENT_TYPE) == EventType.MIGRATION_COMPLETED.value
        )

        if project is not None:
            completed_events = completed_events.filter(nw.col(COL_PROJECT) == project)

        # Convert to polars LazyFrame and collect
        completed_df = completed_events.to_native().collect()

        if completed_df.height == 0:
            return []

        # Get all events for all migrations at once
        all_events = self.read_migration_events(project=project)

        # Extract rows_affected from payload using JSON path (polars operations)
        feature_events = all_events.filter(
            pl.col(COL_EVENT_TYPE) == EventType.FEATURE_MIGRATION_COMPLETED.value
        ).with_columns(
            pl.col(COL_PAYLOAD)
            .str.json_path_match("$.rows_affected")
            .cast(pl.Int64)
            .fill_null(0)
            .alias("rows_affected")
        )

        # Group by execution_id to get aggregated stats
        migration_stats = feature_events.group_by(COL_EXECUTION_ID).agg(
            [
                pl.col(COL_FEATURE_KEY).n_unique().alias("features_count"),
                pl.col("rows_affected").sum().alias("rows_affected"),
            ]
        )

        # Join with completed events to get project and timestamp
        result_df = completed_df.join(
            migration_stats, on=COL_EXECUTION_ID, how="left"
        ).select(
            [
                COL_EXECUTION_ID,
                COL_PROJECT,
                pl.col(COL_TIMESTAMP).alias("completed_at"),
                pl.col("features_count").fill_null(0),
                pl.col("rows_affected").fill_null(0).cast(pl.Int64),
            ]
        )

        # Convert to list of dicts
        return result_df.to_dicts()

    def push_graph_snapshot(
        self, tags: dict[str, Any] | None = None
    ) -> SnapshotPushResult:
        """Record all features in graph with a graph snapshot version.

        This should be called during CD (Continuous Deployment) to record what
        feature versions are being deployed. Typically invoked via `metaxy graph push`.

        Records all features in the graph with the same snapshot_version, representing
        a consistent state of the entire feature graph based on code definitions.

        The snapshot_version is a deterministic hash of all feature_version hashes
        in the graph, making it idempotent - calling multiple times with the
        same feature definitions produces the same snapshot_version.

        This method detects three scenarios:

        1. New snapshot (computational changes): No existing rows with this snapshot_version
        2. Metadata-only changes: Snapshot exists but some features have different feature_spec_version
        3. No changes: Snapshot exists with identical feature_spec_versions for all features

        Args:
            tags: Optional dictionary of custom tags to attach to the snapshot
                (e.g., git commit SHA).

        Note:
            The store must already be open when calling this method.

        Returns:
            SnapshotPushResult
        """
        tags = tags or {}

        graph = FeatureGraph.get_active()

        # Check if this exact snapshot already exists for this project
        latest_pushed_snapshot = self._read_latest_snapshot_data(graph.snapshot_version)
        current_snapshot_dict = graph.to_snapshot()

        # Convert to DataFrame - need to serialize feature_spec dict to JSON string
        # and add metaxy_snapshot_version and recorded_at columns
        import json
        from datetime import datetime, timezone

        current_snapshot = pl.concat(
            [
                FeatureVersionsModel.model_validate(
                    {
                        "feature_key": k,
                        **{
                            field: (
                                json.dumps(val)
                                if field in ("feature_spec", "feature_schema")
                                else val
                            )
                            for field, val in v.items()
                        },
                        METAXY_SNAPSHOT_VERSION: graph.snapshot_version,
                        "recorded_at": datetime.now(timezone.utc),
                        "tags": json.dumps(tags),
                    }
                ).to_polars()
                for k, v in current_snapshot_dict.items()
            ]
        )

        # If this snapshot_version has never been pushed, every feature in the
        # graph is pushed; otherwise only features whose definitions changed
        # since the last push.
        to_push = current_snapshot
        already_pushed = len(latest_pushed_snapshot) != 0

        if already_pushed:
            # Identify features that have updated definitions since the last push:
            # join the full current snapshot with the latest pushed (keeping all columns)
            pushed_with_current = current_snapshot.join(
                latest_pushed_snapshot.select(
                    "feature_key",
                    pl.col(METAXY_FULL_DEFINITION_VERSION).alias(
                        f"{METAXY_FULL_DEFINITION_VERSION}_pushed"
                    ),
                ),
                on=["feature_key"],
                how="left",
            )

            to_push = pl.concat(
                [
                    # these are records that for some reason have not been pushed previously
                    pushed_with_current.filter(
                        pl.col(f"{METAXY_FULL_DEFINITION_VERSION}_pushed").is_null()
                    ),
                    # these are the records with actual changes
                    pushed_with_current.filter(
                        pl.col(f"{METAXY_FULL_DEFINITION_VERSION}_pushed").is_not_null()
                    ).filter(
                        pl.col(METAXY_FULL_DEFINITION_VERSION)
                        != pl.col(f"{METAXY_FULL_DEFINITION_VERSION}_pushed")
                    ),
                ]
            ).drop(f"{METAXY_FULL_DEFINITION_VERSION}_pushed")

        if len(to_push) > 0:
            self.store.write_metadata(FEATURE_VERSIONS_KEY, to_push)

        # updated_features only populated when updating existing features
        updated_features = (
            to_push["feature_key"].to_list()
            if already_pushed and len(to_push) > 0
            else []
        )

        return SnapshotPushResult(
            snapshot_version=graph.snapshot_version,
            already_pushed=already_pushed,
            updated_features=updated_features,
        )

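    # A typical CD invocation, tagging the snapshot with the deploying commit
    # (illustrative sketch; the GITHUB_SHA variable is an assumption about the
    # CI environment, not something this module requires):
    #
    #     import os
    #
    #     result = storage.push_graph_snapshot(tags={"git_sha": os.environ["GITHUB_SHA"]})
    #     if result.already_pushed and not result.updated_features:
    #         print(f"snapshot {result.snapshot_version} already recorded; nothing written")
    #     else:
    #         print(f"pushed {result.snapshot_version}; updated: {result.updated_features}")
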
    def _read_system_metadata(self, key: FeatureKey) -> nw.LazyFrame[Any]:
        """Read system metadata.

        System tables are handled specially by MetadataStore.read_metadata - they don't
        require feature plan resolution when current_only=False.

        Note:
            The store must already be open when calling this method.

        Returns:
            LazyFrame if table exists, empty LazyFrame with correct schema if it doesn't
        """
        try:
            # read_metadata handles system tables specially (no feature plan needed)
            return self.store.read_metadata(key, current_only=False)
        except SystemDataNotFoundError:
            return nw.from_native(pl.DataFrame(schema=POLARS_SCHEMAS[key])).lazy()

    def _read_latest_snapshot_data(
        self,
        snapshot_version: str,
    ) -> pl.DataFrame:
        """Read the latest snapshot data for a given snapshot version.

        The same snapshot version may accumulate multiple rows per feature as
        non-topological metadata (such as Pydantic fields or spec.metadata/tags)
        changes. This method retrieves the latest row for each feature pushed
        to the metadata store.

        Returns:
            Polars DataFrame (materialized) with the latest data. Empty if table doesn't exist or snapshot not found.
        """
        graph = FeatureGraph.get_active()

        # Read system metadata
        sys_meta = self._read_system_metadata(FEATURE_VERSIONS_KEY)

        # Filter to this snapshot and the active graph's project; an empty
        # graph matches nothing (sentinel project value)
        lazy = sys_meta.filter(
            nw.col(METAXY_SNAPSHOT_VERSION) == snapshot_version,
            nw.col("project")
            == (
                next(iter(graph.features_by_key.values())).project
                if len(graph.features_by_key) > 0
                else "_empty_graph_"
            ),
        )

        # Deduplicate using Polars (collect and use native operations)
        return (
            lazy.collect()
            .to_polars()
            .sort("recorded_at", descending=True)
            .unique(subset=["feature_key"], keep="first")
        )

    def read_graph_snapshots(self, project: str | None = None) -> pl.DataFrame:
        """Read recorded graph snapshots from the feature_versions system table.

        Args:
            project: Project name to filter by. If None, returns snapshots from all projects.

        Returns:
            Polars DataFrame with snapshot information, sorted by recorded_at
            descending, with columns:

            - snapshot_version: Unique identifier for each graph snapshot
            - recorded_at: Timestamp when the snapshot was recorded
            - feature_count: Number of features in this snapshot

        Raises:
            StoreNotOpenError: If store is not open

        Example:
            ```py
            with store:
                storage = SystemTableStorage(store)
                # Get snapshots for a specific project
                snapshots = storage.read_graph_snapshots(project="my_project")
                latest_snapshot = snapshots[METAXY_SNAPSHOT_VERSION][0]
                print(f"Latest snapshot: {latest_snapshot}")

                # Get snapshots across all projects
                all_snapshots = storage.read_graph_snapshots()
            ```
        """
        # Read system metadata; an empty LazyFrame with the right schema is
        # returned if the table doesn't exist yet.
        versions_lazy = self._read_system_metadata(FEATURE_VERSIONS_KEY)

        # Build filters based on project parameter
        if project is not None:
            versions_lazy = versions_lazy.filter(nw.col("project") == project)

        # Materialize
        versions_df = versions_lazy.collect().to_polars()

        if versions_df.height == 0:
            # No snapshots recorded yet
            return pl.DataFrame(
                schema={
                    METAXY_SNAPSHOT_VERSION: pl.String,
                    "recorded_at": pl.Datetime("us"),
                    "feature_count": pl.UInt32,
                }
            )

        # Group by snapshot_version and get earliest recorded_at and count
        snapshots = (
            versions_df.group_by(METAXY_SNAPSHOT_VERSION)
            .agg(
                [
                    pl.col("recorded_at").min().alias("recorded_at"),
                    pl.col("feature_key").count().alias("feature_count"),
                ]
            )
            .sort("recorded_at", descending=True)
        )

        return snapshots

    def read_features(
        self,
        *,
        current: bool = True,
        snapshot_version: str | None = None,
        project: str | None = None,
    ) -> pl.DataFrame:
        """Read feature version information from the feature_versions system table.

        Args:
            current: If True, only return features from the current code snapshot.
                If False, must provide snapshot_version.
            snapshot_version: Specific snapshot version to filter by. Required if current=False.
            project: Project name to filter by. Defaults to None.

        Returns:
            Polars DataFrame with columns from FEATURE_VERSIONS_SCHEMA:
            - feature_key: Feature identifier
            - feature_version: Version hash of the feature
            - recorded_at: When this version was recorded
            - feature_spec: JSON serialized feature specification
            - feature_class_path: Python import path to the feature class
            - snapshot_version: Graph snapshot this feature belongs to

        Raises:
            StoreNotOpenError: If store is not open
            ValueError: If current=False but no snapshot_version provided

        Examples:
            ```py
            # Get features from current code
            with store:
                storage = SystemTableStorage(store)
                features = storage.read_features(current=True)
                print(f"Current graph has {len(features)} features")
            ```

            ```py
            # Get features from a specific snapshot
            with store:
                storage = SystemTableStorage(store)
                features = storage.read_features(current=False, snapshot_version="abc123")
                for row in features.iter_rows(named=True):
                    print(f"{row['feature_key']}: {row['metaxy_feature_version']}")
            ```
        """
        if not current and snapshot_version is None:
            raise ValueError("Must provide snapshot_version when current=False")

        if current:
            # Get current snapshot from active graph
            graph = FeatureGraph.get_active()
            snapshot_version = graph.snapshot_version

        # Read system metadata; an empty LazyFrame with the right schema is
        # returned if the table doesn't exist yet.
        versions_lazy = self._read_system_metadata(FEATURE_VERSIONS_KEY)

        # Build filters
        filters = [nw.col(METAXY_SNAPSHOT_VERSION) == snapshot_version]
        if project is not None:
            filters.append(nw.col("project") == project)

        for f in filters:
            versions_lazy = versions_lazy.filter(f)

        # Materialize
        return versions_lazy.collect().to_polars()

    def load_graph_from_snapshot(
        self,
        snapshot_version: str,
        project: str | None = None,
        *,
        class_path_overrides: dict[str, str] | None = None,
        force_reload: bool = False,
    ) -> FeatureGraph:
        """Load and reconstruct a FeatureGraph from a stored snapshot.

        This is a convenience method that encapsulates the pattern of:

        1. Reading feature metadata for a snapshot
        2. Building the snapshot data dictionary
        3. Reconstructing the FeatureGraph from snapshot data

        Args:
            snapshot_version: The snapshot version to load
            project: Optional project name to filter by
            class_path_overrides: Optional dict mapping feature_key to new class path
                for features that have been moved/renamed
            force_reload: If True, force reimport of feature classes even if cached

        Returns:
            Reconstructed FeatureGraph

        Raises:
            ValueError: If no features found for the snapshot version
            ImportError: If feature classes cannot be imported at their recorded paths

        Note:
            The store must already be open when calling this method.

        Example:
            ```python
            with store:
                storage = SystemTableStorage(store)
                graph = storage.load_graph_from_snapshot(
                    snapshot_version="abc123",
                    project="my_project"
                )
                print(f"Loaded {len(graph.features_by_key)} features")
            ```
        """
        import json

        # Read features for this snapshot
        features_df = self.read_features(
            current=False,
            snapshot_version=snapshot_version,
            project=project,
        )

        if features_df.height == 0:
            raise ValueError(
                f"No features recorded for snapshot {snapshot_version}"
                + (f" in project {project}" if project else "")
            )

        # Build snapshot data dict for FeatureGraph.from_snapshot()
        snapshot_data = {
            row["feature_key"]: {
                "feature_spec": json.loads(row["feature_spec"])
                if isinstance(row["feature_spec"], str)
                else row["feature_spec"],
                "feature_class_path": row["feature_class_path"],
                "metaxy_feature_version": row["feature_version"],
            }
            for row in features_df.iter_rows(named=True)
        }

        # Reconstruct graph from snapshot
        return FeatureGraph.from_snapshot(
            snapshot_data,
            class_path_overrides=class_path_overrides,
            force_reload=force_reload,
        )