metaxy-0.0.1.dev3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. metaxy/__init__.py +170 -0
  2. metaxy/_packaging.py +96 -0
  3. metaxy/_testing/__init__.py +55 -0
  4. metaxy/_testing/config.py +43 -0
  5. metaxy/_testing/metaxy_project.py +780 -0
  6. metaxy/_testing/models.py +111 -0
  7. metaxy/_testing/parametric/__init__.py +13 -0
  8. metaxy/_testing/parametric/metadata.py +664 -0
  9. metaxy/_testing/pytest_helpers.py +74 -0
  10. metaxy/_testing/runbook.py +533 -0
  11. metaxy/_utils.py +35 -0
  12. metaxy/_version.py +1 -0
  13. metaxy/cli/app.py +97 -0
  14. metaxy/cli/console.py +13 -0
  15. metaxy/cli/context.py +167 -0
  16. metaxy/cli/graph.py +610 -0
  17. metaxy/cli/graph_diff.py +290 -0
  18. metaxy/cli/list.py +46 -0
  19. metaxy/cli/metadata.py +317 -0
  20. metaxy/cli/migrations.py +999 -0
  21. metaxy/cli/utils.py +268 -0
  22. metaxy/config.py +680 -0
  23. metaxy/entrypoints.py +296 -0
  24. metaxy/ext/__init__.py +1 -0
  25. metaxy/ext/dagster/__init__.py +54 -0
  26. metaxy/ext/dagster/constants.py +10 -0
  27. metaxy/ext/dagster/dagster_type.py +156 -0
  28. metaxy/ext/dagster/io_manager.py +200 -0
  29. metaxy/ext/dagster/metaxify.py +512 -0
  30. metaxy/ext/dagster/observable.py +115 -0
  31. metaxy/ext/dagster/resources.py +27 -0
  32. metaxy/ext/dagster/selection.py +73 -0
  33. metaxy/ext/dagster/table_metadata.py +417 -0
  34. metaxy/ext/dagster/utils.py +462 -0
  35. metaxy/ext/sqlalchemy/__init__.py +23 -0
  36. metaxy/ext/sqlalchemy/config.py +29 -0
  37. metaxy/ext/sqlalchemy/plugin.py +353 -0
  38. metaxy/ext/sqlmodel/__init__.py +13 -0
  39. metaxy/ext/sqlmodel/config.py +29 -0
  40. metaxy/ext/sqlmodel/plugin.py +499 -0
  41. metaxy/graph/__init__.py +29 -0
  42. metaxy/graph/describe.py +325 -0
  43. metaxy/graph/diff/__init__.py +21 -0
  44. metaxy/graph/diff/diff_models.py +446 -0
  45. metaxy/graph/diff/differ.py +769 -0
  46. metaxy/graph/diff/models.py +443 -0
  47. metaxy/graph/diff/rendering/__init__.py +18 -0
  48. metaxy/graph/diff/rendering/base.py +323 -0
  49. metaxy/graph/diff/rendering/cards.py +188 -0
  50. metaxy/graph/diff/rendering/formatter.py +805 -0
  51. metaxy/graph/diff/rendering/graphviz.py +246 -0
  52. metaxy/graph/diff/rendering/mermaid.py +326 -0
  53. metaxy/graph/diff/rendering/rich.py +169 -0
  54. metaxy/graph/diff/rendering/theme.py +48 -0
  55. metaxy/graph/diff/traversal.py +247 -0
  56. metaxy/graph/status.py +329 -0
  57. metaxy/graph/utils.py +58 -0
  58. metaxy/metadata_store/__init__.py +32 -0
  59. metaxy/metadata_store/_ducklake_support.py +419 -0
  60. metaxy/metadata_store/base.py +1792 -0
  61. metaxy/metadata_store/bigquery.py +354 -0
  62. metaxy/metadata_store/clickhouse.py +184 -0
  63. metaxy/metadata_store/delta.py +371 -0
  64. metaxy/metadata_store/duckdb.py +446 -0
  65. metaxy/metadata_store/exceptions.py +61 -0
  66. metaxy/metadata_store/ibis.py +542 -0
  67. metaxy/metadata_store/lancedb.py +391 -0
  68. metaxy/metadata_store/memory.py +292 -0
  69. metaxy/metadata_store/system/__init__.py +57 -0
  70. metaxy/metadata_store/system/events.py +264 -0
  71. metaxy/metadata_store/system/keys.py +9 -0
  72. metaxy/metadata_store/system/models.py +129 -0
  73. metaxy/metadata_store/system/storage.py +957 -0
  74. metaxy/metadata_store/types.py +10 -0
  75. metaxy/metadata_store/utils.py +104 -0
  76. metaxy/metadata_store/warnings.py +36 -0
  77. metaxy/migrations/__init__.py +32 -0
  78. metaxy/migrations/detector.py +291 -0
  79. metaxy/migrations/executor.py +516 -0
  80. metaxy/migrations/generator.py +319 -0
  81. metaxy/migrations/loader.py +231 -0
  82. metaxy/migrations/models.py +528 -0
  83. metaxy/migrations/ops.py +447 -0
  84. metaxy/models/__init__.py +0 -0
  85. metaxy/models/bases.py +12 -0
  86. metaxy/models/constants.py +139 -0
  87. metaxy/models/feature.py +1335 -0
  88. metaxy/models/feature_spec.py +338 -0
  89. metaxy/models/field.py +263 -0
  90. metaxy/models/fields_mapping.py +307 -0
  91. metaxy/models/filter_expression.py +297 -0
  92. metaxy/models/lineage.py +285 -0
  93. metaxy/models/plan.py +232 -0
  94. metaxy/models/types.py +475 -0
  95. metaxy/py.typed +0 -0
  96. metaxy/utils/__init__.py +1 -0
  97. metaxy/utils/constants.py +2 -0
  98. metaxy/utils/exceptions.py +23 -0
  99. metaxy/utils/hashing.py +230 -0
  100. metaxy/versioning/__init__.py +31 -0
  101. metaxy/versioning/engine.py +656 -0
  102. metaxy/versioning/feature_dep_transformer.py +151 -0
  103. metaxy/versioning/ibis.py +249 -0
  104. metaxy/versioning/lineage_handler.py +205 -0
  105. metaxy/versioning/polars.py +189 -0
  106. metaxy/versioning/renamed_df.py +35 -0
  107. metaxy/versioning/types.py +63 -0
  108. metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
  109. metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
  110. metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
  111. metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
metaxy/metadata_store/system/storage.py
@@ -0,0 +1,957 @@
+ """System table storage layer for metadata store.
+
+ Provides type-safe access to migration system tables using struct-based storage.
+ """
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any
+
+ import narwhals as nw
+ import polars as pl
+
+ from metaxy.metadata_store.exceptions import SystemDataNotFoundError
+ from metaxy.metadata_store.system import (
+     FEATURE_VERSIONS_KEY,
+ )
+ from metaxy.metadata_store.system.events import (
+     COL_EVENT_TYPE,
+     COL_EXECUTION_ID,
+     COL_FEATURE_KEY,
+     COL_PAYLOAD,
+     COL_PROJECT,
+     COL_TIMESTAMP,
+     Event,
+     EventType,
+     MigrationStatus,
+ )
+ from metaxy.metadata_store.system.keys import EVENTS_KEY
+ from metaxy.metadata_store.system.models import POLARS_SCHEMAS, FeatureVersionsModel
+ from metaxy.models.constants import (
+     METAXY_FULL_DEFINITION_VERSION,
+     METAXY_SNAPSHOT_VERSION,
+ )
+ from metaxy.models.feature import FeatureGraph
+ from metaxy.models.types import FeatureKey, SnapshotPushResult
+
+ if TYPE_CHECKING:
+     from metaxy.metadata_store import MetadataStore
+
+
+ class SystemTableStorage:
+     """Storage layer for migration system tables.
+
+     Provides type-safe access to migration snapshots, migrations, and events.
+     Uses struct-based storage (not JSON/bytes) for efficient queries.
+
+     Status is computed at query-time from events (append-only).
+
+     Usage:
+         ```python
+         storage = SystemTableStorage(store)
+         storage.write_event(Event.migration_started(...))
+         ```
+     """
+
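A note on the event-sourced design above: status is never stored, only derived from the append-only event log. A minimal usage sketch, assuming an already-configured `MetadataStore` named `store` (its construction is elided) and hypothetical project and migration IDs:

```python
from metaxy.metadata_store.system.events import Event, MigrationStatus
from metaxy.metadata_store.system.storage import SystemTableStorage

# `store` is any opened MetadataStore implementation; construction elided.
with store:
    storage = SystemTableStorage(store)

    # Append-only: events are written, never updated in place.
    storage.write_event(
        Event.migration_started(project="demo", migration_id="m001")
    )

    # Status is recomputed from the event log at query time.
    status = storage.get_migration_status("m001", project="demo")
    assert status is MigrationStatus.IN_PROGRESS
```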
+     def __init__(self, store: MetadataStore):
+         """Initialize storage layer.
+
+         Args:
+             store: Metadata store to use for system tables
+         """
+         self.store = store
+
+     # ========== Migrations ==========
+     # Note: Migration definitions are stored in YAML files (git), not in the database.
+     # Only execution events are stored in DB for tracking progress and state.
+
+     def list_executed_migrations(self, project: str | None = None) -> list[str]:
+         """List all migration IDs that have execution events.
+
+         Args:
+             project: Optional project name to filter by. If None, returns migrations for all projects.
+
+         Returns:
+             List of migration IDs that have been started/executed
+
+         Note:
+             The store must already be open when calling this method.
+         """
+         events = self._read_system_metadata(EVENTS_KEY)
+
+         # Apply project filter only if specified
+         if project is not None:
+             events = events.filter(nw.col(COL_PROJECT) == project)
+
+         return (
+             events.select(COL_EXECUTION_ID)
+             .unique()
+             .collect()
+             .to_polars()[COL_EXECUTION_ID]
+             .to_list()
+         )
+
+     def write_event(self, event: Event) -> None:
+         """Write migration event to system table using typed event models.
+
+         This is the preferred way to write events with full type safety.
+
+         Args:
+             event: A typed migration event created via Event classmethods
+
+         Note:
+             The store must already be open when calling this method.
+
+         Example:
+             ```python
+             storage.write_event(
+                 Event.migration_started(project="my_project", migration_id="m001")
+             )
+
+             storage.write_event(
+                 Event.feature_completed(
+                     project="my_project",
+                     migration_id="m001",
+                     feature_key="feature/a",
+                     rows_affected=100,
+                 )
+             )
+             ```
+         """
+         record = event.to_polars()
+         self.store.write_metadata(EVENTS_KEY, record)
+
+     def get_migration_events(
+         self, migration_id: str, project: str | None = None
+     ) -> pl.DataFrame:
+         """Get all events for a migration.
+
+         Args:
+             migration_id: Migration ID
+             project: Optional project name to filter by. If None, returns events for all projects.
+
+         Returns:
+             Polars DataFrame with events sorted by timestamp
+
+         Note:
+             The store must already be open when calling this method.
+         """
+         events = self._read_system_metadata(EVENTS_KEY)
+
+         events = events.filter(nw.col(COL_EXECUTION_ID) == migration_id)
+
+         # Apply the project filter only if specified; comparing against None
+         # would match nothing and silently drop every row.
+         if project is not None:
+             events = events.filter(nw.col(COL_PROJECT) == project)
+
+         return (
+             events.sort(COL_TIMESTAMP, descending=False)
+             .collect()
+             .to_polars()
+         )
+
+     def get_migration_status(
+         self,
+         migration_id: str,
+         project: str | None = None,
+         expected_features: list[str] | None = None,
+     ) -> MigrationStatus:
+         """Compute migration status from events at query-time.
+
+         Args:
+             migration_id: Migration ID
+             project: Optional project name to filter by. If None, returns status across all projects.
+             expected_features: Optional list of feature keys that should be completed.
+                 If provided, will check that ALL expected features have completed successfully
+                 before returning COMPLETED status, even if migration_completed event exists.
+                 This allows detecting when a migration YAML has been modified after completion.
+
+         Returns:
+             MigrationStatus enum value
+         """
+         events_df = self.get_migration_events(migration_id, project=project)
+
+         if events_df.height == 0:
+             return MigrationStatus.NOT_STARTED
+
+         # Get latest event
+         latest_event = events_df.sort(COL_TIMESTAMP, descending=True).head(1)
+         latest_event_type = latest_event[COL_EVENT_TYPE][0]
+
+         # If expected_features is provided, verify ALL features have completed.
+         # This ensures we detect when operations are added to an already-completed migration.
+         if expected_features is not None and len(expected_features) > 0:
+             completed_features = set(self.get_completed_features(migration_id, project))
+             expected_features_set = set(expected_features)
+
+             # Check if all expected features have been completed
+             all_features_completed = expected_features_set.issubset(completed_features)
+
+             if not all_features_completed:
+                 # Some features are missing - migration is not complete
+                 if latest_event_type in (
+                     EventType.MIGRATION_STARTED.value,
+                     EventType.FEATURE_MIGRATION_STARTED.value,
+                     EventType.FEATURE_MIGRATION_COMPLETED.value,
+                     EventType.FEATURE_MIGRATION_FAILED.value,
+                 ):
+                     return MigrationStatus.IN_PROGRESS
+                 elif latest_event_type == EventType.MIGRATION_FAILED.value:
+                     return MigrationStatus.FAILED
+                 else:
+                     # Migration was marked complete but features are missing (YAML was modified)
+                     return MigrationStatus.IN_PROGRESS
+             # If all features completed, continue with normal status logic below
+
+         if latest_event_type == EventType.MIGRATION_COMPLETED.value:
+             return MigrationStatus.COMPLETED
+         elif latest_event_type == EventType.MIGRATION_FAILED.value:
+             return MigrationStatus.FAILED
+         elif latest_event_type in (
+             EventType.MIGRATION_STARTED.value,
+             EventType.FEATURE_MIGRATION_STARTED.value,
+             EventType.FEATURE_MIGRATION_COMPLETED.value,
+             EventType.FEATURE_MIGRATION_FAILED.value,
+         ):
+             return MigrationStatus.IN_PROGRESS
+
+         return MigrationStatus.NOT_STARTED
+
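To illustrate the `expected_features` guard documented above, a sketch continuing with the `storage` instance from the earlier example; the feature keys are hypothetical and would come from parsing the migration YAML:

```python
# Feature keys as declared in the migration YAML (hypothetical values).
expected = ["feature/a", "feature/b"]

status = storage.get_migration_status(
    "m001",
    project="demo",
    expected_features=expected,
)

# Even with a migration_completed event recorded, a feature added to the
# YAML afterwards has no completion event, so the status falls back to
# IN_PROGRESS rather than COMPLETED.
if status is MigrationStatus.IN_PROGRESS:
    print("m001 has pending features; re-run the migration")
```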
+     def is_feature_completed(
+         self, migration_id: str, feature_key: str, project: str | None = None
+     ) -> bool:
+         """Check if a specific feature completed successfully in a migration.
+
+         Args:
+             migration_id: Migration ID
+             feature_key: Feature key to check
+             project: Optional project name to filter by. If None, checks across all projects.
+
+         Returns:
+             True if feature completed without errors
+         """
+         events_df = self.get_migration_events(migration_id, project)
+
+         # Filter and check for completed events without errors
+         events_df = (
+             events_df.filter(
+                 (pl.col(COL_FEATURE_KEY) == feature_key)
+                 & (
+                     pl.col(COL_EVENT_TYPE)
+                     == EventType.FEATURE_MIGRATION_COMPLETED.value
+                 )
+             )
+             .with_columns(
+                 pl.col(COL_PAYLOAD)
+                 .str.json_path_match("$.error_message")
+                 .alias("error_message")
+             )
+             .filter(pl.col("error_message").is_null() | (pl.col("error_message") == ""))
+         )
+
+         # Check if any completed event has no error
+         return events_df.height > 0
+
+     def get_completed_features(
+         self, migration_id: str, project: str | None = None
+     ) -> list[str]:
+         """Get list of features that completed successfully in a migration.
+
+         Args:
+             migration_id: Migration ID
+             project: Optional project name to filter by. If None, returns features for all projects.
+
+         Returns:
+             List of feature keys
+         """
+         events_df = self.get_migration_events(migration_id, project=project)
+
+         # Filter and extract completed features
+         events_df = (
+             events_df.filter(
+                 pl.col(COL_EVENT_TYPE) == EventType.FEATURE_MIGRATION_COMPLETED.value
+             )
+             .with_columns(
+                 pl.col(COL_PAYLOAD)
+                 .str.json_path_match("$.error_message")
+                 .alias("error_message")
+             )
+             .filter(pl.col("error_message").is_null() | (pl.col("error_message") == ""))
+             .select(COL_FEATURE_KEY)
+             .unique()
+         )
+
+         return events_df[COL_FEATURE_KEY].to_list()
+
+     def get_failed_features(
+         self, migration_id: str, project: str | None = None
+     ) -> dict[str, str]:
+         """Get features that failed in a migration with error messages.
+
+         Only returns features that never completed successfully: if a feature
+         failed and then succeeded on retry, it won't be included here. The
+         error message comes from the feature's most recent failure event.
+
+         Args:
+             migration_id: Migration ID
+             project: Optional project name to filter by. If None, returns features for all projects.
+
+         Returns:
+             Dict mapping feature key to error message
+         """
+         events_df = self.get_migration_events(migration_id, project=project)
+
+         if events_df.height == 0:
+             return {}
+
+         # Get completed features (these succeeded, even if they failed before)
+         completed_features = set(self.get_completed_features(migration_id, project))
+
+         # Filter for failed events, excluding features that later completed
+         failed_events = (
+             events_df.filter(
+                 pl.col(COL_EVENT_TYPE) == EventType.FEATURE_MIGRATION_FAILED.value
+             )
+             .with_columns(
+                 pl.col(COL_PAYLOAD)
+                 .str.json_path_match("$.error_message")
+                 .alias("error_message")
+             )
+             # Get latest failed event per feature
+             .sort(COL_TIMESTAMP, descending=True)
+             .group_by(COL_FEATURE_KEY, maintain_order=True)
+             .agg([pl.col("error_message").first().alias("error_message")])
+             # Exclude features that eventually completed
+             .filter(~pl.col(COL_FEATURE_KEY).is_in(list(completed_features)))
+             .select([COL_FEATURE_KEY, "error_message"])
+         )
+
+         # Convert to dict
+         return dict(
+             zip(
+                 failed_events[COL_FEATURE_KEY].to_list(),
+                 failed_events["error_message"].to_list(),
+             )
+         )
+
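A short sketch of the retry semantics documented above, again with the hypothetical `storage` and migration ID; by construction the two result sets are disjoint:

```python
failed = storage.get_failed_features("m001", project="demo")
completed = storage.get_completed_features("m001", project="demo")

# A feature that failed once and then succeeded on retry is reported as
# completed, so the two collections never overlap.
assert not set(failed) & set(completed)

for feature_key, error_message in failed.items():
    print(f"needs attention: {feature_key}: {error_message}")
```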
+     def get_migration_summary(
+         self,
+         migration_id: str,
+         project: str | None = None,
+         expected_features: list[str] | None = None,
+     ) -> dict[str, Any]:
+         """Get a comprehensive summary of migration execution status.
+
+         This is a convenience method that returns status, completed features,
+         and failed features in a single call.
+
+         Args:
+             migration_id: Migration ID
+             project: Optional project name to filter by. If None, returns summary across all projects.
+             expected_features: Optional list of feature keys that should be completed.
+                 If provided, will be used to determine if migration is truly complete.
+
+         Returns:
+             Dict containing:
+                 - status: MigrationStatus enum value
+                 - completed_features: List of completed feature keys
+                 - failed_features: Dict mapping failed feature keys to error messages
+                 - total_features_processed: Count of completed + failed features
+         """
+         status = self.get_migration_status(migration_id, project, expected_features)
+         completed = self.get_completed_features(migration_id, project)
+         failed = self.get_failed_features(migration_id, project)
+
+         return {
+             "status": status,
+             "completed_features": completed,
+             "failed_features": failed,
+             "total_features_processed": len(completed) + len(failed),
+         }
+
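Usage sketch for the summary helper above, with the same hypothetical IDs as before:

```python
summary = storage.get_migration_summary("m001", project="demo")

print(summary["status"])                    # MigrationStatus enum value
print(summary["completed_features"])        # list[str]
print(summary["failed_features"])           # dict[str, str] of errors
print(summary["total_features_processed"])  # len(completed) + len(failed)
```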
+     # ========== Convenience Methods for Reading Migration Data ==========
+
+     def read_migration_events(
+         self, project: str | None = None, migration_id: str | None = None
+     ) -> pl.DataFrame:
+         """Read all migration events, optionally filtered by project and/or migration ID.
+
+         Args:
+             project: Optional project name to filter by. If None, returns events for all projects.
+             migration_id: Optional migration ID to filter by. If None, returns events for all migrations.
+
+         Returns:
+             Polars DataFrame with migration events
+         """
+         lazy = self._read_system_metadata(EVENTS_KEY)
+
+         # Apply filters if specified
+         if migration_id is not None:
+             lazy = lazy.filter(nw.col(COL_EXECUTION_ID) == migration_id)
+
+         if project is not None:
+             lazy = lazy.filter(nw.col(COL_PROJECT) == project)
+
+         # Convert to Polars DataFrame
+         return lazy.sort(COL_TIMESTAMP, descending=False).collect().to_polars()
+
+     def read_migration_progress(
+         self, project: str | None = None
+     ) -> dict[str, dict[str, Any]]:
+         """Read migration progress across all migrations.
+
+         Args:
+             project: Optional project name to filter by. If None, returns progress for all projects.
+
+         Returns:
+             Dict mapping migration_id to progress information including:
+                 - status: "not_started", "in_progress", "completed", "failed"
+                 - completed_features: List of completed feature keys
+                 - failed_features: Dict of failed feature keys to error messages
+                 - total_rows_affected: Total rows affected across all features
+         """
+         # Get all migration IDs
+         migration_ids = self.list_executed_migrations(project)
+
+         progress = {}
+         for mid in migration_ids:
+             events_df = self.read_migration_events(project=project, migration_id=mid)
+
+             if events_df.height == 0:
+                 continue
+
+             # Get latest event for status
+             latest_event = events_df.sort(COL_TIMESTAMP, descending=True).head(1)
+             latest_event_type = latest_event[COL_EVENT_TYPE][0]
+
+             # Use EventType enum values throughout (mirrors get_migration_status)
+             if latest_event_type == EventType.MIGRATION_COMPLETED.value:
+                 status = "completed"
+             elif latest_event_type == EventType.MIGRATION_FAILED.value:
+                 status = "failed"
+             elif latest_event_type in (
+                 EventType.MIGRATION_STARTED.value,
+                 EventType.FEATURE_MIGRATION_STARTED.value,
+                 EventType.FEATURE_MIGRATION_COMPLETED.value,
+                 EventType.FEATURE_MIGRATION_FAILED.value,
+             ):
+                 status = "in_progress"
+             else:
+                 status = "not_started"
+
+             # Get completed and failed features using JSON path (polars operations on collected data)
+             feature_events = events_df.filter(
+                 pl.col(COL_EVENT_TYPE) == EventType.FEATURE_MIGRATION_COMPLETED.value
+             ).with_columns(
+                 [
+                     pl.col(COL_PAYLOAD)
+                     .str.json_path_match("$.error_message")
+                     .alias("error_message"),
+                     pl.col(COL_PAYLOAD)
+                     .str.json_path_match("$.rows_affected")
+                     .cast(pl.Int64)
+                     .fill_null(0)
+                     .alias("rows_affected"),
+                 ]
+             )
+
+             # Split into completed and failed
+             completed_df = feature_events.filter(
+                 pl.col("error_message").is_null() | (pl.col("error_message") == "")
+             )
+             failed_df = feature_events.filter(
+                 pl.col("error_message").is_not_null() & (pl.col("error_message") != "")
+             )
+
+             completed_features = completed_df[COL_FEATURE_KEY].unique().to_list()
+             failed_features = dict(
+                 zip(
+                     failed_df[COL_FEATURE_KEY].to_list(),
+                     failed_df["error_message"].to_list(),
+                 )
+             )
+             total_rows = int(feature_events["rows_affected"].sum() or 0)
+
+             progress[mid] = {
+                 "status": status,
+                 "completed_features": completed_features,
+                 "failed_features": failed_features,
+                 "total_rows_affected": total_rows,
+             }
+
+         return progress
+
+     def read_applied_migrations(
+         self, project: str | None = None
+     ) -> list[dict[str, Any]]:
+         """Read all applied (completed) migrations with their details.
+
+         Args:
+             project: Optional project name to filter by. If None, returns migrations for all projects.
+
+         Returns:
+             List of dicts containing migration details for completed migrations:
+                 - migration_id: Migration ID
+                 - project: Project name (if available)
+                 - completed_at: Timestamp when migration completed
+                 - features_count: Number of features affected
+                 - rows_affected: Total rows affected
+
+         Note:
+             The store must already be open when calling this method.
+         """
+         lazy = self._read_system_metadata(EVENTS_KEY)
+
+         # Filter to only completed migrations using narwhals
+         completed_events = lazy.filter(
+             nw.col(COL_EVENT_TYPE) == EventType.MIGRATION_COMPLETED.value
+         )
+
+         if project is not None:
+             completed_events = completed_events.filter(nw.col(COL_PROJECT) == project)
+
+         # Convert to polars LazyFrame and collect
+         completed_df = completed_events.to_native().collect()
+
+         if completed_df.height == 0:
+             return []
+
+         # Get all events for all migrations at once
+         all_events = self.read_migration_events(project=project)
+
+         # Extract rows_affected from payload using JSON path (polars operations)
+         feature_events = all_events.filter(
+             pl.col(COL_EVENT_TYPE) == EventType.FEATURE_MIGRATION_COMPLETED.value
+         ).with_columns(
+             pl.col(COL_PAYLOAD)
+             .str.json_path_match("$.rows_affected")
+             .cast(pl.Int64)
+             .fill_null(0)
+             .alias("rows_affected")
+         )
+
+         # Group by execution_id to get aggregated stats
+         migration_stats = feature_events.group_by(COL_EXECUTION_ID).agg(
+             [
+                 pl.col(COL_FEATURE_KEY).n_unique().alias("features_count"),
+                 pl.col("rows_affected").sum().alias("rows_affected"),
+             ]
+         )
+
+         # Join with completed events to get project and timestamp
+         result_df = completed_df.join(
+             migration_stats, on=COL_EXECUTION_ID, how="left"
+         ).select(
+             [
+                 COL_EXECUTION_ID,
+                 COL_PROJECT,
+                 pl.col(COL_TIMESTAMP).alias("completed_at"),
+                 pl.col("features_count").fill_null(0),
+                 pl.col("rows_affected").fill_null(0).cast(pl.Int64),
+             ]
+         )
+
+         # Convert to list of dicts
+         return result_df.to_dicts()
+
+     def push_graph_snapshot(
+         self, tags: dict[str, Any] | None = None
+     ) -> SnapshotPushResult:
+         """Record all features in the graph with a graph snapshot version.
+
+         This should be called during CD (Continuous Deployment) to record what
+         feature versions are being deployed. Typically invoked via `metaxy graph push`.
+
+         Records all features in the graph with the same snapshot_version, representing
+         a consistent state of the entire feature graph based on code definitions.
+
+         The snapshot_version is a deterministic hash of all feature_version hashes
+         in the graph, making it idempotent - calling multiple times with the
+         same feature definitions produces the same snapshot_version.
+
+         This method detects three scenarios:
+
+         1. New snapshot (computational changes): No existing rows with this snapshot_version
+         2. Metadata-only changes: Snapshot exists but some features have a different feature_spec_version
+         3. No changes: Snapshot exists with identical feature_spec_versions for all features
+
+         Args:
+             tags: Optional dictionary of custom tags to attach to the snapshot
+                 (e.g., git commit SHA).
+
+         Note:
+             The store must already be open when calling this method.
+
+         Returns:
+             SnapshotPushResult
+         """
+         tags = tags or {}
+
+         graph = FeatureGraph.get_active()
+
+         # Check if this exact snapshot already exists for this project
+         latest_pushed_snapshot = self._read_latest_snapshot_data(graph.snapshot_version)
+         current_snapshot_dict = graph.to_snapshot()
+
+         # Convert to DataFrame - need to serialize feature_spec dict to JSON string
+         # and add metaxy_snapshot_version and recorded_at columns
+         import json
+         from datetime import datetime, timezone
+
+         current_snapshot = pl.concat(
+             [
+                 FeatureVersionsModel.model_validate(
+                     {
+                         "feature_key": k,
+                         **{
+                             field: (
+                                 json.dumps(val)
+                                 if field in ("feature_spec", "feature_schema")
+                                 else val
+                             )
+                             for field, val in v.items()
+                         },
+                         METAXY_SNAPSHOT_VERSION: graph.snapshot_version,
+                         "recorded_at": datetime.now(timezone.utc),
+                         "tags": json.dumps(tags),
+                     }
+                 ).to_polars()
+                 for k, v in current_snapshot_dict.items()
+             ]
+         )
+
+         # If this snapshot_version has never been pushed, push every feature in
+         # the graph; otherwise push only the features whose definitions changed.
+         already_pushed = len(latest_pushed_snapshot) != 0
+         to_push = current_snapshot
+
+         if already_pushed:
+             # Identify features that have updated definitions since the last push.
+             # Join full current snapshot with latest pushed (keeping all columns).
+             pushed_with_current = current_snapshot.join(
+                 latest_pushed_snapshot.select(
+                     "feature_key",
+                     pl.col(METAXY_FULL_DEFINITION_VERSION).alias(
+                         f"{METAXY_FULL_DEFINITION_VERSION}_pushed"
+                     ),
+                 ),
+                 on=["feature_key"],
+                 how="left",
+             )
+
+             to_push = pl.concat(
+                 [
+                     # records that for some reason have not been pushed previously
+                     pushed_with_current.filter(
+                         pl.col(f"{METAXY_FULL_DEFINITION_VERSION}_pushed").is_null()
+                     ),
+                     # records with actual changes
+                     pushed_with_current.filter(
+                         pl.col(f"{METAXY_FULL_DEFINITION_VERSION}_pushed").is_not_null()
+                     ).filter(
+                         pl.col(METAXY_FULL_DEFINITION_VERSION)
+                         != pl.col(f"{METAXY_FULL_DEFINITION_VERSION}_pushed")
+                     ),
+                 ]
+             ).drop(f"{METAXY_FULL_DEFINITION_VERSION}_pushed")
+
+         if len(to_push) > 0:
+             self.store.write_metadata(FEATURE_VERSIONS_KEY, to_push)
+
+         # updated_features is only populated when updating existing features
+         updated_features = (
+             to_push["feature_key"].to_list()
+             if already_pushed and len(to_push) > 0
+             else []
+         )
+
+         return SnapshotPushResult(
+             snapshot_version=graph.snapshot_version,
+             already_pushed=already_pushed,
+             updated_features=updated_features,
+         )
+
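A sketch of the CD flow the docstring above describes; tagging with a git SHA is an illustrative convention suggested by the docstring, not an API requirement:

```python
import subprocess

# Illustrative: tag the snapshot with the commit being deployed.
sha = subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()

with store:
    storage = SystemTableStorage(store)
    result = storage.push_graph_snapshot(tags={"git_sha": sha})

if result.already_pushed and not result.updated_features:
    print(f"no-op: snapshot {result.snapshot_version} already recorded")
elif result.already_pushed:
    print(f"metadata-only update for: {result.updated_features}")
else:
    print(f"new snapshot pushed: {result.snapshot_version}")
```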
+     def _read_system_metadata(self, key: FeatureKey) -> nw.LazyFrame[Any]:
+         """Read system metadata.
+
+         System tables are handled specially by MetadataStore.read_metadata - they don't
+         require feature plan resolution when current_only=False.
+
+         Note:
+             The store must already be open when calling this method.
+
+         Returns:
+             LazyFrame if table exists, empty LazyFrame with correct schema if it doesn't
+         """
+         try:
+             # read_metadata handles system tables specially (no feature plan needed)
+             return self.store.read_metadata(key, current_only=False)
+         except SystemDataNotFoundError:
+             return nw.from_native(pl.DataFrame(schema=POLARS_SCHEMAS[key])).lazy()
+
+     def _read_latest_snapshot_data(
+         self,
+         snapshot_version: str,
+     ) -> pl.DataFrame:
+         """Read the latest snapshot data for a given snapshot version.
+
+         The same snapshot version may include multiple rows per feature as
+         non-topological metadata such as Pydantic fields or spec.metadata/tags changes.
+         This method retrieves the latest row for each feature pushed to the metadata store.
+
+         Returns:
+             Polars DataFrame (materialized) with the latest data. Empty if table doesn't exist or snapshot not found.
+         """
+         graph = FeatureGraph.get_active()
+
+         # Read system metadata
+         sys_meta = self._read_system_metadata(FEATURE_VERSIONS_KEY)
+
+         # Filter the data; the conditional is parenthesized so the fallback
+         # applies to the compared value, not to the whole filter expression
+         lazy = sys_meta.filter(
+             nw.col(METAXY_SNAPSHOT_VERSION) == snapshot_version,
+             nw.col("project")
+             == (
+                 next(iter(graph.features_by_key.values())).project
+                 if len(graph.features_by_key) > 0
+                 else "_empty_graph_"
+             ),
+         )
+
+         # Deduplicate using Polars (collect and use native operations)
+         return (
+             lazy.collect()
+             .to_polars()
+             .sort("recorded_at", descending=True)
+             .unique(subset=["feature_key"], keep="first")
+         )
+
+     def read_graph_snapshots(self, project: str | None = None) -> pl.DataFrame:
+         """Read recorded graph snapshots from the feature_versions system table.
+
+         Args:
+             project: Project name to filter by. If None, returns snapshots from all projects.
+
+         Returns:
+             Polars DataFrame with snapshot information, sorted by recorded_at descending,
+             with columns:
+                 - snapshot_version: Unique identifier for each graph snapshot
+                 - recorded_at: Timestamp when the snapshot was recorded
+                 - feature_count: Number of features in this snapshot
+
+         Raises:
+             StoreNotOpenError: If store is not open
+
+         Example:
+             ```py
+             with store:
+                 storage = SystemTableStorage(store)
+                 # Get snapshots for a specific project
+                 snapshots = storage.read_graph_snapshots(project="my_project")
+                 latest_snapshot = snapshots[METAXY_SNAPSHOT_VERSION][0]
+                 print(f"Latest snapshot: {latest_snapshot}")
+
+                 # Get snapshots across all projects
+                 all_snapshots = storage.read_graph_snapshots()
+             ```
+         """
+         # Read system metadata (an empty LazyFrame with the right schema is
+         # returned when the table doesn't exist yet)
+         versions_lazy = self._read_system_metadata(FEATURE_VERSIONS_KEY)
+
+         # Build filters based on project parameter
+         if project is not None:
+             versions_lazy = versions_lazy.filter(nw.col("project") == project)
+
+         # Materialize
+         versions_df = versions_lazy.collect().to_polars()
+
+         if versions_df.height == 0:
+             # No snapshots recorded yet
+             return pl.DataFrame(
+                 schema={
+                     METAXY_SNAPSHOT_VERSION: pl.String,
+                     "recorded_at": pl.Datetime("us"),
+                     "feature_count": pl.UInt32,
+                 }
+             )
+
+         # Group by snapshot_version and get earliest recorded_at and count
+         snapshots = (
+             versions_df.group_by(METAXY_SNAPSHOT_VERSION)
+             .agg(
+                 [
+                     pl.col("recorded_at").min().alias("recorded_at"),
+                     pl.col("feature_key").count().alias("feature_count"),
+                 ]
+             )
+             .sort("recorded_at", descending=True)
+         )
+
+         return snapshots
+
+     def read_features(
+         self,
+         *,
+         current: bool = True,
+         snapshot_version: str | None = None,
+         project: str | None = None,
+     ) -> pl.DataFrame:
+         """Read feature version information from the feature_versions system table.
+
+         Args:
+             current: If True, only return features from the current code snapshot.
+                 If False, must provide snapshot_version.
+             snapshot_version: Specific snapshot version to filter by. Required if current=False.
+             project: Project name to filter by. Defaults to None.
+
+         Returns:
+             Polars DataFrame with columns from FEATURE_VERSIONS_SCHEMA:
+                 - feature_key: Feature identifier
+                 - feature_version: Version hash of the feature
+                 - recorded_at: When this version was recorded
+                 - feature_spec: JSON serialized feature specification
+                 - feature_class_path: Python import path to the feature class
+                 - snapshot_version: Graph snapshot this feature belongs to
+
+         Raises:
+             StoreNotOpenError: If store is not open
+             ValueError: If current=False but no snapshot_version provided
+
+         Examples:
+             ```py
+             # Get features from current code
+             with store:
+                 storage = SystemTableStorage(store)
+                 features = storage.read_features(current=True)
+                 print(f"Current graph has {len(features)} features")
+             ```
+
+             ```py
+             # Get features from a specific snapshot
+             with store:
+                 storage = SystemTableStorage(store)
+                 features = storage.read_features(current=False, snapshot_version="abc123")
+                 for row in features.iter_rows(named=True):
+                     print(f"{row['feature_key']}: {row['metaxy_feature_version']}")
+             ```
+         """
+         if not current and snapshot_version is None:
+             raise ValueError("Must provide snapshot_version when current=False")
+
+         if current:
+             # Get current snapshot from active graph
+             graph = FeatureGraph.get_active()
+             snapshot_version = graph.snapshot_version
+
+         # Read system metadata (empty LazyFrame with the right schema if the
+         # table doesn't exist yet)
+         versions_lazy = self._read_system_metadata(FEATURE_VERSIONS_KEY)
+
+         # Build filters
+         filters = [nw.col(METAXY_SNAPSHOT_VERSION) == snapshot_version]
+         if project is not None:
+             filters.append(nw.col("project") == project)
+
+         for f in filters:
+             versions_lazy = versions_lazy.filter(f)
+
+         # Materialize
+         versions_df = versions_lazy.collect().to_polars()
+
+         return versions_df
+
+     def load_graph_from_snapshot(
+         self,
+         snapshot_version: str,
+         project: str | None = None,
+         *,
+         class_path_overrides: dict[str, str] | None = None,
+         force_reload: bool = False,
+     ) -> FeatureGraph:
+         """Load and reconstruct a FeatureGraph from a stored snapshot.
+
+         This is a convenience method that encapsulates the pattern of:
+
+         1. Reading feature metadata for a snapshot
+         2. Building the snapshot data dictionary
+         3. Reconstructing the FeatureGraph from snapshot data
+
+         Args:
+             snapshot_version: The snapshot version to load
+             project: Optional project name to filter by
+             class_path_overrides: Optional dict mapping feature_key to new class path
+                 for features that have been moved/renamed
+             force_reload: If True, force reimport of feature classes even if cached
+
+         Returns:
+             Reconstructed FeatureGraph
+
+         Raises:
+             ValueError: If no features found for the snapshot version
+             ImportError: If feature classes cannot be imported at their recorded paths
+
+         Note:
+             The store must already be open when calling this method.
+
+         Example:
+             ```python
+             with store:
+                 storage = SystemTableStorage(store)
+                 graph = storage.load_graph_from_snapshot(
+                     snapshot_version="abc123",
+                     project="my_project",
+                 )
+                 print(f"Loaded {len(graph.features_by_key)} features")
+             ```
+         """
+         import json
+
+         # Read features for this snapshot
+         features_df = self.read_features(
+             current=False,
+             snapshot_version=snapshot_version,
+             project=project,
+         )
+
+         if features_df.height == 0:
+             raise ValueError(
+                 f"No features recorded for snapshot {snapshot_version}"
+                 + (f" in project {project}" if project else "")
+             )
+
+         # Build snapshot data dict for FeatureGraph.from_snapshot()
+         snapshot_data = {
+             row["feature_key"]: {
+                 "feature_spec": json.loads(row["feature_spec"])
+                 if isinstance(row["feature_spec"], str)
+                 else row["feature_spec"],
+                 "feature_class_path": row["feature_class_path"],
+                 "metaxy_feature_version": row["feature_version"],
+             }
+             for row in features_df.iter_rows(named=True)
+         }
+
+         # Reconstruct graph from snapshot
+         return FeatureGraph.from_snapshot(
+             snapshot_data,
+             class_path_overrides=class_path_overrides,
+             force_reload=force_reload,
+         )
+ )