metaxy 0.0.1.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. metaxy/__init__.py +170 -0
  2. metaxy/_packaging.py +96 -0
  3. metaxy/_testing/__init__.py +55 -0
  4. metaxy/_testing/config.py +43 -0
  5. metaxy/_testing/metaxy_project.py +780 -0
  6. metaxy/_testing/models.py +111 -0
  7. metaxy/_testing/parametric/__init__.py +13 -0
  8. metaxy/_testing/parametric/metadata.py +664 -0
  9. metaxy/_testing/pytest_helpers.py +74 -0
  10. metaxy/_testing/runbook.py +533 -0
  11. metaxy/_utils.py +35 -0
  12. metaxy/_version.py +1 -0
  13. metaxy/cli/app.py +97 -0
  14. metaxy/cli/console.py +13 -0
  15. metaxy/cli/context.py +167 -0
  16. metaxy/cli/graph.py +610 -0
  17. metaxy/cli/graph_diff.py +290 -0
  18. metaxy/cli/list.py +46 -0
  19. metaxy/cli/metadata.py +317 -0
  20. metaxy/cli/migrations.py +999 -0
  21. metaxy/cli/utils.py +268 -0
  22. metaxy/config.py +680 -0
  23. metaxy/entrypoints.py +296 -0
  24. metaxy/ext/__init__.py +1 -0
  25. metaxy/ext/dagster/__init__.py +54 -0
  26. metaxy/ext/dagster/constants.py +10 -0
  27. metaxy/ext/dagster/dagster_type.py +156 -0
  28. metaxy/ext/dagster/io_manager.py +200 -0
  29. metaxy/ext/dagster/metaxify.py +512 -0
  30. metaxy/ext/dagster/observable.py +115 -0
  31. metaxy/ext/dagster/resources.py +27 -0
  32. metaxy/ext/dagster/selection.py +73 -0
  33. metaxy/ext/dagster/table_metadata.py +417 -0
  34. metaxy/ext/dagster/utils.py +462 -0
  35. metaxy/ext/sqlalchemy/__init__.py +23 -0
  36. metaxy/ext/sqlalchemy/config.py +29 -0
  37. metaxy/ext/sqlalchemy/plugin.py +353 -0
  38. metaxy/ext/sqlmodel/__init__.py +13 -0
  39. metaxy/ext/sqlmodel/config.py +29 -0
  40. metaxy/ext/sqlmodel/plugin.py +499 -0
  41. metaxy/graph/__init__.py +29 -0
  42. metaxy/graph/describe.py +325 -0
  43. metaxy/graph/diff/__init__.py +21 -0
  44. metaxy/graph/diff/diff_models.py +446 -0
  45. metaxy/graph/diff/differ.py +769 -0
  46. metaxy/graph/diff/models.py +443 -0
  47. metaxy/graph/diff/rendering/__init__.py +18 -0
  48. metaxy/graph/diff/rendering/base.py +323 -0
  49. metaxy/graph/diff/rendering/cards.py +188 -0
  50. metaxy/graph/diff/rendering/formatter.py +805 -0
  51. metaxy/graph/diff/rendering/graphviz.py +246 -0
  52. metaxy/graph/diff/rendering/mermaid.py +326 -0
  53. metaxy/graph/diff/rendering/rich.py +169 -0
  54. metaxy/graph/diff/rendering/theme.py +48 -0
  55. metaxy/graph/diff/traversal.py +247 -0
  56. metaxy/graph/status.py +329 -0
  57. metaxy/graph/utils.py +58 -0
  58. metaxy/metadata_store/__init__.py +32 -0
  59. metaxy/metadata_store/_ducklake_support.py +419 -0
  60. metaxy/metadata_store/base.py +1792 -0
  61. metaxy/metadata_store/bigquery.py +354 -0
  62. metaxy/metadata_store/clickhouse.py +184 -0
  63. metaxy/metadata_store/delta.py +371 -0
  64. metaxy/metadata_store/duckdb.py +446 -0
  65. metaxy/metadata_store/exceptions.py +61 -0
  66. metaxy/metadata_store/ibis.py +542 -0
  67. metaxy/metadata_store/lancedb.py +391 -0
  68. metaxy/metadata_store/memory.py +292 -0
  69. metaxy/metadata_store/system/__init__.py +57 -0
  70. metaxy/metadata_store/system/events.py +264 -0
  71. metaxy/metadata_store/system/keys.py +9 -0
  72. metaxy/metadata_store/system/models.py +129 -0
  73. metaxy/metadata_store/system/storage.py +957 -0
  74. metaxy/metadata_store/types.py +10 -0
  75. metaxy/metadata_store/utils.py +104 -0
  76. metaxy/metadata_store/warnings.py +36 -0
  77. metaxy/migrations/__init__.py +32 -0
  78. metaxy/migrations/detector.py +291 -0
  79. metaxy/migrations/executor.py +516 -0
  80. metaxy/migrations/generator.py +319 -0
  81. metaxy/migrations/loader.py +231 -0
  82. metaxy/migrations/models.py +528 -0
  83. metaxy/migrations/ops.py +447 -0
  84. metaxy/models/__init__.py +0 -0
  85. metaxy/models/bases.py +12 -0
  86. metaxy/models/constants.py +139 -0
  87. metaxy/models/feature.py +1335 -0
  88. metaxy/models/feature_spec.py +338 -0
  89. metaxy/models/field.py +263 -0
  90. metaxy/models/fields_mapping.py +307 -0
  91. metaxy/models/filter_expression.py +297 -0
  92. metaxy/models/lineage.py +285 -0
  93. metaxy/models/plan.py +232 -0
  94. metaxy/models/types.py +475 -0
  95. metaxy/py.typed +0 -0
  96. metaxy/utils/__init__.py +1 -0
  97. metaxy/utils/constants.py +2 -0
  98. metaxy/utils/exceptions.py +23 -0
  99. metaxy/utils/hashing.py +230 -0
  100. metaxy/versioning/__init__.py +31 -0
  101. metaxy/versioning/engine.py +656 -0
  102. metaxy/versioning/feature_dep_transformer.py +151 -0
  103. metaxy/versioning/ibis.py +249 -0
  104. metaxy/versioning/lineage_handler.py +205 -0
  105. metaxy/versioning/polars.py +189 -0
  106. metaxy/versioning/renamed_df.py +35 -0
  107. metaxy/versioning/types.py +63 -0
  108. metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
  109. metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
  110. metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
  111. metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,769 @@
1
+ """Graph diffing logic and snapshot resolution."""
2
+
3
+ import json
4
+ import warnings
5
+ from collections.abc import Mapping
6
+ from typing import Any
7
+
8
+ from metaxy.graph.diff.diff_models import (
9
+ AddedNode,
10
+ FieldChange,
11
+ GraphDiff,
12
+ NodeChange,
13
+ RemovedNode,
14
+ )
15
+ from metaxy.metadata_store.base import MetadataStore
16
+ from metaxy.metadata_store.system import FEATURE_VERSIONS_KEY
17
+ from metaxy.models.feature import FeatureGraph
18
+ from metaxy.models.types import FeatureKey, FieldKey
19
+
20
+
21
class SnapshotResolver:
    """Resolves snapshot version literals to actual snapshot hashes."""

    def resolve_snapshot(
        self, literal: str, store: MetadataStore | None, graph: FeatureGraph | None
    ) -> str:
        """Resolve a snapshot literal to its actual version hash.

        Args:
            literal: Snapshot identifier ("latest", "current", or version hash)
            store: Metadata store to query for snapshots (required for "latest")
            graph: Optional active graph for "current" resolution

        Returns:
            Resolved snapshot version hash

        Raises:
            ValueError: If literal is invalid or cannot be resolved
        """
        if literal == "current":
            return self._resolve_current(graph)
        if literal == "latest":
            if store is None:
                raise ValueError(
                    "Cannot resolve 'latest': no metadata store provided. "
                    "Provide a store to query for snapshots."
                )
            return self._resolve_latest(store)
        # Any other literal is taken verbatim as an explicit snapshot version.
        return literal

    def _resolve_latest(self, store: MetadataStore) -> str:
        """Resolve 'latest' to most recent snapshot in store."""
        from metaxy.metadata_store.system.storage import SystemTableStorage

        with store:
            snapshots = SystemTableStorage(store).read_graph_snapshots()

            if snapshots.height == 0:
                raise ValueError(
                    "No snapshots found in store. Cannot resolve 'latest'. "
                    "Run 'metaxy graph push' to record a snapshot."
                )

            # read_graph_snapshots() returns rows sorted by recorded_at
            # descending, so the first row is the most recent snapshot.
            return snapshots["metaxy_snapshot_version"][0]

    def _resolve_current(self, graph: FeatureGraph | None) -> str:
        """Resolve 'current' to active graph's snapshot version."""
        if graph is None:
            raise ValueError(
                "Cannot resolve 'current': no active graph provided. "
                "Ensure features are loaded before using 'current'."
            )

        if not graph.features_by_key:
            raise ValueError(
                "Cannot resolve 'current': active graph is empty. "
                "Ensure features are loaded before using 'current'."
            )

        return graph.snapshot_version
86
+
87
+
88
class GraphDiffer:
    """Compares two graph snapshots and produces a diff."""

    def diff(
        self,
        snapshot1_data: Mapping[str, Mapping[str, Any]],
        snapshot2_data: Mapping[str, Mapping[str, Any]],
        from_snapshot_version: str = "unknown",
        to_snapshot_version: str = "unknown",
    ) -> GraphDiff:
        """Compute diff between two snapshots.

        Args:
            snapshot1_data: First snapshot (feature_key -> {feature_version, feature_spec, fields})
            snapshot2_data: Second snapshot (feature_key -> {feature_version, feature_spec, fields})
            from_snapshot_version: Source snapshot version
            to_snapshot_version: Target snapshot version

        Returns:
            GraphDiff with added, removed, and changed features
        """

        def extract_fields(feature_data, feature_spec):
            # Field entries combine the spec's field list with the computed
            # per-field version hashes stored under feature_data["fields"].
            field_versions = feature_data.get("fields", {})
            fields_list = []
            for field_dict in feature_spec.get("fields", []):
                field_key_list = field_dict.get("key", [])
                field_key_str = (
                    "/".join(field_key_list)
                    if isinstance(field_key_list, list)
                    else field_key_list
                )
                fields_list.append(
                    {
                        "key": field_key_str,
                        "version": field_versions.get(field_key_str, ""),
                        "code_version": field_dict.get("code_version"),
                    }
                )
            return fields_list

        def extract_deps(feature_spec):
            # Deps may carry a 'feature' field (new format) or a 'key' field
            # (old format); either may be a list of parts or a "/"-joined str.
            deps = []
            for dep in feature_spec.get("deps") or []:
                dep_key = dep.get("feature") or dep.get("key", [])
                if isinstance(dep_key, list):
                    deps.append(FeatureKey(dep_key))
                else:
                    deps.append(FeatureKey(dep_key.split("/")))
            return deps

        keys1 = set(snapshot1_data.keys())
        keys2 = set(snapshot2_data.keys())

        # Features present only in the target snapshot.
        added_nodes = []
        for key_str in sorted(keys2 - keys1):
            feature_data = snapshot2_data[key_str]
            feature_spec = feature_data.get("feature_spec", {})
            added_nodes.append(
                AddedNode(
                    feature_key=FeatureKey(key_str.split("/")),
                    version=feature_data["metaxy_feature_version"],
                    code_version=feature_spec.get("code_version"),
                    fields=extract_fields(feature_data, feature_spec),
                    dependencies=extract_deps(feature_spec),
                )
            )

        # Features present only in the source snapshot.
        removed_nodes = []
        for key_str in sorted(keys1 - keys2):
            feature_data = snapshot1_data[key_str]
            feature_spec = feature_data.get("feature_spec", {})
            removed_nodes.append(
                RemovedNode(
                    feature_key=FeatureKey(key_str.split("/")),
                    version=feature_data["metaxy_feature_version"],
                    code_version=feature_spec.get("code_version"),
                    fields=extract_fields(feature_data, feature_spec),
                    dependencies=extract_deps(feature_spec),
                )
            )

        # Features present in both snapshots: flag the ones whose tracking
        # version moved (indicates a migration is needed).
        changed_nodes = []
        for key_str in sorted(keys1 & keys2):
            feature1 = snapshot1_data[key_str]
            feature2 = snapshot2_data[key_str]

            version1 = feature1["metaxy_feature_version"]
            version2 = feature2["metaxy_feature_version"]

            # Use the full-definition tracking version when available (new
            # system); otherwise fall back to feature_version.
            tracking1 = feature1.get("metaxy_full_definition_version", version1)
            tracking2 = feature2.get("metaxy_full_definition_version", version2)
            if tracking1 == tracking2:
                continue

            field_changes = self._compute_field_changes(
                feature1.get("fields", {}), feature2.get("fields", {})
            )
            changed_nodes.append(
                NodeChange(
                    feature_key=FeatureKey(key_str.split("/")),
                    old_version=version1,
                    new_version=version2,
                    old_code_version=feature1.get("feature_spec", {}).get(
                        "code_version"
                    ),
                    new_code_version=feature2.get("feature_spec", {}).get(
                        "code_version"
                    ),
                    added_fields=[fc for fc in field_changes if fc.is_added],
                    removed_fields=[fc for fc in field_changes if fc.is_removed],
                    changed_fields=[fc for fc in field_changes if fc.is_changed],
                )
            )

        return GraphDiff(
            from_snapshot_version=from_snapshot_version,
            to_snapshot_version=to_snapshot_version,
            added_nodes=added_nodes,
            removed_nodes=removed_nodes,
            changed_nodes=changed_nodes,
        )
255
+
256
+ def _compute_field_changes(
257
+ self, fields1: dict[str, str], fields2: dict[str, str]
258
+ ) -> list[FieldChange]:
259
+ """Compute changes between two field version mappings.
260
+
261
+ Args:
262
+ fields1: Field key (string) -> field version (hash) from snapshot1
263
+ fields2: Field key (string) -> field version (hash) from snapshot2
264
+
265
+ Returns:
266
+ List of FieldChange objects
267
+ """
268
+ field_keys1 = set(fields1.keys())
269
+ field_keys2 = set(fields2.keys())
270
+
271
+ added_fields = field_keys2 - field_keys1
272
+ removed_fields = field_keys1 - field_keys2
273
+ common_fields = field_keys1 & field_keys2
274
+
275
+ changes = []
276
+
277
+ # Added fields
278
+ for field_key_str in sorted(added_fields):
279
+ changes.append(
280
+ FieldChange(
281
+ field_key=FieldKey(field_key_str.split("/")),
282
+ old_version=None,
283
+ new_version=fields2[field_key_str],
284
+ old_code_version=None,
285
+ new_code_version=None, # TODO: Extract from spec if available
286
+ )
287
+ )
288
+
289
+ # Removed fields
290
+ for field_key_str in sorted(removed_fields):
291
+ changes.append(
292
+ FieldChange(
293
+ field_key=FieldKey(field_key_str.split("/")),
294
+ old_version=fields1[field_key_str],
295
+ new_version=None,
296
+ old_code_version=None, # TODO: Extract from spec if available
297
+ new_code_version=None,
298
+ )
299
+ )
300
+
301
+ # Changed fields
302
+ for field_key_str in sorted(common_fields):
303
+ version1 = fields1[field_key_str]
304
+ version2 = fields2[field_key_str]
305
+
306
+ if version1 != version2:
307
+ changes.append(
308
+ FieldChange(
309
+ field_key=FieldKey(field_key_str.split("/")),
310
+ old_version=version1,
311
+ new_version=version2,
312
+ old_code_version=None, # TODO: Extract from spec if available
313
+ new_code_version=None, # TODO: Extract from spec if available
314
+ )
315
+ )
316
+
317
+ return changes
318
+
319
+ def create_merged_graph_data(
320
+ self,
321
+ snapshot1_data: Mapping[str, Mapping[str, Any]],
322
+ snapshot2_data: Mapping[str, Mapping[str, Any]],
323
+ diff: GraphDiff,
324
+ ) -> dict[str, Any]:
325
+ """Create merged graph data structure with status annotations.
326
+
327
+ This combines features from both snapshots into a single unified view,
328
+ annotating each feature with its status (added/removed/changed/unchanged).
329
+
330
+ Args:
331
+ snapshot1_data: First snapshot data (feature_key -> {feature_version, fields})
332
+ snapshot2_data: Second snapshot data (feature_key -> {feature_version, fields})
333
+ diff: Computed diff between snapshots
334
+
335
+ Returns:
336
+ Dict with structure:
337
+ {
338
+ 'nodes': {
339
+ feature_key_str: {
340
+ 'status': 'added' | 'removed' | 'changed' | 'unchanged',
341
+ 'old_version': str | None,
342
+ 'new_version': str | None,
343
+ 'fields': {...}, # fields from relevant snapshot
344
+ 'field_changes': [...], # FieldChange objects for changed nodes
345
+ 'dependencies': [feature_key_str, ...], # deps from relevant snapshot
346
+ }
347
+ },
348
+ 'edges': [
349
+ {'from': feature_key_str, 'to': feature_key_str}
350
+ ]
351
+ }
352
+ """
353
+ # Create status mapping for efficient lookup
354
+ added_keys = {node.feature_key.to_string() for node in diff.added_nodes}
355
+ removed_keys = {node.feature_key.to_string() for node in diff.removed_nodes}
356
+ changed_keys = {
357
+ node.feature_key.to_string(): node for node in diff.changed_nodes
358
+ }
359
+
360
+ # Get all feature keys from both snapshots
361
+ all_keys = set(snapshot1_data.keys()) | set(snapshot2_data.keys())
362
+
363
+ nodes = {}
364
+ edges = []
365
+
366
+ for feature_key_str in all_keys:
367
+ # Determine status
368
+ if feature_key_str in added_keys:
369
+ status = "added"
370
+ old_version = None
371
+ new_version = snapshot2_data[feature_key_str]["metaxy_feature_version"]
372
+ fields = snapshot2_data[feature_key_str].get("fields", {})
373
+ field_changes = []
374
+ # Dependencies from snapshot2
375
+ deps = self._extract_dependencies(
376
+ snapshot2_data[feature_key_str].get("feature_spec", {})
377
+ )
378
+ elif feature_key_str in removed_keys:
379
+ status = "removed"
380
+ old_version = snapshot1_data[feature_key_str]["metaxy_feature_version"]
381
+ new_version = None
382
+ fields = snapshot1_data[feature_key_str].get("fields", {})
383
+ field_changes = []
384
+ # Dependencies from snapshot1
385
+ deps = self._extract_dependencies(
386
+ snapshot1_data[feature_key_str].get("feature_spec", {})
387
+ )
388
+ elif feature_key_str in changed_keys:
389
+ status = "changed"
390
+ node_change = changed_keys[feature_key_str]
391
+ old_version = node_change.old_version
392
+ new_version = node_change.new_version
393
+ fields = snapshot2_data[feature_key_str].get("fields", {})
394
+ # Combine all field changes from the NodeChange
395
+ field_changes = (
396
+ node_change.added_fields
397
+ + node_change.removed_fields
398
+ + node_change.changed_fields
399
+ )
400
+ # Dependencies from snapshot2 (current version)
401
+ deps = self._extract_dependencies(
402
+ snapshot2_data[feature_key_str].get("feature_spec", {})
403
+ )
404
+ else:
405
+ # Unchanged
406
+ status = "unchanged"
407
+ old_version = snapshot1_data[feature_key_str]["metaxy_feature_version"]
408
+ new_version = snapshot2_data[feature_key_str]["metaxy_feature_version"]
409
+ fields = snapshot2_data[feature_key_str].get("fields", {})
410
+ field_changes = []
411
+ # Dependencies from snapshot2
412
+ deps = self._extract_dependencies(
413
+ snapshot2_data[feature_key_str].get("feature_spec", {})
414
+ )
415
+
416
+ nodes[feature_key_str] = {
417
+ "status": status,
418
+ "old_version": old_version,
419
+ "new_version": new_version,
420
+ "fields": fields,
421
+ "field_changes": field_changes,
422
+ "dependencies": deps,
423
+ }
424
+
425
+ # Create edges for dependencies (arrow points from dependency to feature)
426
+ for dep_key in deps:
427
+ edges.append({"from": dep_key, "to": feature_key_str})
428
+
429
+ return {
430
+ "nodes": nodes,
431
+ "edges": edges,
432
+ }
433
+
434
+ def _extract_dependencies(self, feature_spec: dict[str, Any]) -> list[str]:
435
+ """Extract dependency feature keys from a feature spec.
436
+
437
+ Args:
438
+ feature_spec: Parsed feature spec dict
439
+
440
+ Returns:
441
+ List of dependency feature keys as strings
442
+ """
443
+ deps = feature_spec.get("deps", [])
444
+ if deps is None:
445
+ return []
446
+
447
+ dep_keys = []
448
+ for dep in deps:
449
+ # Check for 'feature' field (new format) or 'key' field (old format)
450
+ dep_key = dep.get("feature") or dep.get("key", [])
451
+ if isinstance(dep_key, list):
452
+ dep_keys.append("/".join(dep_key))
453
+ else:
454
+ dep_keys.append(dep_key)
455
+
456
+ return dep_keys
457
+
458
+ def filter_merged_graph(
459
+ self,
460
+ merged_data: dict[str, Any],
461
+ focus_feature: str | None = None,
462
+ up: int | None = None,
463
+ down: int | None = None,
464
+ ) -> dict[str, Any]:
465
+ """Filter merged graph to show only relevant features.
466
+
467
+ Args:
468
+ merged_data: Merged graph data with nodes and edges
469
+ focus_feature: Feature key to focus on (string format with / or __)
470
+ up: Number of upstream levels (None = all if focus_feature is set, 0 otherwise)
471
+ down: Number of downstream levels (None = all if focus_feature is set, 0 otherwise)
472
+
473
+ Returns:
474
+ Filtered merged graph data with same structure
475
+
476
+ Raises:
477
+ ValueError: If focus_feature is specified but not found in graph
478
+ """
479
+ if focus_feature is None:
480
+ # No filtering
481
+ return merged_data
482
+
483
+ # Parse feature key (support both / and __ formats)
484
+ if "/" in focus_feature:
485
+ focus_key = focus_feature
486
+ else:
487
+ focus_key = focus_feature.replace("__", "/")
488
+
489
+ # Check if focus feature exists
490
+ if focus_key not in merged_data["nodes"]:
491
+ raise ValueError(f"Feature '{focus_feature}' not found in graph")
492
+
493
+ # Build dependency graph for traversal
494
+ # Build forward edges (feature -> dependents) and backward edges (feature -> dependencies)
495
+ forward_edges: dict[str, list[str]] = {} # feature -> list of dependents
496
+ backward_edges: dict[str, list[str]] = {} # feature -> list of dependencies
497
+
498
+ for edge in merged_data["edges"]:
499
+ dep = edge["from"] # dependency
500
+ feat = edge["to"] # dependent feature
501
+
502
+ if feat not in backward_edges:
503
+ backward_edges[feat] = []
504
+ backward_edges[feat].append(dep)
505
+
506
+ if dep not in forward_edges:
507
+ forward_edges[dep] = []
508
+ forward_edges[dep].append(feat)
509
+
510
+ # Find features to include
511
+ features_to_include = {focus_key}
512
+
513
+ # Add upstream (dependencies)
514
+ # Default behavior: if focus_feature is set but up is not specified, include all upstream
515
+ if up is None:
516
+ # Include all upstream
517
+ upstream = self._get_upstream_features(
518
+ focus_key, backward_edges, max_levels=None
519
+ )
520
+ features_to_include.update(upstream)
521
+ elif up > 0:
522
+ # Include specified number of levels
523
+ upstream = self._get_upstream_features(
524
+ focus_key, backward_edges, max_levels=up
525
+ )
526
+ features_to_include.update(upstream)
527
+ # else: up == 0, don't include upstream
528
+
529
+ # Add downstream (dependents)
530
+ # Default behavior: if focus_feature is set but down is not specified, include all downstream
531
+ if down is None:
532
+ # Include all downstream
533
+ downstream = self._get_downstream_features(
534
+ focus_key, forward_edges, max_levels=None
535
+ )
536
+ features_to_include.update(downstream)
537
+ elif down > 0:
538
+ # Include specified number of levels
539
+ downstream = self._get_downstream_features(
540
+ focus_key, forward_edges, max_levels=down
541
+ )
542
+ features_to_include.update(downstream)
543
+ # else: down == 0, don't include downstream
544
+
545
+ # Filter nodes and edges
546
+ filtered_nodes = {
547
+ k: v for k, v in merged_data["nodes"].items() if k in features_to_include
548
+ }
549
+ filtered_edges = [
550
+ e
551
+ for e in merged_data["edges"]
552
+ if e["from"] in features_to_include and e["to"] in features_to_include
553
+ ]
554
+
555
+ return {
556
+ "nodes": filtered_nodes,
557
+ "edges": filtered_edges,
558
+ }
559
+
560
+ def _get_upstream_features(
561
+ self,
562
+ start_key: str,
563
+ backward_edges: dict[str, list[str]],
564
+ max_levels: int | None = None,
565
+ visited: set[str] | None = None,
566
+ level: int = 0,
567
+ ) -> set[str]:
568
+ """Get upstream features (dependencies) recursively."""
569
+ if visited is None:
570
+ visited = set()
571
+
572
+ if start_key in visited:
573
+ return set()
574
+
575
+ if max_levels is not None and level >= max_levels:
576
+ return set()
577
+
578
+ visited.add(start_key)
579
+ upstream: set[str] = set()
580
+
581
+ deps = backward_edges.get(start_key, [])
582
+ for dep in deps:
583
+ if dep not in visited:
584
+ upstream.add(dep)
585
+ # Recurse
586
+ upstream.update(
587
+ self._get_upstream_features(
588
+ dep, backward_edges, max_levels, visited, level + 1
589
+ )
590
+ )
591
+
592
+ return upstream
593
+
594
+ def _get_downstream_features(
595
+ self,
596
+ start_key: str,
597
+ forward_edges: dict[str, list[str]],
598
+ max_levels: int | None = None,
599
+ visited: set[str] | None = None,
600
+ level: int = 0,
601
+ ) -> set[str]:
602
+ """Get downstream features (dependents) recursively."""
603
+ if visited is None:
604
+ visited = set()
605
+
606
+ if start_key in visited:
607
+ return set()
608
+
609
+ if max_levels is not None and level >= max_levels:
610
+ return set()
611
+
612
+ visited.add(start_key)
613
+ downstream: set[str] = set()
614
+
615
+ dependents = forward_edges.get(start_key, [])
616
+ for dependent in dependents:
617
+ if dependent not in visited:
618
+ downstream.add(dependent)
619
+ # Recurse
620
+ downstream.update(
621
+ self._get_downstream_features(
622
+ dependent, forward_edges, max_levels, visited, level + 1
623
+ )
624
+ )
625
+
626
+ return downstream
627
+
628
    def load_snapshot_data(
        self, store: MetadataStore, snapshot_version: str, project: str | None = None
    ) -> Mapping[str, Mapping[str, Any]]:
        """Load snapshot data from store.

        Args:
            store: Metadata store to query
            snapshot_version: Snapshot version to load
            project: Optional project name to filter by (None means all projects)

        Returns:
            Dict mapping feature_key (string) -> {feature_version, feature_spec, fields}
            where fields is dict mapping field_key (string) -> field_version (hash)

        Raises:
            ValueError: If snapshot not found in store

        NOTE(review): ``project`` is accepted and threaded through the
        recursive re-open call below, but the query only filters on
        ``metaxy_snapshot_version`` — confirm whether project filtering is
        intentionally deferred or missing.
        """
        # Import at function level to avoid circular imports

        # Auto-open store if not already open: re-enter this method with the
        # store open in "read" mode so the body below can assume an open store.
        if not store._is_open:
            with store.open("read"):
                return self.load_snapshot_data(store, snapshot_version, project)

        # Query feature_versions table for this snapshot
        try:
            features_lazy = store.read_metadata_in_store(FEATURE_VERSIONS_KEY)
            if features_lazy is None:
                raise ValueError(
                    f"No feature_versions table found in store. Cannot load snapshot {snapshot_version}."
                )

            # Filter by snapshot_version and project
            import narwhals as nw

            features_df = (
                features_lazy.filter(
                    nw.col("metaxy_snapshot_version") == snapshot_version
                )
                .collect()
                .to_polars()
            )

            if features_df.height == 0:
                raise ValueError(
                    f"Snapshot {snapshot_version} not found in store. "
                    "Run 'metaxy graph push' to record snapshots or check the version hash."
                )

        # NOTE(review): this broad except also catches and re-wraps the two
        # ValueErrors raised just above, prefixing their message with
        # "Failed to load snapshot ..." — confirm this double-wrap is intended.
        except Exception as e:
            raise ValueError(f"Failed to load snapshot {snapshot_version}: {e}") from e

        # Build an intermediate structure keyed by feature key string, with the
        # spec JSON parsed into a dict.
        snapshot_dict: dict[str, dict[str, Any]] = {}
        for row in features_df.iter_rows(named=True):
            feature_key_str = row["feature_key"]
            feature_version = row["metaxy_feature_version"]
            feature_spec_json = row["feature_spec"]
            feature_class_path = row.get("feature_class_path", "")

            feature_spec_dict = json.loads(feature_spec_json)

            snapshot_dict[feature_key_str] = {
                "metaxy_feature_version": feature_version,
                "feature_spec": feature_spec_dict,
                "feature_class_path": feature_class_path,
                "metaxy_full_definition_version": row.get(
                    "metaxy_full_definition_version", feature_version
                ),  # Fallback for backward compatibility
            }

        # Try to reconstruct FeatureGraph from snapshot to compute field versions.
        # This may fail if features have been removed/moved, so we handle that
        # gracefully and fall back to per-feature versions below.
        graph: FeatureGraph | None = None
        try:
            graph = FeatureGraph.from_snapshot(snapshot_dict)
            graph_available = True
        except ImportError:
            # Some features can't be imported (likely removed) - proceed without graph.
            # For diff purposes, we can still show feature-level changes;
            # we'll use feature_version as a fallback for all field versions.
            graph_available = False
            warnings.warn(
                "Using feature_version as field_version fallback for features that cannot be imported. "
                "This may occur when features have been removed or moved.",
                UserWarning,
                stacklevel=2,
            )

        # Compute field versions using the reconstructed graph (if available)
        from metaxy.models.plan import FQFieldKey

        snapshot_data: dict[str, dict[str, Any]] = {}
        for feature_key_str in snapshot_dict.keys():
            feature_version = snapshot_dict[feature_key_str]["metaxy_feature_version"]
            feature_spec = snapshot_dict[feature_key_str]["feature_spec"]
            feature_key_obj = FeatureKey(feature_key_str.split("/"))

            # Map of normalized field key string -> field version hash.
            fields_data: dict[str, str] = {}
            if (
                graph_available
                and graph is not None
                and feature_key_obj in graph.features_by_key
            ):
                # Feature exists in reconstructed graph - compute precise field versions.
                for field_dict in feature_spec.get("fields", []):
                    field_key_list = field_dict.get("key")
                    # Spec field keys may be a list of parts or a plain string.
                    if isinstance(field_key_list, list):
                        field_key = FieldKey(field_key_list)
                        field_key_str_normalized = "/".join(field_key_list)
                    else:
                        field_key = FieldKey([field_key_list])
                        field_key_str_normalized = field_key_list

                    # Compute field version using the graph
                    fq_key = FQFieldKey(feature=feature_key_obj, field=field_key)
                    field_version = graph.get_field_version(fq_key)
                    fields_data[field_key_str_normalized] = field_version
            else:
                # Feature doesn't exist in graph (removed/moved) - use feature_version
                # as fallback: all fields get the same version (the feature version).
                for field_dict in feature_spec.get("fields", []):
                    field_key_list = field_dict.get("key")
                    if isinstance(field_key_list, list):
                        field_key_str_normalized = "/".join(field_key_list)
                    else:
                        field_key_str_normalized = field_key_list

                    # Use feature_version directly as fallback
                    fields_data[field_key_str_normalized] = feature_version

            snapshot_data[feature_key_str] = {
                "metaxy_feature_version": feature_version,
                "fields": fields_data,
                "feature_spec": feature_spec,
                "metaxy_full_definition_version": snapshot_dict[feature_key_str].get(
                    "metaxy_full_definition_version", feature_version
                ),
            }

        return snapshot_data