metaxy-0.0.0-py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their public registries.

Files changed (75)
  1. metaxy/__init__.py +61 -0
  2. metaxy/_testing.py +542 -0
  3. metaxy/_utils.py +16 -0
  4. metaxy/_version.py +1 -0
  5. metaxy/cli/app.py +76 -0
  6. metaxy/cli/context.py +71 -0
  7. metaxy/cli/graph.py +576 -0
  8. metaxy/cli/graph_diff.py +290 -0
  9. metaxy/cli/list.py +42 -0
  10. metaxy/cli/metadata.py +271 -0
  11. metaxy/cli/migrations.py +862 -0
  12. metaxy/cli/push.py +55 -0
  13. metaxy/config.py +450 -0
  14. metaxy/data_versioning/__init__.py +24 -0
  15. metaxy/data_versioning/calculators/__init__.py +13 -0
  16. metaxy/data_versioning/calculators/base.py +97 -0
  17. metaxy/data_versioning/calculators/duckdb.py +186 -0
  18. metaxy/data_versioning/calculators/ibis.py +225 -0
  19. metaxy/data_versioning/calculators/polars.py +135 -0
  20. metaxy/data_versioning/diff/__init__.py +15 -0
  21. metaxy/data_versioning/diff/base.py +150 -0
  22. metaxy/data_versioning/diff/narwhals.py +108 -0
  23. metaxy/data_versioning/hash_algorithms.py +19 -0
  24. metaxy/data_versioning/joiners/__init__.py +9 -0
  25. metaxy/data_versioning/joiners/base.py +70 -0
  26. metaxy/data_versioning/joiners/narwhals.py +235 -0
  27. metaxy/entrypoints.py +309 -0
  28. metaxy/ext/__init__.py +1 -0
  29. metaxy/ext/alembic.py +326 -0
  30. metaxy/ext/sqlmodel.py +172 -0
  31. metaxy/ext/sqlmodel_system_tables.py +139 -0
  32. metaxy/graph/__init__.py +21 -0
  33. metaxy/graph/diff/__init__.py +21 -0
  34. metaxy/graph/diff/diff_models.py +399 -0
  35. metaxy/graph/diff/differ.py +740 -0
  36. metaxy/graph/diff/models.py +418 -0
  37. metaxy/graph/diff/rendering/__init__.py +18 -0
  38. metaxy/graph/diff/rendering/base.py +274 -0
  39. metaxy/graph/diff/rendering/cards.py +188 -0
  40. metaxy/graph/diff/rendering/formatter.py +805 -0
  41. metaxy/graph/diff/rendering/graphviz.py +246 -0
  42. metaxy/graph/diff/rendering/mermaid.py +320 -0
  43. metaxy/graph/diff/rendering/rich.py +165 -0
  44. metaxy/graph/diff/rendering/theme.py +48 -0
  45. metaxy/graph/diff/traversal.py +247 -0
  46. metaxy/graph/utils.py +58 -0
  47. metaxy/metadata_store/__init__.py +31 -0
  48. metaxy/metadata_store/_protocols.py +38 -0
  49. metaxy/metadata_store/base.py +1676 -0
  50. metaxy/metadata_store/clickhouse.py +161 -0
  51. metaxy/metadata_store/duckdb.py +167 -0
  52. metaxy/metadata_store/exceptions.py +43 -0
  53. metaxy/metadata_store/ibis.py +451 -0
  54. metaxy/metadata_store/memory.py +228 -0
  55. metaxy/metadata_store/sqlite.py +187 -0
  56. metaxy/metadata_store/system_tables.py +257 -0
  57. metaxy/migrations/__init__.py +34 -0
  58. metaxy/migrations/detector.py +153 -0
  59. metaxy/migrations/executor.py +208 -0
  60. metaxy/migrations/loader.py +260 -0
  61. metaxy/migrations/models.py +718 -0
  62. metaxy/migrations/ops.py +390 -0
  63. metaxy/models/__init__.py +0 -0
  64. metaxy/models/bases.py +6 -0
  65. metaxy/models/constants.py +24 -0
  66. metaxy/models/feature.py +665 -0
  67. metaxy/models/feature_spec.py +105 -0
  68. metaxy/models/field.py +25 -0
  69. metaxy/models/plan.py +155 -0
  70. metaxy/models/types.py +157 -0
  71. metaxy/py.typed +0 -0
  72. metaxy-0.0.0.dist-info/METADATA +247 -0
  73. metaxy-0.0.0.dist-info/RECORD +75 -0
  74. metaxy-0.0.0.dist-info/WHEEL +4 -0
  75. metaxy-0.0.0.dist-info/entry_points.txt +3 -0
metaxy/graph/diff/differ.py
@@ -0,0 +1,740 @@
+ """Graph diffing logic and snapshot resolution."""
+
+ import json
+ import warnings
+ from typing import Any
+
+ from metaxy.graph.diff.diff_models import (
+     AddedNode,
+     FieldChange,
+     GraphDiff,
+     NodeChange,
+     RemovedNode,
+ )
+ from metaxy.metadata_store.base import FEATURE_VERSIONS_KEY, MetadataStore
+ from metaxy.models.feature import FeatureGraph
+ from metaxy.models.types import FeatureKey, FieldKey
+
+
+ class SnapshotResolver:
+     """Resolves snapshot version literals to actual snapshot hashes."""
+
+     def resolve_snapshot(
+         self, literal: str, store: MetadataStore | None, graph: FeatureGraph | None
+     ) -> str:
+         """Resolve a snapshot literal to its actual version hash.
+
+         Args:
+             literal: Snapshot identifier ("latest", "current", or version hash)
+             store: Metadata store to query for snapshots (required for "latest")
+             graph: Optional active graph for "current" resolution
+
+         Returns:
+             Resolved snapshot version hash
+
+         Raises:
+             ValueError: If literal is invalid or cannot be resolved
+         """
+         if literal == "latest":
+             if store is None:
+                 raise ValueError(
+                     "Cannot resolve 'latest': no metadata store provided. "
+                     "Provide a store to query for snapshots."
+                 )
+             return self._resolve_latest(store)
+         elif literal == "current":
+             return self._resolve_current(graph)
+         else:
+             # Treat as explicit snapshot version
+             return literal
+
+     def _resolve_latest(self, store: MetadataStore) -> str:
+         """Resolve 'latest' to most recent snapshot in store."""
+         snapshots_df = store.read_graph_snapshots()
+
+         if snapshots_df.height == 0:
+             raise ValueError(
+                 "No snapshots found in store. Cannot resolve 'latest'. "
+                 "Run 'metaxy push' to record a snapshot."
+             )
+
+         # read_graph_snapshots() returns sorted by recorded_at descending
+         latest_snapshot = snapshots_df["snapshot_version"][0]
+         return latest_snapshot
+
+     def _resolve_current(self, graph: FeatureGraph | None) -> str:
+         """Resolve 'current' to active graph's snapshot version."""
+         if graph is None:
+             raise ValueError(
+                 "Cannot resolve 'current': no active graph provided. "
+                 "Ensure features are loaded before using 'current'."
+             )
+
+         if len(graph.features_by_key) == 0:
+             raise ValueError(
+                 "Cannot resolve 'current': active graph is empty. "
+                 "Ensure features are loaded before using 'current'."
+             )
+
+         return graph.snapshot_version
+
+
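For orientation (not part of the wheel's code): the three literal forms accepted by resolve_snapshot behave as follows. A minimal sketch, assuming store is an already-open MetadataStore with at least one recorded snapshot and graph is the active, non-empty FeatureGraph; neither object is constructed here.

resolver = SnapshotResolver()

# An explicit version hash passes through untouched.
assert resolver.resolve_snapshot("abc123", store=None, graph=None) == "abc123"

# "latest" queries the store; "current" reads the active graph.
latest = resolver.resolve_snapshot("latest", store=store, graph=None)
current = resolver.resolve_snapshot("current", store=None, graph=graph)
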
+ class GraphDiffer:
+     """Compares two graph snapshots and produces a diff."""
+
+     def diff(
+         self,
+         snapshot1_data: dict[str, dict[str, Any]],
+         snapshot2_data: dict[str, dict[str, Any]],
+         from_snapshot_version: str = "unknown",
+         to_snapshot_version: str = "unknown",
+     ) -> GraphDiff:
+         """Compute diff between two snapshots.
+
+         Args:
+             snapshot1_data: First snapshot (feature_key -> {feature_version, feature_spec, fields})
+             snapshot2_data: Second snapshot (feature_key -> {feature_version, feature_spec, fields})
+             from_snapshot_version: Source snapshot version
+             to_snapshot_version: Target snapshot version
+
+         Returns:
+             GraphDiff with added, removed, and changed features
+         """
+         # Extract feature keys
+         keys1 = set(snapshot1_data.keys())
+         keys2 = set(snapshot2_data.keys())
+
+         # Identify added and removed features
+         added_keys = keys2 - keys1
+         removed_keys = keys1 - keys2
+         common_keys = keys1 & keys2
+
+         # Build added nodes
+         added_nodes = []
+         for key_str in sorted(added_keys):
+             feature_data = snapshot2_data[key_str]
+             feature_spec = feature_data.get("feature_spec", {})
+
+             # Extract fields
+             fields_list = []
+             for field_dict in feature_spec.get("fields", []):
+                 field_key_list = field_dict.get("key", [])
+                 field_key_str = (
+                     "/".join(field_key_list)
+                     if isinstance(field_key_list, list)
+                     else field_key_list
+                 )
+                 fields_list.append(
+                     {
+                         "key": field_key_str,
+                         "version": feature_data.get("fields", {}).get(
+                             field_key_str, ""
+                         ),
+                         "code_version": field_dict.get("code_version"),
+                     }
+                 )
+
+             # Extract dependencies
+             deps = []
+             if feature_spec.get("deps"):
+                 for dep in feature_spec["deps"]:
+                     dep_key = dep.get("key", [])
+                     if isinstance(dep_key, list):
+                         deps.append(FeatureKey(dep_key))
+                     else:
+                         deps.append(FeatureKey(dep_key.split("/")))
+
+             added_nodes.append(
+                 AddedNode(
+                     feature_key=FeatureKey(key_str.split("/")),
+                     version=feature_data["feature_version"],
+                     code_version=feature_spec.get("code_version"),
+                     fields=fields_list,
+                     dependencies=deps,
+                 )
+             )
+
+         # Build removed nodes
+         removed_nodes = []
+         for key_str in sorted(removed_keys):
+             feature_data = snapshot1_data[key_str]
+             feature_spec = feature_data.get("feature_spec", {})
+
+             # Extract fields
+             fields_list = []
+             for field_dict in feature_spec.get("fields", []):
+                 field_key_list = field_dict.get("key", [])
+                 field_key_str = (
+                     "/".join(field_key_list)
+                     if isinstance(field_key_list, list)
+                     else field_key_list
+                 )
+                 fields_list.append(
+                     {
+                         "key": field_key_str,
+                         "version": feature_data.get("fields", {}).get(
+                             field_key_str, ""
+                         ),
+                         "code_version": field_dict.get("code_version"),
+                     }
+                 )
+
+             # Extract dependencies
+             deps = []
+             if feature_spec.get("deps"):
+                 for dep in feature_spec["deps"]:
+                     dep_key = dep.get("key", [])
+                     if isinstance(dep_key, list):
+                         deps.append(FeatureKey(dep_key))
+                     else:
+                         deps.append(FeatureKey(dep_key.split("/")))
+
+             removed_nodes.append(
+                 RemovedNode(
+                     feature_key=FeatureKey(key_str.split("/")),
+                     version=feature_data["feature_version"],
+                     code_version=feature_spec.get("code_version"),
+                     fields=fields_list,
+                     dependencies=deps,
+                 )
+             )
+
+         # Identify changed features
+         changed_nodes = []
+         for key_str in sorted(common_keys):
+             feature1 = snapshot1_data[key_str]
+             feature2 = snapshot2_data[key_str]
+
+             version1 = feature1["feature_version"]
+             version2 = feature2["feature_version"]
+
+             spec1 = feature1.get("feature_spec", {})
+             spec2 = feature2.get("feature_spec", {})
+
+             fields1 = feature1.get("fields", {})
+             fields2 = feature2.get("fields", {})
+
+             # Check if feature version changed
+             if version1 != version2:
+                 # Compute field changes
+                 field_changes = self._compute_field_changes(fields1, fields2)
+
+                 changed_nodes.append(
+                     NodeChange(
+                         feature_key=FeatureKey(key_str.split("/")),
+                         old_version=version1,
+                         new_version=version2,
+                         old_code_version=spec1.get("code_version"),
+                         new_code_version=spec2.get("code_version"),
+                         added_fields=[fc for fc in field_changes if fc.is_added],
+                         removed_fields=[fc for fc in field_changes if fc.is_removed],
+                         changed_fields=[fc for fc in field_changes if fc.is_changed],
+                     )
+                 )
+
+         return GraphDiff(
+             from_snapshot_version=from_snapshot_version,
+             to_snapshot_version=to_snapshot_version,
+             added_nodes=added_nodes,
+             removed_nodes=removed_nodes,
+             changed_nodes=changed_nodes,
+         )
+
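Because diff() consumes plain dicts, its behavior can be checked with toy data. A sketch with made-up version hashes; the feature_spec payloads are left empty, so field lists and dependencies come out empty on the resulting nodes.

differ = GraphDiffer()
old = {
    "user/age": {"feature_version": "v1", "feature_spec": {}, "fields": {"age": "h1"}},
}
new = {
    "user/age": {"feature_version": "v2", "feature_spec": {}, "fields": {"age": "h2"}},
    "user/name": {"feature_version": "v1", "feature_spec": {}, "fields": {"name": "h3"}},
}
result = differ.diff(old, new, "snap-a", "snap-b")
# expected: result.added_nodes holds user/name, result.changed_nodes holds
# user/age with one changed field (h1 -> h2), result.removed_nodes is empty.
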
+     def _compute_field_changes(
+         self, fields1: dict[str, str], fields2: dict[str, str]
+     ) -> list[FieldChange]:
+         """Compute changes between two field version mappings.
+
+         Args:
+             fields1: Field key (string) -> field version (hash) from snapshot1
+             fields2: Field key (string) -> field version (hash) from snapshot2
+
+         Returns:
+             List of FieldChange objects
+         """
+         field_keys1 = set(fields1.keys())
+         field_keys2 = set(fields2.keys())
+
+         added_fields = field_keys2 - field_keys1
+         removed_fields = field_keys1 - field_keys2
+         common_fields = field_keys1 & field_keys2
+
+         changes = []
+
+         # Added fields
+         for field_key_str in sorted(added_fields):
+             changes.append(
+                 FieldChange(
+                     field_key=FieldKey(field_key_str.split("/")),
+                     old_version=None,
+                     new_version=fields2[field_key_str],
+                     old_code_version=None,
+                     new_code_version=None,  # TODO: Extract from spec if available
+                 )
+             )
+
+         # Removed fields
+         for field_key_str in sorted(removed_fields):
+             changes.append(
+                 FieldChange(
+                     field_key=FieldKey(field_key_str.split("/")),
+                     old_version=fields1[field_key_str],
+                     new_version=None,
+                     old_code_version=None,  # TODO: Extract from spec if available
+                     new_code_version=None,
+                 )
+             )
+
+         # Changed fields
+         for field_key_str in sorted(common_fields):
+             version1 = fields1[field_key_str]
+             version2 = fields2[field_key_str]
+
+             if version1 != version2:
+                 changes.append(
+                     FieldChange(
+                         field_key=FieldKey(field_key_str.split("/")),
+                         old_version=version1,
+                         new_version=version2,
+                         old_code_version=None,  # TODO: Extract from spec if available
+                         new_code_version=None,  # TODO: Extract from spec if available
+                     )
+                 )
+
+         return changes
+
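The field-level classification is pure set arithmetic over the two key sets, which a tiny example pins down (hashes are made up; the private helper is called here purely for illustration):

changes = GraphDiffer()._compute_field_changes(
    {"a": "h1", "b": "h2"},   # snapshot1: field -> version hash
    {"b": "h2x", "c": "h3"},  # snapshot2
)
# expected: "c" added (None -> h3), "a" removed (h1 -> None),
# "b" changed (h2 -> h2x); an unchanged common field yields no FieldChange.
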
+     def create_merged_graph_data(
+         self,
+         snapshot1_data: dict[str, dict[str, Any]],
+         snapshot2_data: dict[str, dict[str, Any]],
+         diff: GraphDiff,
+     ) -> dict[str, Any]:
+         """Create merged graph data structure with status annotations.
+
+         This combines features from both snapshots into a single unified view,
+         annotating each feature with its status (added/removed/changed/unchanged).
+
+         Args:
+             snapshot1_data: First snapshot data (feature_key -> {feature_version, fields})
+             snapshot2_data: Second snapshot data (feature_key -> {feature_version, fields})
+             diff: Computed diff between snapshots
+
+         Returns:
+             Dict with structure:
+             {
+                 'nodes': {
+                     feature_key_str: {
+                         'status': 'added' | 'removed' | 'changed' | 'unchanged',
+                         'old_version': str | None,
+                         'new_version': str | None,
+                         'fields': {...},  # fields from relevant snapshot
+                         'field_changes': [...],  # FieldChange objects for changed nodes
+                         'dependencies': [feature_key_str, ...],  # deps from relevant snapshot
+                     }
+                 },
+                 'edges': [
+                     {'from': feature_key_str, 'to': feature_key_str}
+                 ]
+             }
+         """
+         # Create status mapping for efficient lookup
+         added_keys = {node.feature_key.to_string() for node in diff.added_nodes}
+         removed_keys = {node.feature_key.to_string() for node in diff.removed_nodes}
+         changed_keys = {
+             node.feature_key.to_string(): node for node in diff.changed_nodes
+         }
+
+         # Get all feature keys from both snapshots
+         all_keys = set(snapshot1_data.keys()) | set(snapshot2_data.keys())
+
+         nodes = {}
+         edges = []
+
+         for feature_key_str in all_keys:
+             # Determine status
+             if feature_key_str in added_keys:
+                 status = "added"
+                 old_version = None
+                 new_version = snapshot2_data[feature_key_str]["feature_version"]
+                 fields = snapshot2_data[feature_key_str].get("fields", {})
+                 field_changes = []
+                 # Dependencies from snapshot2
+                 deps = self._extract_dependencies(
+                     snapshot2_data[feature_key_str].get("feature_spec", {})
+                 )
+             elif feature_key_str in removed_keys:
+                 status = "removed"
+                 old_version = snapshot1_data[feature_key_str]["feature_version"]
+                 new_version = None
+                 fields = snapshot1_data[feature_key_str].get("fields", {})
+                 field_changes = []
+                 # Dependencies from snapshot1
+                 deps = self._extract_dependencies(
+                     snapshot1_data[feature_key_str].get("feature_spec", {})
+                 )
+             elif feature_key_str in changed_keys:
+                 status = "changed"
+                 node_change = changed_keys[feature_key_str]
+                 old_version = node_change.old_version
+                 new_version = node_change.new_version
+                 fields = snapshot2_data[feature_key_str].get("fields", {})
+                 # Combine all field changes from the NodeChange
+                 field_changes = (
+                     node_change.added_fields
+                     + node_change.removed_fields
+                     + node_change.changed_fields
+                 )
+                 # Dependencies from snapshot2 (current version)
+                 deps = self._extract_dependencies(
+                     snapshot2_data[feature_key_str].get("feature_spec", {})
+                 )
+             else:
+                 # Unchanged
+                 status = "unchanged"
+                 old_version = snapshot1_data[feature_key_str]["feature_version"]
+                 new_version = snapshot2_data[feature_key_str]["feature_version"]
+                 fields = snapshot2_data[feature_key_str].get("fields", {})
+                 field_changes = []
+                 # Dependencies from snapshot2
+                 deps = self._extract_dependencies(
+                     snapshot2_data[feature_key_str].get("feature_spec", {})
+                 )
+
+             nodes[feature_key_str] = {
+                 "status": status,
+                 "old_version": old_version,
+                 "new_version": new_version,
+                 "fields": fields,
+                 "field_changes": field_changes,
+                 "dependencies": deps,
+             }
+
+             # Create edges for dependencies (arrow points from dependency to feature)
+             for dep_key in deps:
+                 edges.append({"from": dep_key, "to": feature_key_str})
+
+         return {
+             "nodes": nodes,
+             "edges": edges,
+         }
+
+     def _extract_dependencies(self, feature_spec: dict[str, Any]) -> list[str]:
+         """Extract dependency feature keys from a feature spec.
+
+         Args:
+             feature_spec: Parsed feature spec dict
+
+         Returns:
+             List of dependency feature keys as strings
+         """
+         deps = feature_spec.get("deps", [])
+         if deps is None:
+             return []
+
+         dep_keys = []
+         for dep in deps:
+             # dep is a dict with 'key' field
+             dep_key = dep.get("key", [])
+             if isinstance(dep_key, list):
+                 dep_keys.append("/".join(dep_key))
+             else:
+                 dep_keys.append(dep_key)
+
+         return dep_keys
+
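Continuing the toy diff() example from above, the merged view annotates every key from either snapshot. A sketch, with the same assumed attribute shapes:

merged = differ.create_merged_graph_data(old, new, result)
# expected: merged["nodes"]["user/name"]["status"] == "added"
#           merged["nodes"]["user/age"]["status"] == "changed"
# merged["edges"] stays empty because the empty toy specs declare no deps;
# a spec like {"deps": [{"key": ["user", "age"]}]} would contribute an edge
# {"from": "user/age", "to": <dependent feature key>}.
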
+     def filter_merged_graph(
+         self,
+         merged_data: dict[str, Any],
+         focus_feature: str | None = None,
+         up: int | None = None,
+         down: int | None = None,
+     ) -> dict[str, Any]:
+         """Filter merged graph to show only relevant features.
+
+         Args:
+             merged_data: Merged graph data with nodes and edges
+             focus_feature: Feature key to focus on (string format with / or __)
+             up: Number of upstream levels (None = all if focus_feature is set, 0 otherwise)
+             down: Number of downstream levels (None = all if focus_feature is set, 0 otherwise)
+
+         Returns:
+             Filtered merged graph data with same structure
+
+         Raises:
+             ValueError: If focus_feature is specified but not found in graph
+         """
+         if focus_feature is None:
+             # No filtering
+             return merged_data
+
+         # Parse feature key (support both / and __ formats)
+         if "/" in focus_feature:
+             focus_key = focus_feature
+         else:
+             focus_key = focus_feature.replace("__", "/")
+
+         # Check if focus feature exists
+         if focus_key not in merged_data["nodes"]:
+             raise ValueError(f"Feature '{focus_feature}' not found in graph")
+
+         # Build dependency graph for traversal
+         # Build forward edges (feature -> dependents) and backward edges (feature -> dependencies)
+         forward_edges: dict[str, list[str]] = {}  # feature -> list of dependents
+         backward_edges: dict[str, list[str]] = {}  # feature -> list of dependencies
+
+         for edge in merged_data["edges"]:
+             dep = edge["from"]  # dependency
+             feat = edge["to"]  # dependent feature
+
+             if feat not in backward_edges:
+                 backward_edges[feat] = []
+             backward_edges[feat].append(dep)
+
+             if dep not in forward_edges:
+                 forward_edges[dep] = []
+             forward_edges[dep].append(feat)
+
+         # Find features to include
+         features_to_include = {focus_key}
+
+         # Add upstream (dependencies)
+         # Default behavior: if focus_feature is set but up is not specified, include all upstream
+         if up is None:
+             # Include all upstream
+             upstream = self._get_upstream_features(
+                 focus_key, backward_edges, max_levels=None
+             )
+             features_to_include.update(upstream)
+         elif up > 0:
+             # Include specified number of levels
+             upstream = self._get_upstream_features(
+                 focus_key, backward_edges, max_levels=up
+             )
+             features_to_include.update(upstream)
+         # else: up == 0, don't include upstream
+
+         # Add downstream (dependents)
+         # Default behavior: if focus_feature is set but down is not specified, include all downstream
+         if down is None:
+             # Include all downstream
+             downstream = self._get_downstream_features(
+                 focus_key, forward_edges, max_levels=None
+             )
+             features_to_include.update(downstream)
+         elif down > 0:
+             # Include specified number of levels
+             downstream = self._get_downstream_features(
+                 focus_key, forward_edges, max_levels=down
+             )
+             features_to_include.update(downstream)
+         # else: down == 0, don't include downstream
+
+         # Filter nodes and edges
+         filtered_nodes = {
+             k: v for k, v in merged_data["nodes"].items() if k in features_to_include
+         }
+         filtered_edges = [
+             e
+             for e in merged_data["edges"]
+             if e["from"] in features_to_include and e["to"] in features_to_include
+         ]
+
+         return {
+             "nodes": filtered_nodes,
+             "edges": filtered_edges,
+         }
+
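The up/down semantics are easiest to see on a chain. A sketch on hand-built merged data (node payloads elided, keys hypothetical):

chain = {
    "nodes": {k: {} for k in ("raw/a", "agg/b", "ml/c")},
    "edges": [
        {"from": "raw/a", "to": "agg/b"},  # raw/a feeds agg/b
        {"from": "agg/b", "to": "ml/c"},   # agg/b feeds ml/c
    ],
}
view = GraphDiffer().filter_merged_graph(chain, focus_feature="agg__b", up=1, down=0)
# expected: "agg__b" is normalized to "agg/b"; only raw/a and agg/b survive,
# and the agg/b -> ml/c edge is dropped along with its endpoint.
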
+     def _get_upstream_features(
+         self,
+         start_key: str,
+         backward_edges: dict[str, list[str]],
+         max_levels: int | None = None,
+         visited: set[str] | None = None,
+         level: int = 0,
+     ) -> set[str]:
+         """Get upstream features (dependencies) recursively."""
+         if visited is None:
+             visited = set()
+
+         if start_key in visited:
+             return set()
+
+         if max_levels is not None and level >= max_levels:
+             return set()
+
+         visited.add(start_key)
+         upstream: set[str] = set()
+
+         deps = backward_edges.get(start_key, [])
+         for dep in deps:
+             if dep not in visited:
+                 upstream.add(dep)
+                 # Recurse
+                 upstream.update(
+                     self._get_upstream_features(
+                         dep, backward_edges, max_levels, visited, level + 1
+                     )
+                 )
+
+         return upstream
+
+     def _get_downstream_features(
+         self,
+         start_key: str,
+         forward_edges: dict[str, list[str]],
+         max_levels: int | None = None,
+         visited: set[str] | None = None,
+         level: int = 0,
+     ) -> set[str]:
+         """Get downstream features (dependents) recursively."""
+         if visited is None:
+             visited = set()
+
+         if start_key in visited:
+             return set()
+
+         if max_levels is not None and level >= max_levels:
+             return set()
+
+         visited.add(start_key)
+         downstream: set[str] = set()
+
+         dependents = forward_edges.get(start_key, [])
+         for dependent in dependents:
+             if dependent not in visited:
+                 downstream.add(dependent)
+                 # Recurse
+                 downstream.update(
+                     self._get_downstream_features(
+                         dependent, forward_edges, max_levels, visited, level + 1
+                     )
+                 )
+
+         return downstream
+
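Both helpers share one visited set across recursive calls, so a node reachable along several paths is reported once. A diamond-shaped sketch (keys hypothetical; the private call is for illustration):

fwd = {"raw/a": ["agg/b", "agg/c"], "agg/b": ["ml/d"], "agg/c": ["ml/d"]}
out = GraphDiffer()._get_downstream_features("raw/a", fwd)
# expected: {"agg/b", "agg/c", "ml/d"}; the shared sink ml/d is not
# re-walked when it is reached again via the agg/c branch.
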
+     def load_snapshot_data(
+         self, store: MetadataStore, snapshot_version: str
+     ) -> dict[str, dict[str, Any]]:
+         """Load snapshot data from store.
+
+         Args:
+             store: Metadata store to query
+             snapshot_version: Snapshot version to load
+
+         Returns:
+             Dict mapping feature_key (string) -> {feature_version, feature_spec, fields}
+             where fields is dict mapping field_key (string) -> field_version (hash)
+
+         Raises:
+             ValueError: If snapshot not found in store
+         """
+         # Query feature_versions table for this snapshot
+         try:
+             features_lazy = store._read_metadata_native(FEATURE_VERSIONS_KEY)
+             if features_lazy is None:
+                 raise ValueError(
+                     f"No feature_versions table found in store. Cannot load snapshot {snapshot_version}."
+                 )
+
+             # Filter by snapshot_version
+             import narwhals as nw
+
+             features_df = (
+                 features_lazy.filter(nw.col("snapshot_version") == snapshot_version)
+                 .collect()
+                 .to_polars()
+             )
+
+             if features_df.height == 0:
+                 raise ValueError(
+                     f"Snapshot {snapshot_version} not found in store. "
+                     "Run 'metaxy push' to record snapshots or check the version hash."
+                 )
+
+         except Exception as e:
+             raise ValueError(f"Failed to load snapshot {snapshot_version}: {e}") from e
+
+         # Build snapshot data structure
+         snapshot_dict = {}
+         for row in features_df.iter_rows(named=True):
+             feature_key_str = row["feature_key"]
+             feature_version = row["feature_version"]
+             feature_spec_json = row["feature_spec"]
+             feature_class_path = row.get("feature_class_path", "")
+
+             feature_spec_dict = json.loads(feature_spec_json)
+
+             snapshot_dict[feature_key_str] = {
+                 "feature_version": feature_version,
+                 "feature_spec": feature_spec_dict,
+                 "feature_class_path": feature_class_path,
+             }
+
+         # Try to reconstruct FeatureGraph from snapshot to compute field versions
+         # This may fail if features have been removed/moved, so we handle that gracefully
+         graph: FeatureGraph | None = None
+         try:
+             graph = FeatureGraph.from_snapshot(snapshot_dict)
+             graph_available = True
+         except ImportError:
+             # Some features can't be imported (likely removed) - proceed without graph
+             # For diff purposes, we can still show feature-level changes
+             # We'll use feature_version as a fallback for all field versions
+             graph_available = False
+             warnings.warn(
+                 "Using feature_version as field_version fallback for features that cannot be imported. "
+                 "This may occur when features have been removed or moved.",
+                 UserWarning,
+                 stacklevel=2,
+             )
+
+         # Compute field versions using the reconstructed graph (if available)
+         from metaxy.models.plan import FQFieldKey
+
+         snapshot_data = {}
+         for feature_key_str in snapshot_dict.keys():
+             feature_version = snapshot_dict[feature_key_str]["feature_version"]
+             feature_spec = snapshot_dict[feature_key_str]["feature_spec"]
+             feature_key_obj = FeatureKey(feature_key_str.split("/"))
+
+             # Compute field versions using graph (if available)
+             fields_data = {}
+             if (
+                 graph_available
+                 and graph is not None
+                 and feature_key_obj in graph.features_by_key
+             ):
+                 # Feature exists in reconstructed graph - compute precise field versions
+                 for field_dict in feature_spec.get("fields", []):
+                     field_key_list = field_dict.get("key")
+                     if isinstance(field_key_list, list):
+                         field_key = FieldKey(field_key_list)
+                         field_key_str_normalized = "/".join(field_key_list)
+                     else:
+                         field_key = FieldKey([field_key_list])
+                         field_key_str_normalized = field_key_list
+
+                     # Compute field version using the graph
+                     fq_key = FQFieldKey(feature=feature_key_obj, field=field_key)
+                     field_version = graph.get_field_version(fq_key)
+                     fields_data[field_key_str_normalized] = field_version
+             else:
+                 # Feature doesn't exist in graph (removed/moved) - use feature_version as fallback
+                 # All fields get the same version (the feature version)
+                 for field_dict in feature_spec.get("fields", []):
+                     field_key_list = field_dict.get("key")
+                     if isinstance(field_key_list, list):
+                         field_key_str_normalized = "/".join(field_key_list)
+                     else:
+                         field_key_str_normalized = field_key_list
+
+                     # Use feature_version directly as fallback
+                     fields_data[field_key_str_normalized] = feature_version
+
+             snapshot_data[feature_key_str] = {
+                 "feature_version": feature_version,
+                 "fields": fields_data,
+                 "feature_spec": feature_spec,
+             }
+
+         return snapshot_data
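
Putting the pieces together, a plausible end-to-end flow. This is a sketch only: store and graph are assumed to exist as above, the variable names are illustrative, and both resolved snapshot versions must already be recorded in the store or load_snapshot_data raises ValueError.

resolver = SnapshotResolver()
differ = GraphDiffer()
frm = resolver.resolve_snapshot("latest", store=store, graph=None)
to = resolver.resolve_snapshot("current", store=None, graph=graph)
graph_diff = differ.diff(
    differ.load_snapshot_data(store, frm),
    differ.load_snapshot_data(store, to),
    from_snapshot_version=frm,
    to_snapshot_version=to,
)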