metaxy-0.0.0-py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of metaxy might be problematic.
- metaxy/__init__.py +61 -0
- metaxy/_testing.py +542 -0
- metaxy/_utils.py +16 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +76 -0
- metaxy/cli/context.py +71 -0
- metaxy/cli/graph.py +576 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +42 -0
- metaxy/cli/metadata.py +271 -0
- metaxy/cli/migrations.py +862 -0
- metaxy/cli/push.py +55 -0
- metaxy/config.py +450 -0
- metaxy/data_versioning/__init__.py +24 -0
- metaxy/data_versioning/calculators/__init__.py +13 -0
- metaxy/data_versioning/calculators/base.py +97 -0
- metaxy/data_versioning/calculators/duckdb.py +186 -0
- metaxy/data_versioning/calculators/ibis.py +225 -0
- metaxy/data_versioning/calculators/polars.py +135 -0
- metaxy/data_versioning/diff/__init__.py +15 -0
- metaxy/data_versioning/diff/base.py +150 -0
- metaxy/data_versioning/diff/narwhals.py +108 -0
- metaxy/data_versioning/hash_algorithms.py +19 -0
- metaxy/data_versioning/joiners/__init__.py +9 -0
- metaxy/data_versioning/joiners/base.py +70 -0
- metaxy/data_versioning/joiners/narwhals.py +235 -0
- metaxy/entrypoints.py +309 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/alembic.py +326 -0
- metaxy/ext/sqlmodel.py +172 -0
- metaxy/ext/sqlmodel_system_tables.py +139 -0
- metaxy/graph/__init__.py +21 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +399 -0
- metaxy/graph/diff/differ.py +740 -0
- metaxy/graph/diff/models.py +418 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +274 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +320 -0
- metaxy/graph/diff/rendering/rich.py +165 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +31 -0
- metaxy/metadata_store/_protocols.py +38 -0
- metaxy/metadata_store/base.py +1676 -0
- metaxy/metadata_store/clickhouse.py +161 -0
- metaxy/metadata_store/duckdb.py +167 -0
- metaxy/metadata_store/exceptions.py +43 -0
- metaxy/metadata_store/ibis.py +451 -0
- metaxy/metadata_store/memory.py +228 -0
- metaxy/metadata_store/sqlite.py +187 -0
- metaxy/metadata_store/system_tables.py +257 -0
- metaxy/migrations/__init__.py +34 -0
- metaxy/migrations/detector.py +153 -0
- metaxy/migrations/executor.py +208 -0
- metaxy/migrations/loader.py +260 -0
- metaxy/migrations/models.py +718 -0
- metaxy/migrations/ops.py +390 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +6 -0
- metaxy/models/constants.py +24 -0
- metaxy/models/feature.py +665 -0
- metaxy/models/feature_spec.py +105 -0
- metaxy/models/field.py +25 -0
- metaxy/models/plan.py +155 -0
- metaxy/models/types.py +157 -0
- metaxy/py.typed +0 -0
- metaxy-0.0.0.dist-info/METADATA +247 -0
- metaxy-0.0.0.dist-info/RECORD +75 -0
- metaxy-0.0.0.dist-info/WHEEL +4 -0
- metaxy-0.0.0.dist-info/entry_points.txt +3 -0
metaxy/graph/diff/differ.py
@@ -0,0 +1,740 @@
"""Graph diffing logic and snapshot resolution."""

import json
import warnings
from typing import Any

from metaxy.graph.diff.diff_models import (
    AddedNode,
    FieldChange,
    GraphDiff,
    NodeChange,
    RemovedNode,
)
from metaxy.metadata_store.base import FEATURE_VERSIONS_KEY, MetadataStore
from metaxy.models.feature import FeatureGraph
from metaxy.models.types import FeatureKey, FieldKey


class SnapshotResolver:
    """Resolves snapshot version literals to actual snapshot hashes."""

    def resolve_snapshot(
        self, literal: str, store: MetadataStore | None, graph: FeatureGraph | None
    ) -> str:
        """Resolve a snapshot literal to its actual version hash.

        Args:
            literal: Snapshot identifier ("latest", "current", or version hash)
            store: Metadata store to query for snapshots (required for "latest")
            graph: Optional active graph for "current" resolution

        Returns:
            Resolved snapshot version hash

        Raises:
            ValueError: If literal is invalid or cannot be resolved
        """
        if literal == "latest":
            if store is None:
                raise ValueError(
                    "Cannot resolve 'latest': no metadata store provided. "
                    "Provide a store to query for snapshots."
                )
            return self._resolve_latest(store)
        elif literal == "current":
            return self._resolve_current(graph)
        else:
            # Treat as explicit snapshot version
            return literal

    def _resolve_latest(self, store: MetadataStore) -> str:
        """Resolve 'latest' to most recent snapshot in store."""
        snapshots_df = store.read_graph_snapshots()

        if snapshots_df.height == 0:
            raise ValueError(
                "No snapshots found in store. Cannot resolve 'latest'. "
                "Run 'metaxy push' to record a snapshot."
            )

        # read_graph_snapshots() returns sorted by recorded_at descending
        latest_snapshot = snapshots_df["snapshot_version"][0]
        return latest_snapshot

    def _resolve_current(self, graph: FeatureGraph | None) -> str:
        """Resolve 'current' to active graph's snapshot version."""
        if graph is None:
            raise ValueError(
                "Cannot resolve 'current': no active graph provided. "
                "Ensure features are loaded before using 'current'."
            )

        if len(graph.features_by_key) == 0:
            raise ValueError(
                "Cannot resolve 'current': active graph is empty. "
                "Ensure features are loaded before using 'current'."
            )

        return graph.snapshot_version

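# A minimal usage sketch of the resolver (illustrative only; `store` and
# `graph` stand in for a concrete MetadataStore and a loaded FeatureGraph):
#
#   resolver = SnapshotResolver()
#   resolver.resolve_snapshot("latest", store, graph)    # newest hash recorded in the store
#   resolver.resolve_snapshot("current", store, graph)   # graph.snapshot_version
#   resolver.resolve_snapshot("abc123", store, graph)    # returned verbatim as an explicit hash
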
class GraphDiffer:
    """Compares two graph snapshots and produces a diff."""

    def diff(
        self,
        snapshot1_data: dict[str, dict[str, Any]],
        snapshot2_data: dict[str, dict[str, Any]],
        from_snapshot_version: str = "unknown",
        to_snapshot_version: str = "unknown",
    ) -> GraphDiff:
        """Compute diff between two snapshots.

        Args:
            snapshot1_data: First snapshot (feature_key -> {feature_version, feature_spec, fields})
            snapshot2_data: Second snapshot (feature_key -> {feature_version, feature_spec, fields})
            from_snapshot_version: Source snapshot version
            to_snapshot_version: Target snapshot version

        Returns:
            GraphDiff with added, removed, and changed features
        """
        # Extract feature keys
        keys1 = set(snapshot1_data.keys())
        keys2 = set(snapshot2_data.keys())

        # Identify added and removed features
        added_keys = keys2 - keys1
        removed_keys = keys1 - keys2
        common_keys = keys1 & keys2

        # Build added nodes
        added_nodes = []
        for key_str in sorted(added_keys):
            feature_data = snapshot2_data[key_str]
            feature_spec = feature_data.get("feature_spec", {})

            # Extract fields
            fields_list = []
            for field_dict in feature_spec.get("fields", []):
                field_key_list = field_dict.get("key", [])
                field_key_str = (
                    "/".join(field_key_list)
                    if isinstance(field_key_list, list)
                    else field_key_list
                )
                fields_list.append(
                    {
                        "key": field_key_str,
                        "version": feature_data.get("fields", {}).get(
                            field_key_str, ""
                        ),
                        "code_version": field_dict.get("code_version"),
                    }
                )

            # Extract dependencies
            deps = []
            if feature_spec.get("deps"):
                for dep in feature_spec["deps"]:
                    dep_key = dep.get("key", [])
                    if isinstance(dep_key, list):
                        deps.append(FeatureKey(dep_key))
                    else:
                        deps.append(FeatureKey(dep_key.split("/")))

            added_nodes.append(
                AddedNode(
                    feature_key=FeatureKey(key_str.split("/")),
                    version=feature_data["feature_version"],
                    code_version=feature_spec.get("code_version"),
                    fields=fields_list,
                    dependencies=deps,
                )
            )

        # Build removed nodes
        removed_nodes = []
        for key_str in sorted(removed_keys):
            feature_data = snapshot1_data[key_str]
            feature_spec = feature_data.get("feature_spec", {})

            # Extract fields
            fields_list = []
            for field_dict in feature_spec.get("fields", []):
                field_key_list = field_dict.get("key", [])
                field_key_str = (
                    "/".join(field_key_list)
                    if isinstance(field_key_list, list)
                    else field_key_list
                )
                fields_list.append(
                    {
                        "key": field_key_str,
                        "version": feature_data.get("fields", {}).get(
                            field_key_str, ""
                        ),
                        "code_version": field_dict.get("code_version"),
                    }
                )

            # Extract dependencies
            deps = []
            if feature_spec.get("deps"):
                for dep in feature_spec["deps"]:
                    dep_key = dep.get("key", [])
                    if isinstance(dep_key, list):
                        deps.append(FeatureKey(dep_key))
                    else:
                        deps.append(FeatureKey(dep_key.split("/")))

            removed_nodes.append(
                RemovedNode(
                    feature_key=FeatureKey(key_str.split("/")),
                    version=feature_data["feature_version"],
                    code_version=feature_spec.get("code_version"),
                    fields=fields_list,
                    dependencies=deps,
                )
            )

        # Identify changed features
        changed_nodes = []
        for key_str in sorted(common_keys):
            feature1 = snapshot1_data[key_str]
            feature2 = snapshot2_data[key_str]

            version1 = feature1["feature_version"]
            version2 = feature2["feature_version"]

            spec1 = feature1.get("feature_spec", {})
            spec2 = feature2.get("feature_spec", {})

            fields1 = feature1.get("fields", {})
            fields2 = feature2.get("fields", {})

            # Check if feature version changed
            if version1 != version2:
                # Compute field changes
                field_changes = self._compute_field_changes(fields1, fields2)

                changed_nodes.append(
                    NodeChange(
                        feature_key=FeatureKey(key_str.split("/")),
                        old_version=version1,
                        new_version=version2,
                        old_code_version=spec1.get("code_version"),
                        new_code_version=spec2.get("code_version"),
                        added_fields=[fc for fc in field_changes if fc.is_added],
                        removed_fields=[fc for fc in field_changes if fc.is_removed],
                        changed_fields=[fc for fc in field_changes if fc.is_changed],
                    )
                )

        return GraphDiff(
            from_snapshot_version=from_snapshot_version,
            to_snapshot_version=to_snapshot_version,
            added_nodes=added_nodes,
            removed_nodes=removed_nodes,
            changed_nodes=changed_nodes,
        )

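    # Sketch of the payload shape diff() expects, with hypothetical keys and
    # hashes (each top-level key is a feature key string):
    #
    #   snapshot1_data = {
    #       "sales/orders": {
    #           "feature_version": "aaa111",
    #           "feature_spec": {"fields": [{"key": ["amount"]}], "deps": None},
    #           "fields": {"amount": "fff000"},
    #       },
    #   }
    #   diff = GraphDiffer().diff(snapshot1_data, snapshot2_data, "v1", "v2")
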
    def _compute_field_changes(
        self, fields1: dict[str, str], fields2: dict[str, str]
    ) -> list[FieldChange]:
        """Compute changes between two field version mappings.

        Args:
            fields1: Field key (string) -> field version (hash) from snapshot1
            fields2: Field key (string) -> field version (hash) from snapshot2

        Returns:
            List of FieldChange objects
        """
        field_keys1 = set(fields1.keys())
        field_keys2 = set(fields2.keys())

        added_fields = field_keys2 - field_keys1
        removed_fields = field_keys1 - field_keys2
        common_fields = field_keys1 & field_keys2

        changes = []

        # Added fields
        for field_key_str in sorted(added_fields):
            changes.append(
                FieldChange(
                    field_key=FieldKey(field_key_str.split("/")),
                    old_version=None,
                    new_version=fields2[field_key_str],
                    old_code_version=None,
                    new_code_version=None,  # TODO: Extract from spec if available
                )
            )

        # Removed fields
        for field_key_str in sorted(removed_fields):
            changes.append(
                FieldChange(
                    field_key=FieldKey(field_key_str.split("/")),
                    old_version=fields1[field_key_str],
                    new_version=None,
                    old_code_version=None,  # TODO: Extract from spec if available
                    new_code_version=None,
                )
            )

        # Changed fields
        for field_key_str in sorted(common_fields):
            version1 = fields1[field_key_str]
            version2 = fields2[field_key_str]

            if version1 != version2:
                changes.append(
                    FieldChange(
                        field_key=FieldKey(field_key_str.split("/")),
                        old_version=version1,
                        new_version=version2,
                        old_code_version=None,  # TODO: Extract from spec if available
                        new_code_version=None,  # TODO: Extract from spec if available
                    )
                )

        return changes

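    # Worked example with hypothetical hashes: given
    #   fields1 = {"a": "h1", "b": "h2"} and fields2 = {"b": "h3", "c": "h4"},
    # the result is one removed field ("a"), one changed field ("b": h2 -> h3),
    # and one added field ("c"); "b" is reported only because its hash differs.
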
    def create_merged_graph_data(
        self,
        snapshot1_data: dict[str, dict[str, Any]],
        snapshot2_data: dict[str, dict[str, Any]],
        diff: GraphDiff,
    ) -> dict[str, Any]:
        """Create merged graph data structure with status annotations.

        This combines features from both snapshots into a single unified view,
        annotating each feature with its status (added/removed/changed/unchanged).

        Args:
            snapshot1_data: First snapshot data (feature_key -> {feature_version, fields})
            snapshot2_data: Second snapshot data (feature_key -> {feature_version, fields})
            diff: Computed diff between snapshots

        Returns:
            Dict with structure:
            {
                'nodes': {
                    feature_key_str: {
                        'status': 'added' | 'removed' | 'changed' | 'unchanged',
                        'old_version': str | None,
                        'new_version': str | None,
                        'fields': {...},  # fields from relevant snapshot
                        'field_changes': [...],  # FieldChange objects for changed nodes
                        'dependencies': [feature_key_str, ...],  # deps from relevant snapshot
                    }
                },
                'edges': [
                    {'from': feature_key_str, 'to': feature_key_str}
                ]
            }
        """
        # Create status mapping for efficient lookup
        added_keys = {node.feature_key.to_string() for node in diff.added_nodes}
        removed_keys = {node.feature_key.to_string() for node in diff.removed_nodes}
        changed_keys = {
            node.feature_key.to_string(): node for node in diff.changed_nodes
        }

        # Get all feature keys from both snapshots
        all_keys = set(snapshot1_data.keys()) | set(snapshot2_data.keys())

        nodes = {}
        edges = []

        for feature_key_str in all_keys:
            # Determine status
            if feature_key_str in added_keys:
                status = "added"
                old_version = None
                new_version = snapshot2_data[feature_key_str]["feature_version"]
                fields = snapshot2_data[feature_key_str].get("fields", {})
                field_changes = []
                # Dependencies from snapshot2
                deps = self._extract_dependencies(
                    snapshot2_data[feature_key_str].get("feature_spec", {})
                )
            elif feature_key_str in removed_keys:
                status = "removed"
                old_version = snapshot1_data[feature_key_str]["feature_version"]
                new_version = None
                fields = snapshot1_data[feature_key_str].get("fields", {})
                field_changes = []
                # Dependencies from snapshot1
                deps = self._extract_dependencies(
                    snapshot1_data[feature_key_str].get("feature_spec", {})
                )
            elif feature_key_str in changed_keys:
                status = "changed"
                node_change = changed_keys[feature_key_str]
                old_version = node_change.old_version
                new_version = node_change.new_version
                fields = snapshot2_data[feature_key_str].get("fields", {})
                # Combine all field changes from the NodeChange
                field_changes = (
                    node_change.added_fields
                    + node_change.removed_fields
                    + node_change.changed_fields
                )
                # Dependencies from snapshot2 (current version)
                deps = self._extract_dependencies(
                    snapshot2_data[feature_key_str].get("feature_spec", {})
                )
            else:
                # Unchanged
                status = "unchanged"
                old_version = snapshot1_data[feature_key_str]["feature_version"]
                new_version = snapshot2_data[feature_key_str]["feature_version"]
                fields = snapshot2_data[feature_key_str].get("fields", {})
                field_changes = []
                # Dependencies from snapshot2
                deps = self._extract_dependencies(
                    snapshot2_data[feature_key_str].get("feature_spec", {})
                )

            nodes[feature_key_str] = {
                "status": status,
                "old_version": old_version,
                "new_version": new_version,
                "fields": fields,
                "field_changes": field_changes,
                "dependencies": deps,
            }

            # Create edges for dependencies (arrow points from dependency to feature)
            for dep_key in deps:
                edges.append({"from": dep_key, "to": feature_key_str})

        return {
            "nodes": nodes,
            "edges": edges,
        }

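    # Shape of the merged output for a tiny graph where "raw/events" feeds a
    # changed "sales/orders" node (keys and versions hypothetical):
    #
    #   {
    #       "nodes": {
    #           "raw/events": {"status": "unchanged", "old_version": "e1", "new_version": "e1", ...},
    #           "sales/orders": {"status": "changed", "old_version": "o1", "new_version": "o2", ...},
    #       },
    #       "edges": [{"from": "raw/events", "to": "sales/orders"}],
    #   }
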
    def _extract_dependencies(self, feature_spec: dict[str, Any]) -> list[str]:
        """Extract dependency feature keys from a feature spec.

        Args:
            feature_spec: Parsed feature spec dict

        Returns:
            List of dependency feature keys as strings
        """
        deps = feature_spec.get("deps", [])
        if deps is None:
            return []

        dep_keys = []
        for dep in deps:
            # dep is a dict with 'key' field
            dep_key = dep.get("key", [])
            if isinstance(dep_key, list):
                dep_keys.append("/".join(dep_key))
            else:
                dep_keys.append(dep_key)

        return dep_keys

    def filter_merged_graph(
        self,
        merged_data: dict[str, Any],
        focus_feature: str | None = None,
        up: int | None = None,
        down: int | None = None,
    ) -> dict[str, Any]:
        """Filter merged graph to show only relevant features.

        Args:
            merged_data: Merged graph data with nodes and edges
            focus_feature: Feature key to focus on (string format with / or __)
            up: Number of upstream levels (None = all if focus_feature is set, 0 otherwise)
            down: Number of downstream levels (None = all if focus_feature is set, 0 otherwise)

        Returns:
            Filtered merged graph data with same structure

        Raises:
            ValueError: If focus_feature is specified but not found in graph
        """
        if focus_feature is None:
            # No filtering
            return merged_data

        # Parse feature key (support both / and __ formats)
        if "/" in focus_feature:
            focus_key = focus_feature
        else:
            focus_key = focus_feature.replace("__", "/")

        # Check if focus feature exists
        if focus_key not in merged_data["nodes"]:
            raise ValueError(f"Feature '{focus_feature}' not found in graph")

        # Build dependency graph for traversal
        # Build forward edges (feature -> dependents) and backward edges (feature -> dependencies)
        forward_edges: dict[str, list[str]] = {}  # feature -> list of dependents
        backward_edges: dict[str, list[str]] = {}  # feature -> list of dependencies

        for edge in merged_data["edges"]:
            dep = edge["from"]  # dependency
            feat = edge["to"]  # dependent feature

            if feat not in backward_edges:
                backward_edges[feat] = []
            backward_edges[feat].append(dep)

            if dep not in forward_edges:
                forward_edges[dep] = []
            forward_edges[dep].append(feat)

        # Find features to include
        features_to_include = {focus_key}

        # Add upstream (dependencies)
        # Default behavior: if focus_feature is set but up is not specified, include all upstream
        if up is None:
            # Include all upstream
            upstream = self._get_upstream_features(
                focus_key, backward_edges, max_levels=None
            )
            features_to_include.update(upstream)
        elif up > 0:
            # Include specified number of levels
            upstream = self._get_upstream_features(
                focus_key, backward_edges, max_levels=up
            )
            features_to_include.update(upstream)
        # else: up == 0, don't include upstream

        # Add downstream (dependents)
        # Default behavior: if focus_feature is set but down is not specified, include all downstream
        if down is None:
            # Include all downstream
            downstream = self._get_downstream_features(
                focus_key, forward_edges, max_levels=None
            )
            features_to_include.update(downstream)
        elif down > 0:
            # Include specified number of levels
            downstream = self._get_downstream_features(
                focus_key, forward_edges, max_levels=down
            )
            features_to_include.update(downstream)
        # else: down == 0, don't include downstream

        # Filter nodes and edges
        filtered_nodes = {
            k: v for k, v in merged_data["nodes"].items() if k in features_to_include
        }
        filtered_edges = [
            e
            for e in merged_data["edges"]
            if e["from"] in features_to_include and e["to"] in features_to_include
        ]

        return {
            "nodes": filtered_nodes,
            "edges": filtered_edges,
        }

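    # Sketch of the up/down semantics on a chain a -> b -> c (edges point from
    # dependency to dependent), focusing on "b":
    #
    #   filter_merged_graph(merged, "b")                # keeps a, b, c (all levels by default)
    #   filter_merged_graph(merged, "b", up=1, down=0)  # keeps a and b
    #   filter_merged_graph(merged, "b", up=0, down=0)  # keeps only b
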
    def _get_upstream_features(
        self,
        start_key: str,
        backward_edges: dict[str, list[str]],
        max_levels: int | None = None,
        visited: set[str] | None = None,
        level: int = 0,
    ) -> set[str]:
        """Get upstream features (dependencies) recursively."""
        if visited is None:
            visited = set()

        if start_key in visited:
            return set()

        if max_levels is not None and level >= max_levels:
            return set()

        visited.add(start_key)
        upstream: set[str] = set()

        deps = backward_edges.get(start_key, [])
        for dep in deps:
            if dep not in visited:
                upstream.add(dep)
                # Recurse
                upstream.update(
                    self._get_upstream_features(
                        dep, backward_edges, max_levels, visited, level + 1
                    )
                )

        return upstream

    def _get_downstream_features(
        self,
        start_key: str,
        forward_edges: dict[str, list[str]],
        max_levels: int | None = None,
        visited: set[str] | None = None,
        level: int = 0,
    ) -> set[str]:
        """Get downstream features (dependents) recursively."""
        if visited is None:
            visited = set()

        if start_key in visited:
            return set()

        if max_levels is not None and level >= max_levels:
            return set()

        visited.add(start_key)
        downstream: set[str] = set()

        dependents = forward_edges.get(start_key, [])
        for dependent in dependents:
            if dependent not in visited:
                downstream.add(dependent)
                # Recurse
                downstream.update(
                    self._get_downstream_features(
                        dependent, forward_edges, max_levels, visited, level + 1
                    )
                )

        return downstream

    def load_snapshot_data(
        self, store: MetadataStore, snapshot_version: str
    ) -> dict[str, dict[str, Any]]:
        """Load snapshot data from store.

        Args:
            store: Metadata store to query
            snapshot_version: Snapshot version to load

        Returns:
            Dict mapping feature_key (string) -> {feature_version, feature_spec, fields}
            where fields is dict mapping field_key (string) -> field_version (hash)

        Raises:
            ValueError: If snapshot not found in store
        """
        # Query feature_versions table for this snapshot
        try:
            features_lazy = store._read_metadata_native(FEATURE_VERSIONS_KEY)
            if features_lazy is None:
                raise ValueError(
                    f"No feature_versions table found in store. Cannot load snapshot {snapshot_version}."
                )

            # Filter by snapshot_version
            import narwhals as nw

            features_df = (
                features_lazy.filter(nw.col("snapshot_version") == snapshot_version)
                .collect()
                .to_polars()
            )

            if features_df.height == 0:
                raise ValueError(
                    f"Snapshot {snapshot_version} not found in store. "
                    "Run 'metaxy push' to record snapshots or check the version hash."
                )

        except Exception as e:
            raise ValueError(f"Failed to load snapshot {snapshot_version}: {e}") from e

        # Build snapshot data structure
        snapshot_dict = {}
        for row in features_df.iter_rows(named=True):
            feature_key_str = row["feature_key"]
            feature_version = row["feature_version"]
            feature_spec_json = row["feature_spec"]
            feature_class_path = row.get("feature_class_path", "")

            feature_spec_dict = json.loads(feature_spec_json)

            snapshot_dict[feature_key_str] = {
                "feature_version": feature_version,
                "feature_spec": feature_spec_dict,
                "feature_class_path": feature_class_path,
            }

        # Try to reconstruct FeatureGraph from snapshot to compute field versions
        # This may fail if features have been removed/moved, so we handle that gracefully
        graph: FeatureGraph | None = None
        try:
            graph = FeatureGraph.from_snapshot(snapshot_dict)
            graph_available = True
        except ImportError:
            # Some features can't be imported (likely removed) - proceed without graph
            # For diff purposes, we can still show feature-level changes
            # We'll use feature_version as a fallback for all field versions
            graph_available = False
            warnings.warn(
                "Using feature_version as field_version fallback for features that cannot be imported. "
                "This may occur when features have been removed or moved.",
                UserWarning,
                stacklevel=2,
            )

        # Compute field versions using the reconstructed graph (if available)
        from metaxy.models.plan import FQFieldKey

        snapshot_data = {}
        for feature_key_str in snapshot_dict.keys():
            feature_version = snapshot_dict[feature_key_str]["feature_version"]
            feature_spec = snapshot_dict[feature_key_str]["feature_spec"]
            feature_key_obj = FeatureKey(feature_key_str.split("/"))

            # Compute field versions using graph (if available)
            fields_data = {}
            if (
                graph_available
                and graph is not None
                and feature_key_obj in graph.features_by_key
            ):
                # Feature exists in reconstructed graph - compute precise field versions
                for field_dict in feature_spec.get("fields", []):
                    field_key_list = field_dict.get("key")
                    if isinstance(field_key_list, list):
                        field_key = FieldKey(field_key_list)
                        field_key_str_normalized = "/".join(field_key_list)
                    else:
                        field_key = FieldKey([field_key_list])
                        field_key_str_normalized = field_key_list

                    # Compute field version using the graph
                    fq_key = FQFieldKey(feature=feature_key_obj, field=field_key)
                    field_version = graph.get_field_version(fq_key)
                    fields_data[field_key_str_normalized] = field_version
            else:
                # Feature doesn't exist in graph (removed/moved) - use feature_version as fallback
                # All fields get the same version (the feature version)
                for field_dict in feature_spec.get("fields", []):
                    field_key_list = field_dict.get("key")
                    if isinstance(field_key_list, list):
                        field_key_str_normalized = "/".join(field_key_list)
                    else:
                        field_key_str_normalized = field_key_list

                    # Use feature_version directly as fallback
                    fields_data[field_key_str_normalized] = feature_version

            snapshot_data[feature_key_str] = {
                "feature_version": feature_version,
                "fields": fields_data,
                "feature_spec": feature_spec,
            }

        return snapshot_data
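
# End-to-end sketch combining the pieces above (names illustrative; assumes a
# store holding at least one recorded snapshot and an active feature graph):
#
#   differ = GraphDiffer()
#   resolver = SnapshotResolver()
#   v_from = resolver.resolve_snapshot("latest", store, graph=None)
#   v_to = resolver.resolve_snapshot("current", store=None, graph=graph)
#   data_from = differ.load_snapshot_data(store, v_from)
#   data_to = differ.load_snapshot_data(store, v_to)
#   diff = differ.diff(data_from, data_to, v_from, v_to)
#   merged = differ.create_merged_graph_data(data_from, data_to, diff)
#   focused = differ.filter_merged_graph(merged, focus_feature="sales/orders")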