metaxy-0.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of metaxy might be problematic.
- metaxy/__init__.py +61 -0
- metaxy/_testing.py +542 -0
- metaxy/_utils.py +16 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +76 -0
- metaxy/cli/context.py +71 -0
- metaxy/cli/graph.py +576 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +42 -0
- metaxy/cli/metadata.py +271 -0
- metaxy/cli/migrations.py +862 -0
- metaxy/cli/push.py +55 -0
- metaxy/config.py +450 -0
- metaxy/data_versioning/__init__.py +24 -0
- metaxy/data_versioning/calculators/__init__.py +13 -0
- metaxy/data_versioning/calculators/base.py +97 -0
- metaxy/data_versioning/calculators/duckdb.py +186 -0
- metaxy/data_versioning/calculators/ibis.py +225 -0
- metaxy/data_versioning/calculators/polars.py +135 -0
- metaxy/data_versioning/diff/__init__.py +15 -0
- metaxy/data_versioning/diff/base.py +150 -0
- metaxy/data_versioning/diff/narwhals.py +108 -0
- metaxy/data_versioning/hash_algorithms.py +19 -0
- metaxy/data_versioning/joiners/__init__.py +9 -0
- metaxy/data_versioning/joiners/base.py +70 -0
- metaxy/data_versioning/joiners/narwhals.py +235 -0
- metaxy/entrypoints.py +309 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/alembic.py +326 -0
- metaxy/ext/sqlmodel.py +172 -0
- metaxy/ext/sqlmodel_system_tables.py +139 -0
- metaxy/graph/__init__.py +21 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +399 -0
- metaxy/graph/diff/differ.py +740 -0
- metaxy/graph/diff/models.py +418 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +274 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +320 -0
- metaxy/graph/diff/rendering/rich.py +165 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +31 -0
- metaxy/metadata_store/_protocols.py +38 -0
- metaxy/metadata_store/base.py +1676 -0
- metaxy/metadata_store/clickhouse.py +161 -0
- metaxy/metadata_store/duckdb.py +167 -0
- metaxy/metadata_store/exceptions.py +43 -0
- metaxy/metadata_store/ibis.py +451 -0
- metaxy/metadata_store/memory.py +228 -0
- metaxy/metadata_store/sqlite.py +187 -0
- metaxy/metadata_store/system_tables.py +257 -0
- metaxy/migrations/__init__.py +34 -0
- metaxy/migrations/detector.py +153 -0
- metaxy/migrations/executor.py +208 -0
- metaxy/migrations/loader.py +260 -0
- metaxy/migrations/models.py +718 -0
- metaxy/migrations/ops.py +390 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +6 -0
- metaxy/models/constants.py +24 -0
- metaxy/models/feature.py +665 -0
- metaxy/models/feature_spec.py +105 -0
- metaxy/models/field.py +25 -0
- metaxy/models/plan.py +155 -0
- metaxy/models/types.py +157 -0
- metaxy/py.typed +0 -0
- metaxy-0.0.0.dist-info/METADATA +247 -0
- metaxy-0.0.0.dist-info/RECORD +75 -0
- metaxy-0.0.0.dist-info/WHEEL +4 -0
- metaxy-0.0.0.dist-info/entry_points.txt +3 -0
metaxy/models/feature.py
ADDED
@@ -0,0 +1,665 @@
import hashlib
from contextlib import contextmanager
from contextvars import ContextVar
from typing import TYPE_CHECKING, Any, ClassVar

from pydantic._internal._model_construction import ModelMetaclass
from typing_extensions import Self

from metaxy.models.bases import FrozenBaseModel
from metaxy.models.feature_spec import FeatureSpec
from metaxy.models.plan import FeaturePlan, FQFieldKey
from metaxy.models.types import FeatureKey

if TYPE_CHECKING:
    import narwhals as nw

    from metaxy.data_versioning.diff import (
        DiffResult,
        LazyDiffResult,
        MetadataDiffResolver,
    )
    from metaxy.data_versioning.joiners import UpstreamJoiner


# Context variable for active graph (module-level)
_active_graph: ContextVar["FeatureGraph | None"] = ContextVar(
    "_active_graph", default=None
)


def get_feature_by_key(key: "FeatureKey") -> type["Feature"]:
    """Get a feature class by its key from the active graph.

    Convenience function that retrieves from the currently active graph.

    Args:
        key: Feature key to look up

    Returns:
        Feature class

    Raises:
        KeyError: If no feature with the given key is registered

    Example:
        >>> from metaxy import get_feature_by_key, FeatureKey
        >>> parent_key = FeatureKey(["examples", "parent"])
        >>> ParentFeature = get_feature_by_key(parent_key)
    """
    graph = FeatureGraph.get_active()
    return graph.get_feature_by_key(key)


class FeatureGraph:
    def __init__(self):
        self.features_by_key: dict[FeatureKey, type[Feature]] = {}
        self.feature_specs_by_key: dict[FeatureKey, FeatureSpec] = {}

    def add_feature(self, feature: type["Feature"]) -> None:
        """Add a feature to the graph.

        Args:
            feature: Feature class to register

        Raises:
            ValueError: If a feature with the same key is already registered
        """
        if feature.spec.key in self.features_by_key:
            existing = self.features_by_key[feature.spec.key]
            raise ValueError(
                f"Feature with key {feature.spec.key.to_string()} already registered. "
                f"Existing: {existing.__name__}, New: {feature.__name__}. "
                f"Each feature key must be unique within a graph."
            )

        self.features_by_key[feature.spec.key] = feature
        self.feature_specs_by_key[feature.spec.key] = feature.spec

    def remove_feature(self, key: FeatureKey) -> None:
        """Remove a feature from the graph.

        Args:
            key: Feature key to remove

        Raises:
            KeyError: If no feature with the given key is registered
        """
        if key not in self.features_by_key:
            raise KeyError(
                f"No feature with key {key.to_string()} found in graph. "
                f"Available keys: {[k.to_string() for k in self.features_by_key.keys()]}"
            )

        del self.features_by_key[key]
        del self.feature_specs_by_key[key]

    def get_feature_by_key(self, key: FeatureKey) -> type["Feature"]:
        """Get a feature class by its key.

        Args:
            key: Feature key to look up

        Returns:
            Feature class

        Raises:
            KeyError: If no feature with the given key is registered

        Example:
            >>> graph = FeatureGraph.get_active()
            >>> parent_key = FeatureKey(["examples", "parent"])
            >>> ParentFeature = graph.get_feature_by_key(parent_key)
        """
        if key not in self.features_by_key:
            raise KeyError(
                f"No feature with key {key.to_string()} found in graph. "
                f"Available keys: {[k.to_string() for k in self.features_by_key.keys()]}"
            )
        return self.features_by_key[key]

    def get_feature_plan(self, key: FeatureKey) -> FeaturePlan:
        feature = self.feature_specs_by_key[key]

        return FeaturePlan(
            feature=feature,
            deps=[self.feature_specs_by_key[dep.key] for dep in feature.deps or []]
            or None,
        )

    def get_field_version(self, key: "FQFieldKey") -> str:
        hasher = hashlib.sha256()

        plan = self.get_feature_plan(key.feature)
        field = plan.feature.fields_by_key[key.field]

        hasher.update(key.to_string().encode())
        hasher.update(str(field.code_version).encode())

        for k, v in sorted(plan.get_parent_fields_for_field(key.field).items()):
            hasher.update(self.get_field_version(k).encode())

        return hasher.hexdigest()

    def get_feature_version_by_field(self, key: FeatureKey) -> dict[str, str]:
        """Computes the feature data version.

        Hash together field data versions with the feature code version.

        Returns:
            dict[str, str]: The data version for each field in the feature plan.
                Keys are field names as strings.
        """
        res = {}

        plan = self.get_feature_plan(key)

        for k, v in plan.feature.fields_by_key.items():
            res[k.to_string()] = self.get_field_version(
                FQFieldKey(field=k, feature=key)
            )

        return res

    def get_feature_version(self, key: FeatureKey) -> str:
        """Computes the feature version as a single string."""
        hasher = hashlib.sha256()
        data_version = self.get_feature_version_by_field(key)
        for field_key in sorted(data_version):
            hasher.update(field_key.encode())
            hasher.update(data_version[field_key].encode())

        return hasher.hexdigest()
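
    # Editorial note (not part of the packaged file): the versioning scheme above is
    # recursive. get_field_version() hashes a field's fully-qualified key, its
    # code_version, and the versions of every parent field it depends on, so a
    # code_version bump anywhere upstream propagates into downstream field versions,
    # the feature versions built from them, and the graph-level snapshot_version.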

    def get_downstream_features(self, sources: list[FeatureKey]) -> list[FeatureKey]:
        """Get all features downstream of sources, topologically sorted.

        Performs a depth-first traversal of the dependency graph to find all
        features that transitively depend on any of the source features.

        Args:
            sources: List of source feature keys

        Returns:
            List of downstream feature keys in topological order (dependencies first).
            Does not include the source features themselves.

        Example:
            >>> # DAG: A -> B -> D
            >>> #      A -> C -> D
            >>> graph.get_downstream_features([FeatureKey(["A"])])
            [FeatureKey(["B"]), FeatureKey(["C"]), FeatureKey(["D"])]
        """
        source_set = set(sources)
        visited = set()
        post_order = []  # Reverse topological order

        def visit(key: FeatureKey):
            """DFS traversal."""
            if key in visited:
                return
            visited.add(key)

            # Find all features that depend on this one
            for feature_key, feature_spec in self.feature_specs_by_key.items():
                if feature_spec.deps:
                    for dep in feature_spec.deps:
                        if dep.key == key:
                            # This feature depends on 'key', so visit it
                            visit(feature_key)

            post_order.append(key)

        # Visit all sources
        for source in sources:
            visit(source)

        # Remove sources from result, reverse to get topological order
        result = [k for k in reversed(post_order) if k not in source_set]
        return result

    @property
    def snapshot_version(self) -> str:
        """Generate a snapshot version representing the current topology + versions of the feature graph"""
        if len(self.feature_specs_by_key) == 0:
            return "empty"

        hasher = hashlib.sha256()
        for feature_key in sorted(self.feature_specs_by_key.keys()):
            hasher.update(feature_key.to_string().encode("utf-8"))
            hasher.update(self.get_feature_version(feature_key).encode("utf-8"))
        return hasher.hexdigest()

    def to_snapshot(self) -> dict[str, dict[str, Any]]:
        """Serialize graph to snapshot format.

        Returns a dict mapping feature_key (string) to feature data dict,
        including the import path of the Feature class for reconstruction.

        Returns:
            Dict of feature_key -> {
                feature_spec: dict,
                feature_version: str,
                feature_class_path: str
            }

        Example:
            >>> snapshot = graph.to_snapshot()
            >>> snapshot["video_processing"]["feature_version"]
            'abc12345'
            >>> snapshot["video_processing"]["feature_class_path"]
            'myapp.features.video.VideoProcessing'
        """
        snapshot = {}

        for feature_key, feature_cls in self.features_by_key.items():
            feature_key_str = feature_key.to_string()
            feature_spec_dict = feature_cls.spec.model_dump(mode="json")  # type: ignore[attr-defined]
            feature_version = feature_cls.feature_version()  # type: ignore[attr-defined]

            # Get class import path (module.ClassName)
            class_path = f"{feature_cls.__module__}.{feature_cls.__name__}"

            snapshot[feature_key_str] = {
                "feature_spec": feature_spec_dict,
                "feature_version": feature_version,
                "feature_class_path": class_path,
            }

        return snapshot

    @classmethod
    def from_snapshot(
        cls,
        snapshot_data: dict[str, dict[str, Any]],
        *,
        class_path_overrides: dict[str, str] | None = None,
        force_reload: bool = False,
    ) -> "FeatureGraph":
        """Reconstruct graph from snapshot by importing Feature classes.

        Strictly requires Feature classes to exist at their recorded import paths.
        This ensures custom methods (like load_input) are available.

        If a feature has been moved/renamed, use class_path_overrides to specify
        the new location.

        Args:
            snapshot_data: Dict of feature_key -> {
                feature_spec: dict,
                feature_class_path: str,
                ...
            } (as returned by to_snapshot() or loaded from DB)
            class_path_overrides: Optional dict mapping feature_key to new class path
                for features that have been moved/renamed
            force_reload: If True, reload modules from disk to get current code state.

        Returns:
            New FeatureGraph with historical features

        Raises:
            ImportError: If feature class cannot be imported at recorded path

        Example:
            >>> # Load snapshot from metadata store
            >>> historical_graph = FeatureGraph.from_snapshot(snapshot_data)
            >>>
            >>> # With override for moved feature
            >>> historical_graph = FeatureGraph.from_snapshot(
            ...     snapshot_data,
            ...     class_path_overrides={
            ...         "video_processing": "myapp.features_v2.VideoProcessing"
            ...     }
            ... )
        """
        import importlib
        import sys

        from metaxy.models.feature_spec import FeatureSpec

        graph = cls()
        class_path_overrides = class_path_overrides or {}

        # If force_reload, collect all module paths first to remove ALL features
        # from those modules before reloading (modules can have multiple features)
        modules_to_reload = set()
        if force_reload:
            for feature_key_str, feature_data in snapshot_data.items():
                class_path = class_path_overrides.get(
                    feature_key_str
                ) or feature_data.get("feature_class_path")
                if class_path:
                    module_path, _ = class_path.rsplit(".", 1)
                    if module_path in sys.modules:
                        modules_to_reload.add(module_path)

        # Use context manager to temporarily set the new graph as active
        # This ensures imported Feature classes register to the new graph, not the current one
        with graph.use():
            for feature_key_str, feature_data in snapshot_data.items():
                # Parse FeatureSpec for validation
                feature_spec_dict = feature_data["feature_spec"]
                FeatureSpec.model_validate(feature_spec_dict)

                # Get class path (check overrides first)
                if feature_key_str in class_path_overrides:
                    class_path = class_path_overrides[feature_key_str]
                else:
                    class_path = feature_data.get("feature_class_path")
                    if not class_path:
                        raise ValueError(
                            f"Feature '{feature_key_str}' has no feature_class_path in snapshot. "
                            f"Cannot reconstruct historical graph."
                        )

                # Import the class
                try:
                    module_path, class_name = class_path.rsplit(".", 1)

                    # Force reload module from disk if requested
                    # This is critical for migration detection - when code changes,
                    # we need fresh imports to detect the changes
                    if force_reload and module_path in modules_to_reload:
                        # Before first reload of this module, remove ALL features from this module
                        # (a module can define multiple features)
                        if module_path in modules_to_reload:
                            # Find all features from this module in snapshot and remove them
                            for fk_str, fd in snapshot_data.items():
                                fcp = class_path_overrides.get(fk_str) or fd.get(
                                    "feature_class_path"
                                )
                                if fcp and fcp.rsplit(".", 1)[0] == module_path:
                                    fspec_dict = fd["feature_spec"]
                                    fspec = FeatureSpec.model_validate(fspec_dict)
                                    if fspec.key in graph.features_by_key:
                                        graph.remove_feature(fspec.key)

                            # Mark module as processed so we don't remove features again
                            modules_to_reload.discard(module_path)

                        module = importlib.reload(sys.modules[module_path])
                    else:
                        module = __import__(module_path, fromlist=[class_name])

                    feature_cls = getattr(module, class_name)
                except (ImportError, AttributeError) as e:
                    raise ImportError(
                        f"Cannot import Feature class '{class_path}' for feature graph reconstruction from snapshot. "
                        f"Feature '{feature_key_str}' is required to reconstruct the graph, but the class "
                        f"cannot be found at the recorded import path. "
                    ) from e

                # Validate the imported class matches the stored spec
                if not hasattr(feature_cls, "spec"):
                    raise TypeError(
                        f"Imported class '{class_path}' is not a valid Feature class "
                        f"(missing 'spec' attribute)"
                    )

                # Register the imported feature to this graph if not already present
                # If the module was imported for the first time, the metaclass already registered it
                # If the module was previously imported, we need to manually register it
                if feature_cls.spec.key not in graph.features_by_key:
                    graph.add_feature(feature_cls)

        return graph

    @classmethod
    def get_active(cls) -> "FeatureGraph":
        """Get the currently active graph.

        Returns the graph from the context variable if set, otherwise returns
        the default global graph.

        Returns:
            Active FeatureGraph instance

        Example:
            >>> # Normal usage - returns default graph
            >>> reg = FeatureGraph.get_active()
            >>>
            >>> # With custom graph in context
            >>> with my_graph.use():
            ...     reg = FeatureGraph.get_active()  # Returns my_graph
        """
        return _active_graph.get() or graph

    @classmethod
    def set_active(cls, reg: "FeatureGraph") -> None:
        """Set the active graph for the current context.

        This sets the context variable that will be returned by get_active().
        Typically used in application setup code or test fixtures.

        Args:
            reg: FeatureGraph to activate

        Example:
            >>> # In application setup
            >>> my_graph = FeatureGraph()
            >>> FeatureGraph.set_active(my_graph)
            >>>
            >>> # Now all operations use my_graph
            >>> FeatureGraph.get_active()  # Returns my_graph
        """
        _active_graph.set(reg)

    @contextmanager
    def use(self):
        """Context manager to temporarily use this graph as active.

        This is the recommended way to use custom registries, especially in tests.
        The graph is automatically restored when the context exits.

        Yields:
            This graph instance

        Example:
            >>> test_graph = FeatureGraph()
            >>>
            >>> with test_graph.use():
            ...     # All operations use test_graph
            ...     class TestFeature(Feature, spec=...):
            ...         pass
            ...
            >>> # Outside context, back to previous graph
        """
        token = _active_graph.set(self)
        try:
            yield self
        finally:
            _active_graph.reset(token)


# Default global graph
graph = FeatureGraph()
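
# Illustrative sketch (editorial addition, not part of the packaged file), using only the
# APIs defined above: the module-level `graph` is the default, and use() temporarily swaps
# the active graph for the current context.
#
#     isolated = FeatureGraph()
#     assert FeatureGraph.get_active() is graph         # default global graph
#     with isolated.use():
#         assert FeatureGraph.get_active() is isolated  # classes defined here register to `isolated`
#     assert FeatureGraph.get_active() is graph         # restored when the context exits
#     assert isolated.snapshot_version == "empty"       # no features registered yet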


class MetaxyMeta(ModelMetaclass):
    def __new__(
        cls,
        cls_name: str,
        bases: tuple[type[Any], ...],
        namespace: dict[str, Any],
        *,
        spec: FeatureSpec | None,
        **kwargs,
    ) -> type[Self]: # pyright: ignore[reportGeneralTypeIssues]
        new_cls = super().__new__(cls, cls_name, bases, namespace, **kwargs)

        if spec:
            # Get graph from context at class definition time
            active_graph = FeatureGraph.get_active()
            new_cls.graph = active_graph  # type: ignore[attr-defined]
            new_cls.spec = spec  # type: ignore[attr-defined]
            active_graph.add_feature(new_cls)
        else:
            pass  # TODO: set spec to a property that would raise an exception on access

        return new_cls


class Feature(FrozenBaseModel, metaclass=MetaxyMeta, spec=None):
    spec: ClassVar[FeatureSpec]
    graph: ClassVar[FeatureGraph]

    @classmethod
    def table_name(cls) -> str:
        """Get SQL-like table name for this feature.

        Converts feature key to SQL-compatible table name by joining
        parts with double underscores, consistent with IbisMetadataStore.

        Returns:
            Table name string (e.g., "my_namespace__my_feature")

        Example:
            >>> class VideoFeature(Feature, spec=FeatureSpec(
            ...     key=FeatureKey(["video", "processing"]),
            ...     ...
            ... )):
            ...     pass
            >>> VideoFeature.table_name()
            'video__processing'
        """
        return cls.spec.table_name()

    @classmethod
    def feature_version(cls) -> str:
        """Get hash of feature specification.

        Returns a hash representing the feature's complete configuration:
        - Feature key
        - Field definitions and code versions
        - Dependencies (feature-level and field-level)

        This hash changes when you modify:
        - Field code versions
        - Dependencies
        - Field definitions

        Used to distinguish current vs historical metadata versions.
        Stored in the 'feature_version' column of metadata DataFrames.

        Returns:
            SHA256 hex digest (like git short hashes)

        Example:
            >>> class MyFeature(Feature, spec=FeatureSpec(
            ...     key=FeatureKey(["my", "feature"]),
            ...     fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)],
            ... )):
            ...     pass
            >>> MyFeature.feature_version()
            'a3f8b2c1...'
        """
        return cls.graph.get_feature_version(cls.spec.key)

    @classmethod
    def data_version(cls) -> dict[str, str]:
        """Get the code-level data version for this feature.

        This returns a static hash based on code versions and dependencies,
        not sample-level data versions.

        Returns:
            Dictionary mapping field keys to their data version hashes.
        """
        return cls.graph.get_feature_version_by_field(cls.spec.key)

    @classmethod
    def load_input(
        cls,
        joiner: "UpstreamJoiner",
        upstream_refs: dict[str, "nw.LazyFrame[Any]"],
    ) -> tuple["nw.LazyFrame[Any]", dict[str, str]]:
        """Join upstream feature metadata.

        Override for custom join logic (1:many, different keys, filtering, etc.).

        Args:
            joiner: UpstreamJoiner from MetadataStore
            upstream_refs: Upstream feature metadata references (lazy where possible)

        Returns:
            (joined_upstream, upstream_column_mapping)
            - joined_upstream: All upstream data joined together
            - upstream_column_mapping: Maps upstream_key -> column name
        """
        from metaxy.models.feature_spec import FeatureDep

        # Extract columns and renames from deps
        upstream_columns: dict[str, tuple[str, ...] | None] = {}
        upstream_renames: dict[str, dict[str, str] | None] = {}

        if cls.spec.deps:
            for dep in cls.spec.deps:
                if isinstance(dep, FeatureDep):
                    dep_key_str = dep.key.to_string()
                    upstream_columns[dep_key_str] = dep.columns
                    upstream_renames[dep_key_str] = dep.rename

        return joiner.join_upstream(
            upstream_refs=upstream_refs,
            feature_spec=cls.spec,
            feature_plan=cls.graph.get_feature_plan(cls.spec.key),
            upstream_columns=upstream_columns,
            upstream_renames=upstream_renames,
        )
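
    # Editorial sketch (not part of the packaged file): a subclass could pre-filter one
    # upstream reference before delegating to the default join. The "examples/parent"
    # key string and the "status" column are hypothetical.
    #
    #     @classmethod
    #     def load_input(cls, joiner, upstream_refs):
    #         refs = dict(upstream_refs)
    #         refs["examples/parent"] = refs["examples/parent"].filter(
    #             nw.col("status") == "ok"
    #         )
    #         return super().load_input(joiner, upstream_refs=refs)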

    @classmethod
    def resolve_data_version_diff(
        cls,
        diff_resolver: "MetadataDiffResolver",
        target_versions: "nw.LazyFrame[Any]",
        current_metadata: "nw.LazyFrame[Any] | None",
        *,
        lazy: bool = False,
    ) -> "DiffResult | LazyDiffResult":
        """Resolve differences between target and current data versions.

        Override for custom diff logic (ignore certain fields, custom rules, etc.).

        Args:
            diff_resolver: MetadataDiffResolver from MetadataStore
            target_versions: Calculated target data_versions (Narwhals LazyFrame)
            current_metadata: Current metadata for this feature (Narwhals LazyFrame, or None).
                Should be pre-filtered by feature_version at the store level.
            lazy: If True, return LazyDiffResult. If False, return DiffResult.

        Returns:
            DiffResult (eager) or LazyDiffResult (lazy) with added, changed, removed

        Example (default):
            >>> class MyFeature(Feature, spec=...):
            ...     pass  # Uses diff resolver's default implementation

        Example (ignore certain field changes):
            >>> class MyFeature(Feature, spec=...):
            ...     @classmethod
            ...     def resolve_data_version_diff(cls, diff_resolver, target_versions, current_metadata, **kwargs):
            ...         # Get standard diff
            ...         result = diff_resolver.find_changes(target_versions, current_metadata)
            ...
            ...         # Custom: Only consider 'frames' field changes, ignore 'audio'
            ...         # Users can filter/modify the diff result here
            ...
            ...         return result  # Return modified DiffResult
        """
        # Diff resolver always returns LazyDiffResult - materialize if needed
        lazy_result = diff_resolver.find_changes(
            target_versions=target_versions,
            current_metadata=current_metadata,
        )

        # Materialize to DiffResult if lazy=False
        if not lazy:
            from metaxy.data_versioning.diff import DiffResult

            return DiffResult(
                added=lazy_result.added.collect(),
                changed=lazy_result.changed.collect(),
                removed=lazy_result.removed.collect(),
            )

        return lazy_result