metaxy-0.0.1.dev3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
metaxy/models/feature.py
ADDED
@@ -0,0 +1,1335 @@
import hashlib
from collections.abc import Iterator, Mapping, Sequence
from contextlib import contextmanager
from contextvars import ContextVar
from typing import TYPE_CHECKING, Any, ClassVar, TypedDict

import pydantic
from pydantic import AwareDatetime, Field, model_validator
from pydantic._internal._model_construction import ModelMetaclass
from typing_extensions import Self

from metaxy.models.constants import (
    METAXY_FEATURE_SPEC_VERSION,
    METAXY_FEATURE_VERSION,
    METAXY_FULL_DEFINITION_VERSION,
)
from metaxy.models.feature_spec import (
    FeatureSpec,
)
from metaxy.models.plan import FeaturePlan, FQFieldKey
from metaxy.models.types import (
    CoercibleToFeatureKey,
    FeatureKey,
    ValidatedFeatureKeyAdapter,
    ValidatedFeatureKeySequenceAdapter,
)
from metaxy.utils.hashing import truncate_hash

FEATURE_VERSION_COL = METAXY_FEATURE_VERSION
FEATURE_SPEC_VERSION_COL = METAXY_FEATURE_SPEC_VERSION
FEATURE_TRACKING_VERSION_COL = METAXY_FULL_DEFINITION_VERSION

if TYPE_CHECKING:
    import narwhals as nw

    from metaxy.versioning.types import Increment, LazyIncrement

# TODO: These are no longer used - remove after refactoring
# from metaxy.data_versioning.diff import MetadataDiffResolver
# from metaxy.data_versioning.joiners import UpstreamJoiner

# Context variable for active graph (module-level)
_active_graph: ContextVar["FeatureGraph | None"] = ContextVar(
    "_active_graph", default=None
)


def get_feature_by_key(key: CoercibleToFeatureKey) -> type["BaseFeature"]:
    """Get a feature class by its key from the active graph.

    Convenience function that retrieves a Metaxy feature class from the currently active [feature graph][metaxy.FeatureGraph]. Can be useful when receiving a feature key from storage or across process boundaries.

    Args:
        key: Feature key to look up. Accepts types that can be converted into a feature key.

    Returns:
        Feature class

    Raises:
        KeyError: If no feature with the given key is registered

    Example:
        ```py
        from metaxy import get_feature_by_key, FeatureKey
        parent_key = FeatureKey(["examples", "parent"])
        ParentFeature = get_feature_by_key(parent_key)

        # Or use string notation
        ParentFeature = get_feature_by_key("examples/parent")
        ```
    """
    graph = FeatureGraph.get_active()
    return graph.get_feature_by_key(key)


class SerializedFeature(TypedDict):
    feature_spec: dict[str, Any]
    feature_schema: dict[str, Any]
    metaxy_feature_version: str
    metaxy_feature_spec_version: str
    metaxy_full_definition_version: str
    feature_class_path: str
    project: str


class FeatureGraph:
    def __init__(self):
        self.features_by_key: dict[FeatureKey, type[BaseFeature]] = {}
        self.feature_specs_by_key: dict[FeatureKey, FeatureSpec] = {}
        # Standalone specs registered without Feature classes (for migrations)
        self.standalone_specs_by_key: dict[FeatureKey, FeatureSpec] = {}

    @property
    def all_specs_by_key(self) -> dict[FeatureKey, FeatureSpec]:
        return {**self.feature_specs_by_key, **self.standalone_specs_by_key}

    def add_feature(self, feature: type["BaseFeature"]) -> None:
        """Add a feature to the graph.

        Args:
            feature: Feature class to register

        Raises:
            ValueError: If a feature with the same key is already registered
                or if duplicate column names would result from renaming operations
        """
        if feature.spec().key in self.features_by_key:
            existing = self.features_by_key[feature.spec().key]
            raise ValueError(
                f"Feature with key {feature.spec().key.to_string()} already registered. "
                f"Existing: {existing.__name__}, New: {feature.__name__}. "
                f"Each feature key must be unique within a graph."
            )

        # Validate that there are no duplicate column names across dependencies after renaming
        if feature.spec().deps:
            self._validate_no_duplicate_columns(feature.spec())

        self.features_by_key[feature.spec().key] = feature
        self.feature_specs_by_key[feature.spec().key] = feature.spec()

    def add_feature_spec(self, spec: FeatureSpec) -> None:
        import warnings

        # Check if a Feature class already exists for this key
        if spec.key in self.features_by_key:
            warnings.warn(
                f"Feature class already exists for key {spec.key.to_string()}. "
                f"Standalone spec will be ignored - Feature class takes precedence.",
                stacklevel=2,
            )
            return

        # Check if a standalone spec already exists
        if spec.key in self.standalone_specs_by_key:
            existing = self.standalone_specs_by_key[spec.key]
            # Only raise if it's a different spec (by comparing feature_spec_version)
            if existing.feature_spec_version != spec.feature_spec_version:
                raise ValueError(
                    f"Standalone spec for key {spec.key.to_string()} already exists "
                    f"with a different version."
                )

        # Validate that there are no duplicate columns across dependencies after renaming
        if spec.deps:
            self._validate_no_duplicate_columns(spec)

        # Store standalone spec
        self.standalone_specs_by_key[spec.key] = spec
        # Also add to feature_specs_by_key for methods that only need the spec
        self.feature_specs_by_key[spec.key] = spec

    def _validate_no_duplicate_columns(self, spec: "FeatureSpec") -> None:
        """Validate that there are no duplicate column names across dependencies after renaming.

        This method checks that after all column selection and renaming operations,
        no two columns have the same name (except for ID columns which are expected to be the same).
        Also validates that columns are not renamed to system column names.

        Args:
            spec: Feature specification to validate

        Raises:
            ValueError: If duplicate column names would result from the dependency configuration
                or if columns are renamed to system column names
        """
        from metaxy.models.constants import ALL_SYSTEM_COLUMNS
        from metaxy.models.feature_spec import FeatureDep

        if not spec.deps:
            return

        # First, validate each dependency individually
        for dep in spec.deps:
            if not isinstance(dep, FeatureDep):
                continue

            if dep.rename:
                # Get the upstream feature's spec to check its ID columns
                upstream_spec = self.feature_specs_by_key.get(dep.feature)
                upstream_id_columns = upstream_spec.id_columns if upstream_spec else []

                # Check for renaming to system columns or upstream's ID columns
                for old_name, new_name in dep.rename.items():
                    if new_name in ALL_SYSTEM_COLUMNS:
                        raise ValueError(
                            f"Cannot rename column '{old_name}' to system column name '{new_name}' "
                            f"in dependency '{dep.feature.to_string()}'. "
                            f"System columns: {sorted(ALL_SYSTEM_COLUMNS)}"
                        )

                    # Check against upstream feature's ID columns
                    if new_name in upstream_id_columns:
                        raise ValueError(
                            f"Cannot rename column '{old_name}' to ID column '{new_name}' "
                            f"from upstream feature '{dep.feature.to_string()}'. "
                            f"ID columns for '{dep.feature.to_string()}': {upstream_id_columns}"
                        )

                # Check for duplicate column names within this dependency
                renamed_values = list(dep.rename.values())
                if len(renamed_values) != len(set(renamed_values)):
                    # Find the duplicate(s)
                    seen = set()
                    duplicates = set()
                    for name in renamed_values:
                        if name in seen:
                            duplicates.add(name)
                        seen.add(name)
                    raise ValueError(
                        f"Duplicate column names after renaming in dependency '{dep.feature.to_string()}': "
                        f"{sorted(duplicates)}. Cannot rename multiple columns to the same name within a single dependency."
                    )

        # Track all column names and their sources
        column_sources: dict[str, list[str]] = {}  # column_name -> [source_features]
        id_columns_set = set(spec.id_columns)

        for dep in spec.deps:
            if not isinstance(dep, FeatureDep):
                continue

            dep_key_str = dep.feature.to_string()

            # Get the upstream feature spec if available
            upstream_spec = self.feature_specs_by_key.get(dep.feature)
            if not upstream_spec:
                # If upstream feature isn't registered yet, skip validation
                # This can happen during circular imports or when features are defined in different modules
                continue

            # Determine which columns will be present from this dependency
            if dep.columns is None:
                # All columns from upstream (except droppable system columns)
                # We don't know exactly which columns without the actual data,
                # but we can check the renamed columns at least
                if dep.rename:
                    for old_name, new_name in dep.rename.items():
                        if (
                            new_name not in id_columns_set
                        ):  # ID columns are expected to be the same
                            if new_name not in column_sources:
                                column_sources[new_name] = []
                            column_sources[new_name].append(
                                f"{dep_key_str} (renamed from '{old_name}')"
                            )
                # For non-renamed columns, we can't validate without knowing the actual columns
                # This validation will happen at runtime in the joiner
            elif dep.columns == ():
                # Only system columns - no user columns to track
                pass
            else:
                # Specific columns selected
                for col in dep.columns:
                    # Check if this column is renamed
                    if dep.rename and col in dep.rename:
                        new_name = dep.rename[col]
                        if new_name not in id_columns_set:
                            if new_name not in column_sources:
                                column_sources[new_name] = []
                            column_sources[new_name].append(
                                f"{dep_key_str} (renamed from '{col}')"
                            )
                    else:
                        # Column keeps its original name
                        if col not in id_columns_set:
                            if col not in column_sources:
                                column_sources[col] = []
                            column_sources[col].append(dep_key_str)

        # Check for duplicates
        duplicates = {
            col: sources for col, sources in column_sources.items() if len(sources) > 1
        }

        if duplicates:
            # Format error message
            error_lines = []
            for col, sources in sorted(duplicates.items()):
                error_lines.append(
                    f"  - Column '{col}' appears in: {', '.join(sources)}"
                )

            raise ValueError(
                f"Feature '{spec.key.to_string()}' would have duplicate column names after renaming:\n"
                + "\n".join(error_lines)
                + "\n\nUse the 'rename' parameter in FeatureDep to resolve conflicts, "
                "or use 'columns' to select only the columns you need."
            )

    def remove_feature(self, key: CoercibleToFeatureKey) -> None:
        """Remove a feature from the graph.

        Removes Feature class or standalone spec (whichever exists).

        Args:
            key: Feature key to remove. Accepts types that can be converted into a feature key.

        Raises:
            KeyError: If no feature with the given key is registered
        """
        # Validate and coerce the key
        validated_key = ValidatedFeatureKeyAdapter.validate_python(key)

        # Check both Feature classes and standalone specs
        combined = {**self.feature_specs_by_key, **self.standalone_specs_by_key}

        if validated_key not in combined:
            raise KeyError(
                f"No feature with key {validated_key.to_string()} found in graph. "
                f"Available keys: {[k.to_string() for k in combined]}"
            )

        # Remove from all relevant dicts
        if validated_key in self.features_by_key:
            del self.features_by_key[validated_key]
        if validated_key in self.standalone_specs_by_key:
            del self.standalone_specs_by_key[validated_key]
        if validated_key in self.feature_specs_by_key:
            del self.feature_specs_by_key[validated_key]

    def get_feature_by_key(self, key: CoercibleToFeatureKey) -> type["BaseFeature"]:
        """Get a feature class by its key.

        Args:
            key: Feature key to look up. Accepts types that can be converted into a feature key.

        Returns:
            Feature class

        Raises:
            KeyError: If no feature with the given key is registered

        Example:
            ```py
            graph = FeatureGraph.get_active()
            parent_key = FeatureKey(["examples", "parent"])
            ParentFeature = graph.get_feature_by_key(parent_key)

            # Or use string notation
            ParentFeature = graph.get_feature_by_key("examples/parent")
            ```
        """
        # Validate and coerce the key
        validated_key = ValidatedFeatureKeyAdapter.validate_python(key)

        if validated_key not in self.features_by_key:
            raise KeyError(
                f"No feature with key {validated_key.to_string()} found in graph. "
                f"Available keys: {[k.to_string() for k in self.features_by_key.keys()]}"
            )
        return self.features_by_key[validated_key]

    def list_features(
        self,
        projects: list[str] | str | None = None,
        *,
        only_current_project: bool = True,
    ) -> list[FeatureKey]:
        """List all feature keys in the graph, optionally filtered by project(s).

        By default, filters features by the current project (first part of feature key).
        This prevents operations from affecting features in other projects.

        Args:
            projects: Project name(s) to filter by. Can be:
                - None: Use current project from MetaxyConfig (if only_current_project=True)
                - str: Single project name
                - list[str]: Multiple project names
            only_current_project: If True, filter by current/specified project(s).
                If False, return all features regardless of project.

        Returns:
            List of feature keys

        Example:
            ```py
            # Get all features for current project
            graph = FeatureGraph.get_active()
            features = graph.list_features()

            # Get features for specific project
            features = graph.list_features(projects="myproject")

            # Get features for multiple projects
            features = graph.list_features(projects=["project1", "project2"])

            # Get all features regardless of project
            all_features = graph.list_features(only_current_project=False)
            ```
        """
        if not only_current_project:
            # Return all features
            return list(self.features_by_key.keys())

        # Normalize projects to list
        project_list: list[str]
        if projects is None:
            # Try to get from config context
            try:
                from metaxy.config import MetaxyConfig

                config = MetaxyConfig.get()
                project_list = [config.project]
            except RuntimeError:
                # Config not initialized - in tests or non-CLI usage
                # Return all features (can't determine project)
                return list(self.features_by_key.keys())
        elif isinstance(projects, str):
            project_list = [projects]
        else:
            project_list = projects

        # Filter by project(s) using Feature.project attribute
        return [
            key
            for key in self.features_by_key.keys()
            if self.features_by_key[key].project in project_list
        ]

    def get_feature_plan(self, key: CoercibleToFeatureKey) -> FeaturePlan:
        """Get a feature plan for a given feature key.

        Args:
            key: Feature key to get plan for. Accepts types that can be converted into a feature key.

        Returns:
            FeaturePlan instance with feature spec and dependencies.
        """
        # Validate and coerce the key
        validated_key = ValidatedFeatureKeyAdapter.validate_python(key)

        spec = self.all_specs_by_key[validated_key]

        return FeaturePlan(
            feature=spec,
            deps=[self.feature_specs_by_key[dep.feature] for dep in spec.deps or []]
            or None,
            feature_deps=spec.deps,  # Pass the actual FeatureDep objects with field mappings
        )

    def get_field_version(self, key: "FQFieldKey") -> str:
        hasher = hashlib.sha256()

        plan = self.get_feature_plan(key.feature)
        field = plan.feature.fields_by_key[key.field]

        hasher.update(key.to_string().encode())
        hasher.update(str(field.code_version).encode())

        for k, v in sorted(plan.get_parent_fields_for_field(key.field).items()):
            hasher.update(self.get_field_version(k).encode())

        return truncate_hash(hasher.hexdigest())
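    # Illustrative aside (editor's sketch, not part of the packaged source):
    # for a feature B with a field "f" that depends on field "g" of feature A,
    # the recursion in get_field_version above computes roughly
    #
    #   version(B.f) = truncate(sha256("B/f" + code_version(B.f) + version(A.g)))
    #
    # so bumping the code_version of A.g yields a new version for B.f and for
    # every field downstream of it.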

    def get_feature_version_by_field(
        self, key: CoercibleToFeatureKey
    ) -> dict[str, str]:
        """Computes the field provenance map for a feature.

        Each field's hash combines its code version with the versions of its parent fields.

        Args:
            key: Feature key to get field versions for. Accepts types that can be converted into a feature key.

        Returns:
            dict[str, str]: The provenance hash for each field in the feature plan.
                Keys are field names as strings.
        """
        # Validate and coerce the key
        validated_key = ValidatedFeatureKeyAdapter.validate_python(key)

        res = {}

        plan = self.get_feature_plan(validated_key)

        for k, v in plan.feature.fields_by_key.items():
            res[k.to_string()] = self.get_field_version(
                FQFieldKey(field=k, feature=validated_key)
            )

        return res

    def get_feature_version(self, key: CoercibleToFeatureKey) -> str:
        """Computes the feature version as a single string.

        Args:
            key: Feature key to get version for. Accepts types that can be converted into a feature key.

        Returns:
            Truncated SHA256 hash representing the feature version.
        """
        # Validate and coerce the key
        validated_key = ValidatedFeatureKeyAdapter.validate_python(key)

        hasher = hashlib.sha256()
        provenance_by_field = self.get_feature_version_by_field(validated_key)
        for field_key in sorted(provenance_by_field):
            hasher.update(field_key.encode())
            hasher.update(provenance_by_field[field_key].encode())

        return truncate_hash(hasher.hexdigest())

    def get_downstream_features(
        self, sources: Sequence[CoercibleToFeatureKey]
    ) -> list[FeatureKey]:
        """Get all features downstream of sources, topologically sorted.

        Performs a depth-first traversal of the dependency graph to find all
        features that transitively depend on any of the source features.

        Args:
            sources: List of source feature keys. Each element can be a string, sequence, FeatureKey, or BaseFeature class.

        Returns:
            List of downstream feature keys in topological order (dependencies first).
            Does not include the source features themselves.

        Example:
            ```py
            # DAG: A -> B -> D
            #      A -> C -> D
            graph.get_downstream_features([FeatureKey(["A"])])
            # [FeatureKey(["B"]), FeatureKey(["C"]), FeatureKey(["D"])]

            # Or use string notation
            graph.get_downstream_features(["A"])
            ```
        """
        # Validate and coerce the source keys
        validated_sources = ValidatedFeatureKeySequenceAdapter.validate_python(sources)

        source_set = set(validated_sources)
        visited = set()
        post_order = []  # Reverse topological order

        def visit(key: FeatureKey):
            """DFS traversal."""
            if key in visited:
                return
            visited.add(key)

            # Find all features that depend on this one
            for feature_key, feature_spec in self.feature_specs_by_key.items():
                if feature_spec.deps:
                    for dep in feature_spec.deps:
                        if dep.feature == key:
                            # This feature depends on 'key', so visit it
                            visit(feature_key)

            post_order.append(key)

        # Visit all sources
        for source in validated_sources:
            visit(source)

        # Remove sources from result, reverse to get topological order
        result = [k for k in reversed(post_order) if k not in source_set]
        return result

    def topological_sort_features(
        self,
        feature_keys: Sequence[CoercibleToFeatureKey] | None = None,
        *,
        descending: bool = False,
    ) -> list[FeatureKey]:
        """Sort feature keys in topological order.

        Uses stable alphabetical ordering when multiple nodes are at the same level.
        This ensures deterministic output for diff comparisons and migrations.

        Implemented using depth-first search with post-order traversal.

        Args:
            feature_keys: List of feature keys to sort. Each element can be a string, sequence,
                FeatureKey, or BaseFeature class. If None, sorts all features
                (both Feature classes and standalone specs) in the graph.
            descending: If False (default), dependencies appear before dependents.
                For a chain A -> B -> C, returns [A, B, C].
                If True, dependents appear before dependencies.
                For a chain A -> B -> C, returns [C, B, A].

        Returns:
            List of feature keys sorted in topological order

        Example:
            ```py
            graph = FeatureGraph.get_active()
            # Sort specific features (dependencies first)
            sorted_keys = graph.topological_sort_features([
                FeatureKey(["video", "raw"]),
                FeatureKey(["video", "scene"]),
            ])

            # Or use string notation
            sorted_keys = graph.topological_sort_features(["video/raw", "video/scene"])

            # Sort all features in the graph (including standalone specs)
            all_sorted = graph.topological_sort_features()

            # Sort with dependents first (useful for processing leaf nodes before roots)
            reverse_sorted = graph.topological_sort_features(descending=True)
            ```
        """
        # Determine which features to sort
        if feature_keys is None:
            # Include both Feature classes and standalone specs
            keys_to_sort = set(self.feature_specs_by_key.keys())
        else:
            # Validate and coerce the feature keys
            validated_keys = ValidatedFeatureKeySequenceAdapter.validate_python(
                feature_keys
            )
            keys_to_sort = set(validated_keys)

        visited = set()
        result = []  # Topological order (dependencies first)

        def visit(key: FeatureKey):
            """DFS visit with post-order traversal."""
            if key in visited or key not in keys_to_sort:
                return
            visited.add(key)

            # Get dependencies from feature spec
            spec = self.feature_specs_by_key.get(key)
            if spec and spec.deps:
                # Sort dependencies alphabetically for deterministic ordering
                sorted_deps = sorted(
                    (dep.feature for dep in spec.deps),
                    key=lambda k: k.to_string().lower(),
                )
                for dep_key in sorted_deps:
                    if dep_key in keys_to_sort:
                        visit(dep_key)

            # Add to result after visiting dependencies (post-order)
            result.append(key)

        # Visit all keys in sorted order for deterministic traversal
        for key in sorted(keys_to_sort, key=lambda k: k.to_string().lower()):
            visit(key)

        # Post-order DFS gives topological order (dependencies before dependents)
        if descending:
            return list(reversed(result))
        return result

    @property
    def snapshot_version(self) -> str:
        """Generate a snapshot version representing the current topology + versions of the feature graph."""
        if len(self.feature_specs_by_key) == 0:
            return "empty"

        hasher = hashlib.sha256()
        for feature_key in sorted(self.feature_specs_by_key.keys()):
            hasher.update(feature_key.to_string().encode("utf-8"))
            hasher.update(self.get_feature_version(feature_key).encode("utf-8"))
        return truncate_hash(hasher.hexdigest())

    def to_snapshot(self) -> dict[str, SerializedFeature]:
        """Serialize graph to snapshot format.

        Returns a dict mapping feature_key (string) to feature data dict,
        including the import path of the Feature class for reconstruction.

        Returns:
            Dictionary mapping feature_key (string) to feature data dict

        Example:
            ```py
            snapshot = graph.to_snapshot()
            snapshot["video_processing"]["metaxy_feature_version"]
            # 'abc12345'
            snapshot["video_processing"]["metaxy_feature_spec_version"]
            # 'def67890'
            snapshot["video_processing"]["metaxy_full_definition_version"]
            # 'xyz98765'
            snapshot["video_processing"]["feature_class_path"]
            # 'myapp.features.video.VideoProcessing'
            snapshot["video_processing"]["project"]
            # 'myapp'
            ```
        """
        snapshot: dict[str, SerializedFeature] = {}

        for feature_key, feature_cls in self.features_by_key.items():
            feature_key_str = feature_key.to_string()
            feature_spec_dict = feature_cls.spec().model_dump(mode="json")  # type: ignore[attr-defined]
            feature_schema_dict = feature_cls.model_json_schema()  # type: ignore[attr-defined]
            feature_version = feature_cls.feature_version()  # type: ignore[attr-defined]
            feature_spec_version = feature_cls.spec().feature_spec_version  # type: ignore[attr-defined]
            full_definition_version = feature_cls.full_definition_version()  # type: ignore[attr-defined]
            project = feature_cls.project  # type: ignore[attr-defined]

            # Get class import path (module.ClassName)
            class_path = f"{feature_cls.__module__}.{feature_cls.__name__}"

            snapshot[feature_key_str] = {  # pyright: ignore
                "feature_spec": feature_spec_dict,
                "feature_schema": feature_schema_dict,
                FEATURE_VERSION_COL: feature_version,
                FEATURE_SPEC_VERSION_COL: feature_spec_version,
                FEATURE_TRACKING_VERSION_COL: full_definition_version,
                "feature_class_path": class_path,
                "project": project,
            }

        return snapshot

    @classmethod
    def from_snapshot(
        cls,
        snapshot_data: Mapping[str, Mapping[str, Any]],
        *,
        class_path_overrides: dict[str, str] | None = None,
        force_reload: bool = False,
    ) -> "FeatureGraph":
        """Reconstruct graph from snapshot by importing Feature classes.

        Strictly requires Feature classes to exist at their recorded import paths.
        This ensures custom methods (like load_input) are available.

        If a feature has been moved/renamed, use class_path_overrides to specify
        the new location.

        Args:
            snapshot_data: Dict of feature_key -> dict containing
                feature_spec (dict), feature_class_path (str), and other fields
                as returned by to_snapshot() or loaded from DB
            class_path_overrides: Optional dict mapping feature_key to new class path
                for features that have been moved/renamed
            force_reload: If True, reload modules from disk to get current code state.

        Returns:
            New FeatureGraph with historical features

        Raises:
            ImportError: If feature class cannot be imported at recorded path

        Example:
            ```py
            # Load snapshot from metadata store
            historical_graph = FeatureGraph.from_snapshot(snapshot_data)

            # With override for moved feature
            historical_graph = FeatureGraph.from_snapshot(
                snapshot_data,
                class_path_overrides={
                    "video_processing": "myapp.features_v2.VideoProcessing"
                }
            )
            ```
        """
        import importlib
        import sys

        graph = cls()
        class_path_overrides = class_path_overrides or {}

        # If force_reload, collect all module paths first to remove ALL features
        # from those modules before reloading (modules can have multiple features)
        modules_to_reload = set()
        if force_reload:
            for feature_key_str, feature_data in snapshot_data.items():
                class_path = class_path_overrides.get(
                    feature_key_str
                ) or feature_data.get("feature_class_path")
                if class_path:
                    module_path, _ = class_path.rsplit(".", 1)
                    if module_path in sys.modules:
                        modules_to_reload.add(module_path)

        # Use context manager to temporarily set the new graph as active
        # This ensures imported Feature classes register to the new graph, not the current one
        with graph.use():
            for feature_key_str, feature_data in snapshot_data.items():
                # Parse FeatureSpec for validation
                feature_spec_dict = feature_data["feature_spec"]
                FeatureSpec.model_validate(feature_spec_dict)

                # Get class path (check overrides first)
                if feature_key_str in class_path_overrides:
                    class_path = class_path_overrides[feature_key_str]
                else:
                    class_path = feature_data.get("feature_class_path")
                    if not class_path:
                        raise ValueError(
                            f"Feature '{feature_key_str}' has no feature_class_path in snapshot. "
                            f"Cannot reconstruct historical graph."
                        )

                # Import the class
                try:
                    module_path, class_name = class_path.rsplit(".", 1)

                    # Force reload module from disk if requested
                    # This is critical for migration detection - when code changes,
                    # we need fresh imports to detect the changes
                    if force_reload and module_path in modules_to_reload:
                        # Before the first reload of this module, remove ALL features
                        # from this module found in the snapshot
                        # (a module can define multiple features)
                        for fk_str, fd in snapshot_data.items():
                            fcp = class_path_overrides.get(fk_str) or fd.get(
                                "feature_class_path"
                            )
                            if fcp and fcp.rsplit(".", 1)[0] == module_path:
                                fspec_dict = fd["feature_spec"]
                                fspec = FeatureSpec.model_validate(fspec_dict)
                                if fspec.key in graph.features_by_key:
                                    graph.remove_feature(fspec.key)

                        # Mark module as processed so we don't remove features again
                        modules_to_reload.discard(module_path)

                        module = importlib.reload(sys.modules[module_path])
                    else:
                        module = __import__(module_path, fromlist=[class_name])

                    feature_cls = getattr(module, class_name)
                except (ImportError, AttributeError):
                    # Feature class not importable - add as standalone spec instead
                    # This allows migrations to work even when old Feature classes are deleted/moved
                    import logging

                    logger = logging.getLogger(__name__)
                    logger.exception(
                        f"Cannot import Feature class '{class_path}' for '{feature_key_str}'. "
                        f"Adding only the FeatureSpec."
                    )

                    feature_spec = FeatureSpec.model_validate(feature_spec_dict)
                    # Add the spec as a standalone spec
                    graph.add_feature_spec(feature_spec)
                    continue

                # Validate the imported class matches the stored spec
                if not hasattr(feature_cls, "spec"):
                    raise TypeError(
                        f"Imported class '{class_path}' is not a valid Feature class "
                        f"(missing 'spec' attribute)"
                    )

                # Register the imported feature to this graph if not already present
                # If the module was imported for the first time, the metaclass already registered it
                # If the module was previously imported, we need to manually register it
                if feature_cls.spec().key not in graph.features_by_key:
                    graph.add_feature(feature_cls)

        return graph

    @classmethod
    def get_active(cls) -> "FeatureGraph":
        """Get the currently active graph.

        Returns the graph from the context variable if set, otherwise returns
        the default global graph.

        Returns:
            Active FeatureGraph instance

        Example:
            ```py
            # Normal usage - returns default graph
            reg = FeatureGraph.get_active()

            # With custom graph in context
            with my_graph.use():
                reg = FeatureGraph.get_active()  # Returns my_graph
            ```
        """
        return _active_graph.get() or graph

    @classmethod
    def set_active(cls, reg: "FeatureGraph") -> None:
        """Set the active graph for the current context.

        This sets the context variable that will be returned by get_active().
        Typically used in application setup code or test fixtures.

        Args:
            reg: FeatureGraph to activate

        Example:
            ```py
            # In application setup
            my_graph = FeatureGraph()
            FeatureGraph.set_active(my_graph)

            # Now all operations use my_graph
            FeatureGraph.get_active()  # Returns my_graph
            ```
        """
        _active_graph.set(reg)

    @contextmanager
    def use(self) -> Iterator[Self]:
        """Context manager to temporarily use this graph as active.

        This is the recommended way to use custom registries, especially in tests.
        The graph is automatically restored when the context exits.

        Yields:
            FeatureGraph: This graph instance

        Example:
            ```py
            test_graph = FeatureGraph()

            with test_graph.use():
                # All operations use test_graph
                class TestFeature(Feature, spec=...):
                    pass

            # Outside context, back to previous graph
            ```
        """
        token = _active_graph.set(self)
        try:
            yield self
        finally:
            _active_graph.reset(token)


def current_graph() -> FeatureGraph:
    """Get the currently active graph.

    Returns:
        FeatureGraph: The currently active graph.
    """
    return FeatureGraph.get_active()


# Default global graph
graph = FeatureGraph()


class MetaxyMeta(ModelMetaclass):
    def __new__(
        cls,
        cls_name: str,
        bases: tuple[type[Any], ...],
        namespace: dict[str, Any],
        *,
        spec: FeatureSpec | None = None,
        **kwargs,
    ) -> type[Self]:  # pyright: ignore[reportGeneralTypeIssues]
        # Inject frozen config if not already specified in namespace
        if "model_config" not in namespace:
            from pydantic import ConfigDict

            namespace["model_config"] = ConfigDict(frozen=True)

        new_cls = super().__new__(cls, cls_name, bases, namespace, **kwargs)

        if spec:
            # Get graph from context at class definition time
            active_graph = FeatureGraph.get_active()
            new_cls.graph = active_graph  # type: ignore[attr-defined]
            new_cls._spec = spec  # type: ignore[attr-defined]

            # Determine project for this feature using intelligent detection
            project = cls._detect_project(new_cls)
            new_cls.project = project  # type: ignore[attr-defined]

            active_graph.add_feature(new_cls)
        else:
            pass  # TODO: set spec to a property that would raise an exception on access

        return new_cls

    @staticmethod
    def _detect_project(feature_cls: type) -> str:
        """Detect project for a feature class.

        Detection order:
        1. Try to auto-load MetaxyConfig from metaxy.toml/pyproject.toml
           starting from the feature's file location
        2. Use config.project if available
        3. Check metaxy.projects entry points as fallback
        4. Fall back to "default" with a warning

        Args:
            feature_cls: The Feature class being registered

        Returns:
            Project name string
        """
        import inspect
        import warnings
        from pathlib import Path

        from metaxy._packaging import detect_project_from_entrypoints
        from metaxy.config import MetaxyConfig

        module_name = feature_cls.__module__

        # Strategy 1: Try to load config if not already set
        if not MetaxyConfig.is_set():
            # Get the file where the feature class is defined
            feature_file = inspect.getfile(feature_cls)
            feature_dir = Path(feature_file).parent

            # Attempt to auto-load config from metaxy.toml or pyproject.toml
            # starting from the feature's directory
            config = MetaxyConfig.load(
                search_parents=True, auto_discovery_start=feature_dir
            )
            return config.project
        else:
            # Config already set, use it
            config = MetaxyConfig.get()
            return config.project

        # Strategy 2: Check metaxy.projects entry points as fallback
        project = detect_project_from_entrypoints(module_name)
        if project is not None:
            return project

        # Strategy 3: Fall back to "default" with a warning
        warnings.warn(
            f"Could not detect project for feature '{feature_cls.__name__}' "
            f"from module '{module_name}'. No metaxy.toml found and no entry point configured. "
            f"Using 'default' as project name. This may cause issues with metadata isolation. "
            f"Please ensure features are imported after init_metaxy() or configure a metaxy.toml file.",
            stacklevel=3,
        )
        return "default"
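# Illustrative aside (editor's sketch, not part of the packaged source): the
# config file that strategy 1 of MetaxyMeta._detect_project auto-loads is a
# metaxy.toml (or pyproject.toml) found near the feature's source file.
# Assuming the schema only needs a project name (the exact keys are not shown
# in this diff), a minimal file might be:
#
#   # metaxy.toml
#   project = "myapp"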

class _FeatureSpecDescriptor:
    """Descriptor that returns the feature spec of the feature."""

    def __get__(self, instance, owner) -> FeatureSpec:
        if owner.spec is None:
            raise ValueError(f"Feature '{owner.__name__}' has no spec defined.")
        return owner.spec


class BaseFeature(pydantic.BaseModel, metaclass=MetaxyMeta, spec=None):
    _spec: ClassVar[FeatureSpec]

    graph: ClassVar[FeatureGraph]
    project: ClassVar[str]

    # System columns - automatically managed by Metaxy
    # Most of them are optional since Metaxy injects them into dataframes later in the pipeline
    metaxy_provenance_by_field: dict[str, str] = Field(
        default_factory=dict,
        description="Field-level provenance hashes (maps field names to hashes)",
    )
    metaxy_provenance: str | None = Field(
        default=None,
        description="Hash of metaxy_provenance_by_field",
    )
    metaxy_feature_version: str | None = Field(
        default=None,
        description="Hash of the feature definition (dependencies + fields + code_versions)",
    )
    metaxy_snapshot_version: str | None = Field(
        default=None,
        description="Hash of the entire feature graph snapshot",
    )
    metaxy_data_version_by_field: dict[str, str] | None = Field(
        default=None,
        description="Field-level data version hashes (maps field names to version hashes)",
    )
    metaxy_data_version: str | None = Field(
        default=None,
        description="Hash of metaxy_data_version_by_field",
    )
    metaxy_created_at: AwareDatetime | None = Field(
        default=None,
        description="Timestamp when the metadata row was created (UTC)",
    )
    metaxy_materialization_id: str | None = Field(
        default=None,
        description="External orchestration run ID (e.g., Dagster Run ID)",
    )

    @model_validator(mode="after")
    def _validate_id_columns_exist(self) -> Self:
        """Validate that all id_columns from spec are present in model fields."""
        spec = self.__class__.spec()
        model_fields = set(self.__class__.model_fields.keys())

        missing_columns = set(spec.id_columns) - model_fields
        if missing_columns:
            raise ValueError(
                f"ID columns {missing_columns} specified in spec are not present in model fields. "
                f"Available fields: {model_fields}"
            )
        return self

    @classmethod
    def spec(cls) -> FeatureSpec:  # type: ignore[override]
        return cls._spec

    @classmethod
    def table_name(cls) -> str:
        """Get SQL-like table name for this feature.

        Converts feature key to SQL-compatible table name by joining
        parts with double underscores, consistent with IbisMetadataStore.

        Returns:
            Table name string (e.g., "my_namespace__my_feature")

        Example:
            ```py
            class VideoFeature(Feature, spec=FeatureSpec(
                key=FeatureKey(["video", "processing"]),
                ...
            )):
                pass
            VideoFeature.table_name()
            # 'video__processing'
            ```
        """
        return cls.spec().table_name()

    @classmethod
    def feature_version(cls) -> str:
        """Get hash of feature specification.

        Returns a hash representing the feature's complete configuration:
        - Feature key
        - Field definitions and code versions
        - Dependencies (feature-level and field-level)

        This hash changes when you modify:
        - Field code versions
        - Dependencies
        - Field definitions

        Used to distinguish current vs historical field-level provenance hashes.
        Stored in the 'metaxy_feature_version' column of metadata DataFrames.

        Returns:
            Truncated SHA256 hex digest (like git short hashes)

        Example:
            ```py
            class MyFeature(Feature, spec=FeatureSpec(
                key=FeatureKey(["my", "feature"]),
                fields=[FieldSpec(key=FieldKey(["default"]), code_version="1")],
            )):
                pass
            MyFeature.feature_version()
            # 'a3f8b2c1...'
            ```
        """
        return cls.graph.get_feature_version(cls.spec().key)

    @classmethod
    def feature_spec_version(cls) -> str:
        """Get hash of the complete feature specification.

        Returns a hash representing ALL specification properties including:
        - Feature key
        - Dependencies
        - Fields
        - Code versions
        - Any future metadata, tags, or other properties

        Unlike feature_version, which only hashes computational properties
        (for migration triggering), feature_spec_version captures the entire specification
        for complete reproducibility and audit purposes.

        Stored in the 'metaxy_feature_spec_version' column of metadata DataFrames.

        Returns:
            SHA256 hex digest of the complete specification

        Example:
            ```py
            class MyFeature(Feature, spec=FeatureSpec(
                key=FeatureKey(["my", "feature"]),
                fields=[FieldSpec(key=FieldKey(["default"]), code_version="1")],
            )):
                pass
            MyFeature.feature_spec_version()
            # 'def456...'  # Different from feature_version
            ```
        """
        return cls.spec().feature_spec_version

    @classmethod
    def full_definition_version(cls) -> str:
        """Get hash of the complete feature definition including Pydantic schema.

        This method computes a hash of the entire feature class definition, including:
        - Pydantic model schema
        - Feature specification (via feature_spec_version)
        - Project name

        Used in the `metaxy_full_definition_version` column of system tables.

        Returns:
            SHA256 hex digest of the complete definition
        """
        import json

        hasher = hashlib.sha256()

        # Hash the Pydantic schema (includes field types, descriptions, validators, etc.)
        schema = cls.model_json_schema()
        schema_json = json.dumps(schema, sort_keys=True)
        hasher.update(schema_json.encode())

        # Hash the feature specification
        hasher.update(cls.feature_spec_version().encode())

        # Hash the project name
        hasher.update(cls.project.encode())

        return truncate_hash(hasher.hexdigest())

    @classmethod
    def provenance_by_field(cls) -> dict[str, str]:
        """Get the code-level field provenance for this feature.

        This returns a static hash based on code versions and dependencies,
        not sample-level field provenance computed from upstream data.

        Returns:
            Dictionary mapping field keys to their provenance hashes.
        """
        return cls.graph.get_feature_version_by_field(cls.spec().key)

    @classmethod
    def load_input(
        cls,
        joiner: Any,
        upstream_refs: dict[str, "nw.LazyFrame[Any]"],
    ) -> tuple["nw.LazyFrame[Any]", dict[str, str]]:
        """Join upstream feature metadata.

        Override for custom join logic (1:many, different keys, filtering, etc.).

        Args:
            joiner: UpstreamJoiner from MetadataStore
            upstream_refs: Upstream feature metadata references (lazy where possible)

        Returns:
            (joined_upstream, upstream_column_mapping)
            - joined_upstream: All upstream data joined together
            - upstream_column_mapping: Maps upstream_key -> column name
        """
        from metaxy.models.feature_spec import FeatureDep

        # Extract columns and renames from deps
        upstream_columns: dict[str, tuple[str, ...] | None] = {}
        upstream_renames: dict[str, dict[str, str] | None] = {}

        deps = cls.spec().deps
        if deps:
            for dep in deps:
                if isinstance(dep, FeatureDep):
                    dep_key_str = dep.feature.to_string()
                    upstream_columns[dep_key_str] = dep.columns
                    upstream_renames[dep_key_str] = dep.rename

        return joiner.join_upstream(
            upstream_refs=upstream_refs,
            feature_spec=cls.spec(),
            feature_plan=cls.graph.get_feature_plan(cls.spec().key),
            upstream_columns=upstream_columns,
            upstream_renames=upstream_renames,
        )

    @classmethod
    def resolve_data_version_diff(
        cls,
        diff_resolver: Any,
        target_provenance: "nw.LazyFrame[Any]",
        current_metadata: "nw.LazyFrame[Any] | None",
        *,
        lazy: bool = False,
    ) -> "Increment | LazyIncrement":
        """Resolve differences between target and current field provenance.

        Override for custom diff logic (ignore certain fields, custom rules, etc.).

        Args:
            diff_resolver: MetadataDiffResolver from MetadataStore
            target_provenance: Calculated target field provenance (Narwhals LazyFrame)
            current_metadata: Current metadata for this feature (Narwhals LazyFrame, or None).
                Should be pre-filtered by feature_version at the store level.
            lazy: If True, return LazyIncrement. If False, return Increment.

        Returns:
            Increment (eager) or LazyIncrement (lazy) with added, changed, removed

        Example (default):
            ```py
            class MyFeature(Feature, spec=...):
                pass  # Uses diff resolver's default implementation
            ```

        Example (ignore certain field changes):
            ```py
            class MyFeature(Feature, spec=...):
                @classmethod
                def resolve_data_version_diff(cls, diff_resolver, target_provenance, current_metadata, **kwargs):
                    # Get standard diff
                    result = diff_resolver.find_changes(target_provenance, current_metadata, cls.spec().id_columns)

                    # Custom: Only consider 'frames' field changes, ignore 'audio'
                    # Users can filter/modify the increment here

                    return result  # Return modified Increment
            ```
        """
        # Diff resolver always returns LazyIncrement - materialize if needed
        lazy_result = diff_resolver.find_changes(
            target_provenance=target_provenance,
            current_metadata=current_metadata,
            id_columns=cls.spec().id_columns,  # Pass ID columns from feature spec
        )

        # Materialize to Increment if lazy=False
        if not lazy:
            from metaxy.versioning.types import Increment

            return Increment(
                added=lazy_result.added.collect(),
                changed=lazy_result.changed.collect(),
                removed=lazy_result.removed.collect(),
            )

        return lazy_result
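The docstring examples in this file refer to a `Feature` base class and to `FeatureSpec`/`FieldSpec`/`FieldKey` constructors defined elsewhere in the package. As a rough end-to-end illustration of how the pieces above fit together — a sketch only, assuming these names are re-exported from the top-level `metaxy` package as the docstrings suggest, and that a spec this minimal is valid:

```py
from metaxy import (
    Feature,
    FeatureKey,
    FeatureSpec,
    FieldKey,
    FieldSpec,
    get_feature_by_key,
)
from metaxy.models.feature import FeatureGraph

test_graph = FeatureGraph()

with test_graph.use():
    # MetaxyMeta registers the class with the active graph at definition time.
    class Parent(Feature, spec=FeatureSpec(
        key=FeatureKey(["examples", "parent"]),
        fields=[FieldSpec(key=FieldKey(["default"]), code_version="1")],
    )):
        pass

    # Lookup accepts FeatureKey objects or "a/b" string notation.
    assert get_feature_by_key("examples/parent") is Parent

    # Content-addressed versions derived from the definition.
    print(Parent.feature_version())                # truncated hash over field versions
    print(Parent.feature_spec_version())           # hash of the entire spec
    print(test_graph.snapshot_version)             # hash of the whole graph
    print(test_graph.topological_sort_features())  # [FeatureKey(["examples", "parent"])]
```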