metaxy 0.0.1.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. metaxy/__init__.py +170 -0
  2. metaxy/_packaging.py +96 -0
  3. metaxy/_testing/__init__.py +55 -0
  4. metaxy/_testing/config.py +43 -0
  5. metaxy/_testing/metaxy_project.py +780 -0
  6. metaxy/_testing/models.py +111 -0
  7. metaxy/_testing/parametric/__init__.py +13 -0
  8. metaxy/_testing/parametric/metadata.py +664 -0
  9. metaxy/_testing/pytest_helpers.py +74 -0
  10. metaxy/_testing/runbook.py +533 -0
  11. metaxy/_utils.py +35 -0
  12. metaxy/_version.py +1 -0
  13. metaxy/cli/app.py +97 -0
  14. metaxy/cli/console.py +13 -0
  15. metaxy/cli/context.py +167 -0
  16. metaxy/cli/graph.py +610 -0
  17. metaxy/cli/graph_diff.py +290 -0
  18. metaxy/cli/list.py +46 -0
  19. metaxy/cli/metadata.py +317 -0
  20. metaxy/cli/migrations.py +999 -0
  21. metaxy/cli/utils.py +268 -0
  22. metaxy/config.py +680 -0
  23. metaxy/entrypoints.py +296 -0
  24. metaxy/ext/__init__.py +1 -0
  25. metaxy/ext/dagster/__init__.py +54 -0
  26. metaxy/ext/dagster/constants.py +10 -0
  27. metaxy/ext/dagster/dagster_type.py +156 -0
  28. metaxy/ext/dagster/io_manager.py +200 -0
  29. metaxy/ext/dagster/metaxify.py +512 -0
  30. metaxy/ext/dagster/observable.py +115 -0
  31. metaxy/ext/dagster/resources.py +27 -0
  32. metaxy/ext/dagster/selection.py +73 -0
  33. metaxy/ext/dagster/table_metadata.py +417 -0
  34. metaxy/ext/dagster/utils.py +462 -0
  35. metaxy/ext/sqlalchemy/__init__.py +23 -0
  36. metaxy/ext/sqlalchemy/config.py +29 -0
  37. metaxy/ext/sqlalchemy/plugin.py +353 -0
  38. metaxy/ext/sqlmodel/__init__.py +13 -0
  39. metaxy/ext/sqlmodel/config.py +29 -0
  40. metaxy/ext/sqlmodel/plugin.py +499 -0
  41. metaxy/graph/__init__.py +29 -0
  42. metaxy/graph/describe.py +325 -0
  43. metaxy/graph/diff/__init__.py +21 -0
  44. metaxy/graph/diff/diff_models.py +446 -0
  45. metaxy/graph/diff/differ.py +769 -0
  46. metaxy/graph/diff/models.py +443 -0
  47. metaxy/graph/diff/rendering/__init__.py +18 -0
  48. metaxy/graph/diff/rendering/base.py +323 -0
  49. metaxy/graph/diff/rendering/cards.py +188 -0
  50. metaxy/graph/diff/rendering/formatter.py +805 -0
  51. metaxy/graph/diff/rendering/graphviz.py +246 -0
  52. metaxy/graph/diff/rendering/mermaid.py +326 -0
  53. metaxy/graph/diff/rendering/rich.py +169 -0
  54. metaxy/graph/diff/rendering/theme.py +48 -0
  55. metaxy/graph/diff/traversal.py +247 -0
  56. metaxy/graph/status.py +329 -0
  57. metaxy/graph/utils.py +58 -0
  58. metaxy/metadata_store/__init__.py +32 -0
  59. metaxy/metadata_store/_ducklake_support.py +419 -0
  60. metaxy/metadata_store/base.py +1792 -0
  61. metaxy/metadata_store/bigquery.py +354 -0
  62. metaxy/metadata_store/clickhouse.py +184 -0
  63. metaxy/metadata_store/delta.py +371 -0
  64. metaxy/metadata_store/duckdb.py +446 -0
  65. metaxy/metadata_store/exceptions.py +61 -0
  66. metaxy/metadata_store/ibis.py +542 -0
  67. metaxy/metadata_store/lancedb.py +391 -0
  68. metaxy/metadata_store/memory.py +292 -0
  69. metaxy/metadata_store/system/__init__.py +57 -0
  70. metaxy/metadata_store/system/events.py +264 -0
  71. metaxy/metadata_store/system/keys.py +9 -0
  72. metaxy/metadata_store/system/models.py +129 -0
  73. metaxy/metadata_store/system/storage.py +957 -0
  74. metaxy/metadata_store/types.py +10 -0
  75. metaxy/metadata_store/utils.py +104 -0
  76. metaxy/metadata_store/warnings.py +36 -0
  77. metaxy/migrations/__init__.py +32 -0
  78. metaxy/migrations/detector.py +291 -0
  79. metaxy/migrations/executor.py +516 -0
  80. metaxy/migrations/generator.py +319 -0
  81. metaxy/migrations/loader.py +231 -0
  82. metaxy/migrations/models.py +528 -0
  83. metaxy/migrations/ops.py +447 -0
  84. metaxy/models/__init__.py +0 -0
  85. metaxy/models/bases.py +12 -0
  86. metaxy/models/constants.py +139 -0
  87. metaxy/models/feature.py +1335 -0
  88. metaxy/models/feature_spec.py +338 -0
  89. metaxy/models/field.py +263 -0
  90. metaxy/models/fields_mapping.py +307 -0
  91. metaxy/models/filter_expression.py +297 -0
  92. metaxy/models/lineage.py +285 -0
  93. metaxy/models/plan.py +232 -0
  94. metaxy/models/types.py +475 -0
  95. metaxy/py.typed +0 -0
  96. metaxy/utils/__init__.py +1 -0
  97. metaxy/utils/constants.py +2 -0
  98. metaxy/utils/exceptions.py +23 -0
  99. metaxy/utils/hashing.py +230 -0
  100. metaxy/versioning/__init__.py +31 -0
  101. metaxy/versioning/engine.py +656 -0
  102. metaxy/versioning/feature_dep_transformer.py +151 -0
  103. metaxy/versioning/ibis.py +249 -0
  104. metaxy/versioning/lineage_handler.py +205 -0
  105. metaxy/versioning/polars.py +189 -0
  106. metaxy/versioning/renamed_df.py +35 -0
  107. metaxy/versioning/types.py +63 -0
  108. metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
  109. metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
  110. metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
  111. metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
metaxy/models/feature.py
@@ -0,0 +1,1335 @@
+ import hashlib
+ from collections.abc import Iterator, Mapping, Sequence
+ from contextlib import contextmanager
+ from contextvars import ContextVar
+ from typing import TYPE_CHECKING, Any, ClassVar, TypedDict
+
+ import pydantic
+ from pydantic import AwareDatetime, Field, model_validator
+ from pydantic._internal._model_construction import ModelMetaclass
+ from typing_extensions import Self
+
+ from metaxy.models.constants import (
+     METAXY_FEATURE_SPEC_VERSION,
+     METAXY_FEATURE_VERSION,
+     METAXY_FULL_DEFINITION_VERSION,
+ )
+ from metaxy.models.feature_spec import (
+     FeatureSpec,
+ )
+ from metaxy.models.plan import FeaturePlan, FQFieldKey
+ from metaxy.models.types import (
+     CoercibleToFeatureKey,
+     FeatureKey,
+     ValidatedFeatureKeyAdapter,
+     ValidatedFeatureKeySequenceAdapter,
+ )
+ from metaxy.utils.hashing import truncate_hash
+
+ FEATURE_VERSION_COL = METAXY_FEATURE_VERSION
+ FEATURE_SPEC_VERSION_COL = METAXY_FEATURE_SPEC_VERSION
+ FEATURE_TRACKING_VERSION_COL = METAXY_FULL_DEFINITION_VERSION
+
+ if TYPE_CHECKING:
+     import narwhals as nw
+
+     from metaxy.versioning.types import Increment, LazyIncrement
+
+ # TODO: These are no longer used - remove after refactoring
+ # from metaxy.data_versioning.diff import MetadataDiffResolver
+ # from metaxy.data_versioning.joiners import UpstreamJoiner
+
+ # Context variable for the active graph (module-level)
+ _active_graph: ContextVar["FeatureGraph | None"] = ContextVar(
+     "_active_graph", default=None
+ )
+
+
+ def get_feature_by_key(key: CoercibleToFeatureKey) -> type["BaseFeature"]:
+     """Get a feature class by its key from the active graph.
+
+     Convenience function that retrieves a Metaxy feature class from the currently
+     active [feature graph][metaxy.FeatureGraph]. Can be useful when receiving a
+     feature key from storage or across process boundaries.
+
+     Args:
+         key: Feature key to look up. Accepts any type that can be converted into a feature key.
+
+     Returns:
+         Feature class
+
+     Raises:
+         KeyError: If no feature with the given key is registered
+
+     Example:
+         ```py
+         from metaxy import get_feature_by_key, FeatureKey
+
+         parent_key = FeatureKey(["examples", "parent"])
+         ParentFeature = get_feature_by_key(parent_key)
+
+         # Or use string notation
+         ParentFeature = get_feature_by_key("examples/parent")
+         ```
+     """
+     graph = FeatureGraph.get_active()
+     return graph.get_feature_by_key(key)
+
+
+ class SerializedFeature(TypedDict):
+     feature_spec: dict[str, Any]
+     feature_schema: dict[str, Any]
+     metaxy_feature_version: str
+     metaxy_feature_spec_version: str
+     metaxy_full_definition_version: str
+     feature_class_path: str
+     project: str
+
+
+ class FeatureGraph:
+     def __init__(self):
+         self.features_by_key: dict[FeatureKey, type[BaseFeature]] = {}
+         self.feature_specs_by_key: dict[FeatureKey, FeatureSpec] = {}
+         # Standalone specs registered without Feature classes (for migrations)
+         self.standalone_specs_by_key: dict[FeatureKey, FeatureSpec] = {}
+
+     @property
+     def all_specs_by_key(self) -> dict[FeatureKey, FeatureSpec]:
+         return {**self.feature_specs_by_key, **self.standalone_specs_by_key}
+
+     def add_feature(self, feature: type["BaseFeature"]) -> None:
+         """Add a feature to the graph.
+
+         Args:
+             feature: Feature class to register
+
+         Raises:
+             ValueError: If a feature with the same key is already registered,
+                 or if duplicate column names would result from renaming operations
+         """
+         if feature.spec().key in self.features_by_key:
+             existing = self.features_by_key[feature.spec().key]
+             raise ValueError(
+                 f"Feature with key {feature.spec().key.to_string()} already registered. "
+                 f"Existing: {existing.__name__}, New: {feature.__name__}. "
+                 f"Each feature key must be unique within a graph."
+             )
+
+         # Validate that there are no duplicate column names across dependencies after renaming
+         if feature.spec().deps:
+             self._validate_no_duplicate_columns(feature.spec())
+
+         self.features_by_key[feature.spec().key] = feature
+         self.feature_specs_by_key[feature.spec().key] = feature.spec()
+
+     def add_feature_spec(self, spec: FeatureSpec) -> None:
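+         """Register a standalone spec (one without a Feature class) on the graph.
+
+         Standalone specs are used when only the specification is available,
+         e.g. during migrations when the original Feature class cannot be imported.
+         If a Feature class is already registered for the key, the spec is ignored
+         with a warning, since the Feature class takes precedence.
+
+         Args:
+             spec: Feature specification to register
+
+         Raises:
+             ValueError: If a standalone spec with the same key but a different
+                 feature_spec_version is already registered
+         """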
+         import warnings
+
+         # Check if a Feature class already exists for this key
+         if spec.key in self.features_by_key:
+             warnings.warn(
+                 f"Feature class already exists for key {spec.key.to_string()}. "
+                 f"Standalone spec will be ignored - Feature class takes precedence.",
+                 stacklevel=2,
+             )
+             return
+
+         # Check if a standalone spec already exists
+         if spec.key in self.standalone_specs_by_key:
+             existing = self.standalone_specs_by_key[spec.key]
+             # Raise only if it's a different spec (compared by feature_spec_version)
+             if existing.feature_spec_version != spec.feature_spec_version:
+                 raise ValueError(
+                     f"Standalone spec for key {spec.key.to_string()} already exists "
+                     f"with a different version."
+                 )
+
+         # Validate that there are no duplicate columns across dependencies after renaming
+         if spec.deps:
+             self._validate_no_duplicate_columns(spec)
+
+         # Store the standalone spec
+         self.standalone_specs_by_key[spec.key] = spec
+         # Also add to feature_specs_by_key for methods that only need the spec
+         self.feature_specs_by_key[spec.key] = spec
+
+     def _validate_no_duplicate_columns(self, spec: "FeatureSpec") -> None:
+         """Validate that there are no duplicate column names across dependencies after renaming.
+
+         This method checks that, after all column selection and renaming operations,
+         no two columns have the same name (except for ID columns, which are expected
+         to be the same). Also validates that columns are not renamed to system column names.
+
+         Args:
+             spec: Feature specification to validate
+
+         Raises:
+             ValueError: If duplicate column names would result from the dependency
+                 configuration, or if columns are renamed to system column names
+         """
+         from metaxy.models.constants import ALL_SYSTEM_COLUMNS
+         from metaxy.models.feature_spec import FeatureDep
+
+         if not spec.deps:
+             return
+
+         # First, validate each dependency individually
+         for dep in spec.deps:
+             if not isinstance(dep, FeatureDep):
+                 continue
+
+             if dep.rename:
+                 # Get the upstream feature's spec to check its ID columns
+                 upstream_spec = self.feature_specs_by_key.get(dep.feature)
+                 upstream_id_columns = upstream_spec.id_columns if upstream_spec else []
+
+                 # Check for renaming to system columns or the upstream's ID columns
+                 for old_name, new_name in dep.rename.items():
+                     if new_name in ALL_SYSTEM_COLUMNS:
+                         raise ValueError(
+                             f"Cannot rename column '{old_name}' to system column name '{new_name}' "
+                             f"in dependency '{dep.feature.to_string()}'. "
+                             f"System columns: {sorted(ALL_SYSTEM_COLUMNS)}"
+                         )
+
+                     # Check against the upstream feature's ID columns
+                     if new_name in upstream_id_columns:
+                         raise ValueError(
+                             f"Cannot rename column '{old_name}' to ID column '{new_name}' "
+                             f"from upstream feature '{dep.feature.to_string()}'. "
+                             f"ID columns for '{dep.feature.to_string()}': {upstream_id_columns}"
+                         )
+
+                 # Check for duplicate column names within this dependency
+                 renamed_values = list(dep.rename.values())
+                 if len(renamed_values) != len(set(renamed_values)):
+                     # Find the duplicate(s)
+                     seen = set()
+                     duplicates = set()
+                     for name in renamed_values:
+                         if name in seen:
+                             duplicates.add(name)
+                         seen.add(name)
+                     raise ValueError(
+                         f"Duplicate column names after renaming in dependency '{dep.feature.to_string()}': "
+                         f"{sorted(duplicates)}. Cannot rename multiple columns to the same name within a single dependency."
+                     )
+
+         # Track all column names and their sources
+         column_sources: dict[str, list[str]] = {}  # column_name -> [source_features]
+         id_columns_set = set(spec.id_columns)
+
+         for dep in spec.deps:
+             if not isinstance(dep, FeatureDep):
+                 continue
+
+             dep_key_str = dep.feature.to_string()
+
+             # Get the upstream feature spec if available
+             upstream_spec = self.feature_specs_by_key.get(dep.feature)
+             if not upstream_spec:
+                 # If the upstream feature isn't registered yet, skip validation.
+                 # This can happen during circular imports or when features are defined in different modules.
+                 continue
+
+             # Determine which columns will be present from this dependency
+             if dep.columns is None:
+                 # All columns from upstream (except droppable system columns).
+                 # We don't know exactly which columns without the actual data,
+                 # but we can at least check the renamed columns.
+                 if dep.rename:
+                     for old_name, new_name in dep.rename.items():
+                         if (
+                             new_name not in id_columns_set
+                         ):  # ID columns are expected to be the same
+                             if new_name not in column_sources:
+                                 column_sources[new_name] = []
+                             column_sources[new_name].append(
+                                 f"{dep_key_str} (renamed from '{old_name}')"
+                             )
+                 # For non-renamed columns, we can't validate without knowing the actual columns.
+                 # This validation will happen at runtime in the joiner.
+             elif dep.columns == ():
+                 # Only system columns - no user columns to track
+                 pass
+             else:
+                 # Specific columns selected
+                 for col in dep.columns:
+                     # Check if this column is renamed
+                     if dep.rename and col in dep.rename:
+                         new_name = dep.rename[col]
+                         if new_name not in id_columns_set:
+                             if new_name not in column_sources:
+                                 column_sources[new_name] = []
+                             column_sources[new_name].append(
+                                 f"{dep_key_str} (renamed from '{col}')"
+                             )
+                     else:
+                         # Column keeps its original name
+                         if col not in id_columns_set:
+                             if col not in column_sources:
+                                 column_sources[col] = []
+                             column_sources[col].append(dep_key_str)
+
+         # Check for duplicates
+         duplicates = {
+             col: sources for col, sources in column_sources.items() if len(sources) > 1
+         }
+
+         if duplicates:
+             # Format the error message
+             error_lines = []
+             for col, sources in sorted(duplicates.items()):
+                 error_lines.append(
+                     f"  - Column '{col}' appears in: {', '.join(sources)}"
+                 )
+
+             raise ValueError(
+                 f"Feature '{spec.key.to_string()}' would have duplicate column names after renaming:\n"
+                 + "\n".join(error_lines)
+                 + "\n\nUse the 'rename' parameter in FeatureDep to resolve conflicts, "
+                 "or use 'columns' to select only the columns you need."
+             )
+
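To make the duplicate-column check concrete, here is a minimal sketch of a dependency configuration it rejects, assuming the upstream features `examples/a` and `examples/b` are already registered and both expose a `score` column. The `FeatureSpec`/`FeatureDep` constructor arguments mirror how those classes are used in this file and should be treated as assumptions, not documented API.

```py
from metaxy.models.feature_spec import FeatureDep, FeatureSpec
from metaxy.models.types import FeatureKey

# Hypothetical: both upstreams contribute a 'score' column and neither is
# renamed, so _validate_no_duplicate_columns raises ValueError listing
# both sources in the message.
spec = FeatureSpec(
    key=FeatureKey(["examples", "merged"]),
    deps=[
        FeatureDep(feature=FeatureKey(["examples", "a"]), columns=("score",)),
        FeatureDep(feature=FeatureKey(["examples", "b"]), columns=("score",)),
    ],
)
FeatureGraph.get_active().add_feature_spec(spec)  # -> ValueError
```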
+     def remove_feature(self, key: CoercibleToFeatureKey) -> None:
+         """Remove a feature from the graph.
+
+         Removes the Feature class or standalone spec (whichever exists).
+
+         Args:
+             key: Feature key to remove. Accepts any type that can be converted into a feature key.
+
+         Raises:
+             KeyError: If no feature with the given key is registered
+         """
+         # Validate and coerce the key
+         validated_key = ValidatedFeatureKeyAdapter.validate_python(key)
+
+         # Check both Feature classes and standalone specs
+         combined = {**self.feature_specs_by_key, **self.standalone_specs_by_key}
+
+         if validated_key not in combined:
+             raise KeyError(
+                 f"No feature with key {validated_key.to_string()} found in graph. "
+                 f"Available keys: {[k.to_string() for k in combined]}"
+             )
+
+         # Remove from all relevant dicts
+         if validated_key in self.features_by_key:
+             del self.features_by_key[validated_key]
+         if validated_key in self.standalone_specs_by_key:
+             del self.standalone_specs_by_key[validated_key]
+         if validated_key in self.feature_specs_by_key:
+             del self.feature_specs_by_key[validated_key]
+
+     def get_feature_by_key(self, key: CoercibleToFeatureKey) -> type["BaseFeature"]:
+         """Get a feature class by its key.
+
+         Args:
+             key: Feature key to look up. Accepts any type that can be converted into a feature key.
+
+         Returns:
+             Feature class
+
+         Raises:
+             KeyError: If no feature with the given key is registered
+
+         Example:
+             ```py
+             graph = FeatureGraph.get_active()
+             parent_key = FeatureKey(["examples", "parent"])
+             ParentFeature = graph.get_feature_by_key(parent_key)
+
+             # Or use string notation
+             ParentFeature = graph.get_feature_by_key("examples/parent")
+             ```
+         """
+         # Validate and coerce the key
+         validated_key = ValidatedFeatureKeyAdapter.validate_python(key)
+
+         if validated_key not in self.features_by_key:
+             raise KeyError(
+                 f"No feature with key {validated_key.to_string()} found in graph. "
+                 f"Available keys: {[k.to_string() for k in self.features_by_key.keys()]}"
+             )
+         return self.features_by_key[validated_key]
+
+     def list_features(
+         self,
+         projects: list[str] | str | None = None,
+         *,
+         only_current_project: bool = True,
+     ) -> list[FeatureKey]:
+         """List all feature keys in the graph, optionally filtered by project(s).
+
+         By default, filters features by the current project. This prevents
+         operations from affecting features in other projects.
+
+         Args:
+             projects: Project name(s) to filter by. Can be:
+                 - None: Use the current project from MetaxyConfig (if only_current_project=True)
+                 - str: Single project name
+                 - list[str]: Multiple project names
+             only_current_project: If True, filter by the current/specified project(s).
+                 If False, return all features regardless of project.
+
+         Returns:
+             List of feature keys
+
+         Example:
+             ```py
+             # Get all features for the current project
+             graph = FeatureGraph.get_active()
+             features = graph.list_features()
+
+             # Get features for a specific project
+             features = graph.list_features(projects="myproject")
+
+             # Get features for multiple projects
+             features = graph.list_features(projects=["project1", "project2"])
+
+             # Get all features regardless of project
+             all_features = graph.list_features(only_current_project=False)
+             ```
+         """
+         if not only_current_project:
+             # Return all features
+             return list(self.features_by_key.keys())
+
+         # Normalize projects to a list
+         project_list: list[str]
+         if projects is None:
+             # Try to get the project from the config context
+             try:
+                 from metaxy.config import MetaxyConfig
+
+                 config = MetaxyConfig.get()
+                 project_list = [config.project]
+             except RuntimeError:
+                 # Config not initialized - in tests or non-CLI usage.
+                 # Return all features (can't determine the project).
+                 return list(self.features_by_key.keys())
+         elif isinstance(projects, str):
+             project_list = [projects]
+         else:
+             project_list = projects
+
+         # Filter by project(s) using the Feature.project attribute
+         return [
+             key
+             for key in self.features_by_key.keys()
+             if self.features_by_key[key].project in project_list
+         ]
+
+     def get_feature_plan(self, key: CoercibleToFeatureKey) -> FeaturePlan:
+         """Get a feature plan for a given feature key.
+
+         Args:
+             key: Feature key to get the plan for. Accepts any type that can be converted into a feature key.
+
+         Returns:
+             FeaturePlan instance with the feature spec and dependencies.
+         """
+         # Validate and coerce the key
+         validated_key = ValidatedFeatureKeyAdapter.validate_python(key)
+
+         spec = self.all_specs_by_key[validated_key]
+
+         return FeaturePlan(
+             feature=spec,
+             deps=[self.feature_specs_by_key[dep.feature] for dep in spec.deps or []]
+             or None,
+             feature_deps=spec.deps,  # Pass the actual FeatureDep objects with field mappings
+         )
+
+     def get_field_version(self, key: "FQFieldKey") -> str:
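+         """Compute the version hash of a single fully-qualified field.
+
+         Hashes the field's key and code_version together with the (recursively
+         computed) versions of the parent fields it depends on, then truncates
+         the digest with truncate_hash.
+         """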
+         hasher = hashlib.sha256()
+
+         plan = self.get_feature_plan(key.feature)
+         field = plan.feature.fields_by_key[key.field]
+
+         hasher.update(key.to_string().encode())
+         hasher.update(str(field.code_version).encode())
+
+         for k in sorted(plan.get_parent_fields_for_field(key.field)):
+             hasher.update(self.get_field_version(k).encode())
+
+         return truncate_hash(hasher.hexdigest())
+
+     def get_feature_version_by_field(
+         self, key: CoercibleToFeatureKey
+     ) -> dict[str, str]:
+         """Computes the field provenance map for a feature.
+
+         Hashes together the field provenance entries with the feature code version.
+
+         Args:
+             key: Feature key to get field versions for. Accepts any type that can be converted into a feature key.
+
+         Returns:
+             dict[str, str]: The provenance hash for each field in the feature plan.
+                 Keys are field names as strings.
+         """
+         # Validate and coerce the key
+         validated_key = ValidatedFeatureKeyAdapter.validate_python(key)
+
+         res = {}
+
+         plan = self.get_feature_plan(validated_key)
+
+         for k in plan.feature.fields_by_key:
+             res[k.to_string()] = self.get_field_version(
+                 FQFieldKey(field=k, feature=validated_key)
+             )
+
+         return res
+
+     def get_feature_version(self, key: CoercibleToFeatureKey) -> str:
+         """Computes the feature version as a single string.
+
+         Args:
+             key: Feature key to get the version for. Accepts any type that can be converted into a feature key.
+
+         Returns:
+             Truncated SHA256 hash representing the feature version.
+         """
+         # Validate and coerce the key
+         validated_key = ValidatedFeatureKeyAdapter.validate_python(key)
+
+         hasher = hashlib.sha256()
+         provenance_by_field = self.get_feature_version_by_field(validated_key)
+         for field_key in sorted(provenance_by_field):
+             hasher.update(field_key.encode())
+             hasher.update(provenance_by_field[field_key].encode())
+
+         return truncate_hash(hasher.hexdigest())
+
+     def get_downstream_features(
+         self, sources: Sequence[CoercibleToFeatureKey]
+     ) -> list[FeatureKey]:
+         """Get all features downstream of the sources, topologically sorted.
+
+         Performs a depth-first traversal of the dependency graph to find all
+         features that transitively depend on any of the source features.
+
+         Args:
+             sources: List of source feature keys. Each element can be a string, sequence, FeatureKey, or BaseFeature class.
+
+         Returns:
+             List of downstream feature keys in topological order (dependencies first).
+             Does not include the source features themselves.
+
+         Example:
+             ```py
+             # DAG: A -> B -> D
+             #      A -> C -> D
+             graph.get_downstream_features([FeatureKey(["A"])])
+             # [FeatureKey(["B"]), FeatureKey(["C"]), FeatureKey(["D"])]
+
+             # Or use string notation
+             graph.get_downstream_features(["A"])
+             ```
+         """
+         # Validate and coerce the source keys
+         validated_sources = ValidatedFeatureKeySequenceAdapter.validate_python(sources)
+
+         source_set = set(validated_sources)
+         visited = set()
+         post_order = []  # Reverse topological order
+
+         def visit(key: FeatureKey):
+             """DFS traversal."""
+             if key in visited:
+                 return
+             visited.add(key)
+
+             # Find all features that depend on this one
+             for feature_key, feature_spec in self.feature_specs_by_key.items():
+                 if feature_spec.deps:
+                     for dep in feature_spec.deps:
+                         if dep.feature == key:
+                             # This feature depends on 'key', so visit it
+                             visit(feature_key)
+
+             post_order.append(key)
+
+         # Visit all sources
+         for source in validated_sources:
+             visit(source)
+
+         # Remove the sources from the result; reverse to get topological order
+         result = [k for k in reversed(post_order) if k not in source_set]
+         return result
+
+     def topological_sort_features(
+         self,
+         feature_keys: Sequence[CoercibleToFeatureKey] | None = None,
+         *,
+         descending: bool = False,
+     ) -> list[FeatureKey]:
+         """Sort feature keys in topological order.
+
+         Uses stable alphabetical ordering when multiple nodes are at the same level.
+         This ensures deterministic output for diff comparisons and migrations.
+
+         Implemented using depth-first search with post-order traversal.
+
+         Args:
+             feature_keys: List of feature keys to sort. Each element can be a string, sequence,
+                 FeatureKey, or BaseFeature class. If None, sorts all features
+                 (both Feature classes and standalone specs) in the graph.
+             descending: If False (default), dependencies appear before dependents.
+                 For a chain A -> B -> C, returns [A, B, C].
+                 If True, dependents appear before dependencies.
+                 For a chain A -> B -> C, returns [C, B, A].
+
+         Returns:
+             List of feature keys sorted in topological order
+
+         Example:
+             ```py
+             graph = FeatureGraph.get_active()
+             # Sort specific features (dependencies first)
+             sorted_keys = graph.topological_sort_features([
+                 FeatureKey(["video", "raw"]),
+                 FeatureKey(["video", "scene"]),
+             ])
+
+             # Or use string notation
+             sorted_keys = graph.topological_sort_features(["video/raw", "video/scene"])
+
+             # Sort all features in the graph (including standalone specs)
+             all_sorted = graph.topological_sort_features()
+
+             # Sort with dependents first (useful for processing leaf nodes before roots)
+             reverse_sorted = graph.topological_sort_features(descending=True)
+             ```
+         """
+         # Determine which features to sort
+         if feature_keys is None:
+             # Include both Feature classes and standalone specs
+             keys_to_sort = set(self.feature_specs_by_key.keys())
+         else:
+             # Validate and coerce the feature keys
+             validated_keys = ValidatedFeatureKeySequenceAdapter.validate_python(
+                 feature_keys
+             )
+             keys_to_sort = set(validated_keys)
+
+         visited = set()
+         result = []  # Topological order (dependencies first)
+
+         def visit(key: FeatureKey):
+             """DFS visit with post-order traversal."""
+             if key in visited or key not in keys_to_sort:
+                 return
+             visited.add(key)
+
+             # Get dependencies from the feature spec
+             spec = self.feature_specs_by_key.get(key)
+             if spec and spec.deps:
+                 # Sort dependencies alphabetically for deterministic ordering
+                 sorted_deps = sorted(
+                     (dep.feature for dep in spec.deps),
+                     key=lambda k: k.to_string().lower(),
+                 )
+                 for dep_key in sorted_deps:
+                     if dep_key in keys_to_sort:
+                         visit(dep_key)
+
+             # Add to the result after visiting dependencies (post-order)
+             result.append(key)
+
+         # Visit all keys in sorted order for deterministic traversal
+         for key in sorted(keys_to_sort, key=lambda k: k.to_string().lower()):
+             visit(key)
+
+         # Post-order DFS gives topological order (dependencies before dependents)
+         if descending:
+             return list(reversed(result))
+         return result
+
+     @property
+     def snapshot_version(self) -> str:
+         """Generate a snapshot version representing the current topology and versions of the feature graph."""
+         if len(self.feature_specs_by_key) == 0:
+             return "empty"
+
+         hasher = hashlib.sha256()
+         for feature_key in sorted(self.feature_specs_by_key.keys()):
+             hasher.update(feature_key.to_string().encode("utf-8"))
+             hasher.update(self.get_feature_version(feature_key).encode("utf-8"))
+         return truncate_hash(hasher.hexdigest())
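Because `snapshot_version` folds every registered feature key and its `feature_version` into one digest, a change anywhere in the graph produces a new value. A small illustrative sketch (the setup is assumed, not taken from the package's tests):

```py
graph = FeatureGraph()
assert graph.snapshot_version == "empty"  # empty graphs short-circuit

with graph.use():
    # ...define features here; each class definition registers itself...
    pass

v1 = graph.snapshot_version
# Bumping any field's code_version and re-registering the feature would
# change that feature's feature_version, and therefore the snapshot hash too.
```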
+
+     def to_snapshot(self) -> dict[str, SerializedFeature]:
+         """Serialize the graph to snapshot format.
+
+         Returns a dict mapping feature_key (string) to a feature data dict,
+         including the import path of the Feature class for reconstruction.
+
+         Returns:
+             Dictionary mapping feature_key (string) to feature data dict
+
+         Example:
+             ```py
+             snapshot = graph.to_snapshot()
+             snapshot["video_processing"]["metaxy_feature_version"]
+             # 'abc12345'
+             snapshot["video_processing"]["metaxy_feature_spec_version"]
+             # 'def67890'
+             snapshot["video_processing"]["metaxy_full_definition_version"]
+             # 'xyz98765'
+             snapshot["video_processing"]["feature_class_path"]
+             # 'myapp.features.video.VideoProcessing'
+             snapshot["video_processing"]["project"]
+             # 'myapp'
+             ```
+         """
+         snapshot: dict[str, SerializedFeature] = {}
+
+         for feature_key, feature_cls in self.features_by_key.items():
+             feature_key_str = feature_key.to_string()
+             feature_spec_dict = feature_cls.spec().model_dump(mode="json")  # type: ignore[attr-defined]
+             feature_schema_dict = feature_cls.model_json_schema()  # type: ignore[attr-defined]
+             feature_version = feature_cls.feature_version()  # type: ignore[attr-defined]
+             feature_spec_version = feature_cls.spec().feature_spec_version  # type: ignore[attr-defined]
+             full_definition_version = feature_cls.full_definition_version()  # type: ignore[attr-defined]
+             project = feature_cls.project  # type: ignore[attr-defined]
+
+             # Get the class import path (module.ClassName)
+             class_path = f"{feature_cls.__module__}.{feature_cls.__name__}"
+
+             snapshot[feature_key_str] = {  # pyright: ignore
+                 "feature_spec": feature_spec_dict,
+                 "feature_schema": feature_schema_dict,
+                 FEATURE_VERSION_COL: feature_version,
+                 FEATURE_SPEC_VERSION_COL: feature_spec_version,
+                 FEATURE_TRACKING_VERSION_COL: full_definition_version,
+                 "feature_class_path": class_path,
+                 "project": project,
+             }
+
+         return snapshot
+
+     @classmethod
+     def from_snapshot(
+         cls,
+         snapshot_data: Mapping[str, Mapping[str, Any]],
+         *,
+         class_path_overrides: dict[str, str] | None = None,
+         force_reload: bool = False,
+     ) -> "FeatureGraph":
+         """Reconstruct a graph from a snapshot by importing Feature classes.
+
+         Strictly requires Feature classes to exist at their recorded import paths.
+         This ensures custom methods (like load_input) are available.
+
+         If a feature has been moved/renamed, use class_path_overrides to specify
+         the new location.
+
+         Args:
+             snapshot_data: Dict of feature_key -> dict containing
+                 feature_spec (dict), feature_class_path (str), and other fields,
+                 as returned by to_snapshot() or loaded from the DB
+             class_path_overrides: Optional dict mapping feature_key to a new class path
+                 for features that have been moved/renamed
+             force_reload: If True, reload modules from disk to get the current code state.
+
+         Returns:
+             New FeatureGraph with historical features
+
+         Raises:
+             ImportError: If a feature class cannot be imported at its recorded path
+
+         Example:
+             ```py
+             # Load snapshot from the metadata store
+             historical_graph = FeatureGraph.from_snapshot(snapshot_data)
+
+             # With an override for a moved feature
+             historical_graph = FeatureGraph.from_snapshot(
+                 snapshot_data,
+                 class_path_overrides={
+                     "video_processing": "myapp.features_v2.VideoProcessing"
+                 },
+             )
+             ```
+         """
+         import importlib
+         import sys
+
+         graph = cls()
+         class_path_overrides = class_path_overrides or {}
+
+         # If force_reload, collect all module paths first so ALL features from
+         # those modules can be removed before reloading (modules can have multiple features)
+         modules_to_reload = set()
+         if force_reload:
+             for feature_key_str, feature_data in snapshot_data.items():
+                 class_path = class_path_overrides.get(
+                     feature_key_str
+                 ) or feature_data.get("feature_class_path")
+                 if class_path:
+                     module_path, _ = class_path.rsplit(".", 1)
+                     if module_path in sys.modules:
+                         modules_to_reload.add(module_path)
+
+         # Use the context manager to temporarily set the new graph as active.
+         # This ensures imported Feature classes register to the new graph, not the current one.
+         with graph.use():
+             for feature_key_str, feature_data in snapshot_data.items():
+                 # Parse the FeatureSpec for validation
+                 feature_spec_dict = feature_data["feature_spec"]
+                 FeatureSpec.model_validate(feature_spec_dict)
+
+                 # Get the class path (check overrides first)
+                 if feature_key_str in class_path_overrides:
+                     class_path = class_path_overrides[feature_key_str]
+                 else:
+                     class_path = feature_data.get("feature_class_path")
+                     if not class_path:
+                         raise ValueError(
+                             f"Feature '{feature_key_str}' has no feature_class_path in snapshot. "
+                             f"Cannot reconstruct historical graph."
+                         )
+
+                 # Import the class
+                 try:
+                     module_path, class_name = class_path.rsplit(".", 1)
+
+                     # Force reload the module from disk if requested.
+                     # This is critical for migration detection - when code changes,
+                     # we need fresh imports to detect the changes.
+                     if force_reload and module_path in modules_to_reload:
+                         # Before the first reload of this module, remove ALL features
+                         # defined in it (a module can define multiple features)
+                         for fk_str, fd in snapshot_data.items():
+                             fcp = class_path_overrides.get(fk_str) or fd.get(
+                                 "feature_class_path"
+                             )
+                             if fcp and fcp.rsplit(".", 1)[0] == module_path:
+                                 fspec_dict = fd["feature_spec"]
+                                 fspec = FeatureSpec.model_validate(fspec_dict)
+                                 if fspec.key in graph.features_by_key:
+                                     graph.remove_feature(fspec.key)
+
+                         # Mark the module as processed so we don't remove features again
+                         modules_to_reload.discard(module_path)
+
+                         module = importlib.reload(sys.modules[module_path])
+                     else:
+                         module = __import__(module_path, fromlist=[class_name])
+
+                     feature_cls = getattr(module, class_name)
+                 except (ImportError, AttributeError):
+                     # Feature class not importable - add as a standalone spec instead.
+                     # This allows migrations to work even when old Feature classes are deleted/moved.
+                     import logging
+
+                     logger = logging.getLogger(__name__)
+                     logger.exception(
+                         f"Cannot import Feature class '{class_path}' for '{feature_key_str}'. "
+                         f"Adding only the FeatureSpec."
+                     )
+
+                     feature_spec = FeatureSpec.model_validate(feature_spec_dict)
+                     # Add the spec as a standalone spec
+                     graph.add_feature_spec(feature_spec)
+                     continue
+
+                 # Validate that the imported class matches the stored spec
+                 if not hasattr(feature_cls, "spec"):
+                     raise TypeError(
+                         f"Imported class '{class_path}' is not a valid Feature class "
+                         f"(missing 'spec' attribute)"
+                     )
+
+                 # Register the imported feature to this graph if not already present.
+                 # If the module was imported for the first time, the metaclass already registered it;
+                 # if the module was previously imported, we need to register it manually.
+                 if feature_cls.spec().key not in graph.features_by_key:
+                     graph.add_feature(feature_cls)
+
+         return graph
+
+     @classmethod
+     def get_active(cls) -> "FeatureGraph":
+         """Get the currently active graph.
+
+         Returns the graph from the context variable if set, otherwise returns
+         the default global graph.
+
+         Returns:
+             Active FeatureGraph instance
+
+         Example:
+             ```py
+             # Normal usage - returns the default graph
+             reg = FeatureGraph.get_active()
+
+             # With a custom graph in context
+             with my_graph.use():
+                 reg = FeatureGraph.get_active()  # Returns my_graph
+             ```
+         """
+         return _active_graph.get() or graph
+
+     @classmethod
+     def set_active(cls, reg: "FeatureGraph") -> None:
+         """Set the active graph for the current context.
+
+         This sets the context variable that will be returned by get_active().
+         Typically used in application setup code or test fixtures.
+
+         Args:
+             reg: FeatureGraph to activate
+
+         Example:
+             ```py
+             # In application setup
+             my_graph = FeatureGraph()
+             FeatureGraph.set_active(my_graph)
+
+             # Now all operations use my_graph
+             FeatureGraph.get_active()  # Returns my_graph
+             ```
+         """
+         _active_graph.set(reg)
+
+     @contextmanager
+     def use(self) -> Iterator[Self]:
+         """Context manager to temporarily use this graph as the active one.
+
+         This is the recommended way to use custom graphs, especially in tests.
+         The previous graph is automatically restored when the context exits.
+
+         Yields:
+             FeatureGraph: This graph instance
+
+         Example:
+             ```py
+             test_graph = FeatureGraph()
+
+             with test_graph.use():
+                 # All operations use test_graph
+                 class TestFeature(Feature, spec=...):
+                     pass
+
+             # Outside the context, back to the previous graph
+             ```
+         """
+         token = _active_graph.set(self)
+         try:
+             yield self
+         finally:
+             _active_graph.reset(token)
+
+
+ def current_graph() -> FeatureGraph:
+     """Get the currently active graph.
+
+     Returns:
+         FeatureGraph: The currently active graph.
+     """
+     return FeatureGraph.get_active()
+
+
+ # Default global graph
+ graph = FeatureGraph()
+
+
+ class MetaxyMeta(ModelMetaclass):
+     def __new__(
+         cls,
+         cls_name: str,
+         bases: tuple[type[Any], ...],
+         namespace: dict[str, Any],
+         *,
+         spec: FeatureSpec | None = None,
+         **kwargs,
+     ) -> type[Self]:  # pyright: ignore[reportGeneralTypeIssues]
+         # Inject a frozen config if not already specified in the namespace
+         if "model_config" not in namespace:
+             from pydantic import ConfigDict
+
+             namespace["model_config"] = ConfigDict(frozen=True)
+
+         new_cls = super().__new__(cls, cls_name, bases, namespace, **kwargs)
+
+         if spec:
+             # Get the graph from context at class definition time
+             active_graph = FeatureGraph.get_active()
+             new_cls.graph = active_graph  # type: ignore[attr-defined]
+             new_cls._spec = spec  # type: ignore[attr-defined]
+
+             # Determine the project for this feature using intelligent detection
+             project = cls._detect_project(new_cls)
+             new_cls.project = project  # type: ignore[attr-defined]
+
+             active_graph.add_feature(new_cls)
+         else:
+             pass  # TODO: set spec to a property that would raise an exception on access
+
+         return new_cls
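Registration is a side effect of class creation: `MetaxyMeta.__new__` grabs the active graph and calls `add_feature`. A hedged sketch of that flow, assuming `Feature`, `FeatureSpec`, `FieldSpec`, `FieldKey`, and `FeatureKey` are importable from the top-level `metaxy` package as the docstring examples in this file suggest:

```py
from metaxy import Feature, FeatureGraph, FeatureKey, FeatureSpec, FieldKey, FieldSpec

my_graph = FeatureGraph()

with my_graph.use():
    # Defining the class runs MetaxyMeta.__new__, which registers it
    # on whichever graph is active at definition time.
    class MyFeature(
        Feature,
        spec=FeatureSpec(
            key=FeatureKey(["my", "feature"]),
            fields=[FieldSpec(key=FieldKey(["default"]), code_version="1")],
        ),
    ):
        pass

assert my_graph.get_feature_by_key("my/feature") is MyFeature
```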
+
+     @staticmethod
+     def _detect_project(feature_cls: type) -> str:
+         """Detect the project for a feature class.
+
+         Detection order:
+         1. Try to auto-load MetaxyConfig from metaxy.toml/pyproject.toml,
+            starting from the feature's file location
+         2. Use config.project if available
+         3. Check metaxy.projects entry points as a fallback
+         4. Fall back to "default" with a warning
+
+         Args:
+             feature_cls: The Feature class being registered
+
+         Returns:
+             Project name string
+         """
+         import inspect
+         import warnings
+         from pathlib import Path
+
+         from metaxy._packaging import detect_project_from_entrypoints
+         from metaxy.config import MetaxyConfig
+
+         module_name = feature_cls.__module__
+
+         # Strategy 1: Try to load the config if not already set
+         if not MetaxyConfig.is_set():
+             # Get the file where the feature class is defined
+             feature_file = inspect.getfile(feature_cls)
+             feature_dir = Path(feature_file).parent
+
+             # Attempt to auto-load the config from metaxy.toml or pyproject.toml,
+             # starting from the feature's directory
+             config = MetaxyConfig.load(
+                 search_parents=True, auto_discovery_start=feature_dir
+             )
+             return config.project
+         else:
+             # Config already set, use it
+             config = MetaxyConfig.get()
+             return config.project
+
+         # Strategy 2: Check metaxy.projects entry points as a fallback.
+         # NOTE: both branches above return, so in this release the code below is unreachable.
+         project = detect_project_from_entrypoints(module_name)
+         if project is not None:
+             return project
+
+         # Strategy 3: Fall back to "default" with a warning
+         warnings.warn(
+             f"Could not detect project for feature '{feature_cls.__name__}' "
+             f"from module '{module_name}'. No metaxy.toml found and no entry point configured. "
+             f"Using 'default' as project name. This may cause issues with metadata isolation. "
+             f"Please ensure features are imported after init_metaxy() or configure a metaxy.toml file.",
+             stacklevel=3,
+         )
+         return "default"
+
+
+ class _FeatureSpecDescriptor:
+     """Descriptor that returns the feature spec of the feature."""
+
+     def __get__(self, instance, owner) -> FeatureSpec:
+         if owner.spec is None:
+             raise ValueError(f"Feature '{owner.__name__}' has no spec defined.")
+         return owner.spec
+
+
+ class BaseFeature(pydantic.BaseModel, metaclass=MetaxyMeta, spec=None):
+     _spec: ClassVar[FeatureSpec]
+
+     graph: ClassVar[FeatureGraph]
+     project: ClassVar[str]
+
+     # System columns - automatically managed by Metaxy.
+     # Most of them are optional since Metaxy injects them into dataframes at some point.
+     metaxy_provenance_by_field: dict[str, str] = Field(
+         default_factory=dict,
+         description="Field-level provenance hashes (maps field names to hashes)",
+     )
+     metaxy_provenance: str | None = Field(
+         default=None,
+         description="Hash of metaxy_provenance_by_field",
+     )
+     metaxy_feature_version: str | None = Field(
+         default=None,
+         description="Hash of the feature definition (dependencies + fields + code_versions)",
+     )
+     metaxy_snapshot_version: str | None = Field(
+         default=None,
+         description="Hash of the entire feature graph snapshot",
+     )
+     metaxy_data_version_by_field: dict[str, str] | None = Field(
+         default=None,
+         description="Field-level data version hashes (maps field names to version hashes)",
+     )
+     metaxy_data_version: str | None = Field(
+         default=None,
+         description="Hash of metaxy_data_version_by_field",
+     )
+     metaxy_created_at: AwareDatetime | None = Field(
+         default=None,
+         description="Timestamp when the metadata row was created (UTC)",
+     )
+     metaxy_materialization_id: str | None = Field(
+         default=None,
+         description="External orchestration run ID (e.g., Dagster Run ID)",
+     )
+
+     @model_validator(mode="after")
+     def _validate_id_columns_exist(self) -> Self:
+         """Validate that all id_columns from the spec are present in the model fields."""
+         spec = self.__class__.spec()
+         model_fields = set(self.__class__.model_fields.keys())
+
+         missing_columns = set(spec.id_columns) - model_fields
+         if missing_columns:
+             raise ValueError(
+                 f"ID columns {missing_columns} specified in spec are not present in model fields. "
+                 f"Available fields: {model_fields}"
+             )
+         return self
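The validator above fails fast at instantiation time when a spec declares an ID column the model does not define. A hedged sketch (the `id_columns` constructor argument name is an assumption inferred from the `spec.id_columns` usage above):

```py
# Hypothetical: the spec declares 'sample_id' but the model has no such field.
class BrokenFeature(
    Feature,
    spec=FeatureSpec(
        key=FeatureKey(["examples", "broken"]),
        id_columns=["sample_id"],  # assumed constructor argument
        fields=[FieldSpec(key=FieldKey(["default"]), code_version="1")],
    ),
):
    pass  # no 'sample_id' field defined on the model

BrokenFeature()  # raises ValueError: ID columns {'sample_id'} ... not present
```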
+
+     @classmethod
+     def spec(cls) -> FeatureSpec:  # type: ignore[override]
+         return cls._spec
+
+     @classmethod
+     def table_name(cls) -> str:
+         """Get the SQL-like table name for this feature.
+
+         Converts the feature key to a SQL-compatible table name by joining
+         its parts with double underscores, consistent with IbisMetadataStore.
+
+         Returns:
+             Table name string (e.g., "my_namespace__my_feature")
+
+         Example:
+             ```py
+             class VideoFeature(Feature, spec=FeatureSpec(
+                 key=FeatureKey(["video", "processing"]),
+                 ...
+             )):
+                 pass
+
+             VideoFeature.table_name()
+             # 'video__processing'
+             ```
+         """
+         return cls.spec().table_name()
+
+     @classmethod
+     def feature_version(cls) -> str:
+         """Get the hash of the feature specification.
+
+         Returns a hash representing the feature's complete configuration:
+         - Feature key
+         - Field definitions and code versions
+         - Dependencies (feature-level and field-level)
+
+         This hash changes when you modify:
+         - Field code versions
+         - Dependencies
+         - Field definitions
+
+         Used to distinguish current vs historical field provenance hashes.
+         Stored in the 'metaxy_feature_version' column of metadata DataFrames.
+
+         Returns:
+             Truncated SHA256 hex digest (similar to git short hashes)
+
+         Example:
+             ```py
+             class MyFeature(Feature, spec=FeatureSpec(
+                 key=FeatureKey(["my", "feature"]),
+                 fields=[FieldSpec(key=FieldKey(["default"]), code_version="1")],
+             )):
+                 pass
+
+             MyFeature.feature_version()
+             # 'a3f8b2c1...'
+             ```
+         """
+         return cls.graph.get_feature_version(cls.spec().key)
+
+     @classmethod
+     def feature_spec_version(cls) -> str:
+         """Get the hash of the complete feature specification.
+
+         Returns a hash representing ALL specification properties, including:
+         - Feature key
+         - Dependencies
+         - Fields
+         - Code versions
+         - Any future metadata, tags, or other properties
+
+         Unlike feature_version, which only hashes computational properties
+         (for migration triggering), feature_spec_version captures the entire
+         specification for complete reproducibility and audit purposes.
+
+         Stored in the 'metaxy_feature_spec_version' column of metadata DataFrames.
+
+         Returns:
+             SHA256 hex digest of the complete specification
+
+         Example:
+             ```py
+             class MyFeature(Feature, spec=FeatureSpec(
+                 key=FeatureKey(["my", "feature"]),
+                 fields=[FieldSpec(key=FieldKey(["default"]), code_version="1")],
+             )):
+                 pass
+
+             MyFeature.feature_spec_version()
+             # 'def456...'  # Different from feature_version
+             ```
+         """
+         return cls.spec().feature_spec_version
+
+     @classmethod
+     def full_definition_version(cls) -> str:
+         """Get the hash of the complete feature definition, including the Pydantic schema.
+
+         This method computes a hash of the entire feature class definition, including:
+         - Pydantic model schema
+         - Feature specification (via feature_spec_version)
+         - Project name
+
+         Used in the `metaxy_full_definition_version` column of system tables.
+
+         Returns:
+             Truncated SHA256 hex digest of the complete definition
+         """
+         import json
+
+         hasher = hashlib.sha256()
+
+         # Hash the Pydantic schema (includes field types, descriptions, validators, etc.)
+         schema = cls.model_json_schema()
+         schema_json = json.dumps(schema, sort_keys=True)
+         hasher.update(schema_json.encode())
+
+         # Hash the feature specification
+         hasher.update(cls.feature_spec_version().encode())
+
+         # Hash the project name
+         hasher.update(cls.project.encode())
+
+         return truncate_hash(hasher.hexdigest())
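Taken together, the class exposes three hashes of widening scope: `feature_version` covers computational properties (key, fields and code versions, dependencies), `feature_spec_version` covers the whole spec, and `full_definition_version` adds the Pydantic schema and project. A brief illustrative sketch of the practical consequence (not from the package's test suite):

```py
v_feature = MyFeature.feature_version()        # computational properties only
v_spec = MyFeature.feature_spec_version()      # the entire FeatureSpec
v_full = MyFeature.full_definition_version()   # spec + model schema + project

# Editing only a pydantic field description changes the model's JSON schema,
# so v_full changes while v_feature and v_spec stay stable; bumping a
# field's code_version changes all three.
```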
+
+     @classmethod
+     def provenance_by_field(cls) -> dict[str, str]:
+         """Get the code-level field provenance for this feature.
+
+         This returns a static hash based on code versions and dependencies,
+         not sample-level field provenance computed from upstream data.
+
+         Returns:
+             Dictionary mapping field keys to their provenance hashes.
+         """
+         return cls.graph.get_feature_version_by_field(cls.spec().key)
+
+     @classmethod
+     def load_input(
+         cls,
+         joiner: Any,
+         upstream_refs: dict[str, "nw.LazyFrame[Any]"],
+     ) -> tuple["nw.LazyFrame[Any]", dict[str, str]]:
+         """Join upstream feature metadata.
+
+         Override for custom join logic (1:many joins, different keys, filtering, etc.).
+
+         Args:
+             joiner: UpstreamJoiner from the MetadataStore
+             upstream_refs: Upstream feature metadata references (lazy where possible)
+
+         Returns:
+             (joined_upstream, upstream_column_mapping)
+             - joined_upstream: All upstream data joined together
+             - upstream_column_mapping: Maps upstream_key -> column name
+         """
+         from metaxy.models.feature_spec import FeatureDep
+
+         # Extract columns and renames from the deps
+         upstream_columns: dict[str, tuple[str, ...] | None] = {}
+         upstream_renames: dict[str, dict[str, str] | None] = {}
+
+         deps = cls.spec().deps
+         if deps:
+             for dep in deps:
+                 if isinstance(dep, FeatureDep):
+                     dep_key_str = dep.feature.to_string()
+                     upstream_columns[dep_key_str] = dep.columns
+                     upstream_renames[dep_key_str] = dep.rename
+
+         return joiner.join_upstream(
+             upstream_refs=upstream_refs,
+             feature_spec=cls.spec(),
+             feature_plan=cls.graph.get_feature_plan(cls.spec().key),
+             upstream_columns=upstream_columns,
+             upstream_renames=upstream_renames,
+         )
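Since `load_input` is explicitly designed to be overridden, here is a minimal hedged sketch of an override that pre-filters one upstream before delegating to the default join; the upstream key and the `is_valid` column are hypothetical:

```py
import narwhals as nw

class FilteredFeature(Feature, spec=...):  # spec elided for brevity
    @classmethod
    def load_input(cls, joiner, upstream_refs):
        # Keep only rows flagged as valid in one upstream, then fall back
        # to the default join implemented on BaseFeature.
        key = "examples/parent"  # hypothetical upstream key
        upstream_refs[key] = upstream_refs[key].filter(nw.col("is_valid"))
        return super().load_input(joiner, upstream_refs)
```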
+
+     @classmethod
+     def resolve_data_version_diff(
+         cls,
+         diff_resolver: Any,
+         target_provenance: "nw.LazyFrame[Any]",
+         current_metadata: "nw.LazyFrame[Any] | None",
+         *,
+         lazy: bool = False,
+     ) -> "Increment | LazyIncrement":
+         """Resolve differences between target and current field provenance.
+
+         Override for custom diff logic (ignore certain fields, custom rules, etc.).
+
+         Args:
+             diff_resolver: MetadataDiffResolver from the MetadataStore
+             target_provenance: Calculated target field provenance (Narwhals LazyFrame)
+             current_metadata: Current metadata for this feature (Narwhals LazyFrame, or None).
+                 Should be pre-filtered by feature_version at the store level.
+             lazy: If True, return a LazyIncrement. If False, return an Increment.
+
+         Returns:
+             Increment (eager) or LazyIncrement (lazy) with added, changed, removed
+
+         Example (default):
+             ```py
+             class MyFeature(Feature, spec=...):
+                 pass  # Uses the diff resolver's default implementation
+             ```
+
+         Example (ignore certain field changes):
+             ```py
+             class MyFeature(Feature, spec=...):
+                 @classmethod
+                 def resolve_data_version_diff(cls, diff_resolver, target_provenance, current_metadata, **kwargs):
+                     # Get the standard diff
+                     result = diff_resolver.find_changes(target_provenance, current_metadata, cls.spec().id_columns)
+
+                     # Custom: Only consider 'frames' field changes, ignore 'audio'
+                     # Users can filter/modify the increment here
+
+                     return result  # Return the modified Increment
+             ```
+         """
+         # The diff resolver always returns a LazyIncrement - materialize if needed
+         lazy_result = diff_resolver.find_changes(
+             target_provenance=target_provenance,
+             current_metadata=current_metadata,
+             id_columns=cls.spec().id_columns,  # Pass ID columns from the feature spec
+         )
+
+         # Materialize to an Increment if lazy=False
+         if not lazy:
+             from metaxy.versioning.types import Increment
+
+             return Increment(
+                 added=lazy_result.added.collect(),
+                 changed=lazy_result.changed.collect(),
+                 removed=lazy_result.removed.collect(),
+             )
+
+         return lazy_result