metaxy 0.0.1.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. metaxy/__init__.py +170 -0
  2. metaxy/_packaging.py +96 -0
  3. metaxy/_testing/__init__.py +55 -0
  4. metaxy/_testing/config.py +43 -0
  5. metaxy/_testing/metaxy_project.py +780 -0
  6. metaxy/_testing/models.py +111 -0
  7. metaxy/_testing/parametric/__init__.py +13 -0
  8. metaxy/_testing/parametric/metadata.py +664 -0
  9. metaxy/_testing/pytest_helpers.py +74 -0
  10. metaxy/_testing/runbook.py +533 -0
  11. metaxy/_utils.py +35 -0
  12. metaxy/_version.py +1 -0
  13. metaxy/cli/app.py +97 -0
  14. metaxy/cli/console.py +13 -0
  15. metaxy/cli/context.py +167 -0
  16. metaxy/cli/graph.py +610 -0
  17. metaxy/cli/graph_diff.py +290 -0
  18. metaxy/cli/list.py +46 -0
  19. metaxy/cli/metadata.py +317 -0
  20. metaxy/cli/migrations.py +999 -0
  21. metaxy/cli/utils.py +268 -0
  22. metaxy/config.py +680 -0
  23. metaxy/entrypoints.py +296 -0
  24. metaxy/ext/__init__.py +1 -0
  25. metaxy/ext/dagster/__init__.py +54 -0
  26. metaxy/ext/dagster/constants.py +10 -0
  27. metaxy/ext/dagster/dagster_type.py +156 -0
  28. metaxy/ext/dagster/io_manager.py +200 -0
  29. metaxy/ext/dagster/metaxify.py +512 -0
  30. metaxy/ext/dagster/observable.py +115 -0
  31. metaxy/ext/dagster/resources.py +27 -0
  32. metaxy/ext/dagster/selection.py +73 -0
  33. metaxy/ext/dagster/table_metadata.py +417 -0
  34. metaxy/ext/dagster/utils.py +462 -0
  35. metaxy/ext/sqlalchemy/__init__.py +23 -0
  36. metaxy/ext/sqlalchemy/config.py +29 -0
  37. metaxy/ext/sqlalchemy/plugin.py +353 -0
  38. metaxy/ext/sqlmodel/__init__.py +13 -0
  39. metaxy/ext/sqlmodel/config.py +29 -0
  40. metaxy/ext/sqlmodel/plugin.py +499 -0
  41. metaxy/graph/__init__.py +29 -0
  42. metaxy/graph/describe.py +325 -0
  43. metaxy/graph/diff/__init__.py +21 -0
  44. metaxy/graph/diff/diff_models.py +446 -0
  45. metaxy/graph/diff/differ.py +769 -0
  46. metaxy/graph/diff/models.py +443 -0
  47. metaxy/graph/diff/rendering/__init__.py +18 -0
  48. metaxy/graph/diff/rendering/base.py +323 -0
  49. metaxy/graph/diff/rendering/cards.py +188 -0
  50. metaxy/graph/diff/rendering/formatter.py +805 -0
  51. metaxy/graph/diff/rendering/graphviz.py +246 -0
  52. metaxy/graph/diff/rendering/mermaid.py +326 -0
  53. metaxy/graph/diff/rendering/rich.py +169 -0
  54. metaxy/graph/diff/rendering/theme.py +48 -0
  55. metaxy/graph/diff/traversal.py +247 -0
  56. metaxy/graph/status.py +329 -0
  57. metaxy/graph/utils.py +58 -0
  58. metaxy/metadata_store/__init__.py +32 -0
  59. metaxy/metadata_store/_ducklake_support.py +419 -0
  60. metaxy/metadata_store/base.py +1792 -0
  61. metaxy/metadata_store/bigquery.py +354 -0
  62. metaxy/metadata_store/clickhouse.py +184 -0
  63. metaxy/metadata_store/delta.py +371 -0
  64. metaxy/metadata_store/duckdb.py +446 -0
  65. metaxy/metadata_store/exceptions.py +61 -0
  66. metaxy/metadata_store/ibis.py +542 -0
  67. metaxy/metadata_store/lancedb.py +391 -0
  68. metaxy/metadata_store/memory.py +292 -0
  69. metaxy/metadata_store/system/__init__.py +57 -0
  70. metaxy/metadata_store/system/events.py +264 -0
  71. metaxy/metadata_store/system/keys.py +9 -0
  72. metaxy/metadata_store/system/models.py +129 -0
  73. metaxy/metadata_store/system/storage.py +957 -0
  74. metaxy/metadata_store/types.py +10 -0
  75. metaxy/metadata_store/utils.py +104 -0
  76. metaxy/metadata_store/warnings.py +36 -0
  77. metaxy/migrations/__init__.py +32 -0
  78. metaxy/migrations/detector.py +291 -0
  79. metaxy/migrations/executor.py +516 -0
  80. metaxy/migrations/generator.py +319 -0
  81. metaxy/migrations/loader.py +231 -0
  82. metaxy/migrations/models.py +528 -0
  83. metaxy/migrations/ops.py +447 -0
  84. metaxy/models/__init__.py +0 -0
  85. metaxy/models/bases.py +12 -0
  86. metaxy/models/constants.py +139 -0
  87. metaxy/models/feature.py +1335 -0
  88. metaxy/models/feature_spec.py +338 -0
  89. metaxy/models/field.py +263 -0
  90. metaxy/models/fields_mapping.py +307 -0
  91. metaxy/models/filter_expression.py +297 -0
  92. metaxy/models/lineage.py +285 -0
  93. metaxy/models/plan.py +232 -0
  94. metaxy/models/types.py +475 -0
  95. metaxy/py.typed +0 -0
  96. metaxy/utils/__init__.py +1 -0
  97. metaxy/utils/constants.py +2 -0
  98. metaxy/utils/exceptions.py +23 -0
  99. metaxy/utils/hashing.py +230 -0
  100. metaxy/versioning/__init__.py +31 -0
  101. metaxy/versioning/engine.py +656 -0
  102. metaxy/versioning/feature_dep_transformer.py +151 -0
  103. metaxy/versioning/ibis.py +249 -0
  104. metaxy/versioning/lineage_handler.py +205 -0
  105. metaxy/versioning/polars.py +189 -0
  106. metaxy/versioning/renamed_df.py +35 -0
  107. metaxy/versioning/types.py +63 -0
  108. metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
  109. metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
  110. metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
  111. metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,656 @@
+from __future__ import annotations
+
+import warnings
+from abc import ABC, abstractmethod
+from collections import Counter
+from collections.abc import Mapping, Sequence
+from functools import cached_property
+from typing import TYPE_CHECKING, cast
+
+import narwhals as nw
+from narwhals.typing import FrameT
+
+from metaxy.config import MetaxyConfig
+from metaxy.models.constants import (
+    METAXY_FEATURE_VERSION,
+    METAXY_PROVENANCE,
+    METAXY_PROVENANCE_BY_FIELD,
+    METAXY_SNAPSHOT_VERSION,
+)
+from metaxy.models.lineage import LineageRelationshipType
+from metaxy.models.plan import FeaturePlan, FQFieldKey
+from metaxy.models.types import FeatureKey, FieldKey
+from metaxy.utils.hashing import get_hash_truncation_length
+from metaxy.versioning.feature_dep_transformer import FeatureDepTransformer
+from metaxy.versioning.renamed_df import RenamedDataFrame
+from metaxy.versioning.types import HashAlgorithm
+
+if TYPE_CHECKING:
+    from metaxy.versioning.lineage_handler import LineageHandler
+
+
+class VersioningEngine(ABC):
+    """A class responsible for tracking sample- and field-level provenance."""
+
+    def __init__(self, plan: FeaturePlan):
+        self.plan = plan
+
+    @classmethod
+    @abstractmethod
+    def implementation(cls) -> nw.Implementation: ...
+
+    @cached_property
+    def key(self) -> FeatureKey:
+        """Feature key for the feature we are calculating provenance for."""
+        return self.plan.feature.key
+
+    @cached_property
+    def feature_transformers_by_key(self) -> dict[FeatureKey, FeatureDepTransformer]:
+        transformers = {
+            dep.feature: FeatureDepTransformer(dep=dep, plan=self.plan)
+            for dep in (self.plan.feature_deps or [])
+        }
+
+        # Make sure only ID columns are repeated across transformers
+        column_counter = Counter()
+        all_id_columns = set()
+        for transformer in transformers.values():
+            renamed_cols = transformer.renamed_columns
+            if renamed_cols is not None:
+                column_counter.update(renamed_cols)
+            all_id_columns.update(transformer.renamed_id_columns)
+
+        repeated_columns = []
+        for col, count in column_counter.items():
+            if count > 1 and col not in all_id_columns:
+                repeated_columns.append(col)
+
+        if repeated_columns:
+            raise RuntimeError(
+                f"Identified ambiguous columns while resolving upstream column selection for feature {self.key}. "
+                f"Repeated columns: {repeated_columns}. Only ID columns ({all_id_columns}) are allowed to be repeated. "
+                f"Please tweak the `rename` field on the `FeatureDep` objects of the {self.key} feature spec."
+            )
+
+        return transformers
+
+    @cached_property
+    def shared_id_columns(self) -> list[str]:
+        """Warning: the order of columns is not guaranteed."""
+        cols = set()
+        for transformer in self.feature_transformers_by_key.values():
+            cols.update(transformer.renamed_id_columns)
+
+        if not cols:
+            raise ValueError(
+                f"No shared ID columns found for upstream features of feature {self.key}. "
+                f"Please ensure that there is at least one ID column shared across all upstream features. "
+                f"Consider tweaking the `rename` field on the `FeatureDep` objects of the {self.key} feature spec, "
+                f"as ID columns are renamed before this check."
+            )
+
+        return list(cols)
+
+    def join(self, upstream: Mapping[FeatureKey, RenamedDataFrame[FrameT]]) -> FrameT:
+        """Join the renamed upstream dataframes on the intersection of the renamed id_columns of all feature specs."""
+        assert len(upstream) > 0, "No upstream dataframes provided"
+
+        key, renamed_df = next(iter(upstream.items()))
+
+        df = renamed_df.df
+
+        for next_key, renamed_df in upstream.items():
+            if key == next_key:
+                continue
+            # We do not need to provide a _suffix here because the columns are
+            # already renamed; it's on the user to specify correct renames for
+            # colliding columns.
+            df = cast(
+                FrameT, df.join(renamed_df.df, on=self.shared_id_columns, how="inner")
+            )
+
+        return df
+
+    def prepare_upstream(
+        self,
+        upstream: Mapping[FeatureKey, FrameT],
+        filters: Mapping[FeatureKey, Sequence[nw.Expr]] | None,
+    ) -> FrameT:
+        """Prepare the upstream dataframes for the given feature.
+
+        This includes, in order:
+
+        - filtering (static filters from FeatureDep.filters + additional runtime filters)
+        - renaming
+        - selecting
+
+        based on [metaxy.models.feature_spec.FeatureDep][], and joining
+        on the intersection of id_columns of all feature specs.
+
+        Args:
+            upstream: Dictionary of upstream dataframes keyed by FeatureKey
+            filters: Optional additional runtime filters to apply (combined with FeatureDep.filters)
+        """
+        assert len(upstream) > 0, "No upstream dataframes provided"
+
+        dfs: dict[FeatureKey, RenamedDataFrame[FrameT]] = {
+            k: self.feature_transformers_by_key[k].transform(
+                df, filters=(filters or {}).get(k)
+            )
+            for k, df in upstream.items()
+        }
+
+        # Drop system columns that aren't needed for provenance calculation:
+        # keep only METAXY_PROVENANCE and METAXY_PROVENANCE_BY_FIELD, and drop
+        # METAXY_FEATURE_VERSION and METAXY_SNAPSHOT_VERSION to avoid collisions.
+        columns_to_drop = [METAXY_FEATURE_VERSION, METAXY_SNAPSHOT_VERSION]
+
+        for feature_key, renamed_df in dfs.items():
+            cols = renamed_df.df.collect_schema().names()
+            cols_to_drop = [col for col in columns_to_drop if col in cols]
+            if cols_to_drop:
+                dfs[feature_key] = RenamedDataFrame(
+                    df=renamed_df.df.drop(*cols_to_drop),
+                    id_columns=renamed_df.id_columns,
+                )
+
+        # Validate no column collisions (except ID columns and required system columns)
+        if len(dfs) > 1:
+            all_columns: dict[str, list[FeatureKey]] = {}
+            for feature_key, renamed_df in dfs.items():
+                cols = renamed_df.df.collect_schema().names()
+                for col in cols:
+                    if col not in all_columns:
+                        all_columns[col] = []
+                    all_columns[col].append(feature_key)
+
+            # System columns that are allowed to collide (needed for provenance calculation)
+            from metaxy.models.constants import (
+                METAXY_CREATED_AT,
+                METAXY_DATA_VERSION,
+                METAXY_DATA_VERSION_BY_FIELD,
+                METAXY_MATERIALIZATION_ID,
+            )
+
+            allowed_system_columns = {
+                METAXY_PROVENANCE,
+                METAXY_PROVENANCE_BY_FIELD,
+                METAXY_DATA_VERSION,
+                METAXY_DATA_VERSION_BY_FIELD,
+                METAXY_CREATED_AT,
+                METAXY_MATERIALIZATION_ID,
+            }
+            id_cols = set(self.shared_id_columns)
+            colliding_columns = [
+                col
+                for col, features in all_columns.items()
+                if len(features) > 1
+                and col not in id_cols
+                and col not in allowed_system_columns
+            ]
+
+            if colliding_columns:
+                raise ValueError(
+                    f"Found additional shared columns across upstream features for feature {self.plan.feature}: {colliding_columns}. "
+                    f"Only ID columns {list(id_cols)} and required system columns {list(allowed_system_columns)} should be shared. "
+                    f"Please add explicit renames in your FeatureDep to avoid column collisions."
+                )
+
+        return self.join(dfs)
+
+    @abstractmethod
+    def hash_string_column(
+        self,
+        df: FrameT,
+        source_column: str,
+        target_column: str,
+        hash_algo: HashAlgorithm,
+    ) -> FrameT:
+        """Hash a string column using a backend-specific hash function.
+
+        Args:
+            df: Narwhals DataFrame
+            source_column: Name of the string column to hash
+            target_column: Name for the new column containing the hash
+            hash_algo: Hash algorithm to use
+
+        Returns:
+            Narwhals DataFrame with the new hashed column added.
+            The source column remains unchanged.
+        """
+        raise NotImplementedError()
+
+    @staticmethod
+    @abstractmethod
+    def build_struct_column(
+        df: FrameT,
+        struct_name: str,
+        field_columns: dict[str, str],
+    ) -> FrameT:
+        """Build a struct column from existing columns.
+
+        Args:
+            df: Narwhals DataFrame
+            struct_name: Name for the new struct column
+            field_columns: Mapping from struct field names to column names in df
+
+        Returns:
+            Narwhals DataFrame with the new struct column added.
+            The source columns remain unchanged.
+        """
+        raise NotImplementedError()
+
+    @staticmethod
+    @abstractmethod
+    def aggregate_with_string_concat(
+        df: FrameT,
+        group_by_columns: list[str],
+        concat_column: str,
+        concat_separator: str,
+        exclude_columns: list[str],
+    ) -> FrameT:
+        """Aggregate a DataFrame by grouping and concatenating strings.
+
+        Used for N:1 aggregation lineage, where multiple upstream rows are
+        aggregated into one downstream row. The concat_column strings are
+        concatenated with a separator, and other columns take their first
+        value within each group.
+
+        Args:
+            df: Narwhals DataFrame to aggregate
+            group_by_columns: Columns to group by
+            concat_column: Column containing strings to concatenate within groups
+            concat_separator: Separator to use when concatenating strings
+            exclude_columns: Columns to exclude from aggregation (typically system
+                columns that will be recalculated after aggregation)
+
+        Returns:
+            Narwhals DataFrame with one row per group, where concat_column contains
+            the concatenated strings and other columns take their first value.
+        """
+        raise NotImplementedError()
+
+    @staticmethod
+    @abstractmethod
+    def keep_latest_by_group(
+        df: FrameT,
+        group_columns: list[str],
+        timestamp_column: str,
+    ) -> FrameT:
+        """Keep only the latest row per group based on a timestamp column.
+
+        Args:
+            df: Narwhals DataFrame/LazyFrame
+            group_columns: Columns to group by (typically ID columns)
+            timestamp_column: Column used to determine "latest" (typically metaxy_created_at)
+
+        Returns:
+            Narwhals DataFrame/LazyFrame with only the latest row per group
+
+        Raises:
+            ValueError: If timestamp_column doesn't exist in df
+        """
+        raise NotImplementedError()
+
+    def get_renamed_provenance_by_field_col(self, feature_key: FeatureKey) -> str:
+        """Get the renamed provenance_by_field column name for an upstream feature."""
+        return self.feature_transformers_by_key[
+            feature_key
+        ].renamed_provenance_by_field_col
+
+    def get_renamed_data_version_by_field_col(self, feature_key: FeatureKey) -> str:
+        """Get the renamed data_version_by_field column name for an upstream feature."""
+        return self.feature_transformers_by_key[
+            feature_key
+        ].renamed_data_version_by_field_col
+
+    def get_field_provenance_exprs(
+        self,
+    ) -> dict[FieldKey, dict[FQFieldKey, nw.Expr]]:
+        """Return a mapping from field keys to the expressions that determine each field's provenance.
+
+        Each value is itself a mapping from the fully qualified field keys of
+        upstream features to an expression that selects the corresponding
+        upstream data version.
+
+        Resolves field-level dependencies. Only actual parent fields are considered.
+
+        Note:
+            This reads from upstream `metaxy_data_version_by_field` instead of `metaxy_provenance_by_field`,
+            enabling users to control version propagation by overriding data_version values.
+        """
+        res: dict[FieldKey, dict[FQFieldKey, nw.Expr]] = {}
+        # THESE LINES HERE
+        # ARE THE PINNACLE OF METAXY
+        for field_spec in self.plan.feature.fields:
+            field_provenance: dict[FQFieldKey, nw.Expr] = {}
+            for fq_key, parent_field_spec in self.plan.get_parent_fields_for_field(
+                field_spec.key
+            ).items():
+                # Read from data_version_by_field instead of provenance_by_field;
+                # this enables user-defined versioning control.
+                field_provenance[fq_key] = nw.col(
+                    self.get_renamed_data_version_by_field_col(fq_key.feature)
+                ).struct.field(parent_field_spec.key.to_struct_key())
+            res[field_spec.key] = field_provenance
+        return res
+
+    def load_upstream_with_provenance(
+        self,
+        upstream: dict[FeatureKey, FrameT],
+        hash_algo: HashAlgorithm,
+        filters: Mapping[FeatureKey, Sequence[nw.Expr]] | None,
+    ) -> FrameT:
+        """Compute the provenance of the given feature.
+
+        Args:
+            upstream: Dictionary of upstream dataframes
+            hash_algo: Hash algorithm to use
+            filters: Optional additional runtime filters to apply to upstream data (combined with FeatureDep.filters)
+
+        Returns:
+            DataFrame with metaxy_provenance_by_field and metaxy_provenance columns added
+
+        Note:
+            Hash truncation length is read from MetaxyConfig.get().hash_truncation_length
+        """
+        # Read the hash truncation length from the global config
+        hash_length = MetaxyConfig.get().hash_truncation_length or 64
+
+        # Prepare upstream: filter, rename, select, join
+        df = self.prepare_upstream(upstream, filters=filters)
+
+        # Build concatenation columns for each field
+        temp_concat_cols: dict[str, str] = {}  # field_key_str -> temp_col_name
+        field_key_strs: dict[FieldKey, str] = {}  # field_key -> field_key_str
+
+        # Get field provenance expressions
+        field_provenance_exprs = self.get_field_provenance_exprs()
+
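+        # Each field's hash input is a "|"-joined string of the form
+        #   <field_key>|<code_version>|<parent_fq_key>|<parent_version>|...
+        # with parents visited in sorted order so that the result is deterministic.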
+        for field_spec in self.plan.feature.fields:
+            field_key_str = field_spec.key.to_struct_key()
+            field_key_strs[field_spec.key] = field_key_str
+            temp_col_name = f"__concat_{field_key_str}"
+            temp_concat_cols[field_key_str] = temp_col_name
+
+            # Build the concatenation components
+            components: list[nw.Expr] = [
+                nw.lit(field_spec.key.to_string()),
+                nw.lit(str(field_spec.code_version)),
+            ]
+
+            # Add upstream provenance values in deterministic order
+            parent_field_exprs = field_provenance_exprs.get(field_spec.key, {})
+            for fq_field_key in sorted(parent_field_exprs.keys()):
+                # Add the label
+                components.append(nw.lit(fq_field_key.to_string()))
+                # Add the expression that selects the upstream provenance
+                components.append(parent_field_exprs[fq_field_key])
+
+            # Concatenate all components
+            concat_expr = nw.concat_str(components, separator="|")
+            df = df.with_columns(concat_expr.alias(temp_col_name))
+
+        # Hash each concatenation column (backend-specific)
+        temp_hash_cols: dict[str, str] = {}  # field_key_str -> hash_col_name
+        for field_key_str, concat_col in temp_concat_cols.items():
+            hash_col_name = f"__hash_{field_key_str}"
+            temp_hash_cols[field_key_str] = hash_col_name
+
+            # Hash the concatenated string column into a new column
+            df = self.hash_string_column(
+                df, concat_col, hash_col_name, hash_algo
+            ).with_columns(nw.col(hash_col_name).str.slice(0, hash_length))
+
+        # Build the provenance_by_field struct (backend-specific)
+        df = self.build_struct_column(df, METAXY_PROVENANCE_BY_FIELD, temp_hash_cols)
+
+        # Compute the sample-level provenance hash from the field-level hashes
+        df = self.hash_struct_version_column(df, hash_algorithm=hash_algo)
+
+        # Drop the temporary concat and hash columns (base-class cleanup)
+        temp_columns_to_drop = list(temp_concat_cols.values()) + list(
+            temp_hash_cols.values()
+        )
+
+        df = df.drop(*temp_columns_to_drop)
+
+        # Drop version columns if present (they come from upstream and shouldn't be in the result)
+        version_columns = ["metaxy_feature_version", "metaxy_snapshot_version"]
+        current_columns = df.collect_schema().names()
+        columns_to_drop = [col for col in version_columns if col in current_columns]
+
+        # Drop renamed upstream provenance and data_version columns (e.g., metaxy_provenance__raw_video).
+        # These were needed for provenance calculation but shouldn't be in the final result.
+        for transformer in self.feature_transformers_by_key.values():
+            renamed_prov_col = transformer.renamed_provenance_col
+            renamed_prov_by_field_col = transformer.renamed_provenance_by_field_col
+            renamed_data_version_by_field_col = (
+                transformer.renamed_data_version_by_field_col
+            )
+            if renamed_prov_col in current_columns:
+                columns_to_drop.append(renamed_prov_col)
+            if renamed_prov_by_field_col in current_columns:
+                columns_to_drop.append(renamed_prov_by_field_col)
+            if renamed_data_version_by_field_col in current_columns:
+                columns_to_drop.append(renamed_data_version_by_field_col)
+
+        if columns_to_drop:
+            df = df.drop(*columns_to_drop)
+
+        # Add data_version columns (defaulting to the provenance values)
+        from metaxy.models.constants import (
+            METAXY_DATA_VERSION,
+            METAXY_DATA_VERSION_BY_FIELD,
+        )
+
+        df = df.with_columns(
+            nw.col(METAXY_PROVENANCE).alias(METAXY_DATA_VERSION),
+            nw.col(METAXY_PROVENANCE_BY_FIELD).alias(METAXY_DATA_VERSION_BY_FIELD),
+        )
+
+        return df
+
+    def hash_struct_version_column(
+        self,
+        df: FrameT,
+        hash_algorithm: HashAlgorithm,
+        struct_column: str = METAXY_PROVENANCE_BY_FIELD,
+        hash_column: str = METAXY_PROVENANCE,
+    ) -> FrameT:
+        # Compute the sample-level provenance from the field-level provenance.
+        # Get all field names from the struct (we need the feature spec for this).
+        field_names = sorted([f.key.to_struct_key() for f in self.plan.feature.fields])
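+        # Sorting the field names fixes the concatenation order, keeping the
+        # sample-level hash stable across runs and backends.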
+
+        # Concatenate all field hashes with a separator
+        sample_components = [
+            nw.col(struct_column).struct.field(field_name) for field_name in field_names
+        ]
+        sample_concat = nw.concat_str(sample_components, separator="|")
+        df = df.with_columns(sample_concat.alias("__sample_concat"))
+
+        # Hash the concatenation to produce the final provenance hash
+        return (
+            self.hash_string_column(
+                df,
+                "__sample_concat",
+                hash_column,
+                hash_algorithm,
+            )
+            .with_columns(
+                nw.col(hash_column).str.slice(0, get_hash_truncation_length())
+            )
+            .drop("__sample_concat")
+        )
+
+    def resolve_increment_with_provenance(
+        self,
+        current: FrameT | None,
+        upstream: dict[FeatureKey, FrameT],
+        hash_algorithm: HashAlgorithm,
+        filters: Mapping[FeatureKey, Sequence[nw.Expr]],
+        sample: FrameT | None,
+    ) -> tuple[FrameT, FrameT | None, FrameT | None]:
+        """Load upstream data, filter, rename, and join it; calculate the expected provenance; and compare it with the existing provenance.
+
+        Args:
+            current: Current metadata for this feature, if available.
+            upstream: A dictionary of upstream data frames.
+            hash_algorithm: The hash algorithm to use.
+            filters: Additional runtime filters (combined with FeatureDep.filters by FeatureDepTransformer).
+            sample: For root features this is used instead of the upstream dataframes.
+                Must contain both metaxy_provenance_by_field (struct of field hashes)
+                and metaxy_provenance (hash of all field hashes concatenated).
+                IMPORTANT: metaxy_provenance must be a HASH, not a raw concatenation.
+
+        Returns:
+            A tuple of (added, changed, removed): new samples appearing in upstream,
+            samples with changed provenance (a mismatch between the expected and current
+            state), and samples that have been removed from upstream but remain in the
+            current state. The added DataFrame is never None, but may be empty; the
+            changed and removed DataFrames may be None (for the first increment on the
+            feature).
+
+        Note:
+            Hash truncation length is read from MetaxyConfig.get().hash_truncation_length
+        """
+        # Handle the root feature case
+        if sample is not None:
+            # Root features: sample is user-provided, with provenance columns already present
+            assert len(upstream) == 0, (
+                "Root features should have no upstream dependencies"
+            )
+            expected = sample
+            # Auto-compute metaxy_provenance if it is missing but metaxy_provenance_by_field exists
+            cols = expected.collect_schema().names()
+            if METAXY_PROVENANCE not in cols and METAXY_PROVENANCE_BY_FIELD in cols:
+                warnings.warn(
+                    f"Auto-computing {METAXY_PROVENANCE} from {METAXY_PROVENANCE_BY_FIELD} because it is missing in the samples DataFrame"
+                )
+                expected = self.hash_struct_version_column(
+                    expected, hash_algorithm=hash_algorithm
+                )
+
+            # Validate that root features provide both required provenance columns
+            self._check_required_provenance_columns(
+                expected, "The `sample` DataFrame (must be provided to root features)"
+            )
+        else:
+            # Normal case: compute provenance from upstream
+            expected = self.load_upstream_with_provenance(
+                upstream,
+                hash_algo=hash_algorithm,
+                filters=filters,
+            )
+
+        # Case 1: no current metadata - everything is added
+        if current is None:
+            return expected, None, None
+
+        # Cases 2 & 3: compare expected with current metadata.
+        # Validate that current has the metaxy_provenance column
+        self._check_required_provenance_columns(
+            current, "The `current` DataFrame loaded from the metadata store"
+        )
+
+        # Handle different lineage relationships before comparison
+        lineage_handler = create_lineage_handler(self.plan, self)
+        expected, current, join_columns = lineage_handler.normalize_for_comparison(
+            expected, current, hash_algorithm
+        )
+
+        current = current.rename(
+            {
+                METAXY_PROVENANCE: f"__current_{METAXY_PROVENANCE}",
+                METAXY_PROVENANCE_BY_FIELD: f"__current_{METAXY_PROVENANCE_BY_FIELD}",
+            }
+        )
+
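+        # Increment resolution via joins on the lineage join columns:
+        #   added   - expected rows with no match in current (anti join)
+        #   changed - matched rows whose provenance hash differs (inner join + filter)
+        #   removed - current rows with no match in expected (anti join)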
564
+ added = cast(
565
+ FrameT,
566
+ expected.join(
567
+ cast(FrameT, current.select(join_columns)),
568
+ on=join_columns,
569
+ how="anti",
570
+ ),
571
+ )
572
+
573
+ changed = cast(
574
+ FrameT,
575
+ expected.join(
576
+ cast(
577
+ FrameT,
578
+ current.select(*join_columns, f"__current_{METAXY_PROVENANCE}"),
579
+ ),
580
+ on=join_columns,
581
+ how="inner",
582
+ )
583
+ .filter(
584
+ nw.col(f"__current_{METAXY_PROVENANCE}").is_null()
585
+ | (
586
+ nw.col(METAXY_PROVENANCE)
587
+ != nw.col(f"__current_{METAXY_PROVENANCE}")
588
+ )
589
+ )
590
+ .drop(f"__current_{METAXY_PROVENANCE}"),
591
+ )
592
+
593
+ removed = cast(
594
+ FrameT,
595
+ current.join(
596
+ cast(FrameT, expected.select(join_columns)),
597
+ on=join_columns,
598
+ how="anti",
599
+ ).rename(
600
+ {
601
+ f"__current_{METAXY_PROVENANCE}": METAXY_PROVENANCE,
602
+ f"__current_{METAXY_PROVENANCE_BY_FIELD}": METAXY_PROVENANCE_BY_FIELD,
603
+ }
604
+ ),
605
+ )
606
+
607
+ # Return lazy frames with ID and provenance columns (caller decides whether to collect)
608
+ return added, changed, removed
609
+
610
+ def _check_required_provenance_columns(self, df: FrameT, message: str):
611
+ cols = df.collect_schema().names()
612
+
613
+ if METAXY_PROVENANCE_BY_FIELD not in cols:
614
+ raise ValueError(
615
+ f"{message} is missing required "
616
+ f"'{METAXY_PROVENANCE_BY_FIELD}' column. This column must be a struct containing the provenance of each field on this feature."
617
+ )
618
+ if METAXY_PROVENANCE not in cols:
619
+ raise ValueError(
620
+ f"{message} is missing required "
621
+ f"'{METAXY_PROVENANCE}' column. All metadata in the store must have both provenance columns. "
622
+ f"This column is automatically added by Metaxy when writing metadata."
623
+ )
624
+
625
+
626
+ def create_lineage_handler(
627
+ feature_plan: FeaturePlan,
628
+ engine: VersioningEngine,
629
+ ) -> LineageHandler:
630
+ """Factory function to create appropriate lineage handler.
631
+
632
+ Args:
633
+ feature_plan: The feature plan containing lineage information
634
+ engine: The provenance engine instance
635
+
636
+ Returns:
637
+ Appropriate LineageHandler instance based on lineage type
638
+ """
639
+ # Import handler classes at runtime to avoid circular import
640
+ from metaxy.versioning.lineage_handler import (
641
+ AggregationLineageHandler,
642
+ ExpansionLineageHandler,
643
+ IdentityLineageHandler,
644
+ )
645
+
646
+ lineage = feature_plan.feature.lineage
647
+ relationship_type = lineage.relationship.type
648
+
649
+ if relationship_type == LineageRelationshipType.IDENTITY:
650
+ return IdentityLineageHandler(feature_plan, engine)
651
+ elif relationship_type == LineageRelationshipType.AGGREGATION:
652
+ return AggregationLineageHandler(feature_plan, engine)
653
+ elif relationship_type == LineageRelationshipType.EXPANSION:
654
+ return ExpansionLineageHandler(feature_plan, engine)
655
+ else:
656
+ raise ValueError(f"Unknown lineage relationship type: {relationship_type}")