metaxy 0.0.1.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. metaxy/__init__.py +170 -0
  2. metaxy/_packaging.py +96 -0
  3. metaxy/_testing/__init__.py +55 -0
  4. metaxy/_testing/config.py +43 -0
  5. metaxy/_testing/metaxy_project.py +780 -0
  6. metaxy/_testing/models.py +111 -0
  7. metaxy/_testing/parametric/__init__.py +13 -0
  8. metaxy/_testing/parametric/metadata.py +664 -0
  9. metaxy/_testing/pytest_helpers.py +74 -0
  10. metaxy/_testing/runbook.py +533 -0
  11. metaxy/_utils.py +35 -0
  12. metaxy/_version.py +1 -0
  13. metaxy/cli/app.py +97 -0
  14. metaxy/cli/console.py +13 -0
  15. metaxy/cli/context.py +167 -0
  16. metaxy/cli/graph.py +610 -0
  17. metaxy/cli/graph_diff.py +290 -0
  18. metaxy/cli/list.py +46 -0
  19. metaxy/cli/metadata.py +317 -0
  20. metaxy/cli/migrations.py +999 -0
  21. metaxy/cli/utils.py +268 -0
  22. metaxy/config.py +680 -0
  23. metaxy/entrypoints.py +296 -0
  24. metaxy/ext/__init__.py +1 -0
  25. metaxy/ext/dagster/__init__.py +54 -0
  26. metaxy/ext/dagster/constants.py +10 -0
  27. metaxy/ext/dagster/dagster_type.py +156 -0
  28. metaxy/ext/dagster/io_manager.py +200 -0
  29. metaxy/ext/dagster/metaxify.py +512 -0
  30. metaxy/ext/dagster/observable.py +115 -0
  31. metaxy/ext/dagster/resources.py +27 -0
  32. metaxy/ext/dagster/selection.py +73 -0
  33. metaxy/ext/dagster/table_metadata.py +417 -0
  34. metaxy/ext/dagster/utils.py +462 -0
  35. metaxy/ext/sqlalchemy/__init__.py +23 -0
  36. metaxy/ext/sqlalchemy/config.py +29 -0
  37. metaxy/ext/sqlalchemy/plugin.py +353 -0
  38. metaxy/ext/sqlmodel/__init__.py +13 -0
  39. metaxy/ext/sqlmodel/config.py +29 -0
  40. metaxy/ext/sqlmodel/plugin.py +499 -0
  41. metaxy/graph/__init__.py +29 -0
  42. metaxy/graph/describe.py +325 -0
  43. metaxy/graph/diff/__init__.py +21 -0
  44. metaxy/graph/diff/diff_models.py +446 -0
  45. metaxy/graph/diff/differ.py +769 -0
  46. metaxy/graph/diff/models.py +443 -0
  47. metaxy/graph/diff/rendering/__init__.py +18 -0
  48. metaxy/graph/diff/rendering/base.py +323 -0
  49. metaxy/graph/diff/rendering/cards.py +188 -0
  50. metaxy/graph/diff/rendering/formatter.py +805 -0
  51. metaxy/graph/diff/rendering/graphviz.py +246 -0
  52. metaxy/graph/diff/rendering/mermaid.py +326 -0
  53. metaxy/graph/diff/rendering/rich.py +169 -0
  54. metaxy/graph/diff/rendering/theme.py +48 -0
  55. metaxy/graph/diff/traversal.py +247 -0
  56. metaxy/graph/status.py +329 -0
  57. metaxy/graph/utils.py +58 -0
  58. metaxy/metadata_store/__init__.py +32 -0
  59. metaxy/metadata_store/_ducklake_support.py +419 -0
  60. metaxy/metadata_store/base.py +1792 -0
  61. metaxy/metadata_store/bigquery.py +354 -0
  62. metaxy/metadata_store/clickhouse.py +184 -0
  63. metaxy/metadata_store/delta.py +371 -0
  64. metaxy/metadata_store/duckdb.py +446 -0
  65. metaxy/metadata_store/exceptions.py +61 -0
  66. metaxy/metadata_store/ibis.py +542 -0
  67. metaxy/metadata_store/lancedb.py +391 -0
  68. metaxy/metadata_store/memory.py +292 -0
  69. metaxy/metadata_store/system/__init__.py +57 -0
  70. metaxy/metadata_store/system/events.py +264 -0
  71. metaxy/metadata_store/system/keys.py +9 -0
  72. metaxy/metadata_store/system/models.py +129 -0
  73. metaxy/metadata_store/system/storage.py +957 -0
  74. metaxy/metadata_store/types.py +10 -0
  75. metaxy/metadata_store/utils.py +104 -0
  76. metaxy/metadata_store/warnings.py +36 -0
  77. metaxy/migrations/__init__.py +32 -0
  78. metaxy/migrations/detector.py +291 -0
  79. metaxy/migrations/executor.py +516 -0
  80. metaxy/migrations/generator.py +319 -0
  81. metaxy/migrations/loader.py +231 -0
  82. metaxy/migrations/models.py +528 -0
  83. metaxy/migrations/ops.py +447 -0
  84. metaxy/models/__init__.py +0 -0
  85. metaxy/models/bases.py +12 -0
  86. metaxy/models/constants.py +139 -0
  87. metaxy/models/feature.py +1335 -0
  88. metaxy/models/feature_spec.py +338 -0
  89. metaxy/models/field.py +263 -0
  90. metaxy/models/fields_mapping.py +307 -0
  91. metaxy/models/filter_expression.py +297 -0
  92. metaxy/models/lineage.py +285 -0
  93. metaxy/models/plan.py +232 -0
  94. metaxy/models/types.py +475 -0
  95. metaxy/py.typed +0 -0
  96. metaxy/utils/__init__.py +1 -0
  97. metaxy/utils/constants.py +2 -0
  98. metaxy/utils/exceptions.py +23 -0
  99. metaxy/utils/hashing.py +230 -0
  100. metaxy/versioning/__init__.py +31 -0
  101. metaxy/versioning/engine.py +656 -0
  102. metaxy/versioning/feature_dep_transformer.py +151 -0
  103. metaxy/versioning/ibis.py +249 -0
  104. metaxy/versioning/lineage_handler.py +205 -0
  105. metaxy/versioning/polars.py +189 -0
  106. metaxy/versioning/renamed_df.py +35 -0
  107. metaxy/versioning/types.py +63 -0
  108. metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
  109. metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
  110. metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
  111. metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,664 @@
1
+ """Hypothesis strategies for generating upstream reference metadata for features.
2
+
3
+ This module provides strategies for property-based testing of features that require
4
+ upstream metadata. The generated metadata matches the structure expected by Metaxy's
5
+ metadata stores, including all system columns.
6
+
7
+ Uses Polars' native parametric testing for efficient DataFrame generation.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import TYPE_CHECKING
13
+
14
+ import polars as pl
15
+ from hypothesis import strategies as st
16
+ from hypothesis.strategies import composite
17
+ from polars.testing.parametric import column, dataframes
18
+
19
+ from metaxy.config import MetaxyConfig
20
+ from metaxy.models.constants import (
21
+ METAXY_CREATED_AT,
22
+ METAXY_DATA_VERSION,
23
+ METAXY_DATA_VERSION_BY_FIELD,
24
+ METAXY_FEATURE_VERSION,
25
+ METAXY_MATERIALIZATION_ID,
26
+ METAXY_PROVENANCE,
27
+ METAXY_PROVENANCE_BY_FIELD,
28
+ METAXY_SNAPSHOT_VERSION,
29
+ )
30
+ from metaxy.models.types import FeatureKey
31
+ from metaxy.versioning.types import HashAlgorithm
32
+
33
+ if TYPE_CHECKING:
34
+ from metaxy.models.feature_spec import FeatureSpec
35
+ from metaxy.models.plan import FeaturePlan
36
+
37
+
38
+ from collections.abc import Callable
39
+ from typing import TYPE_CHECKING, TypeVar, overload
40
+
41
+ import polars_hash as plh
42
+
43
+ if TYPE_CHECKING:
44
+ from metaxy.models.feature_spec import FeatureSpec
45
+ from metaxy.models.plan import FeaturePlan
46
+
47
+
48
# Map HashAlgorithm enum to polars-hash functions.
# NOTE: the `nchash` (non-cryptographic) and `chash` (cryptographic) expression
# namespaces are registered on pl.Expr by the polars-hash plugin at import time;
# static type checkers cannot see them, hence the pyright ignores.
_HASH_FUNCTION_MAP: dict[HashAlgorithm, Callable[[pl.Expr], pl.Expr]] = {
    HashAlgorithm.XXHASH64: lambda expr: expr.nchash.xxhash64(),  # pyright: ignore[reportAttributeAccessIssue]
    HashAlgorithm.XXHASH32: lambda expr: expr.nchash.xxhash32(),  # pyright: ignore[reportAttributeAccessIssue]
    HashAlgorithm.WYHASH: lambda expr: expr.nchash.wyhash(),  # pyright: ignore[reportAttributeAccessIssue]
    HashAlgorithm.SHA256: lambda expr: expr.chash.sha2_256(),  # pyright: ignore[reportAttributeAccessIssue]
    HashAlgorithm.MD5: lambda expr: expr.nchash.md5(),  # pyright: ignore[reportAttributeAccessIssue]
}


# Constrained TypeVar preserving DataFrame-ness vs LazyFrame-ness in generic
# signatures. NOTE(review): not referenced elsewhere in this module — possibly
# part of the public API or kept for future use; confirm before removing.
PolarsFrameT = TypeVar("PolarsFrameT", pl.DataFrame, pl.LazyFrame)
59
+
60
+
61
@overload
def calculate_provenance_by_field_polars(
    joined_upstream_df: pl.DataFrame,
    feature_spec: FeatureSpec,
    feature_plan: FeaturePlan,
    upstream_column_mapping: dict[str, str],
    hash_algorithm: HashAlgorithm,
    hash_truncation_length: int | None = None,
) -> pl.DataFrame: ...


@overload
def calculate_provenance_by_field_polars(
    joined_upstream_df: pl.LazyFrame,
    feature_spec: FeatureSpec,
    feature_plan: FeaturePlan,
    upstream_column_mapping: dict[str, str],
    hash_algorithm: HashAlgorithm,
    hash_truncation_length: int | None = None,
) -> pl.LazyFrame: ...


def calculate_provenance_by_field_polars(
    joined_upstream_df: pl.DataFrame | pl.LazyFrame,
    feature_spec: FeatureSpec,
    feature_plan: FeaturePlan,
    upstream_column_mapping: dict[str, str],
    hash_algorithm: HashAlgorithm,
    hash_truncation_length: int | None = None,
) -> pl.DataFrame | pl.LazyFrame:
    """Attach the ``metaxy_provenance_by_field`` struct column to a Polars frame.

    Standalone calculator for tests or direct use without going through the
    Narwhals interface. For each field of ``feature_spec`` a deterministic hash
    is built from the field's struct key and code version plus the provenance
    values of every upstream field it depends on, visited in sorted
    (feature key, field key) order so the result does not depend on dict order.

    Args:
        joined_upstream_df: Polars DataFrame or LazyFrame with upstream data joined.
        feature_spec: Feature specification.
        feature_plan: Feature plan with field dependencies.
        upstream_column_mapping: Maps upstream feature key -> provenance column name.
        hash_algorithm: Hash algorithm to use; must be a key of ``_HASH_FUNCTION_MAP``.
        hash_truncation_length: Optional length to truncate each hash to.

    Returns:
        Frame of the same type as ``joined_upstream_df`` with the
        ``metaxy_provenance_by_field`` column added.

    Raises:
        ValueError: If ``hash_algorithm`` has no registered hash function.

    Example:
        ```python
        result = calculate_provenance_by_field_polars(
            joined_df,
            feature_spec,
            feature_plan,
            upstream_column_mapping={"parent": "metaxy_provenance_by_field"},
            hash_algorithm=HashAlgorithm.SHA256,
            hash_truncation_length=16,
        )
        ```
    """
    if hash_algorithm not in _HASH_FUNCTION_MAP:
        raise ValueError(
            f"Hash algorithm {hash_algorithm} not supported. "
            f"Supported: {list(_HASH_FUNCTION_MAP.keys())}"
        )

    hasher = _HASH_FUNCTION_MAP[hash_algorithm]

    def _field_digest(field) -> pl.Expr:
        # One hash expression per field: concatenate the field's identity with
        # every upstream provenance value it depends on, then hash.
        struct_key = field.key.to_struct_key()
        deps = feature_plan.field_dependencies.get(field.key, {})

        parts: list[pl.Expr] = [pl.lit(struct_key), pl.lit(str(field.code_version))]
        for dep_feature_key in sorted(deps.keys()):
            dep_key_str = dep_feature_key.to_string()
            # Fall back to the canonical provenance column when the caller did
            # not map this upstream feature explicitly.
            source_col = upstream_column_mapping.get(
                dep_key_str, METAXY_PROVENANCE_BY_FIELD
            )
            for dep_field in sorted(deps[dep_feature_key]):
                dep_field_str = dep_field.to_struct_key()
                parts.append(pl.lit(f"{dep_key_str}/{dep_field_str}"))
                parts.append(pl.col(source_col).struct.field(dep_field_str))

        digest = hasher(plh.concat_str(*parts, separator="|")).cast(pl.Utf8)
        if hash_truncation_length is not None:
            digest = digest.str.slice(0, hash_truncation_length)
        return digest

    # Struct field order follows feature_spec.fields order, as before.
    field_hashes = {
        field.key.to_struct_key(): _field_digest(field)
        for field in feature_spec.fields
    }

    return joined_upstream_df.with_columns(
        pl.struct(**field_hashes).alias(METAXY_PROVENANCE_BY_FIELD)  # type: ignore[call-overload]
    )
177
+
178
+
179
@composite
def feature_metadata_strategy(
    draw: st.DrawFn,
    feature_spec: FeatureSpec,
    feature_version: str,
    snapshot_version: str,
    num_rows: int | None = None,
    min_rows: int = 1,
    max_rows: int = 100,
    id_columns_df: pl.DataFrame | None = None,
) -> pl.DataFrame:
    """Generate valid metadata DataFrame for a single FeatureSpec.

    Creates a Polars DataFrame with all required Metaxy system columns and ID columns
    as defined in the feature spec. This can be used standalone or as part of
    upstream_metadata_strategy for generating aligned metadata across features.

    Uses Polars' native parametric testing for efficient generation.

    Args:
        draw: Hypothesis draw function (provided by @composite decorator)
        feature_spec: FeatureSpec to generate metadata for
        feature_version: The feature version hash to use (from FeatureGraph)
        snapshot_version: The snapshot version hash to use (from FeatureGraph)
        num_rows: Exact number of rows to generate. If None, will draw from min_rows to max_rows
        min_rows: Minimum number of rows (only used if num_rows is None, default: 1)
        max_rows: Maximum number of rows (only used if num_rows is None, default: 100)
        id_columns_df: Optional DataFrame containing ID column values to use.
            If provided, uses these values and ignores num_rows/min_rows/max_rows.
            Useful for aligning metadata across multiple features in a FeaturePlan.

    Returns:
        Polars DataFrame with ID columns and all Metaxy system columns

    Raises:
        ValueError: If an ID column required by ``feature_spec`` is missing from
            ``id_columns_df``.

    Example:
        ```python
        from hypothesis import given
        from metaxy import FieldSpec, FieldKey
        from metaxy._testing.models import SampleFeatureSpec
        from metaxy._testing.parametric import feature_metadata_strategy

        spec = SampleFeatureSpec(
            key="my_feature",
            fields=[FieldSpec(key=FieldKey(["field1"]))],
        )

        @given(feature_metadata_strategy(spec, min_rows=5, max_rows=20))
        def test_something(metadata_df):
            assert len(metadata_df) >= 5
            assert "sample_uid" in metadata_df.columns
            assert "metaxy_provenance_by_field" in metadata_df.columns
        ```

    Note:
        - The provenance_by_field struct values are generated by Polars
        - System columns use actual Metaxy constant names from models.constants
    """
    # Row count precedence: provided ID frame wins, then explicit num_rows,
    # otherwise draw from [min_rows, max_rows].
    if id_columns_df is not None:
        num_rows_actual = len(id_columns_df)
    elif num_rows is not None:
        num_rows_actual = num_rows
    else:
        num_rows_actual = draw(st.integers(min_value=min_rows, max_value=max_rows))

    # Build list of columns for the DataFrame
    cols = []

    # Generate ID columns only when no external ID frame was supplied;
    # caller-provided ID values are attached after generation instead.
    if id_columns_df is None:
        for id_col in feature_spec.id_columns:
            cols.append(
                column(
                    name=id_col,
                    dtype=pl.Int64,
                    unique=True,  # ID columns should be unique
                    allow_null=False,
                )
            )

    # Add provenance_by_field struct column: one string field per spec field.
    struct_fields = [
        pl.Field(field_spec.key.to_struct_key(), pl.String)
        for field_spec in feature_spec.fields
    ]

    # Hash strings are fixed-length, matching the configured truncation length
    # (falls back to 64 when the config value is unset or falsy).
    hash_truncation_length = MetaxyConfig.get().hash_truncation_length or 64

    # Generate fixed-length, non-empty hash-like strings (hex-ish alphabet).
    hash_string_strategy = st.text(
        alphabet=st.characters(
            whitelist_categories=("Ll", "Nd"),
            whitelist_characters="abcdef0123456789",
        ),
        min_size=hash_truncation_length,
        max_size=hash_truncation_length,
    )

    cols.append(
        column(
            name=METAXY_PROVENANCE_BY_FIELD,
            dtype=pl.Struct(struct_fields),
            strategy=st.builds(
                dict, **{field.name: hash_string_strategy for field in struct_fields}
            ),
            allow_null=False,
        )
    )

    # Generate the DataFrame (version columns are added below).
    df = draw(
        dataframes(
            cols=cols,
            min_size=num_rows_actual,
            max_size=num_rows_actual,
        )
    )

    # Add constant version columns
    df = df.with_columns(
        pl.lit(feature_version).alias(METAXY_FEATURE_VERSION),
        pl.lit(snapshot_version).alias(METAXY_SNAPSHOT_VERSION),
    )

    # METAXY_PROVENANCE = hash of all field hashes concatenated; field names
    # are sorted for determinism.
    field_names = sorted(f.key.to_struct_key() for f in feature_spec.fields)

    sample_components = [
        pl.col(METAXY_PROVENANCE_BY_FIELD).struct.field(field_name)
        for field_name in field_names
    ]
    sample_concat = plh.concat_str(*sample_components, separator="|")

    # XXHASH64 is always registered in _HASH_FUNCTION_MAP, so index directly.
    hash_fn = _HASH_FUNCTION_MAP[HashAlgorithm.XXHASH64]

    # hash_truncation_length is always an int here (the `or 64` fallback above),
    # so truncate unconditionally.
    sample_hash = (
        hash_fn(sample_concat).cast(pl.Utf8).str.slice(0, hash_truncation_length)
    )

    df = df.with_columns(sample_hash.alias(METAXY_PROVENANCE))

    # Add data_version columns (default to provenance values)
    df = df.with_columns(
        pl.col(METAXY_PROVENANCE).alias(METAXY_DATA_VERSION),
        pl.col(METAXY_PROVENANCE_BY_FIELD).alias(METAXY_DATA_VERSION_BY_FIELD),
    )

    # Add created_at timestamp column (timezone-aware UTC).
    from datetime import datetime, timezone

    df = df.with_columns(
        pl.lit(datetime.now(timezone.utc)).alias(METAXY_CREATED_AT),
    )

    # If id_columns_df was provided, attach the caller's ID columns so metadata
    # can be aligned across multiple features sharing the same IDs.
    if id_columns_df is not None:
        # No ID columns were generated above, so this select is defensive:
        # it guarantees no stale ID columns survive before the provided ones
        # are attached.
        non_id_columns = [
            col for col in df.columns if col not in feature_spec.id_columns
        ]
        df = df.select(non_id_columns)

        # Add the provided ID columns
        for id_col in feature_spec.id_columns:
            if id_col not in id_columns_df.columns:
                raise ValueError(
                    f"ID column '{id_col}' from feature spec not found in id_columns_df. "
                    f"Available columns: {id_columns_df.columns}"
                )
            df = df.with_columns(id_columns_df[id_col])

    return df
364
+
365
+
366
@composite
def upstream_metadata_strategy(
    draw: st.DrawFn,
    feature_plan: FeaturePlan,
    feature_versions: dict[str, str],
    snapshot_version: str,
    min_rows: int = 1,
    max_rows: int = 100,
) -> dict[str, pl.DataFrame]:
    """Generate upstream reference metadata for a given FeaturePlan.

    Creates a dictionary mapping upstream feature keys to Polars DataFrames that
    contain valid Metaxy metadata. The DataFrames include all system columns
    (metaxy_provenance_by_field, metaxy_feature_version, metaxy_snapshot_version)
    and ID columns as defined in each upstream feature spec.

    Uses Polars' native parametric testing for efficient generation.

    The generated metadata has the structure expected by metadata stores:
    - ID columns (as defined per feature spec) with generated values
    - metaxy_provenance_by_field: Struct column with field keys mapped to hash strings
    - metaxy_feature_version: Feature version hash string (from FeatureGraph)
    - metaxy_snapshot_version: Snapshot version hash string (from FeatureGraph)

    Args:
        draw: Hypothesis draw function (provided by @composite decorator)
        feature_plan: FeaturePlan containing the feature and its upstream dependencies
        feature_versions: Dict mapping feature key strings to their feature_version hashes
        snapshot_version: The snapshot version hash to use for all features
        min_rows: Minimum number of rows to generate per upstream feature (default: 1)
        max_rows: Maximum number of rows to generate per upstream feature (default: 100)

    Returns:
        Dictionary mapping upstream feature key strings to Polars DataFrames

    Raises:
        ValueError: If an upstream feature's key is missing from ``feature_versions``.

    Example:
        ```python
        from hypothesis import given
        from metaxy import BaseFeature as FeatureGraph, Feature, FieldSpec, FieldKey
        from metaxy._testing.models import SampleFeatureSpec
        from metaxy._testing.parametric import upstream_metadata_strategy

        graph = FeatureGraph()
        with graph.use():
            class ParentFeature(
                Feature,
                spec=SampleFeatureSpec(
                    key="parent",
                    fields=[FieldSpec(key=FieldKey(["field1"]))],
                ),
            ):
                pass

            class ChildFeature(
                Feature,
                spec=SampleFeatureSpec(
                    key="child",
                    deps=[FeatureDep(feature="parent")],
                    fields=[FieldSpec(key=FieldKey(["result"]))],
                ),
            ):
                pass

        plan = graph.get_feature_plan(FeatureKey(["child"]))

        @given(upstream_metadata_strategy(plan))
        def test_feature_property(upstream_data):
            # upstream_data is a dict with "parent" key mapped to a valid DataFrame
            assert "parent" in upstream_data
            assert "metaxy_provenance_by_field" in upstream_data["parent"].columns
        ```

    Note:
        - The provenance_by_field struct values are generated by Polars
        - Each upstream feature respects its own ID column definition from its spec
        - For joins to work, features with overlapping ID columns will have aligned values
        - System columns use actual Metaxy constant names from models.constants
    """
    # No upstream dependencies -> nothing to generate.
    if not feature_plan.deps:
        return {}

    # Generate number of rows (same for all upstream features to enable joins)
    num_rows = draw(st.integers(min_value=min_rows, max_value=max_rows))

    # Collect all unique ID columns across all upstream features
    # and generate shared values for columns that appear in multiple features
    all_id_columns: set[str] = set()
    for upstream_spec in feature_plan.deps:
        all_id_columns.update(upstream_spec.id_columns)

    # Generate ONE shared DataFrame with all unique ID columns; slicing it per
    # feature below is what keeps overlapping ID columns aligned across frames.
    id_cols = [
        column(
            name=id_col,
            dtype=pl.Int64,
            unique=True,
            allow_null=False,
        )
        for id_col in sorted(all_id_columns)  # Sort for deterministic ordering
    ]

    id_columns_df_strategy = dataframes(
        cols=id_cols,
        min_size=num_rows,
        max_size=num_rows,
    )
    id_columns_df = draw(id_columns_df_strategy)

    # Generate metadata for each upstream feature using feature_metadata_strategy
    result: dict[str, pl.DataFrame] = {}

    for upstream_spec in feature_plan.deps:
        # Get the feature version for this upstream feature; fail fast with a
        # descriptive error rather than a KeyError deep inside the strategy.
        feature_key_str = upstream_spec.key.to_string()
        if feature_key_str not in feature_versions:
            raise ValueError(
                f"Feature version for '{feature_key_str}' not found in feature_versions. "
                f"Available keys: {list(feature_versions.keys())}"
            )
        feature_version = feature_versions[feature_key_str]

        # Use feature_metadata_strategy to generate metadata for this spec
        # Pass only the ID columns that this feature needs
        upstream_id_df = id_columns_df.select(list(upstream_spec.id_columns))

        df = draw(
            feature_metadata_strategy(
                upstream_spec,
                feature_version=feature_version,
                snapshot_version=snapshot_version,
                id_columns_df=upstream_id_df,
            )
        )

        # Store using feature key string
        result[feature_key_str] = df

    return result
504
+
505
+
506
@composite
def downstream_metadata_strategy(
    draw: st.DrawFn,
    feature_plan: FeaturePlan,
    feature_versions: dict[str, str],
    snapshot_version: str,
    hash_algorithm: HashAlgorithm = HashAlgorithm.XXHASH64,
    min_rows: int = 1,
    max_rows: int = 100,
) -> tuple[dict[str, pl.DataFrame], pl.DataFrame]:
    """Generate upstream metadata AND correctly calculated downstream metadata.

    This strategy generates upstream metadata using upstream_metadata_strategy,
    then calculates the "golden" downstream metadata with correctly computed
    metaxy_provenance_by_field values using the Polars calculator.

    This is useful for testing that:
    - Provenance calculations are correct
    - Joins work properly
    - Hash algorithms produce expected results
    - Hash truncation works correctly

    Args:
        draw: Hypothesis draw function (provided by @composite decorator)
        feature_plan: FeaturePlan containing the feature and its upstream dependencies
        feature_versions: Dict mapping feature key strings to their feature_version hashes
            (must include the downstream feature itself)
        snapshot_version: The snapshot version hash to use for all features
        hash_algorithm: Hash algorithm to use for provenance calculation (default: XXHASH64)
        min_rows: Minimum number of rows to generate per upstream feature (default: 1)
        max_rows: Maximum number of rows to generate per upstream feature (default: 100)

    Returns:
        Tuple of (upstream_metadata, downstream_metadata):
        - upstream_metadata: Dict mapping upstream feature keys to DataFrames
        - downstream_metadata: DataFrame with correctly calculated provenance_by_field

    Raises:
        ValueError: If the downstream feature's key is missing from ``feature_versions``.

    Example:
        ```python
        from hypothesis import given
        from metaxy import BaseFeature as FeatureGraph, FeatureKey
        from metaxy._testing.parametric import downstream_metadata_strategy
        from metaxy.versioning.types import HashAlgorithm

        graph = FeatureGraph()
        # ... define features ...

        plan = graph.get_feature_plan(FeatureKey(["child"]))

        # Get versions from graph
        feature_versions = {
            "parent": graph.get_feature_by_key(FeatureKey(["parent"])).feature_version(),
            "child": graph.get_feature_by_key(FeatureKey(["child"])).feature_version(),
        }
        snapshot_version = graph.snapshot_version()

        @given(downstream_metadata_strategy(
            plan,
            feature_versions=feature_versions,
            snapshot_version=snapshot_version,
            hash_algorithm=HashAlgorithm.SHA256,
        ))
        def test_provenance_calculation(data):
            upstream_data, downstream_df = data
            # Test that downstream_df has correctly calculated provenance
            assert "metaxy_provenance_by_field" in downstream_df.columns
        ```

    Note:
        - The downstream feature's feature_version must be in feature_versions dict
        - Provenance is calculated using the actual Polars calculator
        - Hash algorithm and truncation settings are applied consistently
    """
    # Generate upstream metadata first; the downstream feature's own version is
    # filtered out since it is not an upstream dependency of itself.
    upstream_data = draw(
        upstream_metadata_strategy(
            feature_plan,
            feature_versions={
                k: v
                for k, v in feature_versions.items()
                if k != feature_plan.feature.key.to_string()
            },
            snapshot_version=snapshot_version,
            min_rows=min_rows,
            max_rows=max_rows,
        )
    )

    # If there are no upstream features, return empty upstream and just the downstream
    if not upstream_data:
        # Generate standalone downstream metadata
        downstream_feature_key = feature_plan.feature.key.to_string()
        if downstream_feature_key not in feature_versions:
            raise ValueError(
                f"Feature version for downstream feature '{downstream_feature_key}' not found. "
                f"Available keys: {list(feature_versions.keys())}"
            )

        # NOTE(review): this branch does not add METAXY_MATERIALIZATION_ID,
        # unlike the upstream path below — confirm whether the schemas of the
        # two return paths are meant to differ.
        downstream_df = draw(
            feature_metadata_strategy(
                feature_plan.feature,
                feature_version=feature_versions[downstream_feature_key],
                snapshot_version=snapshot_version,
                min_rows=min_rows,
                max_rows=max_rows,
            )
        )
        return ({}, downstream_df)

    # Use the new PolarsVersioningEngine to calculate provenance
    import narwhals as nw

    from metaxy.versioning.polars import PolarsVersioningEngine

    # Create engine (only accepts plan parameter)
    engine = PolarsVersioningEngine(plan=feature_plan)

    # Convert upstream_data keys from strings to FeatureKey objects and wrap in Narwhals
    # Keys are simple strings like "parent", "child" that need to be wrapped in a list
    # DataFrames need to be converted to LazyFrames and wrapped in Narwhals
    # NOTE(review): FeatureKey([k]) assumes single-part keys; a multi-part key
    # serialized by to_string() would not round-trip here — confirm.
    upstream_dict = {
        FeatureKey([k]): nw.from_native(v.lazy()) for k, v in upstream_data.items()
    }

    # Load upstream with provenance calculation
    # Note: hash_length is read from MetaxyConfig.get().hash_truncation_length internally
    downstream_df = engine.load_upstream_with_provenance(
        upstream=upstream_dict,
        hash_algo=hash_algorithm,
        filters=None,
    ).collect()

    # Add downstream feature version and snapshot version
    downstream_feature_key = feature_plan.feature.key.to_string()
    if downstream_feature_key not in feature_versions:
        raise ValueError(
            f"Feature version for downstream feature '{downstream_feature_key}' not found. "
            f"Available keys: {list(feature_versions.keys())}"
        )

    # Use Narwhals lit since downstream_df is a Narwhals DataFrame
    from datetime import datetime, timezone

    downstream_df = downstream_df.with_columns(
        nw.lit(feature_versions[downstream_feature_key]).alias(METAXY_FEATURE_VERSION),
        nw.lit(snapshot_version).alias(METAXY_SNAPSHOT_VERSION),
        # Add data_version columns (default to provenance)
        nw.col(METAXY_PROVENANCE).alias(METAXY_DATA_VERSION),
        nw.col(METAXY_PROVENANCE_BY_FIELD).alias(METAXY_DATA_VERSION_BY_FIELD),
        # Add created_at timestamp
        nw.lit(datetime.now(timezone.utc)).alias(METAXY_CREATED_AT),
        # Add materialization_id (nullable)
        nw.lit(None, dtype=nw.String).alias(METAXY_MATERIALIZATION_ID),
    )

    # Convert back to native Polars DataFrame for the return type
    downstream_df_polars = downstream_df.to_native()

    return (upstream_data, downstream_df_polars)