metaxy 0.0.1.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,664 @@
|
|
|
1
|
+
"""Hypothesis strategies for generating upstream reference metadata for features.
|
|
2
|
+
|
|
3
|
+
This module provides strategies for property-based testing of features that require
|
|
4
|
+
upstream metadata. The generated metadata matches the structure expected by Metaxy's
|
|
5
|
+
metadata stores, including all system columns.
|
|
6
|
+
|
|
7
|
+
Uses Polars' native parametric testing for efficient DataFrame generation.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import TYPE_CHECKING
|
|
13
|
+
|
|
14
|
+
import polars as pl
|
|
15
|
+
from hypothesis import strategies as st
|
|
16
|
+
from hypothesis.strategies import composite
|
|
17
|
+
from polars.testing.parametric import column, dataframes
|
|
18
|
+
|
|
19
|
+
from metaxy.config import MetaxyConfig
|
|
20
|
+
from metaxy.models.constants import (
|
|
21
|
+
METAXY_CREATED_AT,
|
|
22
|
+
METAXY_DATA_VERSION,
|
|
23
|
+
METAXY_DATA_VERSION_BY_FIELD,
|
|
24
|
+
METAXY_FEATURE_VERSION,
|
|
25
|
+
METAXY_MATERIALIZATION_ID,
|
|
26
|
+
METAXY_PROVENANCE,
|
|
27
|
+
METAXY_PROVENANCE_BY_FIELD,
|
|
28
|
+
METAXY_SNAPSHOT_VERSION,
|
|
29
|
+
)
|
|
30
|
+
from metaxy.models.types import FeatureKey
|
|
31
|
+
from metaxy.versioning.types import HashAlgorithm
|
|
32
|
+
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from metaxy.models.feature_spec import FeatureSpec
|
|
35
|
+
from metaxy.models.plan import FeaturePlan
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
from collections.abc import Callable
|
|
39
|
+
from typing import TYPE_CHECKING, TypeVar, overload
|
|
40
|
+
|
|
41
|
+
import polars_hash as plh
|
|
42
|
+
|
|
43
|
+
if TYPE_CHECKING:
|
|
44
|
+
from metaxy.models.feature_spec import FeatureSpec
|
|
45
|
+
from metaxy.models.plan import FeaturePlan
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Map HashAlgorithm enum to polars-hash functions.
# Each value adapts a pl.Expr via the polars-hash expression namespaces
# (`nchash` / `chash`), which pyright cannot see — hence the ignores.
_HASH_FUNCTION_MAP: dict[HashAlgorithm, Callable[[pl.Expr], pl.Expr]] = {
    HashAlgorithm.XXHASH64: lambda expr: expr.nchash.xxhash64(),  # pyright: ignore[reportAttributeAccessIssue]
    HashAlgorithm.XXHASH32: lambda expr: expr.nchash.xxhash32(),  # pyright: ignore[reportAttributeAccessIssue]
    HashAlgorithm.WYHASH: lambda expr: expr.nchash.wyhash(),  # pyright: ignore[reportAttributeAccessIssue]
    HashAlgorithm.SHA256: lambda expr: expr.chash.sha2_256(),  # pyright: ignore[reportAttributeAccessIssue]
    HashAlgorithm.MD5: lambda expr: expr.nchash.md5(),  # pyright: ignore[reportAttributeAccessIssue]
}


# Constrained TypeVar: functions annotated with it return the same frame
# flavor (eager DataFrame or LazyFrame) that they receive.
PolarsFrameT = TypeVar("PolarsFrameT", pl.DataFrame, pl.LazyFrame)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Overload: an eager DataFrame in yields an eager DataFrame out.
@overload
def calculate_provenance_by_field_polars(
    joined_upstream_df: pl.DataFrame,
    feature_spec: FeatureSpec,
    feature_plan: FeaturePlan,
    upstream_column_mapping: dict[str, str],
    hash_algorithm: HashAlgorithm,
    hash_truncation_length: int | None = None,
) -> pl.DataFrame: ...
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# Overload: a LazyFrame in yields a LazyFrame out.
@overload
def calculate_provenance_by_field_polars(
    joined_upstream_df: pl.LazyFrame,
    feature_spec: FeatureSpec,
    feature_plan: FeaturePlan,
    upstream_column_mapping: dict[str, str],
    hash_algorithm: HashAlgorithm,
    hash_truncation_length: int | None = None,
) -> pl.LazyFrame: ...
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def calculate_provenance_by_field_polars(
    joined_upstream_df: pl.DataFrame | pl.LazyFrame,
    feature_spec: FeatureSpec,
    feature_plan: FeaturePlan,
    upstream_column_mapping: dict[str, str],
    hash_algorithm: HashAlgorithm,
    hash_truncation_length: int | None = None,
) -> pl.DataFrame | pl.LazyFrame:
    """Calculate metaxy_provenance_by_field for a Polars DataFrame.

    This is a standalone function that can be used for testing or direct calculation
    without going through the Narwhals interface.

    For each field of ``feature_spec``, a hash is computed over the field key,
    its code version, and the provenance values of its upstream field
    dependencies in sorted (deterministic) order. The per-field hashes are
    gathered into a struct column named ``METAXY_PROVENANCE_BY_FIELD``.

    Args:
        joined_upstream_df: Polars DataFrame or LazyFrame with upstream data joined
        feature_spec: Feature specification
        feature_plan: Feature plan with field dependencies
        upstream_column_mapping: Maps upstream feature key -> provenance column name
        hash_algorithm: Hash algorithm to use (must be a key of _HASH_FUNCTION_MAP)
        hash_truncation_length: Optional length to truncate hashes to

    Returns:
        Polars frame of the same type as joined_upstream_df with metaxy_provenance_by_field column added

    Raises:
        ValueError: If ``hash_algorithm`` is not in ``_HASH_FUNCTION_MAP``.

    Example:
        ```python
        from metaxy.data_versioning.calculators.polars import calculate_provenance_by_field_polars
        from metaxy.versioning.types import HashAlgorithm

        result = calculate_provenance_by_field_polars(
            joined_df,
            feature_spec,
            feature_plan,
            upstream_column_mapping={"parent": "metaxy_provenance_by_field"},
            hash_algorithm=HashAlgorithm.SHA256,
            hash_truncation_length=16,
        )
        ```
    """
    if hash_algorithm not in _HASH_FUNCTION_MAP:
        raise ValueError(
            f"Hash algorithm {hash_algorithm} not supported. "
            f"Supported: {list(_HASH_FUNCTION_MAP.keys())}"
        )

    hash_fn = _HASH_FUNCTION_MAP[hash_algorithm]

    # Build hash expressions for each field
    field_exprs = {}

    for field in feature_spec.fields:
        field_key_str = field.key.to_struct_key()

        # Fields with no upstream dependencies hash only key + code version.
        field_deps = feature_plan.field_dependencies.get(field.key, {})

        # Build hash components.
        # NOTE: component order is significant — it is the hash input, so any
        # reordering changes every produced provenance value.
        components = [
            pl.lit(field_key_str),
            pl.lit(str(field.code_version)),
        ]

        # Add upstream provenance values in deterministic order
        for upstream_feature_key in sorted(field_deps.keys()):
            upstream_fields = field_deps[upstream_feature_key]
            upstream_key_str = upstream_feature_key.to_string()

            # Fall back to the default provenance column when the caller did
            # not map this upstream feature explicitly.
            provenance_col_name = upstream_column_mapping.get(
                upstream_key_str, METAXY_PROVENANCE_BY_FIELD
            )

            for upstream_field in sorted(upstream_fields):
                upstream_field_str = upstream_field.to_struct_key()

                # Pair each upstream field's qualified name with its hash value
                # so identical values under different fields hash differently.
                components.append(pl.lit(f"{upstream_key_str}/{upstream_field_str}"))
                components.append(
                    pl.col(provenance_col_name).struct.field(upstream_field_str)
                )

        # Concatenate and hash; cast because some hash functions return ints.
        concat_expr = plh.concat_str(*components, separator="|")
        hashed = hash_fn(concat_expr).cast(pl.Utf8)

        # Apply truncation if specified
        if hash_truncation_length is not None:
            hashed = hashed.str.slice(0, hash_truncation_length)

        field_exprs[field_key_str] = hashed

    # Create provenance struct
    provenance_expr = pl.struct(**field_exprs)  # type: ignore[call-overload]

    return joined_upstream_df.with_columns(
        provenance_expr.alias(METAXY_PROVENANCE_BY_FIELD)
    )
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
@composite
def feature_metadata_strategy(
    draw: st.DrawFn,
    feature_spec: FeatureSpec,
    feature_version: str,
    snapshot_version: str,
    num_rows: int | None = None,
    min_rows: int = 1,
    max_rows: int = 100,
    id_columns_df: pl.DataFrame | None = None,
) -> pl.DataFrame:
    """Generate valid metadata DataFrame for a single FeatureSpec.

    Creates a Polars DataFrame with all required Metaxy system columns and ID columns
    as defined in the feature spec. This can be used standalone or as part of
    upstream_metadata_strategy for generating aligned metadata across features.

    Uses Polars' native parametric testing for efficient generation.

    Args:
        draw: Hypothesis draw function (provided by @composite decorator)
        feature_spec: FeatureSpec to generate metadata for
        feature_version: The feature version hash to use (from FeatureGraph)
        snapshot_version: The snapshot version hash to use (from FeatureGraph)
        num_rows: Exact number of rows to generate. If None, will draw from min_rows to max_rows
        min_rows: Minimum number of rows (only used if num_rows is None, default: 1)
        max_rows: Maximum number of rows (only used if num_rows is None, default: 100)
        id_columns_df: Optional DataFrame containing ID column values to use.
            If provided, uses these values and ignores num_rows/min_rows/max_rows.
            Useful for aligning metadata across multiple features in a FeaturePlan.

    Returns:
        Polars DataFrame with ID columns and all Metaxy system columns

    Example:
        ```python
        from hypothesis import given
        from metaxy import FieldSpec, FieldKey
        from metaxy._testing.models import SampleFeatureSpec
        from metaxy._testing.parametric import feature_metadata_strategy

        spec = SampleFeatureSpec(
            key="my_feature",
            fields=[FieldSpec(key=FieldKey(["field1"]))],
        )

        @given(feature_metadata_strategy(spec, min_rows=5, max_rows=20))
        def test_something(metadata_df):
            assert len(metadata_df) >= 5
            assert "sample_uid" in metadata_df.columns
            assert "metaxy_provenance_by_field" in metadata_df.columns
        ```

    Note:
        - The provenance_by_field struct values are generated by Polars
        - System columns use actual Metaxy constant names from models.constants
    """
    # Local import keeps the module import section unchanged for callers that
    # import this module without needing datetime.
    from datetime import datetime, timezone

    # Determine number of rows: provided IDs win, then an explicit count,
    # otherwise draw one from [min_rows, max_rows].
    if id_columns_df is not None:
        num_rows_actual = len(id_columns_df)
    elif num_rows is not None:
        num_rows_actual = num_rows
    else:
        num_rows_actual = draw(st.integers(min_value=min_rows, max_value=max_rows))

    # Build list of columns for the DataFrame
    cols = []

    # Only generate ID columns when the caller did not provide them; provided
    # values are spliced in after generation instead.
    if id_columns_df is None:
        for id_col in feature_spec.id_columns:
            cols.append(
                column(
                    name=id_col,
                    dtype=pl.Int64,
                    unique=True,  # ID columns should be unique
                    allow_null=False,
                )
            )

    # provenance_by_field struct: one String field per field in the spec.
    struct_fields = [
        pl.Field(field_spec.key.to_struct_key(), pl.String)
        for field_spec in feature_spec.fields
    ]

    # Hash strings are sized by the globally configured truncation length.
    # `or 64` guarantees this is always a positive int from here on.
    hash_truncation_length = MetaxyConfig.get().hash_truncation_length or 64

    # Generate fixed-length, non-empty hash-like strings (hash values must not
    # be empty).
    hash_string_strategy = st.text(
        alphabet=st.characters(
            whitelist_categories=("Ll", "Nd"),
            whitelist_characters="abcdef0123456789",
        ),
        min_size=hash_truncation_length,
        max_size=hash_truncation_length,
    )

    cols.append(
        column(
            name=METAXY_PROVENANCE_BY_FIELD,
            dtype=pl.Struct(struct_fields),
            strategy=st.builds(
                dict, **{field.name: hash_string_strategy for field in struct_fields}
            ),
            allow_null=False,
        )
    )

    # Generate the DataFrame (without version columns yet)
    df_strategy = dataframes(
        cols=cols,
        min_size=num_rows_actual,
        max_size=num_rows_actual,
    )
    df = draw(df_strategy)

    # Add constant version columns
    df = df.with_columns(
        pl.lit(feature_version).alias(METAXY_FEATURE_VERSION),
        pl.lit(snapshot_version).alias(METAXY_SNAPSHOT_VERSION),
    )

    # Add METAXY_PROVENANCE column - hash of all field hashes concatenated.
    # Field names are sorted for determinism.
    field_names = sorted(f.key.to_struct_key() for f in feature_spec.fields)

    # Concatenate all field hashes with separator
    sample_components = [
        pl.col(METAXY_PROVENANCE_BY_FIELD).struct.field(field_name)
        for field_name in field_names
    ]
    sample_concat = plh.concat_str(*sample_components, separator="|")

    # XXHASH64 is a literal key of _HASH_FUNCTION_MAP, so direct indexing is
    # safe (no need for .get plus a dead None check).
    hash_fn = _HASH_FUNCTION_MAP[HashAlgorithm.XXHASH64]

    sample_hash = hash_fn(sample_concat).cast(pl.Utf8)

    # hash_truncation_length is always an int here (see `or 64` above), so
    # truncation is applied unconditionally.
    sample_hash = sample_hash.str.slice(0, hash_truncation_length)

    df = df.with_columns(sample_hash.alias(METAXY_PROVENANCE))

    # Add data_version columns (default to provenance values)
    df = df.with_columns(
        pl.col(METAXY_PROVENANCE).alias(METAXY_DATA_VERSION),
        pl.col(METAXY_PROVENANCE_BY_FIELD).alias(METAXY_DATA_VERSION_BY_FIELD),
    )

    # Add created_at timestamp column (timezone-aware UTC).
    df = df.with_columns(
        pl.lit(datetime.now(timezone.utc)).alias(METAXY_CREATED_AT),
    )

    # If id_columns_df was provided, replace the generated ID columns with provided ones
    if id_columns_df is not None:
        # Drop the generated ID columns and add the provided ones
        non_id_columns = [
            col for col in df.columns if col not in feature_spec.id_columns
        ]
        df = df.select(non_id_columns)

        # Add the provided ID columns
        for id_col in feature_spec.id_columns:
            if id_col not in id_columns_df.columns:
                raise ValueError(
                    f"ID column '{id_col}' from feature spec not found in id_columns_df. "
                    f"Available columns: {id_columns_df.columns}"
                )
            df = df.with_columns(id_columns_df[id_col])

    return df
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
@composite
def upstream_metadata_strategy(
    draw: st.DrawFn,
    feature_plan: FeaturePlan,
    feature_versions: dict[str, str],
    snapshot_version: str,
    min_rows: int = 1,
    max_rows: int = 100,
) -> dict[str, pl.DataFrame]:
    """Generate upstream reference metadata for a given FeaturePlan.

    Produces one Polars DataFrame per upstream dependency of ``feature_plan``.
    Each frame carries the ID columns declared by its own spec plus the Metaxy
    system columns (metaxy_provenance_by_field, metaxy_feature_version,
    metaxy_snapshot_version, ...), matching the structure metadata stores
    expect. All frames share one row count, and ID columns that appear in
    multiple specs share the same generated values so the frames can be joined.

    Uses Polars' native parametric testing for efficient generation.

    Args:
        draw: Hypothesis draw function (provided by @composite decorator)
        feature_plan: FeaturePlan containing the feature and its upstream dependencies
        feature_versions: Dict mapping feature key strings to their feature_version hashes
        snapshot_version: The snapshot version hash to use for all features
        min_rows: Minimum number of rows to generate per upstream feature (default: 1)
        max_rows: Maximum number of rows to generate per upstream feature (default: 100)

    Returns:
        Dictionary mapping upstream feature key strings to Polars DataFrames

    Raises:
        ValueError: If an upstream feature's key is missing from ``feature_versions``.

    Note:
        - The provenance_by_field struct values are generated by Polars
        - Each upstream feature respects its own ID column definition from its spec
        - System columns use actual Metaxy constant names from models.constants
    """
    # A feature with no upstream dependencies needs no upstream metadata.
    if not feature_plan.deps:
        return {}

    # Draw a single row count shared by every upstream frame (enables joins).
    row_count = draw(st.integers(min_value=min_rows, max_value=max_rows))

    # Union of ID columns across all upstream specs: generating shared values
    # once keeps overlapping columns aligned between features.
    shared_id_columns = {
        id_col for spec in feature_plan.deps for id_col in spec.id_columns
    }

    # Generate the shared ID frame with Polars parametric testing; sorted
    # column order keeps generation deterministic.
    id_frame = draw(
        dataframes(
            cols=[
                column(
                    name=name,
                    dtype=pl.Int64,
                    unique=True,
                    allow_null=False,
                )
                for name in sorted(shared_id_columns)
            ],
            min_size=row_count,
            max_size=row_count,
        )
    )

    generated: dict[str, pl.DataFrame] = {}

    for spec in feature_plan.deps:
        key_str = spec.key.to_string()
        if key_str not in feature_versions:
            raise ValueError(
                f"Feature version for '{key_str}' not found in feature_versions. "
                f"Available keys: {list(feature_versions.keys())}"
            )

        # Delegate to feature_metadata_strategy, handing it only the ID
        # columns this particular spec declares.
        generated[key_str] = draw(
            feature_metadata_strategy(
                spec,
                feature_version=feature_versions[key_str],
                snapshot_version=snapshot_version,
                id_columns_df=id_frame.select(list(spec.id_columns)),
            )
        )

    return generated
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
@composite
def downstream_metadata_strategy(
    draw: st.DrawFn,
    feature_plan: FeaturePlan,
    feature_versions: dict[str, str],
    snapshot_version: str,
    hash_algorithm: HashAlgorithm = HashAlgorithm.XXHASH64,
    min_rows: int = 1,
    max_rows: int = 100,
) -> tuple[dict[str, pl.DataFrame], pl.DataFrame]:
    """Generate upstream metadata AND correctly calculated downstream metadata.

    This strategy generates upstream metadata using upstream_metadata_strategy,
    then calculates the "golden" downstream metadata with correctly computed
    metaxy_provenance_by_field values using the Polars calculator.

    This is useful for testing that:
    - Provenance calculations are correct
    - Joins work properly
    - Hash algorithms produce expected results
    - Hash truncation works correctly

    Args:
        draw: Hypothesis draw function (provided by @composite decorator)
        feature_plan: FeaturePlan containing the feature and its upstream dependencies
        feature_versions: Dict mapping feature key strings to their feature_version hashes
            (must include the downstream feature itself)
        snapshot_version: The snapshot version hash to use for all features
        hash_algorithm: Hash algorithm to use for provenance calculation (default: XXHASH64)
        min_rows: Minimum number of rows to generate per upstream feature (default: 1)
        max_rows: Maximum number of rows to generate per upstream feature (default: 100)

    Returns:
        Tuple of (upstream_metadata, downstream_metadata):
        - upstream_metadata: Dict mapping upstream feature keys to DataFrames
        - downstream_metadata: DataFrame with correctly calculated provenance_by_field

    Raises:
        ValueError: If the downstream feature's key is missing from ``feature_versions``.

    Note:
        - The downstream feature's feature_version must be in feature_versions dict
        - Provenance is calculated using the actual Polars calculator
        - Hash algorithm and truncation settings are applied consistently
    """
    # Validate once up front: both branches below need the downstream
    # feature's version, so the duplicated per-branch check is hoisted here.
    downstream_feature_key = feature_plan.feature.key.to_string()
    if downstream_feature_key not in feature_versions:
        raise ValueError(
            f"Feature version for downstream feature '{downstream_feature_key}' not found. "
            f"Available keys: {list(feature_versions.keys())}"
        )

    # Generate upstream metadata first; the downstream feature's own version
    # is excluded because only upstream features are generated here.
    upstream_data = draw(
        upstream_metadata_strategy(
            feature_plan,
            feature_versions={
                k: v
                for k, v in feature_versions.items()
                if k != downstream_feature_key
            },
            snapshot_version=snapshot_version,
            min_rows=min_rows,
            max_rows=max_rows,
        )
    )

    # No upstream features: generate standalone downstream metadata directly.
    if not upstream_data:
        downstream_df = draw(
            feature_metadata_strategy(
                feature_plan.feature,
                feature_version=feature_versions[downstream_feature_key],
                snapshot_version=snapshot_version,
                min_rows=min_rows,
                max_rows=max_rows,
            )
        )
        return ({}, downstream_df)

    # Local imports: narwhals and the engine are only needed on this path
    # (presumably also avoids import cycles — kept local as in the original).
    import narwhals as nw

    from metaxy.versioning.polars import PolarsVersioningEngine

    from datetime import datetime, timezone

    # Create engine (only accepts plan parameter)
    engine = PolarsVersioningEngine(plan=feature_plan)

    # Convert upstream_data keys from strings to FeatureKey objects and wrap
    # the frames as Narwhals LazyFrames, as the engine expects.
    upstream_dict = {
        FeatureKey([k]): nw.from_native(v.lazy()) for k, v in upstream_data.items()
    }

    # Load upstream with provenance calculation.
    # Note: hash_length is read from MetaxyConfig.get().hash_truncation_length internally.
    downstream_df = engine.load_upstream_with_provenance(
        upstream=upstream_dict,
        hash_algo=hash_algorithm,
        filters=None,
    ).collect()

    # Attach the remaining system columns. Narwhals expressions are used here
    # because downstream_df is a Narwhals frame at this point.
    downstream_df = downstream_df.with_columns(
        nw.lit(feature_versions[downstream_feature_key]).alias(METAXY_FEATURE_VERSION),
        nw.lit(snapshot_version).alias(METAXY_SNAPSHOT_VERSION),
        # data_version columns default to the provenance values
        nw.col(METAXY_PROVENANCE).alias(METAXY_DATA_VERSION),
        nw.col(METAXY_PROVENANCE_BY_FIELD).alias(METAXY_DATA_VERSION_BY_FIELD),
        # created_at timestamp (timezone-aware UTC)
        nw.lit(datetime.now(timezone.utc)).alias(METAXY_CREATED_AT),
        # materialization_id is nullable and unset for generated metadata
        nw.lit(None, dtype=nw.String).alias(METAXY_MATERIALIZATION_ID),
    )

    # Convert back to native Polars DataFrame for the return type
    downstream_df_polars = downstream_df.to_native()

    return (upstream_data, downstream_df_polars)
|