metaxy 0.0.1.dev3__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
metaxy/versioning/engine.py

@@ -0,0 +1,656 @@

```python
from __future__ import annotations

import warnings
from abc import ABC, abstractmethod
from collections import Counter
from collections.abc import Mapping, Sequence
from functools import cached_property
from typing import TYPE_CHECKING, cast

import narwhals as nw
from narwhals.typing import FrameT

from metaxy.config import MetaxyConfig
from metaxy.models.constants import (
    METAXY_FEATURE_VERSION,
    METAXY_PROVENANCE,
    METAXY_PROVENANCE_BY_FIELD,
    METAXY_SNAPSHOT_VERSION,
)
from metaxy.models.lineage import LineageRelationshipType
from metaxy.models.plan import FeaturePlan, FQFieldKey
from metaxy.models.types import FeatureKey, FieldKey
from metaxy.utils.hashing import get_hash_truncation_length
from metaxy.versioning.feature_dep_transformer import FeatureDepTransformer
from metaxy.versioning.renamed_df import RenamedDataFrame
from metaxy.versioning.types import HashAlgorithm

if TYPE_CHECKING:
    from metaxy.versioning.lineage_handler import LineageHandler


class VersioningEngine(ABC):
    """A class responsible for tracking sample- and field-level provenance."""

    def __init__(self, plan: FeaturePlan):
        self.plan = plan

    @classmethod
    @abstractmethod
    def implementation(cls) -> nw.Implementation: ...

    @cached_property
    def key(self) -> FeatureKey:
        """Feature key for the feature we are calculating provenance for."""
        return self.plan.feature.key

    @cached_property
    def feature_transformers_by_key(self) -> dict[FeatureKey, FeatureDepTransformer]:
        transformers = {
            dep.feature: FeatureDepTransformer(dep=dep, plan=self.plan)
            for dep in (self.plan.feature_deps or [])
        }
        # Make sure only ID columns are repeated across transformers.
        column_counter = Counter()
        all_id_columns = set()
        for transformer in transformers.values():
            renamed_cols = transformer.renamed_columns
            if renamed_cols is not None:
                column_counter.update(renamed_cols)
            all_id_columns.update(transformer.renamed_id_columns)

        repeated_columns = []
        for col, count in column_counter.items():
            if count > 1 and col not in all_id_columns:
                repeated_columns.append(col)

        if repeated_columns:
            raise RuntimeError(
                f"Identified ambiguous columns while resolving upstream column "
                f"selection for feature {self.key}. Repeated columns: "
                f"{repeated_columns}. Only ID columns ({all_id_columns}) are "
                f"allowed to be repeated. Please tweak the `rename` field on the "
                f"`FeatureDep` objects of the {self.key} feature spec."
            )

        return transformers

    @cached_property
    def shared_id_columns(self) -> list[str]:
        """Warning: order of columns is not guaranteed."""
        cols = set()
        for transformer in self.feature_transformers_by_key.values():
            cols.update(transformer.renamed_id_columns)

        if not cols:
            raise ValueError(
                f"No shared ID columns found for upstream features of feature "
                f"{self.key}. Please ensure that there is at least one ID column "
                f"shared across all upstream features. Consider tweaking the "
                f"`rename` field on the `FeatureDep` objects of the {self.key} "
                f"feature spec, as ID columns are renamed before this check."
            )

        return list(cols)

    def join(self, upstream: Mapping[FeatureKey, RenamedDataFrame[FrameT]]) -> FrameT:
        """Join the renamed upstream dataframes on the intersection of renamed id_columns of all feature specs."""
        assert len(upstream) > 0, "No upstream dataframes provided"

        key, renamed_df = next(iter(upstream.items()))

        df = renamed_df.df

        for next_key, renamed_df in upstream.items():
            if key == next_key:
                continue
            # We do not need to provide a _suffix here because the columns are
            # already renamed; it's on the user to specify correct renames for
            # colliding columns.
            df = cast(
                FrameT, df.join(renamed_df.df, on=self.shared_id_columns, how="inner")
            )

        return df

    def prepare_upstream(
        self,
        upstream: Mapping[FeatureKey, FrameT],
        filters: Mapping[FeatureKey, Sequence[nw.Expr]] | None,
    ) -> FrameT:
        """Prepare the upstream dataframes for the given feature.

        This includes, in order:

        - filtering (static filters from FeatureDep.filters + additional runtime filters)
        - renaming
        - selecting

        based on [metaxy.models.feature_spec.FeatureDep][], and joining
        on the intersection of id_columns of all feature specs.

        Args:
            upstream: Dictionary of upstream dataframes keyed by FeatureKey
            filters: Optional additional runtime filters to apply (combined with FeatureDep.filters)
        """
        assert len(upstream) > 0, "No upstream dataframes provided"

        dfs: dict[FeatureKey, RenamedDataFrame[FrameT]] = {
            k: self.feature_transformers_by_key[k].transform(
                df, filters=(filters or {}).get(k)
            )
            for k, df in upstream.items()
        }

        # Drop system columns that aren't needed for provenance calculation.
        # Keep only METAXY_PROVENANCE and METAXY_PROVENANCE_BY_FIELD; drop
        # METAXY_FEATURE_VERSION and METAXY_SNAPSHOT_VERSION to avoid collisions.
        columns_to_drop = [METAXY_FEATURE_VERSION, METAXY_SNAPSHOT_VERSION]

        for feature_key, renamed_df in dfs.items():
            cols = renamed_df.df.collect_schema().names()
            cols_to_drop = [col for col in columns_to_drop if col in cols]
            if cols_to_drop:
                dfs[feature_key] = RenamedDataFrame(
                    df=renamed_df.df.drop(*cols_to_drop),
                    id_columns=renamed_df.id_columns,
                )

        # Validate no column collisions (except ID columns and required system columns).
        if len(dfs) > 1:
            all_columns: dict[str, list[FeatureKey]] = {}
            for feature_key, renamed_df in dfs.items():
                cols = renamed_df.df.collect_schema().names()
                for col in cols:
                    if col not in all_columns:
                        all_columns[col] = []
                    all_columns[col].append(feature_key)

            # System columns that are allowed to collide (needed for provenance calculation).
            from metaxy.models.constants import (
                METAXY_CREATED_AT,
                METAXY_DATA_VERSION,
                METAXY_DATA_VERSION_BY_FIELD,
                METAXY_MATERIALIZATION_ID,
            )

            allowed_system_columns = {
                METAXY_PROVENANCE,
                METAXY_PROVENANCE_BY_FIELD,
                METAXY_DATA_VERSION,
                METAXY_DATA_VERSION_BY_FIELD,
                METAXY_CREATED_AT,
                METAXY_MATERIALIZATION_ID,
            }
            id_cols = set(self.shared_id_columns)
            colliding_columns = [
                col
                for col, features in all_columns.items()
                if len(features) > 1
                and col not in id_cols
                and col not in allowed_system_columns
            ]

            if colliding_columns:
                raise ValueError(
                    f"Found additional shared columns across upstream features for feature {self.plan.feature}: {colliding_columns}. "
                    f"Only ID columns {list(id_cols)} and required system columns {list(allowed_system_columns)} should be shared. "
                    f"Please add explicit renames in your FeatureDep to avoid column collisions."
                )

        return self.join(dfs)

    @abstractmethod
    def hash_string_column(
        self,
        df: FrameT,
        source_column: str,
        target_column: str,
        hash_algo: HashAlgorithm,
    ) -> FrameT:
        """Hash a string column using a backend-specific hash function.

        Args:
            df: Narwhals DataFrame
            source_column: Name of string column to hash
            target_column: Name for the new column containing the hash
            hash_algo: Hash algorithm to use

        Returns:
            Narwhals DataFrame with the new hashed column added.
            The source column remains unchanged.
        """
        raise NotImplementedError()
```
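As an aside, a minimal sketch of what a concrete `hash_string_column` might look like, written against plain polars and hashlib rather than narwhals. The function below is hypothetical (it hard-codes SHA-256 and skips `HashAlgorithm` dispatch) and is not the backend shipped in this wheel:

```python
import hashlib

import polars as pl


def hash_string_column(df: pl.DataFrame, source: str, target: str) -> pl.DataFrame:
    # Add `target` containing the hex SHA-256 of `source`; `source` stays unchanged.
    return df.with_columns(
        pl.col(source)
        .map_elements(
            lambda s: hashlib.sha256(s.encode()).hexdigest(),
            return_dtype=pl.String,
        )
        .alias(target)
    )


df = pl.DataFrame({"payload": ["a|1", "b|2"]})
print(hash_string_column(df, "payload", "payload_hash"))
```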
```python
    @staticmethod
    @abstractmethod
    def build_struct_column(
        df: FrameT,
        struct_name: str,
        field_columns: dict[str, str],
    ) -> FrameT:
        """Build a struct column from existing columns.

        Args:
            df: Narwhals DataFrame
            struct_name: Name for the new struct column
            field_columns: Mapping from struct field names to column names in df

        Returns:
            Narwhals DataFrame with new struct column added.
            The source columns remain unchanged.
        """
        raise NotImplementedError()
```
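Similarly, a hypothetical `build_struct_column` in plain polars, assuming `pl.struct` with named expressions; illustrative only, not the wheel's actual implementation:

```python
import polars as pl


def build_struct_column(
    df: pl.DataFrame, struct_name: str, field_columns: dict[str, str]
) -> pl.DataFrame:
    # Pack the mapped source columns into one struct column; sources are kept.
    return df.with_columns(
        pl.struct(
            **{field: pl.col(col) for field, col in field_columns.items()}
        ).alias(struct_name)
    )


df = pl.DataFrame({"__hash_a": ["x1"], "__hash_b": ["y2"]})
print(build_struct_column(df, "provenance_by_field", {"a": "__hash_a", "b": "__hash_b"}))
```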
```python
    @staticmethod
    @abstractmethod
    def aggregate_with_string_concat(
        df: FrameT,
        group_by_columns: list[str],
        concat_column: str,
        concat_separator: str,
        exclude_columns: list[str],
    ) -> FrameT:
        """Aggregate DataFrame by grouping and concatenating strings.

        Used for N:1 aggregation lineage where multiple upstream rows
        are aggregated into one downstream row. The concat_column strings
        are concatenated with a separator, and other columns take their
        first value within each group.

        Args:
            df: Narwhals DataFrame to aggregate
            group_by_columns: Columns to group by
            concat_column: Column containing strings to concatenate within groups
            concat_separator: Separator to use when concatenating strings
            exclude_columns: Columns to exclude from aggregation (typically system
                columns that will be recalculated after aggregation)

        Returns:
            Narwhals DataFrame with one row per group, with concat_column containing
            concatenated strings and other columns taking their first value.
        """
        raise NotImplementedError()
```
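A sketch of this N:1 aggregation contract in plain polars, assuming `str.join` for within-group concatenation and `first()` for the remaining columns; illustrative only:

```python
import polars as pl


def aggregate_with_string_concat(
    df: pl.DataFrame,
    group_by_columns: list[str],
    concat_column: str,
    concat_separator: str,
    exclude_columns: list[str],
) -> pl.DataFrame:
    # Columns that survive aggregation with their first value per group.
    keep = [
        c
        for c in df.columns
        if c not in group_by_columns and c != concat_column and c not in exclude_columns
    ]
    return df.group_by(group_by_columns).agg(
        pl.col(concat_column).str.join(concat_separator),
        *[pl.col(c).first() for c in keep],
    )


df = pl.DataFrame({"doc": [1, 1, 2], "chunk_hash": ["a", "b", "c"], "model": ["m", "m", "m"]})
print(aggregate_with_string_concat(df, ["doc"], "chunk_hash", "|", []))
```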
```python
    @staticmethod
    @abstractmethod
    def keep_latest_by_group(
        df: FrameT,
        group_columns: list[str],
        timestamp_column: str,
    ) -> FrameT:
        """Keep only the latest row per group based on a timestamp column.

        Args:
            df: Narwhals DataFrame/LazyFrame
            group_columns: Columns to group by (typically ID columns)
            timestamp_column: Column to use for determining "latest" (typically metaxy_created_at)

        Returns:
            Narwhals DataFrame/LazyFrame with only the latest row per group

        Raises:
            ValueError: If timestamp_column doesn't exist in df
        """
        raise NotImplementedError()
```
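A sketch of `keep_latest_by_group` in plain polars using a window max; note that in this hypothetical version, ties on the timestamp keep all tied rows:

```python
import polars as pl


def keep_latest_by_group(
    df: pl.DataFrame, group_columns: list[str], timestamp_column: str
) -> pl.DataFrame:
    if timestamp_column not in df.columns:
        raise ValueError(f"Column {timestamp_column!r} not found")
    # Keep rows whose timestamp equals the per-group maximum.
    return df.filter(
        pl.col(timestamp_column) == pl.col(timestamp_column).max().over(group_columns)
    )


df = pl.DataFrame({"id": [1, 1, 2], "metaxy_created_at": [1, 3, 2], "v": ["a", "b", "c"]})
print(keep_latest_by_group(df, ["id"], "metaxy_created_at"))
```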
```python
    def get_renamed_provenance_by_field_col(self, feature_key: FeatureKey) -> str:
        """Get the renamed provenance_by_field column name for an upstream feature."""
        return self.feature_transformers_by_key[
            feature_key
        ].renamed_provenance_by_field_col

    def get_renamed_data_version_by_field_col(self, feature_key: FeatureKey) -> str:
        """Get the renamed data_version_by_field column name for an upstream feature."""
        return self.feature_transformers_by_key[
            feature_key
        ].renamed_data_version_by_field_col

    def get_field_provenance_exprs(
        self,
    ) -> dict[FieldKey, dict[FQFieldKey, nw.Expr]]:
        """Returns a mapping from field keys to data structures that determine provenances for each field.

        Each value is itself a mapping from fully qualified field keys of upstream
        features to an expression that selects the corresponding upstream data version.

        Resolves field-level dependencies. Only actual parent fields are considered.

        Note:
            This reads from upstream `metaxy_data_version_by_field` instead of
            `metaxy_provenance_by_field`, enabling users to control version
            propagation by overriding data_version values.
        """
        res: dict[FieldKey, dict[FQFieldKey, nw.Expr]] = {}
        # THESE LINES HERE
        # ARE THE PINNACLE OF METAXY
        for field_spec in self.plan.feature.fields:
            field_provenance: dict[FQFieldKey, nw.Expr] = {}
            for fq_key, parent_field_spec in self.plan.get_parent_fields_for_field(
                field_spec.key
            ).items():
                # Read from data_version_by_field instead of provenance_by_field.
                # This enables user-defined versioning control.
                field_provenance[fq_key] = nw.col(
                    self.get_renamed_data_version_by_field_col(fq_key.feature)
                ).struct.field(parent_field_spec.key.to_struct_key())
            res[field_spec.key] = field_provenance
        return res

    def load_upstream_with_provenance(
        self,
        upstream: dict[FeatureKey, FrameT],
        hash_algo: HashAlgorithm,
        filters: Mapping[FeatureKey, Sequence[nw.Expr]] | None,
    ) -> FrameT:
        """Compute the provenance of the given feature.

        Args:
            upstream: Dictionary of upstream dataframes
            hash_algo: Hash algorithm to use
            filters: Optional additional runtime filters to apply to upstream data
                (combined with FeatureDep.filters)

        Returns:
            DataFrame with metaxy_provenance_by_field and metaxy_provenance columns added

        Note:
            Hash truncation length is read from MetaxyConfig.get().hash_truncation_length
        """
        # Read hash truncation length from global config.
        hash_length = MetaxyConfig.get().hash_truncation_length or 64

        # Prepare upstream: filter, rename, select, join.
        df = self.prepare_upstream(upstream, filters=filters)

        # Build concatenation columns for each field.
        temp_concat_cols: dict[str, str] = {}  # field_key_str -> temp_col_name
        field_key_strs: dict[FieldKey, str] = {}  # field_key -> field_key_str

        # Get field provenance expressions.
        field_provenance_exprs = self.get_field_provenance_exprs()

        for field_spec in self.plan.feature.fields:
            field_key_str = field_spec.key.to_struct_key()
            field_key_strs[field_spec.key] = field_key_str
            temp_col_name = f"__concat_{field_key_str}"
            temp_concat_cols[field_key_str] = temp_col_name

            # Build concatenation components.
            components: list[nw.Expr] = [
                nw.lit(field_spec.key.to_string()),
                nw.lit(str(field_spec.code_version)),
            ]

            # Add upstream provenance values in deterministic order.
            parent_field_exprs = field_provenance_exprs.get(field_spec.key, {})
            for fq_field_key in sorted(parent_field_exprs.keys()):
                # Add the label, then the expression that selects the upstream provenance.
                components.append(nw.lit(fq_field_key.to_string()))
                components.append(parent_field_exprs[fq_field_key])

            # Concatenate all components.
            concat_expr = nw.concat_str(components, separator="|")
            df = df.with_columns(concat_expr.alias(temp_col_name))

        # Hash each concatenation column (the backend does this).
        temp_hash_cols: dict[str, str] = {}  # field_key_str -> hash_col_name
        for field_key_str, concat_col in temp_concat_cols.items():
            hash_col_name = f"__hash_{field_key_str}"
            temp_hash_cols[field_key_str] = hash_col_name

            # Hash the concatenated string column into a new column.
            df = self.hash_string_column(
                df, concat_col, hash_col_name, hash_algo
            ).with_columns(nw.col(hash_col_name).str.slice(0, hash_length))

        # Build the provenance_by_field struct (the backend does this).
        df = self.build_struct_column(df, METAXY_PROVENANCE_BY_FIELD, temp_hash_cols)

        # Compute the sample-level provenance hash by concatenating all field hashes.
        df = self.hash_struct_version_column(df, hash_algorithm=hash_algo)

        # Drop temporary concat and hash columns (base-class cleanup).
        temp_columns_to_drop = list(temp_concat_cols.values()) + list(
            temp_hash_cols.values()
        )

        df = df.drop(*temp_columns_to_drop)

        # Drop version columns if present (they come from upstream and shouldn't be in the result).
        version_columns = ["metaxy_feature_version", "metaxy_snapshot_version"]
        current_columns = df.collect_schema().names()
        columns_to_drop = [col for col in version_columns if col in current_columns]

        # Drop renamed upstream provenance and data_version columns (e.g., metaxy_provenance__raw_video).
        # These were needed for provenance calculation but shouldn't be in the final result.
        for transformer in self.feature_transformers_by_key.values():
            renamed_prov_col = transformer.renamed_provenance_col
            renamed_prov_by_field_col = transformer.renamed_provenance_by_field_col
            renamed_data_version_by_field_col = (
                transformer.renamed_data_version_by_field_col
            )
            if renamed_prov_col in current_columns:
                columns_to_drop.append(renamed_prov_col)
            if renamed_prov_by_field_col in current_columns:
                columns_to_drop.append(renamed_prov_by_field_col)
            if renamed_data_version_by_field_col in current_columns:
                columns_to_drop.append(renamed_data_version_by_field_col)

        if columns_to_drop:
            df = df.drop(*columns_to_drop)

        # Add data_version columns (default to provenance values).
        from metaxy.models.constants import (
            METAXY_DATA_VERSION,
            METAXY_DATA_VERSION_BY_FIELD,
        )

        df = df.with_columns(
            nw.col(METAXY_PROVENANCE).alias(METAXY_DATA_VERSION),
            nw.col(METAXY_PROVENANCE_BY_FIELD).alias(METAXY_DATA_VERSION_BY_FIELD),
        )

        return df
```
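To make the hashing recipe above concrete, here is the same scheme written as plain Python: a field-level hash over the field key, its code version, and the sorted upstream (label, version) pairs, then a sample-level hash over the field hashes. The string values below are invented placeholders, not real Metaxy output:

```python
import hashlib


def h(s: str, length: int = 64) -> str:
    # Hex SHA-256 truncated to the configured length.
    return hashlib.sha256(s.encode()).hexdigest()[:length]


# Field-level: field key | code version | sorted upstream label/version pairs.
field_hash = h("|".join([
    "embedding",         # field_spec.key.to_string() (invented)
    "2",                 # str(field_spec.code_version) (invented)
    "raw_video/frames",  # fq_field_key.to_string() (invented)
    "9f3ab1",            # upstream data_version_by_field entry (invented)
]))

# Sample-level: field hashes joined in sorted field order, hashed again.
provenance_by_field = {"embedding": field_hash}
provenance = h("|".join(provenance_by_field[k] for k in sorted(provenance_by_field)))
print(provenance_by_field, provenance)
```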
```python
    def hash_struct_version_column(
        self,
        df: FrameT,
        hash_algorithm: HashAlgorithm,
        struct_column: str = METAXY_PROVENANCE_BY_FIELD,
        hash_column: str = METAXY_PROVENANCE,
    ) -> FrameT:
        # Compute sample-level provenance from field-level provenance.
        # Get all field names from the struct (we need the feature spec for this).
        field_names = sorted([f.key.to_struct_key() for f in self.plan.feature.fields])

        # Concatenate all field hashes with a separator.
        sample_components = [
            nw.col(struct_column).struct.field(field_name) for field_name in field_names
        ]
        sample_concat = nw.concat_str(sample_components, separator="|")
        df = df.with_columns(sample_concat.alias("__sample_concat"))

        # Hash the concatenation to produce the final provenance hash.
        return (
            self.hash_string_column(
                df,
                "__sample_concat",
                hash_column,
                hash_algorithm,
            )
            .with_columns(
                nw.col(hash_column).str.slice(0, get_hash_truncation_length())
            )
            .drop("__sample_concat")
        )

    def resolve_increment_with_provenance(
        self,
        current: FrameT | None,
        upstream: dict[FeatureKey, FrameT],
        hash_algorithm: HashAlgorithm,
        filters: Mapping[FeatureKey, Sequence[nw.Expr]],
        sample: FrameT | None,
    ) -> tuple[FrameT, FrameT | None, FrameT | None]:
        """Load upstream data, filter, rename, and join it, calculate the expected provenance, and compare it with the existing provenance.

        Args:
            current: Current metadata for this feature, if available.
            upstream: A dictionary of upstream data frames.
            hash_algorithm: The hash algorithm to use.
            filters: Additional runtime filters (combined with FeatureDep.filters by FeatureDepTransformer).
            sample: For root features this is used instead of the upstream dataframes.
                Must contain both metaxy_provenance_by_field (struct of field hashes)
                and metaxy_provenance (hash of all field hashes concatenated).
                IMPORTANT: metaxy_provenance must be a HASH, not a raw concatenation.

        Returns:
            A tuple of (added, changed, removed): new samples appearing in upstream,
            samples whose provenance differs between the expected and current state,
            and samples present in the current state but no longer in upstream.
            The added DataFrame is never None but may be empty; the changed and
            removed DataFrames may be None (for the first increment on the feature).

        Note:
            Hash truncation length is read from MetaxyConfig.get().hash_truncation_length
        """
        # Handle the root feature case.
        if sample is not None:
            # Root features: sample is user-provided with provenance columns already.
            assert len(upstream) == 0, (
                "Root features should have no upstream dependencies"
            )
            expected = sample
            # Auto-compute metaxy_provenance if missing but metaxy_provenance_by_field exists.
            cols = expected.collect_schema().names()
            if METAXY_PROVENANCE not in cols and METAXY_PROVENANCE_BY_FIELD in cols:
                warnings.warn(
                    f"Auto-computing {METAXY_PROVENANCE} from {METAXY_PROVENANCE_BY_FIELD} because it is missing in the samples DataFrame"
                )
                expected = self.hash_struct_version_column(
                    expected, hash_algorithm=hash_algorithm
                )

            # Validate that root features provide both required provenance columns.
            self._check_required_provenance_columns(
                expected, "The `sample` DataFrame (must be provided to root features)"
            )
        else:
            # Normal case: compute provenance from upstream.
            expected = self.load_upstream_with_provenance(
                upstream,
                hash_algo=hash_algorithm,
                filters=filters,
            )

        # Case 1: no current metadata - everything is added.
        if current is None:
            return expected, None, None

        # Cases 2 & 3: compare expected with current metadata.
        # Validate that current has the provenance columns.
        self._check_required_provenance_columns(
            current, "The `current` DataFrame loaded from the metadata store"
        )

        # Handle different lineage relationships before comparison.
        lineage_handler = create_lineage_handler(self.plan, self)
        expected, current, join_columns = lineage_handler.normalize_for_comparison(
            expected, current, hash_algorithm
        )

        current = current.rename(
            {
                METAXY_PROVENANCE: f"__current_{METAXY_PROVENANCE}",
                METAXY_PROVENANCE_BY_FIELD: f"__current_{METAXY_PROVENANCE_BY_FIELD}",
            }
        )

        added = cast(
            FrameT,
            expected.join(
                cast(FrameT, current.select(join_columns)),
                on=join_columns,
                how="anti",
            ),
        )

        changed = cast(
            FrameT,
            expected.join(
                cast(
                    FrameT,
                    current.select(*join_columns, f"__current_{METAXY_PROVENANCE}"),
                ),
                on=join_columns,
                how="inner",
            )
            .filter(
                nw.col(f"__current_{METAXY_PROVENANCE}").is_null()
                | (
                    nw.col(METAXY_PROVENANCE)
                    != nw.col(f"__current_{METAXY_PROVENANCE}")
                )
            )
            .drop(f"__current_{METAXY_PROVENANCE}"),
        )

        removed = cast(
            FrameT,
            current.join(
                cast(FrameT, expected.select(join_columns)),
                on=join_columns,
                how="anti",
            ).rename(
                {
                    f"__current_{METAXY_PROVENANCE}": METAXY_PROVENANCE,
                    f"__current_{METAXY_PROVENANCE_BY_FIELD}": METAXY_PROVENANCE_BY_FIELD,
                }
            ),
        )

        # Return lazy frames with ID and provenance columns (the caller decides whether to collect).
        return added, changed, removed
```
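The added/changed/removed split can be pictured with a toy polars version of the same anti/inner join logic (column names and hash values invented for illustration):

```python
import polars as pl

expected = pl.DataFrame({"id": [1, 2, 3], "metaxy_provenance": ["h1", "h2x", "h3"]})
current = pl.DataFrame({"id": [2, 4], "metaxy_provenance": ["h2", "h4"]})
cur = current.rename({"metaxy_provenance": "__current_metaxy_provenance"})

# Added: expected rows with no match in current.
added = expected.join(cur.select("id"), on="id", how="anti")
# Changed: rows present in both whose provenance hashes differ.
changed = (
    expected.join(cur, on="id", how="inner")
    .filter(pl.col("metaxy_provenance") != pl.col("__current_metaxy_provenance"))
    .drop("__current_metaxy_provenance")
)
# Removed: current rows with no match in expected.
removed = cur.join(expected.select("id"), on="id", how="anti")
print(added, changed, removed, sep="\n")
```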
```python
    def _check_required_provenance_columns(self, df: FrameT, message: str):
        cols = df.collect_schema().names()

        if METAXY_PROVENANCE_BY_FIELD not in cols:
            raise ValueError(
                f"{message} is missing required "
                f"'{METAXY_PROVENANCE_BY_FIELD}' column. This column must be a struct containing the provenance of each field on this feature."
            )
        if METAXY_PROVENANCE not in cols:
            raise ValueError(
                f"{message} is missing required "
                f"'{METAXY_PROVENANCE}' column. All metadata in the store must have both provenance columns. "
                f"This column is automatically added by Metaxy when writing metadata."
            )


def create_lineage_handler(
    feature_plan: FeaturePlan,
    engine: VersioningEngine,
) -> LineageHandler:
    """Factory function to create the appropriate lineage handler.

    Args:
        feature_plan: The feature plan containing lineage information
        engine: The provenance engine instance

    Returns:
        Appropriate LineageHandler instance based on lineage type
    """
    # Import handler classes at runtime to avoid a circular import.
    from metaxy.versioning.lineage_handler import (
        AggregationLineageHandler,
        ExpansionLineageHandler,
        IdentityLineageHandler,
    )

    lineage = feature_plan.feature.lineage
    relationship_type = lineage.relationship.type

    if relationship_type == LineageRelationshipType.IDENTITY:
        return IdentityLineageHandler(feature_plan, engine)
    elif relationship_type == LineageRelationshipType.AGGREGATION:
        return AggregationLineageHandler(feature_plan, engine)
    elif relationship_type == LineageRelationshipType.EXPANSION:
        return ExpansionLineageHandler(feature_plan, engine)
    else:
        raise ValueError(f"Unknown lineage relationship type: {relationship_type}")
```