metaxy-0.0.1.dev3-py3-none-any.whl
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
metaxy/versioning/feature_dep_transformer.py
@@ -0,0 +1,151 @@
from collections.abc import Sequence
from functools import cached_property

import narwhals as nw
from narwhals.typing import FrameT

from metaxy.models.constants import (
    METAXY_DATA_VERSION_BY_FIELD,
    METAXY_PROVENANCE,
    METAXY_PROVENANCE_BY_FIELD,
)
from metaxy.models.feature_spec import FeatureDep, FeatureSpec
from metaxy.models.plan import FeaturePlan
from metaxy.models.types import FeatureKey
from metaxy.versioning.renamed_df import RenamedDataFrame


class FeatureDepTransformer:
    def __init__(self, dep: FeatureDep, plan: FeaturePlan):
        """A class responsible for applying the transformations that live on [metaxy.models.feature_spec.FeatureDep][]:

        - Filters (from FeatureDep.filters)
        - Renames
        - Selections

        This must always run before the upstream metadata is joined.

        It also injects Metaxy system columns.
        """
        self.plan = plan
        self.dep = dep

        # allow adding more in the future
        self.metaxy_columns_to_load = [
            METAXY_PROVENANCE_BY_FIELD,
            METAXY_PROVENANCE,
            METAXY_DATA_VERSION_BY_FIELD,
        ]

    @cached_property
    def upstream_feature_key(self) -> FeatureKey:
        return self.dep.feature

    @cached_property
    def upstream_feature_spec(self) -> FeatureSpec:
        return self.plan.parent_features_by_key[self.dep.feature]

    def transform(
        self, df: FrameT, filters: Sequence[nw.Expr] | None = None
    ) -> RenamedDataFrame[FrameT]:
        """Apply the transformation specified by the feature dependency.

        Args:
            df: The dataframe to transform. It is expected to represent the raw upstream feature metadata
                as it resides in the metadata store.
            filters: Optional sequence of additional filters to apply to the dataframe **after renames**.
                These are combined with the static filters from FeatureDep.filters.

        Returns:
            The transformed dataframe coupled with the renamed ID columns
        """
        # Combine static filters from FeatureDep with any additional filters passed as arguments
        combined_filters: list[nw.Expr] = []
        if self.dep.filters is not None:
            combined_filters.extend(self.dep.filters)
        if filters:
            combined_filters.extend(filters)

        return (
            RenamedDataFrame(
                df=df, id_columns=list(self.upstream_feature_spec.id_columns)
            )
            .rename(self.renames)
            .filter(combined_filters if combined_filters else None)
            .select(self.renamed_columns)
        )

    def rename_upstream_metaxy_column(self, column_name: str) -> str:
        """Insert the upstream feature key suffix into the column name.

        Typically applied to Metaxy's system columns, since they always have to be loaded
        and do not have user-defined renames."""
        return f"{column_name}{self.upstream_feature_key.to_column_suffix()}"

    @cached_property
    def renamed_provenance_col(self) -> str:
        return self.rename_upstream_metaxy_column(METAXY_PROVENANCE)

    @cached_property
    def renamed_provenance_by_field_col(self) -> str:
        return self.rename_upstream_metaxy_column(METAXY_PROVENANCE_BY_FIELD)

    @cached_property
    def renamed_data_version_by_field_col(self) -> str:
        return self.rename_upstream_metaxy_column(METAXY_DATA_VERSION_BY_FIELD)

    @cached_property
    def renamed_metaxy_cols(self) -> list[str]:
        return list(
            map(self.rename_upstream_metaxy_column, self.metaxy_columns_to_load)
        )

    @cached_property
    def renames(self) -> dict[str, str]:
        """Get column renames for an upstream feature.

        Returns:
            Dictionary of column renames
        """
        # TODO: potentially include more system columns here?
        return {
            **(self.dep.rename or {}),
            **{
                col: self.rename_upstream_metaxy_column(col)
                for col in self.metaxy_columns_to_load
            },
        }

    @cached_property
    def renamed_id_columns(self) -> list[str]:
        return [
            self.renames.get(col, col) for col in self.upstream_feature_spec.id_columns
        ]

    @cached_property
    def renamed_columns(self) -> list[str] | None:
        """Get columns to select from an upstream feature.

        These include both original and metaxy-injected columns, all already renamed.
        Users are expected to use renamed column names in their columns specification.

        Returns:
            List of column names to select, or None to select all columns
        """
        # If no specific columns requested (None), return None to keep all columns
        # If empty tuple, return only ID columns and system columns
        if self.dep.columns is None:
            return None
        else:
            # Apply renames to the selected columns since selection happens after renaming
            renamed_selected_cols = [
                self.renames.get(col, col) for col in self.dep.columns
            ]
            return [
                *self.renamed_id_columns,
                *renamed_selected_cols,
                *self.renamed_metaxy_cols,
            ]
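To make the rename convention concrete, here is a minimal sketch in plain Python that mirrors the renames property above. Every concrete value is invented for illustration: real user renames come from FeatureDep.rename, the suffix from FeatureKey.to_column_suffix(), and the system column names from metaxy.models.constants.

user_renames = {"price": "upstream_price"}  # hypothetical FeatureDep.rename
suffix = "__sales_orders"  # hypothetical FeatureKey.to_column_suffix() result
system_columns = ["metaxy_provenance"]  # stand-in for the real constants

# Mirrors FeatureDepTransformer.renames: user renames first, then
# system columns mapped to their suffixed names.
renames = {
    **user_renames,
    **{col: f"{col}{suffix}" for col in system_columns},
}
assert renames == {
    "price": "upstream_price",
    "metaxy_provenance": "metaxy_provenance__sales_orders",
}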
metaxy/versioning/ibis.py
@@ -0,0 +1,249 @@
"""Ibis implementation of VersioningEngine.

CRITICAL: This implementation NEVER materializes lazy expressions.
All operations stay in the lazy Ibis world for SQL execution.
"""

from typing import Protocol, cast

import narwhals as nw
from ibis import Expr as IbisExpr
from narwhals.typing import FrameT

from metaxy.models.plan import FeaturePlan
from metaxy.versioning.engine import VersioningEngine
from metaxy.versioning.types import HashAlgorithm


class IbisHashFn(Protocol):
    def __call__(self, expr: IbisExpr) -> IbisExpr: ...


class IbisVersioningEngine(VersioningEngine):
    """Provenance engine using Ibis for SQL databases.

    Only implements hash_string_column and build_struct_column.
    All logic lives in the base class.

    CRITICAL: This implementation NEVER leaves the lazy world.
    All operations stay as Ibis expressions that compile to SQL.
    """

    def __init__(
        self,
        plan: FeaturePlan,
        hash_functions: dict[HashAlgorithm, IbisHashFn],
    ) -> None:
        """Initialize the Ibis engine.

        Args:
            plan: Feature plan to track provenance for
            hash_functions: Mapping from HashAlgorithm to Ibis hash functions.
                Each function takes an Ibis expression and returns an Ibis expression.
        """
        super().__init__(plan)
        self.hash_functions: dict[HashAlgorithm, IbisHashFn] = hash_functions

    @classmethod
    def implementation(cls) -> nw.Implementation:
        return nw.Implementation.IBIS

    def hash_string_column(
        self,
        df: FrameT,
        source_column: str,
        target_column: str,
        hash_algo: HashAlgorithm,
    ) -> FrameT:
        """Hash a string column using Ibis hash functions.

        Args:
            df: Narwhals DataFrame backed by Ibis
            source_column: Name of string column to hash
            target_column: Name for the new column containing the hash
            hash_algo: Hash algorithm to use

        Returns:
            Narwhals DataFrame with new hashed column added, backed by Ibis.
            The source column remains unchanged.
        """
        if hash_algo not in self.hash_functions:
            raise ValueError(
                f"Hash algorithm {hash_algo} not supported by this Ibis backend. "
                f"Supported: {list(self.hash_functions.keys())}"
            )

        # Import ibis lazily (module-level import restriction)
        import ibis.expr.types

        # Convert to Ibis table
        assert df.implementation == nw.Implementation.IBIS, (
            "Only Ibis DataFrames are accepted"
        )
        ibis_table: ibis.expr.types.Table = cast(ibis.expr.types.Table, df.to_native())

        # Get hash function
        hash_fn = self.hash_functions[hash_algo]

        # Apply hash to source column
        # Hash functions are responsible for returning strings
        hashed = hash_fn(ibis_table[source_column])

        # Add new column with the hash
        result_table = ibis_table.mutate(**{target_column: hashed})  # pyright: ignore[reportArgumentType]

        # Convert back to Narwhals
        return cast(FrameT, nw.from_native(result_table))

    @staticmethod
    def build_struct_column(
        df: FrameT,
        struct_name: str,
        field_columns: dict[str, str],
    ) -> FrameT:
        """Build a struct column from existing columns.

        Args:
            df: Narwhals DataFrame backed by Ibis
            struct_name: Name for the new struct column
            field_columns: Mapping from struct field names to column names

        Returns:
            Narwhals DataFrame with new struct column added, backed by Ibis.
            The source columns remain unchanged.
        """
        # Import ibis lazily
        import ibis.expr.types

        # Convert to Ibis table
        assert df.implementation == nw.Implementation.IBIS, (
            "Only Ibis DataFrames are accepted"
        )
        ibis_table: ibis.expr.types.Table = cast(ibis.expr.types.Table, df.to_native())

        # Build struct expression - reference columns by name
        struct_expr = ibis.struct(
            {
                field_name: ibis_table[col_name]
                for field_name, col_name in field_columns.items()
            }
        )

        # Add struct column
        result_table = ibis_table.mutate(**{struct_name: struct_expr})

        # Convert back to Narwhals
        return cast(FrameT, nw.from_native(result_table))

    @staticmethod
    def aggregate_with_string_concat(
        df: FrameT,
        group_by_columns: list[str],
        concat_column: str,
        concat_separator: str,
        exclude_columns: list[str],
    ) -> FrameT:
        """Aggregate DataFrame by grouping and concatenating strings.

        Args:
            df: Narwhals DataFrame backed by Ibis
            group_by_columns: Columns to group by
            concat_column: Column containing strings to concatenate within groups
            concat_separator: Separator to use when concatenating strings
            exclude_columns: Columns to exclude from aggregation

        Returns:
            Narwhals DataFrame with one row per group.
        """
        # Import ibis lazily
        import ibis
        import ibis.expr.types

        # Convert to Ibis table
        assert df.implementation == nw.Implementation.IBIS, (
            "Only Ibis DataFrames are accepted"
        )
        ibis_table: ibis.expr.types.Table = cast(ibis.expr.types.Table, df.to_native())

        # Build aggregation expressions
        agg_exprs = {}

        # Concatenate the concat_column with separator
        agg_exprs[concat_column] = ibis_table[concat_column].group_concat(
            concat_separator
        )

        # Take first value for all other columns (except group_by and exclude)
        all_columns = set(ibis_table.columns)
        columns_to_aggregate = (
            all_columns - set(group_by_columns) - {concat_column} - set(exclude_columns)
        )

        for col in columns_to_aggregate:
            agg_exprs[col] = ibis_table[col].arbitrary()  # Take any value (like first())

        # Perform groupby and aggregate
        result_table = ibis_table.group_by(group_by_columns).aggregate(**agg_exprs)

        # Convert back to Narwhals
        return cast(FrameT, nw.from_native(result_table))

    @staticmethod
    def keep_latest_by_group(
        df: FrameT,
        group_columns: list[str],
        timestamp_column: str,
    ) -> FrameT:
        """Keep only the latest row per group based on a timestamp column.

        Uses argmax aggregation to get the value from each column where the
        timestamp is maximum. This is simpler and more semantically clear than
        window functions.

        Args:
            df: Narwhals DataFrame/LazyFrame backed by Ibis
            group_columns: Columns to group by (typically ID columns)
            timestamp_column: Column to use for determining "latest" (typically metaxy_created_at)

        Returns:
            Narwhals DataFrame/LazyFrame with only the latest row per group

        Raises:
            ValueError: If timestamp_column doesn't exist in df
        """
        # Import ibis lazily
        import ibis.expr.types

        # Convert to Ibis table
        assert df.implementation == nw.Implementation.IBIS, (
            "Only Ibis DataFrames are accepted"
        )

        # Check if timestamp_column exists
        if timestamp_column not in df.columns:
            raise ValueError(
                f"Timestamp column '{timestamp_column}' not found in DataFrame. "
                f"Available columns: {df.columns}"
            )

        ibis_table: ibis.expr.types.Table = cast(ibis.expr.types.Table, df.to_native())

        # Use argmax aggregation: for each column, get the value where timestamp is maximum
        # This directly expresses "get the row with the latest timestamp per group"
        all_columns = set(ibis_table.columns)
        non_group_columns = all_columns - set(group_columns)

        # Build aggregation dict: for each non-group column, use argmax(timestamp)
        agg_exprs = {
            col: ibis_table[col].argmax(ibis_table[timestamp_column])
            for col in non_group_columns
        }

        # Perform groupby and aggregate
        result_table = ibis_table.group_by(group_columns).aggregate(**agg_exprs)

        # Convert back to Narwhals
        return cast(FrameT, nw.from_native(result_table))
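For reference, a minimal sketch (not part of the package) of plugging into this engine: a function satisfying the IbisHashFn protocol, and a call to the static keep_latest_by_group helper on an in-memory Ibis table. The table and column names are invented, the behavior of Value.hash() varies by backend, and constructing the engine itself requires a FeaturePlan, which is omitted here.

import ibis
import narwhals as nw

from metaxy.versioning.ibis import IbisVersioningEngine


def int_hash_as_string(expr: ibis.Expr) -> ibis.Expr:
    # Satisfies IbisHashFn: Ibis expression in, Ibis expression out.
    # Uses the generic integer hash cast to string; a real deployment would
    # map each HashAlgorithm to a proper backend hash function.
    return expr.hash().cast("string")


events = ibis.memtable(
    {"id": [1, 1, 2], "value": ["a", "b", "c"], "created_at": [1, 2, 1]}
)
df = nw.from_native(events)

# Keep the row with the largest created_at per id; everything stays lazy.
latest = IbisVersioningEngine.keep_latest_by_group(
    df, group_columns=["id"], timestamp_column="created_at"
)
print(latest.to_native().execute())  # materialize only at the boundary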
metaxy/versioning/lineage_handler.py
@@ -0,0 +1,205 @@
"""Handler for normalizing provenance based on lineage relationships.

This module provides abstractions for handling different lineage relationship types
(identity, aggregation, expansion) when comparing expected vs current provenance.
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

import narwhals as nw
from narwhals.typing import FrameT

from metaxy.models.constants import METAXY_PROVENANCE, METAXY_PROVENANCE_BY_FIELD
from metaxy.models.lineage import ExpansionRelationship
from metaxy.utils.hashing import get_hash_truncation_length

if TYPE_CHECKING:
    from metaxy.models.plan import FeaturePlan
    from metaxy.versioning.engine import VersioningEngine
    from metaxy.versioning.types import HashAlgorithm


class LineageHandler(ABC):
    """Base class for handling lineage-based provenance normalization."""

    def __init__(self, feature_plan: FeaturePlan, engine: VersioningEngine):
        """Initialize handler with feature plan and engine.

        Args:
            feature_plan: The feature plan containing lineage information
            engine: The provenance engine instance
        """
        self.plan = feature_plan
        self.feature_spec = feature_plan.feature
        self.engine = engine

    @abstractmethod
    def normalize_for_comparison(
        self,
        expected: FrameT,
        current: FrameT,
        hash_algorithm: HashAlgorithm,
    ) -> tuple[FrameT, FrameT, list[str]]:
        """Normalize expected and current DataFrames for provenance comparison.

        Args:
            expected: Expected metadata computed from upstream
            current: Current metadata from store
            hash_algorithm: Hash algorithm to use

        Returns:
            Tuple of (normalized_expected, normalized_current, join_columns)
        """
        pass


class IdentityLineageHandler(LineageHandler):
    """Handler for 1:1 identity lineage relationships.

    No normalization needed - each upstream row maps to exactly one downstream row.
    """

    def normalize_for_comparison(
        self,
        expected: FrameT,
        current: FrameT,
        hash_algorithm: HashAlgorithm,
    ) -> tuple[FrameT, FrameT, list[str]]:
        """No normalization needed for identity relationships."""
        id_columns = list(self.feature_spec.id_columns)
        return expected, current, id_columns


class AggregationLineageHandler(LineageHandler):
    """Handler for N:1 aggregation lineage relationships.

    Multiple upstream rows aggregate to one downstream row. We need to:
    1. Group expected metadata by aggregation columns (sorted within group)
    2. Concatenate provenance values deterministically
    3. Hash the concatenated result using the engine's hash method
    """

    def normalize_for_comparison(
        self,
        expected: FrameT,
        current: FrameT,
        hash_algorithm: HashAlgorithm,
    ) -> tuple[FrameT, FrameT, list[str]]:
        """Aggregate expected provenance by grouping."""
        id_columns = list(self.feature_spec.id_columns)
        agg_result = self.feature_spec.lineage.get_aggregation_columns(id_columns)
        assert agg_result is not None, (
            "Aggregation relationship must have aggregation columns"
        )
        agg_columns = list(agg_result)

        # Aggregate expected provenance
        expected_agg = self._aggregate_provenance(expected, agg_columns, hash_algorithm)

        return expected_agg, current, agg_columns

    def _aggregate_provenance(
        self,
        expected: FrameT,
        agg_columns: list[str],
        hash_algorithm: HashAlgorithm,
    ) -> FrameT:
        """Aggregate provenance for N:1 relationships.

        Strategy:
        1. Sort by id_columns within each group for deterministic ordering
        2. Group by aggregation columns and concatenate provenance with the engine's method
        3. Hash the concatenated result using the engine's hash_string_column

        Args:
            expected: Expected metadata with upstream provenance
            agg_columns: Columns to group by
            hash_algorithm: Hash algorithm to use

        Returns:
            Aggregated DataFrame with one row per group
        """
        # Sort by all id_columns for deterministic ordering within groups
        id_columns = list(self.feature_spec.id_columns)
        expected_sorted = expected.sort(id_columns)

        # Use the engine's aggregate_with_string_concat method
        # This concatenates provenance strings and stores them in a temporary column
        grouped = self.engine.aggregate_with_string_concat(
            df=expected_sorted,
            group_by_columns=agg_columns,
            concat_column=METAXY_PROVENANCE,
            concat_separator="|",
            exclude_columns=[METAXY_PROVENANCE_BY_FIELD],
        )

        # Hash the concatenated provenance using the engine's method
        # Note: the concat column still has the name METAXY_PROVENANCE after aggregation
        hashed = self.engine.hash_string_column(
            grouped, METAXY_PROVENANCE, "__hashed_prov", hash_algorithm
        )

        # Replace METAXY_PROVENANCE with the truncated hash
        hashed = hashed.drop(METAXY_PROVENANCE).rename(
            {"__hashed_prov": METAXY_PROVENANCE}
        )
        hashed = hashed.with_columns(
            nw.col(METAXY_PROVENANCE).str.slice(0, get_hash_truncation_length())
        )

        # Create a placeholder provenance_by_field struct using the engine's method
        field_names = [f.key.to_struct_key() for f in self.plan.feature.fields]
        field_map = {name: "__aggregated_placeholder" for name in field_names}

        # Add placeholder column
        hashed = hashed.with_columns(
            nw.lit("aggregated").alias("__aggregated_placeholder")
        )

        # Build struct using the engine's method
        result = self.engine.build_struct_column(
            hashed, METAXY_PROVENANCE_BY_FIELD, field_map
        )

        # Drop placeholder
        result = result.drop("__aggregated_placeholder")

        return result


class ExpansionLineageHandler(LineageHandler):
    """Handler for 1:N expansion lineage relationships.

    One upstream row expands to many downstream rows. All downstream rows
    with the same parent ID should have the same provenance. We group
    current by parent columns and take any representative row.
    """

    def normalize_for_comparison(
        self,
        expected: FrameT,
        current: FrameT,
        hash_algorithm: HashAlgorithm,
    ) -> tuple[FrameT, FrameT, list[str]]:
        """Group current by parent ID columns."""
        # Access the ExpansionRelationship to get the .on attribute
        assert isinstance(self.feature_spec.lineage.relationship, ExpansionRelationship)
        parent_columns = list(self.feature_spec.lineage.relationship.on)

        # Group current by parent columns and take any representative row
        current_grouped = (
            current.with_columns(nw.lit(True).alias("_dummy"))
            .filter(
                nw.col("_dummy")
                .is_first_distinct()
                .over(*parent_columns, order_by="_dummy")
            )
            .drop("_dummy")
        )

        return expected, current_grouped, parent_columns