metaxy-0.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (75)
  1. metaxy/__init__.py +61 -0
  2. metaxy/_testing.py +542 -0
  3. metaxy/_utils.py +16 -0
  4. metaxy/_version.py +1 -0
  5. metaxy/cli/app.py +76 -0
  6. metaxy/cli/context.py +71 -0
  7. metaxy/cli/graph.py +576 -0
  8. metaxy/cli/graph_diff.py +290 -0
  9. metaxy/cli/list.py +42 -0
  10. metaxy/cli/metadata.py +271 -0
  11. metaxy/cli/migrations.py +862 -0
  12. metaxy/cli/push.py +55 -0
  13. metaxy/config.py +450 -0
  14. metaxy/data_versioning/__init__.py +24 -0
  15. metaxy/data_versioning/calculators/__init__.py +13 -0
  16. metaxy/data_versioning/calculators/base.py +97 -0
  17. metaxy/data_versioning/calculators/duckdb.py +186 -0
  18. metaxy/data_versioning/calculators/ibis.py +225 -0
  19. metaxy/data_versioning/calculators/polars.py +135 -0
  20. metaxy/data_versioning/diff/__init__.py +15 -0
  21. metaxy/data_versioning/diff/base.py +150 -0
  22. metaxy/data_versioning/diff/narwhals.py +108 -0
  23. metaxy/data_versioning/hash_algorithms.py +19 -0
  24. metaxy/data_versioning/joiners/__init__.py +9 -0
  25. metaxy/data_versioning/joiners/base.py +70 -0
  26. metaxy/data_versioning/joiners/narwhals.py +235 -0
  27. metaxy/entrypoints.py +309 -0
  28. metaxy/ext/__init__.py +1 -0
  29. metaxy/ext/alembic.py +326 -0
  30. metaxy/ext/sqlmodel.py +172 -0
  31. metaxy/ext/sqlmodel_system_tables.py +139 -0
  32. metaxy/graph/__init__.py +21 -0
  33. metaxy/graph/diff/__init__.py +21 -0
  34. metaxy/graph/diff/diff_models.py +399 -0
  35. metaxy/graph/diff/differ.py +740 -0
  36. metaxy/graph/diff/models.py +418 -0
  37. metaxy/graph/diff/rendering/__init__.py +18 -0
  38. metaxy/graph/diff/rendering/base.py +274 -0
  39. metaxy/graph/diff/rendering/cards.py +188 -0
  40. metaxy/graph/diff/rendering/formatter.py +805 -0
  41. metaxy/graph/diff/rendering/graphviz.py +246 -0
  42. metaxy/graph/diff/rendering/mermaid.py +320 -0
  43. metaxy/graph/diff/rendering/rich.py +165 -0
  44. metaxy/graph/diff/rendering/theme.py +48 -0
  45. metaxy/graph/diff/traversal.py +247 -0
  46. metaxy/graph/utils.py +58 -0
  47. metaxy/metadata_store/__init__.py +31 -0
  48. metaxy/metadata_store/_protocols.py +38 -0
  49. metaxy/metadata_store/base.py +1676 -0
  50. metaxy/metadata_store/clickhouse.py +161 -0
  51. metaxy/metadata_store/duckdb.py +167 -0
  52. metaxy/metadata_store/exceptions.py +43 -0
  53. metaxy/metadata_store/ibis.py +451 -0
  54. metaxy/metadata_store/memory.py +228 -0
  55. metaxy/metadata_store/sqlite.py +187 -0
  56. metaxy/metadata_store/system_tables.py +257 -0
  57. metaxy/migrations/__init__.py +34 -0
  58. metaxy/migrations/detector.py +153 -0
  59. metaxy/migrations/executor.py +208 -0
  60. metaxy/migrations/loader.py +260 -0
  61. metaxy/migrations/models.py +718 -0
  62. metaxy/migrations/ops.py +390 -0
  63. metaxy/models/__init__.py +0 -0
  64. metaxy/models/bases.py +6 -0
  65. metaxy/models/constants.py +24 -0
  66. metaxy/models/feature.py +665 -0
  67. metaxy/models/feature_spec.py +105 -0
  68. metaxy/models/field.py +25 -0
  69. metaxy/models/plan.py +155 -0
  70. metaxy/models/types.py +157 -0
  71. metaxy/py.typed +0 -0
  72. metaxy-0.0.0.dist-info/METADATA +247 -0
  73. metaxy-0.0.0.dist-info/RECORD +75 -0
  74. metaxy-0.0.0.dist-info/WHEEL +4 -0
  75. metaxy-0.0.0.dist-info/entry_points.txt +3 -0
metaxy/data_versioning/calculators/duckdb.py
@@ -0,0 +1,186 @@
+ """DuckDB-specific data version calculator with extension management.
+
+ This calculator extends IbisDataVersionCalculator to handle DuckDB-specific
+ extension loading (e.g., hashfuncs for xxHash support).
+ """
+ # pyright: reportImportCycles=false
+
+ from typing import TYPE_CHECKING
+
+ from metaxy.data_versioning.calculators.ibis import IbisDataVersionCalculator
+ from metaxy.data_versioning.hash_algorithms import HashAlgorithm
+
+ if TYPE_CHECKING:
+     import ibis
+
+     from metaxy.data_versioning.calculators.ibis import HashSQLGenerator
+     from metaxy.metadata_store.duckdb import (
+         ExtensionSpec,  # pyright: ignore[reportImportCycles]
+     )
+
+
+ class DuckDBDataVersionCalculator(IbisDataVersionCalculator):
+     """DuckDB-specific calculator that manages extensions lazily.
+
+     This calculator:
+     1. Installs and loads DuckDB extensions on first use (lazy loading)
+     2. Supports xxHash64, xxHash32, and MD5 hash functions
+     3. Generates DuckDB-specific SQL for hash computation
+
+     Extension loading happens in __init__, which is only called when native
+     data version calculations are actually needed (not on store open).
+
+     Example:
+         >>> backend = ibis.duckdb.connect("metadata.db")
+         >>> calculator = DuckDBDataVersionCalculator(
+         ...     backend=backend,
+         ...     extensions=["hashfuncs"],
+         ... )
+         >>> # Extensions are now loaded and xxHash64 is available
+     """
+
+     def __init__(
+         self,
+         backend: "ibis.BaseBackend",
+         extensions: "list[ExtensionSpec | str] | None" = None,
+     ):
+         """Initialize the DuckDB calculator and load extensions.
+
+         Args:
+             backend: DuckDB Ibis backend connection
+             extensions: DuckDB extensions to install and load. Entries can be
+                 strings (installed from the 'community' repository) or dicts
+                 with a 'name' and an optional 'repository' key.
+
+         Example:
+             >>> extensions = ["hashfuncs"]  # Simple form
+             >>> extensions = [{"name": "spatial", "repository": "core_nightly"}]
+         """
+         self._backend = backend
+         self.extensions = extensions or []
+
+         # Load extensions now (the calculator itself is created lazily)
+         self._load_extensions()
+
+         # Build the DuckDB-specific hash SQL generators
+         hash_sql_generators = self._generate_hash_sql_generators()
+
+         # Initialize the parent with the backend and generators
+         super().__init__(
+             backend=backend,
+             hash_sql_generators=hash_sql_generators,
+         )
+
+     def _load_extensions(self) -> None:
+         """Install and load DuckDB extensions.
+
+         Called once when the calculator is created, which happens lazily when
+         native data version calculations are first needed.
+         """
+         if not self.extensions:
+             return
+
+         # Type narrowing: we know this is a DuckDB backend
+         from typing import Any, cast
+
+         backend = cast(
+             Any, self._backend
+         )  # The DuckDB backend has raw_sql, but the ibis.BaseBackend stubs lack it
+
+         for ext_spec in self.extensions:
+             if isinstance(ext_spec, str):
+                 # Simple string form - install from the community repository
+                 ext_name = ext_spec
+                 backend.raw_sql(f"INSTALL {ext_name} FROM community")
+                 backend.raw_sql(f"LOAD {ext_name}")
+             else:
+                 # Dict form with an optional repository
+                 ext_name = ext_spec.get("name", "")
+                 ext_repo = ext_spec.get("repository", "community")
+
+                 if ext_repo != "community":
+                     # Point INSTALL at the custom repository
+                     backend.raw_sql(f"SET custom_extension_repository='{ext_repo}'")
+
+                 # Install and load the extension
+                 backend.raw_sql(f"INSTALL {ext_name}")
+                 backend.raw_sql(f"LOAD {ext_name}")
+
+     def _generate_hash_sql_generators(self) -> dict[HashAlgorithm, "HashSQLGenerator"]:
+         """Generate hash SQL generators for DuckDB.
+
+         DuckDB supports:
+         - MD5: always available (built-in)
+         - XXHASH32, XXHASH64: available when the 'hashfuncs' extension is loaded
+
+         Returns:
+             Dictionary mapping HashAlgorithm to SQL generator functions
+         """
+         generators: dict[HashAlgorithm, HashSQLGenerator] = {}
+
+         # MD5 is always available
+         def md5_generator(table, concat_columns: dict[str, str]) -> str:
+             hash_selects: list[str] = []
+             for field_key, concat_col in concat_columns.items():
+                 hash_col = f"__hash_{field_key}"
+                 # md5() in DuckDB returns a hex string; cast to VARCHAR for consistency
+                 hash_expr = f"CAST(md5({concat_col}) AS VARCHAR)"
+                 hash_selects.append(f"{hash_expr} as {hash_col}")
+
+             hash_clause = ", ".join(hash_selects)
+             table_sql = table.compile()
+             return f"SELECT *, {hash_clause} FROM ({table_sql}) AS __metaxy_temp"
+
+         generators[HashAlgorithm.MD5] = md5_generator
+
+         # Check whether the hashfuncs extension is in the list
+         extension_names = [
+             ext if isinstance(ext, str) else ext.get("name", "")
+             for ext in self.extensions
+         ]
+
+         if "hashfuncs" in extension_names:
+
+             def xxhash32_generator(table, concat_columns: dict[str, str]) -> str:
+                 hash_selects: list[str] = []
+                 for field_key, concat_col in concat_columns.items():
+                     hash_col = f"__hash_{field_key}"
+                     hash_expr = f"CAST(xxh32({concat_col}) AS VARCHAR)"
+                     hash_selects.append(f"{hash_expr} as {hash_col}")
+
+                 hash_clause = ", ".join(hash_selects)
+                 table_sql = table.compile()
+                 return f"SELECT *, {hash_clause} FROM ({table_sql}) AS __metaxy_temp"
+
+             def xxhash64_generator(table, concat_columns: dict[str, str]) -> str:
+                 hash_selects: list[str] = []
+                 for field_key, concat_col in concat_columns.items():
+                     hash_col = f"__hash_{field_key}"
+                     hash_expr = f"CAST(xxh64({concat_col}) AS VARCHAR)"
+                     hash_selects.append(f"{hash_expr} as {hash_col}")
+
+                 hash_clause = ", ".join(hash_selects)
+                 table_sql = table.compile()
+                 return f"SELECT *, {hash_clause} FROM ({table_sql}) AS __metaxy_temp"
+
+             generators[HashAlgorithm.XXHASH32] = xxhash32_generator
+             generators[HashAlgorithm.XXHASH64] = xxhash64_generator
+
+         return generators
+
+     @property
+     def supported_algorithms(self) -> list[HashAlgorithm]:
+         """Algorithms supported by this calculator, based on loaded extensions."""
+         # Determined dynamically from what was actually loaded
+         return list(self._hash_sql_generators.keys())
+
+     @property
+     def default_algorithm(self) -> HashAlgorithm:
+         """Default hash algorithm for DuckDB.
+
+         Uses XXHASH64 if the hashfuncs extension is loaded, otherwise MD5.
+         """
+         if HashAlgorithm.XXHASH64 in self.supported_algorithms:
+             return HashAlgorithm.XXHASH64
+         return HashAlgorithm.MD5
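
For orientation, a minimal usage sketch wiring this calculator to an in-memory DuckDB backend, using only the constructor and properties shown above (assumes the "hashfuncs" community extension is installable in the environment, i.e. network access is available):

    import ibis
    from metaxy.data_versioning.calculators.duckdb import DuckDBDataVersionCalculator
    from metaxy.data_versioning.hash_algorithms import HashAlgorithm

    backend = ibis.duckdb.connect()  # in-memory DuckDB
    calculator = DuckDBDataVersionCalculator(
        backend=backend,
        extensions=["hashfuncs"],  # enables the xxh32/xxh64 SQL functions
    )
    assert calculator.default_algorithm is HashAlgorithm.XXHASH64

    # Without extensions, only the built-in MD5 generator is registered:
    md5_only = DuckDBDataVersionCalculator(backend=backend)
    assert md5_only.supported_algorithms == [HashAlgorithm.MD5]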
metaxy/data_versioning/calculators/ibis.py
@@ -0,0 +1,225 @@
+ """Ibis-based data version calculator using native SQL hash functions.
+
+ This calculator uses Ibis to generate backend-specific SQL for hash computation,
+ executing entirely in the database without pulling data into memory.
+ """
+
+ from typing import TYPE_CHECKING, Any, Protocol
+
+ import narwhals as nw
+
+ from metaxy.data_versioning.calculators.base import DataVersionCalculator
+ from metaxy.data_versioning.hash_algorithms import HashAlgorithm
+
+ if TYPE_CHECKING:
+     import ibis
+     import ibis.expr.types
+     import ibis.expr.types.relations
+
+     from metaxy.models.feature_spec import FeatureSpec
+     from metaxy.models.plan import FeaturePlan
+
+
+ class HashSQLGenerator(Protocol):
+     """Protocol for backend-specific hash SQL generation.
+
+     Takes an Ibis table with concatenated columns and returns SQL that adds hash columns.
+     """
+
+     def __call__(
+         self, table: "ibis.expr.types.Table", concat_columns: dict[str, str]
+     ) -> str:
+         """Generate a SQL query that computes hash columns.
+
+         Args:
+             table: Input Ibis table with concatenated columns
+             concat_columns: Maps field_key -> concat_column_name
+
+         Returns:
+             SQL query string that selects all columns plus hash columns
+         """
+         ...
+
+
+ class IbisDataVersionCalculator(DataVersionCalculator):
+     """Calculates data versions using native SQL hash functions via Ibis.
+
+     This calculator:
+     1. Accepts a Narwhals LazyFrame as input
+     2. Converts it to an Ibis table internally
+     3. Builds concatenated columns using Ibis expressions
+     4. Applies backend-specific SQL hash functions
+     5. Returns a Narwhals LazyFrame
+
+     Different SQL backends have different hash function names and signatures,
+     so hash functions are provided as SQL template generators per backend.
+
+     Example hash SQL generators:
+         DuckDB: SELECT *, CAST(xxh64(concat_col) AS VARCHAR) as hash FROM table
+         ClickHouse: SELECT *, CAST(xxHash64(concat_col) AS String) as hash FROM table
+         PostgreSQL: SELECT *, MD5(concat_col) as hash FROM table
+     """
+
+     def __init__(
+         self,
+         backend: "ibis.BaseBackend",
+         hash_sql_generators: dict[HashAlgorithm, HashSQLGenerator],
+     ):
+         """Initialize the calculator with an Ibis backend and hash SQL generators.
+
+         Args:
+             backend: Ibis backend connection for SQL execution
+             hash_sql_generators: Map from HashAlgorithm to SQL generator function
+         """
+         self._backend = backend
+         self._hash_sql_generators = hash_sql_generators
+
+     @property
+     def supported_algorithms(self) -> list[HashAlgorithm]:
+         """Algorithms supported by this calculator."""
+         return list(self._hash_sql_generators.keys())
+
+     @property
+     def default_algorithm(self) -> HashAlgorithm:
+         """Default hash algorithm.
+
+         The base implementation returns XXHASH64 if available, otherwise the
+         first available algorithm.
+         """
+         if HashAlgorithm.XXHASH64 in self.supported_algorithms:
+             return HashAlgorithm.XXHASH64
+         return self.supported_algorithms[0]
+
+     def calculate_data_versions(
+         self,
+         joined_upstream: nw.LazyFrame[Any],
+         feature_spec: "FeatureSpec",
+         feature_plan: "FeaturePlan",
+         upstream_column_mapping: dict[str, str],
+         hash_algorithm: HashAlgorithm | None = None,
+     ) -> nw.LazyFrame[Any]:
+         """Calculate the data_version column using SQL hash functions.
+
+         Args:
+             joined_upstream: Narwhals LazyFrame with upstream data joined
+             feature_spec: Feature specification
+             feature_plan: Feature plan
+             upstream_column_mapping: Maps upstream key -> column name
+             hash_algorithm: Hash algorithm to use (default: the backend default)
+
+         Returns:
+             Narwhals LazyFrame with a data_version column added
+         """
+         import ibis
+
+         algo = hash_algorithm or self.default_algorithm
+
+         if algo not in self.supported_algorithms:
+             raise ValueError(
+                 f"Hash algorithm {algo} not supported by {self.__class__.__name__}. "
+                 f"Supported: {self.supported_algorithms}"
+             )
+
+         # Convert the Narwhals LazyFrame to an Ibis table
+         import ibis.expr.types
+
+         native = joined_upstream.to_native()
+
+         # Validate that we have an Ibis table
+         if not isinstance(native, ibis.expr.types.Table):
+             # Not an Ibis table - this calculator only works with Ibis-backed data
+             raise TypeError(
+                 f"IbisDataVersionCalculator requires Ibis-backed data. "
+                 f"Got {type(native)} instead. "
+                 f"This usually means the metadata store is not using Ibis tables. "
+                 f"Use PolarsDataVersionCalculator for non-Ibis stores."
+             )
+
+         ibis_table: ibis.expr.types.Table = native  # type: ignore[assignment]
+
+         # Get the hash SQL generator
+         hash_sql_gen = self._hash_sql_generators[algo]
+
+         # Build concatenated string columns for each field (using Ibis expressions)
+         concat_columns: dict[str, str] = {}
+
+         for field in feature_spec.fields:
+             field_key_str = (
+                 field.key.to_string()
+                 if hasattr(field.key, "to_string")
+                 else "__".join(field.key)
+             )
+
+             field_deps = feature_plan.field_dependencies.get(field.key, {})
+
+             # Build hash components (same structure as the Polars calculator)
+             components = [
+                 ibis.literal(field_key_str),
+                 ibis.literal(str(field.code_version)),
+             ]
+
+             # Add upstream data versions in deterministic order
+             for upstream_feature_key in sorted(field_deps.keys()):
+                 upstream_fields = field_deps[upstream_feature_key]
+                 upstream_key_str = (
+                     upstream_feature_key.to_string()
+                     if hasattr(upstream_feature_key, "to_string")
+                     else "__".join(upstream_feature_key)
+                 )
+
+                 data_version_col_name = upstream_column_mapping.get(
+                     upstream_key_str, "data_version"
+                 )
+
+                 for upstream_field in sorted(upstream_fields):
+                     upstream_field_str = (
+                         upstream_field.to_string()
+                         if hasattr(upstream_field, "to_string")
+                         else "__".join(upstream_field)
+                     )
+
+                     components.append(
+                         ibis.literal(f"{upstream_key_str}/{upstream_field_str}")
+                     )
+                     # Access the struct field holding the upstream field's hash
+                     components.append(
+                         ibis_table[data_version_col_name][upstream_field_str]
+                     )
+
+             # Concatenate all components with a separator
+             concat_expr = components[0]
+             for component in components[1:]:
+                 concat_expr = concat_expr.concat(ibis.literal("|")).concat(component)  # pyright: ignore[reportAttributeAccessIssue]
+
+             # Store the concat column for this field
+             concat_col_name = f"__concat_{field_key_str}"
+             concat_columns[field_key_str] = concat_col_name
+             ibis_table = ibis_table.mutate(**{concat_col_name: concat_expr})
+
+         # Generate SQL for hashing all concat columns
+         hash_sql = hash_sql_gen(ibis_table, concat_columns)
+
+         # Execute the SQL to get a table with hash columns
+         result_table = self._backend.sql(hash_sql)  # pyright: ignore[reportAttributeAccessIssue]
+
+         # Build the data_version struct from the hash columns
+         hash_col_names = [f"__hash_{k}" for k in concat_columns.keys()]
+         field_keys = list(concat_columns.keys())
+
+         # Create a struct column from the hash columns
+         struct_fields = {
+             field_key: result_table[f"__hash_{field_key}"] for field_key in field_keys
+         }
+
+         # Drop the temp columns and add data_version
+         cols_to_keep = [
+             c
+             for c in result_table.columns
+             if c not in concat_columns.values() and c not in hash_col_names
+         ]
+
+         result_table = result_table.select(
+             *cols_to_keep, data_version=ibis.struct(struct_fields)
+         )
+
+         # Convert back to a Narwhals LazyFrame
+         return nw.from_native(result_table, eager_only=False)
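
To make the protocol concrete: a hedged sketch of a custom generator for a PostgreSQL backend, following the MD5 example SQL from the class docstring and the `__hash_` column-naming convention used by the DuckDB generators. The function name and the commented-out backend wiring are illustrative, not part of the package:

    from metaxy.data_versioning.calculators.ibis import IbisDataVersionCalculator
    from metaxy.data_versioning.hash_algorithms import HashAlgorithm

    def postgres_md5_generator(table, concat_columns: dict[str, str]) -> str:
        # One MD5(...) hash column per concatenated field column
        hash_selects = [
            f"MD5({concat_col}) as __hash_{field_key}"
            for field_key, concat_col in concat_columns.items()
        ]
        return (
            f"SELECT *, {', '.join(hash_selects)} "
            f"FROM ({table.compile()}) AS __metaxy_temp"
        )

    # calculator = IbisDataVersionCalculator(
    #     backend=ibis.postgres.connect(...),  # assumed connection details
    #     hash_sql_generators={HashAlgorithm.MD5: postgres_md5_generator},
    # )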
metaxy/data_versioning/calculators/polars.py
@@ -0,0 +1,135 @@
+ """Polars implementation of the data version calculator."""
+
+ from collections.abc import Callable
+ from typing import TYPE_CHECKING, Any
+
+ import narwhals as nw
+ import polars as pl
+ import polars_hash as plh
+
+ from metaxy.data_versioning.calculators.base import DataVersionCalculator
+ from metaxy.data_versioning.hash_algorithms import HashAlgorithm
+
+ if TYPE_CHECKING:
+     from metaxy.models.feature_spec import FeatureSpec
+     from metaxy.models.plan import FeaturePlan
+
+
+ class PolarsDataVersionCalculator(DataVersionCalculator):
+     """Calculates data versions using polars-hash.
+
+     Accepts Narwhals LazyFrames and converts internally to Polars for hashing.
+     Supports all hash functions available in the polars-hash plugin.
+     The default is xxHash64 for cross-database compatibility.
+     """
+
+     # Map HashAlgorithm enum members to polars-hash functions
+     _HASH_FUNCTION_MAP: dict[HashAlgorithm, Callable[[pl.Expr], pl.Expr]] = {
+         HashAlgorithm.XXHASH64: lambda expr: expr.nchash.xxhash64(),  # pyright: ignore[reportAttributeAccessIssue]
+         HashAlgorithm.XXHASH32: lambda expr: expr.nchash.xxhash32(),  # pyright: ignore[reportAttributeAccessIssue]
+         HashAlgorithm.WYHASH: lambda expr: expr.nchash.wyhash(),  # pyright: ignore[reportAttributeAccessIssue]
+         HashAlgorithm.SHA256: lambda expr: expr.chash.sha2_256(),  # pyright: ignore[reportAttributeAccessIssue]
+         HashAlgorithm.MD5: lambda expr: expr.nchash.md5(),  # pyright: ignore[reportAttributeAccessIssue]
+     }
+
+     @property
+     def supported_algorithms(self) -> list[HashAlgorithm]:
+         """All algorithms supported by polars-hash."""
+         return list(self._HASH_FUNCTION_MAP.keys())
+
+     @property
+     def default_algorithm(self) -> HashAlgorithm:
+         """xxHash64 - fast and cross-database compatible."""
+         return HashAlgorithm.XXHASH64
+
+     def calculate_data_versions(
+         self,
+         joined_upstream: nw.LazyFrame[Any],
+         feature_spec: "FeatureSpec",
+         feature_plan: "FeaturePlan",
+         upstream_column_mapping: dict[str, str],
+         hash_algorithm: HashAlgorithm | None = None,
+     ) -> nw.LazyFrame[Any]:
+         """Calculate the data_version column using polars-hash.
+
+         Args:
+             joined_upstream: Narwhals LazyFrame with upstream data joined
+             feature_spec: Feature specification
+             feature_plan: Feature plan
+             upstream_column_mapping: Maps upstream key -> column name
+             hash_algorithm: Hash algorithm to use (default: xxHash64)
+
+         Returns:
+             Narwhals LazyFrame with a data_version column added
+         """
+         algo = hash_algorithm or self.default_algorithm
+
+         if algo not in self.supported_algorithms:
+             raise ValueError(
+                 f"Hash algorithm {algo} not supported by PolarsDataVersionCalculator. "
+                 f"Supported: {self.supported_algorithms}"
+             )
+
+         # Convert the Narwhals LazyFrame to a Polars LazyFrame.
+         # Must collect first (LazyFrame doesn't have to_polars; only DataFrame does).
+         pl_lazy = joined_upstream.collect().to_polars().lazy()
+
+         hash_fn = self._HASH_FUNCTION_MAP[algo]
+
+         # Build hash expressions for each field
+         field_exprs: dict[str, pl.Expr] = {}
+
+         for field in feature_spec.fields:
+             field_key_str = (
+                 field.key.to_string()
+                 if hasattr(field.key, "to_string")
+                 else "_".join(field.key)
+             )
+
+             field_deps = feature_plan.field_dependencies.get(field.key, {})
+
+             # Build hash components
+             components = [
+                 pl.lit(field_key_str),
+                 pl.lit(str(field.code_version)),
+             ]
+
+             # Add upstream data versions in deterministic order
+             for upstream_feature_key in sorted(field_deps.keys()):
+                 upstream_fields = field_deps[upstream_feature_key]
+                 upstream_key_str = (
+                     upstream_feature_key.to_string()
+                     if hasattr(upstream_feature_key, "to_string")
+                     else "_".join(upstream_feature_key)
+                 )
+
+                 data_version_col_name = upstream_column_mapping.get(
+                     upstream_key_str, "data_version"
+                 )
+
+                 for upstream_field in sorted(upstream_fields):
+                     upstream_field_str = (
+                         upstream_field.to_string()
+                         if hasattr(upstream_field, "to_string")
+                         else "_".join(upstream_field)
+                     )
+
+                     components.append(
+                         pl.lit(f"{upstream_key_str}/{upstream_field_str}")
+                     )
+                     components.append(
+                         pl.col(data_version_col_name).struct.field(upstream_field_str)
+                     )
+
+             # Concatenate and hash
+             concat_expr = plh.concat_str(*components, separator="|")
+             hashed = hash_fn(concat_expr).cast(pl.Utf8)
+             field_exprs[field_key_str] = hashed
+
+         # Create the data_version struct
+         data_version_expr = pl.struct(**field_exprs)  # type: ignore[call-overload]
+
+         result_pl = pl_lazy.with_columns(data_version_expr.alias("data_version"))
+
+         # Convert back to a Narwhals LazyFrame
+         return nw.from_native(result_pl, eager_only=False)
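
The hash recipe itself is easy to verify in isolation. A hedged, standalone sketch (sample column and field names are illustrative) that mirrors the per-field concatenation above - field key, code version, then each upstream "key/field" label plus its stored hash, joined with "|" - and applies xxHash64 via the polars-hash namespace:

    import polars as pl
    import polars_hash  # noqa: F401  (registers the .nchash / .chash namespaces)

    # One row of upstream metadata with a struct-typed data_version column
    df = pl.DataFrame({
        "id": [1],
        "data_version": [{"embedding": "a1b2c3"}],
    })

    components = [
        pl.lit("my_field"),                    # field key
        pl.lit("1"),                           # code version
        pl.lit("upstream_feature/embedding"),  # upstream label
        pl.col("data_version").struct.field("embedding"),  # upstream hash
    ]
    out = df.with_columns(
        pl.concat_str(components, separator="|")
        .nchash.xxhash64()
        .cast(pl.Utf8)
        .alias("field_hash")
    )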
metaxy/data_versioning/diff/__init__.py
@@ -0,0 +1,15 @@
+ """Metadata diff resolvers for identifying changed data versions."""
+
+ from metaxy.data_versioning.diff.base import (
+     DiffResult,
+     LazyDiffResult,
+     MetadataDiffResolver,
+ )
+ from metaxy.data_versioning.diff.narwhals import NarwhalsDiffResolver
+
+ __all__ = [
+     "DiffResult",
+     "LazyDiffResult",
+     "MetadataDiffResolver",
+     "NarwhalsDiffResolver",
+ ]