metaxy-0.0.0-py3-none-any.whl
This diff shows the contents of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
Note: this release of metaxy was flagged as potentially problematic.
- metaxy/__init__.py +61 -0
- metaxy/_testing.py +542 -0
- metaxy/_utils.py +16 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +76 -0
- metaxy/cli/context.py +71 -0
- metaxy/cli/graph.py +576 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +42 -0
- metaxy/cli/metadata.py +271 -0
- metaxy/cli/migrations.py +862 -0
- metaxy/cli/push.py +55 -0
- metaxy/config.py +450 -0
- metaxy/data_versioning/__init__.py +24 -0
- metaxy/data_versioning/calculators/__init__.py +13 -0
- metaxy/data_versioning/calculators/base.py +97 -0
- metaxy/data_versioning/calculators/duckdb.py +186 -0
- metaxy/data_versioning/calculators/ibis.py +225 -0
- metaxy/data_versioning/calculators/polars.py +135 -0
- metaxy/data_versioning/diff/__init__.py +15 -0
- metaxy/data_versioning/diff/base.py +150 -0
- metaxy/data_versioning/diff/narwhals.py +108 -0
- metaxy/data_versioning/hash_algorithms.py +19 -0
- metaxy/data_versioning/joiners/__init__.py +9 -0
- metaxy/data_versioning/joiners/base.py +70 -0
- metaxy/data_versioning/joiners/narwhals.py +235 -0
- metaxy/entrypoints.py +309 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/alembic.py +326 -0
- metaxy/ext/sqlmodel.py +172 -0
- metaxy/ext/sqlmodel_system_tables.py +139 -0
- metaxy/graph/__init__.py +21 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +399 -0
- metaxy/graph/diff/differ.py +740 -0
- metaxy/graph/diff/models.py +418 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +274 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +320 -0
- metaxy/graph/diff/rendering/rich.py +165 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +31 -0
- metaxy/metadata_store/_protocols.py +38 -0
- metaxy/metadata_store/base.py +1676 -0
- metaxy/metadata_store/clickhouse.py +161 -0
- metaxy/metadata_store/duckdb.py +167 -0
- metaxy/metadata_store/exceptions.py +43 -0
- metaxy/metadata_store/ibis.py +451 -0
- metaxy/metadata_store/memory.py +228 -0
- metaxy/metadata_store/sqlite.py +187 -0
- metaxy/metadata_store/system_tables.py +257 -0
- metaxy/migrations/__init__.py +34 -0
- metaxy/migrations/detector.py +153 -0
- metaxy/migrations/executor.py +208 -0
- metaxy/migrations/loader.py +260 -0
- metaxy/migrations/models.py +718 -0
- metaxy/migrations/ops.py +390 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +6 -0
- metaxy/models/constants.py +24 -0
- metaxy/models/feature.py +665 -0
- metaxy/models/feature_spec.py +105 -0
- metaxy/models/field.py +25 -0
- metaxy/models/plan.py +155 -0
- metaxy/models/types.py +157 -0
- metaxy/py.typed +0 -0
- metaxy-0.0.0.dist-info/METADATA +247 -0
- metaxy-0.0.0.dist-info/RECORD +75 -0
- metaxy-0.0.0.dist-info/WHEEL +4 -0
- metaxy-0.0.0.dist-info/entry_points.txt +3 -0
metaxy/data_versioning/calculators/duckdb.py
@@ -0,0 +1,186 @@
+"""DuckDB-specific data version calculator with extension management.
+
+This calculator extends IbisDataVersionCalculator to handle DuckDB-specific
+extension loading (e.g., hashfuncs for xxHash support).
+"""
+# pyright: reportImportCycles=false
+
+from typing import TYPE_CHECKING
+
+from metaxy.data_versioning.calculators.ibis import IbisDataVersionCalculator
+from metaxy.data_versioning.hash_algorithms import HashAlgorithm
+
+if TYPE_CHECKING:
+    import ibis
+
+    from metaxy.data_versioning.calculators.ibis import HashSQLGenerator
+    from metaxy.metadata_store.duckdb import (
+        ExtensionSpec,  # pyright: ignore[reportImportCycles]
+    )
+
+
+class DuckDBDataVersionCalculator(IbisDataVersionCalculator):
+    """DuckDB-specific calculator that manages extensions lazily.
+
+    This calculator:
+    1. Installs and loads DuckDB extensions on first use (lazy loading)
+    2. Supports xxHash64, xxHash32, and MD5 hash functions
+    3. Generates DuckDB-specific SQL for hash computation
+
+    The extension loading happens in __init__, which is only called when
+    native data version calculations are actually needed (not on store open).
+
+    Example:
+        >>> backend = ibis.duckdb.connect("metadata.db")
+        >>> calculator = DuckDBDataVersionCalculator(
+        ...     backend=backend,
+        ...     extensions=["hashfuncs"]
+        ... )
+        >>> # Extensions are now loaded and xxHash64 is available
+    """
+
+    def __init__(
+        self,
+        backend: "ibis.BaseBackend",
+        extensions: "list[ExtensionSpec | str] | None" = None,
+    ):
+        """Initialize DuckDB calculator and load extensions.
+
+        Args:
+            backend: DuckDB Ibis backend connection
+            extensions: List of DuckDB extensions to install/load.
+                Can be strings (from 'community' repo) or dicts with
+                'name' and optional 'repository' keys.
+
+        Example:
+            >>> extensions = ["hashfuncs"]  # Simple form
+            >>> extensions = [{"name": "spatial", "repository": "core_nightly"}]
+        """
+        self._backend = backend
+        self.extensions = extensions or []
+
+        # Load extensions immediately (lazy at calculator creation time)
+        self._load_extensions()
+
+        # Generate hash SQL generators for DuckDB
+        hash_sql_generators = self._generate_hash_sql_generators()
+
+        # Initialize parent with backend and generators
+        super().__init__(
+            backend=backend,
+            hash_sql_generators=hash_sql_generators,
+        )
+
+    def _load_extensions(self) -> None:
+        """Install and load DuckDB extensions.
+
+        This is called once when the calculator is created, which happens
+        lazily when native data version calculations are first needed.
+        """
+        if not self.extensions:
+            return
+
+        # Type narrowing: we know this is a DuckDB backend
+        from typing import Any, cast
+
+        backend = cast(
+            Any, self._backend
+        )  # DuckDB backend has raw_sql but not in ibis.BaseBackend stubs
+
+        for ext_spec in self.extensions:
+            if isinstance(ext_spec, str):
+                # Simple string form - install from community repo
+                ext_name = ext_spec
+                # Install and load extension
+                backend.raw_sql(f"INSTALL {ext_name}")
+                backend.raw_sql(f"LOAD {ext_name}")
+            else:
+                # Dict form with optional repository
+                ext_name = ext_spec.get("name", "")
+                ext_repo = ext_spec.get("repository", "community")
+
+                if ext_repo != "community":
+                    # Set custom repository
+                    backend.raw_sql(f"SET custom_extension_repository='{ext_repo}'")
+
+                # Install and load extension
+                backend.raw_sql(f"INSTALL {ext_name}")
+                backend.raw_sql(f"LOAD {ext_name}")
+
+    def _generate_hash_sql_generators(self) -> dict[HashAlgorithm, "HashSQLGenerator"]:
+        """Generate hash SQL generators for DuckDB.
+
+        DuckDB supports:
+        - MD5: Always available (built-in)
+        - XXHASH32, XXHASH64: Available when 'hashfuncs' extension is loaded
+
+        Returns:
+            Dictionary mapping HashAlgorithm to SQL generator functions
+        """
+        generators: dict[HashAlgorithm, HashSQLGenerator] = {}
+
+        # MD5 is always available
+        def md5_generator(table, concat_columns: dict[str, str]) -> str:
+            hash_selects: list[str] = []
+            for field_key, concat_col in concat_columns.items():
+                hash_col = f"__hash_{field_key}"
+                # md5() in DuckDB returns hex string, cast to VARCHAR for consistency
+                hash_expr = f"CAST(md5({concat_col}) AS VARCHAR)"
+                hash_selects.append(f"{hash_expr} as {hash_col}")
+
+            hash_clause = ", ".join(hash_selects)
+            table_sql = table.compile()
+            return f"SELECT *, {hash_clause} FROM ({table_sql}) AS __metaxy_temp"
+
+        generators[HashAlgorithm.MD5] = md5_generator
+
+        # Check if hashfuncs extension is in the list
+        extension_names = [
+            ext if isinstance(ext, str) else ext.get("name", "")
+            for ext in self.extensions
+        ]
+
+        if "hashfuncs" in extension_names:
+
+            def xxhash32_generator(table, concat_columns: dict[str, str]) -> str:
+                hash_selects: list[str] = []
+                for field_key, concat_col in concat_columns.items():
+                    hash_col = f"__hash_{field_key}"
+                    hash_expr = f"CAST(xxh32({concat_col}) AS VARCHAR)"
+                    hash_selects.append(f"{hash_expr} as {hash_col}")
+
+                hash_clause = ", ".join(hash_selects)
+                table_sql = table.compile()
+                return f"SELECT *, {hash_clause} FROM ({table_sql}) AS __metaxy_temp"
+
+            def xxhash64_generator(table, concat_columns: dict[str, str]) -> str:
+                hash_selects: list[str] = []
+                for field_key, concat_col in concat_columns.items():
+                    hash_col = f"__hash_{field_key}"
+                    hash_expr = f"CAST(xxh64({concat_col}) AS VARCHAR)"
+                    hash_selects.append(f"{hash_expr} as {hash_col}")
+
+                hash_clause = ", ".join(hash_selects)
+                table_sql = table.compile()
+                return f"SELECT *, {hash_clause} FROM ({table_sql}) AS __metaxy_temp"
+
+            generators[HashAlgorithm.XXHASH32] = xxhash32_generator
+            generators[HashAlgorithm.XXHASH64] = xxhash64_generator
+
+        return generators
+
+    @property
+    def supported_algorithms(self) -> list[HashAlgorithm]:
+        """Algorithms supported by this calculator based on loaded extensions."""
+        # Dynamically determine based on what was actually loaded
+        return list(self._hash_sql_generators.keys())
+
+    @property
+    def default_algorithm(self) -> HashAlgorithm:
+        """Default hash algorithm for DuckDB.
+
+        Uses XXHASH64 if hashfuncs extension is loaded, otherwise MD5.
+        """
+        if HashAlgorithm.XXHASH64 in self.supported_algorithms:
+            return HashAlgorithm.XXHASH64
+        return HashAlgorithm.MD5
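For orientation, a minimal usage sketch of the calculator above. The in-memory connection is an assumption, and installing hashfuncs requires access to DuckDB's extension repository at construction time:

import ibis

from metaxy.data_versioning.calculators.duckdb import DuckDBDataVersionCalculator

backend = ibis.duckdb.connect()  # hypothetical in-memory DuckDB connection

# String entries install from the default repository; dict entries let
# _load_extensions point at another repository via SET custom_extension_repository
# before issuing INSTALL/LOAD.
calculator = DuckDBDataVersionCalculator(
    backend=backend,
    extensions=["hashfuncs"],
)

# With "hashfuncs" listed, supported_algorithms gains XXHASH32/XXHASH64 and
# default_algorithm returns HashAlgorithm.XXHASH64; without it, MD5.
print(calculator.default_algorithm)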
metaxy/data_versioning/calculators/ibis.py
@@ -0,0 +1,225 @@
+"""Ibis-based data version calculator using native SQL hash functions.
+
+This calculator uses Ibis to generate backend-specific SQL for hash computation,
+executing entirely in the database without pulling data into memory.
+"""
+
+from typing import TYPE_CHECKING, Any, Protocol
+
+import narwhals as nw
+
+from metaxy.data_versioning.calculators.base import DataVersionCalculator
+from metaxy.data_versioning.hash_algorithms import HashAlgorithm
+
+if TYPE_CHECKING:
+    import ibis
+    import ibis.expr.types
+    import ibis.expr.types.relations
+
+    from metaxy.models.feature_spec import FeatureSpec
+    from metaxy.models.plan import FeaturePlan
+
+
+class HashSQLGenerator(Protocol):
+    """Protocol for backend-specific hash SQL generation.
+
+    Takes an Ibis table with concatenated columns and returns SQL that adds hash columns.
+    """
+
+    def __call__(
+        self, table: "ibis.expr.types.Table", concat_columns: dict[str, str]
+    ) -> str:
+        """Generate SQL query to compute hash columns.
+
+        Args:
+            table: Input Ibis table with concatenated columns
+            concat_columns: Maps field_key -> concat_column_name
+
+        Returns:
+            SQL query string that selects all columns plus hash columns
+        """
+        ...
+
+
+class IbisDataVersionCalculator(DataVersionCalculator):
+    """Calculates data versions using native SQL hash functions via Ibis.
+
+    This calculator:
+    1. Accepts Narwhals LazyFrame as input
+    2. Converts to Ibis table internally
+    3. Builds concatenated columns using Ibis expressions
+    4. Applies backend-specific SQL hash functions
+    5. Returns Narwhals LazyFrame
+
+    Different SQL backends have different hash function names and signatures,
+    so hash functions are provided as SQL template generators per backend.
+
+    Example hash SQL generators:
+        DuckDB: SELECT *, CAST(xxh64(concat_col) AS VARCHAR) as hash FROM table
+        ClickHouse: SELECT *, CAST(xxHash64(concat_col) AS String) as hash FROM table
+        PostgreSQL: SELECT *, MD5(concat_col) as hash FROM table
+    """
+
+    def __init__(
+        self,
+        backend: "ibis.BaseBackend",
+        hash_sql_generators: dict[HashAlgorithm, HashSQLGenerator],
+    ):
+        """Initialize calculator with Ibis backend and hash SQL generators.
+
+        Args:
+            backend: Ibis backend connection for SQL execution
+            hash_sql_generators: Map from HashAlgorithm to SQL generator function
+        """
+        self._backend = backend
+        self._hash_sql_generators = hash_sql_generators
+
+    @property
+    def supported_algorithms(self) -> list[HashAlgorithm]:
+        """Algorithms supported by this calculator."""
+        return list(self._hash_sql_generators.keys())
+
+    @property
+    def default_algorithm(self) -> HashAlgorithm:
+        """Default hash algorithm.
+
+        Base implementation returns XXHASH64 if available, otherwise first available.
+        """
+        if HashAlgorithm.XXHASH64 in self.supported_algorithms:
+            return HashAlgorithm.XXHASH64
+        return self.supported_algorithms[0]
+
+    def calculate_data_versions(
+        self,
+        joined_upstream: nw.LazyFrame[Any],
+        feature_spec: "FeatureSpec",
+        feature_plan: "FeaturePlan",
+        upstream_column_mapping: dict[str, str],
+        hash_algorithm: HashAlgorithm | None = None,
+    ) -> nw.LazyFrame[Any]:
+        """Calculate data_version using SQL hash functions.
+
+        Args:
+            joined_upstream: Narwhals LazyFrame with upstream data joined
+            feature_spec: Feature specification
+            feature_plan: Feature plan
+            upstream_column_mapping: Maps upstream key -> column name
+            hash_algorithm: Hash to use
+
+        Returns:
+            Narwhals LazyFrame with data_version column added
+        """
+        import ibis
+
+        algo = hash_algorithm or self.default_algorithm
+
+        if algo not in self.supported_algorithms:
+            raise ValueError(
+                f"Hash algorithm {algo} not supported by {self.__class__.__name__}. "
+                f"Supported: {self.supported_algorithms}"
+            )
+
+        # Convert Narwhals LazyFrame to Ibis table
+        import ibis.expr.types
+
+        native = joined_upstream.to_native()
+
+        # Validate that we have an Ibis table
+        if not isinstance(native, ibis.expr.types.Table):
+            # Not an Ibis table - this calculator only works with Ibis-backed data
+            raise TypeError(
+                f"IbisDataVersionCalculator requires Ibis-backed data. "
+                f"Got {type(native)} instead. "
+                f"This usually means the metadata store is not using Ibis tables. "
+                f"Use PolarsDataVersionCalculator for non-Ibis stores."
+            )
+
+        ibis_table: ibis.expr.types.Table = native  # type: ignore[assignment]
+
+        # Get the hash SQL generator
+        hash_sql_gen = self._hash_sql_generators[algo]
+
+        # Build concatenated string columns for each field (using Ibis expressions)
+        concat_columns = {}
+
+        for field in feature_spec.fields:
+            field_key_str = (
+                field.key.to_string()
+                if hasattr(field.key, "to_string")
+                else "__".join(field.key)
+            )
+
+            field_deps = feature_plan.field_dependencies.get(field.key, {})
+
+            # Build hash components (same structure as Polars)
+            components = [
+                ibis.literal(field_key_str),
+                ibis.literal(str(field.code_version)),
+            ]
+
+            # Add upstream data versions in deterministic order
+            for upstream_feature_key in sorted(field_deps.keys()):
+                upstream_fields = field_deps[upstream_feature_key]
+                upstream_key_str = (
+                    upstream_feature_key.to_string()
+                    if hasattr(upstream_feature_key, "to_string")
+                    else "__".join(upstream_feature_key)
+                )
+
+                data_version_col_name = upstream_column_mapping.get(
+                    upstream_key_str, "data_version"
+                )
+
+                for upstream_field in sorted(upstream_fields):
+                    upstream_field_str = (
+                        upstream_field.to_string()
+                        if hasattr(upstream_field, "to_string")
+                        else "__".join(upstream_field)
+                    )
+
+                    components.append(
+                        ibis.literal(f"{upstream_key_str}/{upstream_field_str}")
+                    )
+                    # Access struct field for upstream field's hash
+                    components.append(
+                        ibis_table[data_version_col_name][upstream_field_str]
+                    )
+
+            # Concatenate all components with separator
+            concat_expr = components[0]
+            for component in components[1:]:
+                concat_expr = concat_expr.concat(ibis.literal("|")).concat(component)  # pyright: ignore[reportAttributeAccessIssue]
+
+            # Store concat column for this field
+            concat_col_name = f"__concat_{field_key_str}"
+            concat_columns[field_key_str] = concat_col_name
+            ibis_table = ibis_table.mutate(**{concat_col_name: concat_expr})
+
+        # Generate SQL for hashing all concat columns
+        hash_sql = hash_sql_gen(ibis_table, concat_columns)
+
+        # Execute SQL to get table with hash columns
+        result_table = self._backend.sql(hash_sql)  # pyright: ignore[reportAttributeAccessIssue]
+
+        # Build data_version struct from hash columns
+        hash_col_names = [f"__hash_{k}" for k in concat_columns.keys()]
+        field_keys = list(concat_columns.keys())
+
+        # Create struct column from hash columns
+        struct_fields = {
+            field_key: result_table[f"__hash_{field_key}"] for field_key in field_keys
+        }
+
+        # Drop temp columns and add data_version
+        cols_to_keep = [
+            c
+            for c in result_table.columns
+            if c not in concat_columns.values() and c not in hash_col_names
+        ]
+
+        result_table = result_table.select(
+            *cols_to_keep, data_version=ibis.struct(struct_fields)
+        )
+
+        # Convert back to Narwhals LazyFrame
+        return nw.from_native(result_table, eager_only=False)
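The HashSQLGenerator protocol above is the extension point for new backends. A minimal sketch of a PostgreSQL generator, following the MD5 shape named in the class docstring; the generator body mirrors the DuckDB ones, and the connection arguments are assumptions:

import ibis

from metaxy.data_versioning.calculators.ibis import IbisDataVersionCalculator
from metaxy.data_versioning.hash_algorithms import HashAlgorithm


def postgres_md5_generator(table, concat_columns: dict[str, str]) -> str:
    # One hash column per concatenated field column, aliased __hash_<field_key>
    # so the base class can fold it into the data_version struct.
    hash_selects = [
        f"MD5({concat_col}) AS __hash_{field_key}"
        for field_key, concat_col in concat_columns.items()
    ]
    table_sql = table.compile()  # Ibis renders the expression tree to backend SQL
    return f"SELECT *, {', '.join(hash_selects)} FROM ({table_sql}) AS __metaxy_temp"


calculator = IbisDataVersionCalculator(
    backend=ibis.postgres.connect(database="metaxy"),  # hypothetical connection
    hash_sql_generators={HashAlgorithm.MD5: postgres_md5_generator},
)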
metaxy/data_versioning/calculators/polars.py
@@ -0,0 +1,135 @@
+"""Polars implementation of data version calculator."""
+
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any
+
+import narwhals as nw
+import polars as pl
+import polars_hash as plh
+
+from metaxy.data_versioning.calculators.base import DataVersionCalculator
+from metaxy.data_versioning.hash_algorithms import HashAlgorithm
+
+if TYPE_CHECKING:
+    from metaxy.models.feature_spec import FeatureSpec
+    from metaxy.models.plan import FeaturePlan
+
+
+class PolarsDataVersionCalculator(DataVersionCalculator):
+    """Calculates data versions using polars-hash.
+
+    Accepts Narwhals LazyFrames and converts internally to Polars for hashing.
+    Supports all hash functions available in polars-hash plugin.
+    Default is xxHash64 for cross-database compatibility.
+    """
+
+    # Map HashAlgorithm enum to polars-hash functions
+    _HASH_FUNCTION_MAP: dict[HashAlgorithm, Callable[[pl.Expr], pl.Expr]] = {
+        HashAlgorithm.XXHASH64: lambda expr: expr.nchash.xxhash64(),  # pyright: ignore[reportAttributeAccessIssue]
+        HashAlgorithm.XXHASH32: lambda expr: expr.nchash.xxhash32(),  # pyright: ignore[reportAttributeAccessIssue]
+        HashAlgorithm.WYHASH: lambda expr: expr.nchash.wyhash(),  # pyright: ignore[reportAttributeAccessIssue]
+        HashAlgorithm.SHA256: lambda expr: expr.chash.sha2_256(),  # pyright: ignore[reportAttributeAccessIssue]
+        HashAlgorithm.MD5: lambda expr: expr.nchash.md5(),  # pyright: ignore[reportAttributeAccessIssue]
+    }
+
+    @property
+    def supported_algorithms(self) -> list[HashAlgorithm]:
+        """All algorithms supported by polars-hash."""
+        return list(self._HASH_FUNCTION_MAP.keys())
+
+    @property
+    def default_algorithm(self) -> HashAlgorithm:
+        """xxHash64 - fast and cross-database compatible."""
+        return HashAlgorithm.XXHASH64
+
+    def calculate_data_versions(
+        self,
+        joined_upstream: nw.LazyFrame[Any],
+        feature_spec: "FeatureSpec",
+        feature_plan: "FeaturePlan",
+        upstream_column_mapping: dict[str, str],
+        hash_algorithm: HashAlgorithm | None = None,
+    ) -> nw.LazyFrame[Any]:
+        """Calculate data_version using polars-hash.
+
+        Args:
+            joined_upstream: Narwhals LazyFrame with upstream data joined
+            feature_spec: Feature specification
+            feature_plan: Feature plan
+            upstream_column_mapping: Maps upstream key -> column name
+            hash_algorithm: Hash to use (default: xxHash64)
+
+        Returns:
+            Narwhals LazyFrame with data_version column added
+        """
+        algo = hash_algorithm or self.default_algorithm
+
+        if algo not in self.supported_algorithms:
+            raise ValueError(
+                f"Hash algorithm {algo} not supported by PolarsDataVersionCalculator. "
+                f"Supported: {self.supported_algorithms}"
+            )
+
+        # Convert Narwhals LazyFrame to Polars LazyFrame
+        # Must collect first (LazyFrame doesn't have to_polars, only DataFrame does)
+        pl_lazy = joined_upstream.collect().to_polars().lazy()
+
+        hash_fn = self._HASH_FUNCTION_MAP[algo]
+
+        # Build hash expressions for each field
+        field_exprs = {}
+
+        for field in feature_spec.fields:
+            field_key_str = (
+                field.key.to_string()
+                if hasattr(field.key, "to_string")
+                else "_".join(field.key)
+            )
+
+            field_deps = feature_plan.field_dependencies.get(field.key, {})
+
+            # Build hash components
+            components = [
+                pl.lit(field_key_str),
+                pl.lit(str(field.code_version)),
+            ]
+
+            # Add upstream data versions in deterministic order
+            for upstream_feature_key in sorted(field_deps.keys()):
+                upstream_fields = field_deps[upstream_feature_key]
+                upstream_key_str = (
+                    upstream_feature_key.to_string()
+                    if hasattr(upstream_feature_key, "to_string")
+                    else "_".join(upstream_feature_key)
+                )
+
+                data_version_col_name = upstream_column_mapping.get(
+                    upstream_key_str, "data_version"
+                )
+
+                for upstream_field in sorted(upstream_fields):
+                    upstream_field_str = (
+                        upstream_field.to_string()
+                        if hasattr(upstream_field, "to_string")
+                        else "_".join(upstream_field)
+                    )
+
+                    components.append(
+                        pl.lit(f"{upstream_key_str}/{upstream_field_str}")
+                    )
+                    components.append(
+                        pl.col(data_version_col_name).struct.field(upstream_field_str)
+                    )
+
+            # Concatenate and hash
+            concat_expr = plh.concat_str(*components, separator="|")
+            hashed = hash_fn(concat_expr).cast(pl.Utf8)
+            field_exprs[field_key_str] = hashed
+
+        # Create data_version struct
+        data_version_expr = pl.struct(**field_exprs)  # type: ignore[call-overload]
+
+        result_pl = pl_lazy.with_columns(data_version_expr.alias("data_version"))
+
+        # Convert back to Narwhals LazyFrame
+        return nw.from_native(result_pl, eager_only=False)
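Stripped of the feature-spec plumbing, each field's computation above reduces to concat-then-hash over string components. A standalone sketch with toy data (the column names and values are assumptions), using the same polars-hash calls as the map at the top of the class:

import polars as pl
import polars_hash as plh

df = pl.DataFrame(
    {
        "field_key": ["video__frames"],
        "code_version": ["1"],
        "upstream_hash": ["a1b2c3"],
    }
)

result = df.with_columns(
    plh.concat_str(  # join components with the same "|" separator
        pl.col("field_key"),
        pl.col("code_version"),
        pl.col("upstream_hash"),
        separator="|",
    )
    .nchash.xxhash64()  # non-cryptographic xxHash64, returns UInt64
    .cast(pl.Utf8)  # the calculator stores hashes as strings
    .alias("data_version")
)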
metaxy/data_versioning/diff/__init__.py
@@ -0,0 +1,15 @@
+"""Metadata diff resolvers for identifying changed data versions."""
+
+from metaxy.data_versioning.diff.base import (
+    DiffResult,
+    LazyDiffResult,
+    MetadataDiffResolver,
+)
+from metaxy.data_versioning.diff.narwhals import NarwhalsDiffResolver
+
+__all__ = [
+    "DiffResult",
+    "LazyDiffResult",
+    "MetadataDiffResolver",
+    "NarwhalsDiffResolver",
+]
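These re-exports define the public import surface of the diff subpackage, so downstream code imports from the package rather than the private modules; constructor signatures are not shown in this diff:

from metaxy.data_versioning.diff import (
    DiffResult,
    LazyDiffResult,
    MetadataDiffResolver,
    NarwhalsDiffResolver,
)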