metaxy 0.0.1.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
metaxy/utils/hashing.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""Hash truncation utilities for Metaxy.
|
|
2
|
+
|
|
3
|
+
This module provides utilities for globally truncating hash outputs to reduce
|
|
4
|
+
storage requirements and improve readability. Hash truncation is configured
|
|
5
|
+
through the global MetaxyConfig.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Any, TypeVar, overload
|
|
9
|
+
|
|
10
|
+
import narwhals as nw
|
|
11
|
+
import polars as pl
|
|
12
|
+
|
|
13
|
+
# Minimum allowed truncation length
|
|
14
|
+
MIN_TRUNCATION_LENGTH = 8
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def truncate_hash(hash_str: str) -> str:
    """Truncate a hash string to the globally configured length.

    The length is obtained from ``get_hash_truncation_length()``, which reads
    ``MetaxyConfig.hash_truncation_length`` and falls back to 64 when that
    setting is unset. Strings that are already no longer than the limit are
    returned unchanged.

    Args:
        hash_str: The hash string to truncate

    Returns:
        The first ``length`` characters of ``hash_str`` (the whole string if
        it is shorter than ``length``)

    Examples:
        ```py
        # With global config set to truncation_length=12:
        truncate_hash("a" * 64)
        # 'aaaaaaaaaaaa'

        # With no truncation setting (limit falls back to 64):
        truncate_hash("abc123")
        # 'abc123'
        ```
    """
    # Slicing already returns the whole string when it is shorter than the
    # limit (and also when the bound is None), so no separate guards are
    # needed around it.
    return hash_str[: get_hash_truncation_length()]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_hash_truncation_length() -> int:
    """Return the hash truncation length currently in effect.

    Reads ``hash_truncation_length`` from the active ``MetaxyConfig``.

    Returns:
        The configured truncation length, or 64 when the configured value
        is falsy (i.e. no truncation is configured)

    Example:
        ```py
        # With MetaxyConfig.hash_truncation_length = 16
        get_hash_truncation_length()
        # 16
        ```
    """
    # NOTE(review): imported inside the function rather than at module top,
    # presumably to avoid a circular import with metaxy.config; confirm.
    from metaxy.config import MetaxyConfig

    configured_length = MetaxyConfig.get().hash_truncation_length
    return configured_length or 64
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def ensure_hash_compatibility(hash1: str, hash2: str) -> bool:
    """Tell whether two hashes may refer to the same value despite truncation.

    The hashes are considered compatible when they are identical, or when
    the shorter one is a prefix of the longer one (i.e. it could be a
    truncated form of it). This allows comparing hashes that were truncated
    at different lengths.

    Args:
        hash1: First hash to compare
        hash2: Second hash to compare

    Returns:
        True if hashes are compatible, False otherwise

    Examples:
        ```py
        ensure_hash_compatibility("abc123", "abc123")
        # True

        ensure_hash_compatibility("abc123456789", "abc12345")
        # True  # Second is truncation of first

        ensure_hash_compatibility("abc123", "def456")
        # False  # Different hashes
        ```
    """
    if hash1 == hash2:
        return True

    # Pick the shorter string as the candidate prefix; on equal lengths the
    # first argument is used as the prefix.
    if len(hash1) <= len(hash2):
        candidate, reference = hash1, hash2
    else:
        candidate, reference = hash2, hash1
    return reference.startswith(candidate)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@nw.narwhalify
def truncate_string_column(
    df: nw.DataFrame[Any], column_name: str
) -> nw.DataFrame[Any]:
    """Truncate hash values in a DataFrame column.

    The length is obtained from ``get_hash_truncation_length()``, which reads
    ``MetaxyConfig.hash_truncation_length`` and falls back to 64 when that
    setting is unset, so the column is always sliced to at most that many
    characters.

    Args:
        df: DataFrame containing the hash column
        column_name: Name of the column containing hash strings

    Returns:
        DataFrame with the specified column replaced by its values cut to
        the first ``length`` characters

    Example:
        ```py
        # With global config set to truncation_length=12:
        df = nw.from_native(pd.DataFrame({"hash": ["a" * 64, "b" * 64]}))
        result = truncate_string_column(df, "hash")
        # result["hash"] contains ["aaaaaaaaaaaa", "bbbbbbbbbbbb"]
        ```
    """
    length = get_hash_truncation_length()

    # Replace the column in place (same name) with its sliced values.
    truncated = nw.col(column_name).str.slice(0, length).alias(column_name)
    return df.with_columns(truncated)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
PolarsFrameT = TypeVar("PolarsFrameT", pl.DataFrame, pl.LazyFrame)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@overload
def truncate_struct_column(df: pl.DataFrame, struct_column: str) -> pl.DataFrame: ...


@overload
def truncate_struct_column(df: pl.LazyFrame, struct_column: str) -> pl.LazyFrame: ...


def truncate_struct_column(
    df: pl.DataFrame | pl.LazyFrame, struct_column: str
) -> pl.DataFrame | pl.LazyFrame:
    """Truncate hash values within a struct column.

    The length is obtained from ``get_hash_truncation_length()``, which reads
    ``MetaxyConfig.hash_truncation_length`` and falls back to 64 when that
    setting is unset. Every string field of the struct is cut to at most
    that many characters; non-string fields are carried over unchanged.

    The struct's field names are taken from the frame's schema, so a
    LazyFrame is not executed and a null struct in the first row does not
    prevent the remaining rows from being truncated. The struct is rebuilt
    in a single expression, without temporary top-level columns, so other
    columns that share a name with a struct field are left untouched.

    Args:
        df: Polars DataFrame or LazyFrame containing the struct column
        struct_column: Name of the struct column containing hash values

    Returns:
        Frame of the same kind as ``df`` with truncated hash values within
        the struct. The frame is returned unchanged when the column is not
        of struct dtype or the struct has no fields.

    Raises:
        TypeError: If ``df`` is not a Polars DataFrame or LazyFrame
        polars.exceptions.ColumnNotFoundError: If ``struct_column`` is not
            present in the frame

    Example:
        ```py
        # With global config set to truncation_length=12:
        df = pl.DataFrame({
            "metaxy_provenance_by_field": [{"field1": "a" * 64, "field2": "b" * 64}]
        })
        result = truncate_struct_column(df, "metaxy_provenance_by_field")
        # result["metaxy_provenance_by_field"] contains [{"field1": "aaaaaaaaaaaa", "field2": "bbbbbbbbbbbb"}]
        ```
    """
    length = get_hash_truncation_length()

    # Only handle Polars DataFrames and LazyFrames (structs are Polars-only)
    if not isinstance(df, (pl.DataFrame, pl.LazyFrame)):
        raise TypeError(
            f"truncate_struct_column only supports Polars DataFrame/LazyFrame, got {type(df)}"
        )

    # Resolve the schema instead of materialising a row: this keeps lazy
    # frames lazy and does not depend on the contents of the first row.
    schema = df.collect_schema() if isinstance(df, pl.LazyFrame) else df.schema
    if struct_column not in schema:
        raise pl.exceptions.ColumnNotFoundError(struct_column)

    struct_dtype = schema[struct_column]
    if not isinstance(struct_dtype, pl.Struct) or not struct_dtype.fields:
        # Nothing to truncate (e.g. an empty frame whose column has no
        # struct dtype yet).
        return df

    # Build one expression per field, truncating only the string fields.
    rebuilt_fields = []
    for field in struct_dtype.fields:
        value = pl.col(struct_column).struct.field(field.name)
        if field.dtype == pl.String:
            value = value.str.slice(0, length)
        rebuilt_fields.append(value.alias(field.name))

    # Reassemble the struct directly under its original name; no temporary
    # columns are added to (or dropped from) the frame.
    return df.with_columns(pl.struct(rebuilt_fields).alias(struct_column))
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Provenance tracking system for Metaxy.
|
|
2
|
+
|
|
3
|
+
This package provides a unified interface for tracking field and sample-level provenance
|
|
4
|
+
across different backend implementations (Polars, DuckDB, ClickHouse, etc).
|
|
5
|
+
|
|
6
|
+
The VersioningEngine is the core abstraction that:
|
|
7
|
+
1. Joins upstream feature metadata
|
|
8
|
+
2. Calculates field-level provenance hashes
|
|
9
|
+
3. Assembles sample-level provenance
|
|
10
|
+
4. Compares with existing metadata to find incremental updates
|
|
11
|
+
|
|
12
|
+
Backend-specific implementations:
|
|
13
|
+
- PolarsVersioningEngine: Uses polars_hash plugin, may materialize lazy frames
|
|
14
|
+
- IbisVersioningEngine: Base class for SQL backends, stays completely lazy
|
|
15
|
+
- DuckDBVersioningEngine: DuckDB-specific hash functions (xxHash via hashfuncs extension)
|
|
16
|
+
- ClickHouseVersioningEngine: ClickHouse-specific hash functions (native support)
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from metaxy.versioning.engine import (
|
|
20
|
+
RenamedDataFrame,
|
|
21
|
+
VersioningEngine,
|
|
22
|
+
)
|
|
23
|
+
from metaxy.versioning.types import HashAlgorithm, Increment, LazyIncrement
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"VersioningEngine",
|
|
27
|
+
"RenamedDataFrame",
|
|
28
|
+
"HashAlgorithm",
|
|
29
|
+
"Increment",
|
|
30
|
+
"LazyIncrement",
|
|
31
|
+
]
|