metaxy 0.0.1.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. metaxy/__init__.py +170 -0
  2. metaxy/_packaging.py +96 -0
  3. metaxy/_testing/__init__.py +55 -0
  4. metaxy/_testing/config.py +43 -0
  5. metaxy/_testing/metaxy_project.py +780 -0
  6. metaxy/_testing/models.py +111 -0
  7. metaxy/_testing/parametric/__init__.py +13 -0
  8. metaxy/_testing/parametric/metadata.py +664 -0
  9. metaxy/_testing/pytest_helpers.py +74 -0
  10. metaxy/_testing/runbook.py +533 -0
  11. metaxy/_utils.py +35 -0
  12. metaxy/_version.py +1 -0
  13. metaxy/cli/app.py +97 -0
  14. metaxy/cli/console.py +13 -0
  15. metaxy/cli/context.py +167 -0
  16. metaxy/cli/graph.py +610 -0
  17. metaxy/cli/graph_diff.py +290 -0
  18. metaxy/cli/list.py +46 -0
  19. metaxy/cli/metadata.py +317 -0
  20. metaxy/cli/migrations.py +999 -0
  21. metaxy/cli/utils.py +268 -0
  22. metaxy/config.py +680 -0
  23. metaxy/entrypoints.py +296 -0
  24. metaxy/ext/__init__.py +1 -0
  25. metaxy/ext/dagster/__init__.py +54 -0
  26. metaxy/ext/dagster/constants.py +10 -0
  27. metaxy/ext/dagster/dagster_type.py +156 -0
  28. metaxy/ext/dagster/io_manager.py +200 -0
  29. metaxy/ext/dagster/metaxify.py +512 -0
  30. metaxy/ext/dagster/observable.py +115 -0
  31. metaxy/ext/dagster/resources.py +27 -0
  32. metaxy/ext/dagster/selection.py +73 -0
  33. metaxy/ext/dagster/table_metadata.py +417 -0
  34. metaxy/ext/dagster/utils.py +462 -0
  35. metaxy/ext/sqlalchemy/__init__.py +23 -0
  36. metaxy/ext/sqlalchemy/config.py +29 -0
  37. metaxy/ext/sqlalchemy/plugin.py +353 -0
  38. metaxy/ext/sqlmodel/__init__.py +13 -0
  39. metaxy/ext/sqlmodel/config.py +29 -0
  40. metaxy/ext/sqlmodel/plugin.py +499 -0
  41. metaxy/graph/__init__.py +29 -0
  42. metaxy/graph/describe.py +325 -0
  43. metaxy/graph/diff/__init__.py +21 -0
  44. metaxy/graph/diff/diff_models.py +446 -0
  45. metaxy/graph/diff/differ.py +769 -0
  46. metaxy/graph/diff/models.py +443 -0
  47. metaxy/graph/diff/rendering/__init__.py +18 -0
  48. metaxy/graph/diff/rendering/base.py +323 -0
  49. metaxy/graph/diff/rendering/cards.py +188 -0
  50. metaxy/graph/diff/rendering/formatter.py +805 -0
  51. metaxy/graph/diff/rendering/graphviz.py +246 -0
  52. metaxy/graph/diff/rendering/mermaid.py +326 -0
  53. metaxy/graph/diff/rendering/rich.py +169 -0
  54. metaxy/graph/diff/rendering/theme.py +48 -0
  55. metaxy/graph/diff/traversal.py +247 -0
  56. metaxy/graph/status.py +329 -0
  57. metaxy/graph/utils.py +58 -0
  58. metaxy/metadata_store/__init__.py +32 -0
  59. metaxy/metadata_store/_ducklake_support.py +419 -0
  60. metaxy/metadata_store/base.py +1792 -0
  61. metaxy/metadata_store/bigquery.py +354 -0
  62. metaxy/metadata_store/clickhouse.py +184 -0
  63. metaxy/metadata_store/delta.py +371 -0
  64. metaxy/metadata_store/duckdb.py +446 -0
  65. metaxy/metadata_store/exceptions.py +61 -0
  66. metaxy/metadata_store/ibis.py +542 -0
  67. metaxy/metadata_store/lancedb.py +391 -0
  68. metaxy/metadata_store/memory.py +292 -0
  69. metaxy/metadata_store/system/__init__.py +57 -0
  70. metaxy/metadata_store/system/events.py +264 -0
  71. metaxy/metadata_store/system/keys.py +9 -0
  72. metaxy/metadata_store/system/models.py +129 -0
  73. metaxy/metadata_store/system/storage.py +957 -0
  74. metaxy/metadata_store/types.py +10 -0
  75. metaxy/metadata_store/utils.py +104 -0
  76. metaxy/metadata_store/warnings.py +36 -0
  77. metaxy/migrations/__init__.py +32 -0
  78. metaxy/migrations/detector.py +291 -0
  79. metaxy/migrations/executor.py +516 -0
  80. metaxy/migrations/generator.py +319 -0
  81. metaxy/migrations/loader.py +231 -0
  82. metaxy/migrations/models.py +528 -0
  83. metaxy/migrations/ops.py +447 -0
  84. metaxy/models/__init__.py +0 -0
  85. metaxy/models/bases.py +12 -0
  86. metaxy/models/constants.py +139 -0
  87. metaxy/models/feature.py +1335 -0
  88. metaxy/models/feature_spec.py +338 -0
  89. metaxy/models/field.py +263 -0
  90. metaxy/models/fields_mapping.py +307 -0
  91. metaxy/models/filter_expression.py +297 -0
  92. metaxy/models/lineage.py +285 -0
  93. metaxy/models/plan.py +232 -0
  94. metaxy/models/types.py +475 -0
  95. metaxy/py.typed +0 -0
  96. metaxy/utils/__init__.py +1 -0
  97. metaxy/utils/constants.py +2 -0
  98. metaxy/utils/exceptions.py +23 -0
  99. metaxy/utils/hashing.py +230 -0
  100. metaxy/versioning/__init__.py +31 -0
  101. metaxy/versioning/engine.py +656 -0
  102. metaxy/versioning/feature_dep_transformer.py +151 -0
  103. metaxy/versioning/ibis.py +249 -0
  104. metaxy/versioning/lineage_handler.py +205 -0
  105. metaxy/versioning/polars.py +189 -0
  106. metaxy/versioning/renamed_df.py +35 -0
  107. metaxy/versioning/types.py +63 -0
  108. metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
  109. metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
  110. metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
  111. metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,230 @@
1
+ """Hash truncation utilities for Metaxy.
2
+
3
+ This module provides utilities for globally truncating hash outputs to reduce
4
+ storage requirements and improve readability. Hash truncation is configured
5
+ through the global MetaxyConfig.
6
+ """
7
+
8
+ from typing import Any, TypeVar, overload
9
+
10
+ import narwhals as nw
11
+ import polars as pl
12
+
13
# Minimum allowed truncation length.
# NOTE(review): not referenced by any function in this module; presumably
# enforced wherever the configuration value is validated — confirm.
MIN_TRUNCATION_LENGTH = 8


def truncate_hash(hash_str: str) -> str:
    """Shorten a hash string to the globally configured truncation length.

    The limit is read from ``get_hash_truncation_length()`` on every call.
    A hash that is already no longer than the limit is returned unchanged,
    and a limit of ``None`` leaves the hash untouched as well. Note that
    ``get_hash_truncation_length()`` falls back to 64 when no length is
    configured, so in that case only hashes longer than 64 characters are
    shortened.

    Args:
        hash_str: The hash string to truncate

    Returns:
        The leading ``limit`` characters of ``hash_str``, or ``hash_str``
        itself when it is not longer than the limit

    Examples:
        ```py
        # With global config set to truncation_length=12:
        truncate_hash("a" * 64)
        # 'aaaaaaaaaaaa'

        # With no truncation setting:
        truncate_hash("abc123")
        # 'abc123'
        ```
    """
    limit = get_hash_truncation_length()

    # One slice covers every case: a ``None`` bound keeps the whole string,
    # and slicing a string shorter than the bound returns it as-is.
    return hash_str[:limit]
53
+
54
+
55
def get_hash_truncation_length() -> int:
    """Return the hash truncation length from the active MetaxyConfig.

    Returns:
        The configured ``hash_truncation_length`` when it is set to a
        truthy value; otherwise 64 (this covers both ``None`` and ``0``)

    Example:
        ```py
        # With MetaxyConfig.hash_truncation_length = 16
        get_hash_truncation_length()
        # 16
        ```
    """
    # Imported inside the function rather than at module level —
    # presumably to avoid a circular import with metaxy.config; confirm.
    from metaxy.config import MetaxyConfig

    configured = MetaxyConfig.get().hash_truncation_length
    if configured:
        return configured
    return 64
72
+
73
+
74
def ensure_hash_compatibility(hash1: str, hash2: str) -> bool:
    """Tell whether two hashes agree once truncation is taken into account.

    The hashes are considered compatible when they are identical, or when
    the shorter one is a prefix of the longer one (i.e. it could be a
    truncated copy). This allows comparing hashes that were truncated at
    different lengths.

    Note that an empty string is a prefix of every string, so an empty
    hash is reported as compatible with any other hash.

    Args:
        hash1: First hash to compare
        hash2: Second hash to compare

    Returns:
        True if hashes are compatible, False otherwise

    Examples:
        ```py
        ensure_hash_compatibility("abc123", "abc123")
        # True

        ensure_hash_compatibility("abc123456789", "abc12345")
        # True  # Second is truncation of first

        ensure_hash_compatibility("abc123", "def456")
        # False  # Different hashes
        ```
    """
    if hash1 == hash2:
        return True

    # Decide which value plays the role of the (possibly truncated) prefix.
    # On equal lengths the first argument is treated as the prefix.
    if len(hash1) <= len(hash2):
        prefix, full = hash1, hash2
    else:
        prefix, full = hash2, hash1

    return full.startswith(prefix)
109
+
110
+
111
@nw.narwhalify
def truncate_string_column(
    df: nw.DataFrame[Any], column_name: str
) -> nw.DataFrame[Any]:
    """Shorten the hash strings stored in one column of a DataFrame.

    The limit comes from ``get_hash_truncation_length()``. When that limit
    is ``None`` the DataFrame is handed back untouched; otherwise the named
    column is replaced by its first ``limit`` characters.

    Args:
        df: DataFrame containing the hash column
        column_name: Name of the column containing hash strings

    Returns:
        DataFrame with truncated hash values in the specified column

    Example:
        ```py
        # With global config set to truncation_length=12:
        df = nw.from_native(pd.DataFrame({"hash": ["a" * 64, "b" * 64]}))
        result = truncate_string_column(df, "hash")
        # result["hash"] contains ["aaaaaaaaaaaa", "bbbbbbbbbbbb"]
        ```
    """
    max_chars = get_hash_truncation_length()

    if max_chars is not None:
        # Overwrite the column in place with its leading ``max_chars`` characters.
        clipped = nw.col(column_name).str.slice(0, max_chars).alias(column_name)
        df = df.with_columns(clipped)

    return df
143
+
144
+
145
PolarsFrameT = TypeVar("PolarsFrameT", pl.DataFrame, pl.LazyFrame)


@overload
def truncate_struct_column(df: pl.DataFrame, struct_column: str) -> pl.DataFrame: ...


@overload
def truncate_struct_column(df: pl.LazyFrame, struct_column: str) -> pl.LazyFrame: ...


def truncate_struct_column(
    df: pl.DataFrame | pl.LazyFrame, struct_column: str
) -> pl.DataFrame | pl.LazyFrame:
    """Truncate hash values within a struct column.

    Uses the truncation length from ``get_hash_truncation_length()``.
    Every string field of the struct is cut to that length; fields of any
    other dtype are carried over unchanged. The struct is rebuilt in a
    single expression, so no other column of the frame is touched and a
    LazyFrame stays lazy (only its schema is resolved).

    Args:
        df: Polars DataFrame or LazyFrame containing the struct column
        struct_column: Name of the struct column containing hash values

    Returns:
        Frame of the same kind as ``df`` with truncated hash values within
        the struct. ``df`` itself is returned when the truncation length is
        ``None``, when the column has the ``Null`` dtype, or when the struct
        has no fields.

    Raises:
        TypeError: If ``df`` is not a Polars DataFrame/LazyFrame, or if
            ``struct_column`` is not a struct column.
        polars.exceptions.ColumnNotFoundError: If ``struct_column`` is not
            present in ``df``.

    Example:
        ```py
        # With global config set to truncation_length=12:
        df = pl.DataFrame({
            "metaxy_provenance_by_field": [{"field1": "a" * 64, "field2": "b" * 64}]
        })
        result = truncate_struct_column(df, "metaxy_provenance_by_field")
        # result["metaxy_provenance_by_field"] contains [{"field1": "aaaaaaaaaaaa", "field2": "bbbbbbbbbbbb"}]
        ```
    """
    length = get_hash_truncation_length()

    # No truncation if length is None
    if length is None:
        return df

    # Only handle Polars DataFrames and LazyFrames (structs are Polars-only)
    if not isinstance(df, (pl.DataFrame, pl.LazyFrame)):
        raise TypeError(
            f"truncate_struct_column only supports Polars DataFrame/LazyFrame, got {type(df)}"
        )

    # Read the field list from the schema instead of sampling the first row:
    # this works when the first value is null or the frame is empty, and it
    # does not execute any part of a lazy query.
    schema = df.collect_schema() if isinstance(df, pl.LazyFrame) else df.schema
    if struct_column not in schema:
        raise pl.exceptions.ColumnNotFoundError(struct_column)

    dtype = schema[struct_column]
    if dtype == pl.Null:
        # An untyped, all-null column holds nothing to truncate.
        return df
    if not isinstance(dtype, pl.Struct):
        raise TypeError(
            f"truncate_struct_column expects a struct column, but {struct_column!r} has dtype {dtype}"
        )
    if not dtype.fields:
        return df

    # Build the replacement struct directly from the original one. Going
    # through temporary top-level columns named after the fields would
    # overwrite (and later drop) real columns that share a field's name.
    source = pl.col(struct_column)
    rebuilt_fields = [
        (
            source.struct.field(field.name).str.slice(0, length)
            if field.dtype == pl.String
            else source.struct.field(field.name)
        ).alias(field.name)
        for field in dtype.fields
    ]

    return df.with_columns(pl.struct(rebuilt_fields).alias(struct_column))
@@ -0,0 +1,31 @@
1
+ """Provenance tracking system for Metaxy.
2
+
3
+ This package provides a unified interface for tracking field and sample-level provenance
4
+ across different backend implementations (Polars, DuckDB, ClickHouse, etc).
5
+
6
+ The VersioningEngine is the core abstraction that:
7
+ 1. Joins upstream feature metadata
8
+ 2. Calculates field-level provenance hashes
9
+ 3. Assembles sample-level provenance
10
+ 4. Compares with existing metadata to find incremental updates
11
+
12
+ Backend-specific implementations:
13
+ - PolarsVersioningEngine: Uses polars_hash plugin, may materialize lazy frames
14
+ - IbisVersioningEngine: Base class for SQL backends, stays completely lazy
15
+ - DuckDBVersioningEngine: DuckDB-specific hash functions (xxHash via hashfuncs extension)
16
+ - ClickHouseVersioningEngine: ClickHouse-specific hash functions (native support)
17
+ """
18
+
19
+ from metaxy.versioning.engine import (
20
+ RenamedDataFrame,
21
+ VersioningEngine,
22
+ )
23
+ from metaxy.versioning.types import HashAlgorithm, Increment, LazyIncrement
24
+
25
+ __all__ = [
26
+ "VersioningEngine",
27
+ "RenamedDataFrame",
28
+ "HashAlgorithm",
29
+ "Increment",
30
+ "LazyIncrement",
31
+ ]