metaxy-0.0.1.dev3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. metaxy/__init__.py +170 -0
  2. metaxy/_packaging.py +96 -0
  3. metaxy/_testing/__init__.py +55 -0
  4. metaxy/_testing/config.py +43 -0
  5. metaxy/_testing/metaxy_project.py +780 -0
  6. metaxy/_testing/models.py +111 -0
  7. metaxy/_testing/parametric/__init__.py +13 -0
  8. metaxy/_testing/parametric/metadata.py +664 -0
  9. metaxy/_testing/pytest_helpers.py +74 -0
  10. metaxy/_testing/runbook.py +533 -0
  11. metaxy/_utils.py +35 -0
  12. metaxy/_version.py +1 -0
  13. metaxy/cli/app.py +97 -0
  14. metaxy/cli/console.py +13 -0
  15. metaxy/cli/context.py +167 -0
  16. metaxy/cli/graph.py +610 -0
  17. metaxy/cli/graph_diff.py +290 -0
  18. metaxy/cli/list.py +46 -0
  19. metaxy/cli/metadata.py +317 -0
  20. metaxy/cli/migrations.py +999 -0
  21. metaxy/cli/utils.py +268 -0
  22. metaxy/config.py +680 -0
  23. metaxy/entrypoints.py +296 -0
  24. metaxy/ext/__init__.py +1 -0
  25. metaxy/ext/dagster/__init__.py +54 -0
  26. metaxy/ext/dagster/constants.py +10 -0
  27. metaxy/ext/dagster/dagster_type.py +156 -0
  28. metaxy/ext/dagster/io_manager.py +200 -0
  29. metaxy/ext/dagster/metaxify.py +512 -0
  30. metaxy/ext/dagster/observable.py +115 -0
  31. metaxy/ext/dagster/resources.py +27 -0
  32. metaxy/ext/dagster/selection.py +73 -0
  33. metaxy/ext/dagster/table_metadata.py +417 -0
  34. metaxy/ext/dagster/utils.py +462 -0
  35. metaxy/ext/sqlalchemy/__init__.py +23 -0
  36. metaxy/ext/sqlalchemy/config.py +29 -0
  37. metaxy/ext/sqlalchemy/plugin.py +353 -0
  38. metaxy/ext/sqlmodel/__init__.py +13 -0
  39. metaxy/ext/sqlmodel/config.py +29 -0
  40. metaxy/ext/sqlmodel/plugin.py +499 -0
  41. metaxy/graph/__init__.py +29 -0
  42. metaxy/graph/describe.py +325 -0
  43. metaxy/graph/diff/__init__.py +21 -0
  44. metaxy/graph/diff/diff_models.py +446 -0
  45. metaxy/graph/diff/differ.py +769 -0
  46. metaxy/graph/diff/models.py +443 -0
  47. metaxy/graph/diff/rendering/__init__.py +18 -0
  48. metaxy/graph/diff/rendering/base.py +323 -0
  49. metaxy/graph/diff/rendering/cards.py +188 -0
  50. metaxy/graph/diff/rendering/formatter.py +805 -0
  51. metaxy/graph/diff/rendering/graphviz.py +246 -0
  52. metaxy/graph/diff/rendering/mermaid.py +326 -0
  53. metaxy/graph/diff/rendering/rich.py +169 -0
  54. metaxy/graph/diff/rendering/theme.py +48 -0
  55. metaxy/graph/diff/traversal.py +247 -0
  56. metaxy/graph/status.py +329 -0
  57. metaxy/graph/utils.py +58 -0
  58. metaxy/metadata_store/__init__.py +32 -0
  59. metaxy/metadata_store/_ducklake_support.py +419 -0
  60. metaxy/metadata_store/base.py +1792 -0
  61. metaxy/metadata_store/bigquery.py +354 -0
  62. metaxy/metadata_store/clickhouse.py +184 -0
  63. metaxy/metadata_store/delta.py +371 -0
  64. metaxy/metadata_store/duckdb.py +446 -0
  65. metaxy/metadata_store/exceptions.py +61 -0
  66. metaxy/metadata_store/ibis.py +542 -0
  67. metaxy/metadata_store/lancedb.py +391 -0
  68. metaxy/metadata_store/memory.py +292 -0
  69. metaxy/metadata_store/system/__init__.py +57 -0
  70. metaxy/metadata_store/system/events.py +264 -0
  71. metaxy/metadata_store/system/keys.py +9 -0
  72. metaxy/metadata_store/system/models.py +129 -0
  73. metaxy/metadata_store/system/storage.py +957 -0
  74. metaxy/metadata_store/types.py +10 -0
  75. metaxy/metadata_store/utils.py +104 -0
  76. metaxy/metadata_store/warnings.py +36 -0
  77. metaxy/migrations/__init__.py +32 -0
  78. metaxy/migrations/detector.py +291 -0
  79. metaxy/migrations/executor.py +516 -0
  80. metaxy/migrations/generator.py +319 -0
  81. metaxy/migrations/loader.py +231 -0
  82. metaxy/migrations/models.py +528 -0
  83. metaxy/migrations/ops.py +447 -0
  84. metaxy/models/__init__.py +0 -0
  85. metaxy/models/bases.py +12 -0
  86. metaxy/models/constants.py +139 -0
  87. metaxy/models/feature.py +1335 -0
  88. metaxy/models/feature_spec.py +338 -0
  89. metaxy/models/field.py +263 -0
  90. metaxy/models/fields_mapping.py +307 -0
  91. metaxy/models/filter_expression.py +297 -0
  92. metaxy/models/lineage.py +285 -0
  93. metaxy/models/plan.py +232 -0
  94. metaxy/models/types.py +475 -0
  95. metaxy/py.typed +0 -0
  96. metaxy/utils/__init__.py +1 -0
  97. metaxy/utils/constants.py +2 -0
  98. metaxy/utils/exceptions.py +23 -0
  99. metaxy/utils/hashing.py +230 -0
  100. metaxy/versioning/__init__.py +31 -0
  101. metaxy/versioning/engine.py +656 -0
  102. metaxy/versioning/feature_dep_transformer.py +151 -0
  103. metaxy/versioning/ibis.py +249 -0
  104. metaxy/versioning/lineage_handler.py +205 -0
  105. metaxy/versioning/polars.py +189 -0
  106. metaxy/versioning/renamed_df.py +35 -0
  107. metaxy/versioning/types.py +63 -0
  108. metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
  109. metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
  110. metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
  111. metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
metaxy/metadata_store/bigquery.py
@@ -0,0 +1,354 @@
+"""BigQuery metadata store - thin wrapper around IbisMetadataStore."""
+
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from metaxy.metadata_store.base import MetadataStore
+
+from pydantic import Field
+
+from metaxy.metadata_store.ibis import IbisMetadataStore, IbisMetadataStoreConfig
+from metaxy.versioning.types import HashAlgorithm
+
+
+class BigQueryMetadataStoreConfig(IbisMetadataStoreConfig):
+    """Configuration for BigQueryMetadataStore.
+
+    Example:
+        ```python
+        config = BigQueryMetadataStoreConfig(
+            project_id="my-project",
+            dataset_id="my_dataset",
+            credentials_path="/path/to/service-account.json",
+        )
+
+        store = BigQueryMetadataStore.from_config(config)
+        ```
+    """
+
+    project_id: str | None = Field(
+        default=None, description="Google Cloud project ID containing the dataset."
+    )
+    dataset_id: str | None = Field(
+        default=None, description="BigQuery dataset name for storing metadata tables."
+    )
+    credentials_path: str | None = Field(
+        default=None, description="Path to service account JSON file."
+    )
+    credentials: Any | None = Field(
+        default=None, description="Google Cloud credentials object."
+    )
+    location: str | None = Field(
+        default=None,
+        description="Default location for BigQuery resources (e.g., 'US', 'EU').",
+    )
+
+
+class BigQueryMetadataStore(IbisMetadataStore):
+    """
+    [BigQuery](https://cloud.google.com/bigquery) metadata store using the [Ibis](https://ibis-project.org/) backend.
+
+    Warning:
+        Setting up infrastructure for Metaxy correctly is the user's responsibility.
+        Make sure large tables are partitioned appropriately for your use case.
+
+    Note:
+        BigQuery automatically optimizes queries on partitioned tables.
+        When tables are partitioned (e.g., by date or by ingestion time with _PARTITIONTIME),
+        BigQuery prunes partitions based on WHERE clauses in queries, without needing
+        explicit configuration in the metadata store.
+        Make sure to use appropriate `filters` when calling [BigQueryMetadataStore.read_metadata][metaxy.metadata_store.bigquery.BigQueryMetadataStore.read_metadata].
+
+    Example: Basic Connection
+        ```py
+        store = BigQueryMetadataStore(
+            project_id="my-project",
+            dataset_id="my_dataset",
+        )
+        ```
+
+    Example: With Service Account
+        ```py
+        store = BigQueryMetadataStore(
+            project_id="my-project",
+            dataset_id="my_dataset",
+            credentials_path="/path/to/service-account.json",
+        )
+        ```
+
+    Example: With Location Configuration
+        ```py
+        store = BigQueryMetadataStore(
+            project_id="my-project",
+            dataset_id="my_dataset",
+            location="EU",  # Specify data location
+        )
+        ```
+
+    Example: With Custom Hash Algorithm
+        ```py
+        store = BigQueryMetadataStore(
+            project_id="my-project",
+            dataset_id="my_dataset",
+            hash_algorithm=HashAlgorithm.SHA256,  # Use SHA256 instead of the default (MD5)
+        )
+        ```
+    """
+
+    def __init__(
+        self,
+        project_id: str | None = None,
+        dataset_id: str | None = None,
+        *,
+        credentials_path: str | None = None,
+        credentials: Any | None = None,
+        location: str | None = None,
+        connection_params: dict[str, Any] | None = None,
+        fallback_stores: list["MetadataStore"] | None = None,
+        **kwargs: Any,
+    ):
+        """
+        Initialize [BigQuery](https://cloud.google.com/bigquery) metadata store.
+
+        Args:
+            project_id: Google Cloud project ID containing the dataset.
+                Can also be set via the GOOGLE_CLOUD_PROJECT environment variable.
+            dataset_id: BigQuery dataset name for storing metadata tables.
+                If not provided, uses the default dataset for the project.
+            credentials_path: Path to service account JSON file.
+                Alternative to passing a credentials object directly.
+            credentials: Google Cloud credentials object.
+                If not provided, uses default credentials from the environment.
+            location: Default location for BigQuery resources (e.g., "US", "EU").
+                If not specified, BigQuery determines it from the dataset location.
+            connection_params: Additional Ibis BigQuery connection parameters.
+                Overrides individual parameters if provided.
+            fallback_stores: Ordered list of read-only fallback stores.
+            **kwargs: Passed to [metaxy.metadata_store.ibis.IbisMetadataStore][]
+
+        Raises:
+            ImportError: If ibis-bigquery is not installed
+            ValueError: If neither project_id nor connection_params is provided
+
+        Note:
+            Authentication priority:
+
+            1. Explicit credentials or credentials_path
+            2. Application Default Credentials (ADC)
+            3. Google Cloud SDK credentials
+
+            BigQuery automatically handles partition pruning when querying partitioned tables.
+            If your tables are partitioned (e.g., by date or ingestion time), BigQuery will
+            automatically optimize queries with appropriate WHERE clauses on the partition column.
+
+        Example:
+            ```py
+            # Using environment authentication
+            store = BigQueryMetadataStore(
+                project_id="my-project",
+                dataset_id="ml_metadata",
+            )
+
+            # Using a service account
+            store = BigQueryMetadataStore(
+                project_id="my-project",
+                dataset_id="ml_metadata",
+                credentials_path="/path/to/key.json",
+            )
+
+            # With location specification
+            store = BigQueryMetadataStore(
+                project_id="my-project",
+                dataset_id="ml_metadata",
+                location="EU",
+            )
+            ```
+        """
+        # Build connection parameters if not provided
+        if connection_params is None:
+            connection_params = self._build_connection_params(
+                project_id=project_id,
+                dataset_id=dataset_id,
+                credentials_path=credentials_path,
+                credentials=credentials,
+                location=location,
+            )
+
+        # Validate we have the minimum required parameters
+        if "project_id" not in connection_params and project_id is None:
+            raise ValueError(
+                "Must provide either project_id or connection_params with project_id. "
+                "Example: project_id='my-project'"
+            )
+
+        # Store parameters for display
+        self.project_id = project_id or connection_params.get("project_id")
+        self.dataset_id = dataset_id or connection_params.get("dataset_id", "")
+
+        # Initialize the Ibis store with the BigQuery backend
+        super().__init__(
+            backend="bigquery",
+            connection_params=connection_params,
+            fallback_stores=fallback_stores,
+            **kwargs,
+        )
+
+    def _build_connection_params(
+        self,
+        project_id: str | None = None,
+        dataset_id: str | None = None,
+        credentials_path: str | None = None,
+        credentials: Any | None = None,
+        location: str | None = None,
+    ) -> dict[str, Any]:
+        """Build connection parameters for the Ibis BigQuery backend.
+
+        This method centralizes the authentication logic, supporting:
+
+        1. Explicit service account file (credentials_path)
+        2. Explicit credentials object
+        3. Application Default Credentials (automatic fallback)
+
+        Args:
+            project_id: Google Cloud project ID
+            dataset_id: BigQuery dataset name
+            credentials_path: Path to service account JSON file
+            credentials: Pre-loaded credentials object
+            location: BigQuery resource location
+
+        Returns:
+            Dictionary of connection parameters for Ibis
+        """
+        connection_params: dict[str, Any] = {}
+
+        # Set core BigQuery parameters
+        if project_id is not None:
+            connection_params["project_id"] = project_id
+        if dataset_id is not None:
+            connection_params["dataset_id"] = dataset_id
+        if location is not None:
+            connection_params["location"] = location
+
+        # Handle authentication - prioritize explicit credentials
+        if credentials_path is not None:
+            connection_params["credentials"] = self._load_service_account_credentials(
+                credentials_path
+            )
+        elif credentials is not None:
+            connection_params["credentials"] = credentials
+        # Otherwise, Ibis will automatically use Application Default Credentials
+
+        return connection_params
+
+    def _load_service_account_credentials(self, credentials_path: str) -> Any:
+        """Load service account credentials from a JSON file.
+
+        Uses Google's recommended approach with google.oauth2.service_account
+        instead of manually parsing JSON and constructing credentials.
+
+        Args:
+            credentials_path: Path to service account JSON file
+
+        Returns:
+            Google Cloud credentials object
+
+        Raises:
+            ImportError: If the google-auth library is not installed
+            FileNotFoundError: If the credentials file doesn't exist
+            ValueError: If the credentials file is invalid
+        """
+        try:
+            from google.oauth2 import (
+                service_account,  # pyright: ignore[reportMissingImports]
+            )
+        except ImportError as e:
+            raise ImportError(
+                "Google Cloud authentication libraries required for service account credentials. "
+                "Install with: pip install google-auth"
+            ) from e
+
+        try:
+            # Use Google's recommended method - it handles all edge cases
+            return service_account.Credentials.from_service_account_file(
+                credentials_path,
+                scopes=["https://www.googleapis.com/auth/bigquery"],
+            )
+        except FileNotFoundError as e:
+            raise FileNotFoundError(
+                f"Service account credentials file not found: {credentials_path}"
+            ) from e
+        except Exception as e:
+            # Catch JSON decode errors and other credential format issues
+            raise ValueError(
+                f"Invalid service account credentials file: {credentials_path}. "
+                "Ensure it's a valid service account JSON key file."
+            ) from e
+
+    def _get_default_hash_algorithm(self) -> HashAlgorithm:
+        # Should switch to FARM_FINGERPRINT64 once https://github.com/ion-elgreco/polars-hash/issues/49 is resolved
+        return HashAlgorithm.MD5
+
+    def _create_hash_functions(self):
+        """Create BigQuery-specific hash functions for Ibis expressions.
+
+        BigQuery supports FARM_FINGERPRINT, MD5, and SHA256 natively.
+        """
+        # Import ibis for wrapping built-in SQL functions
+        import ibis
+
+        # Use Ibis's builtin UDF decorator to wrap BigQuery's hash functions
+        @ibis.udf.scalar.builtin
+        def MD5(x: str) -> str:
+            """BigQuery MD5() function."""
+            ...
+
+        @ibis.udf.scalar.builtin
+        def FARM_FINGERPRINT(x: str) -> int:
+            """BigQuery FARM_FINGERPRINT() function. Returns INT64."""
+            ...
+
+        @ibis.udf.scalar.builtin
+        def SHA256(x: str) -> str:
+            """BigQuery SHA256() function."""
+            ...
+
+        @ibis.udf.scalar.builtin
+        def TO_HEX(x: str) -> str:
+            """BigQuery TO_HEX() function."""
+            ...
+
+        @ibis.udf.scalar.builtin
+        def LOWER(x: str) -> str:
+            """BigQuery LOWER() function."""
+            ...
+
+        # Create hash functions that use these wrapped SQL functions
+        def md5_hash(col_expr):
+            """Hash a column using BigQuery's MD5() function."""
+            # MD5 returns bytes, convert to lowercase hex string
+            return LOWER(TO_HEX(MD5(col_expr.cast(str))))
+
+        def farmhash_hash(col_expr):
+            """Hash a column using BigQuery's FARM_FINGERPRINT() function."""
+            # FARM_FINGERPRINT returns INT64, cast to string
+            return FARM_FINGERPRINT(col_expr).cast(str)
+
+        def sha256_hash(col_expr):
+            """Hash a column using BigQuery's SHA256() function."""
+            # SHA256 returns bytes, convert to lowercase hex string
+            return LOWER(TO_HEX(SHA256(col_expr)))
+
+        hash_functions = {
+            HashAlgorithm.MD5: md5_hash,
+            HashAlgorithm.FARMHASH: farmhash_hash,
+            HashAlgorithm.SHA256: sha256_hash,
+        }
+
+        return hash_functions
+
+    def display(self) -> str:
+        """Display string for this store."""
+        dataset_info = f"/{self.dataset_id}" if self.dataset_id else ""
+        return f"BigQueryMetadataStore(project={self.project_id}{dataset_info})"
+
+    @classmethod
+    def config_model(cls) -> type[BigQueryMetadataStoreConfig]:  # pyright: ignore[reportIncompatibleMethodOverride]
+        return BigQueryMetadataStoreConfig
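
A minimal usage sketch for bigquery.py above, assuming a reachable BigQuery project: the config-based construction and `display()` come from the code in this diff, while the commented `read_metadata` call only illustrates the docstring's advice to pass `filters` for partition pruning (the filter-expression format lives elsewhere in the package, see metaxy/models/filter_expression.py, and is not verified here). The project and dataset names are hypothetical.

```python
from metaxy.metadata_store.bigquery import (
    BigQueryMetadataStore,
    BigQueryMetadataStoreConfig,
)

# Construct the store from a config object, mirroring the class docstring.
config = BigQueryMetadataStoreConfig(
    project_id="my-project",   # hypothetical project
    dataset_id="ml_metadata",  # hypothetical dataset
    location="EU",
)
store = BigQueryMetadataStore.from_config(config)

print(store.display())  # BigQueryMetadataStore(project=my-project/ml_metadata)

# Per the class docstring, pass `filters` so BigQuery can prune partitions;
# the exact filter syntax is defined elsewhere in the package:
# store.read_metadata(feature, filters=...)
```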
metaxy/metadata_store/clickhouse.py
@@ -0,0 +1,184 @@
+"""ClickHouse metadata store - thin wrapper around IbisMetadataStore."""
+
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from metaxy.metadata_store.base import MetadataStore
+
+from metaxy.metadata_store.ibis import IbisMetadataStore, IbisMetadataStoreConfig
+from metaxy.versioning.types import HashAlgorithm
+
+
+class ClickHouseMetadataStoreConfig(IbisMetadataStoreConfig):
+    """Configuration for ClickHouseMetadataStore.
+
+    Inherits connection_string, connection_params, table_prefix, and auto_create_tables from IbisMetadataStoreConfig.
+
+    Example:
+        ```python
+        config = ClickHouseMetadataStoreConfig(
+            connection_string="clickhouse://localhost:9000/default",
+            hash_algorithm=HashAlgorithm.XXHASH64,
+        )
+
+        store = ClickHouseMetadataStore.from_config(config)
+        ```
+    """
+
+    pass  # All fields inherited from IbisMetadataStoreConfig
+
+
+class ClickHouseMetadataStore(IbisMetadataStore):
+    """
+    [ClickHouse](https://clickhouse.com/) metadata store using the [Ibis](https://ibis-project.org/) backend.
+
+    Example: Connection Parameters
+        ```py
+        store = ClickHouseMetadataStore(
+            connection_params={
+                "host": "localhost",
+                "port": 9000,
+                "database": "default",
+                "user": "default",
+                "password": "",
+            },
+            hash_algorithm=HashAlgorithm.XXHASH64,
+        )
+        ```
+    """
+
+    def __init__(
+        self,
+        connection_string: str | None = None,
+        *,
+        connection_params: dict[str, Any] | None = None,
+        fallback_stores: list["MetadataStore"] | None = None,
+        **kwargs: Any,
+    ):
+        """
+        Initialize [ClickHouse](https://clickhouse.com/) metadata store.
+
+        Args:
+            connection_string: ClickHouse connection string.
+
+                Format: `clickhouse://[user[:password]@]host[:port]/database[?param=value]`
+
+                Examples:
+                ```
+                - "clickhouse://localhost:9000/default"
+                - "clickhouse://user:pass@host:9000/db"
+                - "clickhouse://host:9000/db?secure=true"
+                ```
+
+            connection_params: Alternative to connection_string; specify params as a dict:
+
+                - host: Server host
+                - port: Server port (default: `9000`)
+                - database: Database name
+                - user: Username
+                - password: Password
+                - secure: Use secure connection (default: `False`)
+
+            fallback_stores: Ordered list of read-only fallback stores.
+
+            **kwargs: Passed to [metaxy.metadata_store.ibis.IbisMetadataStore][]
+
+        Raises:
+            ImportError: If ibis-clickhouse is not installed
+            ValueError: If neither connection_string nor connection_params is provided
+        """
+        if connection_string is None and connection_params is None:
+            raise ValueError(
+                "Must provide either connection_string or connection_params. "
+                "Example: connection_string='clickhouse://localhost:9000/default'"
+            )
+
+        # Initialize the Ibis store with the ClickHouse backend
+        super().__init__(
+            connection_string=connection_string,
+            backend="clickhouse" if connection_string is None else None,
+            connection_params=connection_params,
+            fallback_stores=fallback_stores,
+            **kwargs,
+        )
+
+    def _get_default_hash_algorithm(self) -> HashAlgorithm:
+        """Get the default hash algorithm for ClickHouse stores.
+
+        Uses XXHASH64, which is built into ClickHouse.
+        """
+        return HashAlgorithm.XXHASH64
+
+    def _create_hash_functions(self):
+        """Create ClickHouse-specific hash functions for Ibis expressions.
+
+        Implements MD5 and xxHash functions using ClickHouse's native functions.
+        """
+        # Import ibis for wrapping built-in SQL functions
+        import ibis
+
+        hash_functions = {}
+
+        # ClickHouse MD5 implementation
+        @ibis.udf.scalar.builtin
+        def MD5(x: str) -> str:
+            """ClickHouse MD5() function."""
+            ...
+
+        @ibis.udf.scalar.builtin
+        def HEX(x: str) -> str:
+            """ClickHouse HEX() function."""
+            ...
+
+        @ibis.udf.scalar.builtin
+        def lower(x: str) -> str:
+            """ClickHouse lower() function."""
+            ...
+
+        def md5_hash(col_expr):
+            """Hash a column using ClickHouse's MD5() function."""
+            # MD5 returns a binary FixedString(16); convert to lowercase hex
+            return lower(HEX(MD5(col_expr.cast(str))))
+
+        hash_functions[HashAlgorithm.MD5] = md5_hash
+
+        # ClickHouse xxHash functions
+        @ibis.udf.scalar.builtin
+        def xxHash32(x: str) -> int:
+            """ClickHouse xxHash32() function - returns UInt32."""
+            ...
+
+        @ibis.udf.scalar.builtin
+        def xxHash64(x: str) -> int:
+            """ClickHouse xxHash64() function - returns UInt64."""
+            ...
+
+        @ibis.udf.scalar.builtin
+        def toString(x: int) -> str:
+            """ClickHouse toString() function - converts an integer to a string."""
+            ...
+
+        def xxhash32_hash(col_expr):
+            """Hash a column using ClickHouse's xxHash32() function."""
+            # xxHash32 returns UInt32; convert to string
+            return toString(xxHash32(col_expr))
+
+        def xxhash64_hash(col_expr):
+            """Hash a column using ClickHouse's xxHash64() function."""
+            # xxHash64 returns UInt64; convert to string
+            return toString(xxHash64(col_expr))
+
+        hash_functions[HashAlgorithm.XXHASH32] = xxhash32_hash
+        hash_functions[HashAlgorithm.XXHASH64] = xxhash64_hash
+
+        return hash_functions
+
+    @classmethod
+    def config_model(cls) -> type[ClickHouseMetadataStoreConfig]:  # pyright: ignore[reportIncompatibleMethodOverride]
+        return ClickHouseMetadataStoreConfig
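
And a matching sketch for clickhouse.py, using only constructor arguments documented above; a ClickHouse server listening on localhost:9000 is assumed, and the credentials shown are hypothetical.

```python
from metaxy.metadata_store.clickhouse import ClickHouseMetadataStore
from metaxy.versioning.types import HashAlgorithm

# Connection-string form, per the documented format:
# clickhouse://[user[:password]@]host[:port]/database[?param=value]
store = ClickHouseMetadataStore(
    connection_string="clickhouse://default@localhost:9000/default",
    hash_algorithm=HashAlgorithm.XXHASH64,  # also the store's default
)

# Equivalent dict form; omitting both connection_string and
# connection_params makes __init__ raise ValueError.
store = ClickHouseMetadataStore(
    connection_params={
        "host": "localhost",
        "port": 9000,
        "database": "default",
    },
)
```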