metaxy 0.0.1.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0

metaxy/metadata_store/bigquery.py
@@ -0,0 +1,354 @@
"""BigQuery metadata store - thin wrapper around IbisMetadataStore."""

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from metaxy.metadata_store.base import MetadataStore

from pydantic import Field

from metaxy.metadata_store.ibis import IbisMetadataStore, IbisMetadataStoreConfig
from metaxy.versioning.types import HashAlgorithm

class BigQueryMetadataStoreConfig(IbisMetadataStoreConfig):
    """Configuration for BigQueryMetadataStore.

    Example:
        ```python
        config = BigQueryMetadataStoreConfig(
            project_id="my-project",
            dataset_id="my_dataset",
            credentials_path="/path/to/service-account.json",
        )

        store = BigQueryMetadataStore.from_config(config)
        ```
    """

    project_id: str | None = Field(
        default=None, description="Google Cloud project ID containing the dataset."
    )
    dataset_id: str | None = Field(
        default=None, description="BigQuery dataset name for storing metadata tables."
    )
    credentials_path: str | None = Field(
        default=None, description="Path to service account JSON file."
    )
    credentials: Any | None = Field(
        default=None, description="Google Cloud credentials object."
    )
    location: str | None = Field(
        default=None,
        description="Default location for BigQuery resources (e.g., 'US', 'EU').",
    )

class BigQueryMetadataStore(IbisMetadataStore):
    """
    [BigQuery](https://cloud.google.com/bigquery) metadata store using an [Ibis](https://ibis-project.org/) backend.

    Warning:
        It is up to the user to set up infrastructure for Metaxy correctly.
        Make sure large tables are partitioned appropriately for your use case.

    Note:
        BigQuery automatically optimizes queries on partitioned tables.
        When tables are partitioned (e.g., by date, or by ingestion time with _PARTITIONTIME),
        BigQuery automatically prunes partitions based on WHERE clauses in queries,
        without needing explicit configuration in the metadata store.
        Make sure to use appropriate `filters` when calling
        [BigQueryMetadataStore.read_metadata][metaxy.metadata_store.bigquery.BigQueryMetadataStore.read_metadata].

    Example: Basic Connection
        ```py
        store = BigQueryMetadataStore(
            project_id="my-project",
            dataset_id="my_dataset",
        )
        ```

    Example: With Service Account
        ```py
        store = BigQueryMetadataStore(
            project_id="my-project",
            dataset_id="my_dataset",
            credentials_path="/path/to/service-account.json",
        )
        ```

    Example: With Location Configuration
        ```py
        store = BigQueryMetadataStore(
            project_id="my-project",
            dataset_id="my_dataset",
            location="EU",  # Specify data location
        )
        ```

    Example: With Custom Hash Algorithm
        ```py
        store = BigQueryMetadataStore(
            project_id="my-project",
            dataset_id="my_dataset",
            hash_algorithm=HashAlgorithm.SHA256,  # Use SHA256 instead of the default (MD5)
        )
        ```
    """
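
    # Illustrative sketch (not from this file): pass `filters` to
    # `read_metadata` so BigQuery can prune partitions, as the Note above
    # advises. The filter syntax and the `DailyEvents` feature class are
    # assumed for illustration only.
    #
    #     store = BigQueryMetadataStore(project_id="my-project", dataset_id="my_dataset")
    #     metadata = store.read_metadata(
    #         DailyEvents,
    #         filters=[("event_date", ">=", "2024-01-01")],  # becomes a WHERE clause
    #     )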
    def __init__(
        self,
        project_id: str | None = None,
        dataset_id: str | None = None,
        *,
        credentials_path: str | None = None,
        credentials: Any | None = None,
        location: str | None = None,
        connection_params: dict[str, Any] | None = None,
        fallback_stores: list["MetadataStore"] | None = None,
        **kwargs: Any,
    ):
        """
        Initialize [BigQuery](https://cloud.google.com/bigquery) metadata store.

        Args:
            project_id: Google Cloud project ID containing the dataset.
                Can also be set via the GOOGLE_CLOUD_PROJECT environment variable.
            dataset_id: BigQuery dataset name for storing metadata tables.
                If not provided, uses the default dataset for the project.
            credentials_path: Path to service account JSON file.
                Alternative to passing a credentials object directly.
            credentials: Google Cloud credentials object.
                If not provided, uses default credentials from the environment.
            location: Default location for BigQuery resources (e.g., "US", "EU").
                If not specified, BigQuery determines it from the dataset location.
            connection_params: Additional Ibis BigQuery connection parameters.
                Overrides the individual parameters if provided.
            fallback_stores: Ordered list of read-only fallback stores.
            **kwargs: Passed to [metaxy.metadata_store.ibis.IbisMetadataStore][]

        Raises:
            ImportError: If ibis-bigquery is not installed
            ValueError: If neither project_id nor connection_params is provided

        Note:
            Authentication priority:

            1. Explicit credentials or credentials_path
            2. Application Default Credentials (ADC)
            3. Google Cloud SDK credentials

            BigQuery automatically handles partition pruning when querying partitioned tables.
            If your tables are partitioned (e.g., by date or ingestion time), BigQuery will
            automatically optimize queries with appropriate WHERE clauses on the partition column.

        Example:
            ```py
            # Using environment authentication
            store = BigQueryMetadataStore(
                project_id="my-project",
                dataset_id="ml_metadata",
            )

            # Using a service account
            store = BigQueryMetadataStore(
                project_id="my-project",
                dataset_id="ml_metadata",
                credentials_path="/path/to/key.json",
            )

            # With location specification
            store = BigQueryMetadataStore(
                project_id="my-project",
                dataset_id="ml_metadata",
                location="EU",
            )
            ```
        """
        # Build connection parameters if not provided
        if connection_params is None:
            connection_params = self._build_connection_params(
                project_id=project_id,
                dataset_id=dataset_id,
                credentials_path=credentials_path,
                credentials=credentials,
                location=location,
            )

        # Validate we have minimum required parameters
        if "project_id" not in connection_params and project_id is None:
            raise ValueError(
                "Must provide either project_id or connection_params with project_id. "
                "Example: project_id='my-project'"
            )

        # Store parameters for display
        self.project_id = project_id or connection_params.get("project_id")
        self.dataset_id = dataset_id or connection_params.get("dataset_id", "")

        # Initialize Ibis store with BigQuery backend
        super().__init__(
            backend="bigquery",
            connection_params=connection_params,
            fallback_stores=fallback_stores,
            **kwargs,
        )
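
    # Illustrative sketch (not from this file): when `connection_params` is
    # supplied, it is passed to the Ibis BigQuery backend as-is (the
    # credentials/location arguments are not merged into it). The keys shown
    # are the ones this class itself builds; extra keys are assumed to be
    # valid Ibis options.
    #
    #     store = BigQueryMetadataStore(
    #         connection_params={
    #             "project_id": "my-project",
    #             "dataset_id": "my_dataset",
    #             "location": "EU",
    #         },
    #     )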
    def _build_connection_params(
        self,
        project_id: str | None = None,
        dataset_id: str | None = None,
        credentials_path: str | None = None,
        credentials: Any | None = None,
        location: str | None = None,
    ) -> dict[str, Any]:
        """Build connection parameters for the Ibis BigQuery backend.

        This method centralizes the authentication logic, supporting:

        1. Explicit service account file (credentials_path)
        2. Explicit credentials object
        3. Application Default Credentials (automatic fallback)

        Args:
            project_id: Google Cloud project ID
            dataset_id: BigQuery dataset name
            credentials_path: Path to service account JSON file
            credentials: Pre-loaded credentials object
            location: BigQuery resource location

        Returns:
            Dictionary of connection parameters for Ibis
        """
        connection_params: dict[str, Any] = {}

        # Set core BigQuery parameters
        if project_id is not None:
            connection_params["project_id"] = project_id
        if dataset_id is not None:
            connection_params["dataset_id"] = dataset_id
        if location is not None:
            connection_params["location"] = location

        # Handle authentication - prioritize explicit credentials
        if credentials_path is not None:
            connection_params["credentials"] = self._load_service_account_credentials(
                credentials_path
            )
        elif credentials is not None:
            connection_params["credentials"] = credentials
        # Otherwise, Ibis will automatically use Application Default Credentials

        return connection_params
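
    # For reference (derived from the method above, not part of the original
    # file): only explicitly provided arguments appear in the result, e.g.
    #
    #     self._build_connection_params(project_id="my-project", location="EU")
    #     # -> {"project_id": "my-project", "location": "EU"}
    #
    # plus a "credentials" entry when credentials_path or credentials is set.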
    def _load_service_account_credentials(self, credentials_path: str) -> Any:
        """Load service account credentials from a JSON file.

        Uses Google's recommended approach with google.oauth2.service_account
        instead of manually parsing JSON and constructing credentials.

        Args:
            credentials_path: Path to service account JSON file

        Returns:
            Google Cloud credentials object

        Raises:
            ImportError: If google-auth library not installed
            FileNotFoundError: If credentials file doesn't exist
            ValueError: If credentials file is invalid
        """
        try:
            from google.oauth2 import (
                service_account,  # pyright: ignore[reportMissingImports]
            )
        except ImportError as e:
            raise ImportError(
                "Google Cloud authentication libraries required for service account credentials. "
                "Install with: pip install google-auth"
            ) from e

        try:
            # Use Google's recommended method - it handles all edge cases
            return service_account.Credentials.from_service_account_file(
                credentials_path,
                scopes=["https://www.googleapis.com/auth/bigquery"],
            )
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"Service account credentials file not found: {credentials_path}"
            ) from e
        except Exception as e:
            # Catch JSON decode errors and other credential format issues
            raise ValueError(
                f"Invalid service account credentials file: {credentials_path}. "
                "Ensure it's a valid service account JSON key file."
            ) from e
    def _get_default_hash_algorithm(self) -> HashAlgorithm:
        # Should switch to FARM_FINGERPRINT64 once https://github.com/ion-elgreco/polars-hash/issues/49 is resolved
        return HashAlgorithm.MD5
    def _create_hash_functions(self):
        """Create BigQuery-specific hash functions for Ibis expressions.

        BigQuery supports FARM_FINGERPRINT, MD5, and SHA256 natively.
        """
        # Import ibis for wrapping built-in SQL functions
        import ibis

        # Use Ibis's builtin UDF decorator to wrap BigQuery's hash functions
        @ibis.udf.scalar.builtin
        def MD5(x: str) -> str:
            """BigQuery MD5() function."""
            ...

        @ibis.udf.scalar.builtin
        def FARM_FINGERPRINT(x: str) -> str:
            """BigQuery FARM_FINGERPRINT() function."""
            ...

        @ibis.udf.scalar.builtin
        def SHA256(x: str) -> str:
            """BigQuery SHA256() function."""
            ...

        @ibis.udf.scalar.builtin
        def TO_HEX(x: str) -> str:
            """BigQuery TO_HEX() function."""
            ...

        @ibis.udf.scalar.builtin
        def LOWER(x: str) -> str:
            """BigQuery LOWER() function."""
            ...

        # Create hash functions that use these wrapped SQL functions
        def md5_hash(col_expr):
            """Hash a column using BigQuery's MD5() function."""
            # MD5 returns bytes, convert to lowercase hex string
            return LOWER(TO_HEX(MD5(col_expr.cast(str))))

        def farmhash_hash(col_expr):
            """Hash a column using BigQuery's FARM_FINGERPRINT() function."""
            # FARM_FINGERPRINT returns INT64, cast to string
            return FARM_FINGERPRINT(col_expr).cast(str)

        def sha256_hash(col_expr):
            """Hash a column using BigQuery's SHA256() function."""
            # SHA256 returns bytes, convert to lowercase hex string
            return LOWER(TO_HEX(SHA256(col_expr)))

        hash_functions = {
            HashAlgorithm.MD5: md5_hash,
            HashAlgorithm.FARMHASH: farmhash_hash,
            HashAlgorithm.SHA256: sha256_hash,
        }

        return hash_functions
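
    # Illustrative sketch (not from this file): the wrapped builtins above
    # compile to plain BigQuery SQL. `ibis.to_sql` is assumed to exist in the
    # installed Ibis version; the table and column names are made up.
    #
    #     import ibis
    #
    #     @ibis.udf.scalar.builtin
    #     def FARM_FINGERPRINT(x: str) -> str: ...
    #
    #     t = ibis.table({"sample_key": "string"}, name="metadata")
    #     expr = t.select(h=FARM_FINGERPRINT(t.sample_key).cast(str))
    #     print(ibis.to_sql(expr, dialect="bigquery"))
    #     # roughly: SELECT CAST(farm_fingerprint(`sample_key`) AS STRING) AS `h` FROM ...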
    def display(self) -> str:
        """Display string for this store."""
        dataset_info = f"/{self.dataset_id}" if self.dataset_id else ""
        return f"BigQueryMetadataStore(project={self.project_id}{dataset_info})"

    @classmethod
    def config_model(cls) -> type[BigQueryMetadataStoreConfig]:  # pyright: ignore[reportIncompatibleMethodOverride]
        return BigQueryMetadataStoreConfig
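The `fallback_stores` parameter documented above applies to both stores in this diff. A minimal sketch of chaining them, assuming only the constructor signatures shown here (the read-through behavior is inferred from the "ordered list of read-only fallback stores" docstring, not verified):

```python
from metaxy.metadata_store.bigquery import BigQueryMetadataStore
from metaxy.metadata_store.clickhouse import ClickHouseMetadataStore

# Reads that miss in ClickHouse fall back to the (read-only) BigQuery store.
archive = BigQueryMetadataStore(project_id="my-project", dataset_id="ml_metadata")
store = ClickHouseMetadataStore(
    "clickhouse://localhost:9000/default",
    fallback_stores=[archive],
)
```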

metaxy/metadata_store/clickhouse.py
@@ -0,0 +1,184 @@
"""ClickHouse metadata store - thin wrapper around IbisMetadataStore."""

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from metaxy.metadata_store.base import MetadataStore

from metaxy.metadata_store.ibis import IbisMetadataStore, IbisMetadataStoreConfig
from metaxy.versioning.types import HashAlgorithm

class ClickHouseMetadataStoreConfig(IbisMetadataStoreConfig):
    """Configuration for ClickHouseMetadataStore.

    Inherits connection_string, connection_params, table_prefix, and auto_create_tables from IbisMetadataStoreConfig.

    Example:
        ```python
        config = ClickHouseMetadataStoreConfig(
            connection_string="clickhouse://localhost:9000/default",
            hash_algorithm=HashAlgorithm.XXHASH64,
        )

        store = ClickHouseMetadataStore.from_config(config)
        ```
    """

    pass  # All fields inherited from IbisMetadataStoreConfig

class ClickHouseMetadataStore(IbisMetadataStore):
    """
    [ClickHouse](https://clickhouse.com/) metadata store using an [Ibis](https://ibis-project.org/) backend.

    Example: Connection Parameters
        ```py
        store = ClickHouseMetadataStore(
            connection_params={
                "host": "localhost",
                "port": 9000,
                "database": "default",
                "user": "default",
                "password": "",
            },
            hash_algorithm=HashAlgorithm.XXHASH64,
        )
        ```
    """
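
    # Illustrative sketch (not from this file): the same store built from a
    # connection string; the format is documented in `__init__` below.
    #
    #     store = ClickHouseMetadataStore("clickhouse://default@localhost:9000/default")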
    def __init__(
        self,
        connection_string: str | None = None,
        *,
        connection_params: dict[str, Any] | None = None,
        fallback_stores: list["MetadataStore"] | None = None,
        **kwargs: Any,
    ):
        """
        Initialize [ClickHouse](https://clickhouse.com/) metadata store.

        Args:
            connection_string: ClickHouse connection string.

                Format: `clickhouse://[user[:password]@]host[:port]/database[?param=value]`

                Examples:

                - "clickhouse://localhost:9000/default"
                - "clickhouse://user:pass@host:9000/db"
                - "clickhouse://host:9000/db?secure=true"

            connection_params: Alternative to connection_string, specify params as a dict:

                - host: Server host
                - port: Server port (default: `9000`)
                - database: Database name
                - user: Username
                - password: Password
                - secure: Use secure connection (default: `False`)

            fallback_stores: Ordered list of read-only fallback stores.

            **kwargs: Passed to [metaxy.metadata_store.ibis.IbisMetadataStore][]

        Raises:
            ImportError: If ibis-clickhouse is not installed
            ValueError: If neither connection_string nor connection_params is provided
        """
        if connection_string is None and connection_params is None:
            raise ValueError(
                "Must provide either connection_string or connection_params. "
                "Example: connection_string='clickhouse://localhost:9000/default'"
            )

        # Initialize Ibis store with ClickHouse backend
        super().__init__(
            connection_string=connection_string,
            backend="clickhouse" if connection_string is None else None,
            connection_params=connection_params,
            fallback_stores=fallback_stores,
            **kwargs,
        )

    def _get_default_hash_algorithm(self) -> HashAlgorithm:
        """Get default hash algorithm for ClickHouse stores.

        Uses XXHASH64, which is built in to ClickHouse.
        """
        return HashAlgorithm.XXHASH64
    def _create_hash_functions(self):
        """Create ClickHouse-specific hash functions for Ibis expressions.

        Implements MD5 and xxHash functions using ClickHouse's native functions.
        """
        # Import ibis for wrapping built-in SQL functions
        import ibis

        hash_functions = {}

        # ClickHouse MD5 implementation
        @ibis.udf.scalar.builtin
        def MD5(x: str) -> str:
            """ClickHouse MD5() function."""
            ...

        @ibis.udf.scalar.builtin
        def HEX(x: str) -> str:
            """ClickHouse HEX() function."""
            ...

        @ibis.udf.scalar.builtin
        def lower(x: str) -> str:
            """ClickHouse lower() function."""
            ...

        def md5_hash(col_expr):
            """Hash a column using ClickHouse's MD5() function."""
            # MD5 returns binary FixedString(16), convert to lowercase hex
            return lower(HEX(MD5(col_expr.cast(str))))

        hash_functions[HashAlgorithm.MD5] = md5_hash

        # ClickHouse xxHash functions
        @ibis.udf.scalar.builtin
        def xxHash32(x: str) -> int:
            """ClickHouse xxHash32() function - returns UInt32."""
            ...

        @ibis.udf.scalar.builtin
        def xxHash64(x: str) -> int:
            """ClickHouse xxHash64() function - returns UInt64."""
            ...

        @ibis.udf.scalar.builtin
        def toString(x: int) -> str:
            """ClickHouse toString() function - converts integer to string."""
            ...

        def xxhash32_hash(col_expr):
            """Hash a column using ClickHouse's xxHash32() function."""
            # xxHash32 returns UInt32, convert to string
            return toString(xxHash32(col_expr))

        def xxhash64_hash(col_expr):
            """Hash a column using ClickHouse's xxHash64() function."""
            # xxHash64 returns UInt64, convert to string
            return toString(xxHash64(col_expr))

        hash_functions[HashAlgorithm.XXHASH32] = xxhash32_hash
        hash_functions[HashAlgorithm.XXHASH64] = xxhash64_hash

        return hash_functions
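
    # For reference (derived from the wrappers above, not part of the original
    # file): the compiled ClickHouse SQL has roughly the shape
    #
    #     lower(HEX(MD5(CAST(x AS String))))  -- HashAlgorithm.MD5
    #     toString(xxHash64(x))               -- HashAlgorithm.XXHASH64
    #
    # with xxHash32 analogous; the exact cast spelling depends on the Ibis compiler.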
    @classmethod
    def config_model(cls) -> type[ClickHouseMetadataStoreConfig]:  # pyright: ignore[reportIncompatibleMethodOverride]
        return ClickHouseMetadataStoreConfig
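
Both stores accept a `hash_algorithm` keyword (it appears in the docstring examples above), and the supported values follow from each store's `_create_hash_functions`. A minimal sketch, assuming `hash_algorithm` is forwarded through `**kwargs` to `IbisMetadataStore`:

```python
from metaxy.metadata_store.clickhouse import ClickHouseMetadataStore
from metaxy.versioning.types import HashAlgorithm

# The ClickHouse default is XXHASH64; MD5 and XXHASH32 are the other
# registered options.
store = ClickHouseMetadataStore(
    "clickhouse://localhost:9000/default",
    hash_algorithm=HashAlgorithm.MD5,
)
```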