metaxy 0.0.1.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,446 @@
|
|
|
1
|
+
"""DuckDB metadata store - thin wrapper around IbisMetadataStore."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable, Iterator, Mapping, Sequence
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field, ValidationError
|
|
9
|
+
from typing_extensions import Self
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from metaxy.metadata_store.base import MetadataStore
|
|
13
|
+
|
|
14
|
+
from metaxy.metadata_store._ducklake_support import (
|
|
15
|
+
DuckDBPyConnection,
|
|
16
|
+
DuckLakeAttachmentConfig,
|
|
17
|
+
DuckLakeAttachmentManager,
|
|
18
|
+
DuckLakeConfigInput,
|
|
19
|
+
build_ducklake_attachment,
|
|
20
|
+
ensure_extensions_with_plugins,
|
|
21
|
+
)
|
|
22
|
+
from metaxy.metadata_store.ibis import IbisMetadataStore, IbisMetadataStoreConfig
|
|
23
|
+
from metaxy.metadata_store.types import AccessMode
|
|
24
|
+
from metaxy.versioning.types import HashAlgorithm
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ExtensionSpec(BaseModel):
    """
    DuckDB extension specification accepted by DuckDBMetadataStore.

    Supports additional keys for forward compatibility.
    """

    # Extension name as known to DuckDB (e.g. "hashfuncs").
    name: str
    # Repository to install from; None means the community repository
    # (see _load_extensions, which defaults ext_repo to "community").
    repository: str | None = None

    # extra="allow" keeps unknown keys instead of rejecting them, so newer
    # spec fields from future versions do not break validation.
    model_config = ConfigDict(extra="allow")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Raw user-facing extension input: a bare extension name, an already-validated
# spec, or any mapping coercible into an ExtensionSpec.
ExtensionInput = str | ExtensionSpec | Mapping[str, Any]
# After _normalise_extensions(): only bare names or validated specs remain.
NormalisedExtension = str | ExtensionSpec
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DuckDBMetadataStoreConfig(IbisMetadataStoreConfig):
    """Configuration for DuckDBMetadataStore.

    Mirrors the keyword arguments of ``DuckDBMetadataStore.__init__`` so a
    store can be built declaratively via ``DuckDBMetadataStore.from_config``.

    Example:
        ```python
        config = DuckDBMetadataStoreConfig(
            database="metadata.db",
            extensions=["hashfuncs"],
            hash_algorithm=HashAlgorithm.XXHASH64,
        )

        store = DuckDBMetadataStore.from_config(config)
        ```
    """

    # Path or DuckDB connection string; accepted forms are documented on
    # DuckDBMetadataStore.__init__ (file path, :memory:, md:..., etc.).
    database: str | Path = Field(
        description="Database path (:memory:, file path, or md:database).",
    )
    # Passed through to the Ibis DuckDB backend as connection parameters.
    config: dict[str, str] | None = Field(
        default=None,
        description="DuckDB configuration settings (e.g., {'threads': '4'}).",
    )
    # Accepts strings, ExtensionSpec instances, or mapping-like specs;
    # normalised by _normalise_extensions at store construction time.
    extensions: Sequence[ExtensionInput] | None = Field(
        default=None,
        description="DuckDB extensions to install and load on open.",
    )
    # When set, the DuckLake catalog is ATTACHed after the connection opens.
    ducklake: DuckLakeConfigInput | None = Field(
        default=None,
        description="DuckLake attachment configuration.",
    )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _normalise_extensions(
    extensions: Iterable[ExtensionInput],
) -> list[NormalisedExtension]:
    """Coerce each extension input into a bare name or a validated spec.

    Plain strings and already-validated ExtensionSpec instances pass
    through untouched; mapping-like inputs are validated into
    ExtensionSpec. Any other type is rejected with TypeError.
    """
    result: list[NormalisedExtension] = []
    for entry in extensions:
        # Bare extension name — keep as-is.
        if isinstance(entry, str):
            result.append(entry)
            continue
        # Already validated — keep as-is.
        if isinstance(entry, ExtensionSpec):
            result.append(entry)
            continue
        # Mapping-like spec — validate into an ExtensionSpec, surfacing
        # validation failures as ValueError with the offending input.
        if isinstance(entry, Mapping):
            try:
                spec = ExtensionSpec.model_validate(entry)
            except ValidationError as exc:
                raise ValueError(f"Invalid DuckDB extension spec: {entry!r}") from exc
            result.append(spec)
            continue
        raise TypeError(
            "DuckDB extensions must be strings or mapping-like objects with a 'name'."
        )
    return result
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class DuckDBMetadataStore(IbisMetadataStore):
    """
    [DuckDB](https://duckdb.org/) metadata store using [Ibis](https://ibis-project.org/) backend.

    Example: Local File
        ```py
        store = DuckDBMetadataStore("metadata.db")
        ```

    Example: In-memory database
        ```py
        # In-memory database
        store = DuckDBMetadataStore(":memory:")
        ```

    Example: MotherDuck
        ```py
        # MotherDuck
        store = DuckDBMetadataStore("md:my_database")
        ```

    Example: With extensions
        ```py
        # With extensions
        store = DuckDBMetadataStore(
            "metadata.db",
            hash_algorithm=HashAlgorithm.XXHASH64,
            extensions=["hashfuncs"]
        )
        ```
    """

    def __init__(
        self,
        database: str | Path,
        *,
        config: dict[str, str] | None = None,
        extensions: Sequence[ExtensionInput] | None = None,
        fallback_stores: list["MetadataStore"] | None = None,
        ducklake: DuckLakeConfigInput | None = None,
        **kwargs,
    ):
        """
        Initialize [DuckDB](https://duckdb.org/) metadata store.

        Args:
            database: Database connection string or path.
                - File path: `"metadata.db"` or `Path("metadata.db")`

                - In-memory: `":memory:"`

                - MotherDuck: `"md:my_database"` or `"md:my_database?motherduck_token=..."`

                - S3: `"s3://bucket/path/database.duckdb"` (read-only via ATTACH)

                - HTTPS: `"https://example.com/database.duckdb"` (read-only via ATTACH)

                - Any valid DuckDB connection string

            config: Optional DuckDB configuration settings (e.g., {'threads': '4', 'memory_limit': '4GB'})
            extensions: List of DuckDB extensions to install and load on open.
                Supports strings (community repo), mapping-like objects with
                ``name``/``repository`` keys, or [metaxy.metadata_store.duckdb.ExtensionSpec][] instances.

            ducklake: Optional DuckLake attachment configuration. Provide either a
                mapping with 'metadata_backend' and 'storage_backend' entries or a
                DuckLakeAttachmentConfig instance. When supplied, the DuckDB
                connection is configured to ATTACH the DuckLake catalog after open().
            fallback_stores: Ordered list of read-only fallback stores.

            **kwargs: Passed to [metaxy.metadata_store.ibis.IbisMetadataStore][]

        Warning:
            Parent directories are NOT created automatically. Ensure paths exist
            before initializing the store.
        """
        database_str = str(database)

        # Build connection params for Ibis DuckDB backend
        # Ibis DuckDB backend accepts config params directly (not nested under 'config')
        connection_params = {"database": database_str}
        if config:
            connection_params.update(config)

        self.database = database_str
        base_extensions: list[NormalisedExtension] = _normalise_extensions(
            extensions or []
        )

        self._ducklake_config: DuckLakeAttachmentConfig | None = None
        self._ducklake_attachment: DuckLakeAttachmentManager | None = None
        if ducklake is not None:
            # Building the attachment may require extra DuckDB plugins; make
            # sure they are present in the extension list before connecting.
            attachment_config, manager = build_ducklake_attachment(ducklake)
            ensure_extensions_with_plugins(base_extensions, attachment_config.plugins)
            self._ducklake_config = attachment_config
            self._ducklake_attachment = manager

        self.extensions = base_extensions

        # Auto-add hashfuncs extension if not present (needed for default XXHASH64)
        # But we'll fall back to MD5 if hashfuncs is not available
        extension_names: list[str] = []
        for ext in self.extensions:
            if isinstance(ext, str):
                extension_names.append(ext)
            elif isinstance(ext, ExtensionSpec):
                extension_names.append(ext.name)
            else:
                # After _normalise_extensions, this should not happen
                # But keep defensive check for type safety
                raise TypeError(
                    f"Extension must be str or ExtensionSpec after normalization; got {type(ext)}"
                )
        if "hashfuncs" not in extension_names:
            self.extensions.append("hashfuncs")

        # Initialize Ibis store with DuckDB backend
        super().__init__(
            backend="duckdb",
            connection_params=connection_params,
            fallback_stores=fallback_stores,
            **kwargs,
        )

    @property
    def sqlalchemy_url(self) -> str:
        """Get SQLAlchemy-compatible connection URL for DuckDB.

        Constructs a DuckDB SQLAlchemy URL from the database parameter.

        Returns:
            SQLAlchemy-compatible URL string (e.g., "duckdb:///path/to/db.db")

        Example:
            ```python
            store = DuckDBMetadataStore(":memory:")
            print(store.sqlalchemy_url)  # duckdb:///:memory:

            store = DuckDBMetadataStore("metadata.db")
            print(store.sqlalchemy_url)  # duckdb:///metadata.db
            ```
        """
        # DuckDB SQLAlchemy URL format: duckdb:///database_path
        return f"duckdb:///{self.database}"

    def _get_default_hash_algorithm(self) -> HashAlgorithm:
        """Get default hash algorithm for DuckDB stores.

        Always returns MD5, which DuckDB provides without extensions. When
        the hashfuncs extension loads, _create_hash_functions additionally
        registers XXHASH32/XXHASH64 for callers that request them.
        """
        # Default to MD5 which is always available
        # If hashfuncs loads successfully, the calculator will support XXHASH64 too
        return HashAlgorithm.MD5

    @contextmanager
    def _create_versioning_engine(self, plan):
        """Create provenance engine for DuckDB backend as a context manager.

        Args:
            plan: Feature plan for the feature we're tracking provenance for

        Yields:
            IbisVersioningEngine with DuckDB-specific hash functions.

        Note:
            Extensions are loaded lazily when engine is created.
        """
        # Load extensions first (if connection is open)
        if self._conn is not None:
            self._load_extensions()

        # Call parent implementation (which calls our _create_hash_functions)
        with super()._create_versioning_engine(plan) as engine:
            yield engine

    def _load_extensions(self) -> None:
        """Install and load the configured DuckDB extensions on the raw connection."""
        if not self.extensions:
            return

        # Get raw DuckDB connection
        duckdb_conn = self._duckdb_raw_connection()

        for ext_spec in self.extensions:
            # Extract name and repository
            if isinstance(ext_spec, str):
                ext_name = ext_spec
                ext_repo = "community"
            elif isinstance(ext_spec, ExtensionSpec):
                ext_name = ext_spec.name
                ext_repo = ext_spec.repository or "community"
            else:
                raise TypeError(
                    f"Extension must be str or ExtensionSpec; got {type(ext_spec)}"
                )

            # Install and load the extension
            # NOTE(review): ext_name/ext_repo are interpolated into SQL unescaped.
            # Extension names come from store configuration, not end users, but
            # confirm config sources are trusted before widening this API.
            if ext_repo == "community":
                duckdb_conn.execute(f"INSTALL {ext_name} FROM community")
            else:
                # NOTE(review): this setting persists on the connection for
                # subsequent INSTALLs — verify that is the intended behavior
                # when mixing repositories.
                duckdb_conn.execute(f"SET custom_extension_repository='{ext_repo}'")
                duckdb_conn.execute(f"INSTALL {ext_name}")

            duckdb_conn.execute(f"LOAD {ext_name}")

    def _create_hash_functions(self):
        """Create DuckDB-specific hash functions for Ibis expressions.

        Implements MD5 and xxHash functions using DuckDB's native functions.

        Returns hash functions that take Ibis column expressions and return
        Ibis expressions that call DuckDB SQL functions.
        """
        # Import ibis for wrapping built-in SQL functions
        import ibis

        hash_functions = {}

        # DuckDB MD5 implementation
        @ibis.udf.scalar.builtin
        def MD5(x: str) -> str:
            """DuckDB MD5() function."""
            ...

        @ibis.udf.scalar.builtin
        def HEX(x: str) -> str:
            """DuckDB HEX() function."""
            ...

        @ibis.udf.scalar.builtin
        def LOWER(x: str) -> str:
            """DuckDB LOWER() function."""
            ...

        def md5_hash(col_expr):
            """Hash a column using DuckDB's MD5() function."""
            # MD5 already returns hex string, just convert to lowercase
            return LOWER(MD5(col_expr.cast(str)))

        hash_functions[HashAlgorithm.MD5] = md5_hash

        # Determine which extensions are available
        extension_names = []
        for ext in self.extensions:
            if isinstance(ext, str):
                extension_names.append(ext)
            elif isinstance(ext, ExtensionSpec):
                extension_names.append(ext.name)

        # Add xxHash functions if hashfuncs extension is loaded
        if "hashfuncs" in extension_names:
            # Use Ibis's builtin UDF decorator to wrap DuckDB's xxhash functions
            # These functions already exist in DuckDB (via hashfuncs extension)
            # The decorator tells Ibis to call them directly in SQL
            # NOTE: xxh32/xxh64 return integers in DuckDB, not strings
            @ibis.udf.scalar.builtin
            def xxh32(x: str) -> int:
                """DuckDB xxh32() hash function from hashfuncs extension."""
                ...

            @ibis.udf.scalar.builtin
            def xxh64(x: str) -> int:
                """DuckDB xxh64() hash function from hashfuncs extension."""
                ...

            # Create hash functions that use these wrapped SQL functions
            def xxhash32_hash(col_expr):
                """Hash a column using DuckDB's xxh32() function."""
                # Cast to string and then cast result to string (xxh32 returns integer in DuckDB)
                return xxh32(col_expr.cast(str)).cast(str)

            def xxhash64_hash(col_expr):
                """Hash a column using DuckDB's xxh64() function."""
                # Cast to string and then cast result to string (xxh64 returns integer in DuckDB)
                return xxh64(col_expr.cast(str)).cast(str)

            hash_functions[HashAlgorithm.XXHASH32] = xxhash32_hash
            hash_functions[HashAlgorithm.XXHASH64] = xxhash64_hash

        return hash_functions

    # ------------------------------------------------------------------ DuckLake
    @contextmanager
    def open(self, mode: AccessMode = "read") -> Iterator[Self]:
        """Open DuckDB connection with specified access mode.

        Args:
            mode: Access mode (READ or WRITE). Defaults to READ.
                READ mode sets read_only=True for concurrent access.

        Yields:
            Self: The store instance with connection open
        """
        # Setup: Configure connection params based on mode
        # NOTE(review): this mutates self.connection_params, so the read_only
        # flag persists across open() calls until the next WRITE open removes
        # it — confirm re-entrant opens with mixed modes behave as intended.
        if mode == "read":
            self.connection_params["read_only"] = True
        else:
            # Remove read_only if present (switching to WRITE)
            self.connection_params.pop("read_only", None)

        # Call parent context manager to establish connection
        with super().open(mode):
            try:
                # Configure DuckLake if needed (only on first entry)
                if self._ducklake_attachment is not None and self._context_depth == 1:
                    duckdb_conn = self._duckdb_raw_connection()
                    self._ducklake_attachment.configure(duckdb_conn)

                yield self
            finally:
                # Cleanup is handled by parent's finally block
                pass

    def preview_ducklake_sql(self) -> list[str]:
        """Return DuckLake attachment SQL if configured.

        Raises:
            RuntimeError: If no DuckLake attachment is configured (via the
                ducklake_attachment property).
        """
        return self.ducklake_attachment.preview_sql()

    @property
    def ducklake_attachment(self) -> DuckLakeAttachmentManager:
        """DuckLake attachment manager (raises if not configured)."""
        if self._ducklake_attachment is None:
            raise RuntimeError("DuckLake attachment is not configured.")
        return self._ducklake_attachment

    @property
    def ducklake_attachment_config(self) -> DuckLakeAttachmentConfig:
        """DuckLake attachment configuration (raises if not configured)."""
        if self._ducklake_config is None:
            raise RuntimeError("DuckLake attachment is not configured.")
        return self._ducklake_config

    def _duckdb_raw_connection(self) -> DuckDBPyConnection:
        """Return the underlying DuckDBPyConnection from the Ibis backend.

        Raises:
            RuntimeError: If the store has not been opened.
            TypeError: If the Ibis backend's 'con' attribute is not a
                DuckDBPyConnection.
        """
        if self._conn is None:
            raise RuntimeError("DuckDB connection is not open.")

        # Ibis's DuckDB backend exposes the raw driver connection as .con;
        # this is not part of Ibis's typed public API, hence the ignore.
        candidate = self._conn.con  # pyright: ignore[reportAttributeAccessIssue]

        if not isinstance(candidate, DuckDBPyConnection):
            raise TypeError(
                f"Expected DuckDB backend 'con' to be DuckDBPyConnection, "
                f"got {type(candidate).__name__}"
            )

        return candidate

    @classmethod
    def config_model(cls) -> type[DuckDBMetadataStoreConfig]:  # pyright: ignore[reportIncompatibleMethodOverride]
        # Narrows the parent's config model to the DuckDB-specific one.
        return DuckDBMetadataStoreConfig
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Exceptions for metadata store operations."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class MetadataStoreError(Exception):
    """Common base class for all metadata-store exceptions."""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FeatureNotFoundError(MetadataStoreError):
    """Signals that a requested feature does not exist in the store."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class SystemDataNotFoundError(MetadataStoreError):
    """Signals that the store's system features could not be found."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class FieldNotFoundError(MetadataStoreError):
    """Signals that a feature does not define the requested field."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class MetadataSchemaError(MetadataStoreError):
    """Signals that a metadata DataFrame does not match the expected schema."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DependencyError(MetadataStoreError):
    """Signals missing or invalid upstream dependencies."""
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class StoreNotOpenError(MetadataStoreError):
    """Signals an operation on a store whose connection was never opened."""
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class HashAlgorithmNotSupportedError(MetadataStoreError):
    """Signals a hash algorithm unsupported by the store or its components."""
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class TableNotFoundError(MetadataStoreError):
    """Signals a missing table while auto_create_tables is disabled."""
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class VersioningEngineMismatchError(Exception):
    """Signals that versioning_engine='native' was requested but the data
    carries a different implementation.

    NOTE(review): unlike its siblings in this module, this class derives
    from Exception rather than MetadataStoreError, so a broad
    ``except MetadataStoreError`` will NOT catch it — confirm whether
    this is intentional before relying on the hierarchy.
    """
|