metaxy 0.0.1.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
"""LanceDB metadata store implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from collections.abc import Iterator, Sequence
|
|
7
|
+
from contextlib import contextmanager
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import narwhals as nw
|
|
12
|
+
import polars as pl
|
|
13
|
+
from narwhals.typing import Frame
|
|
14
|
+
from pydantic import Field
|
|
15
|
+
from typing_extensions import Self
|
|
16
|
+
|
|
17
|
+
from metaxy._utils import collect_to_polars
|
|
18
|
+
from metaxy.metadata_store.base import MetadataStore, MetadataStoreConfig
|
|
19
|
+
from metaxy.metadata_store.types import AccessMode
|
|
20
|
+
from metaxy.metadata_store.utils import is_local_path, sanitize_uri
|
|
21
|
+
from metaxy.models.types import CoercibleToFeatureKey, FeatureKey
|
|
22
|
+
from metaxy.versioning.polars import PolarsVersioningEngine
|
|
23
|
+
from metaxy.versioning.types import HashAlgorithm
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class LanceDBMetadataStoreConfig(MetadataStoreConfig):
    """Configuration for LanceDBMetadataStore.

    Example:
        ```python
        config = LanceDBMetadataStoreConfig(
            uri="/path/to/featuregraph",
            connect_kwargs={"api_key": "your-api-key"},
        )

        store = LanceDBMetadataStore.from_config(config)
        ```
    """

    # Directory path or URI where the Lance tables live (local path,
    # object-store URI, or a LanceDB Cloud "db://" URI).
    uri: str | Path = Field(
        description="Directory path or URI for LanceDB tables.",
    )
    # Forwarded verbatim to lancedb.connect(); useful for credentials
    # (e.g. api_key, region) when environment variables are not an option.
    connect_kwargs: dict[str, Any] | None = Field(
        default=None,
        description="Extra keyword arguments passed to lancedb.connect().",
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class LanceDBMetadataStore(MetadataStore):
    """
    [LanceDB](https://lancedb.github.io/lancedb/) metadata store for vector and structured data.

    LanceDB is a columnar database optimized for vector search and multimodal data.
    Each feature is stored in its own Lance table within the database directory.
    Uses Polars components for data processing (no native SQL execution).

    Storage layout:

    - Each feature gets its own table: `{namespace}__{feature_name}`

    - Tables are stored as Lance format in the directory specified by the URI

    - LanceDB handles schema evolution, transactions, and compaction automatically

    Example: Local Directory
        ```py
        from pathlib import Path
        from metaxy.metadata_store.lancedb import LanceDBMetadataStore

        # Local filesystem
        store = LanceDBMetadataStore(Path("/path/to/featuregraph"))
        ```

    Example: Object Storage (S3, GCS, Azure)
        ```py
        # object store (requires credentials)
        store = LanceDBMetadataStore("s3:///path/to/featuregraph")
        ```

    Example: LanceDB Cloud
        ```py
        import os

        # Option 1: Environment variable
        os.environ["LANCEDB_API_KEY"] = "your-api-key"
        store = LanceDBMetadataStore("db://my-database")

        # Option 2: Explicit credentials
        store = LanceDBMetadataStore(
            "db://my-database",
            connect_kwargs={"api_key": "your-api-key", "region": "us-east-1"}
        )
        ```
    """

    # Tables are created lazily on first write, so the base class's
    # auto_create_tables warning would be noise for this backend.
    _should_warn_auto_create_tables = False

    def __init__(
        self,
        uri: str | Path,
        *,
        fallback_stores: list[MetadataStore] | None = None,
        connect_kwargs: dict[str, Any] | None = None,
        **kwargs: Any,
    ):
        """
        Initialize [LanceDB](https://lancedb.com/docs/) metadata store.

        The database directory is created automatically if it doesn't exist (local paths only).
        Tables are created on-demand when features are first written.

        Args:
            uri: Directory path or URI for LanceDB tables. Supports:

                - **Local path**: `"./metadata"` or `Path("/data/metaxy/lancedb")`

                - **Object stores**: `s3://`, `gs://`, `az://` (requires cloud credentials)

                - **LanceDB Cloud**: `"db://database-name"` (requires API key)

                - **Remote HTTP/HTTPS**: Any URI supported by LanceDB

            fallback_stores: Ordered list of read-only fallback stores.
                When reading features not found in this store, Metaxy searches
                fallback stores in order. Useful for local dev → staging → production chains.
            connect_kwargs: Extra keyword arguments passed directly to
                [lancedb.connect()](https://lancedb.github.io/lancedb/python/python/#lancedb.connect).
                Useful for LanceDB Cloud credentials (api_key, region) when you cannot
                rely on environment variables.
            **kwargs: Passed to [metaxy.metadata_store.base.MetadataStore][]
                (e.g., hash_algorithm, hash_truncation_length, prefer_native)

        Note:
            Unlike SQL stores, LanceDB doesn't require explicit table creation.
            Tables are created automatically when writing metadata.
        """
        # Normalize Path inputs to a string once; everything downstream
        # treats the URI uniformly.
        self.uri: str = str(uri)
        # Connection is established lazily in open(); None while closed.
        self._conn: Any | None = None
        self._connect_kwargs = connect_kwargs or {}
        super().__init__(
            fallback_stores=fallback_stores,
            auto_create_tables=True,
            versioning_engine_cls=PolarsVersioningEngine,
            **kwargs,
        )

    @contextmanager
    def _create_versioning_engine(self, plan) -> Iterator[PolarsVersioningEngine]:
        """Create Polars versioning engine for LanceDB.

        Args:
            plan: Feature plan for the feature we're tracking provenance for

        Yields:
            PolarsVersioningEngine instance
        """
        engine = PolarsVersioningEngine(plan=plan)
        try:
            yield engine
        finally:
            # No cleanup needed for Polars engine
            pass

    @contextmanager
    def open(self, mode: AccessMode = "read") -> Iterator[Self]:
        """Open LanceDB connection.

        For local filesystem paths, creates the directory if it doesn't exist.
        For remote URIs (S3, LanceDB Cloud, etc.), connects directly.
        Tables are created on-demand when features are first written.

        Args:
            mode: Access mode (READ or WRITE). Accepted for consistency but not used
                by LanceDB (LanceDB handles concurrent access internally).

        Yields:
            Self: The store instance

        Raises:
            ConnectionError: If remote connection fails (e.g., invalid credentials)
        """
        # Increment context depth to support nested contexts
        self._context_depth += 1

        try:
            # Only perform actual open on first entry
            if self._context_depth == 1:
                import lancedb

                # Remote URIs (s3://, db://, ...) must not be mkdir'd locally.
                if is_local_path(self.uri):
                    Path(self.uri).mkdir(parents=True, exist_ok=True)

                self._conn = lancedb.connect(self.uri, **self._connect_kwargs)
                self._is_open = True
                self._validate_after_open()

            yield self
        finally:
            # Decrement context depth
            self._context_depth -= 1

            # Only perform actual close on last exit
            if self._context_depth == 0:
                self._conn = None
                self._is_open = False

    @property
    def conn(self) -> Any:
        """Get LanceDB connection.

        Returns:
            Active LanceDB connection

        Raises:
            StoreNotOpenError: If store is not open
        """
        from metaxy.metadata_store.exceptions import StoreNotOpenError

        if self._conn is None:
            raise StoreNotOpenError(
                "LanceDB connection is not open. Store must be used as a context manager."
            )
        return self._conn

    # Helpers -----------------------------------------------------------------

    def _table_name(self, feature_key: FeatureKey) -> str:
        # Table naming is delegated to the feature key itself.
        return feature_key.table_name

    def _table_exists(self, table_name: str) -> bool:
        """Check if a table exists without listing all tables.

        Uses open_table() which is more efficient than listing all tables,
        especially for remote storage (S3, GCS, etc.) where listing is expensive.

        Args:
            table_name: Name of the table to check

        Returns:
            True if table exists, False otherwise
        """
        try:
            self.conn.open_table(table_name)  # type: ignore[attr-defined]
            return True
        except (ValueError, FileNotFoundError):
            # LanceDB raises ValueError when table doesn't exist
            return False

    def _get_table(self, table_name: str):
        # Thin wrapper; callers are expected to have checked existence first.
        return self.conn.open_table(table_name)  # type: ignore[attr-defined]

    # ===== MetadataStore abstract methods =====

    def _has_feature_impl(self, feature: CoercibleToFeatureKey) -> bool:
        """Check if feature exists in LanceDB store.

        Args:
            feature: Feature to check

        Returns:
            True if feature exists, False otherwise
        """
        feature_key = self._resolve_feature_key(feature)
        table_name = self._table_name(feature_key)
        return self._table_exists(table_name)

    def _get_default_hash_algorithm(self) -> HashAlgorithm:
        """Use XXHASH64 by default to match other non-SQL stores."""
        return HashAlgorithm.XXHASH64

    # Storage ------------------------------------------------------------------

    def write_metadata_to_store(
        self,
        feature_key: FeatureKey,
        df: Frame,
        **kwargs: Any,
    ) -> None:
        """Append metadata to Lance table.

        Creates the table if it doesn't exist, otherwise appends to existing table.
        Uses LanceDB's native Polars/Arrow integration for efficient storage.

        Args:
            feature_key: Feature key to write to
            df: Narwhals Frame with metadata (already validated by base class)
        """
        # Convert Narwhals frame to Polars DataFrame
        df_polars = collect_to_polars(df)

        table_name = self._table_name(feature_key)

        # LanceDB supports both Polars DataFrames and Arrow tables directly
        # Try Polars first (native integration), fall back to Arrow if needed
        try:
            if self._table_exists(table_name):
                table = self._get_table(table_name)
                # Use Polars DataFrame directly - LanceDB handles conversion
                table.add(df_polars)  # type: ignore[attr-defined]
            else:
                # Create table from Polars DataFrame - LanceDB handles schema
                self.conn.create_table(table_name, data=df_polars)  # type: ignore[attr-defined]
        except TypeError as exc:
            # Only swallow TypeErrors that look like "Polars not accepted";
            # anything else is a genuine bug and must propagate.
            if not self._should_fallback_to_arrow(exc):
                raise
            # Defensive fallback: Modern LanceDB (>=0.3) accepts Polars DataFrames natively,
            # but fall back to Arrow if an older version or edge case doesn't support it.
            # This ensures compatibility across LanceDB versions.
            logger.debug("Falling back to Arrow format for LanceDB write: %s", exc)
            arrow_table = df_polars.to_arrow()
            if self._table_exists(table_name):
                table = self._get_table(table_name)
                table.add(arrow_table)  # type: ignore[attr-defined]
            else:
                self.conn.create_table(table_name, data=arrow_table)  # type: ignore[attr-defined]

    def _drop_feature_metadata_impl(self, feature_key: FeatureKey) -> None:
        """Drop Lance table for feature.

        Permanently removes the Lance table from the database directory.
        Safe to call even if table doesn't exist (no-op).

        Args:
            feature_key: Feature key to drop metadata for
        """
        table_name = self._table_name(feature_key)
        if self._table_exists(table_name):
            self.conn.drop_table(table_name)  # type: ignore[attr-defined]

    def read_metadata_in_store(
        self,
        feature: CoercibleToFeatureKey,
        *,
        filters: Sequence[nw.Expr] | None = None,
        columns: Sequence[str] | None = None,
        **kwargs: Any,
    ) -> nw.LazyFrame[Any] | None:
        """Read metadata from Lance table.

        Loads data from Lance, converts to Polars, and returns as Narwhals LazyFrame.
        Applies filters and column selection in memory.

        Args:
            feature: Feature to read
            filters: List of Narwhals filter expressions
            columns: Optional list of columns to select
            **kwargs: Backend-specific parameters (unused)

        Returns:
            Narwhals LazyFrame with metadata, or None if table not found
        """
        self._check_open()
        feature_key = self._resolve_feature_key(feature)
        table_name = self._table_name(feature_key)
        if not self._table_exists(table_name):
            return None

        table = self._get_table(table_name)
        # https://github.com/lancedb/lancedb/issues/1539
        # Fall back to eager Arrow conversion until LanceDB issue #1539 is resolved.
        arrow_table = table.to_arrow()
        pl_lazy = pl.DataFrame(arrow_table).lazy()
        nw_lazy = nw.from_native(pl_lazy)

        if filters:
            nw_lazy = nw_lazy.filter(*filters)

        if columns is not None:
            nw_lazy = nw_lazy.select(columns)

        return nw_lazy

    @staticmethod
    def _should_fallback_to_arrow(exc: TypeError) -> bool:
        """Return True when TypeError likely originates from Polars support gaps."""
        # Heuristic keyword match on the exception text; intentionally broad
        # because LanceDB versions word this error differently.
        message = str(exc).lower()
        polars_markers = ("polars", "dataframe", "lazyframe", "data frame")
        return any(marker in message for marker in polars_markers)

    # Display ------------------------------------------------------------------

    def display(self) -> str:
        """Human-readable representation with sanitized credentials."""
        path = sanitize_uri(self.uri)
        return f"LanceDBMetadataStore(path={path})"

    @classmethod
    def config_model(cls) -> type[LanceDBMetadataStoreConfig]:  # pyright: ignore[reportIncompatibleMethodOverride]
        return LanceDBMetadataStoreConfig
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
"""In-memory metadata store implementation."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterator, Sequence
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import narwhals as nw
|
|
8
|
+
import polars as pl
|
|
9
|
+
from narwhals.typing import Frame
|
|
10
|
+
from typing_extensions import Self
|
|
11
|
+
|
|
12
|
+
from metaxy._utils import collect_to_polars
|
|
13
|
+
from metaxy.metadata_store.base import MetadataStore, MetadataStoreConfig
|
|
14
|
+
from metaxy.metadata_store.types import AccessMode
|
|
15
|
+
from metaxy.models.types import CoercibleToFeatureKey, FeatureKey
|
|
16
|
+
from metaxy.versioning.polars import PolarsVersioningEngine
|
|
17
|
+
from metaxy.versioning.types import HashAlgorithm
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class InMemoryMetadataStoreConfig(MetadataStoreConfig):
    """Configuration for InMemoryMetadataStore.

    Example:
        ```python
        config = InMemoryMetadataStoreConfig(
            hash_algorithm=HashAlgorithm.XXHASH64,
        )

        store = InMemoryMetadataStore.from_config(config)
        ```
    """

    # No store-specific options: the in-memory backend only uses the shared
    # fields inherited from MetadataStoreConfig.
    pass
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class InMemoryMetadataStore(MetadataStore):
    """
    Ephemeral metadata store backed by a plain Python dict.

    Features:
    - Simple dict storage: {FeatureKey: pl.DataFrame}
    - Fast for testing and prototyping
    - No persistence (data lost when process exits)
    - Schema validation on write
    - Uses Polars components for all operations

    Limitations:
    - Not suitable for production
    - Data lost on process exit
    - No concurrency support across processes
    - Memory-bound (all data in RAM)

    Notes:
        Uses Narwhals LazyFrames (nw.LazyFrame) for all operations

    Components:
        Components are created on-demand in resolve_update().
        Uses Polars internally but exposes Narwhals interface.
        Only supports Polars components (no native backend).
    """

    # The auto_create_tables warning is meaningless for dict storage,
    # so it is silenced for this backend.
    _should_warn_auto_create_tables: bool = False

    def __init__(self, **kwargs: Any):
        """
        Initialize in-memory store.

        Args:
            **kwargs: Passed to MetadataStore.__init__ (e.g., fallback_stores, hash_algorithm)
        """
        # Keyed by tuple(feature_key): tuples are hashable and sidestep any
        # string-parsing ambiguity a joined-string key would introduce.
        self._storage: dict[tuple[str, ...], pl.DataFrame] = {}
        super().__init__(**kwargs, versioning_engine_cls=PolarsVersioningEngine)

    def _get_default_hash_algorithm(self) -> HashAlgorithm:
        """Default hash algorithm for the in-memory backend."""
        return HashAlgorithm.XXHASH64

    def _get_storage_key(self, feature_key: FeatureKey) -> tuple[str, ...]:
        """Turn a feature key into the hashable tuple used as dict key."""
        return tuple(feature_key)

    @contextmanager
    def _create_versioning_engine(self, plan) -> Iterator[PolarsVersioningEngine]:
        """Yield a Polars versioning engine for *plan*.

        Args:
            plan: Feature plan for the feature we're tracking provenance for

        Yields:
            PolarsVersioningEngine instance (no teardown required)
        """
        yield PolarsVersioningEngine(plan=plan)

    def _has_feature_impl(self, feature: CoercibleToFeatureKey) -> bool:
        # Membership test against the dict; O(1).
        resolved = self._resolve_feature_key(feature)
        return self._get_storage_key(resolved) in self._storage

    def write_metadata_to_store(
        self,
        feature_key: FeatureKey,
        df: Frame,
        **kwargs: Any,
    ) -> None:
        """
        Append metadata for *feature_key*, creating the entry on first write.

        Schema drift between the stored frame and the incoming one is handled
        by padding each side with typed null columns before concatenating.

        Args:
            feature_key: Feature key to write to
            df: Narwhals Frame (eager or lazy) with metadata (already validated)
            **kwargs: Backend-specific parameters (currently unused)
        """
        incoming: pl.DataFrame = collect_to_polars(df)
        key = self._get_storage_key(feature_key)
        current = self._storage.get(key)

        if current is None:
            # First write for this feature: store as-is.
            self._storage[key] = incoming
            return

        # Pad the stored frame with null columns for anything only the
        # incoming frame has, using the incoming frame's dtype.
        for name in incoming.columns:
            if name not in current.columns:
                current = current.with_columns(
                    pl.lit(None).cast(incoming.schema[name]).alias(name)
                )

        # And vice versa: pad the incoming frame with the stored frame's
        # missing columns.
        for name in current.columns:
            if name not in incoming.columns:
                incoming = incoming.with_columns(
                    pl.lit(None).cast(current.schema[name]).alias(name)
                )

        # A deterministic (sorted) column order makes the vertical concat safe.
        ordered = sorted(set(current.columns) | set(incoming.columns))
        self._storage[key] = pl.concat(
            [current.select(ordered), incoming.select(ordered)],
            how="vertical",
        )

    def _drop_feature_metadata_impl(self, feature_key: FeatureKey) -> None:
        """Remove all metadata for a feature (no-op when absent).

        Args:
            feature_key: Feature key to drop metadata for
        """
        self._storage.pop(self._get_storage_key(feature_key), None)

    def read_metadata_in_store(
        self,
        feature: CoercibleToFeatureKey,
        *,
        feature_version: str | None = None,
        filters: Sequence[nw.Expr] | None = None,
        columns: Sequence[str] | None = None,
        **kwargs: Any,
    ) -> nw.LazyFrame[Any] | None:
        """
        Read metadata from this store only (fallbacks are not consulted).

        Args:
            feature: Feature to read
            feature_version: Filter by specific feature_version
            filters: List of Narwhals filter expressions, applied in order
            columns: Optional list of columns to select
            **kwargs: Backend-specific parameters (currently unused)

        Returns:
            Narwhals LazyFrame with metadata, or None if not found

        Raises:
            StoreNotOpenError: If store is not open
        """
        self._check_open()

        key = self._get_storage_key(self._resolve_feature_key(feature))
        stored = self._storage.get(key)
        if stored is None:
            return None

        # Wrap the lazy Polars frame in the Narwhals interface.
        lazy = nw.from_native(stored.lazy())

        if feature_version is not None:
            lazy = lazy.filter(
                nw.col("metaxy_feature_version") == feature_version
            )

        for expr in filters or ():
            lazy = lazy.filter(expr)

        if columns is not None:
            lazy = lazy.select(columns)

        # Emptiness is only observable on materialization; return lazily.
        return lazy

    def clear(self) -> None:
        """
        Drop every stored feature.

        Useful for testing.
        """
        self._storage.clear()

    # ========== Context Manager Implementation ==========

    @contextmanager
    def open(self, mode: AccessMode = "read") -> Iterator[Self]:
        """Open the store (a no-op for memory storage; *mode* is ignored).

        Nested `with` blocks are supported via a depth counter: only the
        outermost entry flips the open flag, only the outermost exit clears it.

        Args:
            mode: Access mode (accepted for consistency but ignored).

        Yields:
            Self: The store instance
        """
        self._context_depth += 1
        try:
            if self._context_depth == 1:
                # Nothing to connect to — mark open and validate.
                self._is_open = True
                self._validate_after_open()
            yield self
        finally:
            self._context_depth -= 1
            if self._context_depth == 0:
                self._is_open = False

    def __repr__(self) -> str:
        """Debug representation including open state and fallback count."""
        state = "open" if self._is_open else "closed"
        count = len(self.fallback_stores)
        return (
            f"InMemoryMetadataStore(status={state}, fallback_stores={count})"
        )

    def display(self) -> str:
        """Human-readable representation."""
        state = "open" if self._is_open else "closed"
        return f"InMemoryMetadataStore(status={state})"

    @classmethod
    def config_model(cls) -> type[InMemoryMetadataStoreConfig]:  # pyright: ignore[reportIncompatibleMethodOverride]
        return InMemoryMetadataStoreConfig
|