metaxy-0.0.1.dev3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. metaxy/__init__.py +170 -0
  2. metaxy/_packaging.py +96 -0
  3. metaxy/_testing/__init__.py +55 -0
  4. metaxy/_testing/config.py +43 -0
  5. metaxy/_testing/metaxy_project.py +780 -0
  6. metaxy/_testing/models.py +111 -0
  7. metaxy/_testing/parametric/__init__.py +13 -0
  8. metaxy/_testing/parametric/metadata.py +664 -0
  9. metaxy/_testing/pytest_helpers.py +74 -0
  10. metaxy/_testing/runbook.py +533 -0
  11. metaxy/_utils.py +35 -0
  12. metaxy/_version.py +1 -0
  13. metaxy/cli/app.py +97 -0
  14. metaxy/cli/console.py +13 -0
  15. metaxy/cli/context.py +167 -0
  16. metaxy/cli/graph.py +610 -0
  17. metaxy/cli/graph_diff.py +290 -0
  18. metaxy/cli/list.py +46 -0
  19. metaxy/cli/metadata.py +317 -0
  20. metaxy/cli/migrations.py +999 -0
  21. metaxy/cli/utils.py +268 -0
  22. metaxy/config.py +680 -0
  23. metaxy/entrypoints.py +296 -0
  24. metaxy/ext/__init__.py +1 -0
  25. metaxy/ext/dagster/__init__.py +54 -0
  26. metaxy/ext/dagster/constants.py +10 -0
  27. metaxy/ext/dagster/dagster_type.py +156 -0
  28. metaxy/ext/dagster/io_manager.py +200 -0
  29. metaxy/ext/dagster/metaxify.py +512 -0
  30. metaxy/ext/dagster/observable.py +115 -0
  31. metaxy/ext/dagster/resources.py +27 -0
  32. metaxy/ext/dagster/selection.py +73 -0
  33. metaxy/ext/dagster/table_metadata.py +417 -0
  34. metaxy/ext/dagster/utils.py +462 -0
  35. metaxy/ext/sqlalchemy/__init__.py +23 -0
  36. metaxy/ext/sqlalchemy/config.py +29 -0
  37. metaxy/ext/sqlalchemy/plugin.py +353 -0
  38. metaxy/ext/sqlmodel/__init__.py +13 -0
  39. metaxy/ext/sqlmodel/config.py +29 -0
  40. metaxy/ext/sqlmodel/plugin.py +499 -0
  41. metaxy/graph/__init__.py +29 -0
  42. metaxy/graph/describe.py +325 -0
  43. metaxy/graph/diff/__init__.py +21 -0
  44. metaxy/graph/diff/diff_models.py +446 -0
  45. metaxy/graph/diff/differ.py +769 -0
  46. metaxy/graph/diff/models.py +443 -0
  47. metaxy/graph/diff/rendering/__init__.py +18 -0
  48. metaxy/graph/diff/rendering/base.py +323 -0
  49. metaxy/graph/diff/rendering/cards.py +188 -0
  50. metaxy/graph/diff/rendering/formatter.py +805 -0
  51. metaxy/graph/diff/rendering/graphviz.py +246 -0
  52. metaxy/graph/diff/rendering/mermaid.py +326 -0
  53. metaxy/graph/diff/rendering/rich.py +169 -0
  54. metaxy/graph/diff/rendering/theme.py +48 -0
  55. metaxy/graph/diff/traversal.py +247 -0
  56. metaxy/graph/status.py +329 -0
  57. metaxy/graph/utils.py +58 -0
  58. metaxy/metadata_store/__init__.py +32 -0
  59. metaxy/metadata_store/_ducklake_support.py +419 -0
  60. metaxy/metadata_store/base.py +1792 -0
  61. metaxy/metadata_store/bigquery.py +354 -0
  62. metaxy/metadata_store/clickhouse.py +184 -0
  63. metaxy/metadata_store/delta.py +371 -0
  64. metaxy/metadata_store/duckdb.py +446 -0
  65. metaxy/metadata_store/exceptions.py +61 -0
  66. metaxy/metadata_store/ibis.py +542 -0
  67. metaxy/metadata_store/lancedb.py +391 -0
  68. metaxy/metadata_store/memory.py +292 -0
  69. metaxy/metadata_store/system/__init__.py +57 -0
  70. metaxy/metadata_store/system/events.py +264 -0
  71. metaxy/metadata_store/system/keys.py +9 -0
  72. metaxy/metadata_store/system/models.py +129 -0
  73. metaxy/metadata_store/system/storage.py +957 -0
  74. metaxy/metadata_store/types.py +10 -0
  75. metaxy/metadata_store/utils.py +104 -0
  76. metaxy/metadata_store/warnings.py +36 -0
  77. metaxy/migrations/__init__.py +32 -0
  78. metaxy/migrations/detector.py +291 -0
  79. metaxy/migrations/executor.py +516 -0
  80. metaxy/migrations/generator.py +319 -0
  81. metaxy/migrations/loader.py +231 -0
  82. metaxy/migrations/models.py +528 -0
  83. metaxy/migrations/ops.py +447 -0
  84. metaxy/models/__init__.py +0 -0
  85. metaxy/models/bases.py +12 -0
  86. metaxy/models/constants.py +139 -0
  87. metaxy/models/feature.py +1335 -0
  88. metaxy/models/feature_spec.py +338 -0
  89. metaxy/models/field.py +263 -0
  90. metaxy/models/fields_mapping.py +307 -0
  91. metaxy/models/filter_expression.py +297 -0
  92. metaxy/models/lineage.py +285 -0
  93. metaxy/models/plan.py +232 -0
  94. metaxy/models/types.py +475 -0
  95. metaxy/py.typed +0 -0
  96. metaxy/utils/__init__.py +1 -0
  97. metaxy/utils/constants.py +2 -0
  98. metaxy/utils/exceptions.py +23 -0
  99. metaxy/utils/hashing.py +230 -0
  100. metaxy/versioning/__init__.py +31 -0
  101. metaxy/versioning/engine.py +656 -0
  102. metaxy/versioning/feature_dep_transformer.py +151 -0
  103. metaxy/versioning/ibis.py +249 -0
  104. metaxy/versioning/lineage_handler.py +205 -0
  105. metaxy/versioning/polars.py +189 -0
  106. metaxy/versioning/renamed_df.py +35 -0
  107. metaxy/versioning/types.py +63 -0
  108. metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
  109. metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
  110. metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
  111. metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
metaxy/metadata_store/lancedb.py
@@ -0,0 +1,391 @@
+ """LanceDB metadata store implementation."""
+
+ from __future__ import annotations
+
+ import logging
+ from collections.abc import Iterator, Sequence
+ from contextlib import contextmanager
+ from pathlib import Path
+ from typing import Any
+
+ import narwhals as nw
+ import polars as pl
+ from narwhals.typing import Frame
+ from pydantic import Field
+ from typing_extensions import Self
+
+ from metaxy._utils import collect_to_polars
+ from metaxy.metadata_store.base import MetadataStore, MetadataStoreConfig
+ from metaxy.metadata_store.types import AccessMode
+ from metaxy.metadata_store.utils import is_local_path, sanitize_uri
+ from metaxy.models.types import CoercibleToFeatureKey, FeatureKey
+ from metaxy.versioning.polars import PolarsVersioningEngine
+ from metaxy.versioning.types import HashAlgorithm
+
+ logger = logging.getLogger(__name__)
+
+
+ class LanceDBMetadataStoreConfig(MetadataStoreConfig):
+     """Configuration for LanceDBMetadataStore.
+
+     Example:
+         ```python
+         config = LanceDBMetadataStoreConfig(
+             uri="/path/to/featuregraph",
+             connect_kwargs={"api_key": "your-api-key"},
+         )
+
+         store = LanceDBMetadataStore.from_config(config)
+         ```
+     """
+
+     uri: str | Path = Field(
+         description="Directory path or URI for LanceDB tables.",
+     )
+     connect_kwargs: dict[str, Any] | None = Field(
+         default=None,
+         description="Extra keyword arguments passed to lancedb.connect().",
+     )
+
+
+ class LanceDBMetadataStore(MetadataStore):
+     """
+     [LanceDB](https://lancedb.github.io/lancedb/) metadata store for vector and structured data.
+
+     LanceDB is a columnar database optimized for vector search and multimodal data.
+     Each feature is stored in its own Lance table within the database directory.
+     Uses Polars components for data processing (no native SQL execution).
+
+     Storage layout:
+
+     - Each feature gets its own table: `{namespace}__{feature_name}`
+
+     - Tables are stored in Lance format in the directory specified by the URI
+
+     - LanceDB handles schema evolution, transactions, and compaction automatically
+
+     Example: Local Directory
+         ```py
+         from pathlib import Path
+         from metaxy.metadata_store.lancedb import LanceDBMetadataStore
+
+         # Local filesystem
+         store = LanceDBMetadataStore(Path("/path/to/featuregraph"))
+         ```
+
+     Example: Object Storage (S3, GCS, Azure)
+         ```py
+         # Object store (requires credentials)
+         store = LanceDBMetadataStore("s3://bucket/path/to/featuregraph")
+         ```
+
+     Example: LanceDB Cloud
+         ```py
+         import os
+
+         # Option 1: Environment variable
+         os.environ["LANCEDB_API_KEY"] = "your-api-key"
+         store = LanceDBMetadataStore("db://my-database")
+
+         # Option 2: Explicit credentials
+         store = LanceDBMetadataStore(
+             "db://my-database",
+             connect_kwargs={"api_key": "your-api-key", "region": "us-east-1"}
+         )
+         ```
+     """
+
+     _should_warn_auto_create_tables = False
+
+     def __init__(
+         self,
+         uri: str | Path,
+         *,
+         fallback_stores: list[MetadataStore] | None = None,
+         connect_kwargs: dict[str, Any] | None = None,
+         **kwargs: Any,
+     ):
+         """
+         Initialize [LanceDB](https://lancedb.com/docs/) metadata store.
+
+         The database directory is created automatically if it doesn't exist (local paths only).
+         Tables are created on-demand when features are first written.
+
+         Args:
+             uri: Directory path or URI for LanceDB tables. Supports:
+
+                 - **Local path**: `"./metadata"` or `Path("/data/metaxy/lancedb")`
+
+                 - **Object stores**: `s3://`, `gs://`, `az://` (requires cloud credentials)
+
+                 - **LanceDB Cloud**: `"db://database-name"` (requires API key)
+
+                 - **Remote HTTP/HTTPS**: Any URI supported by LanceDB
+
+             fallback_stores: Ordered list of read-only fallback stores.
+                 When reading features not found in this store, Metaxy searches
+                 fallback stores in order. Useful for local dev → staging → production chains.
+             connect_kwargs: Extra keyword arguments passed directly to
+                 [lancedb.connect()](https://lancedb.github.io/lancedb/python/python/#lancedb.connect).
+                 Useful for LanceDB Cloud credentials (api_key, region) when you cannot
+                 rely on environment variables.
+             **kwargs: Passed to [metaxy.metadata_store.base.MetadataStore][]
+                 (e.g., hash_algorithm, hash_truncation_length, prefer_native)
+
+         Note:
+             Unlike SQL stores, LanceDB doesn't require explicit table creation.
+             Tables are created automatically when writing metadata.
+         """
+         self.uri: str = str(uri)
+         self._conn: Any | None = None
+         self._connect_kwargs = connect_kwargs or {}
+         super().__init__(
+             fallback_stores=fallback_stores,
+             auto_create_tables=True,
+             versioning_engine_cls=PolarsVersioningEngine,
+             **kwargs,
+         )
+
+     @contextmanager
+     def _create_versioning_engine(self, plan):
+         """Create Polars versioning engine for LanceDB.
+
+         Args:
+             plan: Feature plan for the feature we're tracking provenance for
+
+         Yields:
+             PolarsVersioningEngine instance
+         """
+         engine = PolarsVersioningEngine(plan=plan)
+         try:
+             yield engine
+         finally:
+             # No cleanup needed for Polars engine
+             pass
+
+     @contextmanager
+     def open(self, mode: AccessMode = "read") -> Iterator[Self]:
+         """Open LanceDB connection.
+
+         For local filesystem paths, creates the directory if it doesn't exist.
+         For remote URIs (S3, LanceDB Cloud, etc.), connects directly.
+         Tables are created on-demand when features are first written.
+
+         Args:
+             mode: Access mode (READ or WRITE). Accepted for consistency but not used
+                 by LanceDB (LanceDB handles concurrent access internally).
+
+         Yields:
+             Self: The store instance
+
+         Raises:
+             ConnectionError: If remote connection fails (e.g., invalid credentials)
+         """
+         # Increment context depth to support nested contexts
+         self._context_depth += 1
+
+         try:
+             # Only perform actual open on first entry
+             if self._context_depth == 1:
+                 import lancedb
+
+                 if is_local_path(self.uri):
+                     Path(self.uri).mkdir(parents=True, exist_ok=True)
+
+                 self._conn = lancedb.connect(self.uri, **self._connect_kwargs)
+                 self._is_open = True
+                 self._validate_after_open()
+
+             yield self
+         finally:
+             # Decrement context depth
+             self._context_depth -= 1
+
+             # Only perform actual close on last exit
+             if self._context_depth == 0:
+                 self._conn = None
+                 self._is_open = False
+
+     @property
+     def conn(self) -> Any:
+         """Get LanceDB connection.
+
+         Returns:
+             Active LanceDB connection
+
+         Raises:
+             StoreNotOpenError: If store is not open
+         """
+         from metaxy.metadata_store.exceptions import StoreNotOpenError
+
+         if self._conn is None:
+             raise StoreNotOpenError(
+                 "LanceDB connection is not open. Store must be used as a context manager."
+             )
+         return self._conn
+
+     # Helpers -----------------------------------------------------------------
+
+     def _table_name(self, feature_key: FeatureKey) -> str:
+         return feature_key.table_name
+
+     def _table_exists(self, table_name: str) -> bool:
+         """Check if a table exists without listing all tables.
+
+         Uses open_table() which is more efficient than listing all tables,
+         especially for remote storage (S3, GCS, etc.) where listing is expensive.
+
+         Args:
+             table_name: Name of the table to check
+
+         Returns:
+             True if table exists, False otherwise
+         """
+         try:
+             self.conn.open_table(table_name)  # type: ignore[attr-defined]
+             return True
+         except (ValueError, FileNotFoundError):
+             # LanceDB raises ValueError when table doesn't exist
+             return False
+
+     def _get_table(self, table_name: str):
+         return self.conn.open_table(table_name)  # type: ignore[attr-defined]
+
+     # ===== MetadataStore abstract methods =====
+
+     def _has_feature_impl(self, feature: CoercibleToFeatureKey) -> bool:
+         """Check if feature exists in LanceDB store.
+
+         Args:
+             feature: Feature to check
+
+         Returns:
+             True if feature exists, False otherwise
+         """
+         feature_key = self._resolve_feature_key(feature)
+         table_name = self._table_name(feature_key)
+         return self._table_exists(table_name)
+
+     def _get_default_hash_algorithm(self) -> HashAlgorithm:
+         """Use XXHASH64 by default to match other non-SQL stores."""
+         return HashAlgorithm.XXHASH64
+
+     # Storage ------------------------------------------------------------------
+
+     def write_metadata_to_store(
+         self,
+         feature_key: FeatureKey,
+         df: Frame,
+         **kwargs: Any,
+     ) -> None:
+         """Append metadata to Lance table.
+
+         Creates the table if it doesn't exist, otherwise appends to existing table.
+         Uses LanceDB's native Polars/Arrow integration for efficient storage.
+
+         Args:
+             feature_key: Feature key to write to
+             df: Narwhals Frame with metadata (already validated by base class)
+         """
+         # Convert Narwhals frame to Polars DataFrame
+         df_polars = collect_to_polars(df)
+
+         table_name = self._table_name(feature_key)
+
+         # LanceDB supports both Polars DataFrames and Arrow tables directly
+         # Try Polars first (native integration), fall back to Arrow if needed
+         try:
+             if self._table_exists(table_name):
+                 table = self._get_table(table_name)
+                 # Use Polars DataFrame directly - LanceDB handles conversion
+                 table.add(df_polars)  # type: ignore[attr-defined]
+             else:
+                 # Create table from Polars DataFrame - LanceDB handles schema
+                 self.conn.create_table(table_name, data=df_polars)  # type: ignore[attr-defined]
+         except TypeError as exc:
+             if not self._should_fallback_to_arrow(exc):
+                 raise
+             # Defensive fallback: Modern LanceDB (>=0.3) accepts Polars DataFrames natively,
+             # but fall back to Arrow if an older version or edge case doesn't support it.
+             # This ensures compatibility across LanceDB versions.
+             logger.debug("Falling back to Arrow format for LanceDB write: %s", exc)
+             arrow_table = df_polars.to_arrow()
+             if self._table_exists(table_name):
+                 table = self._get_table(table_name)
+                 table.add(arrow_table)  # type: ignore[attr-defined]
+             else:
+                 self.conn.create_table(table_name, data=arrow_table)  # type: ignore[attr-defined]
+
+     def _drop_feature_metadata_impl(self, feature_key: FeatureKey) -> None:
+         """Drop Lance table for feature.
+
+         Permanently removes the Lance table from the database directory.
+         Safe to call even if table doesn't exist (no-op).
+
+         Args:
+             feature_key: Feature key to drop metadata for
+         """
+         table_name = self._table_name(feature_key)
+         if self._table_exists(table_name):
+             self.conn.drop_table(table_name)  # type: ignore[attr-defined]
+
+     def read_metadata_in_store(
+         self,
+         feature: CoercibleToFeatureKey,
+         *,
+         filters: Sequence[nw.Expr] | None = None,
+         columns: Sequence[str] | None = None,
+         **kwargs: Any,
+     ) -> nw.LazyFrame[Any] | None:
+         """Read metadata from Lance table.
+
+         Loads data from Lance, converts to Polars, and returns as Narwhals LazyFrame.
+         Applies filters and column selection in memory.
+
+         Args:
+             feature: Feature to read
+             filters: List of Narwhals filter expressions
+             columns: Optional list of columns to select
+             **kwargs: Backend-specific parameters (unused)
+
+         Returns:
+             Narwhals LazyFrame with metadata, or None if table not found
+         """
+         self._check_open()
+         feature_key = self._resolve_feature_key(feature)
+         table_name = self._table_name(feature_key)
+         if not self._table_exists(table_name):
+             return None
+
+         table = self._get_table(table_name)
+         # https://github.com/lancedb/lancedb/issues/1539
+         # Fall back to eager Arrow conversion until LanceDB issue #1539 is resolved.
+         arrow_table = table.to_arrow()
+         pl_lazy = pl.DataFrame(arrow_table).lazy()
+         nw_lazy = nw.from_native(pl_lazy)
+
+         if filters:
+             nw_lazy = nw_lazy.filter(*filters)
+
+         if columns is not None:
+             nw_lazy = nw_lazy.select(columns)
+
+         return nw_lazy
+
+     @staticmethod
+     def _should_fallback_to_arrow(exc: TypeError) -> bool:
+         """Return True when TypeError likely originates from Polars support gaps."""
+         message = str(exc).lower()
+         polars_markers = ("polars", "dataframe", "lazyframe", "data frame")
+         return any(marker in message for marker in polars_markers)
+
+     # Display ------------------------------------------------------------------
+
+     def display(self) -> str:
+         """Human-readable representation with sanitized credentials."""
+         path = sanitize_uri(self.uri)
+         return f"LanceDBMetadataStore(path={path})"
+
+     @classmethod
+     def config_model(cls) -> type[LanceDBMetadataStoreConfig]:  # pyright: ignore[reportIncompatibleMethodOverride]
+         return LanceDBMetadataStoreConfig
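
Both stores in this diff guard their lifecycle with the same reference-counted `open()`: nested `with store.open(...)` blocks share one underlying connection, and teardown runs only when the outermost block exits. A minimal standalone sketch of that pattern follows; the `Resource` class and the `object()` connection stand-in are illustrative, not metaxy APIs.

```python
from contextlib import contextmanager


class Resource:
    """Illustrative stand-in for a store with a ref-counted open()."""

    def __init__(self) -> None:
        self._context_depth = 0
        self._conn: object | None = None

    @contextmanager
    def open(self):
        self._context_depth += 1
        try:
            if self._context_depth == 1:
                # First entry: actually connect (cf. lancedb.connect above)
                self._conn = object()
            yield self
        finally:
            self._context_depth -= 1
            if self._context_depth == 0:
                # Last exit: actually release the connection
                self._conn = None


r = Resource()
with r.open():
    with r.open():  # nested entry reuses the same "connection"
        assert r._conn is not None
assert r._conn is None  # released only after the outermost exit
```

In `LanceDBMetadataStore.open()`, the first entry is where `lancedb.connect()` runs and the local directory is created; every nested entry reuses `self._conn`.
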
metaxy/metadata_store/memory.py
@@ -0,0 +1,292 @@
+ """In-memory metadata store implementation."""
+
+ from collections.abc import Iterator, Sequence
+ from contextlib import contextmanager
+ from typing import Any
+
+ import narwhals as nw
+ import polars as pl
+ from narwhals.typing import Frame
+ from typing_extensions import Self
+
+ from metaxy._utils import collect_to_polars
+ from metaxy.metadata_store.base import MetadataStore, MetadataStoreConfig
+ from metaxy.metadata_store.types import AccessMode
+ from metaxy.models.types import CoercibleToFeatureKey, FeatureKey
+ from metaxy.versioning.polars import PolarsVersioningEngine
+ from metaxy.versioning.types import HashAlgorithm
+
+
+ class InMemoryMetadataStoreConfig(MetadataStoreConfig):
+     """Configuration for InMemoryMetadataStore.
+
+     Example:
+         ```python
+         config = InMemoryMetadataStoreConfig(
+             hash_algorithm=HashAlgorithm.XXHASH64,
+         )
+
+         store = InMemoryMetadataStore.from_config(config)
+         ```
+     """
+
+     pass
+
+
+ class InMemoryMetadataStore(MetadataStore):
+     """
+     In-memory metadata store using dict-based storage.
+
+     Features:
+         - Simple dict storage: {FeatureKey: pl.DataFrame}
+         - Fast for testing and prototyping
+         - No persistence (data lost when process exits)
+         - Schema validation on write
+         - Uses Polars components for all operations
+
+     Limitations:
+         - Not suitable for production
+         - Data lost on process exit
+         - No concurrency support across processes
+         - Memory-bound (all data in RAM)
+
+     Notes:
+         Uses Narwhals LazyFrames (nw.LazyFrame) for all operations
+
+     Components:
+         Components are created on-demand in resolve_update().
+         Uses Polars internally but exposes Narwhals interface.
+         Only supports Polars components (no native backend).
+     """
+
+     # Disable auto_create_tables warning for in-memory store
+     # (table creation concept doesn't apply to memory storage)
+     _should_warn_auto_create_tables: bool = False
+
+     def __init__(self, **kwargs: Any):
+         """
+         Initialize in-memory store.
+
+         Args:
+             **kwargs: Passed to MetadataStore.__init__ (e.g., fallback_stores, hash_algorithm)
+         """
+         # Use tuple as key (hashable) instead of string to avoid parsing issues
+         self._storage: dict[tuple[str, ...], pl.DataFrame] = {}
+         super().__init__(**kwargs, versioning_engine_cls=PolarsVersioningEngine)
+
+     def _get_default_hash_algorithm(self) -> HashAlgorithm:
+         """Get default hash algorithm for in-memory store."""
+         return HashAlgorithm.XXHASH64
+
+     def _get_storage_key(self, feature_key: FeatureKey) -> tuple[str, ...]:
+         """Convert feature key to storage key (tuple for hashability)."""
+         return tuple(feature_key)
+
+     @contextmanager
+     def _create_versioning_engine(self, plan) -> Iterator[PolarsVersioningEngine]:
+         """Create Polars provenance engine for in-memory store.
+
+         Args:
+             plan: Feature plan for the feature we're tracking provenance for
+
+         Yields:
+             PolarsVersioningEngine instance
+         """
+         # PolarsVersioningEngine is imported at module level; it only accepts
+         # the plan parameter.
+         engine = PolarsVersioningEngine(plan=plan)
+
+         try:
+             yield engine
+         finally:
+             # No cleanup needed for Polars engine
+             pass
+
+     def _has_feature_impl(self, feature: CoercibleToFeatureKey) -> bool:
+         feature_key = self._resolve_feature_key(feature)
+         storage_key = self._get_storage_key(feature_key)
+         return storage_key in self._storage
+
+     def write_metadata_to_store(
+         self,
+         feature_key: FeatureKey,
+         df: Frame,
+         **kwargs: Any,
+     ) -> None:
+         """
+         Internal write implementation for in-memory storage.
+
+         Args:
+             feature_key: Feature key to write to
+             df: Narwhals Frame (eager or lazy) with metadata (already validated)
+             **kwargs: Backend-specific parameters (currently unused)
+         """
+         df_polars: pl.DataFrame = collect_to_polars(df)
+
+         storage_key = self._get_storage_key(feature_key)
+
+         # Append or create
+         if storage_key in self._storage:
+             existing_df = self._storage[storage_key]
+
+             # Handle schema evolution: ensure both DataFrames have matching columns
+             # Add missing columns as null to the existing DataFrame
+             for col_name in df_polars.columns:
+                 if col_name not in existing_df.columns:
+                     # Get the data type from the new DataFrame
+                     col_dtype = df_polars.schema[col_name]
+                     # Add column with null values of the appropriate type
+                     existing_df = existing_df.with_columns(
+                         pl.lit(None).cast(col_dtype).alias(col_name)
+                     )
+
+             # Add missing columns to the new DataFrame
+             for col_name in existing_df.columns:
+                 if col_name not in df_polars.columns:
+                     # Get the data type from the existing DataFrame
+                     col_dtype = existing_df.schema[col_name]
+                     # Add column with null values of the appropriate type
+                     df_polars = df_polars.with_columns(
+                         pl.lit(None).cast(col_dtype).alias(col_name)
+                     )  # type: ignore[arg-type,union-attr]
+
+             # Ensure column order matches by selecting columns in consistent order
+             all_columns = sorted(set(existing_df.columns) | set(df_polars.columns))
+             existing_df = existing_df.select(all_columns)
+             df_polars = df_polars.select(all_columns)
+
+             # Now we can safely concat
+             self._storage[storage_key] = pl.concat(
+                 [existing_df, df_polars],
+                 how="vertical",
+             )
+         else:
+             # Create new
+             self._storage[storage_key] = df_polars
+
+     def _drop_feature_metadata_impl(self, feature_key: FeatureKey) -> None:
+         """Drop all metadata for a feature from in-memory storage.
+
+         Args:
+             feature_key: Feature key to drop metadata for
+         """
+         storage_key = self._get_storage_key(feature_key)
+
+         # Remove from storage if it exists
+         if storage_key in self._storage:
+             del self._storage[storage_key]
+
+     def read_metadata_in_store(
+         self,
+         feature: CoercibleToFeatureKey,
+         *,
+         feature_version: str | None = None,
+         filters: Sequence[nw.Expr] | None = None,
+         columns: Sequence[str] | None = None,
+         **kwargs: Any,
+     ) -> nw.LazyFrame[Any] | None:
+         """
+         Read metadata from this store only (no fallback).
+
+         Args:
+             feature: Feature to read
+             feature_version: Filter by specific feature_version
+             filters: List of Narwhals filter expressions
+             columns: Optional list of columns to select
+             **kwargs: Backend-specific parameters (currently unused)
+
+         Returns:
+             Narwhals LazyFrame with metadata, or None if not found
+
+         Raises:
+             StoreNotOpenError: If store is not open
+         """
+         self._check_open()
+
+         feature_key = self._resolve_feature_key(feature)
+         storage_key = self._get_storage_key(feature_key)
+
+         if storage_key not in self._storage:
+             return None
+
+         # Start with lazy Polars DataFrame, wrap with Narwhals
+         df_lazy = self._storage[storage_key].lazy()
+         nw_lazy = nw.from_native(df_lazy)
+
+         # Apply feature_version filter
+         if feature_version is not None:
+             nw_lazy = nw_lazy.filter(
+                 nw.col("metaxy_feature_version") == feature_version
+             )
+
+         # Apply generic Narwhals filters
+         if filters is not None:
+             for filter_expr in filters:
+                 nw_lazy = nw_lazy.filter(filter_expr)
+
+         # Select columns
+         if columns is not None:
+             nw_lazy = nw_lazy.select(columns)
+
+         # Emptiness is only observable once the lazy frame is materialized,
+         # so return the lazy frame as-is.
+         return nw_lazy
+
+     def clear(self) -> None:
+         """
+         Clear all metadata from store.
+
+         Useful for testing.
+         """
+         self._storage.clear()
+
+     # ========== Context Manager Implementation ==========
+
+     @contextmanager
+     def open(self, mode: AccessMode = "read") -> Iterator[Self]:
+         """Open the in-memory store (a no-op; mode is accepted for consistency).
+
+         Args:
+             mode: Access mode (accepted for consistency but ignored).
+
+         Yields:
+             Self: The store instance
+         """
+         # Increment context depth to support nested contexts
+         self._context_depth += 1
+
+         try:
+             # Only perform actual open on first entry
+             if self._context_depth == 1:
+                 # No actual connection needed for in-memory;
+                 # mark store as open and validate
+                 self._is_open = True
+                 self._validate_after_open()
+
+             yield self
+         finally:
+             # Decrement context depth
+             self._context_depth -= 1
+
+             # Only perform actual close on last exit
+             if self._context_depth == 0:
+                 # Nothing to clean up
+                 self._is_open = False
+
+     def __repr__(self) -> str:
+         """String representation."""
+         num_fallbacks = len(self.fallback_stores)
+         status = "open" if self._is_open else "closed"
+         return (
+             f"InMemoryMetadataStore(status={status}, fallback_stores={num_fallbacks})"
+         )
+
+     def display(self) -> str:
+         """Display string for this store."""
+         status = "open" if self._is_open else "closed"
+         return f"InMemoryMetadataStore(status={status})"
+
+     @classmethod
+     def config_model(cls) -> type[InMemoryMetadataStoreConfig]:  # pyright: ignore[reportIncompatibleMethodOverride]
+         return InMemoryMetadataStoreConfig
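
The schema-evolution branch of `InMemoryMetadataStore.write_metadata_to_store` can be exercised standalone with plain Polars. A sketch of the same null-padding alignment; the frames and column names here are made up for illustration:

```python
import polars as pl

# Two frames with partially overlapping schemas, as when a feature gains a column.
existing = pl.DataFrame({"id": [1, 2], "value": [0.1, 0.2]})
incoming = pl.DataFrame({"id": [3], "value": [0.3], "tag": ["new"]})

# Null-pad each frame with the columns it lacks, typed from the other frame.
for name in incoming.columns:
    if name not in existing.columns:
        existing = existing.with_columns(
            pl.lit(None).cast(incoming.schema[name]).alias(name)
        )
for name in existing.columns:
    if name not in incoming.columns:
        incoming = incoming.with_columns(
            pl.lit(None).cast(existing.schema[name]).alias(name)
        )

# Align column order, then append vertically, mirroring the store's concat.
order = sorted(set(existing.columns) | set(incoming.columns))
combined = pl.concat([existing.select(order), incoming.select(order)], how="vertical")
print(combined)  # 3 rows; "tag" is null for the two pre-existing rows
```

Sorting the union of column names gives both frames the identical schema and column order that `pl.concat(..., how="vertical")` requires.
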