metaxy 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of metaxy might be problematic. Click here for more details.
- metaxy/__init__.py +61 -0
- metaxy/_testing.py +542 -0
- metaxy/_utils.py +16 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +76 -0
- metaxy/cli/context.py +71 -0
- metaxy/cli/graph.py +576 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +42 -0
- metaxy/cli/metadata.py +271 -0
- metaxy/cli/migrations.py +862 -0
- metaxy/cli/push.py +55 -0
- metaxy/config.py +450 -0
- metaxy/data_versioning/__init__.py +24 -0
- metaxy/data_versioning/calculators/__init__.py +13 -0
- metaxy/data_versioning/calculators/base.py +97 -0
- metaxy/data_versioning/calculators/duckdb.py +186 -0
- metaxy/data_versioning/calculators/ibis.py +225 -0
- metaxy/data_versioning/calculators/polars.py +135 -0
- metaxy/data_versioning/diff/__init__.py +15 -0
- metaxy/data_versioning/diff/base.py +150 -0
- metaxy/data_versioning/diff/narwhals.py +108 -0
- metaxy/data_versioning/hash_algorithms.py +19 -0
- metaxy/data_versioning/joiners/__init__.py +9 -0
- metaxy/data_versioning/joiners/base.py +70 -0
- metaxy/data_versioning/joiners/narwhals.py +235 -0
- metaxy/entrypoints.py +309 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/alembic.py +326 -0
- metaxy/ext/sqlmodel.py +172 -0
- metaxy/ext/sqlmodel_system_tables.py +139 -0
- metaxy/graph/__init__.py +21 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +399 -0
- metaxy/graph/diff/differ.py +740 -0
- metaxy/graph/diff/models.py +418 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +274 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +320 -0
- metaxy/graph/diff/rendering/rich.py +165 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +31 -0
- metaxy/metadata_store/_protocols.py +38 -0
- metaxy/metadata_store/base.py +1676 -0
- metaxy/metadata_store/clickhouse.py +161 -0
- metaxy/metadata_store/duckdb.py +167 -0
- metaxy/metadata_store/exceptions.py +43 -0
- metaxy/metadata_store/ibis.py +451 -0
- metaxy/metadata_store/memory.py +228 -0
- metaxy/metadata_store/sqlite.py +187 -0
- metaxy/metadata_store/system_tables.py +257 -0
- metaxy/migrations/__init__.py +34 -0
- metaxy/migrations/detector.py +153 -0
- metaxy/migrations/executor.py +208 -0
- metaxy/migrations/loader.py +260 -0
- metaxy/migrations/models.py +718 -0
- metaxy/migrations/ops.py +390 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +6 -0
- metaxy/models/constants.py +24 -0
- metaxy/models/feature.py +665 -0
- metaxy/models/feature_spec.py +105 -0
- metaxy/models/field.py +25 -0
- metaxy/models/plan.py +155 -0
- metaxy/models/types.py +157 -0
- metaxy/py.typed +0 -0
- metaxy-0.0.0.dist-info/METADATA +247 -0
- metaxy-0.0.0.dist-info/RECORD +75 -0
- metaxy-0.0.0.dist-info/WHEEL +4 -0
- metaxy-0.0.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,451 @@
|
|
|
1
|
+
"""Ibis-based metadata store for SQL databases.
|
|
2
|
+
|
|
3
|
+
Supports any SQL database that Ibis supports:
|
|
4
|
+
- DuckDB, PostgreSQL, MySQL, SQLite (local/embedded)
|
|
5
|
+
- ClickHouse, Snowflake, BigQuery (cloud analytical)
|
|
6
|
+
- And 20+ other backends
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from collections.abc import Sequence
|
|
10
|
+
from typing import TYPE_CHECKING, Any
|
|
11
|
+
|
|
12
|
+
import narwhals as nw
|
|
13
|
+
import polars as pl
|
|
14
|
+
|
|
15
|
+
from metaxy.data_versioning.hash_algorithms import HashAlgorithm
|
|
16
|
+
from metaxy.metadata_store.base import MetadataStore
|
|
17
|
+
from metaxy.models.feature import Feature
|
|
18
|
+
from metaxy.models.types import FeatureKey
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
import ibis
|
|
22
|
+
import ibis.expr.types
|
|
23
|
+
|
|
24
|
+
from metaxy.data_versioning.calculators.ibis import HashSQLGenerator
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class IbisMetadataStore(MetadataStore):
|
|
28
|
+
"""
|
|
29
|
+
Generic SQL metadata store using Ibis.
|
|
30
|
+
|
|
31
|
+
Supports any Ibis backend including:
|
|
32
|
+
- DuckDB: Fast local analytical database
|
|
33
|
+
- PostgreSQL: Production-grade RDBMS
|
|
34
|
+
- MySQL: Popular RDBMS
|
|
35
|
+
- ClickHouse: High-performance analytical database
|
|
36
|
+
- SQLite: Embedded database
|
|
37
|
+
- And 20+ other backends
|
|
38
|
+
|
|
39
|
+
Storage layout:
|
|
40
|
+
- Each feature gets its own table: {namespace}__{feature_name}
|
|
41
|
+
- System tables: __metaxy__feature_versions, __metaxy__migrations
|
|
42
|
+
- Uses Ibis for cross-database compatibility
|
|
43
|
+
|
|
44
|
+
Note: Uses MD5 hash by default for cross-database compatibility.
|
|
45
|
+
DuckDBMetadataStore overrides this with dynamic algorithm detection.
|
|
46
|
+
For other backends, override the calculator instance variable with backend-specific implementations.
|
|
47
|
+
|
|
48
|
+
Example:
|
|
49
|
+
>>> # ClickHouse
|
|
50
|
+
>>> store = IbisMetadataStore("clickhouse://user:pass@host:9000/db")
|
|
51
|
+
>>>
|
|
52
|
+
>>> # PostgreSQL
|
|
53
|
+
>>> store = IbisMetadataStore("postgresql://user:pass@host:5432/db")
|
|
54
|
+
>>>
|
|
55
|
+
>>> # DuckDB (use DuckDBMetadataStore instead for better hash support)
|
|
56
|
+
>>> store = IbisMetadataStore("duckdb:///metadata.db")
|
|
57
|
+
>>>
|
|
58
|
+
>>> with store:
|
|
59
|
+
... store.write_metadata(MyFeature, df)
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
@classmethod
|
|
63
|
+
def supports_structs(cls) -> bool:
|
|
64
|
+
"""Check if backend supports struct types natively.
|
|
65
|
+
|
|
66
|
+
Subclasses should override this for backends that don't support structs.
|
|
67
|
+
Default implementation returns True (most SQL databases support structs).
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
True if backend supports structs, False if needs JSON serialization
|
|
71
|
+
"""
|
|
72
|
+
return True
|
|
73
|
+
|
|
74
|
+
def __init__(
|
|
75
|
+
self,
|
|
76
|
+
connection_string: str | None = None,
|
|
77
|
+
*,
|
|
78
|
+
backend: str | None = None,
|
|
79
|
+
connection_params: dict[str, Any] | None = None,
|
|
80
|
+
**kwargs,
|
|
81
|
+
):
|
|
82
|
+
"""
|
|
83
|
+
Initialize Ibis metadata store.
|
|
84
|
+
|
|
85
|
+
Args:
|
|
86
|
+
connection_string: Ibis connection string (e.g., "clickhouse://host:9000/db")
|
|
87
|
+
If provided, backend and connection_params are ignored.
|
|
88
|
+
backend: Ibis backend name (e.g., "clickhouse", "postgres", "duckdb")
|
|
89
|
+
Used with connection_params for more control.
|
|
90
|
+
connection_params: Backend-specific connection parameters
|
|
91
|
+
e.g., {"host": "localhost", "port": 9000, "database": "default"}
|
|
92
|
+
**kwargs: Passed to MetadataStore.__init__ (e.g., fallback_stores, hash_algorithm)
|
|
93
|
+
|
|
94
|
+
Raises:
|
|
95
|
+
ValueError: If neither connection_string nor backend is provided
|
|
96
|
+
ImportError: If Ibis or required backend driver not installed
|
|
97
|
+
|
|
98
|
+
Example:
|
|
99
|
+
>>> # Using connection string
|
|
100
|
+
>>> store = IbisMetadataStore("clickhouse://user:pass@host:9000/db")
|
|
101
|
+
>>>
|
|
102
|
+
>>> # Using backend + params
|
|
103
|
+
>>> store = IbisMetadataStore(
|
|
104
|
+
... backend="clickhouse",
|
|
105
|
+
... connection_params={"host": "localhost", "port": 9000}
|
|
106
|
+
... )
|
|
107
|
+
"""
|
|
108
|
+
try:
|
|
109
|
+
import ibis
|
|
110
|
+
|
|
111
|
+
self._ibis = ibis
|
|
112
|
+
except ImportError as e:
|
|
113
|
+
raise ImportError(
|
|
114
|
+
"Ibis is required for IbisMetadataStore. "
|
|
115
|
+
"Install with: pip install ibis-framework[BACKEND] "
|
|
116
|
+
"where BACKEND is one of: duckdb, postgres, clickhouse, mysql, etc."
|
|
117
|
+
) from e
|
|
118
|
+
|
|
119
|
+
if connection_string is None and backend is None:
|
|
120
|
+
raise ValueError(
|
|
121
|
+
"Must provide either connection_string or backend. "
|
|
122
|
+
"Example: connection_string='clickhouse://host:9000/db' "
|
|
123
|
+
"or backend='clickhouse' with connection_params"
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
self.connection_string = connection_string
|
|
127
|
+
self.backend = backend
|
|
128
|
+
self.connection_params = connection_params or {}
|
|
129
|
+
self._conn: ibis.BaseBackend | None = None
|
|
130
|
+
|
|
131
|
+
super().__init__(**kwargs)
|
|
132
|
+
|
|
133
|
+
def _get_default_hash_algorithm(self) -> HashAlgorithm:
|
|
134
|
+
"""Get default hash algorithm for Ibis stores.
|
|
135
|
+
|
|
136
|
+
Uses MD5 as it's universally supported across SQL databases.
|
|
137
|
+
Subclasses like DuckDBMetadataStore can override for better algorithms.
|
|
138
|
+
"""
|
|
139
|
+
return HashAlgorithm.MD5
|
|
140
|
+
|
|
141
|
+
def _supports_native_components(self) -> bool:
|
|
142
|
+
"""Ibis stores support native (Ibis-based) components when connection is open."""
|
|
143
|
+
return self._conn is not None
|
|
144
|
+
|
|
145
|
+
def _create_native_components(self):
|
|
146
|
+
"""Create components for native SQL execution via Ibis."""
|
|
147
|
+
from metaxy.data_versioning.calculators.ibis import IbisDataVersionCalculator
|
|
148
|
+
from metaxy.data_versioning.diff.narwhals import NarwhalsDiffResolver
|
|
149
|
+
from metaxy.data_versioning.joiners.narwhals import NarwhalsJoiner
|
|
150
|
+
|
|
151
|
+
if self._conn is None:
|
|
152
|
+
raise RuntimeError(
|
|
153
|
+
"Cannot create native data version calculations: store is not open. "
|
|
154
|
+
"Ensure store is used as context manager."
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
# All components accept/return Narwhals LazyFrames
|
|
158
|
+
# IbisDataVersionCalculator converts to Ibis internally for SQL hash generation
|
|
159
|
+
joiner = NarwhalsJoiner()
|
|
160
|
+
calculator = IbisDataVersionCalculator(
|
|
161
|
+
backend=self._conn,
|
|
162
|
+
hash_sql_generators=self._get_hash_sql_generators(),
|
|
163
|
+
)
|
|
164
|
+
diff_resolver = NarwhalsDiffResolver()
|
|
165
|
+
|
|
166
|
+
return joiner, calculator, diff_resolver
|
|
167
|
+
|
|
168
|
+
def _get_hash_sql_generators(self) -> dict[HashAlgorithm, "HashSQLGenerator"]:
|
|
169
|
+
"""Get hash SQL generators for this backend.
|
|
170
|
+
|
|
171
|
+
Base implementation only supports MD5 (universally available in SQL).
|
|
172
|
+
Subclasses override to add backend-specific hash functions.
|
|
173
|
+
|
|
174
|
+
Returns:
|
|
175
|
+
Dictionary mapping HashAlgorithm to SQL generator functions
|
|
176
|
+
"""
|
|
177
|
+
|
|
178
|
+
def md5_generator(table, concat_columns: dict[str, str]) -> str:
|
|
179
|
+
"""Generate SQL to compute MD5 hashes (universal SQL support).
|
|
180
|
+
|
|
181
|
+
Note: This generic implementation assumes MD5() returns a hex string.
|
|
182
|
+
Subclasses should override if their backend returns binary or different format.
|
|
183
|
+
For example, ClickHouse returns binary and needs lower(hex(MD5(...))).
|
|
184
|
+
"""
|
|
185
|
+
# Build SELECT clause with hash columns
|
|
186
|
+
hash_selects: list[str] = []
|
|
187
|
+
for field_key, concat_col in concat_columns.items():
|
|
188
|
+
hash_col = f"__hash_{field_key}"
|
|
189
|
+
# Use MD5 function (universally available in SQL databases)
|
|
190
|
+
# WARNING: Different databases return different formats (hex string vs binary)
|
|
191
|
+
# This generic version assumes hex string output
|
|
192
|
+
hash_expr = f"MD5({concat_col})"
|
|
193
|
+
hash_selects.append(f"{hash_expr} as {hash_col}")
|
|
194
|
+
|
|
195
|
+
hash_clause = ", ".join(hash_selects)
|
|
196
|
+
table_sql = table.compile()
|
|
197
|
+
return f"SELECT *, {hash_clause} FROM ({table_sql}) AS __metaxy_temp"
|
|
198
|
+
|
|
199
|
+
return {HashAlgorithm.MD5: md5_generator}
|
|
200
|
+
|
|
201
|
+
@property
|
|
202
|
+
def ibis_conn(self) -> "ibis.BaseBackend":
|
|
203
|
+
"""Get Ibis backend connection.
|
|
204
|
+
|
|
205
|
+
Returns:
|
|
206
|
+
Active Ibis backend connection
|
|
207
|
+
|
|
208
|
+
Raises:
|
|
209
|
+
StoreNotOpenError: If store is not open
|
|
210
|
+
"""
|
|
211
|
+
from metaxy.metadata_store.exceptions import StoreNotOpenError
|
|
212
|
+
|
|
213
|
+
if self._conn is None:
|
|
214
|
+
raise StoreNotOpenError(
|
|
215
|
+
"Ibis connection is not open. Store must be used as a context manager."
|
|
216
|
+
)
|
|
217
|
+
return self._conn
|
|
218
|
+
|
|
219
|
+
@property
|
|
220
|
+
def conn(self) -> "ibis.BaseBackend":
|
|
221
|
+
"""Get connection (alias for ibis_conn for consistency).
|
|
222
|
+
|
|
223
|
+
Returns:
|
|
224
|
+
Active Ibis backend connection
|
|
225
|
+
|
|
226
|
+
Raises:
|
|
227
|
+
StoreNotOpenError: If store is not open
|
|
228
|
+
"""
|
|
229
|
+
return self.ibis_conn
|
|
230
|
+
|
|
231
|
+
def open(self) -> None:
|
|
232
|
+
"""Open connection to database via Ibis.
|
|
233
|
+
|
|
234
|
+
Subclasses should override this to add backend-specific initialization
|
|
235
|
+
(e.g., loading extensions) and should call super().open() first.
|
|
236
|
+
"""
|
|
237
|
+
if self.connection_string:
|
|
238
|
+
# Use connection string
|
|
239
|
+
self._conn = self._ibis.connect(self.connection_string)
|
|
240
|
+
else:
|
|
241
|
+
# Use backend + params
|
|
242
|
+
# Get backend-specific connect function
|
|
243
|
+
assert self.backend is not None, (
|
|
244
|
+
"backend must be set if connection_string is None"
|
|
245
|
+
)
|
|
246
|
+
backend_module = getattr(self._ibis, self.backend)
|
|
247
|
+
self._conn = backend_module.connect(**self.connection_params)
|
|
248
|
+
|
|
249
|
+
def close(self) -> None:
|
|
250
|
+
"""Close the Ibis connection."""
|
|
251
|
+
if self._conn is not None:
|
|
252
|
+
# Ibis connections may not have explicit close method
|
|
253
|
+
# but setting to None releases resources
|
|
254
|
+
self._conn = None
|
|
255
|
+
|
|
256
|
+
def _table_name_to_feature_key(self, table_name: str) -> FeatureKey:
|
|
257
|
+
"""Convert table name back to feature key."""
|
|
258
|
+
return FeatureKey(table_name.split("__"))
|
|
259
|
+
|
|
260
|
+
def _serialize_for_storage(self, df: pl.DataFrame) -> pl.DataFrame:
|
|
261
|
+
"""Serialize DataFrame for storage (e.g., convert structs to JSON).
|
|
262
|
+
|
|
263
|
+
Base implementation does nothing - backends that don't support structs
|
|
264
|
+
should override this method.
|
|
265
|
+
|
|
266
|
+
Args:
|
|
267
|
+
df: DataFrame to serialize
|
|
268
|
+
|
|
269
|
+
Returns:
|
|
270
|
+
Serialized DataFrame
|
|
271
|
+
"""
|
|
272
|
+
return df
|
|
273
|
+
|
|
274
|
+
def _deserialize_from_storage(self, df: pl.DataFrame) -> pl.DataFrame:
|
|
275
|
+
"""Deserialize DataFrame from storage (e.g., convert JSON back to structs).
|
|
276
|
+
|
|
277
|
+
Base implementation does nothing - backends that don't support structs
|
|
278
|
+
should override this method.
|
|
279
|
+
|
|
280
|
+
Args:
|
|
281
|
+
df: DataFrame to deserialize
|
|
282
|
+
|
|
283
|
+
Returns:
|
|
284
|
+
Deserialized DataFrame
|
|
285
|
+
"""
|
|
286
|
+
return df
|
|
287
|
+
|
|
288
|
+
def _write_metadata_impl(
|
|
289
|
+
self,
|
|
290
|
+
feature_key: FeatureKey,
|
|
291
|
+
df: pl.DataFrame,
|
|
292
|
+
) -> None:
|
|
293
|
+
"""
|
|
294
|
+
Internal write implementation using Ibis.
|
|
295
|
+
|
|
296
|
+
Args:
|
|
297
|
+
feature_key: Feature key to write to
|
|
298
|
+
df: DataFrame with metadata (already validated)
|
|
299
|
+
"""
|
|
300
|
+
table_name = feature_key.table_name
|
|
301
|
+
|
|
302
|
+
# Serialize for storage (e.g., convert structs to JSON for SQLite)
|
|
303
|
+
df = self._serialize_for_storage(df)
|
|
304
|
+
|
|
305
|
+
# Check if table exists
|
|
306
|
+
existing_tables = self.conn.list_tables()
|
|
307
|
+
|
|
308
|
+
if table_name not in existing_tables:
|
|
309
|
+
# Create table from DataFrame
|
|
310
|
+
# Ensure NULL columns have proper types by filling with a typed value
|
|
311
|
+
# This handles cases like snapshot_version which can be NULL
|
|
312
|
+
df_typed = df
|
|
313
|
+
for col in df.columns:
|
|
314
|
+
if df[col].dtype == pl.Null:
|
|
315
|
+
# Cast NULL columns to String
|
|
316
|
+
df_typed = df_typed.with_columns(pl.col(col).cast(pl.Utf8))
|
|
317
|
+
|
|
318
|
+
self.conn.create_table(table_name, obj=df_typed)
|
|
319
|
+
else:
|
|
320
|
+
# Append to existing table
|
|
321
|
+
self.conn.insert(table_name, obj=df) # type: ignore[attr-defined] # pyright: ignore[reportAttributeAccessIssue]
|
|
322
|
+
|
|
323
|
+
def _drop_feature_metadata_impl(self, feature_key: FeatureKey) -> None:
|
|
324
|
+
"""Drop the table for a feature.
|
|
325
|
+
|
|
326
|
+
Args:
|
|
327
|
+
feature_key: Feature key to drop metadata for
|
|
328
|
+
"""
|
|
329
|
+
table_name = feature_key.table_name
|
|
330
|
+
|
|
331
|
+
# Check if table exists
|
|
332
|
+
if table_name in self.conn.list_tables():
|
|
333
|
+
self.conn.drop_table(table_name)
|
|
334
|
+
|
|
335
|
+
    def _read_metadata_native(
        self,
        feature: FeatureKey | type[Feature],
        *,
        feature_version: str | None = None,
        filters: Sequence[nw.Expr] | None = None,
        columns: Sequence[str] | None = None,
    ) -> nw.LazyFrame[Any] | None:
        """
        Read metadata from this store only (no fallback).

        Args:
            feature: Feature to read
            feature_version: Filter by specific feature_version (applied as SQL WHERE clause)
            filters: List of Narwhals filter expressions (converted to SQL WHERE clauses)
            columns: Optional list of columns to select

        Returns:
            Narwhals LazyFrame with metadata, or None if not found

        Note:
            When the backend does not support structs, the filtered result is
            materialized eagerly into Polars so JSON columns can be decoded,
            then re-wrapped as a lazy frame — so the "lazy" return is only
            truly SQL-lazy on struct-capable backends.
        """
        feature_key = self._resolve_feature_key(feature)
        table_name = feature_key.table_name

        # Check if table exists
        existing_tables = self.conn.list_tables()
        if table_name not in existing_tables:
            return None

        # Get Ibis table reference
        table = self.conn.table(table_name)

        # Wrap Ibis table with Narwhals (stays lazy in SQL)
        nw_lazy: nw.LazyFrame[Any] = nw.from_native(table, eager_only=False)

        # Apply feature_version filter (stays in SQL via Narwhals)
        if feature_version is not None:
            nw_lazy = nw_lazy.filter(nw.col("feature_version") == feature_version)

        # Apply generic Narwhals filters (stays in SQL)
        if filters is not None:
            for filter_expr in filters:
                nw_lazy = nw_lazy.filter(filter_expr)

        # Select columns (stays in SQL)
        if columns is not None:
            nw_lazy = nw_lazy.select(columns)

        # For backends that don't support structs (e.g., SQLite),
        # we need to deserialize JSON strings to structs
        if not self.supports_structs():
            # Convert to Polars, deserialize, then wrap back as Narwhals lazy.
            # The native frame may be an Ibis table, a Polars DataFrame, or a
            # Polars LazyFrame depending on what Narwhals is wrapping.
            table_native = nw_lazy.to_native()
            if hasattr(table_native, "to_polars"):
                # Ibis table
                df_polars = table_native.to_polars()
            else:
                # Already Polars
                df_polars = (
                    table_native
                    if isinstance(table_native, pl.DataFrame)
                    else table_native.collect()
                )

            # Deserialize JSON → structs
            df_polars = self._deserialize_from_storage(df_polars)

            # Make lazy and wrap in Narwhals
            return nw.from_native(df_polars.lazy())

        # Return Narwhals LazyFrame wrapping Ibis table (stays lazy in SQL)
        return nw_lazy
|
|
406
|
+
|
|
407
|
+
def _list_features_local(self) -> list[FeatureKey]:
|
|
408
|
+
"""
|
|
409
|
+
List all features in this store.
|
|
410
|
+
|
|
411
|
+
Returns:
|
|
412
|
+
List of FeatureKey objects (excluding system tables)
|
|
413
|
+
"""
|
|
414
|
+
# Query all table names
|
|
415
|
+
table_names = self.conn.list_tables()
|
|
416
|
+
|
|
417
|
+
features = []
|
|
418
|
+
for table_name in table_names:
|
|
419
|
+
# Skip Ibis internal tables (start with "ibis_")
|
|
420
|
+
if table_name.startswith("ibis_"):
|
|
421
|
+
continue
|
|
422
|
+
|
|
423
|
+
feature_key = self._table_name_to_feature_key(table_name)
|
|
424
|
+
|
|
425
|
+
# Skip system tables
|
|
426
|
+
if not self._is_system_table(feature_key):
|
|
427
|
+
features.append(feature_key)
|
|
428
|
+
|
|
429
|
+
return features
|
|
430
|
+
|
|
431
|
+
def _can_compute_native(self) -> bool:
|
|
432
|
+
"""
|
|
433
|
+
Ibis backends support native data version calculations (Narwhals-based).
|
|
434
|
+
|
|
435
|
+
Returns:
|
|
436
|
+
True (use Narwhals components with Ibis-backed tables)
|
|
437
|
+
|
|
438
|
+
Note: All Ibis stores now use Narwhals-based components (NarwhalsJoiner,
|
|
439
|
+
PolarsDataVersionCalculator, NarwhalsDiffResolver) which work efficiently
|
|
440
|
+
with Ibis-backed tables.
|
|
441
|
+
"""
|
|
442
|
+
return True
|
|
443
|
+
|
|
444
|
+
def display(self) -> str:
|
|
445
|
+
"""Display string for this store."""
|
|
446
|
+
backend_info = self.connection_string or f"{self.backend}"
|
|
447
|
+
if self._is_open:
|
|
448
|
+
num_features = len(self._list_features_local())
|
|
449
|
+
return f"IbisMetadataStore(backend={backend_info}, features={num_features})"
|
|
450
|
+
else:
|
|
451
|
+
return f"IbisMetadataStore(backend={backend_info})"
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""In-memory metadata store implementation."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import narwhals as nw
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
from metaxy.data_versioning.calculators.base import DataVersionCalculator
|
|
10
|
+
from metaxy.data_versioning.diff.base import MetadataDiffResolver
|
|
11
|
+
from metaxy.data_versioning.hash_algorithms import HashAlgorithm
|
|
12
|
+
from metaxy.data_versioning.joiners.base import UpstreamJoiner
|
|
13
|
+
from metaxy.metadata_store.base import MetadataStore
|
|
14
|
+
from metaxy.models.feature import Feature
|
|
15
|
+
from metaxy.models.types import FeatureKey
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class InMemoryMetadataStore(MetadataStore):
|
|
19
|
+
"""
|
|
20
|
+
In-memory metadata store using dict-based storage.
|
|
21
|
+
|
|
22
|
+
Features:
|
|
23
|
+
- Simple dict storage: {FeatureKey: pl.DataFrame}
|
|
24
|
+
- Fast for testing and prototyping
|
|
25
|
+
- No persistence (data lost when process exits)
|
|
26
|
+
- Schema validation on write
|
|
27
|
+
- Uses Polars components for all operations
|
|
28
|
+
|
|
29
|
+
Limitations:
|
|
30
|
+
- Not suitable for production
|
|
31
|
+
- Data lost on process exit
|
|
32
|
+
- No concurrency support across processes
|
|
33
|
+
- Memory-bound (all data in RAM)
|
|
34
|
+
|
|
35
|
+
Type Parameters:
|
|
36
|
+
TRef = nw.LazyFrame (uses Narwhals LazyFrames)
|
|
37
|
+
|
|
38
|
+
Components:
|
|
39
|
+
Components are created on-demand in resolve_update().
|
|
40
|
+
Uses Polars internally but exposes Narwhals interface.
|
|
41
|
+
Only supports Polars components (no native backend).
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
def __init__(self, **kwargs):
|
|
45
|
+
"""
|
|
46
|
+
Initialize in-memory store.
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
**kwargs: Passed to MetadataStore.__init__ (e.g., fallback_stores, hash_algorithm)
|
|
50
|
+
"""
|
|
51
|
+
# Use tuple as key (hashable) instead of string to avoid parsing issues
|
|
52
|
+
self._storage: dict[tuple[str, ...], pl.DataFrame] = {}
|
|
53
|
+
super().__init__(**kwargs)
|
|
54
|
+
|
|
55
|
+
def _get_default_hash_algorithm(self) -> HashAlgorithm:
|
|
56
|
+
"""Get default hash algorithm for in-memory store."""
|
|
57
|
+
return HashAlgorithm.XXHASH64
|
|
58
|
+
|
|
59
|
+
def _get_storage_key(self, feature_key: FeatureKey) -> tuple[str, ...]:
|
|
60
|
+
"""Convert feature key to storage key (tuple for hashability)."""
|
|
61
|
+
return tuple(feature_key)
|
|
62
|
+
|
|
63
|
+
def _supports_native_components(self) -> bool:
|
|
64
|
+
"""In-memory store only supports Polars components."""
|
|
65
|
+
return False
|
|
66
|
+
|
|
67
|
+
def _create_native_components(
|
|
68
|
+
self,
|
|
69
|
+
) -> tuple[
|
|
70
|
+
UpstreamJoiner,
|
|
71
|
+
DataVersionCalculator,
|
|
72
|
+
MetadataDiffResolver,
|
|
73
|
+
]:
|
|
74
|
+
"""Not supported - in-memory store only uses Polars components."""
|
|
75
|
+
raise NotImplementedError(
|
|
76
|
+
"InMemoryMetadataStore does not support native data version calculations"
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
def _write_metadata_impl(
|
|
80
|
+
self,
|
|
81
|
+
feature_key: FeatureKey,
|
|
82
|
+
df: pl.DataFrame,
|
|
83
|
+
) -> None:
|
|
84
|
+
"""
|
|
85
|
+
Internal write implementation for in-memory storage.
|
|
86
|
+
|
|
87
|
+
Args:
|
|
88
|
+
feature_key: Feature key to write to
|
|
89
|
+
df: DataFrame with metadata (already validated)
|
|
90
|
+
"""
|
|
91
|
+
storage_key = self._get_storage_key(feature_key)
|
|
92
|
+
|
|
93
|
+
# Append or create
|
|
94
|
+
if storage_key in self._storage:
|
|
95
|
+
# Append to existing
|
|
96
|
+
self._storage[storage_key] = pl.concat(
|
|
97
|
+
[self._storage[storage_key], df],
|
|
98
|
+
how="vertical",
|
|
99
|
+
)
|
|
100
|
+
else:
|
|
101
|
+
# Create new
|
|
102
|
+
self._storage[storage_key] = df
|
|
103
|
+
|
|
104
|
+
def _drop_feature_metadata_impl(self, feature_key: FeatureKey) -> None:
|
|
105
|
+
"""Drop all metadata for a feature from in-memory storage.
|
|
106
|
+
|
|
107
|
+
Args:
|
|
108
|
+
feature_key: Feature key to drop metadata for
|
|
109
|
+
"""
|
|
110
|
+
storage_key = self._get_storage_key(feature_key)
|
|
111
|
+
|
|
112
|
+
# Remove from storage if it exists
|
|
113
|
+
if storage_key in self._storage:
|
|
114
|
+
del self._storage[storage_key]
|
|
115
|
+
|
|
116
|
+
def _read_metadata_native(
|
|
117
|
+
self,
|
|
118
|
+
feature: FeatureKey | type[Feature],
|
|
119
|
+
*,
|
|
120
|
+
feature_version: str | None = None,
|
|
121
|
+
filters: Sequence[nw.Expr] | None = None,
|
|
122
|
+
columns: Sequence[str] | None = None,
|
|
123
|
+
) -> nw.LazyFrame[Any] | None:
|
|
124
|
+
"""
|
|
125
|
+
Read metadata from this store only (no fallback).
|
|
126
|
+
|
|
127
|
+
Args:
|
|
128
|
+
feature: Feature to read
|
|
129
|
+
feature_version: Filter by specific feature_version
|
|
130
|
+
filters: List of Narwhals filter expressions
|
|
131
|
+
columns: Optional list of columns to select
|
|
132
|
+
|
|
133
|
+
Returns:
|
|
134
|
+
Narwhals LazyFrame with metadata, or None if not found
|
|
135
|
+
|
|
136
|
+
Raises:
|
|
137
|
+
StoreNotOpenError: If store is not open
|
|
138
|
+
"""
|
|
139
|
+
self._check_open()
|
|
140
|
+
|
|
141
|
+
feature_key = self._resolve_feature_key(feature)
|
|
142
|
+
storage_key = self._get_storage_key(feature_key)
|
|
143
|
+
|
|
144
|
+
if storage_key not in self._storage:
|
|
145
|
+
return None
|
|
146
|
+
|
|
147
|
+
# Start with lazy Polars DataFrame, wrap with Narwhals
|
|
148
|
+
df_lazy = self._storage[storage_key].lazy()
|
|
149
|
+
nw_lazy = nw.from_native(df_lazy)
|
|
150
|
+
|
|
151
|
+
# Apply feature_version filter
|
|
152
|
+
if feature_version is not None:
|
|
153
|
+
nw_lazy = nw_lazy.filter(nw.col("feature_version") == feature_version)
|
|
154
|
+
|
|
155
|
+
# Apply generic Narwhals filters
|
|
156
|
+
if filters is not None:
|
|
157
|
+
for filter_expr in filters:
|
|
158
|
+
nw_lazy = nw_lazy.filter(filter_expr)
|
|
159
|
+
|
|
160
|
+
# Select columns
|
|
161
|
+
if columns is not None:
|
|
162
|
+
nw_lazy = nw_lazy.select(columns)
|
|
163
|
+
|
|
164
|
+
# Check if result would be empty (we need to check the underlying frame)
|
|
165
|
+
# For now, return the lazy frame - emptiness check happens when materializing
|
|
166
|
+
return nw_lazy
|
|
167
|
+
|
|
168
|
+
def _list_features_local(self) -> list[FeatureKey]:
|
|
169
|
+
"""
|
|
170
|
+
List all features in this store.
|
|
171
|
+
|
|
172
|
+
Returns:
|
|
173
|
+
List of FeatureKey objects (excluding system tables)
|
|
174
|
+
"""
|
|
175
|
+
features = []
|
|
176
|
+
for key_tuple in self._storage.keys():
|
|
177
|
+
# Convert tuple back to FeatureKey
|
|
178
|
+
feature_key = FeatureKey(list(key_tuple))
|
|
179
|
+
|
|
180
|
+
# Skip system tables
|
|
181
|
+
if not self._is_system_table(feature_key):
|
|
182
|
+
features.append(feature_key)
|
|
183
|
+
|
|
184
|
+
return features
|
|
185
|
+
|
|
186
|
+
def clear(self) -> None:
|
|
187
|
+
"""
|
|
188
|
+
Clear all metadata from store.
|
|
189
|
+
|
|
190
|
+
Useful for testing.
|
|
191
|
+
"""
|
|
192
|
+
self._storage.clear()
|
|
193
|
+
|
|
194
|
+
# ========== Context Manager Implementation ==========
|
|
195
|
+
|
|
196
|
+
def open(self) -> None:
|
|
197
|
+
"""Open the in-memory store.
|
|
198
|
+
|
|
199
|
+
For InMemoryMetadataStore, this is a no-op since no external
|
|
200
|
+
resources need initialization.
|
|
201
|
+
"""
|
|
202
|
+
pass # No resources to initialize for in-memory storage
|
|
203
|
+
|
|
204
|
+
def close(self) -> None:
|
|
205
|
+
"""Close the in-memory store.
|
|
206
|
+
|
|
207
|
+
For InMemoryMetadataStore, this is a no-op since no external
|
|
208
|
+
resources need cleanup.
|
|
209
|
+
"""
|
|
210
|
+
pass # No resources to cleanup for in-memory storage
|
|
211
|
+
|
|
212
|
+
def __repr__(self) -> str:
|
|
213
|
+
"""String representation."""
|
|
214
|
+
num_features = len(self._storage)
|
|
215
|
+
num_fallbacks = len(self.fallback_stores)
|
|
216
|
+
return (
|
|
217
|
+
f"InMemoryMetadataStore("
|
|
218
|
+
f"features={num_features}, "
|
|
219
|
+
f"fallback_stores={num_fallbacks})"
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
def display(self) -> str:
|
|
223
|
+
"""Display string for this store."""
|
|
224
|
+
if self._is_open:
|
|
225
|
+
num_features = len(self._storage)
|
|
226
|
+
return f"InMemoryMetadataStore(features={num_features})"
|
|
227
|
+
else:
|
|
228
|
+
return "InMemoryMetadataStore()"
|