metaxy 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of metaxy might be problematic.
- metaxy/__init__.py +61 -0
- metaxy/_testing.py +542 -0
- metaxy/_utils.py +16 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +76 -0
- metaxy/cli/context.py +71 -0
- metaxy/cli/graph.py +576 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +42 -0
- metaxy/cli/metadata.py +271 -0
- metaxy/cli/migrations.py +862 -0
- metaxy/cli/push.py +55 -0
- metaxy/config.py +450 -0
- metaxy/data_versioning/__init__.py +24 -0
- metaxy/data_versioning/calculators/__init__.py +13 -0
- metaxy/data_versioning/calculators/base.py +97 -0
- metaxy/data_versioning/calculators/duckdb.py +186 -0
- metaxy/data_versioning/calculators/ibis.py +225 -0
- metaxy/data_versioning/calculators/polars.py +135 -0
- metaxy/data_versioning/diff/__init__.py +15 -0
- metaxy/data_versioning/diff/base.py +150 -0
- metaxy/data_versioning/diff/narwhals.py +108 -0
- metaxy/data_versioning/hash_algorithms.py +19 -0
- metaxy/data_versioning/joiners/__init__.py +9 -0
- metaxy/data_versioning/joiners/base.py +70 -0
- metaxy/data_versioning/joiners/narwhals.py +235 -0
- metaxy/entrypoints.py +309 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/alembic.py +326 -0
- metaxy/ext/sqlmodel.py +172 -0
- metaxy/ext/sqlmodel_system_tables.py +139 -0
- metaxy/graph/__init__.py +21 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +399 -0
- metaxy/graph/diff/differ.py +740 -0
- metaxy/graph/diff/models.py +418 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +274 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +320 -0
- metaxy/graph/diff/rendering/rich.py +165 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +31 -0
- metaxy/metadata_store/_protocols.py +38 -0
- metaxy/metadata_store/base.py +1676 -0
- metaxy/metadata_store/clickhouse.py +161 -0
- metaxy/metadata_store/duckdb.py +167 -0
- metaxy/metadata_store/exceptions.py +43 -0
- metaxy/metadata_store/ibis.py +451 -0
- metaxy/metadata_store/memory.py +228 -0
- metaxy/metadata_store/sqlite.py +187 -0
- metaxy/metadata_store/system_tables.py +257 -0
- metaxy/migrations/__init__.py +34 -0
- metaxy/migrations/detector.py +153 -0
- metaxy/migrations/executor.py +208 -0
- metaxy/migrations/loader.py +260 -0
- metaxy/migrations/models.py +718 -0
- metaxy/migrations/ops.py +390 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +6 -0
- metaxy/models/constants.py +24 -0
- metaxy/models/feature.py +665 -0
- metaxy/models/feature_spec.py +105 -0
- metaxy/models/field.py +25 -0
- metaxy/models/plan.py +155 -0
- metaxy/models/types.py +157 -0
- metaxy/py.typed +0 -0
- metaxy-0.0.0.dist-info/METADATA +247 -0
- metaxy-0.0.0.dist-info/RECORD +75 -0
- metaxy-0.0.0.dist-info/WHEEL +4 -0
- metaxy-0.0.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,1676 @@
|
|
|
1
|
+
"""Abstract base class for metadata storage backends."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from collections.abc import Mapping, Sequence
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from typing import TYPE_CHECKING, Any, Literal, TypeGuard, overload
|
|
10
|
+
|
|
11
|
+
import narwhals as nw
|
|
12
|
+
import polars as pl
|
|
13
|
+
from typing_extensions import Self
|
|
14
|
+
|
|
15
|
+
from metaxy.data_versioning.calculators.base import DataVersionCalculator
|
|
16
|
+
from metaxy.data_versioning.calculators.polars import PolarsDataVersionCalculator
|
|
17
|
+
from metaxy.data_versioning.diff import DiffResult, LazyDiffResult
|
|
18
|
+
from metaxy.data_versioning.diff.base import MetadataDiffResolver
|
|
19
|
+
from metaxy.data_versioning.diff.narwhals import NarwhalsDiffResolver
|
|
20
|
+
from metaxy.data_versioning.hash_algorithms import HashAlgorithm
|
|
21
|
+
from metaxy.data_versioning.joiners.base import UpstreamJoiner
|
|
22
|
+
from metaxy.data_versioning.joiners.narwhals import NarwhalsJoiner
|
|
23
|
+
from metaxy.metadata_store.exceptions import (
|
|
24
|
+
DependencyError,
|
|
25
|
+
FeatureNotFoundError,
|
|
26
|
+
StoreNotOpenError,
|
|
27
|
+
)
|
|
28
|
+
from metaxy.metadata_store.system_tables import (
|
|
29
|
+
FEATURE_VERSIONS_KEY,
|
|
30
|
+
FEATURE_VERSIONS_SCHEMA,
|
|
31
|
+
SYSTEM_NAMESPACE,
|
|
32
|
+
_suppress_feature_version_warning,
|
|
33
|
+
allow_feature_version_override,
|
|
34
|
+
)
|
|
35
|
+
from metaxy.models.feature import Feature, FeatureGraph
|
|
36
|
+
from metaxy.models.field import FieldDep, SpecialFieldDep
|
|
37
|
+
from metaxy.models.plan import FeaturePlan, FQFieldKey
|
|
38
|
+
from metaxy.models.types import FeatureKey, FieldKey
|
|
39
|
+
|
|
40
|
+
if TYPE_CHECKING:
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
# Removed TRef - all stores now use Narwhals LazyFrames universally
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _is_using_polars_components(
|
|
47
|
+
components: tuple[UpstreamJoiner, DataVersionCalculator, MetadataDiffResolver],
|
|
48
|
+
) -> TypeGuard[
|
|
49
|
+
tuple[NarwhalsJoiner, PolarsDataVersionCalculator, NarwhalsDiffResolver]
|
|
50
|
+
]:
|
|
51
|
+
"""Type guard to check if using Narwhals components.
|
|
52
|
+
|
|
53
|
+
Returns True if all components are Narwhals-based, allowing type narrowing.
|
|
54
|
+
"""
|
|
55
|
+
joiner, calculator, diff_resolver = components
|
|
56
|
+
return (
|
|
57
|
+
isinstance(joiner, NarwhalsJoiner)
|
|
58
|
+
and isinstance(calculator, PolarsDataVersionCalculator)
|
|
59
|
+
and isinstance(diff_resolver, NarwhalsDiffResolver)
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class MetadataStore(ABC):
|
|
64
|
+
"""
|
|
65
|
+
Abstract base class for metadata storage backends.
|
|
66
|
+
|
|
67
|
+
Supports:
|
|
68
|
+
- Immutable metadata storage (append-only)
|
|
69
|
+
- Composable fallback store chains (for branch deployments)
|
|
70
|
+
- Automatic data version calculation using three-component architecture
|
|
71
|
+
- Backend-specific computation optimizations
|
|
72
|
+
|
|
73
|
+
All stores use Narwhals LazyFrames as their universal interface,
|
|
74
|
+
regardless of the underlying backend (Polars, Ibis/SQL, etc.).
|
|
75
|
+
|
|
76
|
+
Components:
|
|
77
|
+
Components are created on-demand in resolve_update() based on:
|
|
78
|
+
- User preference (prefer_native flag)
|
|
79
|
+
- Whether all upstream data is local (or needs fallback stores)
|
|
80
|
+
- Store capabilities (whether it supports native data version calculations)
|
|
81
|
+
|
|
82
|
+
If prefer_native=True and all conditions met: use native (Ibis, DuckDB, etc.)
|
|
83
|
+
Otherwise: use Polars components
|
|
84
|
+
|
|
85
|
+
Subclasses declare what native data version calculations they support via abstract methods.
|
|
86
|
+
|
|
87
|
+
Context Manager:
|
|
88
|
+
Stores must be used as context managers for resource management.
|
|
89
|
+
"""
|
|
90
|
+
|
|
91
|
+
def __init__(
|
|
92
|
+
self,
|
|
93
|
+
*,
|
|
94
|
+
hash_algorithm: HashAlgorithm | None = None,
|
|
95
|
+
prefer_native: bool = True,
|
|
96
|
+
fallback_stores: list[MetadataStore] | None = None,
|
|
97
|
+
):
|
|
98
|
+
"""
|
|
99
|
+
Initialize metadata store.
|
|
100
|
+
|
|
101
|
+
Args:
|
|
102
|
+
hash_algorithm: Hash algorithm to use for data versioning.
|
|
103
|
+
Default: None (uses default algorithm for this store type)
|
|
104
|
+
prefer_native: If True, prefer native data version calculations when possible.
|
|
105
|
+
If False, always use Polars components. Default: True
|
|
106
|
+
fallback_stores: Ordered list of read-only fallback stores.
|
|
107
|
+
Used when upstream features are not in this store.
|
|
108
|
+
|
|
109
|
+
Raises:
|
|
110
|
+
ValueError: If fallback stores use different hash algorithms
|
|
111
|
+
"""
|
|
112
|
+
# Initialize state early so properties can check it
|
|
113
|
+
self._is_open = False
|
|
114
|
+
self._context_depth = 0
|
|
115
|
+
self._prefer_native = prefer_native
|
|
116
|
+
|
|
117
|
+
# Use store's default algorithm if not specified
|
|
118
|
+
if hash_algorithm is None:
|
|
119
|
+
hash_algorithm = self._get_default_hash_algorithm()
|
|
120
|
+
|
|
121
|
+
self.hash_algorithm = hash_algorithm
|
|
122
|
+
self.fallback_stores = fallback_stores or []
|
|
123
|
+
|
|
124
|
+
# Validation happens in open()
|
|
125
|
+
|
|
126
|
+
@abstractmethod
|
|
127
|
+
def _get_default_hash_algorithm(self) -> HashAlgorithm:
|
|
128
|
+
"""Get the default hash algorithm for this store type.
|
|
129
|
+
|
|
130
|
+
Returns:
|
|
131
|
+
Default hash algorithm
|
|
132
|
+
"""
|
|
133
|
+
pass
|
|
134
|
+
|
|
135
|
+
@abstractmethod
|
|
136
|
+
def _supports_native_components(self) -> bool:
|
|
137
|
+
"""Check if this store can use native (non-Polars) components.
|
|
138
|
+
|
|
139
|
+
Returns:
|
|
140
|
+
True if store has backend-specific native data version calculations
|
|
141
|
+
False if store only supports Polars components
|
|
142
|
+
"""
|
|
143
|
+
pass
|
|
144
|
+
|
|
145
|
+
@abstractmethod
|
|
146
|
+
def _create_native_components(
|
|
147
|
+
self,
|
|
148
|
+
) -> tuple[UpstreamJoiner, DataVersionCalculator, MetadataDiffResolver]:
|
|
149
|
+
"""Create native data version calculations for this store.
|
|
150
|
+
|
|
151
|
+
Only called if _supports_native_components() returns True.
|
|
152
|
+
|
|
153
|
+
Returns:
|
|
154
|
+
Tuple of (joiner, calculator, diff_resolver) with appropriate types
|
|
155
|
+
for this store's backend (Narwhals-compatible)
|
|
156
|
+
|
|
157
|
+
Raises:
|
|
158
|
+
NotImplementedError: If store doesn't support native data version calculations
|
|
159
|
+
"""
|
|
160
|
+
pass
|
|
161
|
+
|
|
162
|
+
@abstractmethod
|
|
163
|
+
def open(self) -> None:
|
|
164
|
+
"""Open/initialize the store for operations.
|
|
165
|
+
|
|
166
|
+
Called by __enter__. Subclasses implement connection setup here.
|
|
167
|
+
Can be called manually but context manager usage is recommended.
|
|
168
|
+
"""
|
|
169
|
+
pass
|
|
170
|
+
|
|
171
|
+
@abstractmethod
|
|
172
|
+
def close(self) -> None:
|
|
173
|
+
"""Close/cleanup the store.
|
|
174
|
+
|
|
175
|
+
Called by __exit__. Subclasses implement connection cleanup here.
|
|
176
|
+
Can be called manually but context manager usage is recommended.
|
|
177
|
+
"""
|
|
178
|
+
pass
|
|
179
|
+
|
|
180
|
+
def __enter__(self) -> Self:
|
|
181
|
+
"""Enter context manager."""
|
|
182
|
+
# Track nesting depth
|
|
183
|
+
self._context_depth += 1
|
|
184
|
+
|
|
185
|
+
# Only open on first enter
|
|
186
|
+
if self._context_depth == 1:
|
|
187
|
+
self.open()
|
|
188
|
+
self._is_open = True
|
|
189
|
+
|
|
190
|
+
# Validate after opening (when all components are ready)
|
|
191
|
+
self._validate_after_open()
|
|
192
|
+
|
|
193
|
+
return self
|
|
194
|
+
|
|
195
|
+
def _validate_after_open(self) -> None:
|
|
196
|
+
"""Validate configuration after store is opened.
|
|
197
|
+
|
|
198
|
+
Called automatically by __enter__ after open().
|
|
199
|
+
Validates hash algorithm compatibility and fallback store consistency.
|
|
200
|
+
"""
|
|
201
|
+
# Validate hash algorithm compatibility with components
|
|
202
|
+
self.validate_hash_algorithm(check_fallback_stores=True)
|
|
203
|
+
|
|
204
|
+
# Validate fallback stores use the same hash algorithm
|
|
205
|
+
for i, fallback_store in enumerate(self.fallback_stores):
|
|
206
|
+
if fallback_store.hash_algorithm != self.hash_algorithm:
|
|
207
|
+
raise ValueError(
|
|
208
|
+
f"Fallback store {i} uses hash_algorithm='{fallback_store.hash_algorithm.value}' "
|
|
209
|
+
f"but this store uses '{self.hash_algorithm.value}'. "
|
|
210
|
+
f"All stores in a fallback chain must use the same hash algorithm."
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
|
214
|
+
"""Exit context manager."""
|
|
215
|
+
# Decrement depth
|
|
216
|
+
self._context_depth -= 1
|
|
217
|
+
|
|
218
|
+
# Only close when fully exited
|
|
219
|
+
if self._context_depth == 0:
|
|
220
|
+
self._is_open = False
|
|
221
|
+
self.close()
|
|
222
|
+
|
|
223
|
+
def _check_open(self) -> None:
|
|
224
|
+
"""Check if store is open, raise error if not.
|
|
225
|
+
|
|
226
|
+
Raises:
|
|
227
|
+
StoreNotOpenError: If store is not open
|
|
228
|
+
"""
|
|
229
|
+
if not self._is_open:
|
|
230
|
+
raise StoreNotOpenError(
|
|
231
|
+
f"{self.__class__.__name__} must be opened before use. "
|
|
232
|
+
"Use it as a context manager: `with store: ...`"
|
|
233
|
+
)
|
|
234
|
+
|
|
235
|
+
# ========== Hash Algorithm Validation ==========
|
|
236
|
+
|
|
237
|
+
def validate_hash_algorithm(
|
|
238
|
+
self,
|
|
239
|
+
check_fallback_stores: bool = True,
|
|
240
|
+
) -> None:
|
|
241
|
+
"""Validate that hash algorithm is supported by this store's components.
|
|
242
|
+
|
|
243
|
+
Public method - can be called to verify hash compatibility.
|
|
244
|
+
|
|
245
|
+
Args:
|
|
246
|
+
check_fallback_stores: If True, also validate hash is supported by
|
|
247
|
+
fallback stores (ensures compatibility for future cross-store operations)
|
|
248
|
+
|
|
249
|
+
Raises:
|
|
250
|
+
ValueError: If hash algorithm not supported by components or fallback stores
|
|
251
|
+
"""
|
|
252
|
+
# Check if this store can support the algorithm
|
|
253
|
+
# Try native data version calculations first (if supported), then Polars
|
|
254
|
+
supported_algorithms = []
|
|
255
|
+
|
|
256
|
+
if self._supports_native_components():
|
|
257
|
+
try:
|
|
258
|
+
_, calculator, _ = self._create_native_components()
|
|
259
|
+
supported_algorithms = calculator.supported_algorithms
|
|
260
|
+
except Exception:
|
|
261
|
+
# If native data version calculations fail, fall back to Polars
|
|
262
|
+
pass
|
|
263
|
+
|
|
264
|
+
# If no native support or prefer_native=False, use Polars
|
|
265
|
+
if not supported_algorithms:
|
|
266
|
+
polars_calc = PolarsDataVersionCalculator()
|
|
267
|
+
supported_algorithms = polars_calc.supported_algorithms
|
|
268
|
+
|
|
269
|
+
if self.hash_algorithm not in supported_algorithms:
|
|
270
|
+
from metaxy.metadata_store.exceptions import (
|
|
271
|
+
HashAlgorithmNotSupportedError,
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
raise HashAlgorithmNotSupportedError(
|
|
275
|
+
f"Hash algorithm {self.hash_algorithm} not supported by {self.__class__.__name__}. "
|
|
276
|
+
f"Supported: {supported_algorithms}"
|
|
277
|
+
)
|
|
278
|
+
|
|
279
|
+
# Check fallback stores
|
|
280
|
+
if check_fallback_stores:
|
|
281
|
+
for fallback in self.fallback_stores:
|
|
282
|
+
fallback.validate_hash_algorithm(check_fallback_stores=False)
|
|
283
|
+
|
|
284
|
+
# ========== Helper Methods ==========
|
|
285
|
+
|
|
286
|
+
def _is_system_table(self, feature_key: FeatureKey) -> bool:
|
|
287
|
+
"""Check if feature key is a system table."""
|
|
288
|
+
return len(feature_key) >= 1 and feature_key[0] == SYSTEM_NAMESPACE
|
|
289
|
+
|
|
290
|
+
def _resolve_feature_key(self, feature: FeatureKey | type[Feature]) -> FeatureKey:
|
|
291
|
+
"""Resolve a Feature class or FeatureKey to FeatureKey."""
|
|
292
|
+
if isinstance(feature, FeatureKey):
|
|
293
|
+
return feature
|
|
294
|
+
else:
|
|
295
|
+
return feature.spec.key
|
|
296
|
+
|
|
297
|
+
def _resolve_feature_plan(self, feature: FeatureKey | type[Feature]) -> FeaturePlan:
|
|
298
|
+
"""Resolve to FeaturePlan for dependency resolution."""
|
|
299
|
+
if isinstance(feature, FeatureKey):
|
|
300
|
+
# When given a FeatureKey, get the graph from the active context
|
|
301
|
+
return FeatureGraph.get_active().get_feature_plan(feature)
|
|
302
|
+
else:
|
|
303
|
+
# When given a Feature class, use its bound graph
|
|
304
|
+
return feature.graph.get_feature_plan(feature.spec.key)
|
|
305
|
+
|
|
306
|
+
# ========== Core CRUD Operations ==========
|
|
307
|
+
|
|
308
|
+
@abstractmethod
|
|
309
|
+
def _write_metadata_impl(
|
|
310
|
+
self,
|
|
311
|
+
feature_key: FeatureKey,
|
|
312
|
+
df: pl.DataFrame,
|
|
313
|
+
) -> None:
|
|
314
|
+
"""
|
|
315
|
+
Internal write implementation (backend-specific).
|
|
316
|
+
|
|
317
|
+
Args:
|
|
318
|
+
feature_key: Feature key to write to
|
|
319
|
+
df: DataFrame with metadata (already validated)
|
|
320
|
+
|
|
321
|
+
Note: Subclasses implement this for their storage backend.
|
|
322
|
+
"""
|
|
323
|
+
pass
|
|
324
|
+
|
|
325
|
+
def write_metadata(
|
|
326
|
+
self,
|
|
327
|
+
feature: FeatureKey | type[Feature],
|
|
328
|
+
df: nw.DataFrame[Any] | pl.DataFrame,
|
|
329
|
+
) -> None:
|
|
330
|
+
"""
|
|
331
|
+
Write metadata for a feature (immutable, append-only).
|
|
332
|
+
|
|
333
|
+
Automatically adds 'feature_version' column from current code state,
|
|
334
|
+
unless the DataFrame already contains one (useful for migrations).
|
|
335
|
+
|
|
336
|
+
Args:
|
|
337
|
+
feature: Feature to write metadata for
|
|
338
|
+
df: Narwhals DataFrame or Polars DataFrame containing metadata.
|
|
339
|
+
Must have 'data_version' column of type Struct with fields matching feature's fields.
|
|
340
|
+
May optionally contain 'feature_version' column (for migrations).
|
|
341
|
+
|
|
342
|
+
Raises:
|
|
343
|
+
MetadataSchemaError: If DataFrame schema is invalid
|
|
344
|
+
StoreNotOpenError: If store is not open
|
|
345
|
+
|
|
346
|
+
Note:
|
|
347
|
+
- Always writes to current store, never to fallback stores.
|
|
348
|
+
- If df already contains 'feature_version' column, it will be used
|
|
349
|
+
as-is (no replacement). This allows migrations to write historical
|
|
350
|
+
versions. A warning is issued unless suppressed via context manager.
|
|
351
|
+
"""
|
|
352
|
+
self._check_open()
|
|
353
|
+
feature_key = self._resolve_feature_key(feature)
|
|
354
|
+
is_system_table = self._is_system_table(feature_key)
|
|
355
|
+
|
|
356
|
+
# Convert Narwhals to Polars if needed
|
|
357
|
+
if isinstance(df, nw.DataFrame):
|
|
358
|
+
df = df.to_polars()
|
|
359
|
+
# nw.DataFrame also matches as DataFrame in some contexts, ensure it's Polars
|
|
360
|
+
if not isinstance(df, pl.DataFrame):
|
|
361
|
+
# Must be some other type - shouldn't happen but handle defensively
|
|
362
|
+
if hasattr(df, "to_polars"):
|
|
363
|
+
df = df.to_polars()
|
|
364
|
+
elif hasattr(df, "to_pandas"):
|
|
365
|
+
df = pl.from_pandas(df.to_pandas())
|
|
366
|
+
else:
|
|
367
|
+
raise TypeError(f"Cannot convert {type(df)} to Polars DataFrame")
|
|
368
|
+
|
|
369
|
+
# For system tables, write directly without feature_version tracking
|
|
370
|
+
if is_system_table:
|
|
371
|
+
self._validate_schema_system_table(df)
|
|
372
|
+
self._write_metadata_impl(feature_key, df)
|
|
373
|
+
return
|
|
374
|
+
|
|
375
|
+
# For regular features: add feature_version and snapshot_version, validate, and write
|
|
376
|
+
# Check if feature_version and snapshot_version already exist in DataFrame
|
|
377
|
+
if "feature_version" in df.columns and "snapshot_version" in df.columns:
|
|
378
|
+
# DataFrame already has feature_version and snapshot_version - use as-is
|
|
379
|
+
# This is intended for migrations writing historical versions
|
|
380
|
+
# Issue a warning unless we're in a suppression context
|
|
381
|
+
if not _suppress_feature_version_warning.get():
|
|
382
|
+
import warnings
|
|
383
|
+
|
|
384
|
+
warnings.warn(
|
|
385
|
+
f"Writing metadata for {feature_key.to_string()} with existing "
|
|
386
|
+
f"feature_version and snapshot_version columns. This is intended for migrations only. "
|
|
387
|
+
f"Normal code should let write_metadata() add the current versions automatically.",
|
|
388
|
+
UserWarning,
|
|
389
|
+
stacklevel=2,
|
|
390
|
+
)
|
|
391
|
+
else:
|
|
392
|
+
# Get current feature version and snapshot_version from code and add them
|
|
393
|
+
if isinstance(feature, type) and issubclass(feature, Feature):
|
|
394
|
+
current_feature_version = feature.feature_version() # type: ignore[attr-defined]
|
|
395
|
+
else:
|
|
396
|
+
from metaxy.models.feature import FeatureGraph
|
|
397
|
+
|
|
398
|
+
graph = FeatureGraph.get_active()
|
|
399
|
+
feature_cls = graph.features_by_key[feature_key]
|
|
400
|
+
current_feature_version = feature_cls.feature_version() # type: ignore[attr-defined]
|
|
401
|
+
|
|
402
|
+
# Get snapshot_version from active graph
|
|
403
|
+
from metaxy.models.feature import FeatureGraph
|
|
404
|
+
|
|
405
|
+
graph = FeatureGraph.get_active()
|
|
406
|
+
current_snapshot_version = graph.snapshot_version
|
|
407
|
+
|
|
408
|
+
df = df.with_columns(
|
|
409
|
+
[
|
|
410
|
+
pl.lit(current_feature_version).alias("feature_version"),
|
|
411
|
+
pl.lit(current_snapshot_version).alias("snapshot_version"),
|
|
412
|
+
]
|
|
413
|
+
)
|
|
414
|
+
|
|
415
|
+
# Validate schema
|
|
416
|
+
self._validate_schema(df)
|
|
417
|
+
|
|
418
|
+
# Write metadata
|
|
419
|
+
self._write_metadata_impl(feature_key, df)
|
|
420
|
+
|
|
421
|
+
def _validate_schema(self, df: pl.DataFrame) -> None:
|
|
422
|
+
"""
|
|
423
|
+
Validate that DataFrame has required schema.
|
|
424
|
+
|
|
425
|
+
Args:
|
|
426
|
+
df: DataFrame to validate
|
|
427
|
+
|
|
428
|
+
Raises:
|
|
429
|
+
MetadataSchemaError: If schema is invalid
|
|
430
|
+
"""
|
|
431
|
+
from metaxy.metadata_store.exceptions import MetadataSchemaError
|
|
432
|
+
|
|
433
|
+
# Check for data_version column
|
|
434
|
+
if "data_version" not in df.columns:
|
|
435
|
+
raise MetadataSchemaError("DataFrame must have 'data_version' column")
|
|
436
|
+
|
|
437
|
+
# Check that data_version is a struct
|
|
438
|
+
data_version_type = df.schema["data_version"]
|
|
439
|
+
if not isinstance(data_version_type, pl.Struct):
|
|
440
|
+
raise MetadataSchemaError(
|
|
441
|
+
f"'data_version' column must be pl.Struct, got {data_version_type}"
|
|
442
|
+
)
|
|
443
|
+
|
|
444
|
+
# Check for feature_version column
|
|
445
|
+
if "feature_version" not in df.columns:
|
|
446
|
+
raise MetadataSchemaError("DataFrame must have 'feature_version' column")
|
|
447
|
+
|
|
448
|
+
# Check for snapshot_version column
|
|
449
|
+
if "snapshot_version" not in df.columns:
|
|
450
|
+
raise MetadataSchemaError("DataFrame must have 'snapshot_version' column")
|
|
451
|
+
|
|
452
|
+
def _validate_schema_system_table(self, df: pl.DataFrame) -> None:
|
|
453
|
+
"""Validate schema for system tables (minimal validation)."""
|
|
454
|
+
# System tables don't need data_version column
|
|
455
|
+
pass
|
|
456
|
+
|
|
457
|
+
@abstractmethod
|
|
458
|
+
def _drop_feature_metadata_impl(self, feature_key: FeatureKey) -> None:
|
|
459
|
+
"""Drop/delete all metadata for a feature.
|
|
460
|
+
|
|
461
|
+
Backend-specific implementation for dropping feature metadata.
|
|
462
|
+
|
|
463
|
+
Args:
|
|
464
|
+
feature_key: The feature key to drop metadata for
|
|
465
|
+
"""
|
|
466
|
+
pass
|
|
467
|
+
|
|
468
|
+
def drop_feature_metadata(self, feature: FeatureKey | type[Feature]) -> None:
|
|
469
|
+
"""Drop all metadata for a feature.
|
|
470
|
+
|
|
471
|
+
This removes all stored metadata for the specified feature from the store.
|
|
472
|
+
Useful for cleanup in tests or when re-computing feature metadata from scratch.
|
|
473
|
+
|
|
474
|
+
Args:
|
|
475
|
+
feature: Feature class or key to drop metadata for
|
|
476
|
+
|
|
477
|
+
Example:
|
|
478
|
+
>>> store.drop_feature_metadata(MyFeature)
|
|
479
|
+
>>> assert not store.has_feature(MyFeature)
|
|
480
|
+
"""
|
|
481
|
+
self._check_open()
|
|
482
|
+
feature_key = self._resolve_feature_key(feature)
|
|
483
|
+
self._drop_feature_metadata_impl(feature_key)
|
|
484
|
+
|
|
485
|
+
def record_feature_graph_snapshot(self) -> tuple[str, bool]:
|
|
486
|
+
"""Record all features in graph with a graph snapshot version.
|
|
487
|
+
|
|
488
|
+
This should be called during CD (Continuous Deployment) to record what
|
|
489
|
+
feature versions are being deployed. Typically invoked via `metaxy push`.
|
|
490
|
+
|
|
491
|
+
Records all features in the graph with the same snapshot_version, representing
|
|
492
|
+
a consistent state of the entire feature graph based on code definitions.
|
|
493
|
+
|
|
494
|
+
The snapshot_version is a deterministic hash of all feature_version hashes
|
|
495
|
+
in the graph, making it idempotent - calling multiple times with the
|
|
496
|
+
same feature definitions produces the same snapshot_version.
|
|
497
|
+
|
|
498
|
+
Returns:
|
|
499
|
+
A tuple containing the generated snapshot_version (deterministic hash) and a boolean indicating if the snapshot was recorded or already exists.
|
|
500
|
+
"""
|
|
501
|
+
|
|
502
|
+
from metaxy.models.feature import FeatureGraph
|
|
503
|
+
|
|
504
|
+
graph = FeatureGraph.get_active()
|
|
505
|
+
|
|
506
|
+
# Use to_snapshot() to get the snapshot dict
|
|
507
|
+
snapshot_dict = graph.to_snapshot()
|
|
508
|
+
|
|
509
|
+
# Generate deterministic snapshot_version from graph
|
|
510
|
+
snapshot_version = graph.snapshot_version
|
|
511
|
+
|
|
512
|
+
# Read existing feature versions once
|
|
513
|
+
try:
|
|
514
|
+
existing_versions_lazy = self._read_metadata_native(FEATURE_VERSIONS_KEY)
|
|
515
|
+
# Materialize to Polars for iteration
|
|
516
|
+
existing_versions = (
|
|
517
|
+
existing_versions_lazy.collect().to_polars()
|
|
518
|
+
if existing_versions_lazy is not None
|
|
519
|
+
else None
|
|
520
|
+
)
|
|
521
|
+
except Exception:
|
|
522
|
+
# Table doesn't exist yet
|
|
523
|
+
existing_versions = None
|
|
524
|
+
|
|
525
|
+
# Check if this exact snapshot already exists
|
|
526
|
+
snapshot_already_exists = False
|
|
527
|
+
if existing_versions is not None:
|
|
528
|
+
snapshot_already_exists = (
|
|
529
|
+
existing_versions.filter(
|
|
530
|
+
pl.col("snapshot_version") == snapshot_version
|
|
531
|
+
).height
|
|
532
|
+
> 0
|
|
533
|
+
)
|
|
534
|
+
|
|
535
|
+
# If snapshot already exists, we're done (idempotent)
|
|
536
|
+
if snapshot_already_exists:
|
|
537
|
+
return snapshot_version, True
|
|
538
|
+
|
|
539
|
+
# Build records from snapshot_dict
|
|
540
|
+
records = []
|
|
541
|
+
for feature_key_str in sorted(snapshot_dict.keys()):
|
|
542
|
+
feature_data = snapshot_dict[feature_key_str]
|
|
543
|
+
|
|
544
|
+
# Serialize complete FeatureSpec
|
|
545
|
+
feature_spec_json = json.dumps(feature_data["feature_spec"])
|
|
546
|
+
|
|
547
|
+
# Always record all features for this snapshot (don't skip based on feature_version alone)
|
|
548
|
+
# Each snapshot must be complete to support migration detection
|
|
549
|
+
records.append(
|
|
550
|
+
{
|
|
551
|
+
"feature_key": feature_key_str,
|
|
552
|
+
"feature_version": feature_data["feature_version"],
|
|
553
|
+
"recorded_at": datetime.now(timezone.utc),
|
|
554
|
+
"feature_spec": feature_spec_json,
|
|
555
|
+
"feature_class_path": feature_data["feature_class_path"],
|
|
556
|
+
"snapshot_version": snapshot_version,
|
|
557
|
+
}
|
|
558
|
+
)
|
|
559
|
+
|
|
560
|
+
# Bulk write all new records at once
|
|
561
|
+
if records:
|
|
562
|
+
version_records = pl.DataFrame(
|
|
563
|
+
records,
|
|
564
|
+
schema=FEATURE_VERSIONS_SCHEMA,
|
|
565
|
+
)
|
|
566
|
+
self._write_metadata_impl(FEATURE_VERSIONS_KEY, version_records)
|
|
567
|
+
|
|
568
|
+
return snapshot_version, False
|
|
569
|
+
|
|
570
|
+
@abstractmethod
|
|
571
|
+
def _read_metadata_native(
|
|
572
|
+
self,
|
|
573
|
+
feature: FeatureKey | type[Feature],
|
|
574
|
+
*,
|
|
575
|
+
feature_version: str | None = None,
|
|
576
|
+
filters: Sequence[nw.Expr] | None = None,
|
|
577
|
+
columns: Sequence[str] | None = None,
|
|
578
|
+
) -> nw.LazyFrame[Any] | None:
|
|
579
|
+
"""
|
|
580
|
+
Read metadata from THIS store only (no fallback).
|
|
581
|
+
|
|
582
|
+
Args:
|
|
583
|
+
feature: Feature to read metadata for
|
|
584
|
+
feature_version: Filter by specific feature_version (applied natively in store)
|
|
585
|
+
filters: List of Narwhals filter expressions for this specific feature.
|
|
586
|
+
columns: Subset of columns to return
|
|
587
|
+
|
|
588
|
+
Returns:
|
|
589
|
+
Narwhals LazyFrame with metadata, or None if feature not found locally
|
|
590
|
+
"""
|
|
591
|
+
pass
|
|
592
|
+
|
|
593
|
+
def read_metadata(
|
|
594
|
+
self,
|
|
595
|
+
feature: FeatureKey | type[Feature],
|
|
596
|
+
*,
|
|
597
|
+
feature_version: str | None = None,
|
|
598
|
+
filters: Sequence[nw.Expr] | None = None,
|
|
599
|
+
columns: Sequence[str] | None = None,
|
|
600
|
+
allow_fallback: bool = True,
|
|
601
|
+
current_only: bool = True,
|
|
602
|
+
) -> nw.LazyFrame[Any]:
|
|
603
|
+
"""
|
|
604
|
+
Read metadata with optional fallback to upstream stores.
|
|
605
|
+
|
|
606
|
+
Args:
|
|
607
|
+
feature: Feature to read metadata for
|
|
608
|
+
feature_version: Explicit feature_version to filter by (mutually exclusive with current_only=True)
|
|
609
|
+
filters: Sequence of Narwhals filter expressions to apply to this feature.
|
|
610
|
+
Example: [nw.col("x") > 10, nw.col("y") < 5]
|
|
611
|
+
columns: Subset of columns to return
|
|
612
|
+
allow_fallback: If True, check fallback stores on local miss
|
|
613
|
+
current_only: If True, only return rows with current feature_version
|
|
614
|
+
(default: True for safety)
|
|
615
|
+
|
|
616
|
+
Returns:
|
|
617
|
+
Narwhals LazyFrame with metadata
|
|
618
|
+
|
|
619
|
+
Raises:
|
|
620
|
+
FeatureNotFoundError: If feature not found in any store
|
|
621
|
+
ValueError: If both feature_version and current_only=True are provided
|
|
622
|
+
"""
|
|
623
|
+
feature_key = self._resolve_feature_key(feature)
|
|
624
|
+
is_system_table = self._is_system_table(feature_key)
|
|
625
|
+
|
|
626
|
+
# Validate mutually exclusive parameters
|
|
627
|
+
if feature_version is not None and current_only:
|
|
628
|
+
raise ValueError(
|
|
629
|
+
"Cannot specify both feature_version and current_only=True. "
|
|
630
|
+
"Use current_only=False with feature_version parameter."
|
|
631
|
+
)
|
|
632
|
+
|
|
633
|
+
# Determine which feature_version to use
|
|
634
|
+
feature_version_filter = feature_version
|
|
635
|
+
if current_only and not is_system_table:
|
|
636
|
+
# Get current feature_version
|
|
637
|
+
if isinstance(feature, type) and issubclass(feature, Feature):
|
|
638
|
+
feature_version_filter = feature.feature_version() # type: ignore[attr-defined]
|
|
639
|
+
else:
|
|
640
|
+
from metaxy.models.feature import FeatureGraph
|
|
641
|
+
|
|
642
|
+
graph = FeatureGraph.get_active()
|
|
643
|
+
# Only try to get from graph if feature_key exists in graph
|
|
644
|
+
# This allows reading system tables or external features not in current graph
|
|
645
|
+
if feature_key in graph.features_by_key:
|
|
646
|
+
feature_cls = graph.features_by_key[feature_key]
|
|
647
|
+
feature_version_filter = feature_cls.feature_version() # type: ignore[attr-defined]
|
|
648
|
+
else:
|
|
649
|
+
# Feature not in graph - skip feature_version filtering
|
|
650
|
+
feature_version_filter = None
|
|
651
|
+
|
|
652
|
+
# Try local first with filters
|
|
653
|
+
lazy_frame = self._read_metadata_native(
|
|
654
|
+
feature,
|
|
655
|
+
feature_version=feature_version_filter,
|
|
656
|
+
filters=filters, # Pass filters directly
|
|
657
|
+
columns=columns,
|
|
658
|
+
)
|
|
659
|
+
|
|
660
|
+
if lazy_frame is not None:
|
|
661
|
+
return lazy_frame
|
|
662
|
+
|
|
663
|
+
# Try fallback stores
|
|
664
|
+
if allow_fallback:
|
|
665
|
+
for store in self.fallback_stores:
|
|
666
|
+
try:
|
|
667
|
+
# Use full read_metadata to handle nested fallback chains
|
|
668
|
+
return store.read_metadata(
|
|
669
|
+
feature,
|
|
670
|
+
feature_version=feature_version,
|
|
671
|
+
filters=filters, # Pass through filters directly
|
|
672
|
+
columns=columns,
|
|
673
|
+
allow_fallback=True,
|
|
674
|
+
current_only=current_only, # Pass through current_only
|
|
675
|
+
)
|
|
676
|
+
except FeatureNotFoundError:
|
|
677
|
+
# Try next fallback store
|
|
678
|
+
continue
|
|
679
|
+
|
|
680
|
+
# Not found anywhere
|
|
681
|
+
raise FeatureNotFoundError(
|
|
682
|
+
f"Feature {feature_key.to_string()} not found in store"
|
|
683
|
+
+ (" or fallback stores" if allow_fallback else "")
|
|
684
|
+
)
|
|
685
|
+
|
|
686
|
+
# ========== Feature Existence ==========
|
|
687
|
+
|
|
688
|
+
def has_feature(
|
|
689
|
+
self,
|
|
690
|
+
feature: FeatureKey | type[Feature],
|
|
691
|
+
*,
|
|
692
|
+
check_fallback: bool = False,
|
|
693
|
+
) -> bool:
|
|
694
|
+
"""
|
|
695
|
+
Check if feature exists in store.
|
|
696
|
+
|
|
697
|
+
Args:
|
|
698
|
+
feature: Feature to check
|
|
699
|
+
check_fallback: If True, also check fallback stores
|
|
700
|
+
|
|
701
|
+
Returns:
|
|
702
|
+
True if feature exists, False otherwise
|
|
703
|
+
"""
|
|
704
|
+
# Check local
|
|
705
|
+
if self._read_metadata_native(feature) is not None:
|
|
706
|
+
return True
|
|
707
|
+
|
|
708
|
+
# Check fallback stores
|
|
709
|
+
if check_fallback:
|
|
710
|
+
for store in self.fallback_stores:
|
|
711
|
+
if store.has_feature(feature, check_fallback=True):
|
|
712
|
+
return True
|
|
713
|
+
|
|
714
|
+
return False
|
|
715
|
+
|
|
716
|
+
def list_features(self, *, include_fallback: bool = False) -> list[FeatureKey]:
|
|
717
|
+
"""
|
|
718
|
+
List all features in store.
|
|
719
|
+
|
|
720
|
+
Args:
|
|
721
|
+
include_fallback: If True, include features from fallback stores
|
|
722
|
+
|
|
723
|
+
Returns:
|
|
724
|
+
List of FeatureKey objects
|
|
725
|
+
|
|
726
|
+
Raises:
|
|
727
|
+
StoreNotOpenError: If store is not open
|
|
728
|
+
"""
|
|
729
|
+
self._check_open()
|
|
730
|
+
|
|
731
|
+
features = self._list_features_local()
|
|
732
|
+
|
|
733
|
+
if include_fallback:
|
|
734
|
+
for store in self.fallback_stores:
|
|
735
|
+
features.extend(store.list_features(include_fallback=True))
|
|
736
|
+
|
|
737
|
+
# Deduplicate
|
|
738
|
+
seen = set()
|
|
739
|
+
unique_features = []
|
|
740
|
+
for feature in features:
|
|
741
|
+
key_str = feature.to_string()
|
|
742
|
+
if key_str not in seen:
|
|
743
|
+
seen.add(key_str)
|
|
744
|
+
unique_features.append(feature)
|
|
745
|
+
|
|
746
|
+
return unique_features
|
|
747
|
+
|
|
748
|
+
@abstractmethod
|
|
749
|
+
def _list_features_local(self) -> list[FeatureKey]:
|
|
750
|
+
"""List features in THIS store only."""
|
|
751
|
+
pass
|
|
752
|
+
|
|
753
|
+
def read_graph_snapshots(self) -> pl.DataFrame:
|
|
754
|
+
"""Read all recorded graph snapshots from the feature_versions system table.
|
|
755
|
+
|
|
756
|
+
Returns a DataFrame with columns:
|
|
757
|
+
- snapshot_version: Unique identifier for each graph snapshot
|
|
758
|
+
- recorded_at: Timestamp when the snapshot was recorded
|
|
759
|
+
- feature_count: Number of features in this snapshot
|
|
760
|
+
|
|
761
|
+
Returns:
|
|
762
|
+
Polars DataFrame with snapshot information, sorted by recorded_at descending
|
|
763
|
+
|
|
764
|
+
Raises:
|
|
765
|
+
StoreNotOpenError: If store is not open
|
|
766
|
+
|
|
767
|
+
Example:
|
|
768
|
+
>>> with store:
|
|
769
|
+
... snapshots = store.read_graph_snapshots()
|
|
770
|
+
... latest_snapshot = snapshots["snapshot_version"][0]
|
|
771
|
+
... print(f"Latest snapshot: {latest_snapshot}")
|
|
772
|
+
"""
|
|
773
|
+
self._check_open()
|
|
774
|
+
|
|
775
|
+
versions_lazy = self._read_metadata_native(FEATURE_VERSIONS_KEY)
|
|
776
|
+
if versions_lazy is None:
|
|
777
|
+
# No snapshots recorded yet
|
|
778
|
+
return pl.DataFrame(
|
|
779
|
+
schema={
|
|
780
|
+
"snapshot_version": pl.String,
|
|
781
|
+
"recorded_at": pl.Datetime("us"),
|
|
782
|
+
"feature_count": pl.UInt32,
|
|
783
|
+
}
|
|
784
|
+
)
|
|
785
|
+
|
|
786
|
+
versions_df = versions_lazy.collect().to_polars()
|
|
787
|
+
|
|
788
|
+
# Group by snapshot_version and get earliest recorded_at and count
|
|
789
|
+
snapshots = (
|
|
790
|
+
versions_df.group_by("snapshot_version")
|
|
791
|
+
.agg(
|
|
792
|
+
[
|
|
793
|
+
pl.col("recorded_at").min().alias("recorded_at"),
|
|
794
|
+
pl.col("feature_key").count().alias("feature_count"),
|
|
795
|
+
]
|
|
796
|
+
)
|
|
797
|
+
.sort("recorded_at", descending=True)
|
|
798
|
+
)
|
|
799
|
+
|
|
800
|
+
return snapshots
|
|
801
|
+
|
|
802
|
+
def read_features(
|
|
803
|
+
self,
|
|
804
|
+
*,
|
|
805
|
+
current: bool = True,
|
|
806
|
+
snapshot_version: str | None = None,
|
|
807
|
+
) -> pl.DataFrame:
|
|
808
|
+
"""Read feature version information from the feature_versions system table.
|
|
809
|
+
|
|
810
|
+
Args:
|
|
811
|
+
current: If True, only return features from the current code snapshot.
|
|
812
|
+
If False, must provide snapshot_version.
|
|
813
|
+
snapshot_version: Specific snapshot version to filter by. Required if current=False.
|
|
814
|
+
|
|
815
|
+
Returns:
|
|
816
|
+
Polars DataFrame with columns from FEATURE_VERSIONS_SCHEMA:
|
|
817
|
+
- feature_key: Feature identifier
|
|
818
|
+
- feature_version: Version hash of the feature
|
|
819
|
+
- recorded_at: When this version was recorded
|
|
820
|
+
- feature_spec: JSON serialized feature specification
|
|
821
|
+
- feature_class_path: Python import path to the feature class
|
|
822
|
+
- snapshot_version: Graph snapshot this feature belongs to
|
|
823
|
+
|
|
824
|
+
Raises:
|
|
825
|
+
StoreNotOpenError: If store is not open
|
|
826
|
+
ValueError: If current=False but no snapshot_version provided
|
|
827
|
+
|
|
828
|
+
Examples:
|
|
829
|
+
>>> # Get features from current code
|
|
830
|
+
>>> with store:
|
|
831
|
+
... features = store.read_features(current=True)
|
|
832
|
+
... print(f"Current graph has {len(features)} features")
|
|
833
|
+
|
|
834
|
+
>>> # Get features from a specific snapshot
|
|
835
|
+
>>> with store:
|
|
836
|
+
... features = store.read_features(current=False, snapshot_version="abc123")
|
|
837
|
+
... for row in features.iter_rows(named=True):
|
|
838
|
+
... print(f"{row['feature_key']}: {row['feature_version']}")
|
|
839
|
+
"""
|
|
840
|
+
self._check_open()
|
|
841
|
+
|
|
842
|
+
if not current and snapshot_version is None:
|
|
843
|
+
raise ValueError("Must provide snapshot_version when current=False")
|
|
844
|
+
|
|
845
|
+
versions_lazy = self._read_metadata_native(FEATURE_VERSIONS_KEY)
|
|
846
|
+
if versions_lazy is None:
|
|
847
|
+
# No features recorded yet
|
|
848
|
+
return pl.DataFrame(schema=FEATURE_VERSIONS_SCHEMA)
|
|
849
|
+
|
|
850
|
+
if current:
|
|
851
|
+
# Get current snapshot from active graph
|
|
852
|
+
graph = FeatureGraph.get_active()
|
|
853
|
+
snapshot_version = graph.snapshot_version
|
|
854
|
+
|
|
855
|
+
# Filter by snapshot_version
|
|
856
|
+
versions_df = (
|
|
857
|
+
versions_lazy.filter(nw.col("snapshot_version") == snapshot_version)
|
|
858
|
+
.collect()
|
|
859
|
+
.to_polars()
|
|
860
|
+
)
|
|
861
|
+
|
|
862
|
+
return versions_df
|
|
863
|
+
|
|
864
|
+
def copy_metadata(
|
|
865
|
+
self,
|
|
866
|
+
from_store: MetadataStore,
|
|
867
|
+
features: list[FeatureKey | type[Feature]] | None = None,
|
|
868
|
+
*,
|
|
869
|
+
from_snapshot: str | None = None,
|
|
870
|
+
filters: Mapping[str, Sequence[nw.Expr]] | None = None,
|
|
871
|
+
incremental: bool = True,
|
|
872
|
+
) -> dict[str, int]:
|
|
873
|
+
"""Copy metadata from another store with fine-grained filtering.
|
|
874
|
+
|
|
875
|
+
This is a reusable method that can be called programmatically or from CLI/migrations.
|
|
876
|
+
Copies metadata for specified features, preserving the original snapshot_version.
|
|
877
|
+
|
|
878
|
+
Args:
|
|
879
|
+
from_store: Source metadata store to copy from (must be opened)
|
|
880
|
+
features: List of features to copy. Can be:
|
|
881
|
+
- None: copies all features from source store
|
|
882
|
+
- List of FeatureKey or Feature classes: copies specified features
|
|
883
|
+
from_snapshot: Snapshot version to filter source data by. If None, uses latest snapshot
|
|
884
|
+
from source store. Only rows with this snapshot_version will be copied.
|
|
885
|
+
The snapshot_version is preserved in the destination store.
|
|
886
|
+
filters: Dict mapping feature keys (as strings) to sequences of Narwhals filter expressions.
|
|
887
|
+
These filters are applied when reading from the source store.
|
|
888
|
+
Example: {"feature/key": [nw.col("x") > 10], "other/feature": [...]}
|
|
889
|
+
incremental: If True (default), filter out rows that already exist in the destination
|
|
890
|
+
store by performing an anti-join on sample_uid for the same snapshot_version.
|
|
891
|
+
|
|
892
|
+
The implementation uses an anti-join: source LEFT ANTI JOIN destination ON sample_uid
|
|
893
|
+
filtered by snapshot_version.
|
|
894
|
+
|
|
895
|
+
Disabling incremental (incremental=False) may improve performance when:
|
|
896
|
+
- You know the destination is empty or has no overlap with source
|
|
897
|
+
- The destination store uses deduplication
|
|
898
|
+
|
|
899
|
+
When incremental=False, it's the user's responsibility to avoid duplicates or
|
|
900
|
+
configure deduplication at the storage layer.
|
|
901
|
+
|
|
902
|
+
Returns:
|
|
903
|
+
Dict with statistics: {"features_copied": int, "rows_copied": int}
|
|
904
|
+
|
|
905
|
+
Raises:
|
|
906
|
+
ValueError: If from_store or self (destination) is not open
|
|
907
|
+
FeatureNotFoundError: If a specified feature doesn't exist in source store
|
|
908
|
+
|
|
909
|
+
Examples:
|
|
910
|
+
>>> # Simple: copy all features from latest snapshot
|
|
911
|
+
>>> stats = dest_store.copy_metadata(from_store=source_store)
|
|
912
|
+
|
|
913
|
+
>>> # Copy specific features from a specific snapshot
|
|
914
|
+
>>> stats = dest_store.copy_metadata(
|
|
915
|
+
... from_store=source_store,
|
|
916
|
+
... features=[FeatureKey(["my_feature"])],
|
|
917
|
+
... from_snapshot="abc123",
|
|
918
|
+
... )
|
|
919
|
+
|
|
920
|
+
>>> # Copy with filters
|
|
921
|
+
>>> stats = dest_store.copy_metadata(
|
|
922
|
+
... from_store=source_store,
|
|
923
|
+
... filters={"my/feature": [nw.col("sample_uid").is_in(["s1", "s2"])]},
|
|
924
|
+
... )
|
|
925
|
+
|
|
926
|
+
>>> # Copy specific features with filters
|
|
927
|
+
>>> stats = dest_store.copy_metadata(
|
|
928
|
+
... from_store=source_store,
|
|
929
|
+
... features=[
|
|
930
|
+
... FeatureKey(["feature_a"]),
|
|
931
|
+
... FeatureKey(["feature_b"]),
|
|
932
|
+
... ],
|
|
933
|
+
... filters={
|
|
934
|
+
... "feature_a": [nw.col("field_a") > 10, nw.col("sample_uid").is_in(["s1", "s2"])],
|
|
935
|
+
... "feature_b": [nw.col("field_b") < 30],
|
|
936
|
+
... },
|
|
937
|
+
... )
|
|
938
|
+
"""
|
|
939
|
+
import logging
|
|
940
|
+
|
|
941
|
+
logger = logging.getLogger(__name__)
|
|
942
|
+
|
|
943
|
+
# Validate destination store is open
|
|
944
|
+
if not self._is_open:
|
|
945
|
+
raise ValueError("Destination store must be opened (use context manager)")
|
|
946
|
+
|
|
947
|
+
# Automatically handle source store context manager
|
|
948
|
+
should_close_source = not from_store._is_open
|
|
949
|
+
if should_close_source:
|
|
950
|
+
from_store.__enter__()
|
|
951
|
+
|
|
952
|
+
try:
|
|
953
|
+
return self._copy_metadata_impl(
|
|
954
|
+
from_store=from_store,
|
|
955
|
+
features=features,
|
|
956
|
+
from_snapshot=from_snapshot,
|
|
957
|
+
filters=filters,
|
|
958
|
+
incremental=incremental,
|
|
959
|
+
logger=logger,
|
|
960
|
+
)
|
|
961
|
+
finally:
|
|
962
|
+
if should_close_source:
|
|
963
|
+
from_store.__exit__(None, None, None)
|
|
964
|
+
|
|
965
|
+
def _copy_metadata_impl(
|
|
966
|
+
self,
|
|
967
|
+
from_store: MetadataStore,
|
|
968
|
+
features: list[FeatureKey | type[Feature]] | None,
|
|
969
|
+
from_snapshot: str | None,
|
|
970
|
+
filters: Mapping[str, Sequence[nw.Expr]] | None,
|
|
971
|
+
incremental: bool,
|
|
972
|
+
logger,
|
|
973
|
+
) -> dict[str, int]:
|
|
974
|
+
"""Internal implementation of copy_metadata."""
|
|
975
|
+
# Determine which features to copy
|
|
976
|
+
features_to_copy: list[FeatureKey]
|
|
977
|
+
if features is None:
|
|
978
|
+
# Copy all features from source
|
|
979
|
+
features_to_copy = from_store.list_features(include_fallback=False)
|
|
980
|
+
logger.info(
|
|
981
|
+
f"Copying all features from source: {len(features_to_copy)} features"
|
|
982
|
+
)
|
|
983
|
+
else:
|
|
984
|
+
# Convert all to FeatureKey
|
|
985
|
+
features_to_copy = []
|
|
986
|
+
for item in features:
|
|
987
|
+
if isinstance(item, FeatureKey):
|
|
988
|
+
features_to_copy.append(item)
|
|
989
|
+
else:
|
|
990
|
+
# Must be Feature class
|
|
991
|
+
features_to_copy.append(item.spec.key)
|
|
992
|
+
logger.info(f"Copying {len(features_to_copy)} specified features")
|
|
993
|
+
|
|
994
|
+
# Determine from_snapshot
|
|
995
|
+
if from_snapshot is None:
|
|
996
|
+
# Get latest snapshot from source store
|
|
997
|
+
try:
|
|
998
|
+
versions_lazy = from_store._read_metadata_native(FEATURE_VERSIONS_KEY)
|
|
999
|
+
if versions_lazy is None:
|
|
1000
|
+
# No feature_versions table yet - if no features to copy, that's okay
|
|
1001
|
+
if len(features_to_copy) == 0:
|
|
1002
|
+
logger.info(
|
|
1003
|
+
"No features to copy and no snapshots in source store"
|
|
1004
|
+
)
|
|
1005
|
+
from_snapshot = None # Will be set later if needed
|
|
1006
|
+
else:
|
|
1007
|
+
raise ValueError(
|
|
1008
|
+
"Source store has no feature_versions table. Cannot determine snapshot."
|
|
1009
|
+
)
|
|
1010
|
+
elif versions_lazy is not None:
|
|
1011
|
+
versions_df = versions_lazy.collect().to_polars()
|
|
1012
|
+
if versions_df.height == 0:
|
|
1013
|
+
# Empty versions table - if no features to copy, that's okay
|
|
1014
|
+
if len(features_to_copy) == 0:
|
|
1015
|
+
logger.info(
|
|
1016
|
+
"No features to copy and no snapshots in source store"
|
|
1017
|
+
)
|
|
1018
|
+
from_snapshot = None
|
|
1019
|
+
else:
|
|
1020
|
+
raise ValueError(
|
|
1021
|
+
"Source store feature_versions table is empty. No snapshots found."
|
|
1022
|
+
)
|
|
1023
|
+
else:
|
|
1024
|
+
# Get most recent snapshot_version by recorded_at
|
|
1025
|
+
from_snapshot = (
|
|
1026
|
+
versions_df.sort("recorded_at", descending=True)
|
|
1027
|
+
.select("snapshot_version")
|
|
1028
|
+
.head(1)["snapshot_version"][0]
|
|
1029
|
+
)
|
|
1030
|
+
logger.info(
|
|
1031
|
+
f"Using latest snapshot from source: {from_snapshot}"
|
|
1032
|
+
)
|
|
1033
|
+
except Exception as e:
|
|
1034
|
+
# If we have no features to copy, continue gracefully
|
|
1035
|
+
if len(features_to_copy) == 0:
|
|
1036
|
+
logger.info(f"No features to copy: {e}")
|
|
1037
|
+
from_snapshot = None
|
|
1038
|
+
else:
|
|
1039
|
+
raise ValueError(
|
|
1040
|
+
f"Could not determine latest snapshot from source store: {e}"
|
|
1041
|
+
)
|
|
1042
|
+
else:
|
|
1043
|
+
logger.info(f"Using specified from_snapshot: {from_snapshot}")
|
|
1044
|
+
|
|
1045
|
+
# Copy metadata for each feature
|
|
1046
|
+
total_rows = 0
|
|
1047
|
+
features_copied = 0
|
|
1048
|
+
|
|
1049
|
+
with allow_feature_version_override():
|
|
1050
|
+
for feature_key in features_to_copy:
|
|
1051
|
+
try:
|
|
1052
|
+
# Read metadata from source, filtering by from_snapshot
|
|
1053
|
+
# Use current_only=False to avoid filtering by feature_version
|
|
1054
|
+
source_lazy = from_store.read_metadata(
|
|
1055
|
+
feature_key,
|
|
1056
|
+
allow_fallback=False,
|
|
1057
|
+
current_only=False,
|
|
1058
|
+
)
|
|
1059
|
+
|
|
1060
|
+
# Filter by from_snapshot
|
|
1061
|
+
import narwhals as nw
|
|
1062
|
+
|
|
1063
|
+
source_filtered = source_lazy.filter(
|
|
1064
|
+
nw.col("snapshot_version") == from_snapshot
|
|
1065
|
+
)
|
|
1066
|
+
|
|
1067
|
+
# Apply filters for this feature (if any)
|
|
1068
|
+
if filters:
|
|
1069
|
+
feature_key_str = feature_key.to_string()
|
|
1070
|
+
if feature_key_str in filters:
|
|
1071
|
+
for filter_expr in filters[feature_key_str]:
|
|
1072
|
+
source_filtered = source_filtered.filter(filter_expr)
|
|
1073
|
+
|
|
1074
|
+
# Apply incremental filtering if enabled
|
|
1075
|
+
if incremental:
|
|
1076
|
+
try:
|
|
1077
|
+
# Read existing sample_uids from destination for the same snapshot
|
|
1078
|
+
# This is much cheaper than comparing data_version structs
|
|
1079
|
+
dest_lazy = self.read_metadata(
|
|
1080
|
+
feature_key,
|
|
1081
|
+
allow_fallback=False,
|
|
1082
|
+
current_only=False,
|
|
1083
|
+
)
|
|
1084
|
+
# Filter destination to same snapshot_version
|
|
1085
|
+
dest_for_snapshot = dest_lazy.filter(
|
|
1086
|
+
nw.col("snapshot_version") == from_snapshot
|
|
1087
|
+
)
|
|
1088
|
+
|
|
1089
|
+
# Materialize destination sample_uids to avoid cross-backend join issues
|
|
1090
|
+
# When copying between different stores (e.g., different DuckDB files),
|
|
1091
|
+
# Ibis can't join tables from different backends
|
|
1092
|
+
dest_sample_uids = (
|
|
1093
|
+
dest_for_snapshot.select("sample_uid")
|
|
1094
|
+
.collect()
|
|
1095
|
+
.to_polars()
|
|
1096
|
+
)
|
|
1097
|
+
|
|
1098
|
+
# Convert to Polars LazyFrame and wrap in Narwhals
|
|
1099
|
+
dest_sample_uids_lazy = nw.from_native(
|
|
1100
|
+
dest_sample_uids.lazy(), eager_only=False
|
|
1101
|
+
)
|
|
1102
|
+
|
|
1103
|
+
# Collect source to Polars for anti-join
|
|
1104
|
+
source_df = source_filtered.collect().to_polars()
|
|
1105
|
+
source_lazy = nw.from_native(
|
|
1106
|
+
source_df.lazy(), eager_only=False
|
|
1107
|
+
)
|
|
1108
|
+
|
|
1109
|
+
# Anti-join: keep only source rows with sample_uid not in destination
|
|
1110
|
+
source_filtered = source_lazy.join(
|
|
1111
|
+
dest_sample_uids_lazy,
|
|
1112
|
+
on="sample_uid",
|
|
1113
|
+
how="anti",
|
|
1114
|
+
)
|
|
1115
|
+
|
|
1116
|
+
# Collect after filtering
|
|
1117
|
+
source_df = source_filtered.collect().to_polars()
|
|
1118
|
+
|
|
1119
|
+
logger.info(
|
|
1120
|
+
f"Incremental: copying only new sample_uids for {feature_key.to_string()}"
|
|
1121
|
+
)
|
|
1122
|
+
except FeatureNotFoundError:
|
|
1123
|
+
# Feature doesn't exist in destination yet - copy all rows
|
|
1124
|
+
logger.debug(
|
|
1125
|
+
f"Feature {feature_key.to_string()} not in destination, copying all rows"
|
|
1126
|
+
)
|
|
1127
|
+
source_df = source_filtered.collect().to_polars()
|
|
1128
|
+
except Exception as e:
|
|
1129
|
+
# If incremental check fails, log warning but continue with full copy
|
|
1130
|
+
logger.warning(
|
|
1131
|
+
f"Incremental check failed for {feature_key.to_string()}: {e}. Copying all rows."
|
|
1132
|
+
)
|
|
1133
|
+
source_df = source_filtered.collect().to_polars()
|
|
1134
|
+
else:
|
|
1135
|
+
# Non-incremental: collect all filtered rows
|
|
1136
|
+
source_df = source_filtered.collect().to_polars()
|
|
1137
|
+
|
|
1138
|
+
if source_df.height == 0:
|
|
1139
|
+
logger.warning(
|
|
1140
|
+
f"No rows found for {feature_key.to_string()} with snapshot {from_snapshot}, skipping"
|
|
1141
|
+
)
|
|
1142
|
+
continue
|
|
1143
|
+
|
|
1144
|
+
# Write to destination (preserving snapshot_version and feature_version)
|
|
1145
|
+
self.write_metadata(feature_key, source_df)
|
|
1146
|
+
|
|
1147
|
+
features_copied += 1
|
|
1148
|
+
total_rows += source_df.height
|
|
1149
|
+
logger.info(
|
|
1150
|
+
f"Copied {source_df.height} rows for {feature_key.to_string()}"
|
|
1151
|
+
)
|
|
1152
|
+
|
|
1153
|
+
except FeatureNotFoundError:
|
|
1154
|
+
logger.warning(
|
|
1155
|
+
f"Feature {feature_key.to_string()} not found in source store, skipping"
|
|
1156
|
+
)
|
|
1157
|
+
continue
|
|
1158
|
+
except Exception as e:
|
|
1159
|
+
logger.error(
|
|
1160
|
+
f"Error copying {feature_key.to_string()}: {e}", exc_info=True
|
|
1161
|
+
)
|
|
1162
|
+
raise
|
|
1163
|
+
|
|
1164
|
+
logger.info(
|
|
1165
|
+
f"Copy complete: {features_copied} features, {total_rows} total rows"
|
|
1166
|
+
)
|
|
1167
|
+
|
|
1168
|
+
return {"features_copied": features_copied, "rows_copied": total_rows}
|
|
1169
|
+
|
|
1170
|
+
# ========== Dependency Resolution ==========
|
|
1171
|
+
|
|
1172
|
+
def read_upstream_metadata(
|
|
1173
|
+
self,
|
|
1174
|
+
feature: FeatureKey | type[Feature],
|
|
1175
|
+
field: FieldKey | None = None,
|
|
1176
|
+
*,
|
|
1177
|
+
filters: Mapping[str, Sequence[nw.Expr]] | None = None,
|
|
1178
|
+
allow_fallback: bool = True,
|
|
1179
|
+
current_only: bool = True,
|
|
1180
|
+
) -> dict[str, nw.LazyFrame[Any]]:
|
|
1181
|
+
"""
|
|
1182
|
+
Read all upstream dependencies for a feature/field.
|
|
1183
|
+
|
|
1184
|
+
Args:
|
|
1185
|
+
feature: Feature whose dependencies to load
|
|
1186
|
+
field: Specific field (if None, loads all deps for feature)
|
|
1187
|
+
filters: Dict mapping feature keys (as strings) to lists of Narwhals filter expressions.
|
|
1188
|
+
Example: {"upstream/feature1": [nw.col("x") > 10], "upstream/feature2": [...]}
|
|
1189
|
+
allow_fallback: Whether to check fallback stores
|
|
1190
|
+
current_only: If True, only read current feature_version for upstream
|
|
1191
|
+
|
|
1192
|
+
Returns:
|
|
1193
|
+
Dict mapping upstream feature keys (as strings) to Narwhals LazyFrames.
|
|
1194
|
+
Each LazyFrame has a 'data_version' column (Struct).
|
|
1195
|
+
|
|
1196
|
+
Raises:
|
|
1197
|
+
DependencyError: If required upstream feature is missing
|
|
1198
|
+
"""
|
|
1199
|
+
plan = self._resolve_feature_plan(feature)
|
|
1200
|
+
|
|
1201
|
+
# Get all upstream features we need
|
|
1202
|
+
upstream_features = set()
|
|
1203
|
+
|
|
1204
|
+
if field is None:
|
|
1205
|
+
# All fields' dependencies
|
|
1206
|
+
for cont in plan.feature.fields:
|
|
1207
|
+
upstream_features.update(self._get_field_dependencies(plan, cont.key))
|
|
1208
|
+
else:
|
|
1209
|
+
# Specific field's dependencies
|
|
1210
|
+
upstream_features.update(self._get_field_dependencies(plan, field))
|
|
1211
|
+
|
|
1212
|
+
# Load metadata for each upstream feature
|
|
1213
|
+
# Use the feature's graph to look up upstream feature classes
|
|
1214
|
+
if isinstance(feature, FeatureKey):
|
|
1215
|
+
from metaxy.models.feature import FeatureGraph
|
|
1216
|
+
|
|
1217
|
+
graph = FeatureGraph.get_active()
|
|
1218
|
+
else:
|
|
1219
|
+
graph = feature.graph
|
|
1220
|
+
|
|
1221
|
+
upstream_metadata = {}
|
|
1222
|
+
for upstream_fq_key in upstream_features:
|
|
1223
|
+
upstream_feature_key = upstream_fq_key.feature
|
|
1224
|
+
|
|
1225
|
+
# Extract filters for this specific upstream feature
|
|
1226
|
+
upstream_filters = None
|
|
1227
|
+
if filters:
|
|
1228
|
+
upstream_key_str = upstream_feature_key.to_string()
|
|
1229
|
+
if upstream_key_str in filters:
|
|
1230
|
+
upstream_filters = filters[upstream_key_str]
|
|
1231
|
+
|
|
1232
|
+
try:
|
|
1233
|
+
# Look up the Feature class from the graph and pass it to read_metadata
|
|
1234
|
+
# This way we use the bound graph instead of relying on active context
|
|
1235
|
+
upstream_feature_cls = graph.features_by_key[upstream_feature_key]
|
|
1236
|
+
lazy_frame = self.read_metadata(
|
|
1237
|
+
upstream_feature_cls,
|
|
1238
|
+
filters=upstream_filters, # Pass extracted filters (Sequence or None)
|
|
1239
|
+
allow_fallback=allow_fallback,
|
|
1240
|
+
current_only=current_only, # Pass through current_only
|
|
1241
|
+
)
|
|
1242
|
+
# Use string key for dict
|
|
1243
|
+
upstream_metadata[upstream_feature_key.to_string()] = lazy_frame
|
|
1244
|
+
except FeatureNotFoundError as e:
|
|
1245
|
+
raise DependencyError(
|
|
1246
|
+
f"Missing upstream feature {upstream_feature_key.to_string()} "
|
|
1247
|
+
f"required by {plan.feature.key.to_string()}"
|
|
1248
|
+
) from e
|
|
1249
|
+
|
|
1250
|
+
return upstream_metadata
+
+    def _get_field_dependencies(
+        self, plan: FeaturePlan, field_key: FieldKey
+    ) -> set[FQFieldKey]:
+        """Get all upstream field dependencies for a given field."""
+        field = plan.feature.fields_by_key[field_key]
+        upstream = set()
+
+        if field.deps == SpecialFieldDep.ALL:
+            # All upstream features and fields
+            upstream.update(plan.all_parent_fields_by_key.keys())
+        elif isinstance(field.deps, list):
+            for dep in field.deps:
+                if isinstance(dep, FieldDep):
+                    if dep.fields == SpecialFieldDep.ALL:
+                        # All fields of this feature
+                        upstream_feature = plan.parent_features_by_key[dep.feature_key]
+                        for upstream_field in upstream_feature.fields:
+                            upstream.add(
+                                FQFieldKey(
+                                    feature=dep.feature_key,
+                                    field=upstream_field.key,
+                                )
+                            )
+                    elif isinstance(dep.fields, list):
+                        # Specific fields (avoid shadowing the `field_key` parameter)
+                        for dep_field_key in dep.fields:
+                            upstream.add(
+                                FQFieldKey(feature=dep.feature_key, field=dep_field_key)
+                            )
+
+        return upstream
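
To make the three branches above concrete, here is a self-contained sketch that mirrors the resolution logic with stand-in types; these are illustrative mocks, not metaxy's actual classes (whose constructors are not shown in this file):

from dataclasses import dataclass
from enum import Enum, auto


class SpecialFieldDep(Enum):  # stand-in for metaxy's sentinel
    ALL = auto()


@dataclass(frozen=True)
class FieldDep:  # stand-in: a dependency on (some fields of) one upstream feature
    feature_key: str
    fields: object  # SpecialFieldDep.ALL or list[str]


@dataclass(frozen=True)
class FQFieldKey:  # stand-in: fully qualified field key
    feature: str
    field: str


PARENT_FIELDS = {"raw/images": ["path", "checksum"], "raw/labels": ["label"]}


def resolve(deps) -> set[FQFieldKey]:
    if deps == SpecialFieldDep.ALL:
        # Branch 1: depend on every field of every parent
        return {FQFieldKey(f, c) for f, cols in PARENT_FIELDS.items() for c in cols}
    out: set[FQFieldKey] = set()
    for dep in deps:
        # Branch 2: all fields of one parent; Branch 3: an explicit field list
        cols = PARENT_FIELDS[dep.feature_key] if dep.fields == SpecialFieldDep.ALL else dep.fields
        out |= {FQFieldKey(dep.feature_key, c) for c in cols}
    return out


assert len(resolve(SpecialFieldDep.ALL)) == 3
assert resolve([FieldDep("raw/images", ["checksum"])]) == {FQFieldKey("raw/images", "checksum")}
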
+
+    # ========== Data Versioning API ==========
+
+    @overload
+    def resolve_update(
+        self,
+        feature: type[Feature],
+        *,
+        samples: nw.DataFrame[Any] | nw.LazyFrame[Any] | None = None,
+        filters: Mapping[str, Sequence[nw.Expr]] | None = None,
+        lazy: Literal[False] = False,
+        **kwargs,
+    ) -> DiffResult: ...
+
+    @overload
+    def resolve_update(
+        self,
+        feature: type[Feature],
+        *,
+        samples: nw.DataFrame[Any] | nw.LazyFrame[Any] | None = None,
+        filters: Mapping[str, Sequence[nw.Expr]] | None = None,
+        lazy: Literal[True],
+        **kwargs,
+    ) -> LazyDiffResult: ...
+
+    def resolve_update(
+        self,
+        feature: type[Feature],
+        *,
+        samples: nw.DataFrame[Any] | nw.LazyFrame[Any] | None = None,
+        filters: Mapping[str, Sequence[nw.Expr]] | None = None,
+        lazy: bool = False,
+        **kwargs,
+    ) -> DiffResult | LazyDiffResult:
+        """Resolve what needs updating for a feature.
+
+        Primary user-facing method. Automatically chooses the optimal strategy:
+
+        1. Root features without samples → raise an error (samples required)
+        2. All upstream local → use native data version calculations (stay in DB)
+        3. Some upstream in fallback stores → pull to memory (Polars)
+        4. samples provided → use as pre-calculated target versions (escape hatch)
+
+        Args:
+            feature: Feature class to resolve updates for.
+            samples: **Escape hatch parameter.** Pre-computed DataFrame with sample_uid
+                and data_version columns. When provided, skips upstream loading, joining,
+                and data version calculation, and goes straight to the diff.
+
+                **Required for root features** (features with no upstream dependencies).
+                Root features have no upstream to calculate data_version from, so users
+                must provide samples with manually computed data_version values.
+
+                **Optional for non-root features** as an escape hatch. Use it to bypass
+                the automatic upstream loading and data version calculation, e.g. when
+                loading upstream from custom sources, pre-computing data versions with
+                custom logic, or testing specific scenarios.
+
+                **Normal usage**: don't provide this parameter. The system automatically
+                loads upstream features and calculates data versions.
+            filters: Dict mapping feature keys (as strings) to lists of Narwhals filter
+                expressions, applied when reading upstream metadata to filter samples
+                at the source. Example: {"upstream/feature": [nw.col("x") > 10], ...}
+            lazy: If True, return LazyDiffResult with lazy Narwhals LazyFrames.
+                If False (default), return DiffResult with eager Narwhals DataFrames.
+            **kwargs: Backend-specific parameters (reserved for future use).
+
+        Returns:
+            DiffResult (eager, default) or LazyDiffResult (lazy) with:
+
+            - added: new samples not in current metadata
+            - changed: existing samples with different data_versions
+            - removed: samples in current metadata but no longer in upstream
+
+            Each frame has columns: [sample_uid, data_version, ...user columns...]
+
+        Raises:
+            ValueError: If samples are not provided for a root feature (no upstream).
+
+        Examples:
+            >>> # Root feature - samples required
+            >>> samples = pl.DataFrame({
+            ...     "sample_uid": [1, 2, 3],
+            ...     "data_version": [{"field": "h1"}, {"field": "h2"}, {"field": "h3"}],
+            ... })
+            >>> result = store.resolve_update(RootFeature, samples=nw.from_native(samples))
+
+            >>> # Non-root feature - automatic (normal usage)
+            >>> result = store.resolve_update(DownstreamFeature)
+
+            >>> # Non-root feature - with escape hatch (advanced)
+            >>> custom_samples = compute_custom_data_versions(...)
+            >>> result = store.resolve_update(DownstreamFeature, samples=custom_samples)
+
+        Note:
+            Callers typically process only added/changed samples and then call
+            write_metadata().
+        """
+        import narwhals as nw
+
+        plan = feature.graph.get_feature_plan(feature.spec.key)
+
+        # Escape hatch: if samples are provided, use them directly (skip join/calculation)
+        if samples is not None:
+            import logging
+
+            import polars as pl
+
+            logger = logging.getLogger(__name__)
+
+            # Convert samples to lazy if needed
+            samples_lazy = (
+                samples
+                if isinstance(samples, nw.LazyFrame)
+                else nw.from_native(samples.to_native().lazy())
+            )
+
+            # Check whether samples are Polars-backed (common case for the escape hatch)
+            samples_native = samples_lazy.to_native()
+            is_polars_samples = isinstance(samples_native, (pl.DataFrame, pl.LazyFrame))
+
+            if is_polars_samples and self._supports_native_components():
+                # User provided Polars samples but the store uses a native (SQL) backend;
+                # materialize current metadata to Polars for compatibility.
+                logger.warning(
+                    f"Feature {feature.spec.key}: samples parameter is Polars-backed but the store uses a native SQL backend. "
+                    f"Materializing current metadata to Polars for diff comparison. "
+                    f"For better performance, provide samples whose backend matches the store's backend."
+                )
+                # Get current metadata and materialize to Polars
+                current_lazy_native = self._read_metadata_native(
+                    feature, feature_version=feature.feature_version()
+                )
+                if current_lazy_native is not None:
+                    # Convert to Polars using Narwhals' built-in method
+                    current_lazy = nw.from_native(
+                        current_lazy_native.collect().to_polars().lazy()
+                    )
+                else:
+                    current_lazy = None
+            else:
+                # Same backend or no conversion needed - direct read
+                current_lazy = self._read_metadata_native(
+                    feature, feature_version=feature.feature_version()
+                )
+
+            # Use the diff resolver to compare samples with current metadata
+            from metaxy.data_versioning.diff.narwhals import NarwhalsDiffResolver
+
+            diff_resolver = NarwhalsDiffResolver()
+
+            lazy_result = diff_resolver.find_changes(
+                target_versions=samples_lazy,
+                current_metadata=current_lazy,
+            )
+
+            return lazy_result if lazy else lazy_result.collect()
+
+        # Root features without samples: error (samples required)
+        if not plan.deps:
+            raise ValueError(
+                f"Feature {feature.spec.key} has no upstream dependencies (root feature). "
+                f"Must provide 'samples' parameter with sample_uid and data_version columns. "
+                f"Root features require manual data_version computation."
+            )
+
+        # Non-root features without samples: automatic upstream loading.
+        # Check where the upstream data lives.
+        upstream_location = self._check_upstream_location(feature)
+
+        if upstream_location == "all_local":
+            # All upstream in this store - use native data version calculations
+            return self._resolve_update_native(feature, filters=filters, lazy=lazy)
+        else:
+            # Some upstream in fallback stores - use Polars components
+            return self._resolve_update_polars(feature, filters=filters, lazy=lazy)
+
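
A sketch of the typical call pattern, assuming the hypothetical `store` from earlier and a non-root `DownstreamFeature` registered in the active graph:

import narwhals as nw

from myproject.features import DownstreamFeature  # hypothetical non-root feature

# Eager by default: DiffResult with added/changed/removed Narwhals DataFrames.
result = store.resolve_update(DownstreamFeature)

# Usually only added + changed samples need (re)computation.
todo = nw.concat([result.added, result.changed])
for row in todo.iter_rows(named=True):
    ...  # recompute this sample, then persist metadata via store.write_metadata(...)

# Lazy variant: frames stay lazy until collected.
lazy_result = store.resolve_update(DownstreamFeature, lazy=True)
added = lazy_result.added.collect()
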
+    def _check_upstream_location(self, feature: type[Feature]) -> str:
+        """Check whether all upstream features are in this store or in fallback stores.
+
+        Returns:
+            "all_local" if all upstream features are in this store,
+            "has_fallback" if any upstream feature is in a fallback store.
+        """
+        plan = feature.graph.get_feature_plan(feature.spec.key)
+
+        if not plan.deps:
+            return "all_local"  # No dependencies
+
+        for upstream_spec in plan.deps:
+            if not self.has_feature(upstream_spec.key, check_fallback=False):
+                return "has_fallback"  # At least one upstream is in a fallback store
+
+        return "all_local"
+
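
For orientation, the dispatch this helper drives, sketched with the same hypothetical `store` and `DownstreamFeature` as above:

# has_feature(..., check_fallback=False) consults only this store, so any
# upstream living in a fallback store flips the branch.
if store._check_upstream_location(DownstreamFeature) == "all_local":
    result = store._resolve_update_native(DownstreamFeature)  # SQL-side joins/hashes
else:  # "has_fallback"
    result = store._resolve_update_polars(DownstreamFeature)  # in-memory Polars path
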
+    def _resolve_update_native(
+        self,
+        feature: type[Feature],
+        *,
+        filters: Mapping[str, Sequence[nw.Expr]] | None = None,
+        lazy: bool = False,
+    ) -> DiffResult | LazyDiffResult:
+        """Resolve using native data version calculations (all data in this store).
+
+        Uses native data version calculators when available (e.g., IbisDataVersionCalculator
+        for SQL stores) to execute operations in the database without pulling data into memory.
+
+        For stores that support native data version calculations (DuckDB, ClickHouse), this method:
+
+        - Executes joins and diffs lazily via Narwhals
+        - Computes hashes using native SQL functions (xxHash64, MD5, etc.)
+        - Does not materialize data into memory until the result is collected
+          (with lazy=False the diff is collected for you)
+
+        For stores without native support, falls back to PolarsDataVersionCalculator.
+        """
+        import logging
+
+        logger = logging.getLogger(__name__)
+        plan = feature.graph.get_feature_plan(feature.spec.key)
+
+        # Root features should be handled in resolve_update() with the samples parameter;
+        # this method should only be called for features with upstream dependencies.
+        if not plan.deps:
+            raise RuntimeError(
+                f"Internal error: _resolve_update_native called for root feature {feature.spec.key}. "
+                f"Root features should be handled in resolve_update() with the samples parameter."
+            )
+
+        # Create components based on native support. Only fall back to Polars if the
+        # store explicitly doesn't support native data version calculations.
+        if self._supports_native_components():
+            joiner, calculator, diff_resolver = self._create_native_components()
+            logger.debug(
+                f"Using native calculator for {feature.spec.key}: {calculator.__class__.__name__}"
+            )
+        else:
+            # Store doesn't support native data version calculations - use Polars
+            from metaxy.data_versioning.calculators.polars import (
+                PolarsDataVersionCalculator,
+            )
+            from metaxy.data_versioning.diff.narwhals import NarwhalsDiffResolver
+            from metaxy.data_versioning.joiners.narwhals import NarwhalsJoiner
+
+            joiner = NarwhalsJoiner()
+            calculator = PolarsDataVersionCalculator()
+            diff_resolver = NarwhalsDiffResolver()
+            logger.debug(
+                f"Using Polars components for {feature.spec.key} (native not supported)"
+            )
+
+        # Load upstream as Narwhals LazyFrames (stays lazy in SQL for native stores)
+        upstream_refs: dict[str, nw.LazyFrame[Any]] = {}
+        for upstream_spec in plan.deps or []:
+            upstream_key_str = (
+                upstream_spec.key.to_string()
+                if hasattr(upstream_spec.key, "to_string")
+                else "_".join(upstream_spec.key)
+            )
+            # Extract filters for this upstream feature
+            upstream_filters = None
+            if filters and upstream_key_str in filters:
+                upstream_filters = filters[upstream_key_str]
+
+            upstream_lazy = self._read_metadata_native(
+                upstream_spec.key,
+                filters=upstream_filters,  # Apply extracted filters
+            )
+            if upstream_lazy is not None:
+                upstream_refs[upstream_key_str] = upstream_lazy
+
+        # Join upstream using Narwhals (stays lazy)
+        joined, mapping = feature.load_input(
+            joiner=joiner,
+            upstream_refs=upstream_refs,
+        )
+
+        # Calculate data_versions using the selected calculator.
+        # IbisDataVersionCalculator executes hash computation in SQL;
+        # PolarsDataVersionCalculator materializes to compute hashes in memory.
+        target_versions_nw = calculator.calculate_data_versions(
+            joined_upstream=joined,
+            feature_spec=feature.spec,
+            feature_plan=plan,
+            upstream_column_mapping=mapping,
+            hash_algorithm=self.hash_algorithm,
+        )
+
+        # Diff with current metadata (filtered by feature_version at the database level)
+        current_lazy_nw = self._read_metadata_native(
+            feature, feature_version=feature.feature_version()
+        )
+
+        return feature.resolve_data_version_diff(
+            diff_resolver=diff_resolver,
+            target_versions=target_versions_nw,
+            current_metadata=current_lazy_nw,
+            lazy=lazy,
+        )
+
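
The join → calculate → diff pipeline above can also be driven by hand with the Polars-backed components. A sketch under stated assumptions: `MyFeature` (a Feature subclass), prepared `upstream_refs` (Narwhals LazyFrames keyed by feature-key strings), and a `store` supplying the hash algorithm; the call signatures match those used in this method:

from metaxy.data_versioning.calculators.polars import PolarsDataVersionCalculator
from metaxy.data_versioning.diff.narwhals import NarwhalsDiffResolver
from metaxy.data_versioning.joiners.narwhals import NarwhalsJoiner

# Step 1: join upstream metadata (stays lazy).
joined, mapping = MyFeature.load_input(
    joiner=NarwhalsJoiner(),
    upstream_refs=upstream_refs,
)

# Step 2: compute target data_versions in memory.
plan = MyFeature.graph.get_feature_plan(MyFeature.spec.key)
target = PolarsDataVersionCalculator().calculate_data_versions(
    joined_upstream=joined,
    feature_spec=MyFeature.spec,
    feature_plan=plan,
    upstream_column_mapping=mapping,
    hash_algorithm=store.hash_algorithm,
)

# Step 3: diff against current metadata (None here, e.g. a first run:
# everything shows up as added).
diff = NarwhalsDiffResolver().find_changes(
    target_versions=target.select(["sample_uid", "data_version"]),
    current_metadata=None,
)
print(diff.collect().added)
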
+    def _resolve_update_polars(
+        self,
+        feature: type[Feature],
+        *,
+        filters: Mapping[str, Sequence[nw.Expr]] | None = None,
+        lazy: bool = False,
+    ) -> DiffResult | LazyDiffResult:
+        """Resolve using Polars components (cross-store scenario).
+
+        Pulls data from all stores into Polars and performs all operations in memory.
+        Uses Polars components instead of native SQL components because the upstream
+        data is distributed across multiple stores.
+
+        This method is called when upstream features live in fallback stores,
+        requiring materialization to join data from different sources.
+        """
+        import logging
+
+        import polars as pl
+
+        from metaxy.data_versioning.calculators.polars import (
+            PolarsDataVersionCalculator,
+        )
+        from metaxy.data_versioning.diff.narwhals import NarwhalsDiffResolver
+        from metaxy.data_versioning.joiners.narwhals import NarwhalsJoiner
+
+        logger = logging.getLogger(__name__)
+
+        # Warn if native components are available and preferred but can't be used
+        # because of the cross-store scenario.
+        if self._prefer_native and self._supports_native_components():
+            logger.warning(
+                f"Feature {feature.spec.key} has upstream dependencies in fallback stores. "
+                f"Falling back to in-memory Polars processing instead of native SQL execution. "
+                f"For better performance, ensure all upstream features are in the same store."
+            )
+
+        # Load upstream from all sources (this store + fallbacks) as Narwhals LazyFrames
+        upstream_refs = self.read_upstream_metadata(
+            feature, filters=filters, allow_fallback=True
+        )
+
+        # Create Narwhals components (work with any backend)
+        narwhals_joiner = NarwhalsJoiner()
+        polars_calculator = PolarsDataVersionCalculator()  # Still needed for hash calculation
+        narwhals_diff = NarwhalsDiffResolver()
+
+        # Step 1: Join upstream using Narwhals
+        plan = feature.graph.get_feature_plan(feature.spec.key)
+        joined, mapping = feature.load_input(
+            joiner=narwhals_joiner,
+            upstream_refs=upstream_refs,
+        )
+
+        # Step 2: Calculate data_versions.
+        # to_native() returns the underlying type without materializing.
+        joined_native = joined.to_native()
+        if isinstance(joined_native, pl.LazyFrame):
+            joined_pl = joined_native
+        elif isinstance(joined_native, pl.DataFrame):
+            joined_pl = joined_native.lazy()
+        else:
+            # Ibis table - convert to Polars
+            joined_pl = joined_native.to_polars()
+            if isinstance(joined_pl, pl.DataFrame):
+                joined_pl = joined_pl.lazy()
+
+        # Wrap in Narwhals before passing to the calculator
+        joined_nw = nw.from_native(joined_pl, eager_only=False)
+
+        target_versions_nw = polars_calculator.calculate_data_versions(
+            joined_upstream=joined_nw,
+            feature_spec=feature.spec,
+            feature_plan=plan,
+            upstream_column_mapping=mapping,
+            hash_algorithm=self.hash_algorithm,
+        )
+
+        # Select only sample_uid and data_version for the diff.
+        # The calculator returns the full joined frame with upstream columns,
+        # but the diff resolver only needs these two columns.
+        target_versions_nw = target_versions_nw.select(["sample_uid", "data_version"])
+
+        # Step 3: Diff with current metadata (filtered by feature_version at the database level)
+        current_lazy = self._read_metadata_native(
+            feature, feature_version=feature.feature_version()
+        )
+
+        # The diff resolver returns Narwhals frames (lazy or eager based on the flag)
+        return feature.resolve_data_version_diff(
+            diff_resolver=narwhals_diff,
+            target_versions=target_versions_nw,
+            current_metadata=current_lazy,
+            lazy=lazy,
+        )