metaxy-0.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of metaxy might be problematic.

Files changed (75)
  1. metaxy/__init__.py +61 -0
  2. metaxy/_testing.py +542 -0
  3. metaxy/_utils.py +16 -0
  4. metaxy/_version.py +1 -0
  5. metaxy/cli/app.py +76 -0
  6. metaxy/cli/context.py +71 -0
  7. metaxy/cli/graph.py +576 -0
  8. metaxy/cli/graph_diff.py +290 -0
  9. metaxy/cli/list.py +42 -0
  10. metaxy/cli/metadata.py +271 -0
  11. metaxy/cli/migrations.py +862 -0
  12. metaxy/cli/push.py +55 -0
  13. metaxy/config.py +450 -0
  14. metaxy/data_versioning/__init__.py +24 -0
  15. metaxy/data_versioning/calculators/__init__.py +13 -0
  16. metaxy/data_versioning/calculators/base.py +97 -0
  17. metaxy/data_versioning/calculators/duckdb.py +186 -0
  18. metaxy/data_versioning/calculators/ibis.py +225 -0
  19. metaxy/data_versioning/calculators/polars.py +135 -0
  20. metaxy/data_versioning/diff/__init__.py +15 -0
  21. metaxy/data_versioning/diff/base.py +150 -0
  22. metaxy/data_versioning/diff/narwhals.py +108 -0
  23. metaxy/data_versioning/hash_algorithms.py +19 -0
  24. metaxy/data_versioning/joiners/__init__.py +9 -0
  25. metaxy/data_versioning/joiners/base.py +70 -0
  26. metaxy/data_versioning/joiners/narwhals.py +235 -0
  27. metaxy/entrypoints.py +309 -0
  28. metaxy/ext/__init__.py +1 -0
  29. metaxy/ext/alembic.py +326 -0
  30. metaxy/ext/sqlmodel.py +172 -0
  31. metaxy/ext/sqlmodel_system_tables.py +139 -0
  32. metaxy/graph/__init__.py +21 -0
  33. metaxy/graph/diff/__init__.py +21 -0
  34. metaxy/graph/diff/diff_models.py +399 -0
  35. metaxy/graph/diff/differ.py +740 -0
  36. metaxy/graph/diff/models.py +418 -0
  37. metaxy/graph/diff/rendering/__init__.py +18 -0
  38. metaxy/graph/diff/rendering/base.py +274 -0
  39. metaxy/graph/diff/rendering/cards.py +188 -0
  40. metaxy/graph/diff/rendering/formatter.py +805 -0
  41. metaxy/graph/diff/rendering/graphviz.py +246 -0
  42. metaxy/graph/diff/rendering/mermaid.py +320 -0
  43. metaxy/graph/diff/rendering/rich.py +165 -0
  44. metaxy/graph/diff/rendering/theme.py +48 -0
  45. metaxy/graph/diff/traversal.py +247 -0
  46. metaxy/graph/utils.py +58 -0
  47. metaxy/metadata_store/__init__.py +31 -0
  48. metaxy/metadata_store/_protocols.py +38 -0
  49. metaxy/metadata_store/base.py +1676 -0
  50. metaxy/metadata_store/clickhouse.py +161 -0
  51. metaxy/metadata_store/duckdb.py +167 -0
  52. metaxy/metadata_store/exceptions.py +43 -0
  53. metaxy/metadata_store/ibis.py +451 -0
  54. metaxy/metadata_store/memory.py +228 -0
  55. metaxy/metadata_store/sqlite.py +187 -0
  56. metaxy/metadata_store/system_tables.py +257 -0
  57. metaxy/migrations/__init__.py +34 -0
  58. metaxy/migrations/detector.py +153 -0
  59. metaxy/migrations/executor.py +208 -0
  60. metaxy/migrations/loader.py +260 -0
  61. metaxy/migrations/models.py +718 -0
  62. metaxy/migrations/ops.py +390 -0
  63. metaxy/models/__init__.py +0 -0
  64. metaxy/models/bases.py +6 -0
  65. metaxy/models/constants.py +24 -0
  66. metaxy/models/feature.py +665 -0
  67. metaxy/models/feature_spec.py +105 -0
  68. metaxy/models/field.py +25 -0
  69. metaxy/models/plan.py +155 -0
  70. metaxy/models/types.py +157 -0
  71. metaxy/py.typed +0 -0
  72. metaxy-0.0.0.dist-info/METADATA +247 -0
  73. metaxy-0.0.0.dist-info/RECORD +75 -0
  74. metaxy-0.0.0.dist-info/WHEEL +4 -0
  75. metaxy-0.0.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,1676 @@
+ """Abstract base class for metadata storage backends."""
+
+ from __future__ import annotations
+
+ import json
+ from abc import ABC, abstractmethod
+ from collections.abc import Mapping, Sequence
+ from datetime import datetime, timezone
+ from typing import TYPE_CHECKING, Any, Literal, TypeGuard, overload
+
+ import narwhals as nw
+ import polars as pl
+ from typing_extensions import Self
+
+ from metaxy.data_versioning.calculators.base import DataVersionCalculator
+ from metaxy.data_versioning.calculators.polars import PolarsDataVersionCalculator
+ from metaxy.data_versioning.diff import DiffResult, LazyDiffResult
+ from metaxy.data_versioning.diff.base import MetadataDiffResolver
+ from metaxy.data_versioning.diff.narwhals import NarwhalsDiffResolver
+ from metaxy.data_versioning.hash_algorithms import HashAlgorithm
+ from metaxy.data_versioning.joiners.base import UpstreamJoiner
+ from metaxy.data_versioning.joiners.narwhals import NarwhalsJoiner
+ from metaxy.metadata_store.exceptions import (
+     DependencyError,
+     FeatureNotFoundError,
+     StoreNotOpenError,
+ )
+ from metaxy.metadata_store.system_tables import (
+     FEATURE_VERSIONS_KEY,
+     FEATURE_VERSIONS_SCHEMA,
+     SYSTEM_NAMESPACE,
+     _suppress_feature_version_warning,
+     allow_feature_version_override,
+ )
+ from metaxy.models.feature import Feature, FeatureGraph
+ from metaxy.models.field import FieldDep, SpecialFieldDep
+ from metaxy.models.plan import FeaturePlan, FQFieldKey
+ from metaxy.models.types import FeatureKey, FieldKey
+
+ if TYPE_CHECKING:
+     pass
+
+ # Removed TRef - all stores now use Narwhals LazyFrames universally
+
+
+ def _is_using_polars_components(
+     components: tuple[UpstreamJoiner, DataVersionCalculator, MetadataDiffResolver],
+ ) -> TypeGuard[
+     tuple[NarwhalsJoiner, PolarsDataVersionCalculator, NarwhalsDiffResolver]
+ ]:
+     """Type guard to check if using Polars-backed components.
+
+     Returns True if all components are Polars/Narwhals-based, allowing type narrowing.
+     """
+     joiner, calculator, diff_resolver = components
+     return (
+         isinstance(joiner, NarwhalsJoiner)
+         and isinstance(calculator, PolarsDataVersionCalculator)
+         and isinstance(diff_resolver, NarwhalsDiffResolver)
+     )
+
+
+ class MetadataStore(ABC):
+     """
+     Abstract base class for metadata storage backends.
+
+     Supports:
+     - Immutable metadata storage (append-only)
+     - Composable fallback store chains (for branch deployments)
+     - Automatic data version calculation using a three-component architecture
+     - Backend-specific computation optimizations
+
+     All stores use Narwhals LazyFrames as their universal interface,
+     regardless of the underlying backend (Polars, Ibis/SQL, etc.).
+
+     Components:
+         Components are created on-demand in resolve_update() based on:
+         - User preference (prefer_native flag)
+         - Whether all upstream data is local (or needs fallback stores)
+         - Store capabilities (whether it supports native data version calculations)
+
+         If prefer_native=True and all conditions are met: use native components (Ibis, DuckDB, etc.).
+         Otherwise: use Polars components.
+
+     Subclasses declare what native data version calculations they support via abstract methods.
+
+     Context Manager:
+         Stores must be used as context managers for resource management.
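+
+         Example (illustrative; assumes the DuckDB backend class in
+         metaxy/metadata_store/duckdb.py is named DuckDBMetadataStore):
+             >>> with DuckDBMetadataStore("metadata.duckdb") as store:
+             ...     store.list_features()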
+ """
90
+
91
+ def __init__(
92
+ self,
93
+ *,
94
+ hash_algorithm: HashAlgorithm | None = None,
95
+ prefer_native: bool = True,
96
+ fallback_stores: list[MetadataStore] | None = None,
97
+ ):
98
+ """
99
+ Initialize metadata store.
100
+
101
+ Args:
102
+ hash_algorithm: Hash algorithm to use for data versioning.
103
+ Default: None (uses default algorithm for this store type)
104
+ prefer_native: If True, prefer native data version calculations when possible.
105
+ If False, always use Polars components. Default: True
106
+ fallback_stores: Ordered list of read-only fallback stores.
107
+ Used when upstream features are not in this store.
108
+
109
+ Raises:
110
+ ValueError: If fallback stores use different hash algorithms
111
+ """
112
+ # Initialize state early so properties can check it
113
+ self._is_open = False
114
+ self._context_depth = 0
115
+ self._prefer_native = prefer_native
116
+
117
+ # Use store's default algorithm if not specified
118
+ if hash_algorithm is None:
119
+ hash_algorithm = self._get_default_hash_algorithm()
120
+
121
+ self.hash_algorithm = hash_algorithm
122
+ self.fallback_stores = fallback_stores or []
123
+
124
+ # Validation happens in open()
125
+
126
+ @abstractmethod
127
+ def _get_default_hash_algorithm(self) -> HashAlgorithm:
128
+ """Get the default hash algorithm for this store type.
129
+
130
+ Returns:
131
+ Default hash algorithm
132
+ """
133
+ pass
134
+
135
+ @abstractmethod
136
+ def _supports_native_components(self) -> bool:
137
+ """Check if this store can use native (non-Polars) components.
138
+
139
+ Returns:
140
+ True if store has backend-specific native data version calculations
141
+ False if store only supports Polars components
142
+ """
143
+ pass
144
+
145
+ @abstractmethod
146
+ def _create_native_components(
147
+ self,
148
+ ) -> tuple[UpstreamJoiner, DataVersionCalculator, MetadataDiffResolver]:
149
+ """Create native data version calculations for this store.
150
+
151
+ Only called if _supports_native_components() returns True.
152
+
153
+ Returns:
154
+ Tuple of (joiner, calculator, diff_resolver) with appropriate types
155
+ for this store's backend (Narwhals-compatible)
156
+
157
+ Raises:
158
+ NotImplementedError: If store doesn't support native data version calculations
159
+ """
160
+ pass
161
+
162
+ @abstractmethod
163
+ def open(self) -> None:
164
+ """Open/initialize the store for operations.
165
+
166
+ Called by __enter__. Subclasses implement connection setup here.
167
+ Can be called manually but context manager usage is recommended.
168
+ """
169
+ pass
170
+
171
+ @abstractmethod
172
+ def close(self) -> None:
173
+ """Close/cleanup the store.
174
+
175
+ Called by __exit__. Subclasses implement connection cleanup here.
176
+ Can be called manually but context manager usage is recommended.
177
+ """
178
+ pass
179
+
180
+ def __enter__(self) -> Self:
181
+ """Enter context manager."""
182
+ # Track nesting depth
183
+ self._context_depth += 1
184
+
185
+ # Only open on first enter
186
+ if self._context_depth == 1:
187
+ self.open()
188
+ self._is_open = True
189
+
190
+ # Validate after opening (when all components are ready)
191
+ self._validate_after_open()
192
+
193
+ return self
194
+
195
+ def _validate_after_open(self) -> None:
196
+ """Validate configuration after store is opened.
197
+
198
+ Called automatically by __enter__ after open().
199
+ Validates hash algorithm compatibility and fallback store consistency.
200
+ """
201
+ # Validate hash algorithm compatibility with components
202
+ self.validate_hash_algorithm(check_fallback_stores=True)
203
+
204
+ # Validate fallback stores use the same hash algorithm
205
+ for i, fallback_store in enumerate(self.fallback_stores):
206
+ if fallback_store.hash_algorithm != self.hash_algorithm:
207
+ raise ValueError(
208
+ f"Fallback store {i} uses hash_algorithm='{fallback_store.hash_algorithm.value}' "
209
+ f"but this store uses '{self.hash_algorithm.value}'. "
210
+ f"All stores in a fallback chain must use the same hash algorithm."
211
+ )
212
+
213
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
214
+ """Exit context manager."""
215
+ # Decrement depth
216
+ self._context_depth -= 1
217
+
218
+ # Only close when fully exited
219
+ if self._context_depth == 0:
220
+ self._is_open = False
221
+ self.close()
222
+
223
+ def _check_open(self) -> None:
224
+ """Check if store is open, raise error if not.
225
+
226
+ Raises:
227
+ StoreNotOpenError: If store is not open
228
+ """
229
+ if not self._is_open:
230
+ raise StoreNotOpenError(
231
+ f"{self.__class__.__name__} must be opened before use. "
232
+ "Use it as a context manager: `with store: ...`"
233
+ )
234
+
235
+ # ========== Hash Algorithm Validation ==========
236
+
237
+ def validate_hash_algorithm(
238
+ self,
239
+ check_fallback_stores: bool = True,
240
+ ) -> None:
241
+ """Validate that hash algorithm is supported by this store's components.
242
+
243
+ Public method - can be called to verify hash compatibility.
244
+
245
+ Args:
246
+ check_fallback_stores: If True, also validate hash is supported by
247
+ fallback stores (ensures compatibility for future cross-store operations)
248
+
249
+ Raises:
250
+ ValueError: If hash algorithm not supported by components or fallback stores
251
+ """
252
+ # Check if this store can support the algorithm
253
+ # Try native data version calculations first (if supported), then Polars
254
+ supported_algorithms = []
255
+
256
+ if self._supports_native_components():
257
+ try:
258
+ _, calculator, _ = self._create_native_components()
259
+ supported_algorithms = calculator.supported_algorithms
260
+ except Exception:
261
+ # If native data version calculations fail, fall back to Polars
262
+ pass
263
+
264
+ # If no native support or prefer_native=False, use Polars
265
+ if not supported_algorithms:
266
+ polars_calc = PolarsDataVersionCalculator()
267
+ supported_algorithms = polars_calc.supported_algorithms
268
+
269
+ if self.hash_algorithm not in supported_algorithms:
270
+ from metaxy.metadata_store.exceptions import (
271
+ HashAlgorithmNotSupportedError,
272
+ )
273
+
274
+ raise HashAlgorithmNotSupportedError(
275
+ f"Hash algorithm {self.hash_algorithm} not supported by {self.__class__.__name__}. "
276
+ f"Supported: {supported_algorithms}"
277
+ )
278
+
279
+ # Check fallback stores
280
+ if check_fallback_stores:
281
+ for fallback in self.fallback_stores:
282
+ fallback.validate_hash_algorithm(check_fallback_stores=False)
283
+
284
+ # ========== Helper Methods ==========
285
+
286
+ def _is_system_table(self, feature_key: FeatureKey) -> bool:
287
+ """Check if feature key is a system table."""
288
+ return len(feature_key) >= 1 and feature_key[0] == SYSTEM_NAMESPACE
289
+
290
+ def _resolve_feature_key(self, feature: FeatureKey | type[Feature]) -> FeatureKey:
291
+ """Resolve a Feature class or FeatureKey to FeatureKey."""
292
+ if isinstance(feature, FeatureKey):
293
+ return feature
294
+ else:
295
+ return feature.spec.key
296
+
297
+ def _resolve_feature_plan(self, feature: FeatureKey | type[Feature]) -> FeaturePlan:
298
+ """Resolve to FeaturePlan for dependency resolution."""
299
+ if isinstance(feature, FeatureKey):
300
+ # When given a FeatureKey, get the graph from the active context
301
+ return FeatureGraph.get_active().get_feature_plan(feature)
302
+ else:
303
+ # When given a Feature class, use its bound graph
304
+ return feature.graph.get_feature_plan(feature.spec.key)
305
+
306
+ # ========== Core CRUD Operations ==========
307
+
308
+ @abstractmethod
309
+ def _write_metadata_impl(
310
+ self,
311
+ feature_key: FeatureKey,
312
+ df: pl.DataFrame,
313
+ ) -> None:
314
+ """
315
+ Internal write implementation (backend-specific).
316
+
317
+ Args:
318
+ feature_key: Feature key to write to
319
+ df: DataFrame with metadata (already validated)
320
+
321
+ Note: Subclasses implement this for their storage backend.
322
+ """
323
+ pass
324
+
325
+ def write_metadata(
326
+ self,
327
+ feature: FeatureKey | type[Feature],
328
+ df: nw.DataFrame[Any] | pl.DataFrame,
329
+ ) -> None:
330
+ """
331
+ Write metadata for a feature (immutable, append-only).
332
+
333
+ Automatically adds 'feature_version' column from current code state,
334
+ unless the DataFrame already contains one (useful for migrations).
335
+
336
+ Args:
337
+ feature: Feature to write metadata for
338
+ df: Narwhals DataFrame or Polars DataFrame containing metadata.
339
+ Must have 'data_version' column of type Struct with fields matching feature's fields.
340
+ May optionally contain 'feature_version' column (for migrations).
341
+
342
+ Raises:
343
+ MetadataSchemaError: If DataFrame schema is invalid
344
+ StoreNotOpenError: If store is not open
345
+
346
+ Note:
347
+ - Always writes to current store, never to fallback stores.
348
+ - If df already contains 'feature_version' column, it will be used
349
+ as-is (no replacement). This allows migrations to write historical
350
+ versions. A warning is issued unless suppressed via context manager.
351
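+
+         Example (illustrative; MyFeature stands in for a registered Feature subclass):
+             >>> df = pl.DataFrame({
+             ...     "sample_uid": ["s1", "s2"],
+             ...     "data_version": [{"field": "h1"}, {"field": "h2"}],
+             ... })
+             >>> store.write_metadata(MyFeature, df)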
+ """
352
+ self._check_open()
353
+ feature_key = self._resolve_feature_key(feature)
354
+ is_system_table = self._is_system_table(feature_key)
355
+
356
+ # Convert Narwhals to Polars if needed
357
+ if isinstance(df, nw.DataFrame):
358
+ df = df.to_polars()
359
+ # nw.DataFrame also matches as DataFrame in some contexts, ensure it's Polars
360
+ if not isinstance(df, pl.DataFrame):
361
+ # Must be some other type - shouldn't happen but handle defensively
362
+ if hasattr(df, "to_polars"):
363
+ df = df.to_polars()
364
+ elif hasattr(df, "to_pandas"):
365
+ df = pl.from_pandas(df.to_pandas())
366
+ else:
367
+ raise TypeError(f"Cannot convert {type(df)} to Polars DataFrame")
368
+
369
+ # For system tables, write directly without feature_version tracking
370
+ if is_system_table:
371
+ self._validate_schema_system_table(df)
372
+ self._write_metadata_impl(feature_key, df)
373
+ return
374
+
375
+ # For regular features: add feature_version and snapshot_version, validate, and write
376
+ # Check if feature_version and snapshot_version already exist in DataFrame
377
+ if "feature_version" in df.columns and "snapshot_version" in df.columns:
378
+ # DataFrame already has feature_version and snapshot_version - use as-is
379
+ # This is intended for migrations writing historical versions
380
+ # Issue a warning unless we're in a suppression context
381
+ if not _suppress_feature_version_warning.get():
382
+ import warnings
383
+
384
+ warnings.warn(
385
+ f"Writing metadata for {feature_key.to_string()} with existing "
386
+ f"feature_version and snapshot_version columns. This is intended for migrations only. "
387
+ f"Normal code should let write_metadata() add the current versions automatically.",
388
+ UserWarning,
389
+ stacklevel=2,
390
+ )
391
+ else:
392
+ # Get current feature version and snapshot_version from code and add them
393
+ if isinstance(feature, type) and issubclass(feature, Feature):
394
+ current_feature_version = feature.feature_version() # type: ignore[attr-defined]
395
+ else:
396
+ from metaxy.models.feature import FeatureGraph
397
+
398
+ graph = FeatureGraph.get_active()
399
+ feature_cls = graph.features_by_key[feature_key]
400
+ current_feature_version = feature_cls.feature_version() # type: ignore[attr-defined]
401
+
402
+ # Get snapshot_version from active graph
403
+ from metaxy.models.feature import FeatureGraph
404
+
405
+ graph = FeatureGraph.get_active()
406
+ current_snapshot_version = graph.snapshot_version
407
+
408
+ df = df.with_columns(
409
+ [
410
+ pl.lit(current_feature_version).alias("feature_version"),
411
+ pl.lit(current_snapshot_version).alias("snapshot_version"),
412
+ ]
413
+ )
414
+
415
+ # Validate schema
416
+ self._validate_schema(df)
417
+
418
+ # Write metadata
419
+ self._write_metadata_impl(feature_key, df)
420
+
421
+ def _validate_schema(self, df: pl.DataFrame) -> None:
422
+ """
423
+ Validate that DataFrame has required schema.
424
+
425
+ Args:
426
+ df: DataFrame to validate
427
+
428
+ Raises:
429
+ MetadataSchemaError: If schema is invalid
430
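+
+         Example (illustrative frame that passes validation):
+             >>> df = pl.DataFrame({
+             ...     "sample_uid": ["s1"],
+             ...     "data_version": [{"field": "abc123"}],
+             ...     "feature_version": ["fv1"],
+             ...     "snapshot_version": ["sv1"],
+             ... })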
+ """
431
+ from metaxy.metadata_store.exceptions import MetadataSchemaError
432
+
433
+ # Check for data_version column
434
+ if "data_version" not in df.columns:
435
+ raise MetadataSchemaError("DataFrame must have 'data_version' column")
436
+
437
+ # Check that data_version is a struct
438
+ data_version_type = df.schema["data_version"]
439
+ if not isinstance(data_version_type, pl.Struct):
440
+ raise MetadataSchemaError(
441
+ f"'data_version' column must be pl.Struct, got {data_version_type}"
442
+ )
443
+
444
+ # Check for feature_version column
445
+ if "feature_version" not in df.columns:
446
+ raise MetadataSchemaError("DataFrame must have 'feature_version' column")
447
+
448
+ # Check for snapshot_version column
449
+ if "snapshot_version" not in df.columns:
450
+ raise MetadataSchemaError("DataFrame must have 'snapshot_version' column")
451
+
452
+ def _validate_schema_system_table(self, df: pl.DataFrame) -> None:
453
+ """Validate schema for system tables (minimal validation)."""
454
+ # System tables don't need data_version column
455
+ pass
456
+
457
+ @abstractmethod
458
+ def _drop_feature_metadata_impl(self, feature_key: FeatureKey) -> None:
459
+ """Drop/delete all metadata for a feature.
460
+
461
+ Backend-specific implementation for dropping feature metadata.
462
+
463
+ Args:
464
+ feature_key: The feature key to drop metadata for
465
+ """
466
+ pass
467
+
468
+ def drop_feature_metadata(self, feature: FeatureKey | type[Feature]) -> None:
469
+ """Drop all metadata for a feature.
470
+
471
+ This removes all stored metadata for the specified feature from the store.
472
+ Useful for cleanup in tests or when re-computing feature metadata from scratch.
473
+
474
+ Args:
475
+ feature: Feature class or key to drop metadata for
476
+
477
+ Example:
478
+ >>> store.drop_feature_metadata(MyFeature)
479
+ >>> assert not store.has_feature(MyFeature)
480
+ """
481
+ self._check_open()
482
+ feature_key = self._resolve_feature_key(feature)
483
+ self._drop_feature_metadata_impl(feature_key)
484
+
485
+ def record_feature_graph_snapshot(self) -> tuple[str, bool]:
486
+ """Record all features in graph with a graph snapshot version.
487
+
488
+ This should be called during CD (Continuous Deployment) to record what
489
+ feature versions are being deployed. Typically invoked via `metaxy push`.
490
+
491
+ Records all features in the graph with the same snapshot_version, representing
492
+ a consistent state of the entire feature graph based on code definitions.
493
+
494
+ The snapshot_version is a deterministic hash of all feature_version hashes
495
+ in the graph, making it idempotent - calling multiple times with the
496
+ same feature definitions produces the same snapshot_version.
497
+
498
+ Returns:
499
+ A tuple containing the generated snapshot_version (deterministic hash) and a boolean indicating if the snapshot was recorded or already exists.
500
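+
+         Example (illustrative):
+             >>> with store:
+             ...     snapshot_version, already_existed = store.record_feature_graph_snapshot()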
+ """
501
+
502
+ from metaxy.models.feature import FeatureGraph
503
+
504
+ graph = FeatureGraph.get_active()
505
+
506
+ # Use to_snapshot() to get the snapshot dict
507
+ snapshot_dict = graph.to_snapshot()
508
+
509
+ # Generate deterministic snapshot_version from graph
510
+ snapshot_version = graph.snapshot_version
511
+
512
+ # Read existing feature versions once
513
+ try:
514
+ existing_versions_lazy = self._read_metadata_native(FEATURE_VERSIONS_KEY)
515
+ # Materialize to Polars for iteration
516
+ existing_versions = (
517
+ existing_versions_lazy.collect().to_polars()
518
+ if existing_versions_lazy is not None
519
+ else None
520
+ )
521
+ except Exception:
522
+ # Table doesn't exist yet
523
+ existing_versions = None
524
+
525
+ # Check if this exact snapshot already exists
526
+ snapshot_already_exists = False
527
+ if existing_versions is not None:
528
+ snapshot_already_exists = (
529
+ existing_versions.filter(
530
+ pl.col("snapshot_version") == snapshot_version
531
+ ).height
532
+ > 0
533
+ )
534
+
535
+ # If snapshot already exists, we're done (idempotent)
536
+ if snapshot_already_exists:
537
+ return snapshot_version, True
538
+
539
+ # Build records from snapshot_dict
540
+ records = []
541
+ for feature_key_str in sorted(snapshot_dict.keys()):
542
+ feature_data = snapshot_dict[feature_key_str]
543
+
544
+ # Serialize complete FeatureSpec
545
+ feature_spec_json = json.dumps(feature_data["feature_spec"])
546
+
547
+ # Always record all features for this snapshot (don't skip based on feature_version alone)
548
+ # Each snapshot must be complete to support migration detection
549
+ records.append(
550
+ {
551
+ "feature_key": feature_key_str,
552
+ "feature_version": feature_data["feature_version"],
553
+ "recorded_at": datetime.now(timezone.utc),
554
+ "feature_spec": feature_spec_json,
555
+ "feature_class_path": feature_data["feature_class_path"],
556
+ "snapshot_version": snapshot_version,
557
+ }
558
+ )
559
+
560
+ # Bulk write all new records at once
561
+ if records:
562
+ version_records = pl.DataFrame(
563
+ records,
564
+ schema=FEATURE_VERSIONS_SCHEMA,
565
+ )
566
+ self._write_metadata_impl(FEATURE_VERSIONS_KEY, version_records)
567
+
568
+ return snapshot_version, False
569
+
570
+ @abstractmethod
571
+ def _read_metadata_native(
572
+ self,
573
+ feature: FeatureKey | type[Feature],
574
+ *,
575
+ feature_version: str | None = None,
576
+ filters: Sequence[nw.Expr] | None = None,
577
+ columns: Sequence[str] | None = None,
578
+ ) -> nw.LazyFrame[Any] | None:
579
+ """
580
+ Read metadata from THIS store only (no fallback).
581
+
582
+ Args:
583
+ feature: Feature to read metadata for
584
+ feature_version: Filter by specific feature_version (applied natively in store)
585
+ filters: List of Narwhals filter expressions for this specific feature.
586
+ columns: Subset of columns to return
587
+
588
+ Returns:
589
+ Narwhals LazyFrame with metadata, or None if feature not found locally
590
+ """
591
+ pass
592
+
593
+ def read_metadata(
594
+ self,
595
+ feature: FeatureKey | type[Feature],
596
+ *,
597
+ feature_version: str | None = None,
598
+ filters: Sequence[nw.Expr] | None = None,
599
+ columns: Sequence[str] | None = None,
600
+ allow_fallback: bool = True,
601
+ current_only: bool = True,
602
+ ) -> nw.LazyFrame[Any]:
603
+ """
604
+ Read metadata with optional fallback to upstream stores.
605
+
606
+ Args:
607
+ feature: Feature to read metadata for
608
+ feature_version: Explicit feature_version to filter by (mutually exclusive with current_only=True)
609
+ filters: Sequence of Narwhals filter expressions to apply to this feature.
610
+ Example: [nw.col("x") > 10, nw.col("y") < 5]
611
+ columns: Subset of columns to return
612
+ allow_fallback: If True, check fallback stores on local miss
613
+ current_only: If True, only return rows with current feature_version
614
+ (default: True for safety)
615
+
616
+ Returns:
617
+ Narwhals LazyFrame with metadata
618
+
619
+ Raises:
620
+ FeatureNotFoundError: If feature not found in any store
621
+ ValueError: If both feature_version and current_only=True are provided
622
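+
+         Example (illustrative):
+             >>> lf = store.read_metadata(MyFeature, filters=[nw.col("x") > 10])
+             >>> df = lf.collect()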
+ """
623
+ feature_key = self._resolve_feature_key(feature)
624
+ is_system_table = self._is_system_table(feature_key)
625
+
626
+ # Validate mutually exclusive parameters
627
+ if feature_version is not None and current_only:
628
+ raise ValueError(
629
+ "Cannot specify both feature_version and current_only=True. "
630
+ "Use current_only=False with feature_version parameter."
631
+ )
632
+
633
+ # Determine which feature_version to use
634
+ feature_version_filter = feature_version
635
+ if current_only and not is_system_table:
636
+ # Get current feature_version
637
+ if isinstance(feature, type) and issubclass(feature, Feature):
638
+ feature_version_filter = feature.feature_version() # type: ignore[attr-defined]
639
+ else:
640
+ from metaxy.models.feature import FeatureGraph
641
+
642
+ graph = FeatureGraph.get_active()
643
+ # Only try to get from graph if feature_key exists in graph
644
+ # This allows reading system tables or external features not in current graph
645
+ if feature_key in graph.features_by_key:
646
+ feature_cls = graph.features_by_key[feature_key]
647
+ feature_version_filter = feature_cls.feature_version() # type: ignore[attr-defined]
648
+ else:
649
+ # Feature not in graph - skip feature_version filtering
650
+ feature_version_filter = None
651
+
652
+ # Try local first with filters
653
+ lazy_frame = self._read_metadata_native(
654
+ feature,
655
+ feature_version=feature_version_filter,
656
+ filters=filters, # Pass filters directly
657
+ columns=columns,
658
+ )
659
+
660
+ if lazy_frame is not None:
661
+ return lazy_frame
662
+
663
+ # Try fallback stores
664
+ if allow_fallback:
665
+ for store in self.fallback_stores:
666
+ try:
667
+ # Use full read_metadata to handle nested fallback chains
668
+ return store.read_metadata(
669
+ feature,
670
+ feature_version=feature_version,
671
+ filters=filters, # Pass through filters directly
672
+ columns=columns,
673
+ allow_fallback=True,
674
+ current_only=current_only, # Pass through current_only
675
+ )
676
+ except FeatureNotFoundError:
677
+ # Try next fallback store
678
+ continue
679
+
680
+ # Not found anywhere
681
+ raise FeatureNotFoundError(
682
+ f"Feature {feature_key.to_string()} not found in store"
683
+ + (" or fallback stores" if allow_fallback else "")
684
+ )
685
+
686
+ # ========== Feature Existence ==========
687
+
688
+ def has_feature(
689
+ self,
690
+ feature: FeatureKey | type[Feature],
691
+ *,
692
+ check_fallback: bool = False,
693
+ ) -> bool:
694
+ """
695
+ Check if feature exists in store.
696
+
697
+ Args:
698
+ feature: Feature to check
699
+ check_fallback: If True, also check fallback stores
700
+
701
+ Returns:
702
+ True if feature exists, False otherwise
703
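+
+         Example (illustrative):
+             >>> if not store.has_feature(MyFeature, check_fallback=True):
+             ...     ...  # compute and write metadata first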
+ """
704
+ # Check local
705
+ if self._read_metadata_native(feature) is not None:
706
+ return True
707
+
708
+ # Check fallback stores
709
+ if check_fallback:
710
+ for store in self.fallback_stores:
711
+ if store.has_feature(feature, check_fallback=True):
712
+ return True
713
+
714
+ return False
715
+
716
+ def list_features(self, *, include_fallback: bool = False) -> list[FeatureKey]:
717
+ """
718
+ List all features in store.
719
+
720
+ Args:
721
+ include_fallback: If True, include features from fallback stores
722
+
723
+ Returns:
724
+ List of FeatureKey objects
725
+
726
+ Raises:
727
+ StoreNotOpenError: If store is not open
728
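+
+         Example (illustrative):
+             >>> with store:
+             ...     for key in store.list_features(include_fallback=True):
+             ...         print(key.to_string())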
+ """
729
+ self._check_open()
730
+
731
+ features = self._list_features_local()
732
+
733
+ if include_fallback:
734
+ for store in self.fallback_stores:
735
+ features.extend(store.list_features(include_fallback=True))
736
+
737
+ # Deduplicate
738
+ seen = set()
739
+ unique_features = []
740
+ for feature in features:
741
+ key_str = feature.to_string()
742
+ if key_str not in seen:
743
+ seen.add(key_str)
744
+ unique_features.append(feature)
745
+
746
+ return unique_features
747
+
748
+ @abstractmethod
749
+ def _list_features_local(self) -> list[FeatureKey]:
750
+ """List features in THIS store only."""
751
+ pass
752
+
753
+ def read_graph_snapshots(self) -> pl.DataFrame:
754
+ """Read all recorded graph snapshots from the feature_versions system table.
755
+
756
+ Returns a DataFrame with columns:
757
+ - snapshot_version: Unique identifier for each graph snapshot
758
+ - recorded_at: Timestamp when the snapshot was recorded
759
+ - feature_count: Number of features in this snapshot
760
+
761
+ Returns:
762
+ Polars DataFrame with snapshot information, sorted by recorded_at descending
763
+
764
+ Raises:
765
+ StoreNotOpenError: If store is not open
766
+
767
+ Example:
768
+ >>> with store:
769
+ ... snapshots = store.read_graph_snapshots()
770
+ ... latest_snapshot = snapshots["snapshot_version"][0]
771
+ ... print(f"Latest snapshot: {latest_snapshot}")
772
+ """
773
+ self._check_open()
774
+
775
+ versions_lazy = self._read_metadata_native(FEATURE_VERSIONS_KEY)
776
+ if versions_lazy is None:
777
+ # No snapshots recorded yet
778
+ return pl.DataFrame(
779
+ schema={
780
+ "snapshot_version": pl.String,
781
+ "recorded_at": pl.Datetime("us"),
782
+ "feature_count": pl.UInt32,
783
+ }
784
+ )
785
+
786
+ versions_df = versions_lazy.collect().to_polars()
787
+
788
+ # Group by snapshot_version and get earliest recorded_at and count
789
+ snapshots = (
790
+ versions_df.group_by("snapshot_version")
791
+ .agg(
792
+ [
793
+ pl.col("recorded_at").min().alias("recorded_at"),
794
+ pl.col("feature_key").count().alias("feature_count"),
795
+ ]
796
+ )
797
+ .sort("recorded_at", descending=True)
798
+ )
799
+
800
+ return snapshots
801
+
802
+ def read_features(
803
+ self,
804
+ *,
805
+ current: bool = True,
806
+ snapshot_version: str | None = None,
807
+ ) -> pl.DataFrame:
808
+ """Read feature version information from the feature_versions system table.
809
+
810
+ Args:
811
+ current: If True, only return features from the current code snapshot.
812
+ If False, must provide snapshot_version.
813
+ snapshot_version: Specific snapshot version to filter by. Required if current=False.
814
+
815
+ Returns:
816
+ Polars DataFrame with columns from FEATURE_VERSIONS_SCHEMA:
817
+ - feature_key: Feature identifier
818
+ - feature_version: Version hash of the feature
819
+ - recorded_at: When this version was recorded
820
+ - feature_spec: JSON serialized feature specification
821
+ - feature_class_path: Python import path to the feature class
822
+ - snapshot_version: Graph snapshot this feature belongs to
823
+
824
+ Raises:
825
+ StoreNotOpenError: If store is not open
826
+ ValueError: If current=False but no snapshot_version provided
827
+
828
+ Examples:
829
+ >>> # Get features from current code
830
+ >>> with store:
831
+ ... features = store.read_features(current=True)
832
+ ... print(f"Current graph has {len(features)} features")
833
+
834
+ >>> # Get features from a specific snapshot
835
+ >>> with store:
836
+ ... features = store.read_features(current=False, snapshot_version="abc123")
837
+ ... for row in features.iter_rows(named=True):
838
+ ... print(f"{row['feature_key']}: {row['feature_version']}")
839
+ """
840
+ self._check_open()
841
+
842
+ if not current and snapshot_version is None:
843
+ raise ValueError("Must provide snapshot_version when current=False")
844
+
845
+ versions_lazy = self._read_metadata_native(FEATURE_VERSIONS_KEY)
846
+ if versions_lazy is None:
847
+ # No features recorded yet
848
+ return pl.DataFrame(schema=FEATURE_VERSIONS_SCHEMA)
849
+
850
+ if current:
851
+ # Get current snapshot from active graph
852
+ graph = FeatureGraph.get_active()
853
+ snapshot_version = graph.snapshot_version
854
+
855
+ # Filter by snapshot_version
856
+ versions_df = (
857
+ versions_lazy.filter(nw.col("snapshot_version") == snapshot_version)
858
+ .collect()
859
+ .to_polars()
860
+ )
861
+
862
+ return versions_df
863
+
864
+ def copy_metadata(
865
+ self,
866
+ from_store: MetadataStore,
867
+ features: list[FeatureKey | type[Feature]] | None = None,
868
+ *,
869
+ from_snapshot: str | None = None,
870
+ filters: Mapping[str, Sequence[nw.Expr]] | None = None,
871
+ incremental: bool = True,
872
+ ) -> dict[str, int]:
873
+ """Copy metadata from another store with fine-grained filtering.
874
+
875
+ This is a reusable method that can be called programmatically or from CLI/migrations.
876
+ Copies metadata for specified features, preserving the original snapshot_version.
877
+
878
+ Args:
879
+ from_store: Source metadata store to copy from (must be opened)
880
+ features: List of features to copy. Can be:
881
+ - None: copies all features from source store
882
+ - List of FeatureKey or Feature classes: copies specified features
883
+ from_snapshot: Snapshot version to filter source data by. If None, uses latest snapshot
884
+ from source store. Only rows with this snapshot_version will be copied.
885
+ The snapshot_version is preserved in the destination store.
886
+ filters: Dict mapping feature keys (as strings) to sequences of Narwhals filter expressions.
887
+ These filters are applied when reading from the source store.
888
+ Example: {"feature/key": [nw.col("x") > 10], "other/feature": [...]}
889
+ incremental: If True (default), filter out rows that already exist in the destination
890
+ store by performing an anti-join on sample_uid for the same snapshot_version.
891
+
892
+ The implementation uses an anti-join: source LEFT ANTI JOIN destination ON sample_uid
893
+ filtered by snapshot_version.
894
+
895
+ Disabling incremental (incremental=False) may improve performance when:
896
+ - You know the destination is empty or has no overlap with source
897
+ - The destination store uses deduplication
898
+
899
+ When incremental=False, it's the user's responsibility to avoid duplicates or
900
+ configure deduplication at the storage layer.
901
+
902
+ Returns:
903
+ Dict with statistics: {"features_copied": int, "rows_copied": int}
904
+
905
+ Raises:
906
+ ValueError: If from_store or self (destination) is not open
907
+ FeatureNotFoundError: If a specified feature doesn't exist in source store
908
+
909
+ Examples:
910
+ >>> # Simple: copy all features from latest snapshot
911
+ >>> stats = dest_store.copy_metadata(from_store=source_store)
912
+
913
+ >>> # Copy specific features from a specific snapshot
914
+ >>> stats = dest_store.copy_metadata(
915
+ ... from_store=source_store,
916
+ ... features=[FeatureKey(["my_feature"])],
917
+ ... from_snapshot="abc123",
918
+ ... )
919
+
920
+ >>> # Copy with filters
921
+ >>> stats = dest_store.copy_metadata(
922
+ ... from_store=source_store,
923
+ ... filters={"my/feature": [nw.col("sample_uid").is_in(["s1", "s2"])]},
924
+ ... )
925
+
926
+ >>> # Copy specific features with filters
927
+ >>> stats = dest_store.copy_metadata(
928
+ ... from_store=source_store,
929
+ ... features=[
930
+ ... FeatureKey(["feature_a"]),
931
+ ... FeatureKey(["feature_b"]),
932
+ ... ],
933
+ ... filters={
934
+ ... "feature_a": [nw.col("field_a") > 10, nw.col("sample_uid").is_in(["s1", "s2"])],
935
+ ... "feature_b": [nw.col("field_b") < 30],
936
+ ... },
937
+ ... )
938
+ """
939
+ import logging
940
+
941
+ logger = logging.getLogger(__name__)
942
+
943
+ # Validate destination store is open
944
+ if not self._is_open:
945
+ raise ValueError("Destination store must be opened (use context manager)")
946
+
947
+ # Automatically handle source store context manager
948
+ should_close_source = not from_store._is_open
949
+ if should_close_source:
950
+ from_store.__enter__()
951
+
952
+ try:
953
+ return self._copy_metadata_impl(
954
+ from_store=from_store,
955
+ features=features,
956
+ from_snapshot=from_snapshot,
957
+ filters=filters,
958
+ incremental=incremental,
959
+ logger=logger,
960
+ )
961
+ finally:
962
+ if should_close_source:
963
+ from_store.__exit__(None, None, None)
964
+
965
+ def _copy_metadata_impl(
966
+ self,
967
+ from_store: MetadataStore,
968
+ features: list[FeatureKey | type[Feature]] | None,
969
+ from_snapshot: str | None,
970
+ filters: Mapping[str, Sequence[nw.Expr]] | None,
971
+ incremental: bool,
972
+ logger,
973
+ ) -> dict[str, int]:
974
+ """Internal implementation of copy_metadata."""
975
+ # Determine which features to copy
976
+ features_to_copy: list[FeatureKey]
977
+ if features is None:
978
+ # Copy all features from source
979
+ features_to_copy = from_store.list_features(include_fallback=False)
980
+ logger.info(
981
+ f"Copying all features from source: {len(features_to_copy)} features"
982
+ )
983
+ else:
984
+ # Convert all to FeatureKey
985
+ features_to_copy = []
986
+ for item in features:
987
+ if isinstance(item, FeatureKey):
988
+ features_to_copy.append(item)
989
+ else:
990
+ # Must be Feature class
991
+ features_to_copy.append(item.spec.key)
992
+ logger.info(f"Copying {len(features_to_copy)} specified features")
993
+
994
+ # Determine from_snapshot
995
+ if from_snapshot is None:
996
+ # Get latest snapshot from source store
997
+ try:
998
+ versions_lazy = from_store._read_metadata_native(FEATURE_VERSIONS_KEY)
999
+ if versions_lazy is None:
1000
+ # No feature_versions table yet - if no features to copy, that's okay
1001
+ if len(features_to_copy) == 0:
1002
+ logger.info(
1003
+ "No features to copy and no snapshots in source store"
1004
+ )
1005
+ from_snapshot = None # Will be set later if needed
1006
+ else:
1007
+ raise ValueError(
1008
+ "Source store has no feature_versions table. Cannot determine snapshot."
1009
+ )
1010
+ elif versions_lazy is not None:
1011
+ versions_df = versions_lazy.collect().to_polars()
1012
+ if versions_df.height == 0:
1013
+ # Empty versions table - if no features to copy, that's okay
1014
+ if len(features_to_copy) == 0:
1015
+ logger.info(
1016
+ "No features to copy and no snapshots in source store"
1017
+ )
1018
+ from_snapshot = None
1019
+ else:
1020
+ raise ValueError(
1021
+ "Source store feature_versions table is empty. No snapshots found."
1022
+ )
1023
+ else:
1024
+ # Get most recent snapshot_version by recorded_at
1025
+ from_snapshot = (
1026
+ versions_df.sort("recorded_at", descending=True)
1027
+ .select("snapshot_version")
1028
+ .head(1)["snapshot_version"][0]
1029
+ )
1030
+ logger.info(
1031
+ f"Using latest snapshot from source: {from_snapshot}"
1032
+ )
1033
+ except Exception as e:
1034
+ # If we have no features to copy, continue gracefully
1035
+ if len(features_to_copy) == 0:
1036
+ logger.info(f"No features to copy: {e}")
1037
+ from_snapshot = None
1038
+ else:
1039
+ raise ValueError(
1040
+ f"Could not determine latest snapshot from source store: {e}"
1041
+ )
1042
+ else:
1043
+ logger.info(f"Using specified from_snapshot: {from_snapshot}")
1044
+
1045
+ # Copy metadata for each feature
1046
+ total_rows = 0
1047
+ features_copied = 0
1048
+
1049
+ with allow_feature_version_override():
1050
+ for feature_key in features_to_copy:
1051
+ try:
1052
+ # Read metadata from source, filtering by from_snapshot
1053
+ # Use current_only=False to avoid filtering by feature_version
1054
+ source_lazy = from_store.read_metadata(
1055
+ feature_key,
1056
+ allow_fallback=False,
1057
+ current_only=False,
1058
+ )
1059
+
1060
+ # Filter by from_snapshot
1061
+ import narwhals as nw
1062
+
1063
+ source_filtered = source_lazy.filter(
1064
+ nw.col("snapshot_version") == from_snapshot
1065
+ )
1066
+
1067
+ # Apply filters for this feature (if any)
1068
+ if filters:
1069
+ feature_key_str = feature_key.to_string()
1070
+ if feature_key_str in filters:
1071
+ for filter_expr in filters[feature_key_str]:
1072
+ source_filtered = source_filtered.filter(filter_expr)
1073
+
1074
+ # Apply incremental filtering if enabled
1075
+ if incremental:
1076
+ try:
1077
+ # Read existing sample_uids from destination for the same snapshot
1078
+ # This is much cheaper than comparing data_version structs
1079
+ dest_lazy = self.read_metadata(
1080
+ feature_key,
1081
+ allow_fallback=False,
1082
+ current_only=False,
1083
+ )
1084
+ # Filter destination to same snapshot_version
1085
+ dest_for_snapshot = dest_lazy.filter(
1086
+ nw.col("snapshot_version") == from_snapshot
1087
+ )
1088
+
1089
+ # Materialize destination sample_uids to avoid cross-backend join issues
1090
+ # When copying between different stores (e.g., different DuckDB files),
1091
+ # Ibis can't join tables from different backends
1092
+ dest_sample_uids = (
1093
+ dest_for_snapshot.select("sample_uid")
1094
+ .collect()
1095
+ .to_polars()
1096
+ )
1097
+
1098
+ # Convert to Polars LazyFrame and wrap in Narwhals
1099
+ dest_sample_uids_lazy = nw.from_native(
1100
+ dest_sample_uids.lazy(), eager_only=False
1101
+ )
1102
+
1103
+ # Collect source to Polars for anti-join
1104
+ source_df = source_filtered.collect().to_polars()
1105
+ source_lazy = nw.from_native(
1106
+ source_df.lazy(), eager_only=False
1107
+ )
1108
+
1109
+ # Anti-join: keep only source rows with sample_uid not in destination
1110
+ source_filtered = source_lazy.join(
1111
+ dest_sample_uids_lazy,
1112
+ on="sample_uid",
1113
+ how="anti",
1114
+ )
1115
+
1116
+ # Collect after filtering
1117
+ source_df = source_filtered.collect().to_polars()
1118
+
1119
+ logger.info(
1120
+ f"Incremental: copying only new sample_uids for {feature_key.to_string()}"
1121
+ )
1122
+ except FeatureNotFoundError:
1123
+ # Feature doesn't exist in destination yet - copy all rows
1124
+ logger.debug(
1125
+ f"Feature {feature_key.to_string()} not in destination, copying all rows"
1126
+ )
1127
+ source_df = source_filtered.collect().to_polars()
1128
+ except Exception as e:
1129
+ # If incremental check fails, log warning but continue with full copy
1130
+ logger.warning(
1131
+ f"Incremental check failed for {feature_key.to_string()}: {e}. Copying all rows."
1132
+ )
1133
+ source_df = source_filtered.collect().to_polars()
1134
+ else:
1135
+ # Non-incremental: collect all filtered rows
1136
+ source_df = source_filtered.collect().to_polars()
1137
+
1138
+ if source_df.height == 0:
1139
+ logger.warning(
1140
+ f"No rows found for {feature_key.to_string()} with snapshot {from_snapshot}, skipping"
1141
+ )
1142
+ continue
1143
+
1144
+ # Write to destination (preserving snapshot_version and feature_version)
1145
+ self.write_metadata(feature_key, source_df)
1146
+
1147
+ features_copied += 1
1148
+ total_rows += source_df.height
1149
+ logger.info(
1150
+ f"Copied {source_df.height} rows for {feature_key.to_string()}"
1151
+ )
1152
+
1153
+ except FeatureNotFoundError:
1154
+ logger.warning(
1155
+ f"Feature {feature_key.to_string()} not found in source store, skipping"
1156
+ )
1157
+ continue
1158
+ except Exception as e:
1159
+ logger.error(
1160
+ f"Error copying {feature_key.to_string()}: {e}", exc_info=True
1161
+ )
1162
+ raise
1163
+
1164
+ logger.info(
1165
+ f"Copy complete: {features_copied} features, {total_rows} total rows"
1166
+ )
1167
+
1168
+ return {"features_copied": features_copied, "rows_copied": total_rows}
1169
+
1170
+ # ========== Dependency Resolution ==========
1171
+
1172
+ def read_upstream_metadata(
1173
+ self,
1174
+ feature: FeatureKey | type[Feature],
1175
+ field: FieldKey | None = None,
1176
+ *,
1177
+ filters: Mapping[str, Sequence[nw.Expr]] | None = None,
1178
+ allow_fallback: bool = True,
1179
+ current_only: bool = True,
1180
+ ) -> dict[str, nw.LazyFrame[Any]]:
1181
+ """
1182
+ Read all upstream dependencies for a feature/field.
1183
+
1184
+ Args:
1185
+ feature: Feature whose dependencies to load
1186
+ field: Specific field (if None, loads all deps for feature)
1187
+ filters: Dict mapping feature keys (as strings) to lists of Narwhals filter expressions.
1188
+ Example: {"upstream/feature1": [nw.col("x") > 10], "upstream/feature2": [...]}
1189
+ allow_fallback: Whether to check fallback stores
1190
+ current_only: If True, only read current feature_version for upstream
1191
+
1192
+ Returns:
1193
+ Dict mapping upstream feature keys (as strings) to Narwhals LazyFrames.
1194
+ Each LazyFrame has a 'data_version' column (Struct).
1195
+
1196
+ Raises:
1197
+ DependencyError: If required upstream feature is missing
1198
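+
+         Example (illustrative):
+             >>> upstream = store.read_upstream_metadata(MyFeature)
+             >>> for key, lf in upstream.items():
+             ...     print(key, lf.collect().to_polars().height)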
+ """
1199
+ plan = self._resolve_feature_plan(feature)
1200
+
1201
+ # Get all upstream features we need
1202
+ upstream_features = set()
1203
+
1204
+ if field is None:
1205
+ # All fields' dependencies
1206
+ for cont in plan.feature.fields:
1207
+ upstream_features.update(self._get_field_dependencies(plan, cont.key))
1208
+ else:
1209
+ # Specific field's dependencies
1210
+ upstream_features.update(self._get_field_dependencies(plan, field))
1211
+
1212
+ # Load metadata for each upstream feature
1213
+ # Use the feature's graph to look up upstream feature classes
1214
+ if isinstance(feature, FeatureKey):
1215
+ from metaxy.models.feature import FeatureGraph
1216
+
1217
+ graph = FeatureGraph.get_active()
1218
+ else:
1219
+ graph = feature.graph
1220
+
1221
+ upstream_metadata = {}
1222
+ for upstream_fq_key in upstream_features:
1223
+ upstream_feature_key = upstream_fq_key.feature
1224
+
1225
+ # Extract filters for this specific upstream feature
1226
+ upstream_filters = None
1227
+ if filters:
1228
+ upstream_key_str = upstream_feature_key.to_string()
1229
+ if upstream_key_str in filters:
1230
+ upstream_filters = filters[upstream_key_str]
1231
+
1232
+ try:
1233
+ # Look up the Feature class from the graph and pass it to read_metadata
1234
+ # This way we use the bound graph instead of relying on active context
1235
+ upstream_feature_cls = graph.features_by_key[upstream_feature_key]
1236
+ lazy_frame = self.read_metadata(
1237
+ upstream_feature_cls,
1238
+ filters=upstream_filters, # Pass extracted filters (Sequence or None)
1239
+ allow_fallback=allow_fallback,
1240
+ current_only=current_only, # Pass through current_only
1241
+ )
1242
+ # Use string key for dict
1243
+ upstream_metadata[upstream_feature_key.to_string()] = lazy_frame
1244
+ except FeatureNotFoundError as e:
1245
+ raise DependencyError(
1246
+ f"Missing upstream feature {upstream_feature_key.to_string()} "
1247
+ f"required by {plan.feature.key.to_string()}"
1248
+ ) from e
1249
+
1250
+ return upstream_metadata
1251
+
1252
+ def _get_field_dependencies(
1253
+ self, plan: FeaturePlan, field_key: FieldKey
1254
+ ) -> set[FQFieldKey]:
1255
+ """Get all upstream field dependencies for a given field."""
1256
+ field = plan.feature.fields_by_key[field_key]
1257
+ upstream = set()
1258
+
1259
+ if field.deps == SpecialFieldDep.ALL:
1260
+ # All upstream features and fields
1261
+ upstream.update(plan.all_parent_fields_by_key.keys())
1262
+ elif isinstance(field.deps, list):
1263
+ for dep in field.deps:
1264
+ if isinstance(dep, FieldDep):
1265
+ if dep.fields == SpecialFieldDep.ALL:
1266
+ # All fields of this feature
1267
+ upstream_feature = plan.parent_features_by_key[dep.feature_key]
1268
+ for upstream_field in upstream_feature.fields:
1269
+ upstream.add(
1270
+ FQFieldKey(
1271
+ feature=dep.feature_key,
1272
+ field=upstream_field.key,
1273
+ )
1274
+ )
1275
+ elif isinstance(dep.fields, list):
1276
+ # Specific fields
1277
+ for field_key in dep.fields:
1278
+ upstream.add(
1279
+ FQFieldKey(feature=dep.feature_key, field=field_key)
1280
+ )
1281
+
1282
+ return upstream
1283
+
1284
+ # ========== Data Version Calculation ==========
1285
+
1286
+ # ========== Data Versioning API ==========
1287
+
1288
+ @overload
1289
+ def resolve_update(
1290
+ self,
1291
+ feature: type[Feature],
1292
+ *,
1293
+ samples: nw.DataFrame[Any] | nw.LazyFrame[Any] | None = None,
1294
+ filters: Mapping[str, Sequence[nw.Expr]] | None = None,
1295
+ lazy: Literal[False] = False,
1296
+ **kwargs,
1297
+ ) -> DiffResult: ...
1298
+
1299
+ @overload
1300
+ def resolve_update(
1301
+ self,
1302
+ feature: type[Feature],
1303
+ *,
1304
+ samples: nw.DataFrame[Any] | nw.LazyFrame[Any] | None = None,
1305
+ filters: Mapping[str, Sequence[nw.Expr]] | None = None,
1306
+ lazy: Literal[True],
1307
+ **kwargs,
1308
+ ) -> LazyDiffResult: ...
1309
+
1310
+ def resolve_update(
1311
+ self,
1312
+ feature: type[Feature],
1313
+ *,
1314
+ samples: nw.DataFrame[Any] | nw.LazyFrame[Any] | None = None,
1315
+ filters: Mapping[str, Sequence[nw.Expr]] | None = None,
1316
+ lazy: bool = False,
1317
+ **kwargs,
1318
+ ) -> DiffResult | LazyDiffResult:
1319
+ """Resolve what needs updating for a feature.
1320
+
1321
+ Primary user-facing method. Automatically chooses optimal strategy:
1322
+ 1. Root features without samples → Raise error (samples required)
1323
+ 2. All upstream local → Use native data version calculations (stay in DB)
1324
+ 3. Some upstream in fallback stores → Pull to memory (Polars)
1325
+ 4. samples provided → Use as pre-calculated target versions (escape hatch)
1326
+
1327
+ Args:
1328
+ feature: Feature class to resolve updates for
1329
+ samples: **Escape hatch parameter**. Pre-computed DataFrame with sample_uid
1330
+ and data_version columns. When provided, skips upstream loading, joining,
1331
+ and data version calculation - goes straight to diff.
1332
+
1333
+ **Required for root features** (features with no upstream dependencies).
1334
+ Root features don't have upstream to calculate data_version from, so users
1335
+ must provide samples with manually computed data_version.
1336
+
1337
+ **Optional for non-root features** as an escape hatch. Use this when you
1338
+ want to bypass the automatic upstream loading and data version calculation.
1339
+ Examples:
1340
+ - Loading upstream from custom sources
1341
+ - Pre-computing data versions with custom logic
1342
+ - Testing specific scenarios
1343
+
1344
+ **Normal usage**: Don't provide this parameter. The system will automatically
1345
+ load upstream features and calculate data versions.
1346
+
1347
+ filters: Dict mapping feature keys (as strings) to lists of Narwhals filter expressions.
1348
+ Applied when reading upstream metadata to filter samples at the source.
1349
+ Example: {"upstream/feature": [nw.col("x") > 10], ...}
1350
+ lazy: If True, return LazyDiffResult with lazy Narwhals LazyFrames.
1351
+ If False, return DiffResult with eager Narwhals DataFrames (default).
1352
+ **kwargs: Backend-specific parameters (reserved for future use)
1353
+
1354
+ Returns:
1355
+ DiffResult (eager, default) or LazyDiffResult (lazy) with:
1356
+ - added: New samples not in current metadata
1357
+ - changed: Existing samples with different data_versions
1358
+ - removed: Samples in current but not in upstream
1359
+
1360
+ Each frame has columns: [sample_uid, data_version, ...user columns...]
1361
+
1362
+ Raises:
1363
+ ValueError: If samples not provided for root features (no upstream)
1364
+
1365
+ Examples:
1366
+ >>> # Root feature - samples required
1367
+ >>> samples = pl.DataFrame({
1368
+ ... "sample_uid": [1, 2, 3],
1369
+ ... "data_version": [{"field": "h1"}, {"field": "h2"}, {"field": "h3"}],
1370
+ ... })
1371
+ >>> result = store.resolve_update(RootFeature, samples=nw.from_native(samples))
1372
+
1373
+ >>> # Non-root feature - automatic (normal usage)
1374
+ >>> result = store.resolve_update(DownstreamFeature)
1375
+
1376
+ >>> # Non-root feature - with escape hatch (advanced)
1377
+ >>> custom_samples = compute_custom_data_versions(...)
1378
+ >>> result = store.resolve_update(DownstreamFeature, samples=custom_samples)
1379
+
1380
+ Note:
1381
+ Users can then process only added/changed and call write_metadata().
1382
+ """
1383
+ import narwhals as nw
1384
+
1385
+ plan = feature.graph.get_feature_plan(feature.spec.key)
1386
+
1387
+ # Escape hatch: if samples provided, use them directly (skip join/calculation)
1388
+ if samples is not None:
1389
+ import logging
1390
+
1391
+ import polars as pl
1392
+
1393
+ logger = logging.getLogger(__name__)
1394
+
1395
+ # Convert samples to lazy if needed
1396
+ samples_lazy = (
1397
+ samples
1398
+ if isinstance(samples, nw.LazyFrame)
1399
+ else nw.from_native(samples.to_native().lazy())
1400
+ )
1401
+
1402
+ # Check if samples are Polars-backed (common case for escape hatch)
1403
+ samples_native = samples_lazy.to_native()
1404
+ is_polars_samples = isinstance(samples_native, (pl.DataFrame, pl.LazyFrame))
1405
+
1406
+ if is_polars_samples and self._supports_native_components():
1407
+ # User provided Polars samples but store uses native (SQL) backend
1408
+ # Need to materialize current metadata to Polars for compatibility
1409
+ logger.warning(
1410
+ f"Feature {feature.spec.key}: samples parameter is Polars-backed but store uses native SQL backend. "
1411
+ f"Materializing current metadata to Polars for diff comparison. "
1412
+ f"For better performance, consider using samples with backend matching the store's backend."
1413
+ )
1414
+ # Get current metadata and materialize to Polars
1415
+ current_lazy_native = self._read_metadata_native(
1416
+ feature, feature_version=feature.feature_version()
1417
+ )
1418
+ if current_lazy_native is not None:
1419
+ # Convert to Polars using Narwhals' built-in method
1420
+ current_lazy = nw.from_native(
1421
+ current_lazy_native.collect().to_polars().lazy()
1422
+ )
1423
+ else:
1424
+ current_lazy = None
1425
+ else:
1426
+ # Same backend or no conversion needed - direct read
1427
+ current_lazy = self._read_metadata_native(
1428
+ feature, feature_version=feature.feature_version()
1429
+ )
1430
+
1431
+ # Use diff resolver to compare samples with current
1432
+ from metaxy.data_versioning.diff.narwhals import NarwhalsDiffResolver
1433
+
1434
+ diff_resolver = NarwhalsDiffResolver()
1435
+
1436
+ lazy_result = diff_resolver.find_changes(
1437
+ target_versions=samples_lazy,
1438
+ current_metadata=current_lazy,
1439
+ )
1440
+
1441
+ return lazy_result if lazy else lazy_result.collect()
1442
+
1443
+ # Root features without samples: error (samples required)
1444
+ if not plan.deps:
1445
+ raise ValueError(
1446
+ f"Feature {feature.spec.key} has no upstream dependencies (root feature). "
1447
+ f"Must provide 'samples' parameter with sample_uid and data_version columns. "
1448
+ f"Root features require manual data_version computation."
1449
+ )
+
+        # Non-root features without samples: automatic upstream loading.
+        # Check where the upstream data lives.
+        upstream_location = self._check_upstream_location(feature)
+
+        if upstream_location == "all_local":
+            # All upstream in this store - use native data version calculations
+            return self._resolve_update_native(feature, filters=filters, lazy=lazy)
+        else:
+            # Some upstream in fallback stores - use Polars components
+            return self._resolve_update_polars(feature, filters=filters, lazy=lazy)
+
+    def _check_upstream_location(self, feature: type[Feature]) -> str:
+        """Check whether all upstream features live in this store or some are in fallback stores.
+
+        Returns:
+            "all_local" if all upstream features are in this store
+            "has_fallback" if any upstream feature is only in a fallback store
+        """
+        plan = feature.graph.get_feature_plan(feature.spec.key)
+
+        if not plan.deps:
+            return "all_local"  # No dependencies
+
+        for upstream_spec in plan.deps:
+            if not self.has_feature(upstream_spec.key, check_fallback=False):
+                return "has_fallback"  # At least one upstream is in a fallback store
+
+        return "all_local"
+
+    def _resolve_update_native(
+        self,
+        feature: type[Feature],
+        *,
+        filters: Mapping[str, Sequence[nw.Expr]] | None = None,
+        lazy: bool = False,
+    ) -> DiffResult | LazyDiffResult:
+        """Resolve using native data version calculations (all data in this store).
+
+        Uses native data version calculators when available (e.g., IbisDataVersionCalculator
+        for SQL stores) to execute operations in the database without pulling data into memory.
+
+        For stores that support native data version calculations (DuckDB, ClickHouse), this method:
+        - Executes joins and diffs lazily via Narwhals
+        - Computes hashes using native SQL functions (xxHash64, MD5, etc.)
+        - Keeps data out of memory; the final diff is collected only when lazy=False
+
+        For stores without native support, falls back to PolarsDataVersionCalculator.
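+
+        The pipeline is: join upstream metadata, calculate data_versions, then
+        diff against current metadata, with each step staying lazy where the
+        backend allows.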
+        """
+        import logging
+
+        logger = logging.getLogger(__name__)
+        plan = feature.graph.get_feature_plan(feature.spec.key)
+
+        # Root features should be handled in resolve_update() with the samples parameter;
+        # this method should only be called for features with upstream dependencies.
+        if not plan.deps:
+            raise RuntimeError(
+                f"Internal error: _resolve_update_native called for root feature {feature.spec.key}. "
+                f"Root features should be handled in resolve_update() with the samples parameter."
+            )
+
+        # Create components based on native support.
+        # Only fall back to Polars if the store explicitly doesn't support
+        # native data version calculations.
+        if self._supports_native_components():
+            joiner, calculator, diff_resolver = self._create_native_components()
+            logger.debug(
+                f"Using native calculator for {feature.spec.key}: {calculator.__class__.__name__}"
+            )
+        else:
+            # Store doesn't support native data version calculations - use Polars
+            from metaxy.data_versioning.calculators.polars import (
+                PolarsDataVersionCalculator,
+            )
+            from metaxy.data_versioning.diff.narwhals import NarwhalsDiffResolver
+            from metaxy.data_versioning.joiners.narwhals import NarwhalsJoiner
+
+            joiner = NarwhalsJoiner()
+            calculator = PolarsDataVersionCalculator()
+            diff_resolver = NarwhalsDiffResolver()
+            logger.debug(
+                f"Using Polars components for {feature.spec.key} (native not supported)"
+            )
+
+        # Load upstream as Narwhals LazyFrames (stays lazy in SQL for native stores)
+        upstream_refs: dict[str, nw.LazyFrame[Any]] = {}
+        for upstream_spec in plan.deps:
+            upstream_key_str = (
+                upstream_spec.key.to_string()
+                if hasattr(upstream_spec.key, "to_string")
+                else "_".join(upstream_spec.key)
+            )
+            # Extract filters for this upstream feature
+            upstream_filters = filters.get(upstream_key_str) if filters else None
+
+            upstream_lazy = self._read_metadata_native(
+                upstream_spec.key,
+                filters=upstream_filters,  # Apply extracted filters
+            )
+            if upstream_lazy is not None:
+                upstream_refs[upstream_key_str] = upstream_lazy
+
+        # Join upstream using Narwhals (stays lazy)
+        joined, mapping = feature.load_input(
+            joiner=joiner,
+            upstream_refs=upstream_refs,
+        )
+
+        # Calculate data_versions using the selected calculator.
+        # IbisDataVersionCalculator executes hash computation in SQL;
+        # PolarsDataVersionCalculator materializes to compute hashes in memory.
+        target_versions_nw = calculator.calculate_data_versions(
+            joined_upstream=joined,
+            feature_spec=feature.spec,
+            feature_plan=plan,
+            upstream_column_mapping=mapping,
+            hash_algorithm=self.hash_algorithm,
+        )
+
+        # Diff with current metadata (filtered by feature_version at the database level)
+        current_lazy_nw = self._read_metadata_native(
+            feature, feature_version=feature.feature_version()
+        )
+
+        return feature.resolve_data_version_diff(
+            diff_resolver=diff_resolver,
+            target_versions=target_versions_nw,
+            current_metadata=current_lazy_nw,
+            lazy=lazy,
+        )
+
+    def _resolve_update_polars(
+        self,
+        feature: type[Feature],
+        *,
+        filters: Mapping[str, Sequence[nw.Expr]] | None = None,
+        lazy: bool = False,
+    ) -> DiffResult | LazyDiffResult:
+        """Resolve using Polars components (cross-store scenario).
+
+        Pulls data from all stores into Polars and performs all operations in
+        memory. Uses Polars components instead of native SQL components because
+        the upstream data is distributed across multiple stores.
+
+        This method is called when upstream features live in fallback stores,
+        requiring materialization to join data from different sources.
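+
+        Expect higher memory usage than the native path, since all upstream
+        metadata is collected into Polars before joining.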
+        """
+        import logging
+
+        import polars as pl
+
+        from metaxy.data_versioning.calculators.polars import (
+            PolarsDataVersionCalculator,
+        )
+        from metaxy.data_versioning.diff.narwhals import NarwhalsDiffResolver
+        from metaxy.data_versioning.joiners.narwhals import NarwhalsJoiner
+
+        logger = logging.getLogger(__name__)
+
+        # Warn if native components are available and preferred but can't be
+        # used because of the cross-store scenario.
+        if self._prefer_native and self._supports_native_components():
+            logger.warning(
+                f"Feature {feature.spec.key} has upstream dependencies in fallback stores. "
+                f"Falling back to in-memory Polars processing instead of native SQL execution. "
+                f"For better performance, ensure all upstream features are in the same store."
+            )
+
+        # Load upstream from all sources (this store + fallbacks) as Narwhals LazyFrames
+        upstream_refs = self.read_upstream_metadata(
+            feature, filters=filters, allow_fallback=True
+        )
+
+        # Create Narwhals components (they work with any backend)
+        narwhals_joiner = NarwhalsJoiner()
+        polars_calculator = PolarsDataVersionCalculator()  # Still needed for hash calculation
+        narwhals_diff = NarwhalsDiffResolver()
+
+        # Step 1: Join upstream using Narwhals
+        plan = feature.graph.get_feature_plan(feature.spec.key)
+        joined, mapping = feature.load_input(
+            joiner=narwhals_joiner,
+            upstream_refs=upstream_refs,
+        )
+
+        # Step 2: Calculate data_versions.
+        # to_native() returns the underlying frame without materializing.
+        joined_native = joined.to_native()
+        if isinstance(joined_native, pl.LazyFrame):
+            joined_pl = joined_native
+        elif isinstance(joined_native, pl.DataFrame):
+            joined_pl = joined_native.lazy()
+        else:
+            # Ibis table - convert to Polars
+            joined_pl = joined_native.to_polars()
+            if isinstance(joined_pl, pl.DataFrame):
+                joined_pl = joined_pl.lazy()
+
+        # Wrap in Narwhals before passing to the calculator
+        joined_nw = nw.from_native(joined_pl, eager_only=False)
+
+        target_versions_nw = polars_calculator.calculate_data_versions(
+            joined_upstream=joined_nw,
+            feature_spec=feature.spec,
+            feature_plan=plan,
+            upstream_column_mapping=mapping,
+            hash_algorithm=self.hash_algorithm,
+        )
+
+        # Select only sample_uid and data_version for the diff: the calculator
+        # returns the full joined frame with upstream columns, but the diff
+        # resolver only needs these two.
+        target_versions_nw = target_versions_nw.select(["sample_uid", "data_version"])
+
+        # Step 3: Diff with current metadata (filtered by feature_version at the database level)
+        current_lazy = self._read_metadata_native(
+            feature, feature_version=feature.feature_version()
+        )
+
+        # The diff resolver returns Narwhals frames (lazy or eager based on the flag)
+        return feature.resolve_data_version_diff(
+            diff_resolver=narwhals_diff,
+            target_versions=target_versions_nw,
+            current_metadata=current_lazy,
+            lazy=lazy,
+        )