metaxy 0.0.1.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. metaxy/__init__.py +170 -0
  2. metaxy/_packaging.py +96 -0
  3. metaxy/_testing/__init__.py +55 -0
  4. metaxy/_testing/config.py +43 -0
  5. metaxy/_testing/metaxy_project.py +780 -0
  6. metaxy/_testing/models.py +111 -0
  7. metaxy/_testing/parametric/__init__.py +13 -0
  8. metaxy/_testing/parametric/metadata.py +664 -0
  9. metaxy/_testing/pytest_helpers.py +74 -0
  10. metaxy/_testing/runbook.py +533 -0
  11. metaxy/_utils.py +35 -0
  12. metaxy/_version.py +1 -0
  13. metaxy/cli/app.py +97 -0
  14. metaxy/cli/console.py +13 -0
  15. metaxy/cli/context.py +167 -0
  16. metaxy/cli/graph.py +610 -0
  17. metaxy/cli/graph_diff.py +290 -0
  18. metaxy/cli/list.py +46 -0
  19. metaxy/cli/metadata.py +317 -0
  20. metaxy/cli/migrations.py +999 -0
  21. metaxy/cli/utils.py +268 -0
  22. metaxy/config.py +680 -0
  23. metaxy/entrypoints.py +296 -0
  24. metaxy/ext/__init__.py +1 -0
  25. metaxy/ext/dagster/__init__.py +54 -0
  26. metaxy/ext/dagster/constants.py +10 -0
  27. metaxy/ext/dagster/dagster_type.py +156 -0
  28. metaxy/ext/dagster/io_manager.py +200 -0
  29. metaxy/ext/dagster/metaxify.py +512 -0
  30. metaxy/ext/dagster/observable.py +115 -0
  31. metaxy/ext/dagster/resources.py +27 -0
  32. metaxy/ext/dagster/selection.py +73 -0
  33. metaxy/ext/dagster/table_metadata.py +417 -0
  34. metaxy/ext/dagster/utils.py +462 -0
  35. metaxy/ext/sqlalchemy/__init__.py +23 -0
  36. metaxy/ext/sqlalchemy/config.py +29 -0
  37. metaxy/ext/sqlalchemy/plugin.py +353 -0
  38. metaxy/ext/sqlmodel/__init__.py +13 -0
  39. metaxy/ext/sqlmodel/config.py +29 -0
  40. metaxy/ext/sqlmodel/plugin.py +499 -0
  41. metaxy/graph/__init__.py +29 -0
  42. metaxy/graph/describe.py +325 -0
  43. metaxy/graph/diff/__init__.py +21 -0
  44. metaxy/graph/diff/diff_models.py +446 -0
  45. metaxy/graph/diff/differ.py +769 -0
  46. metaxy/graph/diff/models.py +443 -0
  47. metaxy/graph/diff/rendering/__init__.py +18 -0
  48. metaxy/graph/diff/rendering/base.py +323 -0
  49. metaxy/graph/diff/rendering/cards.py +188 -0
  50. metaxy/graph/diff/rendering/formatter.py +805 -0
  51. metaxy/graph/diff/rendering/graphviz.py +246 -0
  52. metaxy/graph/diff/rendering/mermaid.py +326 -0
  53. metaxy/graph/diff/rendering/rich.py +169 -0
  54. metaxy/graph/diff/rendering/theme.py +48 -0
  55. metaxy/graph/diff/traversal.py +247 -0
  56. metaxy/graph/status.py +329 -0
  57. metaxy/graph/utils.py +58 -0
  58. metaxy/metadata_store/__init__.py +32 -0
  59. metaxy/metadata_store/_ducklake_support.py +419 -0
  60. metaxy/metadata_store/base.py +1792 -0
  61. metaxy/metadata_store/bigquery.py +354 -0
  62. metaxy/metadata_store/clickhouse.py +184 -0
  63. metaxy/metadata_store/delta.py +371 -0
  64. metaxy/metadata_store/duckdb.py +446 -0
  65. metaxy/metadata_store/exceptions.py +61 -0
  66. metaxy/metadata_store/ibis.py +542 -0
  67. metaxy/metadata_store/lancedb.py +391 -0
  68. metaxy/metadata_store/memory.py +292 -0
  69. metaxy/metadata_store/system/__init__.py +57 -0
  70. metaxy/metadata_store/system/events.py +264 -0
  71. metaxy/metadata_store/system/keys.py +9 -0
  72. metaxy/metadata_store/system/models.py +129 -0
  73. metaxy/metadata_store/system/storage.py +957 -0
  74. metaxy/metadata_store/types.py +10 -0
  75. metaxy/metadata_store/utils.py +104 -0
  76. metaxy/metadata_store/warnings.py +36 -0
  77. metaxy/migrations/__init__.py +32 -0
  78. metaxy/migrations/detector.py +291 -0
  79. metaxy/migrations/executor.py +516 -0
  80. metaxy/migrations/generator.py +319 -0
  81. metaxy/migrations/loader.py +231 -0
  82. metaxy/migrations/models.py +528 -0
  83. metaxy/migrations/ops.py +447 -0
  84. metaxy/models/__init__.py +0 -0
  85. metaxy/models/bases.py +12 -0
  86. metaxy/models/constants.py +139 -0
  87. metaxy/models/feature.py +1335 -0
  88. metaxy/models/feature_spec.py +338 -0
  89. metaxy/models/field.py +263 -0
  90. metaxy/models/fields_mapping.py +307 -0
  91. metaxy/models/filter_expression.py +297 -0
  92. metaxy/models/lineage.py +285 -0
  93. metaxy/models/plan.py +232 -0
  94. metaxy/models/types.py +475 -0
  95. metaxy/py.typed +0 -0
  96. metaxy/utils/__init__.py +1 -0
  97. metaxy/utils/constants.py +2 -0
  98. metaxy/utils/exceptions.py +23 -0
  99. metaxy/utils/hashing.py +230 -0
  100. metaxy/versioning/__init__.py +31 -0
  101. metaxy/versioning/engine.py +656 -0
  102. metaxy/versioning/feature_dep_transformer.py +151 -0
  103. metaxy/versioning/ibis.py +249 -0
  104. metaxy/versioning/lineage_handler.py +205 -0
  105. metaxy/versioning/polars.py +189 -0
  106. metaxy/versioning/renamed_df.py +35 -0
  107. metaxy/versioning/types.py +63 -0
  108. metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
  109. metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
  110. metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
  111. metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
metaxy/metadata_store/base.py
@@ -0,0 +1,1792 @@
1
+ """Abstract base class for metadata storage backends."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from collections.abc import Iterator, Mapping, Sequence
7
+ from contextlib import AbstractContextManager, contextmanager
8
+ from types import TracebackType
9
+ from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast, overload
10
+
11
+ import narwhals as nw
12
+ from narwhals.typing import Frame, IntoFrame
13
+ from pydantic import Field
14
+ from pydantic_settings import BaseSettings, SettingsConfigDict
15
+ from typing_extensions import Self
16
+
17
+ from metaxy._utils import switch_implementation_to_polars
18
+ from metaxy.config import MetaxyConfig
19
+ from metaxy.metadata_store.exceptions import (
20
+ FeatureNotFoundError,
21
+ StoreNotOpenError,
22
+ SystemDataNotFoundError,
23
+ VersioningEngineMismatchError,
24
+ )
25
+ from metaxy.metadata_store.system.keys import METAXY_SYSTEM_KEY_PREFIX
26
+ from metaxy.metadata_store.types import AccessMode
27
+ from metaxy.metadata_store.utils import (
28
+ _suppress_feature_version_warning,
29
+ allow_feature_version_override,
30
+ empty_frame_like,
31
+ )
32
+ from metaxy.metadata_store.warnings import (
33
+ MetaxyColumnMissingWarning,
34
+ PolarsMaterializationWarning,
35
+ )
36
+ from metaxy.models.constants import (
37
+ ALL_SYSTEM_COLUMNS,
38
+ METAXY_CREATED_AT,
39
+ METAXY_DATA_VERSION,
40
+ METAXY_DATA_VERSION_BY_FIELD,
41
+ METAXY_FEATURE_VERSION,
42
+ METAXY_MATERIALIZATION_ID,
43
+ METAXY_PROVENANCE,
44
+ METAXY_PROVENANCE_BY_FIELD,
45
+ METAXY_SNAPSHOT_VERSION,
46
+ )
47
+ from metaxy.models.feature import BaseFeature, FeatureGraph, current_graph
48
+ from metaxy.models.plan import FeaturePlan
49
+ from metaxy.models.types import (
50
+ CoercibleToFeatureKey,
51
+ FeatureKey,
52
+ ValidatedFeatureKeyAdapter,
53
+ )
54
+ from metaxy.versioning import VersioningEngine
55
+ from metaxy.versioning.polars import PolarsVersioningEngine
56
+ from metaxy.versioning.types import HashAlgorithm, Increment, LazyIncrement
57
+
58
+ if TYPE_CHECKING:
59
+ pass
60
+
61
+
62
+ # TypeVar for config types - used for typing from_config method
63
+ MetadataStoreConfigT = TypeVar("MetadataStoreConfigT", bound="MetadataStoreConfig")
64
+
65
+
66
+ class MetadataStoreConfig(BaseSettings):
67
+ """Base configuration class for metadata stores.
68
+
69
+ This class defines common configuration fields shared by all metadata store types.
70
+ Store-specific config classes should inherit from this and add their own fields.
71
+
72
+ Example:
73
+ ```python
74
+ from metaxy.metadata_store.duckdb import DuckDBMetadataStoreConfig
75
+
76
+ config = DuckDBMetadataStoreConfig(
77
+ database="metadata.db",
78
+ hash_algorithm=HashAlgorithm.MD5,
79
+ )
80
+
81
+ store = DuckDBMetadataStore.from_config(config)
82
+ ```
83
+ """
84
+
85
+ model_config = SettingsConfigDict(frozen=True, extra="forbid")
86
+
87
+ fallback_stores: list[str] = Field(
88
+ default_factory=list,
89
+ description="List of fallback store names to search when features are not found in the current store.",
90
+ )
91
+
92
+ hash_algorithm: HashAlgorithm | None = Field(
93
+ default=None,
94
+ description="Hash algorithm for versioning. If None, uses store's default.",
95
+ )
96
+
97
+ versioning_engine: Literal["auto", "native", "polars"] = Field(
98
+ default="auto",
99
+ description="Which versioning engine to use: 'auto' (prefer native), 'native', or 'polars'.",
100
+ )
101
+
102
+
103
+ VersioningEngineT = TypeVar("VersioningEngineT", bound=VersioningEngine)
104
+ VersioningEngineOptions = Literal["auto", "native", "polars"]
105
+
106
+ # Mapping of system columns to their expected Narwhals dtypes
107
+ # Used to cast Null-typed columns to correct types
108
+ # Note: Struct columns (METAXY_PROVENANCE_BY_FIELD, METAXY_DATA_VERSION_BY_FIELD) are not cast
109
+ _SYSTEM_COLUMN_DTYPES = {
110
+ METAXY_PROVENANCE: nw.String,
111
+ METAXY_FEATURE_VERSION: nw.String,
112
+ METAXY_SNAPSHOT_VERSION: nw.String,
113
+ METAXY_DATA_VERSION: nw.String,
114
+ METAXY_CREATED_AT: nw.Datetime,
115
+ METAXY_MATERIALIZATION_ID: nw.String,
116
+ }
117
+
118
+
119
+ def _cast_present_system_columns(
120
+ df: nw.DataFrame[Any] | nw.LazyFrame[Any],
121
+ ) -> nw.DataFrame[Any] | nw.LazyFrame[Any]:
122
+ """Cast system columns with Null/Unknown dtype to their correct types.
123
+
124
+ This handles edge cases where empty DataFrames or certain operations
125
+ result in Null-typed columns (represented as nw.Unknown in Narwhals)
126
+ that break downstream processing.
127
+
128
+ Args:
129
+ df: Narwhals DataFrame or LazyFrame
130
+
131
+ Returns:
132
+ DataFrame with system columns cast to correct types
133
+ """
134
+ schema = df.collect_schema()
135
+ columns_to_cast = []
136
+
137
+ for col_name, expected_dtype in _SYSTEM_COLUMN_DTYPES.items():
138
+ if col_name in schema and schema[col_name] == nw.Unknown:
139
+ columns_to_cast.append(nw.col(col_name).cast(expected_dtype))
140
+
141
+ if columns_to_cast:
142
+ df = df.with_columns(columns_to_cast)
143
+
144
+ return df
145
+
146
+
147
+ class MetadataStore(ABC):
148
+ """
149
+ Abstract base class for metadata storage backends.
150
+ """
151
+
152
+ # Subclasses can override this to disable auto_create_tables warning
153
+ # Set to False for stores where table creation is not applicable (e.g., InMemoryMetadataStore)
154
+ _should_warn_auto_create_tables: bool = True
155
+
156
+ def __init__(
157
+ self,
158
+ *,
159
+ versioning_engine_cls: type[VersioningEngineT],
160
+ hash_algorithm: HashAlgorithm | None = None,
161
+ versioning_engine: VersioningEngineOptions = "auto",
162
+ fallback_stores: list[MetadataStore] | None = None,
163
+ auto_create_tables: bool | None = None,
164
+ materialization_id: str | None = None,
165
+ ):
166
+ """
167
+ Initialize the metadata store.
168
+
169
+ Args:
170
+ hash_algorithm: Hash algorithm to use for the versioning engine.
171
+
172
+ versioning_engine: Which versioning engine to use.
173
+
174
+ - "auto": Prefer the store's native engine, fall back to Polars if needed
175
+
176
+ - "native": Always use the store's native engine, raise `VersioningEngineMismatchError`
177
+ if provided dataframes are incompatible
178
+
179
+ - "polars": Always use the Polars engine
180
+
181
+ fallback_stores: Ordered list of read-only fallback stores.
182
+ Used when upstream features are not in this store.
183
+ `VersioningEngineMismatchError` is not raised when reading from fallback stores.
184
+ auto_create_tables: If True, automatically create tables when opening the store.
185
+ If None (default), reads from global MetaxyConfig (which reads from METAXY_AUTO_CREATE_TABLES env var).
186
+ If False, never auto-create tables.
187
+
188
+ !!! warning
189
+ Auto-create is intended for development/testing only.
190
+ Use proper database migration tools like Alembic for production deployments.
191
+
192
+ materialization_id: Optional external orchestration ID.
193
+ If provided, all metadata writes will include this ID in the `metaxy_materialization_id` column.
194
+ Can be overridden per [`MetadataStore.write_metadata`][metaxy.MetadataStore.write_metadata] call.
195
+
196
+ Raises:
197
+ ValueError: If fallback stores use different hash algorithms or truncation lengths
198
+ VersioningEngineMismatchError: If a user-provided dataframe has a wrong implementation
199
+ and versioning_engine is set to `native`
200
+ """
201
+ # Initialize state early so properties can check it
202
+ self._is_open = False
203
+ self._context_depth = 0
204
+ self._versioning_engine = versioning_engine
205
+ self._allow_cross_project_writes = False
206
+ self._materialization_id = materialization_id
207
+ self._open_cm: AbstractContextManager[Self] | None = (
208
+ None # Track the open() context manager
209
+ )
210
+ self.versioning_engine_cls = versioning_engine_cls
211
+
212
+ # Resolve auto_create_tables from global config if not explicitly provided
213
+ if auto_create_tables is None:
214
+ from metaxy.config import MetaxyConfig
215
+
216
+ self.auto_create_tables = MetaxyConfig.get().auto_create_tables
217
+ else:
218
+ self.auto_create_tables = auto_create_tables
219
+
220
+ # Use store's default algorithm if not specified
221
+ if hash_algorithm is None:
222
+ hash_algorithm = self._get_default_hash_algorithm()
223
+
224
+ self.hash_algorithm = hash_algorithm
225
+
226
+ self.fallback_stores = fallback_stores or []
227
+
228
+ @classmethod
229
+ @abstractmethod
230
+ def config_model(cls) -> type[MetadataStoreConfig]:
231
+ """Return the configuration model class for this store type.
232
+
233
+ Subclasses must override this to return their specific config class.
234
+
235
+ Returns:
236
+ The config class type (e.g., DuckDBMetadataStoreConfig)
237
+
238
+ Note:
239
+ Subclasses override this with a more specific return type.
240
+ Type checkers may show a warning about incompatible override,
241
+ but this is intentional - each store returns its own config type.
242
+ """
243
+ ...
244
+
245
+ @classmethod
246
+ def from_config(cls, config: MetadataStoreConfig, **kwargs: Any) -> Self:
247
+ """Create a store instance from a configuration object.
248
+
249
+ This method creates a store by:
250
+ 1. Converting the config to a dict
251
+ 2. Resolving fallback store names to actual store instances
252
+ 3. Calling the store's __init__ with the config parameters
253
+
254
+ Args:
255
+ config: Configuration object (should be the type returned by config_model())
256
+ **kwargs: Additional arguments passed directly to the store constructor
257
+ (e.g., materialization_id for runtime parameters not in config)
258
+
259
+ Returns:
260
+ A new store instance configured according to the config object
261
+
262
+ Example:
263
+ ```python
264
+ from metaxy.metadata_store.duckdb import (
265
+ DuckDBMetadataStore,
266
+ DuckDBMetadataStoreConfig,
267
+ )
268
+
269
+ config = DuckDBMetadataStoreConfig(
270
+ database="metadata.db",
271
+ fallback_stores=["prod"],
272
+ )
273
+
274
+ store = DuckDBMetadataStore.from_config(config)
275
+ ```
276
+ """
277
+ # Convert config to dict, excluding unset values
278
+ config_dict = config.model_dump(exclude_unset=True)
279
+
280
+ # Pop and resolve fallback store names to actual store instances
281
+ fallback_store_names = config_dict.pop("fallback_stores", [])
282
+ fallback_stores = [
283
+ MetaxyConfig.get().get_store(name) for name in fallback_store_names
284
+ ]
285
+
286
+ # Create store with resolved fallback stores, config, and extra kwargs
287
+ return cls(fallback_stores=fallback_stores, **config_dict, **kwargs)
288
+
289
+ @property
290
+ def hash_truncation_length(self) -> int:
291
+ return MetaxyConfig.get().hash_truncation_length or 64
292
+
293
+ @property
294
+ def materialization_id(self) -> str | None:
295
+ """The external orchestration ID for this store instance.
296
+
297
+ If set, all metadata writes include this ID in the `metaxy_materialization_id` column,
298
+ allowing filtering of rows written during a specific materialization run.
299
+ """
300
+ return self._materialization_id
301
+
302
+ @overload
303
+ def resolve_update(
304
+ self,
305
+ feature: type[BaseFeature],
306
+ *,
307
+ samples: IntoFrame | Frame | None = None,
308
+ filters: Mapping[CoercibleToFeatureKey, Sequence[nw.Expr]] | None = None,
309
+ global_filters: Sequence[nw.Expr] | None = None,
310
+ lazy: Literal[False] = False,
311
+ versioning_engine: Literal["auto", "native", "polars"] | None = None,
312
+ skip_comparison: bool = False,
313
+ **kwargs: Any,
314
+ ) -> Increment: ...
315
+
316
+ @overload
317
+ def resolve_update(
318
+ self,
319
+ feature: type[BaseFeature],
320
+ *,
321
+ samples: IntoFrame | Frame | None = None,
322
+ filters: Mapping[CoercibleToFeatureKey, Sequence[nw.Expr]] | None = None,
323
+ global_filters: Sequence[nw.Expr] | None = None,
324
+ lazy: Literal[True],
325
+ versioning_engine: Literal["auto", "native", "polars"] | None = None,
326
+ skip_comparison: bool = False,
327
+ **kwargs: Any,
328
+ ) -> LazyIncrement: ...
329
+
330
+ def resolve_update(
331
+ self,
332
+ feature: type[BaseFeature],
333
+ *,
334
+ samples: IntoFrame | Frame | None = None,
335
+ filters: Mapping[CoercibleToFeatureKey, Sequence[nw.Expr]] | None = None,
336
+ global_filters: Sequence[nw.Expr] | None = None,
337
+ lazy: bool = False,
338
+ versioning_engine: Literal["auto", "native", "polars"] | None = None,
339
+ skip_comparison: bool = False,
340
+ **kwargs: Any,
341
+ ) -> Increment | LazyIncrement:
342
+ """Calculate an incremental update for a feature.
343
+
344
+ This is the main workhorse in Metaxy.
345
+
346
+ Args:
347
+ feature: Feature class to resolve updates for
348
+ samples: A dataframe with joined upstream metadata and `"metaxy_provenance_by_field"` column set.
349
+ When provided, `MetadataStore` skips loading upstream feature metadata and provenance calculations.
350
+
351
+ !!! info "Required for root features"
352
+ Metaxy doesn't know how to populate input metadata for root features,
353
+ so the `samples` argument **must** be provided for them.
354
+
355
+ !!! tip
356
+ For non-root features, use `samples` to customize the automatic upstream loading and field provenance calculation.
357
+ For example, it can be used to restrict processing to specific sample IDs.
358
+
359
+ Setting this parameter during normal operations is not required.
360
+
361
+ filters: A mapping from feature keys to lists of Narwhals filter expressions.
362
+ Keys can be feature classes, FeatureKey objects, or string paths.
363
+ Applied at read-time. May filter the current feature,
364
+ in this case it will also be applied to `samples` (if provided).
365
+ Example: `{UpstreamFeature: [nw.col("x") > 10], ...}`
366
+ global_filters: A list of Narwhals filter expressions applied to all features.
367
+ These filters are combined with any feature-specific filters from `filters`.
368
+ Useful for filtering by common columns like `sample_uid` across all features.
369
+ Example: `[nw.col("sample_uid").is_in(["s1", "s2"])]`
370
+ lazy: Whether to return a [metaxy.versioning.types.LazyIncrement][] or a [metaxy.versioning.types.Increment][].
371
+ versioning_engine: Override the store's versioning engine for this operation.
372
+ skip_comparison: If True, skip the increment comparison logic and return all
373
+ upstream samples in `Increment.added`. The `changed` and `removed` frames will
374
+ be empty.
375
+
376
+ Raises:
377
+ ValueError: If no `samples` dataframe has been provided when resolving an update for a root feature.
378
+ VersioningEngineMismatchError: If `versioning_engine` has been set to `"native"`
379
+ and a dataframe of a different implementation has been encountered during `resolve_update`.
380
+
381
+ !!! example "With a root feature"
382
+
383
+ ```py
384
+ samples = pl.DataFrame({
385
+ "sample_uid": [1, 2, 3],
386
+ "metaxy_provenance_by_field": [{"field": "h1"}, {"field": "h2"}, {"field": "h3"}],
387
+ })
388
+ result = store.resolve_update(RootFeature, samples=nw.from_native(samples))
389
+ ```
390
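+
+ !!! example "With upstream filters"
+
+ A minimal sketch of read-time filtering; `MyFeature` and `UpstreamFeature` are hypothetical feature classes assumed to be registered in the active graph.
+
+ ```py
+ increment = store.resolve_update(
+     MyFeature,
+     filters={UpstreamFeature: [nw.col("x") > 10]},
+     global_filters=[nw.col("sample_uid").is_in(["s1", "s2"])],
+     lazy=True,
+ )
+ new_rows = increment.added.collect()
+ ```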
+ """
391
+ import narwhals as nw
392
+
393
+ # Convert samples to Narwhals frame if not already
394
+ samples_nw: nw.DataFrame[Any] | nw.LazyFrame[Any] | None = None
395
+ if samples is not None:
396
+ if isinstance(samples, (nw.DataFrame, nw.LazyFrame)):
397
+ samples_nw = samples
398
+ else:
399
+ samples_nw = nw.from_native(samples)
400
+
401
+ # Normalize filter keys to FeatureKey
402
+ normalized_filters: dict[FeatureKey, list[nw.Expr]] = {}
403
+ if filters:
404
+ for key, exprs in filters.items():
405
+ feature_key = self._resolve_feature_key(key)
406
+ normalized_filters[feature_key] = list(exprs)
407
+
408
+ # Convert global_filters to a list for easy concatenation
409
+ global_filter_list = list(global_filters) if global_filters else []
410
+
411
+ graph = current_graph()
412
+ plan = graph.get_feature_plan(feature.spec().key)
413
+
414
+ # Root features without samples: error (samples required)
415
+ if not plan.deps and samples_nw is None:
416
+ raise ValueError(
417
+ f"Feature {feature.spec().key} has no upstream dependencies (root feature). "
418
+ f"Must provide 'samples' parameter with sample_uid and {METAXY_PROVENANCE_BY_FIELD} columns. "
419
+ f"Root features require manual {METAXY_PROVENANCE_BY_FIELD} computation."
420
+ )
421
+
422
+ # Combine feature-specific filters with global filters
423
+ current_feature_filters = [
424
+ *normalized_filters.get(feature.spec().key, []),
425
+ *global_filter_list,
426
+ ]
427
+
428
+ current_metadata = self.read_metadata_in_store(
429
+ feature,
430
+ filters=[
431
+ nw.col(METAXY_FEATURE_VERSION)
432
+ == graph.get_feature_version(feature.spec().key),
433
+ *current_feature_filters,
434
+ ],
435
+ )
436
+
437
+ upstream_by_key: dict[FeatureKey, nw.LazyFrame[Any]] = {}
438
+ filters_by_key: dict[FeatureKey, list[nw.Expr]] = {}
439
+
440
+ # if samples are provided, use them as source of truth for upstream data
441
+ if samples_nw is not None:
442
+ # Apply filters to samples if any
443
+ filtered_samples = samples_nw
444
+ if current_feature_filters:
445
+ filtered_samples = samples_nw.filter(current_feature_filters)
446
+
447
+ # fill in METAXY_PROVENANCE column if it's missing (e.g. for root features)
448
+ samples_nw = self.hash_struct_version_column(
449
+ plan,
450
+ df=filtered_samples,
451
+ struct_column=METAXY_PROVENANCE_BY_FIELD,
452
+ hash_column=METAXY_PROVENANCE,
453
+ )
454
+
455
+ # For root features, add data_version columns if they don't exist
456
+ # (root features have no computation, so data_version equals provenance)
457
+ if METAXY_DATA_VERSION_BY_FIELD not in samples_nw.columns:
458
+ samples_nw = samples_nw.with_columns(
459
+ nw.col(METAXY_PROVENANCE_BY_FIELD).alias(
460
+ METAXY_DATA_VERSION_BY_FIELD
461
+ ),
462
+ nw.col(METAXY_PROVENANCE).alias(METAXY_DATA_VERSION),
463
+ )
464
+ else:
465
+ for upstream_spec in plan.deps or []:
466
+ # Combine feature-specific filters with global filters for upstream
467
+ upstream_filters = [
468
+ *normalized_filters.get(upstream_spec.key, []),
469
+ *global_filter_list,
470
+ ]
471
+ upstream_feature_metadata = self.read_metadata(
472
+ upstream_spec.key,
473
+ filters=upstream_filters,
474
+ )
475
+ if upstream_feature_metadata is not None:
476
+ upstream_by_key[upstream_spec.key] = upstream_feature_metadata
477
+
478
+ # determine which implementation to use for resolving the increment
479
+ # consider (1) whether all upstream metadata has been loaded with the native implementation
480
+ # (2) if samples have native implementation
481
+
482
+ # Use parameter if provided, otherwise use store default
483
+ engine_mode = (
484
+ versioning_engine
485
+ if versioning_engine is not None
486
+ else self._versioning_engine
487
+ )
488
+
489
+ # If "polars" mode, force Polars immediately
490
+ if engine_mode == "polars":
491
+ implementation = nw.Implementation.POLARS
492
+ switched_to_polars = True
493
+ else:
494
+ implementation = self.native_implementation()
495
+ switched_to_polars = False
496
+
497
+ for upstream_key, df in upstream_by_key.items():
498
+ if df.implementation != implementation:
499
+ switched_to_polars = True
500
+ # Only raise error in "native" mode if no fallback stores configured.
501
+ # If fallback stores exist, the implementation mismatch indicates data came
502
+ # from fallback (different implementation), which is legitimate fallback access.
503
+ # If data were local, it would have the native implementation.
504
+ if engine_mode == "native" and not self.fallback_stores:
505
+ raise VersioningEngineMismatchError(
506
+ f"versioning_engine='native' but upstream feature `{upstream_key.to_string()}` "
507
+ f"has implementation {df.implementation}, expected {self.native_implementation()}"
508
+ )
509
+ elif engine_mode == "auto" or (
510
+ engine_mode == "native" and self.fallback_stores
511
+ ):
512
+ PolarsMaterializationWarning.warn_on_implementation_mismatch(
513
+ expected=self.native_implementation(),
514
+ actual=df.implementation,
515
+ message=f"Using Polars for resolving the increment instead. This was caused by upstream feature `{upstream_key.to_string()}`.",
516
+ )
517
+ implementation = nw.Implementation.POLARS
518
+ break
519
+
520
+ if (
521
+ samples_nw is not None
522
+ and samples_nw.implementation != self.native_implementation()
523
+ ):
524
+ if not switched_to_polars:
525
+ if engine_mode == "native":
526
+ # Always raise error for samples with wrong implementation, regardless
527
+ # of fallback stores, because samples come from user argument, not from fallback
528
+ raise VersioningEngineMismatchError(
529
+ f"versioning_engine='native' but provided `samples` have implementation {samples_nw.implementation}, "
530
+ f"expected {self.native_implementation()}"
531
+ )
532
+ elif engine_mode == "auto":
533
+ PolarsMaterializationWarning.warn_on_implementation_mismatch(
534
+ expected=self.native_implementation(),
535
+ actual=samples_nw.implementation,
536
+ message=f"Provided `samples` have implementation {samples_nw.implementation}. Using Polars for resolving the increment instead.",
537
+ )
538
+ implementation = nw.Implementation.POLARS
539
+ switched_to_polars = True
540
+
541
+ if switched_to_polars:
542
+ if current_metadata:
543
+ current_metadata = switch_implementation_to_polars(current_metadata)
544
+ if samples_nw:
545
+ samples_nw = switch_implementation_to_polars(samples_nw)
546
+ for upstream_key, df in upstream_by_key.items():
547
+ upstream_by_key[upstream_key] = switch_implementation_to_polars(df)
548
+
549
+ with self.create_versioning_engine(
550
+ plan=plan, implementation=implementation
551
+ ) as engine:
552
+ if skip_comparison:
553
+ # Skip comparison: return all upstream samples as added
554
+ if samples_nw is not None:
555
+ # Root features or user-provided samples: use samples directly
556
+ # Note: samples already has metaxy_provenance computed
557
+ added = samples_nw.lazy()
558
+ else:
559
+ # Non-root features: load all upstream with provenance
560
+ added = engine.load_upstream_with_provenance(
561
+ upstream=upstream_by_key,
562
+ hash_algo=self.hash_algorithm,
563
+ filters=filters_by_key,
564
+ )
565
+ changed = None
566
+ removed = None
567
+ else:
568
+ added, changed, removed = engine.resolve_increment_with_provenance(
569
+ current=current_metadata,
570
+ upstream=upstream_by_key,
571
+ hash_algorithm=self.hash_algorithm,
572
+ filters=filters_by_key,
573
+ sample=samples_nw.lazy() if samples_nw is not None else None,
574
+ )
575
+
576
+ # Convert None to empty DataFrames
577
+ if changed is None:
578
+ changed = empty_frame_like(added)
579
+ if removed is None:
580
+ removed = empty_frame_like(added)
581
+
582
+ if lazy:
583
+ return LazyIncrement(
584
+ added=added
585
+ if isinstance(added, nw.LazyFrame)
586
+ else nw.from_native(added),
587
+ changed=changed
588
+ if isinstance(changed, nw.LazyFrame)
589
+ else nw.from_native(changed),
590
+ removed=removed
591
+ if isinstance(removed, nw.LazyFrame)
592
+ else nw.from_native(removed),
593
+ )
594
+ else:
595
+ return Increment(
596
+ added=added.collect() if isinstance(added, nw.LazyFrame) else added,
597
+ changed=changed.collect()
598
+ if isinstance(changed, nw.LazyFrame)
599
+ else changed,
600
+ removed=removed.collect()
601
+ if isinstance(removed, nw.LazyFrame)
602
+ else removed,
603
+ )
604
+
605
+ def read_metadata(
606
+ self,
607
+ feature: CoercibleToFeatureKey,
608
+ *,
609
+ feature_version: str | None = None,
610
+ filters: Sequence[nw.Expr] | None = None,
611
+ columns: Sequence[str] | None = None,
612
+ allow_fallback: bool = True,
613
+ current_only: bool = True,
614
+ latest_only: bool = True,
615
+ ) -> nw.LazyFrame[Any]:
616
+ """
617
+ Read metadata, optionally falling back to configured fallback stores.
618
+
619
+ Args:
620
+ feature: Feature to read metadata for
621
+ feature_version: Explicit feature_version to filter by (mutually exclusive with current_only=True)
622
+ filters: Sequence of Narwhals filter expressions to apply to this feature.
623
+ Example: `[nw.col("x") > 10, nw.col("y") < 5]`
624
+ columns: Subset of columns to include. Metaxy's system columns are always included.
625
+ allow_fallback: If `True`, check fallback stores on local miss
626
+ current_only: If `True`, only return rows with current feature_version
627
+ latest_only: Whether to deduplicate samples within `id_columns` groups ordered by `metaxy_created_at`.
628
+
629
+ Returns:
630
+ Narwhals LazyFrame with metadata
631
+
632
+ Raises:
633
+ FeatureNotFoundError: If feature not found in any store
634
+ SystemDataNotFoundError: When attempting to read non-existent Metaxy system data
635
+ ValueError: If both feature_version and current_only=True are provided
636
+
637
+ !!! info
638
+ When this method is called with default arguments, it will return the latest (by `metaxy_created_at`)
639
+ metadata for the current feature version, which makes the defaults suitable for most use cases.
640
+
641
+ !!! warning
642
+ The order of rows is not guaranteed.
643
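+
+ Example:
+ A minimal sketch, assuming `MyFeature` is a feature class registered in the active graph:
+
+ ```py
+ with store:
+     df = store.read_metadata(MyFeature, filters=[nw.col("x") > 10]).collect()
+ ```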
+ """
644
+ filters = filters or []
645
+ columns = columns or []
646
+
647
+ feature_key = self._resolve_feature_key(feature)
648
+ is_system_table = self._is_system_table(feature_key)
649
+
650
+ # Validate mutually exclusive parameters
651
+ if feature_version is not None and current_only:
652
+ raise ValueError(
653
+ "Cannot specify both feature_version and current_only=True. "
654
+ "Use current_only=False with feature_version parameter."
655
+ )
656
+
657
+ # Add feature_version filter only when needed
658
+ if (current_only or feature_version is not None) and not is_system_table:
659
+ version_filter = nw.col(METAXY_FEATURE_VERSION) == (
660
+ current_graph().get_feature_version(feature_key)
661
+ if current_only
662
+ else feature_version
663
+ )
664
+ filters = [version_filter, *filters]
665
+
666
+ if columns and not is_system_table:
667
+ # Add only system columns that aren't already in the user's columns list
668
+ columns_set = set(columns)
669
+ missing_system_cols = [
670
+ c for c in ALL_SYSTEM_COLUMNS if c not in columns_set
671
+ ]
672
+ read_columns = [*columns, *missing_system_cols]
673
+ else:
674
+ read_columns = None
675
+
676
+ lazy_frame = None
677
+ try:
678
+ lazy_frame = self.read_metadata_in_store(
679
+ feature, filters=filters, columns=read_columns
680
+ )
681
+ except FeatureNotFoundError as e:
682
+ # do not read system features from fallback stores
683
+ if is_system_table:
684
+ raise SystemDataNotFoundError(
685
+ f"System Metaxy data with key {feature_key} is missing in {self.display()}. Invoke `metaxy graph push` before attempting to read system data."
686
+ ) from e
687
+
688
+ # Handle case where read_metadata_in_store returns None (no exception raised)
689
+ if lazy_frame is None and is_system_table:
690
+ raise SystemDataNotFoundError(
691
+ f"System Metaxy data with key {feature_key} is missing in {self.display()}. Invoke `metaxy graph push` before attempting to read system data."
692
+ )
693
+
694
+ if lazy_frame is not None and not is_system_table and latest_only:
695
+ from metaxy.models.constants import METAXY_CREATED_AT
696
+
697
+ # Apply deduplication
698
+ lazy_frame = self.versioning_engine_cls.keep_latest_by_group(
699
+ df=lazy_frame,
700
+ group_columns=list(
701
+ self._resolve_feature_plan(feature_key).feature.id_columns
702
+ ),
703
+ timestamp_column=METAXY_CREATED_AT,
704
+ )
705
+
706
+ if lazy_frame is not None:
707
+ # After dedup, filter to requested columns if specified
708
+ if columns:
709
+ lazy_frame = lazy_frame.select(columns)
710
+
711
+ return lazy_frame
712
+
713
+ # Try fallback stores
714
+ if allow_fallback:
715
+ for store in self.fallback_stores:
716
+ try:
717
+ # Use full read_metadata to handle nested fallback chains
718
+ return store.read_metadata(
719
+ feature,
720
+ feature_version=feature_version,
721
+ filters=filters,
722
+ columns=columns,
723
+ allow_fallback=True,
724
+ current_only=current_only,
725
+ latest_only=latest_only,
726
+ )
727
+ except FeatureNotFoundError:
728
+ # Try next fallback store
729
+ continue
730
+
731
+ # Not found anywhere
732
+ raise FeatureNotFoundError(
733
+ f"Feature {feature_key.to_string()} not found in store"
734
+ + (" or fallback stores" if allow_fallback else "")
735
+ )
736
+
737
+ def write_metadata(
738
+ self,
739
+ feature: CoercibleToFeatureKey,
740
+ df: IntoFrame,
741
+ materialization_id: str | None = None,
742
+ ) -> None:
743
+ """
744
+ Write metadata for a feature (append-only by design).
745
+
746
+ Automatically adds the Metaxy system columns, unless they already exist in the DataFrame.
747
+
748
+ Args:
749
+ feature: Feature to write metadata for
750
+ df: Metadata DataFrame of any type supported by [Narwhals](https://narwhals-dev.github.io/narwhals/).
751
+ Must have `metaxy_provenance_by_field` column of type Struct with fields matching feature's fields.
752
+ Optionally, may also contain `metaxy_data_version_by_field`.
753
+ materialization_id: Optional external orchestration ID for this write.
754
+ Overrides the store's default `materialization_id` if provided.
755
+ Useful for tracking which orchestration run produced this metadata.
756
+
757
+ Raises:
758
+ MetadataSchemaError: If DataFrame schema is invalid
759
+ StoreNotOpenError: If store is not open
760
+ ValueError: If writing to a feature from a different project than expected
761
+
762
+ Note:
763
+ - Must be called within a `MetadataStore.open(mode="write")` context manager.
764
+
765
+ - Metaxy always performs an "append" operation. Metadata is never deleted or mutated.
766
+
767
+ - Fallback stores are never used for writes.
768
+
769
+ - Features from other Metaxy projects cannot be written to, unless project validation has been disabled with [MetadataStore.allow_cross_project_writes][].
770
+
771
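+
+ Example:
+ A minimal sketch; `MyFeature` is a hypothetical feature class and `df` is assumed to contain the required `metaxy_provenance_by_field` struct column:
+
+ ```py
+ with store.open(mode="write"):
+     store.write_metadata(MyFeature, df, materialization_id="run-42")
+ ```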
+ """
772
+ self._check_open()
773
+
774
+ feature_key = self._resolve_feature_key(feature)
775
+ is_system_table = self._is_system_table(feature_key)
776
+
777
+ # Validate project for non-system tables
778
+ if not is_system_table:
779
+ self._validate_project_write(feature)
780
+
781
+ # Convert the incoming native frame to a Narwhals DataFrame
782
+ # if isinstance(df_nw, (pl.DataFrame, pl.LazyFrame)):
783
+ df_nw = nw.from_native(df)
784
+
785
+ assert isinstance(df_nw, nw.DataFrame), "df must be an eager Narwhals DataFrame"
786
+
787
+ # For system tables, write directly without feature_version tracking
788
+ if is_system_table:
789
+ self._validate_schema_system_table(df_nw)
790
+ self.write_metadata_to_store(feature_key, df_nw)
791
+ return
792
+
793
+ if METAXY_PROVENANCE_BY_FIELD not in df_nw.columns:
794
+ from metaxy.metadata_store.exceptions import MetadataSchemaError
795
+
796
+ raise MetadataSchemaError(
797
+ f"DataFrame must have '{METAXY_PROVENANCE_BY_FIELD}' column"
798
+ )
799
+
800
+ # Add all required system columns
801
+ # warning: for dataframes that do not match the native MetadataStore implementation
802
+ # and are missing the METAXY_DATA_VERSION column, this call will lead to materializing the equivalent Polars DataFrame
803
+ # while calculating the missing METAXY_DATA_VERSION column
804
+ df_nw = self._add_system_columns(
805
+ df_nw, feature, materialization_id=materialization_id
806
+ )
807
+
808
+ self._validate_schema(df_nw)
809
+ self.write_metadata_to_store(feature_key, df_nw)
810
+
811
+ def write_metadata_multi(
812
+ self,
813
+ metadata: Mapping[Any, IntoFrame],
814
+ materialization_id: str | None = None,
815
+ ) -> None:
816
+ """
817
+ Write metadata for multiple features in reverse topological order.
818
+
819
+ Processes features so that dependents are written before their dependencies.
820
+ This ordering ensures that downstream features are written first, which can
821
+ be useful for certain data consistency requirements or when features need
822
+ to be processed in a specific order.
823
+
824
+ Args:
825
+ metadata: Mapping from feature keys to metadata DataFrames.
826
+ Keys can be any type coercible to FeatureKey (string, sequence,
827
+ FeatureKey, or BaseFeature class). Values must be DataFrames
828
+ compatible with Narwhals, containing required system columns.
829
+ materialization_id: Optional external orchestration ID for all writes.
830
+ Overrides the store's default `materialization_id` if provided.
831
+ Applied to all feature writes in this batch.
832
+
833
+ Raises:
834
+ MetadataSchemaError: If any DataFrame schema is invalid
835
+ StoreNotOpenError: If store is not open
836
+ ValueError: If writing to a feature from a different project than expected
837
+
838
+ Note:
839
+ - Must be called within a `MetadataStore.open(mode="write")` context manager.
840
+ - Empty mappings are handled gracefully (no-op).
841
+ - Each feature's metadata is written via `write_metadata`, so all
842
+ validation and system column handling from that method applies.
843
+
844
+ Example:
845
+ ```py
846
+ with store.open(mode="write"):
847
+ store.write_metadata_multi({
848
+ ChildFeature: child_df,
849
+ ParentFeature: parent_df,
850
+ })
851
+ # Features are written in reverse topological order:
852
+ # ChildFeature first, then ParentFeature
853
+ ```
854
+ """
855
+ if not metadata:
856
+ return
857
+
858
+ # Build mapping from resolved keys to dataframes in one pass
859
+ resolved_metadata = {
860
+ self._resolve_feature_key(key): df for key, df in metadata.items()
861
+ }
862
+
863
+ # Get reverse topological order (dependents first)
864
+ graph = current_graph()
865
+ sorted_keys = graph.topological_sort_features(
866
+ list(resolved_metadata.keys()), descending=True
867
+ )
868
+
869
+ # Write metadata in reverse topological order
870
+ for feature_key in sorted_keys:
871
+ self.write_metadata(
872
+ feature_key,
873
+ resolved_metadata[feature_key],
874
+ materialization_id=materialization_id,
875
+ )
876
+
877
+ @abstractmethod
878
+ def _get_default_hash_algorithm(self) -> HashAlgorithm:
879
+ """Get the default hash algorithm for this store type.
880
+
881
+ Returns:
882
+ Default hash algorithm
883
+ """
884
+ pass
885
+
886
+ def native_implementation(self) -> nw.Implementation:
887
+ """Get the native Narwhals implementation for this store's backend."""
888
+ return self.versioning_engine_cls.implementation()
889
+
890
+ @abstractmethod
891
+ @contextmanager
892
+ def _create_versioning_engine(
893
+ self, plan: FeaturePlan
894
+ ) -> Iterator[VersioningEngineT]:
895
+ """Create provenance engine for this store as a context manager.
896
+
897
+ Args:
898
+ plan: Feature plan for the feature we're tracking provenance for
899
+
900
+ Yields:
901
+ VersioningEngine instance appropriate for this store's backend.
902
+ - For SQL stores (DuckDB, ClickHouse): Returns IbisVersioningEngine
903
+ - For in-memory/Polars stores: Returns PolarsVersioningEngine
904
+
905
+ Raises:
906
+ NotImplementedError: If provenance tracking not supported by this store
907
+
908
+ Example:
909
+ ```python
910
+ with self._create_versioning_engine(plan) as engine:
911
+ result = engine.resolve_update(...)
912
+ ```
913
+ """
914
+ ...
915
+
916
+ @contextmanager
917
+ def _create_polars_versioning_engine(
918
+ self, plan: FeaturePlan
919
+ ) -> Iterator[PolarsVersioningEngine]:
920
+ yield PolarsVersioningEngine(plan=plan)
921
+
922
+ @contextmanager
923
+ def create_versioning_engine(
924
+ self, plan: FeaturePlan, implementation: nw.Implementation
925
+ ) -> Iterator[VersioningEngine | PolarsVersioningEngine]:
926
+ """
927
+ Creates an appropriate provenance engine.
928
+
929
+ Falls back to Polars implementation if the required implementation differs from the store's native implementation.
930
+
931
+ Args:
932
+ plan: The feature plan.
933
+ implementation: The desired engine implementation.
934
+
935
+ Returns:
936
+ An appropriate provenance engine.
937
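+
+ Example:
+ A minimal sketch mirroring the `_create_versioning_engine` docstring:
+
+ ```python
+ with self.create_versioning_engine(plan, nw.Implementation.POLARS) as engine:
+     result = engine.resolve_update(...)
+ ```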
+ """
938
+
939
+ if implementation == nw.Implementation.POLARS:
940
+ cm = self._create_polars_versioning_engine(plan)
941
+ elif implementation == self.native_implementation():
942
+ cm = self._create_versioning_engine(plan)
943
+ else:
944
+ cm = self._create_polars_versioning_engine(plan)
945
+
946
+ with cm as engine:
947
+ yield engine
948
+
949
+ def hash_struct_version_column(
950
+ self,
951
+ plan: FeaturePlan,
952
+ df: Frame,
953
+ struct_column: str,
954
+ hash_column: str,
955
+ ) -> Frame:
956
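+ """Hash a struct version column into a scalar hash column.
+
+ Delegates to a versioning engine matching `df`'s implementation; when only the
+ Polars engine is available for a non-Polars frame, warns via
+ `PolarsMaterializationWarning` and materializes the frame to Polars first.
+
+ Args:
+     plan: Feature plan for the feature being hashed.
+     df: Narwhals DataFrame or LazyFrame containing `struct_column`.
+     struct_column: Name of the struct column to hash.
+     hash_column: Name of the hash column to write.
+
+ Returns:
+     Frame with `hash_column` populated.
+ """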
+ with self.create_versioning_engine(plan, df.implementation) as engine:
957
+ if (
958
+ isinstance(engine, PolarsVersioningEngine)
959
+ and df.implementation != nw.Implementation.POLARS
960
+ ):
961
+ PolarsMaterializationWarning.warn_on_implementation_mismatch(
962
+ self.native_implementation(),
963
+ df.implementation,
964
+ message=f"`{hash_column}` will be calculated in Polars.",
965
+ )
966
+ df = nw.from_native(df.lazy().collect().to_polars())
967
+
968
+ return cast(
969
+ Frame,
970
+ engine.hash_struct_version_column(
971
+ df, # pyright: ignore[reportArgumentType]
972
+ hash_algorithm=self.hash_algorithm,
973
+ struct_column=struct_column,
974
+ hash_column=hash_column,
975
+ ),
976
+ )
977
+
978
+ @abstractmethod
979
+ @contextmanager
980
+ def open(self, mode: AccessMode = "read") -> Iterator[Self]:
981
+ """Open/initialize the store for operations.
982
+
983
+ Context manager that opens the store with specified access mode.
984
+ Called internally by `__enter__`.
985
+ Child classes should implement backend-specific connection setup/teardown here.
986
+
987
+ Args:
988
+ mode: Access mode for this connection session.
989
+
990
+ Yields:
991
+ Self: The store instance with connection open
992
+
993
+ Note:
994
+ Users should prefer the `with store:` pattern unless write access is needed.
995
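+
+ Example:
+ A minimal sketch, assuming `MyFeature` is a registered feature class and `df` holds its metadata:
+
+ ```python
+ with store.open(mode="write") as s:
+     s.write_metadata(MyFeature, df)
+ ```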
+ """
996
+ ...
997
+
998
+ def __enter__(self) -> Self:
999
+ """Enter context manager - opens store in READ mode by default.
1000
+
1001
+ Use [`MetadataStore.open`][metaxy.metadata_store.base.MetadataStore.open] for write access mode instead.
1002
+
1003
+ Returns:
1004
+ Self: The opened store instance
1005
+ """
1006
+ # Determine mode based on auto_create_tables
1007
+ mode = "write" if self.auto_create_tables else "read"
1008
+
1009
+ # Open the store (open() manages _context_depth internally)
1010
+ self._open_cm = self.open(mode)
1011
+ self._open_cm.__enter__()
1012
+
1013
+ return self
1014
+
1015
+ def _validate_after_open(self) -> None:
1016
+ """Validate configuration after store is opened.
1017
+
1018
+ Called automatically by __enter__ after open().
1019
+ Validates hash algorithm compatibility and fallback store consistency.
1020
+ """
1021
+ # Validate hash algorithm compatibility with components
1022
+ self.validate_hash_algorithm(check_fallback_stores=True)
1023
+
1024
+ # Validate fallback stores use the same hash algorithm
1025
+ for i, fallback_store in enumerate(self.fallback_stores):
1026
+ if fallback_store.hash_algorithm != self.hash_algorithm:
1027
+ raise ValueError(
1028
+ f"Fallback store {i} uses hash_algorithm='{fallback_store.hash_algorithm.value}' "
1029
+ f"but this store uses '{self.hash_algorithm.value}'. "
1030
+ f"All stores in a fallback chain must use the same hash algorithm."
1031
+ )
1032
+
1033
+ def __exit__(
1034
+ self,
1035
+ exc_type: type[BaseException] | None,
1036
+ exc_val: BaseException | None,
1037
+ exc_tb: TracebackType | None,
1038
+ ) -> None:
1039
+ # Delegate to open()'s context manager (which manages _context_depth)
1040
+ if self._open_cm is not None:
1041
+ self._open_cm.__exit__(exc_type, exc_val, exc_tb)
1042
+ self._open_cm = None
1043
+
1044
+ def _check_open(self) -> None:
1045
+ """Check if store is open, raise error if not.
1046
+
1047
+ Raises:
1048
+ StoreNotOpenError: If store is not open
1049
+ """
1050
+ if not self._is_open:
1051
+ raise StoreNotOpenError(
1052
+ f"{self.__class__.__name__} must be opened before use. "
1053
+ 'Use it as a context manager: `with store: ...` or `with store.open(mode="write"): ...`'
1054
+ )
1055
+
1056
+ # ========== Hash Algorithm Validation ==========
1057
+
1058
+ def validate_hash_algorithm(
1059
+ self,
1060
+ check_fallback_stores: bool = True,
1061
+ ) -> None:
1062
+ """Validate that hash algorithm is supported by this store's components.
1063
+
1064
+ Public method - can be called to verify hash compatibility.
1065
+
1066
+ Args:
1067
+ check_fallback_stores: If True, also validate hash is supported by
1068
+ fallback stores (ensures compatibility for future cross-store operations)
1069
+
1070
+ Raises:
1071
+ ValueError: If hash algorithm not supported by components or fallback stores
1072
+ """
1073
+ # Validate hash algorithm support without creating a full engine
1074
+ # (engine creation requires a graph which isn't available during store init)
1075
+ self._validate_hash_algorithm_support()
1076
+
1077
+ # Check fallback stores
1078
+ if check_fallback_stores:
1079
+ for fallback in self.fallback_stores:
1080
+ fallback.validate_hash_algorithm(check_fallback_stores=False)
1081
+
1082
+ def _validate_hash_algorithm_support(self) -> None:
1083
+ """Validate that the configured hash algorithm is supported.
1084
+
1085
+ Default implementation does nothing (assumes all algorithms supported).
1086
+ Subclasses can override to check algorithm support.
1087
+
1088
+ Raises:
1089
+ Exception: If hash algorithm is not supported
1090
+ """
1091
+ # Default: no validation (assume all algorithms supported)
1092
+ pass
1093
+
1094
+ # ========== Helper Methods ==========
1095
+
1096
+ def _is_system_table(self, feature_key: FeatureKey) -> bool:
1097
+ """Check if feature key is a system table."""
1098
+ return len(feature_key) >= 1 and feature_key[0] == METAXY_SYSTEM_KEY_PREFIX
1099
+
1100
+ def _resolve_feature_key(self, feature: CoercibleToFeatureKey) -> FeatureKey:
1101
+ """Resolve various types to FeatureKey.
1102
+
1103
+ Accepts types that can be converted into a FeatureKey.
1104
+
1105
+ Args:
1106
+ feature: Feature to resolve to FeatureKey
1107
+
1108
+ Returns:
1109
+ FeatureKey instance
1110
+ """
1111
+ return ValidatedFeatureKeyAdapter.validate_python(feature)
1112
+
1113
+ def _resolve_feature_plan(self, feature: CoercibleToFeatureKey) -> FeaturePlan:
1114
+ """Resolve to FeaturePlan for dependency resolution."""
1115
+ # First resolve to FeatureKey
1116
+ feature_key = self._resolve_feature_key(feature)
1117
+ # Then get the plan
1118
+ graph = current_graph()
1119
+ return graph.get_feature_plan(feature_key)
1120
+
1121
+ # ========== Core CRUD Operations ==========
1122
+
1123
+ @contextmanager
1124
+ def allow_cross_project_writes(self) -> Iterator[None]:
1125
+ """Context manager to temporarily allow cross-project writes.
1126
+
1127
+ This is an escape hatch for legitimate cross-project operations like migrations,
1128
+ where metadata needs to be written to features from different projects.
1129
+
1130
+ Example:
1131
+ ```py
1132
+ # During migration, allow writing to features from different projects
1133
+ with store.allow_cross_project_writes():
1134
+ store.write_metadata(feature_from_project_a, metadata_a)
1135
+ store.write_metadata(feature_from_project_b, metadata_b)
1136
+ ```
1137
+
1138
+ Yields:
1139
+ None: The context manager temporarily disables project validation
1140
+ """
1141
+ previous_value = self._allow_cross_project_writes
1142
+ try:
1143
+ self._allow_cross_project_writes = True
1144
+ yield
1145
+ finally:
1146
+ self._allow_cross_project_writes = previous_value
1147
+
1148
+ def _validate_project_write(self, feature: CoercibleToFeatureKey) -> None:
1149
+ """Validate that writing to a feature matches the expected project from config.
1150
+
1151
+ Args:
1152
+ feature: Feature to validate project for
1153
+
1154
+ Raises:
1155
+ ValueError: If feature's project doesn't match the global config project
1156
+ """
1157
+ # Skip validation if cross-project writes are allowed
1158
+ if self._allow_cross_project_writes:
1159
+ return
1160
+
1161
+ # Get the expected project from global config
1162
+ from metaxy.config import MetaxyConfig
1163
+
1164
+ config = MetaxyConfig.get()
1165
+ expected_project = config.project
1166
+
1167
+ # Use existing method to resolve to FeatureKey
1168
+ feature_key = self._resolve_feature_key(feature)
1169
+
1170
+ # Get the Feature class from the graph
1171
+
1172
+ graph = FeatureGraph.get_active()
1173
+ if feature_key not in graph.features_by_key:
1174
+ # Feature not in graph - can't validate, skip
1175
+ return
1176
+
1177
+ feature_cls = graph.features_by_key[feature_key]
1178
+ feature_project = feature_cls.project # type: ignore[attr-defined]
1179
+
1180
+ # Validate the project matches
1181
+ if feature_project != expected_project:
1182
+ raise ValueError(
1183
+ f"Cannot write to feature {feature_key.to_string()} from project '{feature_project}' "
1184
+ f"when the global configuration expects project '{expected_project}'. "
1185
+ f"Use store.allow_cross_project_writes() context manager for legitimate "
1186
+ f"cross-project operations like migrations."
1187
+ )
1188
+
1189
+ @abstractmethod
1190
+ def write_metadata_to_store(
1191
+ self,
1192
+ feature_key: FeatureKey,
1193
+ df: Frame,
1194
+ **kwargs: Any,
1195
+ ) -> None:
1196
+ """
1197
+ Internal write implementation (backend-specific).
1198
+
1199
+ Backends may convert to their specific type if needed (e.g., Polars, Ibis).
1200
+
1201
+ Args:
1202
+ feature_key: Feature key to write to
1203
+ df: [Narwhals](https://narwhals-dev.github.io/narwhals/)-compatible DataFrame with metadata to write
1204
+ **kwargs: Backend-specific parameters
1205
+
1206
+ Note: Subclasses implement this for their storage backend.
1207
+ """
1208
+ pass
1209
+
1210
+ def _add_system_columns(
1211
+ self,
1212
+ df: Frame,
1213
+ feature: CoercibleToFeatureKey,
1214
+ materialization_id: str | None = None,
1215
+ ) -> Frame:
1216
+ """Add all required system columns to the DataFrame.
1217
+
1218
+ Args:
1219
+ df: Narwhals DataFrame/LazyFrame
1220
+ feature: Feature class or key
1221
+ materialization_id: Optional external orchestration ID for this write.
1222
+ Overrides the store's default if provided.
1223
+
1224
+ Returns:
1225
+ DataFrame with all system columns added
1226
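+
+ Note:
+ Ensures the following system columns are present: `METAXY_FEATURE_VERSION`,
+ `METAXY_SNAPSHOT_VERSION`, `METAXY_PROVENANCE`, `METAXY_CREATED_AT`,
+ `METAXY_MATERIALIZATION_ID`, and the data-version pair. A missing
+ `METAXY_PROVENANCE` is hashed from `METAXY_PROVENANCE_BY_FIELD`, and
+ missing data-version columns fall back to the provenance columns.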
+ """
1227
+ feature_key = self._resolve_feature_key(feature)
1228
+
1229
+ # Check if feature_version and snapshot_version already exist in DataFrame
1230
+ has_feature_version = METAXY_FEATURE_VERSION in df.columns
1231
+ has_snapshot_version = METAXY_SNAPSHOT_VERSION in df.columns
1232
+
1233
+ # In suppression mode (migrations), use existing values as-is
1234
+ if (
1235
+ _suppress_feature_version_warning.get()
1236
+ and has_feature_version
1237
+ and has_snapshot_version
1238
+ ):
1239
+ pass # Use existing values for migrations
1240
+ else:
1241
+ # Drop any existing version columns (e.g., from SQLModel with null values)
1242
+ # and add current versions
1243
+ columns_to_drop = []
1244
+ if has_feature_version:
1245
+ columns_to_drop.append(METAXY_FEATURE_VERSION)
1246
+ if has_snapshot_version:
1247
+ columns_to_drop.append(METAXY_SNAPSHOT_VERSION)
1248
+ if columns_to_drop:
1249
+ df = df.drop(*columns_to_drop)
1250
+
1251
+ # Get current feature version and snapshot_version from code and add them
1252
+ # Use duck typing to avoid Ray serialization issues with issubclass
1253
+ if (
1254
+ isinstance(feature, type)
1255
+ and hasattr(feature, "feature_version")
1256
+ and callable(feature.feature_version)
1257
+ ):
1258
+ current_feature_version = feature.feature_version()
1259
+ else:
1260
+ from metaxy import get_feature_by_key
1261
+
1262
+ feature_cls = get_feature_by_key(feature_key)
1263
+ current_feature_version = feature_cls.feature_version()
1264
+
1265
+ # Get snapshot_version from active graph
1266
+ from metaxy.models.feature import FeatureGraph
1267
+
1268
+ graph = FeatureGraph.get_active()
1269
+ current_snapshot_version = graph.snapshot_version
1270
+
1271
+ df = df.with_columns(
1272
+ [
1273
+ nw.lit(current_feature_version).alias(METAXY_FEATURE_VERSION),
1274
+ nw.lit(current_snapshot_version).alias(METAXY_SNAPSHOT_VERSION),
1275
+ ]
1276
+ )
1277
+
1278
+ # These should normally be added by the provenance engine during resolve_update
1279
+ from metaxy.models.constants import (
1280
+ METAXY_CREATED_AT,
1281
+ METAXY_DATA_VERSION,
1282
+ METAXY_DATA_VERSION_BY_FIELD,
1283
+ )
1284
+
1285
+ if METAXY_PROVENANCE_BY_FIELD not in df.columns:
1286
+ raise ValueError(
1287
+ f"Metadata is missing a required column `{METAXY_PROVENANCE_BY_FIELD}`. It should have been created by a prior `MetadataStore.resolve_update` call. Did you drop it on the way?"
1288
+ )
1289
+
1290
+ if METAXY_PROVENANCE not in df.columns:
1291
+ plan = self._resolve_feature_plan(feature_key)
1292
+
1293
+ # Only warn for non-root features (features with dependencies).
1294
+ # Root features don't have upstream dependencies, so they don't go through
1295
+ # resolve_update() - they just need metaxy_provenance_by_field to be set.
1296
+ if plan.deps:
1297
+ MetaxyColumnMissingWarning.warn_on_missing_column(
1298
+ expected=METAXY_PROVENANCE,
1299
+ df=df,
1300
+ message=f"It should have been created by a prior `MetadataStore.resolve_update` call. Re-crearing it from `{METAXY_PROVENANCE_BY_FIELD}` Did you drop it on the way?",
1301
+ )
1302
+
1303
+ df = self.hash_struct_version_column(
1304
+ plan=plan,
1305
+ df=df,
1306
+ struct_column=METAXY_PROVENANCE_BY_FIELD,
1307
+ hash_column=METAXY_PROVENANCE,
1308
+ )
1309
+
1310
+ if METAXY_CREATED_AT not in df.columns:
1311
+ from datetime import datetime, timezone
1312
+
1313
+ df = df.with_columns(
1314
+ nw.lit(datetime.now(timezone.utc)).alias(METAXY_CREATED_AT)
1315
+ )
1316
+
1317
+ # Add materialization_id if not already present
1318
+ from metaxy.models.constants import METAXY_MATERIALIZATION_ID
1319
+
1320
+ df = df.with_columns(
1321
+ nw.lit(
1322
+ materialization_id or self._materialization_id, dtype=nw.String
1323
+ ).alias(METAXY_MATERIALIZATION_ID)
1324
+ )
1325
+
1326
+ # Check for missing data_version columns (should come from resolve_update but it's acceptable to just use provenance columns if they are missing)
1327
+
1328
+ if METAXY_DATA_VERSION_BY_FIELD not in df.columns:
1329
+ df = df.with_columns(
1330
+ nw.col(METAXY_PROVENANCE_BY_FIELD).alias(METAXY_DATA_VERSION_BY_FIELD)
1331
+ )
1332
+ df = df.with_columns(nw.col(METAXY_PROVENANCE).alias(METAXY_DATA_VERSION))
1333
+ elif METAXY_DATA_VERSION not in df.columns:
1334
+ df = self.hash_struct_version_column(
1335
+ plan=self._resolve_feature_plan(feature_key),
1336
+ df=df,
1337
+ struct_column=METAXY_DATA_VERSION_BY_FIELD,
1338
+ hash_column=METAXY_DATA_VERSION,
1339
+ )
1340
+
1341
+ # Cast system columns with Null dtype to their correct types
1342
+ # This handles edge cases where empty DataFrames or certain operations
1343
+ # result in Null-typed columns that break downstream processing
1344
+ df = _cast_present_system_columns(df)
1345
+
1346
+ return df
1347
+
1348
+ def _validate_schema(self, df: Frame) -> None:
1349
+ """
1350
+ Validate that DataFrame has required schema.
1351
+
1352
+ Args:
1353
+ df: Narwhals DataFrame or LazyFrame to validate
1354
+
1355
+ Raises:
1356
+ MetadataSchemaError: If schema is invalid
+        """
+        from metaxy.metadata_store.exceptions import MetadataSchemaError
+
+        schema = df.collect_schema()
+
+        # Check for the metaxy_provenance_by_field column
+        if METAXY_PROVENANCE_BY_FIELD not in schema.names():
+            raise MetadataSchemaError(
+                f"DataFrame must have '{METAXY_PROVENANCE_BY_FIELD}' column"
+            )
+
+        # Check that metaxy_provenance_by_field is a struct
+        provenance_dtype = schema[METAXY_PROVENANCE_BY_FIELD]
+        if not isinstance(provenance_dtype, nw.Struct):
+            raise MetadataSchemaError(
+                f"'{METAXY_PROVENANCE_BY_FIELD}' column must be a Struct, got {provenance_dtype}"
+            )
+
+        # Note: metaxy_provenance is auto-computed if missing, so we don't validate it here
+
+        # Check for the feature_version column
+        if METAXY_FEATURE_VERSION not in schema.names():
+            raise MetadataSchemaError(
+                f"DataFrame must have '{METAXY_FEATURE_VERSION}' column"
+            )
+
+        # Check for the snapshot_version column
+        if METAXY_SNAPSHOT_VERSION not in schema.names():
+            raise MetadataSchemaError(
+                f"DataFrame must have '{METAXY_SNAPSHOT_VERSION}' column"
+            )
+
+    def _validate_schema_system_table(self, df: Frame) -> None:
+        """Validate the schema for system tables (minimal validation).
+
+        Args:
+            df: Narwhals DataFrame to validate
+        """
+        # System tables don't need the metaxy_provenance_by_field column
+        pass
+
+    @abstractmethod
+    def _drop_feature_metadata_impl(self, feature_key: FeatureKey) -> None:
+        """Drop/delete all metadata for a feature.
+
+        Backend-specific implementation for dropping feature metadata.
+
+        Args:
+            feature_key: The feature key to drop metadata for
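+
+        Example:
+            A hypothetical subclass override (a sketch only; the `_conn`
+            attribute and one-table-per-feature layout are assumptions,
+            not metaxy API):
+
+            ```py
+            def _drop_feature_metadata_impl(self, feature_key: FeatureKey) -> None:
+                # Drop the feature's backing table, named after the feature key.
+                self._conn.execute(
+                    f'DROP TABLE IF EXISTS "{feature_key.to_string()}"'
+                )
+            ```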
+        """
+        pass
+
+    def drop_feature_metadata(self, feature: CoercibleToFeatureKey) -> None:
+        """Drop all metadata for a feature.
+
+        This removes all stored metadata for the specified feature from the store.
+        Useful for cleanup in tests or when re-computing feature metadata from scratch.
+
+        Warning:
+            This operation is irreversible and will **permanently delete all metadata** for the specified feature.
+
+        Args:
+            feature: Feature class or key to drop metadata for
+
+        Example:
+            ```py
+            store.drop_feature_metadata(MyFeature)
+            assert not store.has_feature(MyFeature)
+            ```
+        """
+        self._check_open()
+        feature_key = self._resolve_feature_key(feature)
+        self._drop_feature_metadata_impl(feature_key)
+
+    @abstractmethod
+    def read_metadata_in_store(
+        self,
+        feature: CoercibleToFeatureKey,
+        *,
+        filters: Sequence[nw.Expr] | None = None,
+        columns: Sequence[str] | None = None,
+        **kwargs: Any,
+    ) -> nw.LazyFrame[Any] | None:
+        """
+        Read metadata from THIS store only, without using any fallback stores.
+
+        Args:
+            feature: Feature to read metadata for
+            filters: List of Narwhals filter expressions for this specific feature.
+            columns: Subset of columns to return
+            **kwargs: Backend-specific parameters
+
+        Returns:
+            Narwhals LazyFrame with metadata, or None if the feature is not found in the store
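+
+        Example:
+            A minimal sketch (`MyFeature` stands in for any registered feature class):
+
+            ```py
+            lf = store.read_metadata_in_store(MyFeature, columns=["sample_uid"])
+            if lf is not None:
+                df = lf.collect()
+            ```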
+        """
+        pass
+
+    # ========== Feature Existence ==========
+
+    def has_feature(
+        self,
+        feature: CoercibleToFeatureKey,
+        *,
+        check_fallback: bool = False,
+    ) -> bool:
+        """
+        Check if a feature exists in the store.
+
+        Args:
+            feature: Feature to check
+            check_fallback: If True, also check fallback stores
+
+        Returns:
+            True if the feature exists, False otherwise
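+
+        Example:
+            ```py
+            # `MyFeature` stands in for any registered feature class.
+            if store.has_feature(MyFeature, check_fallback=True):
+                df = store.read_metadata(MyFeature).collect()
+            ```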
+        """
+        self._check_open()
+
+        if self.read_metadata_in_store(feature) is not None:
+            return True
+
+        if not check_fallback:
+            return self._has_feature_impl(feature)
+        else:
+            # Check fallback stores
+            for store in self.fallback_stores:
+                if store.has_feature(feature, check_fallback=True):
+                    return True
+
+        return False
+
+    @abstractmethod
+    def _has_feature_impl(self, feature: CoercibleToFeatureKey) -> bool:
+        """Backend implementation of `has_feature`.
+
+        Args:
+            feature: Feature to check
+
+        Returns:
+            True if the feature exists, False otherwise
+        """
+        pass
+
1498
+
1499
+ @abstractmethod
1500
+ def display(self) -> str:
1501
+ """Return a human-readable display string for this store.
1502
+
1503
+ Used in warnings, logs, and CLI output to identify the store.
1504
+
1505
+ Returns:
1506
+ Display string (e.g., "DuckDBMetadataStore(database=/path/to/db.duckdb)")
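+
+        Example:
+            ```py
+            print(store.display())  # e.g. "DuckDBMetadataStore(database=/path/to/db.duckdb)"
+            ```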
+        """
+        pass
+
+    def get_store_metadata(self, feature_key: CoercibleToFeatureKey) -> dict[str, Any]:
+        """Return arbitrary key-value pairs of useful metadata, such as the feature's path in storage.
+
+        Useful for logging purposes. This method should not expose sensitive information.
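+
+        Example:
+            A hypothetical backend override (a sketch; `_table_name` is
+            illustrative, not metaxy API):
+
+            ```py
+            def get_store_metadata(self, feature_key: CoercibleToFeatureKey) -> dict[str, Any]:
+                return {"table": self._table_name(feature_key)}
+            ```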
+        """
+        return {}
+
+    def copy_metadata(
+        self,
+        from_store: MetadataStore,
+        features: list[CoercibleToFeatureKey] | None = None,
+        *,
+        from_snapshot: str | None = None,
+        filters: Mapping[str, Sequence[nw.Expr]] | None = None,
+        incremental: bool = True,
+    ) -> dict[str, int]:
+        """Copy metadata from another store with fine-grained filtering.
+
+        This is a reusable method that can be called programmatically or from CLI/migrations.
+        Copies metadata for the specified features, preserving the original snapshot_version.
+
+        Args:
+            from_store: Source metadata store to copy from (must be opened)
+            features: List of features to copy. Can be:
+
+                - None: copies all features registered in the active feature graph (current project)
+                - List of FeatureKey or Feature classes: copies the specified features
+            from_snapshot: Snapshot version to filter source data by. If None, rows from all
+                snapshots are copied (no snapshot filter). Otherwise, only rows with this
+                snapshot_version will be copied. The snapshot_version is preserved in the
+                destination store.
+            filters: Dict mapping feature keys (as strings) to sequences of Narwhals filter expressions.
+                These filters are applied when reading from the source store.
+                Example: {"feature/key": [nw.col("x") > 10], "other/feature": [...]}
+            incremental: If True (default), filter out rows that already exist in the destination
+                store by performing an anti-join on sample_uid for the same snapshot_version.
+
+                The implementation uses an anti-join: source LEFT ANTI JOIN destination ON sample_uid,
+                filtered by snapshot_version.
+
+                Disabling incremental (incremental=False) may improve performance when:
+
+                - You know the destination is empty or has no overlap with the source
+                - The destination store uses deduplication
+
+                When incremental=False, it's the user's responsibility to avoid duplicates or
+                configure deduplication at the storage layer.
+
+        Returns:
+            Dict with statistics: {"features_copied": int, "rows_copied": int}
+
+        Raises:
+            ValueError: If from_store or self (the destination) is not open
+
+        Note:
+            Features missing from the source store are skipped with a warning rather than
+            raising `FeatureNotFoundError`.
+
+        Examples:
+            ```py
+            # Simple: copy all features (no snapshot filter)
+            stats = dest_store.copy_metadata(from_store=source_store)
+            ```
+
+            ```py
+            # Copy specific features from a specific snapshot
+            stats = dest_store.copy_metadata(
+                from_store=source_store,
+                features=[FeatureKey(["my_feature"])],
+                from_snapshot="abc123",
+            )
+            ```
+
+            ```py
+            # Copy with filters
+            stats = dest_store.copy_metadata(
+                from_store=source_store,
+                filters={"my/feature": [nw.col("sample_uid").is_in(["s1", "s2"])]},
+            )
+            ```
+
+            ```py
+            # Copy specific features with filters
+            stats = dest_store.copy_metadata(
+                from_store=source_store,
+                features=[
+                    FeatureKey(["feature_a"]),
+                    FeatureKey(["feature_b"]),
+                ],
+                filters={
+                    "feature_a": [nw.col("field_a") > 10, nw.col("sample_uid").is_in(["s1", "s2"])],
+                    "feature_b": [nw.col("field_b") < 30],
+                },
+            )
+            ```
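+
+            Skipping the incremental anti-join (safe only when the destination has no
+            overlap with the source, or deduplicates on write):
+
+            ```py
+            stats = dest_store.copy_metadata(from_store=source_store, incremental=False)
+            ```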
+        """
+        import logging
+
+        logger = logging.getLogger(__name__)
+
+        # Validate that the destination store is open
+        if not self._is_open:
+            raise ValueError(
+                'Destination store must be opened with store.open("write") before use'
+            )
+
+        # Auto-open the source store if it is not already open
+        if not from_store._is_open:
+            with from_store.open("read"):
+                return self._copy_metadata_impl(
+                    from_store=from_store,
+                    features=features,
+                    from_snapshot=from_snapshot,
+                    filters=filters,
+                    incremental=incremental,
+                    logger=logger,
+                )
+        else:
+            return self._copy_metadata_impl(
+                from_store=from_store,
+                features=features,
+                from_snapshot=from_snapshot,
+                filters=filters,
+                incremental=incremental,
+                logger=logger,
+            )
+
+    def _copy_metadata_impl(
+        self,
+        from_store: MetadataStore,
+        features: list[CoercibleToFeatureKey] | None,
+        from_snapshot: str | None,
+        filters: Mapping[str, Sequence[nw.Expr]] | None,
+        incremental: bool,
+        logger,
+    ) -> dict[str, int]:
+        """Internal implementation of `copy_metadata`."""
+        # Determine which features to copy
+        features_to_copy: list[FeatureKey]
+        if features is None:
+            # Copy all features from the active graph (features defined in the current project)
+            from metaxy.models.feature import FeatureGraph
+
+            graph = FeatureGraph.get_active()
+            features_to_copy = graph.list_features(only_current_project=True)
+            logger.info(
+                f"Copying all features from active graph: {len(features_to_copy)} features"
+            )
+        else:
+            # Convert all entries to FeatureKey using the adapter
+            features_to_copy = [self._resolve_feature_key(item) for item in features]
+            logger.info(f"Copying {len(features_to_copy)} specified features")
+
+        # Log snapshot usage
+        if from_snapshot is not None:
+            logger.info(f"Filtering by snapshot: {from_snapshot}")
+        else:
+            logger.info("Copying all data (no snapshot filter)")
+
+        # Copy metadata for each feature
+        total_rows = 0
+        features_copied = 0
+
+        with allow_feature_version_override():
+            for feature_key in features_to_copy:
+                try:
+                    # Read metadata from the source, filtering by from_snapshot.
+                    # Use current_only=False to avoid filtering by feature_version.
+                    source_lazy = from_store.read_metadata(
+                        feature_key,
+                        allow_fallback=False,
+                        current_only=False,
+                    )
+
+                    # Filter by from_snapshot if specified
+                    if from_snapshot is not None:
+                        source_filtered = source_lazy.filter(
+                            nw.col(METAXY_SNAPSHOT_VERSION) == from_snapshot
+                        )
+                    else:
+                        source_filtered = source_lazy
+
+                    # Apply filters for this feature (if any)
+                    if filters:
+                        feature_key_str = feature_key.to_string()
+                        if feature_key_str in filters:
+                            for filter_expr in filters[feature_key_str]:
+                                source_filtered = source_filtered.filter(filter_expr)
+
+                    # Apply incremental filtering if enabled
+                    if incremental:
+                        try:
+                            # Read existing sample_uids from the destination for the same
+                            # snapshot; this is much cheaper than comparing
+                            # metaxy_provenance_by_field structs
+                            dest_lazy = self.read_metadata(
+                                feature_key,
+                                allow_fallback=False,
+                                current_only=False,
+                            )
+                            # Filter the destination to the same snapshot_version (if specified)
+                            if from_snapshot is not None:
+                                dest_for_snapshot = dest_lazy.filter(
+                                    nw.col(METAXY_SNAPSHOT_VERSION) == from_snapshot
+                                )
+                            else:
+                                dest_for_snapshot = dest_lazy
+
+                            # Materialize destination sample_uids to avoid cross-backend
+                            # join issues: when copying between different stores (e.g.,
+                            # different DuckDB files), Ibis can't join tables from
+                            # different backends
+                            dest_sample_uids = (
+                                dest_for_snapshot.select("sample_uid")
+                                .collect()
+                                .to_polars()
+                            )
+
+                            # Convert to a Polars LazyFrame and wrap in Narwhals
+                            dest_sample_uids_lazy = nw.from_native(
+                                dest_sample_uids.lazy(), eager_only=False
+                            )
+
+                            # Collect the source to Polars for the anti-join
+                            source_df = source_filtered.collect().to_polars()
+                            source_lazy = nw.from_native(
+                                source_df.lazy(), eager_only=False
+                            )
+
+                            # Anti-join: keep only source rows whose sample_uid is not
+                            # already in the destination
+                            source_filtered = source_lazy.join(
+                                dest_sample_uids_lazy,
+                                on="sample_uid",
+                                how="anti",
+                            )
+
+                            # Collect after filtering
+                            source_df = source_filtered.collect().to_polars()
+
+                            logger.info(
+                                f"Incremental: copying only new sample_uids for {feature_key.to_string()}"
+                            )
+                        except FeatureNotFoundError:
+                            # Feature doesn't exist in the destination yet - copy all rows
+                            logger.debug(
+                                f"Feature {feature_key.to_string()} not in destination, copying all rows"
+                            )
+                            source_df = source_filtered.collect().to_polars()
+                        except Exception as e:
+                            # If the incremental check fails, log a warning but continue with a full copy
+                            logger.warning(
+                                f"Incremental check failed for {feature_key.to_string()}: {e}. Copying all rows."
+                            )
+                            source_df = source_filtered.collect().to_polars()
+                    else:
+                        # Non-incremental: collect all filtered rows
+                        source_df = source_filtered.collect().to_polars()
+
+                    if source_df.height == 0:
+                        logger.warning(
+                            f"No rows found for {feature_key.to_string()} with snapshot {from_snapshot}, skipping"
+                        )
+                        continue
+
+                    # Write to the destination (preserving snapshot_version and feature_version)
+                    self.write_metadata(feature_key, source_df)
+
+                    features_copied += 1
+                    total_rows += source_df.height
+                    logger.info(
+                        f"Copied {source_df.height} rows for {feature_key.to_string()}"
+                    )
+
+                except FeatureNotFoundError:
+                    logger.warning(
+                        f"Feature {feature_key.to_string()} not found in source store, skipping"
+                    )
+                    continue
+                except Exception as e:
+                    logger.error(
+                        f"Error copying {feature_key.to_string()}: {e}", exc_info=True
+                    )
+                    raise
+
+        logger.info(
+            f"Copy complete: {features_copied} features, {total_rows} total rows"
+        )
+
+        return {"features_copied": features_copied, "rows_copied": total_rows}