metaxy 0.0.1.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaxy/__init__.py +170 -0
- metaxy/_packaging.py +96 -0
- metaxy/_testing/__init__.py +55 -0
- metaxy/_testing/config.py +43 -0
- metaxy/_testing/metaxy_project.py +780 -0
- metaxy/_testing/models.py +111 -0
- metaxy/_testing/parametric/__init__.py +13 -0
- metaxy/_testing/parametric/metadata.py +664 -0
- metaxy/_testing/pytest_helpers.py +74 -0
- metaxy/_testing/runbook.py +533 -0
- metaxy/_utils.py +35 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +97 -0
- metaxy/cli/console.py +13 -0
- metaxy/cli/context.py +167 -0
- metaxy/cli/graph.py +610 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +46 -0
- metaxy/cli/metadata.py +317 -0
- metaxy/cli/migrations.py +999 -0
- metaxy/cli/utils.py +268 -0
- metaxy/config.py +680 -0
- metaxy/entrypoints.py +296 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/dagster/__init__.py +54 -0
- metaxy/ext/dagster/constants.py +10 -0
- metaxy/ext/dagster/dagster_type.py +156 -0
- metaxy/ext/dagster/io_manager.py +200 -0
- metaxy/ext/dagster/metaxify.py +512 -0
- metaxy/ext/dagster/observable.py +115 -0
- metaxy/ext/dagster/resources.py +27 -0
- metaxy/ext/dagster/selection.py +73 -0
- metaxy/ext/dagster/table_metadata.py +417 -0
- metaxy/ext/dagster/utils.py +462 -0
- metaxy/ext/sqlalchemy/__init__.py +23 -0
- metaxy/ext/sqlalchemy/config.py +29 -0
- metaxy/ext/sqlalchemy/plugin.py +353 -0
- metaxy/ext/sqlmodel/__init__.py +13 -0
- metaxy/ext/sqlmodel/config.py +29 -0
- metaxy/ext/sqlmodel/plugin.py +499 -0
- metaxy/graph/__init__.py +29 -0
- metaxy/graph/describe.py +325 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +446 -0
- metaxy/graph/diff/differ.py +769 -0
- metaxy/graph/diff/models.py +443 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +323 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +326 -0
- metaxy/graph/diff/rendering/rich.py +169 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/status.py +329 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +32 -0
- metaxy/metadata_store/_ducklake_support.py +419 -0
- metaxy/metadata_store/base.py +1792 -0
- metaxy/metadata_store/bigquery.py +354 -0
- metaxy/metadata_store/clickhouse.py +184 -0
- metaxy/metadata_store/delta.py +371 -0
- metaxy/metadata_store/duckdb.py +446 -0
- metaxy/metadata_store/exceptions.py +61 -0
- metaxy/metadata_store/ibis.py +542 -0
- metaxy/metadata_store/lancedb.py +391 -0
- metaxy/metadata_store/memory.py +292 -0
- metaxy/metadata_store/system/__init__.py +57 -0
- metaxy/metadata_store/system/events.py +264 -0
- metaxy/metadata_store/system/keys.py +9 -0
- metaxy/metadata_store/system/models.py +129 -0
- metaxy/metadata_store/system/storage.py +957 -0
- metaxy/metadata_store/types.py +10 -0
- metaxy/metadata_store/utils.py +104 -0
- metaxy/metadata_store/warnings.py +36 -0
- metaxy/migrations/__init__.py +32 -0
- metaxy/migrations/detector.py +291 -0
- metaxy/migrations/executor.py +516 -0
- metaxy/migrations/generator.py +319 -0
- metaxy/migrations/loader.py +231 -0
- metaxy/migrations/models.py +528 -0
- metaxy/migrations/ops.py +447 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +12 -0
- metaxy/models/constants.py +139 -0
- metaxy/models/feature.py +1335 -0
- metaxy/models/feature_spec.py +338 -0
- metaxy/models/field.py +263 -0
- metaxy/models/fields_mapping.py +307 -0
- metaxy/models/filter_expression.py +297 -0
- metaxy/models/lineage.py +285 -0
- metaxy/models/plan.py +232 -0
- metaxy/models/types.py +475 -0
- metaxy/py.typed +0 -0
- metaxy/utils/__init__.py +1 -0
- metaxy/utils/constants.py +2 -0
- metaxy/utils/exceptions.py +23 -0
- metaxy/utils/hashing.py +230 -0
- metaxy/versioning/__init__.py +31 -0
- metaxy/versioning/engine.py +656 -0
- metaxy/versioning/feature_dep_transformer.py +151 -0
- metaxy/versioning/ibis.py +249 -0
- metaxy/versioning/lineage_handler.py +205 -0
- metaxy/versioning/polars.py +189 -0
- metaxy/versioning/renamed_df.py +35 -0
- metaxy/versioning/types.py +63 -0
- metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
- metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
- metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
- metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,1792 @@
"""Abstract base class for metadata storage backends."""

from __future__ import annotations

from abc import ABC, abstractmethod
from collections.abc import Iterator, Mapping, Sequence
from contextlib import AbstractContextManager, contextmanager
from types import TracebackType
from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast, overload

import narwhals as nw
from narwhals.typing import Frame, IntoFrame
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing_extensions import Self

from metaxy._utils import switch_implementation_to_polars
from metaxy.config import MetaxyConfig
from metaxy.metadata_store.exceptions import (
    FeatureNotFoundError,
    StoreNotOpenError,
    SystemDataNotFoundError,
    VersioningEngineMismatchError,
)
from metaxy.metadata_store.system.keys import METAXY_SYSTEM_KEY_PREFIX
from metaxy.metadata_store.types import AccessMode
from metaxy.metadata_store.utils import (
    _suppress_feature_version_warning,
    allow_feature_version_override,
    empty_frame_like,
)
from metaxy.metadata_store.warnings import (
    MetaxyColumnMissingWarning,
    PolarsMaterializationWarning,
)
from metaxy.models.constants import (
    ALL_SYSTEM_COLUMNS,
    METAXY_CREATED_AT,
    METAXY_DATA_VERSION,
    METAXY_DATA_VERSION_BY_FIELD,
    METAXY_FEATURE_VERSION,
    METAXY_MATERIALIZATION_ID,
    METAXY_PROVENANCE,
    METAXY_PROVENANCE_BY_FIELD,
    METAXY_SNAPSHOT_VERSION,
)
from metaxy.models.feature import BaseFeature, FeatureGraph, current_graph
from metaxy.models.plan import FeaturePlan
from metaxy.models.types import (
    CoercibleToFeatureKey,
    FeatureKey,
    ValidatedFeatureKeyAdapter,
)
from metaxy.versioning import VersioningEngine
from metaxy.versioning.polars import PolarsVersioningEngine
from metaxy.versioning.types import HashAlgorithm, Increment, LazyIncrement

if TYPE_CHECKING:
    pass


# TypeVar for config types - used for typing from_config method
MetadataStoreConfigT = TypeVar("MetadataStoreConfigT", bound="MetadataStoreConfig")


class MetadataStoreConfig(BaseSettings):
    """Base configuration class for metadata stores.

    This class defines common configuration fields shared by all metadata store types.
    Store-specific config classes should inherit from this and add their own fields.

    Example:
        ```python
        from metaxy.metadata_store.duckdb import DuckDBMetadataStoreConfig

        config = DuckDBMetadataStoreConfig(
            database="metadata.db",
            hash_algorithm=HashAlgorithm.MD5,
        )

        store = DuckDBMetadataStore.from_config(config)
        ```
    """

    model_config = SettingsConfigDict(frozen=True, extra="forbid")

    fallback_stores: list[str] = Field(
        default_factory=list,
        description="List of fallback store names to search when features are not found in the current store.",
    )

    hash_algorithm: HashAlgorithm | None = Field(
        default=None,
        description="Hash algorithm for versioning. If None, uses store's default.",
    )

    versioning_engine: Literal["auto", "native", "polars"] = Field(
        default="auto",
        description="Which versioning engine to use: 'auto' (prefer native), 'native', or 'polars'.",
    )


VersioningEngineT = TypeVar("VersioningEngineT", bound=VersioningEngine)
VersioningEngineOptions = Literal["auto", "native", "polars"]

# Mapping of system columns to their expected Narwhals dtypes
# Used to cast Null-typed columns to correct types
# Note: Struct columns (METAXY_PROVENANCE_BY_FIELD, METAXY_DATA_VERSION_BY_FIELD) are not cast
_SYSTEM_COLUMN_DTYPES = {
    METAXY_PROVENANCE: nw.String,
    METAXY_FEATURE_VERSION: nw.String,
    METAXY_SNAPSHOT_VERSION: nw.String,
    METAXY_DATA_VERSION: nw.String,
    METAXY_CREATED_AT: nw.Datetime,
    METAXY_MATERIALIZATION_ID: nw.String,
}


def _cast_present_system_columns(
    df: nw.DataFrame[Any] | nw.LazyFrame[Any],
) -> nw.DataFrame[Any] | nw.LazyFrame[Any]:
    """Cast system columns with Null/Unknown dtype to their correct types.

    This handles edge cases where empty DataFrames or certain operations
    result in Null-typed columns (represented as nw.Unknown in Narwhals)
    that break downstream processing.

    Args:
        df: Narwhals DataFrame or LazyFrame

    Returns:
        DataFrame with system columns cast to correct types
    """
    schema = df.collect_schema()
    columns_to_cast = []

    for col_name, expected_dtype in _SYSTEM_COLUMN_DTYPES.items():
        if col_name in schema and schema[col_name] == nw.Unknown:
            columns_to_cast.append(nw.col(col_name).cast(expected_dtype))

    if columns_to_cast:
        df = df.with_columns(columns_to_cast)

    return df
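
# Illustrative sketch (not part of the module): how a Null-typed system column
# can arise from an all-null frame and be repaired by a cast like the one
# above. Assumes polars is installed; the literal column name mirrors the
# METAXY_FEATURE_VERSION constant.
#
#     import polars as pl
#     import narwhals as nw
#
#     raw = pl.DataFrame({"metaxy_feature_version": [None, None]})  # Null dtype
#     df = nw.from_native(raw)
#     df = df.with_columns(nw.col("metaxy_feature_version").cast(nw.String))
#     assert df.collect_schema()["metaxy_feature_version"] == nw.String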

class MetadataStore(ABC):
    """
    Abstract base class for metadata storage backends.
    """

    # Subclasses can override this to disable auto_create_tables warning
    # Set to False for stores where table creation is not applicable (e.g., InMemoryMetadataStore)
    _should_warn_auto_create_tables: bool = True

    def __init__(
        self,
        *,
        versioning_engine_cls: type[VersioningEngineT],
        hash_algorithm: HashAlgorithm | None = None,
        versioning_engine: VersioningEngineOptions = "auto",
        fallback_stores: list[MetadataStore] | None = None,
        auto_create_tables: bool | None = None,
        materialization_id: str | None = None,
    ):
        """
        Initialize the metadata store.

        Args:
            versioning_engine_cls: Versioning engine class used by this store's native backend.

            hash_algorithm: Hash algorithm to use for the versioning engine.

            versioning_engine: Which versioning engine to use.

                - "auto": Prefer the store's native engine, fall back to Polars if needed

                - "native": Always use the store's native engine, raise `VersioningEngineMismatchError`
                  if provided dataframes are incompatible

                - "polars": Always use the Polars engine

            fallback_stores: Ordered list of read-only fallback stores.
                Used when upstream features are not in this store.
                `VersioningEngineMismatchError` is not raised when reading from fallback stores.
            auto_create_tables: If True, automatically create tables when opening the store.
                If None (default), reads from global MetaxyConfig (which reads from METAXY_AUTO_CREATE_TABLES env var).
                If False, never auto-create tables.

                !!! warning
                    Auto-create is intended for development/testing only.
                    Use proper database migration tools like Alembic for production deployments.

            materialization_id: Optional external orchestration ID.
                If provided, all metadata writes will include this ID in the `metaxy_materialization_id` column.
                Can be overridden per [`MetadataStore.write_metadata`][metaxy.MetadataStore.write_metadata] call.

        Raises:
            ValueError: If fallback stores use different hash algorithms or truncation lengths
            VersioningEngineMismatchError: If a user-provided dataframe has a wrong implementation
                and versioning_engine is set to `native`
        """
        # Initialize state early so properties can check it
        self._is_open = False
        self._context_depth = 0
        self._versioning_engine = versioning_engine
        self._allow_cross_project_writes = False
        self._materialization_id = materialization_id
        self._open_cm: AbstractContextManager[Self] | None = (
            None  # Track the open() context manager
        )
        self.versioning_engine_cls = versioning_engine_cls

        # Resolve auto_create_tables from global config if not explicitly provided
        if auto_create_tables is None:
            from metaxy.config import MetaxyConfig

            self.auto_create_tables = MetaxyConfig.get().auto_create_tables
        else:
            self.auto_create_tables = auto_create_tables

        # Use store's default algorithm if not specified
        if hash_algorithm is None:
            hash_algorithm = self._get_default_hash_algorithm()

        self.hash_algorithm = hash_algorithm

        self.fallback_stores = fallback_stores or []

    @classmethod
    @abstractmethod
    def config_model(cls) -> type[MetadataStoreConfig]:
        """Return the configuration model class for this store type.

        Subclasses must override this to return their specific config class.

        Returns:
            The config class type (e.g., DuckDBMetadataStoreConfig)

        Note:
            Subclasses override this with a more specific return type.
            Type checkers may show a warning about incompatible override,
            but this is intentional - each store returns its own config type.
        """
        ...

    @classmethod
    def from_config(cls, config: MetadataStoreConfig, **kwargs: Any) -> Self:
        """Create a store instance from a configuration object.

        This method creates a store by:
        1. Converting the config to a dict
        2. Resolving fallback store names to actual store instances
        3. Calling the store's __init__ with the config parameters

        Args:
            config: Configuration object (should be the type returned by config_model())
            **kwargs: Additional arguments passed directly to the store constructor
                (e.g., materialization_id for runtime parameters not in config)

        Returns:
            A new store instance configured according to the config object

        Example:
            ```python
            from metaxy.metadata_store.duckdb import (
                DuckDBMetadataStore,
                DuckDBMetadataStoreConfig,
            )

            config = DuckDBMetadataStoreConfig(
                database="metadata.db",
                fallback_stores=["prod"],
            )

            store = DuckDBMetadataStore.from_config(config)
            ```
        """
        # Convert config to dict, excluding unset values
        config_dict = config.model_dump(exclude_unset=True)

        # Pop and resolve fallback store names to actual store instances
        fallback_store_names = config_dict.pop("fallback_stores", [])
        fallback_stores = [
            MetaxyConfig.get().get_store(name) for name in fallback_store_names
        ]

        # Create store with resolved fallback stores, config, and extra kwargs
        return cls(fallback_stores=fallback_stores, **config_dict, **kwargs)

    @property
    def hash_truncation_length(self) -> int:
        return MetaxyConfig.get().hash_truncation_length or 64

    @property
    def materialization_id(self) -> str | None:
        """The external orchestration ID for this store instance.

        If set, all metadata writes include this ID in the `metaxy_materialization_id` column,
        allowing filtering of rows written during a specific materialization run.
        """
        return self._materialization_id

    @overload
    def resolve_update(
        self,
        feature: type[BaseFeature],
        *,
        samples: IntoFrame | Frame | None = None,
        filters: Mapping[CoercibleToFeatureKey, Sequence[nw.Expr]] | None = None,
        global_filters: Sequence[nw.Expr] | None = None,
        lazy: Literal[False] = False,
        versioning_engine: Literal["auto", "native", "polars"] | None = None,
        skip_comparison: bool = False,
        **kwargs: Any,
    ) -> Increment: ...

    @overload
    def resolve_update(
        self,
        feature: type[BaseFeature],
        *,
        samples: IntoFrame | Frame | None = None,
        filters: Mapping[CoercibleToFeatureKey, Sequence[nw.Expr]] | None = None,
        global_filters: Sequence[nw.Expr] | None = None,
        lazy: Literal[True],
        versioning_engine: Literal["auto", "native", "polars"] | None = None,
        skip_comparison: bool = False,
        **kwargs: Any,
    ) -> LazyIncrement: ...

    def resolve_update(
        self,
        feature: type[BaseFeature],
        *,
        samples: IntoFrame | Frame | None = None,
        filters: Mapping[CoercibleToFeatureKey, Sequence[nw.Expr]] | None = None,
        global_filters: Sequence[nw.Expr] | None = None,
        lazy: bool = False,
        versioning_engine: Literal["auto", "native", "polars"] | None = None,
        skip_comparison: bool = False,
        **kwargs: Any,
    ) -> Increment | LazyIncrement:
        """Calculate an incremental update for a feature.

        This is the main workhorse in Metaxy.

        Args:
            feature: Feature class to resolve updates for
            samples: A dataframe with joined upstream metadata and `"metaxy_provenance_by_field"` column set.
                When provided, `MetadataStore` skips loading upstream feature metadata and provenance calculations.

                !!! info "Required for root features"
                    Metaxy doesn't know how to populate input metadata for root features,
                    so the `samples` argument **must** be provided for them.

                !!! tip
                    For non-root features, use `samples` to customize the automatic upstream loading and field provenance calculation.
                    For example, it can be used to request processing for specific sample IDs.

                Setting this parameter during normal operations is not required.

            filters: A mapping from feature keys to lists of Narwhals filter expressions.
                Keys can be feature classes, FeatureKey objects, or string paths.
                Applied at read-time. May filter the current feature;
                in this case it will also be applied to `samples` (if provided).
                Example: `{UpstreamFeature: [nw.col("x") > 10], ...}`
            global_filters: A list of Narwhals filter expressions applied to all features.
                These filters are combined with any feature-specific filters from `filters`.
                Useful for filtering by common columns like `sample_uid` across all features.
                Example: `[nw.col("sample_uid").is_in(["s1", "s2"])]`
            lazy: Whether to return a [metaxy.versioning.types.LazyIncrement][] or a [metaxy.versioning.types.Increment][].
            versioning_engine: Override the store's versioning engine for this operation.
            skip_comparison: If True, skip the increment comparison logic and return all
                upstream samples in `Increment.added`. The `changed` and `removed` frames will
                be empty.

        Raises:
            ValueError: If no `samples` dataframe has been provided when resolving an update for a root feature.
            VersioningEngineMismatchError: If `versioning_engine` has been set to `"native"`
                and a dataframe of a different implementation has been encountered during `resolve_update`.

        !!! example "With a root feature"

            ```py
            samples = pl.DataFrame({
                "sample_uid": [1, 2, 3],
                "metaxy_provenance_by_field": [{"field": "h1"}, {"field": "h2"}, {"field": "h3"}],
            })
            result = store.resolve_update(RootFeature, samples=nw.from_native(samples))
            ```
        """
        import narwhals as nw

        # Convert samples to Narwhals frame if not already
        samples_nw: nw.DataFrame[Any] | nw.LazyFrame[Any] | None = None
        if samples is not None:
            if isinstance(samples, (nw.DataFrame, nw.LazyFrame)):
                samples_nw = samples
            else:
                samples_nw = nw.from_native(samples)

        # Normalize filter keys to FeatureKey
        normalized_filters: dict[FeatureKey, list[nw.Expr]] = {}
        if filters:
            for key, exprs in filters.items():
                feature_key = self._resolve_feature_key(key)
                normalized_filters[feature_key] = list(exprs)

        # Convert global_filters to a list for easy concatenation
        global_filter_list = list(global_filters) if global_filters else []

        graph = current_graph()
        plan = graph.get_feature_plan(feature.spec().key)

        # Root features without samples: error (samples required)
        if not plan.deps and samples_nw is None:
            raise ValueError(
                f"Feature {feature.spec().key} has no upstream dependencies (root feature). "
                f"Must provide 'samples' parameter with sample_uid and {METAXY_PROVENANCE_BY_FIELD} columns. "
                f"Root features require manual {METAXY_PROVENANCE_BY_FIELD} computation."
            )

        # Combine feature-specific filters with global filters
        current_feature_filters = [
            *normalized_filters.get(feature.spec().key, []),
            *global_filter_list,
        ]

        current_metadata = self.read_metadata_in_store(
            feature,
            filters=[
                nw.col(METAXY_FEATURE_VERSION)
                == graph.get_feature_version(feature.spec().key),
                *current_feature_filters,
            ],
        )

        upstream_by_key: dict[FeatureKey, nw.LazyFrame[Any]] = {}
        filters_by_key: dict[FeatureKey, list[nw.Expr]] = {}

        # if samples are provided, use them as source of truth for upstream data
        if samples_nw is not None:
            # Apply filters to samples if any
            filtered_samples = samples_nw
            if current_feature_filters:
                filtered_samples = samples_nw.filter(current_feature_filters)

            # fill in METAXY_PROVENANCE column if it's missing (e.g. for root features)
            samples_nw = self.hash_struct_version_column(
                plan,
                df=filtered_samples,
                struct_column=METAXY_PROVENANCE_BY_FIELD,
                hash_column=METAXY_PROVENANCE,
            )

            # For root features, add data_version columns if they don't exist
            # (root features have no computation, so data_version equals provenance)
            if METAXY_DATA_VERSION_BY_FIELD not in samples_nw.columns:
                samples_nw = samples_nw.with_columns(
                    nw.col(METAXY_PROVENANCE_BY_FIELD).alias(
                        METAXY_DATA_VERSION_BY_FIELD
                    ),
                    nw.col(METAXY_PROVENANCE).alias(METAXY_DATA_VERSION),
                )
        else:
            for upstream_spec in plan.deps or []:
                # Combine feature-specific filters with global filters for upstream
                upstream_filters = [
                    *normalized_filters.get(upstream_spec.key, []),
                    *global_filter_list,
                ]
                upstream_feature_metadata = self.read_metadata(
                    upstream_spec.key,
                    filters=upstream_filters,
                )
                if upstream_feature_metadata is not None:
                    upstream_by_key[upstream_spec.key] = upstream_feature_metadata

        # determine which implementation to use for resolving the increment
        # consider (1) whether all upstream metadata has been loaded with the native implementation
        # (2) if samples have native implementation

        # Use parameter if provided, otherwise use store default
        engine_mode = (
            versioning_engine
            if versioning_engine is not None
            else self._versioning_engine
        )

        # If "polars" mode, force Polars immediately
        if engine_mode == "polars":
            implementation = nw.Implementation.POLARS
            switched_to_polars = True
        else:
            implementation = self.native_implementation()
            switched_to_polars = False

        for upstream_key, df in upstream_by_key.items():
            if df.implementation != implementation:
                switched_to_polars = True
                # Only raise error in "native" mode if no fallback stores configured.
                # If fallback stores exist, the implementation mismatch indicates data came
                # from fallback (different implementation), which is legitimate fallback access.
                # If data were local, it would have the native implementation.
                if engine_mode == "native" and not self.fallback_stores:
                    raise VersioningEngineMismatchError(
                        f"versioning_engine='native' but upstream feature `{upstream_key.to_string()}` "
                        f"has implementation {df.implementation}, expected {self.native_implementation()}"
                    )
                elif engine_mode == "auto" or (
                    engine_mode == "native" and self.fallback_stores
                ):
                    PolarsMaterializationWarning.warn_on_implementation_mismatch(
                        expected=self.native_implementation(),
                        actual=df.implementation,
                        message=f"Using Polars for resolving the increment instead. This was caused by upstream feature `{upstream_key.to_string()}`.",
                    )
                    implementation = nw.Implementation.POLARS
                    break

        if (
            samples_nw is not None
            and samples_nw.implementation != self.native_implementation()
        ):
            if not switched_to_polars:
                if engine_mode == "native":
                    # Always raise error for samples with wrong implementation, regardless
                    # of fallback stores, because samples come from user argument, not from fallback
                    raise VersioningEngineMismatchError(
                        f"versioning_engine='native' but provided `samples` have implementation {samples_nw.implementation}, "
                        f"expected {self.native_implementation()}"
                    )
                elif engine_mode == "auto":
                    PolarsMaterializationWarning.warn_on_implementation_mismatch(
                        expected=self.native_implementation(),
                        actual=samples_nw.implementation,
                        message=f"Provided `samples` have implementation {samples_nw.implementation}. Using Polars for resolving the increment instead.",
                    )
                    implementation = nw.Implementation.POLARS
                    switched_to_polars = True

        if switched_to_polars:
            if current_metadata:
                current_metadata = switch_implementation_to_polars(current_metadata)
            if samples_nw:
                samples_nw = switch_implementation_to_polars(samples_nw)
            for upstream_key, df in upstream_by_key.items():
                upstream_by_key[upstream_key] = switch_implementation_to_polars(df)

        with self.create_versioning_engine(
            plan=plan, implementation=implementation
        ) as engine:
            if skip_comparison:
                # Skip comparison: return all upstream samples as added
                if samples_nw is not None:
                    # Root features or user-provided samples: use samples directly
                    # Note: samples already has metaxy_provenance computed
                    added = samples_nw.lazy()
                else:
                    # Non-root features: load all upstream with provenance
                    added = engine.load_upstream_with_provenance(
                        upstream=upstream_by_key,
                        hash_algo=self.hash_algorithm,
                        filters=filters_by_key,
                    )
                changed = None
                removed = None
            else:
                added, changed, removed = engine.resolve_increment_with_provenance(
                    current=current_metadata,
                    upstream=upstream_by_key,
                    hash_algorithm=self.hash_algorithm,
                    filters=filters_by_key,
                    sample=samples_nw.lazy() if samples_nw is not None else None,
                )

            # Convert None to empty DataFrames
            if changed is None:
                changed = empty_frame_like(added)
            if removed is None:
                removed = empty_frame_like(added)

            if lazy:
                return LazyIncrement(
                    added=added
                    if isinstance(added, nw.LazyFrame)
                    else nw.from_native(added),
                    changed=changed
                    if isinstance(changed, nw.LazyFrame)
                    else nw.from_native(changed),
                    removed=removed
                    if isinstance(removed, nw.LazyFrame)
                    else nw.from_native(removed),
                )
            else:
                return Increment(
                    added=added.collect() if isinstance(added, nw.LazyFrame) else added,
                    changed=changed.collect()
                    if isinstance(changed, nw.LazyFrame)
                    else changed,
                    removed=removed.collect()
                    if isinstance(removed, nw.LazyFrame)
                    else removed,
                )
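
    # Illustrative usage sketch (not part of the module): resolving an update
    # for a non-root feature with per-feature and global filters. The feature
    # classes and column names below are hypothetical.
    #
    #     import narwhals as nw
    #
    #     with store:
    #         increment = store.resolve_update(
    #             ChildFeature,
    #             filters={UpstreamFeature: [nw.col("x") > 10]},
    #             global_filters=[nw.col("sample_uid").is_in(["s1", "s2"])],
    #             lazy=True,
    #         )
    #         new_rows = increment.added.collect()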

    def read_metadata(
        self,
        feature: CoercibleToFeatureKey,
        *,
        feature_version: str | None = None,
        filters: Sequence[nw.Expr] | None = None,
        columns: Sequence[str] | None = None,
        allow_fallback: bool = True,
        current_only: bool = True,
        latest_only: bool = True,
    ) -> nw.LazyFrame[Any]:
        """
        Read metadata with optional fallback to upstream stores.

        Args:
            feature: Feature to read metadata for
            feature_version: Explicit feature_version to filter by (mutually exclusive with current_only=True)
            filters: Sequence of Narwhals filter expressions to apply to this feature.
                Example: `[nw.col("x") > 10, nw.col("y") < 5]`
            columns: Subset of columns to include. Metaxy's system columns are always included.
            allow_fallback: If `True`, check fallback stores on local miss
            current_only: If `True`, only return rows with current feature_version
            latest_only: Whether to deduplicate samples within `id_columns` groups ordered by `metaxy_created_at`.

        Returns:
            Narwhals LazyFrame with metadata

        Raises:
            FeatureNotFoundError: If feature not found in any store
            SystemDataNotFoundError: When attempting to read non-existent Metaxy system data
            ValueError: If both feature_version and current_only=True are provided

        !!! info
            When this method is called with default arguments, it will return the latest (by `metaxy_created_at`)
            metadata for the current feature version. Therefore, it's perfectly suitable for most use cases.

        !!! warning
            The order of rows is not guaranteed.
        """
        filters = filters or []
        columns = columns or []

        feature_key = self._resolve_feature_key(feature)
        is_system_table = self._is_system_table(feature_key)

        # Validate mutually exclusive parameters
        if feature_version is not None and current_only:
            raise ValueError(
                "Cannot specify both feature_version and current_only=True. "
                "Use current_only=False with feature_version parameter."
            )

        # Add feature_version filter only when needed (never for system tables)
        if (current_only or feature_version is not None) and not is_system_table:
            version_filter = nw.col(METAXY_FEATURE_VERSION) == (
                current_graph().get_feature_version(feature_key)
                if current_only
                else feature_version
            )
            filters = [version_filter, *filters]

        if columns and not is_system_table:
            # Add only system columns that aren't already in the user's columns list
            columns_set = set(columns)
            missing_system_cols = [
                c for c in ALL_SYSTEM_COLUMNS if c not in columns_set
            ]
            read_columns = [*columns, *missing_system_cols]
        else:
            read_columns = None

        lazy_frame = None
        try:
            lazy_frame = self.read_metadata_in_store(
                feature, filters=filters, columns=read_columns
            )
        except FeatureNotFoundError as e:
            # do not read system features from fallback stores
            if is_system_table:
                raise SystemDataNotFoundError(
                    f"System Metaxy data with key {feature_key} is missing in {self.display()}. Invoke `metaxy graph push` before attempting to read system data."
                ) from e

        # Handle case where read_metadata_in_store returns None (no exception raised)
        if lazy_frame is None and is_system_table:
            raise SystemDataNotFoundError(
                f"System Metaxy data with key {feature_key} is missing in {self.display()}. Invoke `metaxy graph push` before attempting to read system data."
            )

        if lazy_frame is not None and not is_system_table and latest_only:
            from metaxy.models.constants import METAXY_CREATED_AT

            # Apply deduplication
            lazy_frame = self.versioning_engine_cls.keep_latest_by_group(
                df=lazy_frame,
                group_columns=list(
                    self._resolve_feature_plan(feature_key).feature.id_columns
                ),
                timestamp_column=METAXY_CREATED_AT,
            )

        if lazy_frame is not None:
            # After dedup, filter to requested columns if specified
            if columns:
                lazy_frame = lazy_frame.select(columns)

            return lazy_frame

        # Try fallback stores
        if allow_fallback:
            for store in self.fallback_stores:
                try:
                    # Use full read_metadata to handle nested fallback chains
                    return store.read_metadata(
                        feature,
                        feature_version=feature_version,
                        filters=filters,
                        columns=columns,
                        allow_fallback=True,
                        current_only=current_only,
                        latest_only=latest_only,
                    )
                except FeatureNotFoundError:
                    # Try next fallback store
                    continue

        # Not found anywhere
        raise FeatureNotFoundError(
            f"Feature {feature_key.to_string()} not found in store"
            + (" or fallback stores" if allow_fallback else "")
        )
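
    # Illustrative usage sketch (not part of the module): reading the latest
    # metadata for the current feature version, restricted to a few columns.
    # `MyFeature` and the column names are hypothetical.
    #
    #     import narwhals as nw
    #
    #     with store:
    #         lf = store.read_metadata(
    #             MyFeature,
    #             filters=[nw.col("sample_uid").is_in(["s1", "s2"])],
    #             columns=["sample_uid", "path"],
    #         )
    #         df = lf.collect()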

    def write_metadata(
        self,
        feature: CoercibleToFeatureKey,
        df: IntoFrame,
        materialization_id: str | None = None,
    ) -> None:
        """
        Write metadata for a feature (append-only by design).

        Automatically adds the Metaxy system columns, unless they already exist in the DataFrame.

        Args:
            feature: Feature to write metadata for
            df: Metadata DataFrame of any type supported by [Narwhals](https://narwhals-dev.github.io/narwhals/).
                Must have `metaxy_provenance_by_field` column of type Struct with fields matching feature's fields.
                Optionally, may also contain `metaxy_data_version_by_field`.
            materialization_id: Optional external orchestration ID for this write.
                Overrides the store's default `materialization_id` if provided.
                Useful for tracking which orchestration run produced this metadata.

        Raises:
            MetadataSchemaError: If DataFrame schema is invalid
            StoreNotOpenError: If store is not open
            ValueError: If writing to a feature from a different project than expected

        Note:
            - Must be called within a `MetadataStore.open(mode="write")` context manager.

            - Metaxy always performs an "append" operation. Metadata is never deleted or mutated.

            - Fallback stores are never used for writes.

            - Features from other Metaxy projects cannot be written to, unless project validation has been disabled with [MetadataStore.allow_cross_project_writes][].

        """
        self._check_open()

        feature_key = self._resolve_feature_key(feature)
        is_system_table = self._is_system_table(feature_key)

        # Validate project for non-system tables
        if not is_system_table:
            self._validate_project_write(feature)

        # Convert the input DataFrame to Narwhals
        df_nw = nw.from_native(df)

        assert isinstance(df_nw, nw.DataFrame), "df must be a Narwhals DataFrame"

        # For system tables, write directly without feature_version tracking
        if is_system_table:
            self._validate_schema_system_table(df_nw)
            self.write_metadata_to_store(feature_key, df_nw)
            return

        if METAXY_PROVENANCE_BY_FIELD not in df_nw.columns:
            from metaxy.metadata_store.exceptions import MetadataSchemaError

            raise MetadataSchemaError(
                f"DataFrame must have '{METAXY_PROVENANCE_BY_FIELD}' column"
            )

        # Add all required system columns
        # warning: for dataframes that do not match the native MetadataStore implementation
        # and are missing the METAXY_DATA_VERSION column, this call will lead to materializing the equivalent Polars DataFrame
        # while calculating the missing METAXY_DATA_VERSION column
        df_nw = self._add_system_columns(
            df_nw, feature, materialization_id=materialization_id
        )

        self._validate_schema(df_nw)
        self.write_metadata_to_store(feature_key, df_nw)
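
    # Illustrative usage sketch (not part of the module): writing metadata for
    # a root feature. The struct fields and feature class are hypothetical;
    # `metaxy_provenance_by_field` is required, as documented above.
    #
    #     import polars as pl
    #
    #     df = pl.DataFrame({
    #         "sample_uid": [1, 2],
    #         "metaxy_provenance_by_field": [{"field": "h1"}, {"field": "h2"}],
    #     })
    #     with store.open(mode="write"):
    #         store.write_metadata(RootFeature, df, materialization_id="run-42")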

    def write_metadata_multi(
        self,
        metadata: Mapping[Any, IntoFrame],
        materialization_id: str | None = None,
    ) -> None:
        """
        Write metadata for multiple features in reverse topological order.

        Processes features so that dependents are written before their dependencies.
        This ordering ensures that downstream features are written first, which can
        be useful for certain data consistency requirements or when features need
        to be processed in a specific order.

        Args:
            metadata: Mapping from feature keys to metadata DataFrames.
                Keys can be any type coercible to FeatureKey (string, sequence,
                FeatureKey, or BaseFeature class). Values must be DataFrames
                compatible with Narwhals, containing required system columns.
            materialization_id: Optional external orchestration ID for all writes.
                Overrides the store's default `materialization_id` if provided.
                Applied to all feature writes in this batch.

        Raises:
            MetadataSchemaError: If any DataFrame schema is invalid
            StoreNotOpenError: If store is not open
            ValueError: If writing to a feature from a different project than expected

        Note:
            - Must be called within a `MetadataStore.open(mode="write")` context manager.
            - Empty mappings are handled gracefully (no-op).
            - Each feature's metadata is written via `write_metadata`, so all
              validation and system column handling from that method applies.

        Example:
            ```py
            with store.open(mode="write"):
                store.write_metadata_multi({
                    ChildFeature: child_df,
                    ParentFeature: parent_df,
                })
            # Features are written in reverse topological order:
            # ChildFeature first, then ParentFeature
            ```
        """
        if not metadata:
            return

        # Build mapping from resolved keys to dataframes in one pass
        resolved_metadata = {
            self._resolve_feature_key(key): df for key, df in metadata.items()
        }

        # Get reverse topological order (dependents first)
        graph = current_graph()
        sorted_keys = graph.topological_sort_features(
            list(resolved_metadata.keys()), descending=True
        )

        # Write metadata in reverse topological order
        for feature_key in sorted_keys:
            self.write_metadata(
                feature_key,
                resolved_metadata[feature_key],
                materialization_id=materialization_id,
            )

    @abstractmethod
    def _get_default_hash_algorithm(self) -> HashAlgorithm:
        """Get the default hash algorithm for this store type.

        Returns:
            Default hash algorithm
        """
        pass

    def native_implementation(self) -> nw.Implementation:
        """Get the native Narwhals implementation for this store's backend."""
        return self.versioning_engine_cls.implementation()

    @abstractmethod
    @contextmanager
    def _create_versioning_engine(
        self, plan: FeaturePlan
    ) -> Iterator[VersioningEngineT]:
        """Create provenance engine for this store as a context manager.

        Args:
            plan: Feature plan for the feature we're tracking provenance for

        Yields:
            VersioningEngine instance appropriate for this store's backend.

            - For SQL stores (DuckDB, ClickHouse): Returns IbisVersioningEngine
            - For in-memory/Polars stores: Returns PolarsVersioningEngine

        Raises:
            NotImplementedError: If provenance tracking not supported by this store

        Example:
            ```python
            with self._create_versioning_engine(plan) as engine:
                result = engine.resolve_update(...)
            ```
        """
        ...

    @contextmanager
    def _create_polars_versioning_engine(
        self, plan: FeaturePlan
    ) -> Iterator[PolarsVersioningEngine]:
        yield PolarsVersioningEngine(plan=plan)

    @contextmanager
    def create_versioning_engine(
        self, plan: FeaturePlan, implementation: nw.Implementation
    ) -> Iterator[VersioningEngine | PolarsVersioningEngine]:
        """
        Creates an appropriate provenance engine.

        Falls back to the Polars implementation if the required implementation differs from the store's native implementation.

        Args:
            plan: The feature plan.
            implementation: The desired engine implementation.

        Yields:
            An appropriate provenance engine.
        """

        if implementation == nw.Implementation.POLARS:
            cm = self._create_polars_versioning_engine(plan)
        elif implementation == self.native_implementation():
            cm = self._create_versioning_engine(plan)
        else:
            cm = self._create_polars_versioning_engine(plan)

        with cm as engine:
            yield engine

    def hash_struct_version_column(
        self,
        plan: FeaturePlan,
        df: Frame,
        struct_column: str,
        hash_column: str,
    ) -> Frame:
        with self.create_versioning_engine(plan, df.implementation) as engine:
            if (
                isinstance(engine, PolarsVersioningEngine)
                and df.implementation != nw.Implementation.POLARS
            ):
                PolarsMaterializationWarning.warn_on_implementation_mismatch(
                    self.native_implementation(),
                    df.implementation,
                    message=f"`{hash_column}` will be calculated in Polars.",
                )
                df = nw.from_native(df.lazy().collect().to_polars())

            return cast(
                Frame,
                engine.hash_struct_version_column(
                    df,  # pyright: ignore[reportArgumentType]
                    hash_algorithm=self.hash_algorithm,
                    struct_column=struct_column,
                    hash_column=hash_column,
                ),
            )

    @abstractmethod
    @contextmanager
    def open(self, mode: AccessMode = "read") -> Iterator[Self]:
        """Open/initialize the store for operations.

        Context manager that opens the store with specified access mode.
        Called internally by `__enter__`.
        Child classes should implement backend-specific connection setup/teardown here.

        Args:
            mode: Access mode for this connection session.

        Yields:
            Self: The store instance with connection open

        Note:
            Users should prefer the `with store:` pattern except when write access mode is needed.
        """
        ...
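
    # Illustrative usage sketch (not part of the module): read-only access via
    # the plain context manager vs. explicit write access via open(). `MyFeature`
    # and `df` are hypothetical.
    #
    #     with store:  # read mode (or write mode if auto_create_tables is set)
    #         lf = store.read_metadata(MyFeature)
    #
    #     with store.open(mode="write"):
    #         store.write_metadata(MyFeature, df)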
|
|
997
|
+
|
|
998
|
+
def __enter__(self) -> Self:
|
|
999
|
+
"""Enter context manager - opens store in READ mode by default.
|
|
1000
|
+
|
|
1001
|
+
Use [`MetadataStore.open`][metaxy.metadata_store.base.MetadataStore.open] for write access mode instead.
|
|
1002
|
+
|
|
1003
|
+
Returns:
|
|
1004
|
+
Self: The opened store instance
|
|
1005
|
+
"""
|
|
1006
|
+
# Determine mode based on auto_create_tables
|
|
1007
|
+
mode = "write" if self.auto_create_tables else "read"
|
|
1008
|
+
|
|
1009
|
+
# Open the store (open() manages _context_depth internally)
|
|
1010
|
+
self._open_cm = self.open(mode)
|
|
1011
|
+
self._open_cm.__enter__()
|
|
1012
|
+
|
|
1013
|
+
return self
|
|
1014
|
+
|
|
1015
|
+
def _validate_after_open(self) -> None:
|
|
1016
|
+
"""Validate configuration after store is opened.
|
|
1017
|
+
|
|
1018
|
+
Called automatically by __enter__ after open().
|
|
1019
|
+
Validates hash algorithm compatibility and fallback store consistency.
|
|
1020
|
+
"""
|
|
1021
|
+
# Validate hash algorithm compatibility with components
|
|
1022
|
+
self.validate_hash_algorithm(check_fallback_stores=True)
|
|
1023
|
+
|
|
1024
|
+
# Validate fallback stores use the same hash algorithm
|
|
1025
|
+
for i, fallback_store in enumerate(self.fallback_stores):
|
|
1026
|
+
if fallback_store.hash_algorithm != self.hash_algorithm:
|
|
1027
|
+
raise ValueError(
|
|
1028
|
+
f"Fallback store {i} uses hash_algorithm='{fallback_store.hash_algorithm.value}' "
|
|
1029
|
+
f"but this store uses '{self.hash_algorithm.value}'. "
|
|
1030
|
+
f"All stores in a fallback chain must use the same hash algorithm."
|
|
1031
|
+
)
|
|
1032
|
+
|
|
1033
|
+
def __exit__(
|
|
1034
|
+
self,
|
|
1035
|
+
exc_type: type[BaseException] | None,
|
|
1036
|
+
exc_val: BaseException | None,
|
|
1037
|
+
exc_tb: TracebackType | None,
|
|
1038
|
+
) -> None:
|
|
1039
|
+
# Delegate to open()'s context manager (which manages _context_depth)
|
|
1040
|
+
if self._open_cm is not None:
|
|
1041
|
+
self._open_cm.__exit__(exc_type, exc_val, exc_tb)
|
|
1042
|
+
self._open_cm = None
|
|
1043
|
+
|
|
1044
|
+
def _check_open(self) -> None:
|
|
1045
|
+
"""Check if store is open, raise error if not.
|
|
1046
|
+
|
|
1047
|
+
Raises:
|
|
1048
|
+
StoreNotOpenError: If store is not open
|
|
1049
|
+
"""
|
|
1050
|
+
if not self._is_open:
|
|
1051
|
+
raise StoreNotOpenError(
|
|
1052
|
+
f"{self.__class__.__name__} must be opened before use. "
|
|
1053
|
+
'Use it as a context manager: `with store: ...` or `with store.open(mode="write"): ...`'
|
|
1054
|
+
)
|
|
1055
|
+
|
|
1056
|
+
# ========== Hash Algorithm Validation ==========
|
|
1057
|
+
|
|
1058
|
+
def validate_hash_algorithm(
|
|
1059
|
+
self,
|
|
1060
|
+
check_fallback_stores: bool = True,
|
|
1061
|
+
) -> None:
|
|
1062
|
+
"""Validate that hash algorithm is supported by this store's components.
|
|
1063
|
+
|
|
1064
|
+
Public method - can be called to verify hash compatibility.
|
|
1065
|
+
|
|
1066
|
+
Args:
|
|
1067
|
+
check_fallback_stores: If True, also validate hash is supported by
|
|
1068
|
+
fallback stores (ensures compatibility for future cross-store operations)
|
|
1069
|
+
|
|
1070
|
+
Raises:
|
|
1071
|
+
ValueError: If hash algorithm not supported by components or fallback stores
|
|
1072
|
+
"""
|
|
1073
|
+
# Validate hash algorithm support without creating a full engine
|
|
1074
|
+
# (engine creation requires a graph which isn't available during store init)
|
|
1075
|
+
self._validate_hash_algorithm_support()
|
|
1076
|
+
|
|
1077
|
+
# Check fallback stores
|
|
1078
|
+
if check_fallback_stores:
|
|
1079
|
+
for fallback in self.fallback_stores:
|
|
1080
|
+
fallback.validate_hash_algorithm(check_fallback_stores=False)
|
|
1081
|
+
|
|
1082
|
+
def _validate_hash_algorithm_support(self) -> None:
|
|
1083
|
+
"""Validate that the configured hash algorithm is supported.
|
|
1084
|
+
|
|
1085
|
+
Default implementation does nothing (assumes all algorithms supported).
|
|
1086
|
+
Subclasses can override to check algorithm support.
|
|
1087
|
+
|
|
1088
|
+
Raises:
|
|
1089
|
+
Exception: If hash algorithm is not supported
|
|
1090
|
+
"""
|
|
1091
|
+
# Default: no validation (assume all algorithms supported)
|
|
1092
|
+
pass
|
|
1093
|
+
|
|
1094
|
+
# ========== Helper Methods ==========
|
|
1095
|
+
|
|
1096
|
+
def _is_system_table(self, feature_key: FeatureKey) -> bool:
|
|
1097
|
+
"""Check if feature key is a system table."""
|
|
1098
|
+
return len(feature_key) >= 1 and feature_key[0] == METAXY_SYSTEM_KEY_PREFIX
|
|
1099
|
+
|
|
1100
|
+
def _resolve_feature_key(self, feature: CoercibleToFeatureKey) -> FeatureKey:
|
|
1101
|
+
"""Resolve various types to FeatureKey.
|
|
1102
|
+
|
|
1103
|
+
Accepts types that can be converted into a FeatureKey.
|
|
1104
|
+
|
|
1105
|
+
Args:
|
|
1106
|
+
feature: Feature to resolve to FeatureKey
|
|
1107
|
+
|
|
1108
|
+
Returns:
|
|
1109
|
+
FeatureKey instance
|
|
1110
|
+
"""
|
|
1111
|
+
return ValidatedFeatureKeyAdapter.validate_python(feature)
|
|
1112
|
+
|
|
1113
|
+
def _resolve_feature_plan(self, feature: CoercibleToFeatureKey) -> FeaturePlan:
|
|
1114
|
+
"""Resolve to FeaturePlan for dependency resolution."""
|
|
1115
|
+
# First resolve to FeatureKey
|
|
1116
|
+
feature_key = self._resolve_feature_key(feature)
|
|
1117
|
+
# Then get the plan
|
|
1118
|
+
graph = current_graph()
|
|
1119
|
+
return graph.get_feature_plan(feature_key)
|
|
1120
|
+
|
|
1121
|
+

    # ========== Core CRUD Operations ==========

    @contextmanager
    def allow_cross_project_writes(self) -> Iterator[None]:
        """Context manager to temporarily allow cross-project writes.

        This is an escape hatch for legitimate cross-project operations like migrations,
        where metadata needs to be written to features from different projects.

        Example:
            ```py
            # During migration, allow writing to features from different projects
            with store.allow_cross_project_writes():
                store.write_metadata(feature_from_project_a, metadata_a)
                store.write_metadata(feature_from_project_b, metadata_b)
            ```

        Yields:
            None: The context manager temporarily disables project validation
        """
        previous_value = self._allow_cross_project_writes
        try:
            self._allow_cross_project_writes = True
            yield
        finally:
            self._allow_cross_project_writes = previous_value

    def _validate_project_write(self, feature: CoercibleToFeatureKey) -> None:
        """Validate that writing to a feature matches the expected project from config.

        Args:
            feature: Feature to validate project for

        Raises:
            ValueError: If feature's project doesn't match the global config project
        """
        # Skip validation if cross-project writes are allowed
        if self._allow_cross_project_writes:
            return

        # Get the expected project from global config
        from metaxy.config import MetaxyConfig

        config = MetaxyConfig.get()
        expected_project = config.project

        # Use existing method to resolve to FeatureKey
        feature_key = self._resolve_feature_key(feature)

        # Get the Feature class from the graph
        graph = FeatureGraph.get_active()
        if feature_key not in graph.features_by_key:
            # Feature not in graph - can't validate, skip
            return

        feature_cls = graph.features_by_key[feature_key]
        feature_project = feature_cls.project  # type: ignore[attr-defined]

        # Validate the project matches
        if feature_project != expected_project:
            raise ValueError(
                f"Cannot write to feature {feature_key.to_string()} from project '{feature_project}' "
                f"when the global configuration expects project '{expected_project}'. "
                f"Use store.allow_cross_project_writes() context manager for legitimate "
                f"cross-project operations like migrations."
            )
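
    # Editor's sketch of the guard above (`OtherProjectFeature` is a hypothetical
    # feature registered under a project different from `MetaxyConfig.get().project`):
    #
    #     store._validate_project_write(OtherProjectFeature)      # raises ValueError
    #     with store.allow_cross_project_writes():
    #         store._validate_project_write(OtherProjectFeature)  # returns early, no error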

    @abstractmethod
    def write_metadata_to_store(
        self,
        feature_key: FeatureKey,
        df: Frame,
        **kwargs: Any,
    ) -> None:
        """
        Internal write implementation (backend-specific).

        Backends may convert to their specific type if needed (e.g., Polars, Ibis).

        Args:
            feature_key: Feature key to write to
            df: [Narwhals](https://narwhals-dev.github.io/narwhals/)-compatible DataFrame with metadata to write
            **kwargs: Backend-specific parameters

        Note: Subclasses implement this for their storage backend.
        """
        pass
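
    # Editor's sketch of a minimal backend override, assuming a hypothetical
    # in-memory dict `self._tables` rather than any actual Metaxy backend:
    #
    #     def write_metadata_to_store(
    #         self, feature_key: FeatureKey, df: Frame, **kwargs: Any
    #     ) -> None:
    #         # keep the native (e.g. Polars) object keyed by the feature key string
    #         self._tables[feature_key.to_string()] = nw.to_native(df)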

    def _add_system_columns(
        self,
        df: Frame,
        feature: CoercibleToFeatureKey,
        materialization_id: str | None = None,
    ) -> Frame:
        """Add all required system columns to the DataFrame.

        Args:
            df: Narwhals DataFrame/LazyFrame
            feature: Feature class or key
            materialization_id: Optional external orchestration ID for this write.
                Overrides the store's default if provided.

        Returns:
            DataFrame with all system columns added
        """
        feature_key = self._resolve_feature_key(feature)

        # Check if feature_version and snapshot_version already exist in the DataFrame
        has_feature_version = METAXY_FEATURE_VERSION in df.columns
        has_snapshot_version = METAXY_SNAPSHOT_VERSION in df.columns

        # In suppression mode (migrations), use existing values as-is
        if (
            _suppress_feature_version_warning.get()
            and has_feature_version
            and has_snapshot_version
        ):
            pass  # Use existing values for migrations
        else:
            # Drop any existing version columns (e.g., from SQLModel with null values)
            # and add current versions
            columns_to_drop = []
            if has_feature_version:
                columns_to_drop.append(METAXY_FEATURE_VERSION)
            if has_snapshot_version:
                columns_to_drop.append(METAXY_SNAPSHOT_VERSION)
            if columns_to_drop:
                df = df.drop(*columns_to_drop)

            # Get current feature version and snapshot_version from code and add them.
            # Use duck typing to avoid Ray serialization issues with issubclass.
            if (
                isinstance(feature, type)
                and hasattr(feature, "feature_version")
                and callable(feature.feature_version)
            ):
                current_feature_version = feature.feature_version()
            else:
                from metaxy import get_feature_by_key

                feature_cls = get_feature_by_key(feature_key)
                current_feature_version = feature_cls.feature_version()

            # Get snapshot_version from the active graph
            from metaxy.models.feature import FeatureGraph

            graph = FeatureGraph.get_active()
            current_snapshot_version = graph.snapshot_version

            df = df.with_columns(
                [
                    nw.lit(current_feature_version).alias(METAXY_FEATURE_VERSION),
                    nw.lit(current_snapshot_version).alias(METAXY_SNAPSHOT_VERSION),
                ]
            )

        # These should normally be added by the provenance engine during resolve_update
        from metaxy.models.constants import (
            METAXY_CREATED_AT,
            METAXY_DATA_VERSION,
            METAXY_DATA_VERSION_BY_FIELD,
        )

        if METAXY_PROVENANCE_BY_FIELD not in df.columns:
            raise ValueError(
                f"Metadata is missing a required column `{METAXY_PROVENANCE_BY_FIELD}`. It should have been created by a prior `MetadataStore.resolve_update` call. Did you drop it on the way?"
            )

        if METAXY_PROVENANCE not in df.columns:
            plan = self._resolve_feature_plan(feature_key)

            # Only warn for non-root features (features with dependencies).
            # Root features don't have upstream dependencies, so they don't go through
            # resolve_update() - they just need metaxy_provenance_by_field to be set.
            if plan.deps:
                MetaxyColumnMissingWarning.warn_on_missing_column(
                    expected=METAXY_PROVENANCE,
                    df=df,
                    message=f"It should have been created by a prior `MetadataStore.resolve_update` call. Re-creating it from `{METAXY_PROVENANCE_BY_FIELD}`. Did you drop it on the way?",
                )

            df = self.hash_struct_version_column(
                plan=plan,
                df=df,
                struct_column=METAXY_PROVENANCE_BY_FIELD,
                hash_column=METAXY_PROVENANCE,
            )

        if METAXY_CREATED_AT not in df.columns:
            from datetime import datetime, timezone

            df = df.with_columns(
                nw.lit(datetime.now(timezone.utc)).alias(METAXY_CREATED_AT)
            )

        # Add the materialization_id column
        from metaxy.models.constants import METAXY_MATERIALIZATION_ID

        df = df.with_columns(
            nw.lit(
                materialization_id or self._materialization_id, dtype=nw.String
            ).alias(METAXY_MATERIALIZATION_ID)
        )

        # Check for missing data_version columns (these should come from resolve_update,
        # but it's acceptable to fall back to the provenance columns if they are missing)
        if METAXY_DATA_VERSION_BY_FIELD not in df.columns:
            df = df.with_columns(
                nw.col(METAXY_PROVENANCE_BY_FIELD).alias(METAXY_DATA_VERSION_BY_FIELD)
            )
            df = df.with_columns(nw.col(METAXY_PROVENANCE).alias(METAXY_DATA_VERSION))
        elif METAXY_DATA_VERSION not in df.columns:
            df = self.hash_struct_version_column(
                plan=self._resolve_feature_plan(feature_key),
                df=df,
                struct_column=METAXY_DATA_VERSION_BY_FIELD,
                hash_column=METAXY_DATA_VERSION,
            )

        # Cast system columns with Null dtype to their correct types.
        # This handles edge cases where empty DataFrames or certain operations
        # result in Null-typed columns that break downstream processing.
        df = _cast_present_system_columns(df)

        return df
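
    # Editor's note: after `_add_system_columns`, a frame that arrived with only
    # user columns plus `metaxy_provenance_by_field` also carries the version,
    # provenance-hash, data-version, created-at, and materialization-id columns.
    # A hedged check, assuming each `METAXY_*` constant equals its lowercase name:
    #
    #     out = store._add_system_columns(df, MyFeature)  # hypothetical feature
    #     assert "metaxy_feature_version" in out.columns
    #     assert "metaxy_materialization_id" in out.columns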

    def _validate_schema(self, df: Frame) -> None:
        """
        Validate that DataFrame has required schema.

        Args:
            df: Narwhals DataFrame or LazyFrame to validate

        Raises:
            MetadataSchemaError: If schema is invalid
        """
        from metaxy.metadata_store.exceptions import MetadataSchemaError

        schema = df.collect_schema()

        # Check for the metaxy_provenance_by_field column
        if METAXY_PROVENANCE_BY_FIELD not in schema.names():
            raise MetadataSchemaError(
                f"DataFrame must have '{METAXY_PROVENANCE_BY_FIELD}' column"
            )

        # Check that metaxy_provenance_by_field is a struct
        provenance_dtype = schema[METAXY_PROVENANCE_BY_FIELD]
        if not isinstance(provenance_dtype, nw.Struct):
            raise MetadataSchemaError(
                f"'{METAXY_PROVENANCE_BY_FIELD}' column must be a Struct, got {provenance_dtype}"
            )

        # Note: metaxy_provenance is auto-computed if missing, so we don't validate it here

        # Check for the feature_version column
        if METAXY_FEATURE_VERSION not in schema.names():
            raise MetadataSchemaError(
                f"DataFrame must have '{METAXY_FEATURE_VERSION}' column"
            )

        # Check for the snapshot_version column
        if METAXY_SNAPSHOT_VERSION not in schema.names():
            raise MetadataSchemaError(
                f"DataFrame must have '{METAXY_SNAPSHOT_VERSION}' column"
            )
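
    # Editor's sketch of a frame that passes `_validate_schema`, assuming the
    # `METAXY_*` constants equal the lowercase column names shown:
    #
    #     import polars as pl
    #     import narwhals as nw
    #     frame = nw.from_native(
    #         pl.DataFrame(
    #             {
    #                 "metaxy_provenance_by_field": [{"field_a": "hash1"}],  # Struct column
    #                 "metaxy_feature_version": ["v1"],
    #                 "metaxy_snapshot_version": ["s1"],
    #             }
    #         )
    #     )
    #     store._validate_schema(frame)  # no MetadataSchemaError raised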

    def _validate_schema_system_table(self, df: Frame) -> None:
        """Validate schema for system tables (minimal validation).

        Args:
            df: Narwhals DataFrame to validate
        """
        # System tables don't need the metaxy_provenance_by_field column
        pass

    @abstractmethod
    def _drop_feature_metadata_impl(self, feature_key: FeatureKey) -> None:
        """Drop/delete all metadata for a feature.

        Backend-specific implementation for dropping feature metadata.

        Args:
            feature_key: The feature key to drop metadata for
        """
        pass

    def drop_feature_metadata(self, feature: CoercibleToFeatureKey) -> None:
        """Drop all metadata for a feature.

        This removes all stored metadata for the specified feature from the store.
        Useful for cleanup in tests or when re-computing feature metadata from scratch.

        Warning:
            This operation is irreversible and will **permanently delete all metadata** for the specified feature.

        Args:
            feature: Feature class or key to drop metadata for

        Example:
            ```py
            store.drop_feature_metadata(MyFeature)
            assert not store.has_feature(MyFeature)
            ```
        """
        self._check_open()
        feature_key = self._resolve_feature_key(feature)
        self._drop_feature_metadata_impl(feature_key)

    @abstractmethod
    def read_metadata_in_store(
        self,
        feature: CoercibleToFeatureKey,
        *,
        filters: Sequence[nw.Expr] | None = None,
        columns: Sequence[str] | None = None,
        **kwargs: Any,
    ) -> nw.LazyFrame[Any] | None:
        """
        Read metadata from THIS store only, without using any fallback stores.

        Args:
            feature: Feature to read metadata for
            filters: List of Narwhals filter expressions for this specific feature.
            columns: Subset of columns to return
            **kwargs: Backend-specific parameters

        Returns:
            Narwhals LazyFrame with metadata, or None if the feature is not found in the store
        """
        pass
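
    # Editor's sketch of a concrete call (hypothetical `MyFeature`; column names
    # assumed to match the store's system columns):
    #
    #     import narwhals as nw
    #     lf = store.read_metadata_in_store(
    #         MyFeature,
    #         filters=[nw.col("metaxy_feature_version") == "v1"],
    #         columns=["sample_uid", "metaxy_feature_version"],
    #     )
    #     df = lf.collect() if lf is not None else None  # None means the feature is absent here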

    # ========== Feature Existence ==========

    def has_feature(
        self,
        feature: CoercibleToFeatureKey,
        *,
        check_fallback: bool = False,
    ) -> bool:
        """
        Check if feature exists in store.

        Args:
            feature: Feature to check
            check_fallback: If True, also check fallback stores

        Returns:
            True if feature exists, False otherwise
        """
        self._check_open()

        if self.read_metadata_in_store(feature) is not None:
            return True

        # Check fallback stores
        if not check_fallback:
            return self._has_feature_impl(feature)
        else:
            for store in self.fallback_stores:
                if store.has_feature(feature, check_fallback=True):
                    return True

        return False
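
    # Editor's note: by default only this store is consulted; passing
    # `check_fallback=True` walks the `fallback_stores` chain instead. Sketch
    # with a hypothetical `MyFeature`:
    #
    #     local_only = store.has_feature(MyFeature)                    # this store
    #     anywhere = store.has_feature(MyFeature, check_fallback=True) # plus fallbacks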

    @abstractmethod
    def _has_feature_impl(self, feature: CoercibleToFeatureKey) -> bool:
        """Implementation of _has_feature.

        Args:
            feature: Feature to check

        Returns:
            True if feature exists, False otherwise
        """
        pass

    @abstractmethod
    def display(self) -> str:
        """Return a human-readable display string for this store.

        Used in warnings, logs, and CLI output to identify the store.

        Returns:
            Display string (e.g., "DuckDBMetadataStore(database=/path/to/db.duckdb)")
        """
        pass
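
    # Editor's sketch of a concrete `display` for a hypothetical subclass, kept
    # short and secret-free as the docstring requires:
    #
    #     def display(self) -> str:
    #         return f"DuckDBMetadataStore(database={self._database_path})"  # hypothetical attribute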

    def get_store_metadata(self, feature_key: CoercibleToFeatureKey) -> dict[str, Any]:
        """Arbitrary key-value pairs with useful metadata, such as the path in storage.

        Useful for logging purposes. This method should not expose sensitive information.
        """
        return {}

    def copy_metadata(
        self,
        from_store: MetadataStore,
        features: list[CoercibleToFeatureKey] | None = None,
        *,
        from_snapshot: str | None = None,
        filters: Mapping[str, Sequence[nw.Expr]] | None = None,
        incremental: bool = True,
    ) -> dict[str, int]:
        """Copy metadata from another store with fine-grained filtering.

        This is a reusable method that can be called programmatically or from CLI/migrations.
        Copies metadata for the specified features, preserving the original snapshot_version.

        Args:
            from_store: Source metadata store to copy from (must be opened)
            features: List of features to copy. Can be:
                - None: copies all features from the source store
                - List of FeatureKey or Feature classes: copies the specified features
            from_snapshot: Snapshot version to filter source data by. Only rows with this
                snapshot_version will be copied; if None, no snapshot filter is applied.
                The snapshot_version is preserved in the destination store.
            filters: Dict mapping feature keys (as strings) to sequences of Narwhals filter expressions.
                These filters are applied when reading from the source store.
                Example: {"feature/key": [nw.col("x") > 10], "other/feature": [...]}
            incremental: If True (default), filter out rows that already exist in the destination
                store by performing an anti-join on sample_uid for the same snapshot_version.

                The implementation uses an anti-join: source LEFT ANTI JOIN destination ON sample_uid,
                filtered by snapshot_version.

                Disabling incremental (incremental=False) may improve performance when:
                - You know the destination is empty or has no overlap with the source
                - The destination store uses deduplication

                When incremental=False, it's the user's responsibility to avoid duplicates or
                configure deduplication at the storage layer.

        Returns:
            Dict with statistics: {"features_copied": int, "rows_copied": int}

        Raises:
            ValueError: If from_store or self (destination) is not open
            FeatureNotFoundError: If a specified feature doesn't exist in the source store

        Examples:
            ```py
            # Simple: copy all features from the latest snapshot
            stats = dest_store.copy_metadata(from_store=source_store)
            ```

            ```py
            # Copy specific features from a specific snapshot
            stats = dest_store.copy_metadata(
                from_store=source_store,
                features=[FeatureKey(["my_feature"])],
                from_snapshot="abc123",
            )
            ```

            ```py
            # Copy with filters
            stats = dest_store.copy_metadata(
                from_store=source_store,
                filters={"my/feature": [nw.col("sample_uid").is_in(["s1", "s2"])]},
            )
            ```

            ```py
            # Copy specific features with filters
            stats = dest_store.copy_metadata(
                from_store=source_store,
                features=[
                    FeatureKey(["feature_a"]),
                    FeatureKey(["feature_b"]),
                ],
                filters={
                    "feature_a": [nw.col("field_a") > 10, nw.col("sample_uid").is_in(["s1", "s2"])],
                    "feature_b": [nw.col("field_b") < 30],
                },
            )
            ```
        """
        import logging

        logger = logging.getLogger(__name__)

        # Validate that the destination store is open
        if not self._is_open:
            raise ValueError(
                'Destination store must be opened with store.open("write") before use'
            )

        # Auto-open the source store if not already open
        if not from_store._is_open:
            with from_store.open("read"):
                return self._copy_metadata_impl(
                    from_store=from_store,
                    features=features,
                    from_snapshot=from_snapshot,
                    filters=filters,
                    incremental=incremental,
                    logger=logger,
                )
        else:
            return self._copy_metadata_impl(
                from_store=from_store,
                features=features,
                from_snapshot=from_snapshot,
                filters=filters,
                incremental=incremental,
                logger=logger,
            )
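
    # Editor's note: the incremental mode above reduces to a left anti-join on
    # `sample_uid`. The same idea in standalone Polars, with made-up data:
    #
    #     import polars as pl
    #     source = pl.DataFrame({"sample_uid": ["s1", "s2", "s3"], "x": [1, 2, 3]})
    #     dest = pl.DataFrame({"sample_uid": ["s2"]})
    #     new_rows = source.join(dest, on="sample_uid", how="anti")
    #     assert sorted(new_rows["sample_uid"].to_list()) == ["s1", "s3"]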

    def _copy_metadata_impl(
        self,
        from_store: MetadataStore,
        features: list[CoercibleToFeatureKey] | None,
        from_snapshot: str | None,
        filters: Mapping[str, Sequence[nw.Expr]] | None,
        incremental: bool,
        logger,
    ) -> dict[str, int]:
        """Internal implementation of copy_metadata."""
        # Determine which features to copy
        features_to_copy: list[FeatureKey]
        if features is None:
            # Copy all features from the active graph (features defined in the current project)
            from metaxy.models.feature import FeatureGraph

            graph = FeatureGraph.get_active()
            features_to_copy = graph.list_features(only_current_project=True)
            logger.info(
                f"Copying all features from active graph: {len(features_to_copy)} features"
            )
        else:
            # Convert all to FeatureKey using the adapter
            features_to_copy = [self._resolve_feature_key(item) for item in features]
            logger.info(f"Copying {len(features_to_copy)} specified features")

        # Log snapshot usage
        if from_snapshot is not None:
            logger.info(f"Filtering by snapshot: {from_snapshot}")
        else:
            logger.info("Copying all data (no snapshot filter)")

        # Copy metadata for each feature
        total_rows = 0
        features_copied = 0

        with allow_feature_version_override():
            for feature_key in features_to_copy:
                try:
                    # Read metadata from the source, filtering by from_snapshot.
                    # Use current_only=False to avoid filtering by feature_version.
                    source_lazy = from_store.read_metadata(
                        feature_key,
                        allow_fallback=False,
                        current_only=False,
                    )

                    # Filter by from_snapshot if specified
                    import narwhals as nw

                    if from_snapshot is not None:
                        source_filtered = source_lazy.filter(
                            nw.col(METAXY_SNAPSHOT_VERSION) == from_snapshot
                        )
                    else:
                        source_filtered = source_lazy

                    # Apply filters for this feature (if any)
                    if filters:
                        feature_key_str = feature_key.to_string()
                        if feature_key_str in filters:
                            for filter_expr in filters[feature_key_str]:
                                source_filtered = source_filtered.filter(filter_expr)

                    # Apply incremental filtering if enabled
                    if incremental:
                        try:
                            # Read existing sample_uids from the destination for the same snapshot.
                            # This is much cheaper than comparing metaxy_provenance_by_field structs.
                            dest_lazy = self.read_metadata(
                                feature_key,
                                allow_fallback=False,
                                current_only=False,
                            )
                            # Filter the destination to the same snapshot_version (if specified)
                            if from_snapshot is not None:
                                dest_for_snapshot = dest_lazy.filter(
                                    nw.col(METAXY_SNAPSHOT_VERSION) == from_snapshot
                                )
                            else:
                                dest_for_snapshot = dest_lazy

                            # Materialize destination sample_uids to avoid cross-backend join issues.
                            # When copying between different stores (e.g., different DuckDB files),
                            # Ibis can't join tables from different backends.
                            dest_sample_uids = (
                                dest_for_snapshot.select("sample_uid")
                                .collect()
                                .to_polars()
                            )

                            # Convert to a Polars LazyFrame and wrap in Narwhals
                            dest_sample_uids_lazy = nw.from_native(
                                dest_sample_uids.lazy(), eager_only=False
                            )

                            # Collect the source to Polars for the anti-join
                            source_df = source_filtered.collect().to_polars()
                            source_lazy = nw.from_native(
                                source_df.lazy(), eager_only=False
                            )

                            # Anti-join: keep only source rows whose sample_uid is not in the destination
                            source_filtered = source_lazy.join(
                                dest_sample_uids_lazy,
                                on="sample_uid",
                                how="anti",
                            )

                            # Collect after filtering
                            source_df = source_filtered.collect().to_polars()

                            logger.info(
                                f"Incremental: copying only new sample_uids for {feature_key.to_string()}"
                            )
                        except FeatureNotFoundError:
                            # Feature doesn't exist in the destination yet - copy all rows
                            logger.debug(
                                f"Feature {feature_key.to_string()} not in destination, copying all rows"
                            )
                            source_df = source_filtered.collect().to_polars()
                        except Exception as e:
                            # If the incremental check fails, log a warning but continue with a full copy
                            logger.warning(
                                f"Incremental check failed for {feature_key.to_string()}: {e}. Copying all rows."
                            )
                            source_df = source_filtered.collect().to_polars()
                    else:
                        # Non-incremental: collect all filtered rows
                        source_df = source_filtered.collect().to_polars()

                    if source_df.height == 0:
                        logger.warning(
                            f"No rows found for {feature_key.to_string()} with snapshot {from_snapshot}, skipping"
                        )
                        continue

                    # Write to the destination (preserving snapshot_version and feature_version)
                    self.write_metadata(feature_key, source_df)

                    features_copied += 1
                    total_rows += source_df.height
                    logger.info(
                        f"Copied {source_df.height} rows for {feature_key.to_string()}"
                    )

                except FeatureNotFoundError:
                    logger.warning(
                        f"Feature {feature_key.to_string()} not found in source store, skipping"
                    )
                    continue
                except Exception as e:
                    logger.error(
                        f"Error copying {feature_key.to_string()}: {e}", exc_info=True
                    )
                    raise

        logger.info(
            f"Copy complete: {features_copied} features, {total_rows} total rows"
        )

        return {"features_copied": features_copied, "rows_copied": total_rows}