metaxy-0.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of metaxy may be problematic.
- metaxy/__init__.py +61 -0
- metaxy/_testing.py +542 -0
- metaxy/_utils.py +16 -0
- metaxy/_version.py +1 -0
- metaxy/cli/app.py +76 -0
- metaxy/cli/context.py +71 -0
- metaxy/cli/graph.py +576 -0
- metaxy/cli/graph_diff.py +290 -0
- metaxy/cli/list.py +42 -0
- metaxy/cli/metadata.py +271 -0
- metaxy/cli/migrations.py +862 -0
- metaxy/cli/push.py +55 -0
- metaxy/config.py +450 -0
- metaxy/data_versioning/__init__.py +24 -0
- metaxy/data_versioning/calculators/__init__.py +13 -0
- metaxy/data_versioning/calculators/base.py +97 -0
- metaxy/data_versioning/calculators/duckdb.py +186 -0
- metaxy/data_versioning/calculators/ibis.py +225 -0
- metaxy/data_versioning/calculators/polars.py +135 -0
- metaxy/data_versioning/diff/__init__.py +15 -0
- metaxy/data_versioning/diff/base.py +150 -0
- metaxy/data_versioning/diff/narwhals.py +108 -0
- metaxy/data_versioning/hash_algorithms.py +19 -0
- metaxy/data_versioning/joiners/__init__.py +9 -0
- metaxy/data_versioning/joiners/base.py +70 -0
- metaxy/data_versioning/joiners/narwhals.py +235 -0
- metaxy/entrypoints.py +309 -0
- metaxy/ext/__init__.py +1 -0
- metaxy/ext/alembic.py +326 -0
- metaxy/ext/sqlmodel.py +172 -0
- metaxy/ext/sqlmodel_system_tables.py +139 -0
- metaxy/graph/__init__.py +21 -0
- metaxy/graph/diff/__init__.py +21 -0
- metaxy/graph/diff/diff_models.py +399 -0
- metaxy/graph/diff/differ.py +740 -0
- metaxy/graph/diff/models.py +418 -0
- metaxy/graph/diff/rendering/__init__.py +18 -0
- metaxy/graph/diff/rendering/base.py +274 -0
- metaxy/graph/diff/rendering/cards.py +188 -0
- metaxy/graph/diff/rendering/formatter.py +805 -0
- metaxy/graph/diff/rendering/graphviz.py +246 -0
- metaxy/graph/diff/rendering/mermaid.py +320 -0
- metaxy/graph/diff/rendering/rich.py +165 -0
- metaxy/graph/diff/rendering/theme.py +48 -0
- metaxy/graph/diff/traversal.py +247 -0
- metaxy/graph/utils.py +58 -0
- metaxy/metadata_store/__init__.py +31 -0
- metaxy/metadata_store/_protocols.py +38 -0
- metaxy/metadata_store/base.py +1676 -0
- metaxy/metadata_store/clickhouse.py +161 -0
- metaxy/metadata_store/duckdb.py +167 -0
- metaxy/metadata_store/exceptions.py +43 -0
- metaxy/metadata_store/ibis.py +451 -0
- metaxy/metadata_store/memory.py +228 -0
- metaxy/metadata_store/sqlite.py +187 -0
- metaxy/metadata_store/system_tables.py +257 -0
- metaxy/migrations/__init__.py +34 -0
- metaxy/migrations/detector.py +153 -0
- metaxy/migrations/executor.py +208 -0
- metaxy/migrations/loader.py +260 -0
- metaxy/migrations/models.py +718 -0
- metaxy/migrations/ops.py +390 -0
- metaxy/models/__init__.py +0 -0
- metaxy/models/bases.py +6 -0
- metaxy/models/constants.py +24 -0
- metaxy/models/feature.py +665 -0
- metaxy/models/feature_spec.py +105 -0
- metaxy/models/field.py +25 -0
- metaxy/models/plan.py +155 -0
- metaxy/models/types.py +157 -0
- metaxy/py.typed +0 -0
- metaxy-0.0.0.dist-info/METADATA +247 -0
- metaxy-0.0.0.dist-info/RECORD +75 -0
- metaxy-0.0.0.dist-info/WHEEL +4 -0
- metaxy-0.0.0.dist-info/entry_points.txt +3 -0
metaxy/cli/push.py
ADDED
@@ -0,0 +1,55 @@
"""Push command for recording feature versions."""

from rich.console import Console

console = Console()


def push(store: str | None = None):
    """Record all feature versions (push graph snapshot).

    Records all features in the active graph to the metadata store
    with a deterministic snapshot version. This should be run after deploying
    new feature definitions.

    Example:
        $ metaxy push

        ✓ Recorded feature graph
          Snapshot version: abc123def456...

        # Or if already recorded:
        ℹ Snapshot already recorded (skipped)
          Snapshot version: abc123def456...

    Args:
        store: The metadata store to use. Defaults to the default store.
    """
    from metaxy.cli.context import get_store
    from metaxy.models.feature import FeatureGraph

    metadata_store = get_store(store)

    with metadata_store:
        # Get active graph
        active_graph = FeatureGraph.get_active()
        if len(active_graph.features_by_key) == 0:
            console.print("[yellow]⚠[/yellow] No features in active graph")
            return

        # Record feature graph snapshot (idempotent)
        # Returns (snapshot_version, already_exists)
        snapshot_version, already_exists = (
            metadata_store.record_feature_graph_snapshot()
        )

        if already_exists:
            console.print("[blue]ℹ[/blue] Snapshot already recorded (skipped)")
            console.print(f"  Snapshot version: {snapshot_version}")
        else:
            console.print("[green]✓[/green] Recorded feature graph")
            console.print(f"  Snapshot version: {snapshot_version}")


if __name__ == "__main__":
    push()
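For orientation, a minimal sketch of driving this command programmatically as well as from the shell. The store name "staging" is a hypothetical entry that would have to exist in the Metaxy store configuration (see metaxy/config.py below).

from metaxy.cli.push import push

# Equivalent to running `metaxy push` with the default store:
push()

# Push to an explicitly named store; "staging" is a hypothetical
# store name defined under [stores] in the configuration.
push(store="staging")
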
metaxy/config.py
ADDED
@@ -0,0 +1,450 @@
"""Configuration system for Metaxy using pydantic-settings."""
# pyright: reportImportCycles=false

from pathlib import Path
from typing import TYPE_CHECKING, Any, TypeVar

try:
    import tomllib  # Python 3.11+  # pyright: ignore[reportMissingImports]
except ImportError:
    import tomli as tomllib  # Fallback for Python 3.10

import warnings
from contextvars import ContextVar

from pydantic import Field as PydanticField
from pydantic import PrivateAttr
from pydantic_settings import (
    BaseSettings,
    PydanticBaseSettingsSource,
    SettingsConfigDict,
)
from typing_extensions import Self

if TYPE_CHECKING:
    from metaxy.metadata_store.base import (
        MetadataStore,  # pyright: ignore[reportImportCycles]
    )

T = TypeVar("T")


class TomlConfigSettingsSource(PydanticBaseSettingsSource):
    """Custom settings source for TOML configuration files.

    Auto-discovers configuration in this order:
    1. Explicit file path if provided
    2. metaxy.toml in current directory (preferred)
    3. pyproject.toml [tool.metaxy] section (fallback)
    4. No config (returns empty dict)
    """

    def __init__(self, settings_cls: type[BaseSettings], toml_file: Path | None = None):
        super().__init__(settings_cls)
        self.toml_file = toml_file or self._discover_config_file()
        self.toml_data = self._load_toml()

    def _discover_config_file(self) -> Path | None:
        """Auto-discover config file."""
        # Prefer metaxy.toml
        if Path("metaxy.toml").exists():
            return Path("metaxy.toml")

        # Fallback to pyproject.toml
        if Path("pyproject.toml").exists():
            return Path("pyproject.toml")

        return None

    def _load_toml(self) -> dict[str, Any]:
        """Load TOML file and extract metaxy config."""
        if self.toml_file is None:
            return {}

        with open(self.toml_file, "rb") as f:
            data = tomllib.load(f)

        # Extract [tool.metaxy] from pyproject.toml or root from metaxy.toml
        if self.toml_file.name == "pyproject.toml":
            return data.get("tool", {}).get("metaxy", {})
        else:
            return data

    def get_field_value(self, field: Any, field_name: str) -> tuple[Any, str, bool]:
        """Get field value from TOML data."""
        field_value = self.toml_data.get(field_name)
        return field_value, field_name, False

    def __call__(self) -> dict[str, Any]:
        """Return all settings from TOML."""
        return self.toml_data


class StoreConfig(BaseSettings):
    """Configuration for a single metadata store.

    Structure:
        type: Full import path to store class
        config: Dict of all configuration (including fallback_stores)

    Example:
        >>> config = StoreConfig(
        ...     type="metaxy_delta.DeltaMetadataStore",
        ...     config={
        ...         "table_uri": "s3://bucket/metadata",
        ...         "region": "us-west-2",
        ...         "fallback_stores": ["prod"],
        ...     }
        ... )
    """

    model_config = SettingsConfigDict(
        extra="forbid",  # Only type and config fields allowed
    )

    # Store class (full import path)
    type: str

    # Store configuration (all kwargs for __init__)
    # This includes fallback_stores, table_uri, db_path, storage_options, etc.
    config: dict[str, Any] = PydanticField(default_factory=dict)


class PluginConfig(BaseSettings):
    """Configuration for Metaxy plugins"""

    enabled: bool = PydanticField(
        default=False,
        description="Whether to enable plugin.",
    )

    _plugin: str = PrivateAttr()


class SQLModelConfig(PluginConfig):
    """Configuration for SQLModel"""

    infer_db_table_names: bool = PydanticField(
        default=True,
        description="Whether to automatically use `FeatureKey.table_name` for sqlalchemy's __tablename__ value.",
    )

    # Whether to use SQLModel definitions for system tables (for Alembic migrations)
    system_tables: bool = PydanticField(
        default=False,
        description="Whether to use SQLModel definitions for system tables (for Alembic migrations).",
    )

    _plugin: str = PrivateAttr(default="sqlmodel")


class ExtConfig(BaseSettings):
    """Configuration for Metaxy integrations with third-party tools"""

    model_config = SettingsConfigDict(
        extra="allow",
    )

    sqlmodel: SQLModelConfig = PydanticField(default_factory=SQLModelConfig)


# Context variable for storing the app context
_metaxy_config: ContextVar["MetaxyConfig | None"] = ContextVar(
    "_metaxy_config", default=None
)


class MetaxyConfig(BaseSettings):
    """Main Metaxy configuration.

    Loads from:
    1. TOML file (metaxy.toml or pyproject.toml [tool.metaxy])
    2. Environment variables (METAXY_*)
    3. Init arguments

    Priority: init > env vars > TOML

    Example:
        >>> # Auto-discover config
        >>> config = MetaxyConfig.load()
        >>>
        >>> # Get store instance
        >>> store = config.get_store("prod")
        >>>
        >>> # Override via env var
        >>> # METAXY_STORE=staging METAXY_REGISTRY=myapp.features:my_graph
        >>> config = MetaxyConfig.load()
        >>> store = config.get_store()  # Uses staging with custom graph
    """

    model_config = SettingsConfigDict(
        env_prefix="METAXY_",
        env_nested_delimiter="__",
    )

    # Store to use
    store: str = "dev"

    # Named store configurations
    stores: dict[str, StoreConfig] = PydanticField(default_factory=dict)

    # Migrations directory
    migrations_dir: str = ".metaxy/migrations"

    # Entrypoints to load (list of module paths)
    entrypoints: list[str] = PydanticField(default_factory=list)

    # Graph rendering theme
    theme: str = "default"

    ext: ExtConfig = PydanticField(default_factory=ExtConfig)

    @property
    def plugins(self) -> list[str]:
        """Returns all enabled plugin names from ext configuration."""
        plugins = []
        for field_name in type(self.ext).model_fields:
            field_value = getattr(self.ext, field_name)
            if hasattr(field_value, "_plugin") and field_value.enabled:
                plugins.append(field_value._plugin)
        return plugins

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        """Customize settings sources: init → env → TOML.

        Priority (first wins):
        1. Init arguments
        2. Environment variables
        3. TOML file
        """
        toml_settings = TomlConfigSettingsSource(settings_cls)
        return (init_settings, env_settings, toml_settings)

    @classmethod
    def get(cls) -> "MetaxyConfig":
        """Get the current Metaxy configuration."""
        cfg = _metaxy_config.get()
        if cfg is None:
            warnings.warn(
                UserWarning(
                    "Global Metaxy configuration not initialized. It can be set with MetaxyConfig.set(config) typically after loading it from a toml file. Returning default configuration (with environment variables and other pydantic settings sources resolved)."
                )
            )
            return cls()
        else:
            return cfg

    @classmethod
    def set(cls, config: Self) -> None:
        """Set the current Metaxy configuration."""
        _metaxy_config.set(config)

    @classmethod
    def load(
        cls, config_file: str | Path | None = None, *, search_parents: bool = True
    ) -> "MetaxyConfig":
        """Load config with auto-discovery and parent directory search.

        Args:
            config_file: Optional config file path (overrides auto-discovery)
            search_parents: Search parent directories for config file (default: True)

        Returns:
            Loaded config (TOML + env vars merged)

        Example:
            >>> # Auto-discover with parent search
            >>> config = MetaxyConfig.load()
            >>>
            >>> # Explicit file
            >>> config = MetaxyConfig.load("custom.toml")
            >>>
            >>> # Auto-discover without parent search
            >>> config = MetaxyConfig.load(search_parents=False)
        """
        # Search for config file if not explicitly provided
        if config_file is None and search_parents:
            config_file = cls._discover_config_with_parents()

        # For explicit file, temporarily patch the TomlConfigSettingsSource
        # to use that file, then use normal instantiation
        # This ensures env vars still work

        if config_file:
            # Create a custom settings source class for this file
            toml_path = Path(config_file)

            class CustomTomlSource(TomlConfigSettingsSource):
                def __init__(self, settings_cls: type[BaseSettings]):
                    # Skip auto-discovery, use explicit file
                    super(TomlConfigSettingsSource, self).__init__(settings_cls)
                    self.toml_file = toml_path
                    self.toml_data = self._load_toml()

            # Customize sources to use custom TOML file
            original_method = cls.settings_customise_sources

            @classmethod  # type: ignore[misc]
            def custom_sources(
                cls_inner,
                settings_cls,
                init_settings,
                env_settings,
                dotenv_settings,
                file_secret_settings,
            ):
                toml_settings = CustomTomlSource(settings_cls)
                return (init_settings, env_settings, toml_settings)

            # Temporarily replace method
            cls.settings_customise_sources = custom_sources  # type: ignore[assignment]
            config = cls()
            cls.settings_customise_sources = original_method  # type: ignore[method-assign]
        else:
            # Use default sources (auto-discovery + env vars)
            config = cls()

        cls.set(config)

        return config

    @staticmethod
    def _discover_config_with_parents() -> Path | None:
        """Discover config file by searching current and parent directories.

        Searches for metaxy.toml or pyproject.toml in current directory,
        then iteratively searches parent directories.

        Returns:
            Path to config file if found, None otherwise
        """
        current = Path.cwd()

        while True:
            # Check for metaxy.toml (preferred)
            metaxy_toml = current / "metaxy.toml"
            if metaxy_toml.exists():
                return metaxy_toml

            # Check for pyproject.toml
            pyproject_toml = current / "pyproject.toml"
            if pyproject_toml.exists():
                return pyproject_toml

            # Move to parent
            parent = current.parent
            if parent == current:
                # Reached root
                break
            current = parent

        return None

    def get_store(
        self,
        name: str | None = None,
    ) -> "MetadataStore":
        """Instantiate metadata store by name.

        Args:
            name: Store name (uses config.store if None)

        Returns:
            Instantiated metadata store

        Raises:
            ValueError: If store name not found in config, or if fallback stores
                have different hash algorithms than the parent store
            ImportError: If store class cannot be imported

        Example:
            >>> config = MetaxyConfig.load()
            >>> store = config.get_store("prod")
            >>>
            >>> # Use default store
            >>> store = config.get_store()
        """
        from metaxy.data_versioning.hash_algorithms import HashAlgorithm

        if len(self.stores) == 0:
            raise ValueError(
                "No Metaxy stores available. They should be configured in metaxy.toml|pyproject.toml or via environment variables."
            )

        name = name or self.store

        if name not in self.stores:
            raise ValueError(
                f"Store '{name}' not found in config. "
                f"Available stores: {list(self.stores.keys())}"
            )

        store_config = self.stores[name]

        # Import store class
        store_class = self._import_class(store_config.type)

        # Extract configuration
        config_copy = store_config.config.copy()
        fallback_store_names = config_copy.pop("fallback_stores", [])

        # Get hash_algorithm from config (if specified) and convert to enum
        configured_hash_algorithm = config_copy.get("hash_algorithm")
        if configured_hash_algorithm is not None:
            # Convert string to enum if needed
            if isinstance(configured_hash_algorithm, str):
                configured_hash_algorithm = HashAlgorithm(configured_hash_algorithm)
                config_copy["hash_algorithm"] = configured_hash_algorithm
        else:
            # Use default
            configured_hash_algorithm = HashAlgorithm.XXHASH64
            config_copy["hash_algorithm"] = configured_hash_algorithm

        # Build fallback stores recursively
        fallback_stores = []
        for fallback_name in fallback_store_names:
            fallback_store = self.get_store(fallback_name)
            fallback_stores.append(fallback_store)

        # Instantiate store with config + fallback_stores
        store = store_class(
            fallback_stores=fallback_stores,
            **config_copy,
        )

        # Verify the store actually uses the hash algorithm we configured
        # (in case a store subclass overrides the default or ignores the parameter)
        if store.hash_algorithm != configured_hash_algorithm:
            raise ValueError(
                f"Store '{name}' ({store_class.__name__}) was configured with "
                f"hash_algorithm='{configured_hash_algorithm.value}' but is using "
                f"'{store.hash_algorithm.value}'. The store class may have overridden "
                f"the hash algorithm. All stores must use the same hash algorithm."
            )

        return store

    @staticmethod
    def _import_class(class_path: str) -> type:
        """Import class from module path.

        Args:
            class_path: Full import path like "metaxy.metadata_store.InMemoryMetadataStore"

        Returns:
            Imported class

        Raises:
            ImportError: If module or class not found
        """
        module_path, class_name = class_path.rsplit(".", 1)
        module = __import__(module_path, fromlist=[class_name])
        return getattr(module, class_name)
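To tie the TOML discovery and store resolution together, here is a hedged sketch of a configuration and its programmatic use. The TOML keys mirror the MetaxyConfig fields above; the store type reuses the metaxy_delta.DeltaMetadataStore path from the StoreConfig docstring and is illustrative, not a bundled backend.

# Hypothetical metaxy.toml, mirroring the MetaxyConfig fields above:
#
#   store = "dev"
#
#   [stores.dev]
#   type = "metaxy_delta.DeltaMetadataStore"   # illustrative class path
#   config = { table_uri = "s3://bucket/metadata" }
#
from metaxy.config import MetaxyConfig

config = MetaxyConfig.load()   # discovers metaxy.toml (or pyproject.toml), walking parent dirs
print(config.plugins)          # enabled plugin names, e.g. ["sqlmodel"]
store = config.get_store()     # instantiates the store named by config.store ("dev")
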
metaxy/data_versioning/__init__.py
ADDED
@@ -0,0 +1,24 @@
"""Data versioning module for sample-level data version calculation."""

from metaxy.data_versioning.calculators import (
    DataVersionCalculator,
    PolarsDataVersionCalculator,
)
from metaxy.data_versioning.diff import (
    DiffResult,
    MetadataDiffResolver,
    NarwhalsDiffResolver,
)
from metaxy.data_versioning.hash_algorithms import HashAlgorithm
from metaxy.data_versioning.joiners import NarwhalsJoiner, UpstreamJoiner

__all__ = [
    "HashAlgorithm",
    "UpstreamJoiner",
    "NarwhalsJoiner",
    "DataVersionCalculator",
    "PolarsDataVersionCalculator",
    "DiffResult",
    "MetadataDiffResolver",
    "NarwhalsDiffResolver",
]
metaxy/data_versioning/calculators/__init__.py
ADDED
@@ -0,0 +1,13 @@
"""Data version calculators for computing hash from upstream data."""

from metaxy.data_versioning.calculators.base import DataVersionCalculator
from metaxy.data_versioning.calculators.duckdb import DuckDBDataVersionCalculator
from metaxy.data_versioning.calculators.ibis import IbisDataVersionCalculator
from metaxy.data_versioning.calculators.polars import PolarsDataVersionCalculator

__all__ = [
    "DataVersionCalculator",
    "DuckDBDataVersionCalculator",
    "IbisDataVersionCalculator",
    "PolarsDataVersionCalculator",
]
metaxy/data_versioning/calculators/base.py
ADDED
@@ -0,0 +1,97 @@
"""Abstract base class for data version calculators."""

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any

import narwhals as nw

from metaxy.data_versioning.hash_algorithms import HashAlgorithm

if TYPE_CHECKING:
    from metaxy.models.feature_spec import FeatureSpec
    from metaxy.models.plan import FeaturePlan


class DataVersionCalculator(ABC):
    """Calculates data_version hash from joined upstream data.

    The calculator takes joined upstream data (output from UpstreamJoiner)
    and computes the data_version hash for each sample.

    This is Step 2 in the data versioning process:
    1. Join upstream features → unified upstream view
    2. Calculate data_version from upstream → target versions ← THIS STEP
    3. Diff with current metadata → identify changes

    All calculators work with Narwhals LazyFrames for backend compatibility.

    Examples:
        - PolarsDataVersionCalculator: Uses polars-hash for in-memory hashing
        - NarwhalsDataVersionCalculator: Uses native SQL hash functions in the database
    """

    @property
    @abstractmethod
    def supported_algorithms(self) -> list[HashAlgorithm]:
        """List of hash algorithms this calculator supports.

        Returns:
            List of supported HashAlgorithm values

        Example:
            >>> calc = PolarsDataVersionCalculator()
            >>> HashAlgorithm.XXHASH64 in calc.supported_algorithms
            True
        """
        pass

    @property
    @abstractmethod
    def default_algorithm(self) -> HashAlgorithm:
        """Default hash algorithm for this calculator.

        Should be the most performant algorithm that's widely compatible.
        Typically xxHash64 for cross-database compatibility.

        Returns:
            Default HashAlgorithm
        """
        pass

    @abstractmethod
    def calculate_data_versions(
        self,
        joined_upstream: nw.LazyFrame[Any],
        feature_spec: "FeatureSpec",
        feature_plan: "FeaturePlan",
        upstream_column_mapping: dict[str, str],
        hash_algorithm: HashAlgorithm | None = None,
    ) -> nw.LazyFrame[Any]:
        """Calculate data_version column from joined upstream data.

        Computes a Merkle tree hash for each sample by:
        1. For each field in the feature:
           a. Concatenate: field_key | code_version | upstream hashes
           b. Hash the concatenated string
        2. Create struct with all field hashes
        3. Add as data_version column

        Args:
            joined_upstream: Narwhals LazyFrame with all upstream data_version columns joined
                (output from UpstreamJoiner.join_upstream)
            feature_spec: Specification of the feature being computed
            feature_plan: Resolved feature plan with dependencies
            upstream_column_mapping: Maps upstream feature key -> column name
                where its data_version struct is located in joined_upstream
                Example: {"video": "__upstream_video__data_version"}
            hash_algorithm: Hash algorithm to use. If None, uses self.default_algorithm.
                Must be in self.supported_algorithms.

        Returns:
            Narwhals LazyFrame with data_version column added
            Shape: [sample_uid, __upstream_*__data_version columns, data_version (new)]

        Raises:
            ValueError: If hash_algorithm not in supported_algorithms
        """
        pass
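For concreteness, a minimal sketch of a subclass satisfying this interface. It is not one of the bundled calculators: the "hash" below is plain string concatenation of the upstream version columns, standing in for the per-field Merkle hashing the docstring describes, and it assumes those columns are string-typed rather than structs.

from typing import TYPE_CHECKING, Any

import narwhals as nw

from metaxy.data_versioning.calculators.base import DataVersionCalculator
from metaxy.data_versioning.hash_algorithms import HashAlgorithm

if TYPE_CHECKING:
    from metaxy.models.feature_spec import FeatureSpec
    from metaxy.models.plan import FeaturePlan


class ConcatDataVersionCalculator(DataVersionCalculator):
    """Toy calculator: concatenates upstream versions instead of hashing."""

    @property
    def supported_algorithms(self) -> list[HashAlgorithm]:
        return [HashAlgorithm.XXHASH64]

    @property
    def default_algorithm(self) -> HashAlgorithm:
        return HashAlgorithm.XXHASH64

    def calculate_data_versions(
        self,
        joined_upstream: nw.LazyFrame[Any],
        feature_spec: "FeatureSpec",
        feature_plan: "FeaturePlan",
        upstream_column_mapping: dict[str, str],
        hash_algorithm: HashAlgorithm | None = None,
    ) -> nw.LazyFrame[Any]:
        algorithm = hash_algorithm or self.default_algorithm
        if algorithm not in self.supported_algorithms:
            raise ValueError(f"Unsupported hash algorithm: {algorithm}")
        # Stand-in for real hashing: assumes (unlike the real calculators,
        # which work on per-field structs) that each upstream version column
        # is already a string, and joins them into one data_version column.
        upstream_cols = [nw.col(c) for c in upstream_column_mapping.values()]
        return joined_upstream.with_columns(
            nw.concat_str(upstream_cols, separator="|").alias("data_version")
        )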