metaxy-0.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (75)
  1. metaxy/__init__.py +61 -0
  2. metaxy/_testing.py +542 -0
  3. metaxy/_utils.py +16 -0
  4. metaxy/_version.py +1 -0
  5. metaxy/cli/app.py +76 -0
  6. metaxy/cli/context.py +71 -0
  7. metaxy/cli/graph.py +576 -0
  8. metaxy/cli/graph_diff.py +290 -0
  9. metaxy/cli/list.py +42 -0
  10. metaxy/cli/metadata.py +271 -0
  11. metaxy/cli/migrations.py +862 -0
  12. metaxy/cli/push.py +55 -0
  13. metaxy/config.py +450 -0
  14. metaxy/data_versioning/__init__.py +24 -0
  15. metaxy/data_versioning/calculators/__init__.py +13 -0
  16. metaxy/data_versioning/calculators/base.py +97 -0
  17. metaxy/data_versioning/calculators/duckdb.py +186 -0
  18. metaxy/data_versioning/calculators/ibis.py +225 -0
  19. metaxy/data_versioning/calculators/polars.py +135 -0
  20. metaxy/data_versioning/diff/__init__.py +15 -0
  21. metaxy/data_versioning/diff/base.py +150 -0
  22. metaxy/data_versioning/diff/narwhals.py +108 -0
  23. metaxy/data_versioning/hash_algorithms.py +19 -0
  24. metaxy/data_versioning/joiners/__init__.py +9 -0
  25. metaxy/data_versioning/joiners/base.py +70 -0
  26. metaxy/data_versioning/joiners/narwhals.py +235 -0
  27. metaxy/entrypoints.py +309 -0
  28. metaxy/ext/__init__.py +1 -0
  29. metaxy/ext/alembic.py +326 -0
  30. metaxy/ext/sqlmodel.py +172 -0
  31. metaxy/ext/sqlmodel_system_tables.py +139 -0
  32. metaxy/graph/__init__.py +21 -0
  33. metaxy/graph/diff/__init__.py +21 -0
  34. metaxy/graph/diff/diff_models.py +399 -0
  35. metaxy/graph/diff/differ.py +740 -0
  36. metaxy/graph/diff/models.py +418 -0
  37. metaxy/graph/diff/rendering/__init__.py +18 -0
  38. metaxy/graph/diff/rendering/base.py +274 -0
  39. metaxy/graph/diff/rendering/cards.py +188 -0
  40. metaxy/graph/diff/rendering/formatter.py +805 -0
  41. metaxy/graph/diff/rendering/graphviz.py +246 -0
  42. metaxy/graph/diff/rendering/mermaid.py +320 -0
  43. metaxy/graph/diff/rendering/rich.py +165 -0
  44. metaxy/graph/diff/rendering/theme.py +48 -0
  45. metaxy/graph/diff/traversal.py +247 -0
  46. metaxy/graph/utils.py +58 -0
  47. metaxy/metadata_store/__init__.py +31 -0
  48. metaxy/metadata_store/_protocols.py +38 -0
  49. metaxy/metadata_store/base.py +1676 -0
  50. metaxy/metadata_store/clickhouse.py +161 -0
  51. metaxy/metadata_store/duckdb.py +167 -0
  52. metaxy/metadata_store/exceptions.py +43 -0
  53. metaxy/metadata_store/ibis.py +451 -0
  54. metaxy/metadata_store/memory.py +228 -0
  55. metaxy/metadata_store/sqlite.py +187 -0
  56. metaxy/metadata_store/system_tables.py +257 -0
  57. metaxy/migrations/__init__.py +34 -0
  58. metaxy/migrations/detector.py +153 -0
  59. metaxy/migrations/executor.py +208 -0
  60. metaxy/migrations/loader.py +260 -0
  61. metaxy/migrations/models.py +718 -0
  62. metaxy/migrations/ops.py +390 -0
  63. metaxy/models/__init__.py +0 -0
  64. metaxy/models/bases.py +6 -0
  65. metaxy/models/constants.py +24 -0
  66. metaxy/models/feature.py +665 -0
  67. metaxy/models/feature_spec.py +105 -0
  68. metaxy/models/field.py +25 -0
  69. metaxy/models/plan.py +155 -0
  70. metaxy/models/types.py +157 -0
  71. metaxy/py.typed +0 -0
  72. metaxy-0.0.0.dist-info/METADATA +247 -0
  73. metaxy-0.0.0.dist-info/RECORD +75 -0
  74. metaxy-0.0.0.dist-info/WHEEL +4 -0
  75. metaxy-0.0.0.dist-info/entry_points.txt +3 -0
metaxy/cli/push.py ADDED
@@ -0,0 +1,55 @@
+ """Push command for recording feature versions."""
+
+ from rich.console import Console
+
+ console = Console()
+
+
+ def push(store: str | None = None):
+     """Record all feature versions (push graph snapshot).
+
+     Records all features in the active graph to the metadata store
+     with a deterministic snapshot version. This should be run after deploying
+     new feature definitions.
+
+     Example:
+         $ metaxy push
+
+         ✓ Recorded feature graph
+           Snapshot version: abc123def456...
+
+         # Or if already recorded:
+         ℹ Snapshot already recorded (skipped)
+           Snapshot version: abc123def456...
+
+     Args:
+         store: The metadata store to use. Defaults to the default store.
+     """
+     from metaxy.cli.context import get_store
+     from metaxy.models.feature import FeatureGraph
+
+     metadata_store = get_store(store)
+
+     with metadata_store:
+         # Get active graph
+         active_graph = FeatureGraph.get_active()
+         if len(active_graph.features_by_key) == 0:
+             console.print("[yellow]⚠[/yellow] No features in active graph")
+             return
+
+         # Record feature graph snapshot (idempotent)
+         # Returns (snapshot_version, already_exists)
+         snapshot_version, already_exists = (
+             metadata_store.record_feature_graph_snapshot()
+         )
+
+         if already_exists:
+             console.print("[blue]ℹ[/blue] Snapshot already recorded (skipped)")
+             console.print(f"  Snapshot version: {snapshot_version}")
+         else:
+             console.print("[green]✓[/green] Recorded feature graph")
+             console.print(f"  Snapshot version: {snapshot_version}")
+
+
+ if __name__ == "__main__":
+     push()
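
A minimal usage sketch for the command above; the "staging" store name is hypothetical and would need to exist in the loaded configuration (see config.py below):

```python
# Hypothetical: push the active feature graph to a named store programmatically.
from metaxy.cli.push import push

push(store="staging")  # store=None falls back to the configured default store
```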
metaxy/config.py ADDED
@@ -0,0 +1,450 @@
+ """Configuration system for Metaxy using pydantic-settings."""
+ # pyright: reportImportCycles=false
+
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, TypeVar
+
+ try:
+     import tomllib  # Python 3.11+  # pyright: ignore[reportMissingImports]
+ except ImportError:
+     import tomli as tomllib  # Fallback for Python 3.10
+
+ import warnings
+ from contextvars import ContextVar
+
+ from pydantic import Field as PydanticField
+ from pydantic import PrivateAttr
+ from pydantic_settings import (
+     BaseSettings,
+     PydanticBaseSettingsSource,
+     SettingsConfigDict,
+ )
+ from typing_extensions import Self
+
+ if TYPE_CHECKING:
+     from metaxy.metadata_store.base import (
+         MetadataStore,  # pyright: ignore[reportImportCycles]
+     )
+
+ T = TypeVar("T")
+
+
+ class TomlConfigSettingsSource(PydanticBaseSettingsSource):
+     """Custom settings source for TOML configuration files.
+
+     Auto-discovers configuration in this order:
+     1. Explicit file path if provided
+     2. metaxy.toml in current directory (preferred)
+     3. pyproject.toml [tool.metaxy] section (fallback)
+     4. No config (returns empty dict)
+     """
+
+     def __init__(self, settings_cls: type[BaseSettings], toml_file: Path | None = None):
+         super().__init__(settings_cls)
+         self.toml_file = toml_file or self._discover_config_file()
+         self.toml_data = self._load_toml()
+
+     def _discover_config_file(self) -> Path | None:
+         """Auto-discover config file."""
+         # Prefer metaxy.toml
+         if Path("metaxy.toml").exists():
+             return Path("metaxy.toml")
+
+         # Fallback to pyproject.toml
+         if Path("pyproject.toml").exists():
+             return Path("pyproject.toml")
+
+         return None
+
+     def _load_toml(self) -> dict[str, Any]:
+         """Load TOML file and extract metaxy config."""
+         if self.toml_file is None:
+             return {}
+
+         with open(self.toml_file, "rb") as f:
+             data = tomllib.load(f)
+
+         # Extract [tool.metaxy] from pyproject.toml or root from metaxy.toml
+         if self.toml_file.name == "pyproject.toml":
+             return data.get("tool", {}).get("metaxy", {})
+         else:
+             return data
+
+     def get_field_value(self, field: Any, field_name: str) -> tuple[Any, str, bool]:
+         """Get field value from TOML data."""
+         field_value = self.toml_data.get(field_name)
+         return field_value, field_name, False
+
+     def __call__(self) -> dict[str, Any]:
+         """Return all settings from TOML."""
+         return self.toml_data
+
+
+ class StoreConfig(BaseSettings):
+     """Configuration for a single metadata store.
+
+     Structure:
+         type: Full import path to store class
+         config: Dict of all configuration (including fallback_stores)
+
+     Example:
+         >>> config = StoreConfig(
+         ...     type="metaxy_delta.DeltaMetadataStore",
+         ...     config={
+         ...         "table_uri": "s3://bucket/metadata",
+         ...         "region": "us-west-2",
+         ...         "fallback_stores": ["prod"],
+         ...     }
+         ... )
+     """
+
+     model_config = SettingsConfigDict(
+         extra="forbid",  # Only type and config fields allowed
+     )
+
+     # Store class (full import path)
+     type: str
+
+     # Store configuration (all kwargs for __init__)
+     # This includes fallback_stores, table_uri, db_path, storage_options, etc.
+     config: dict[str, Any] = PydanticField(default_factory=dict)
+
+
+ class PluginConfig(BaseSettings):
+     """Configuration for Metaxy plugins."""
+
+     enabled: bool = PydanticField(
+         default=False,
+         description="Whether to enable the plugin.",
+     )
+
+     _plugin: str = PrivateAttr()
+
+
+ class SQLModelConfig(PluginConfig):
+     """Configuration for the SQLModel integration."""
+
+     infer_db_table_names: bool = PydanticField(
+         default=True,
+         description="Whether to automatically use `FeatureKey.table_name` for sqlalchemy's __tablename__ value.",
+     )
+
+     # Whether to use SQLModel definitions for system tables (for Alembic migrations)
+     system_tables: bool = PydanticField(
+         default=False,
+         description="Whether to use SQLModel definitions for system tables (for Alembic migrations).",
+     )
+
+     _plugin: str = PrivateAttr(default="sqlmodel")
+
+
+ class ExtConfig(BaseSettings):
+     """Configuration for Metaxy integrations with third-party tools."""
+
+     model_config = SettingsConfigDict(
+         extra="allow",
+     )
+
+     sqlmodel: SQLModelConfig = PydanticField(default_factory=SQLModelConfig)
+
+
+ # Context variable for storing the app context
+ _metaxy_config: ContextVar["MetaxyConfig | None"] = ContextVar(
+     "_metaxy_config", default=None
+ )
+
+
+ class MetaxyConfig(BaseSettings):
+     """Main Metaxy configuration.
+
+     Loads from:
+     1. TOML file (metaxy.toml or pyproject.toml [tool.metaxy])
+     2. Environment variables (METAXY_*)
+     3. Init arguments
+
+     Priority: init > env vars > TOML
+
+     Example:
+         >>> # Auto-discover config
+         >>> config = MetaxyConfig.load()
+         >>>
+         >>> # Get store instance
+         >>> store = config.get_store("prod")
+         >>>
+         >>> # Override via env var
+         >>> # METAXY_STORE=staging METAXY_REGISTRY=myapp.features:my_graph
+         >>> config = MetaxyConfig.load()
+         >>> store = config.get_store()  # Uses staging with custom graph
+     """
+
+     model_config = SettingsConfigDict(
+         env_prefix="METAXY_",
+         env_nested_delimiter="__",
+     )
+
+     # Store to use
+     store: str = "dev"
+
+     # Named store configurations
+     stores: dict[str, StoreConfig] = PydanticField(default_factory=dict)
+
+     # Migrations directory
+     migrations_dir: str = ".metaxy/migrations"
+
+     # Entrypoints to load (list of module paths)
+     entrypoints: list[str] = PydanticField(default_factory=list)
+
+     # Graph rendering theme
+     theme: str = "default"
+
+     ext: ExtConfig = PydanticField(default_factory=ExtConfig)
+
+     @property
+     def plugins(self) -> list[str]:
+         """Returns all enabled plugin names from ext configuration."""
+         plugins = []
+         for field_name in type(self.ext).model_fields:
+             field_value = getattr(self.ext, field_name)
+             if hasattr(field_value, "_plugin") and field_value.enabled:
+                 plugins.append(field_value._plugin)
+         return plugins
+
+     @classmethod
+     def settings_customise_sources(
+         cls,
+         settings_cls: type[BaseSettings],
+         init_settings: PydanticBaseSettingsSource,
+         env_settings: PydanticBaseSettingsSource,
+         dotenv_settings: PydanticBaseSettingsSource,
+         file_secret_settings: PydanticBaseSettingsSource,
+     ) -> tuple[PydanticBaseSettingsSource, ...]:
+         """Customize settings sources: init → env → TOML.
+
+         Priority (first wins):
+         1. Init arguments
+         2. Environment variables
+         3. TOML file
+         """
+         toml_settings = TomlConfigSettingsSource(settings_cls)
+         return (init_settings, env_settings, toml_settings)
+
+     @classmethod
+     def get(cls) -> "MetaxyConfig":
+         """Get the current Metaxy configuration."""
+         cfg = _metaxy_config.get()
+         if cfg is None:
+             warnings.warn(
+                 UserWarning(
+                     "Global Metaxy configuration not initialized. It can be set with MetaxyConfig.set(config), typically after loading it from a TOML file. Returning default configuration (with environment variables and other pydantic settings sources resolved)."
+                 )
+             )
+             return cls()
+         else:
+             return cfg
+
+     @classmethod
+     def set(cls, config: Self) -> None:
+         """Set the current Metaxy configuration."""
+         _metaxy_config.set(config)
+
+     @classmethod
+     def load(
+         cls, config_file: str | Path | None = None, *, search_parents: bool = True
+     ) -> "MetaxyConfig":
+         """Load config with auto-discovery and parent directory search.
+
+         Args:
+             config_file: Optional config file path (overrides auto-discovery)
+             search_parents: Search parent directories for config file (default: True)
+
+         Returns:
+             Loaded config (TOML + env vars merged)
+
+         Example:
+             >>> # Auto-discover with parent search
+             >>> config = MetaxyConfig.load()
+             >>>
+             >>> # Explicit file
+             >>> config = MetaxyConfig.load("custom.toml")
+             >>>
+             >>> # Auto-discover without parent search
+             >>> config = MetaxyConfig.load(search_parents=False)
+         """
+         # Search for config file if not explicitly provided
+         if config_file is None and search_parents:
+             config_file = cls._discover_config_with_parents()
+
+         # For an explicit file, temporarily patch the TomlConfigSettingsSource
+         # to use that file, then use normal instantiation.
+         # This ensures env vars still work.
+
+         if config_file:
+             # Create a custom settings source class for this file
+             toml_path = Path(config_file)
+
+             class CustomTomlSource(TomlConfigSettingsSource):
+                 def __init__(self, settings_cls: type[BaseSettings]):
+                     # Skip auto-discovery, use explicit file
+                     super(TomlConfigSettingsSource, self).__init__(settings_cls)
+                     self.toml_file = toml_path
+                     self.toml_data = self._load_toml()
+
+             # Customize sources to use custom TOML file
+             original_method = cls.settings_customise_sources
+
+             @classmethod  # type: ignore[misc]
+             def custom_sources(
+                 cls_inner,
+                 settings_cls,
+                 init_settings,
+                 env_settings,
+                 dotenv_settings,
+                 file_secret_settings,
+             ):
+                 toml_settings = CustomTomlSource(settings_cls)
+                 return (init_settings, env_settings, toml_settings)
+
+             # Temporarily replace method
+             cls.settings_customise_sources = custom_sources  # type: ignore[assignment]
+             config = cls()
+             cls.settings_customise_sources = original_method  # type: ignore[method-assign]
+         else:
+             # Use default sources (auto-discovery + env vars)
+             config = cls()
+
+         cls.set(config)
+
+         return config
+
+     @staticmethod
+     def _discover_config_with_parents() -> Path | None:
+         """Discover config file by searching current and parent directories.
+
+         Searches for metaxy.toml or pyproject.toml in the current directory,
+         then iteratively searches parent directories.
+
+         Returns:
+             Path to config file if found, None otherwise
+         """
+         current = Path.cwd()
+
+         while True:
+             # Check for metaxy.toml (preferred)
+             metaxy_toml = current / "metaxy.toml"
+             if metaxy_toml.exists():
+                 return metaxy_toml
+
+             # Check for pyproject.toml
+             pyproject_toml = current / "pyproject.toml"
+             if pyproject_toml.exists():
+                 return pyproject_toml
+
+             # Move to parent
+             parent = current.parent
+             if parent == current:
+                 # Reached root
+                 break
+             current = parent
+
+         return None
+
+     def get_store(
+         self,
+         name: str | None = None,
+     ) -> "MetadataStore":
+         """Instantiate metadata store by name.
+
+         Args:
+             name: Store name (uses config.store if None)
+
+         Returns:
+             Instantiated metadata store
+
+         Raises:
+             ValueError: If store name not found in config, or if fallback stores
+                 have different hash algorithms than the parent store
+             ImportError: If store class cannot be imported
+
+         Example:
+             >>> config = MetaxyConfig.load()
+             >>> store = config.get_store("prod")
+             >>>
+             >>> # Use default store
+             >>> store = config.get_store()
+         """
+         from metaxy.data_versioning.hash_algorithms import HashAlgorithm
+
+         if len(self.stores) == 0:
+             raise ValueError(
+                 "No Metaxy stores available. They should be configured in metaxy.toml|pyproject.toml or via environment variables."
+             )
+
+         name = name or self.store
+
+         if name not in self.stores:
+             raise ValueError(
+                 f"Store '{name}' not found in config. "
+                 f"Available stores: {list(self.stores.keys())}"
+             )
+
+         store_config = self.stores[name]
+
+         # Import store class
+         store_class = self._import_class(store_config.type)
+
+         # Extract configuration
+         config_copy = store_config.config.copy()
+         fallback_store_names = config_copy.pop("fallback_stores", [])
+
+         # Get hash_algorithm from config (if specified) and convert to enum
+         configured_hash_algorithm = config_copy.get("hash_algorithm")
+         if configured_hash_algorithm is not None:
+             # Convert string to enum if needed
+             if isinstance(configured_hash_algorithm, str):
+                 configured_hash_algorithm = HashAlgorithm(configured_hash_algorithm)
+             config_copy["hash_algorithm"] = configured_hash_algorithm
+         else:
+             # Use default
+             configured_hash_algorithm = HashAlgorithm.XXHASH64
+             config_copy["hash_algorithm"] = configured_hash_algorithm
+
+         # Build fallback stores recursively
+         fallback_stores = []
+         for fallback_name in fallback_store_names:
+             fallback_store = self.get_store(fallback_name)
+             fallback_stores.append(fallback_store)
+
+         # Instantiate store with config + fallback_stores
+         store = store_class(
+             fallback_stores=fallback_stores,
+             **config_copy,
+         )
+
+         # Verify the store actually uses the hash algorithm we configured
+         # (in case a store subclass overrides the default or ignores the parameter)
+         if store.hash_algorithm != configured_hash_algorithm:
+             raise ValueError(
+                 f"Store '{name}' ({store_class.__name__}) was configured with "
+                 f"hash_algorithm='{configured_hash_algorithm.value}' but is using "
+                 f"'{store.hash_algorithm.value}'. The store class may have overridden "
+                 f"the hash algorithm. All stores must use the same hash algorithm."
+             )
+
+         return store
+
+     @staticmethod
+     def _import_class(class_path: str) -> type:
+         """Import class from module path.
+
+         Args:
+             class_path: Full import path like "metaxy.metadata_store.InMemoryMetadataStore"
+
+         Returns:
+             Imported class
+
+         Raises:
+             ImportError: If module or class not found
+         """
+         module_path, class_name = class_path.rsplit(".", 1)
+         module = __import__(module_path, fromlist=[class_name])
+         return getattr(module, class_name)
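
To make the configuration round-trip concrete, here is a minimal sketch of a metaxy.toml and the load/get_store flow. The `[stores.<name>]` shape (a `type` import path plus a `config` table) comes from StoreConfig above; the DuckDB store class name and its `db_path` kwarg are assumptions for illustration (the wheel ships metaxy/metadata_store/duckdb.py, but the class name is not visible in this diff):

```python
# Hedged sketch: write a config file, then load it and resolve the default store.
from pathlib import Path

from metaxy.config import MetaxyConfig

Path("metaxy.toml").write_text(
    """\
store = "dev"

[stores.dev]
# Assumed class name -- any importable MetadataStore subclass works here.
type = "metaxy.metadata_store.duckdb.DuckDBMetadataStore"

[stores.dev.config]
db_path = ".metaxy/dev.duckdb"  # config kwargs are passed straight to __init__
"""
)

config = MetaxyConfig.load("metaxy.toml")  # TOML merged with METAXY_* env vars
store = config.get_store()  # resolves "dev", imports the class, instantiates it
```

Per settings_customise_sources, an environment variable such as METAXY_STORE=prod would take priority over the `store` key in the TOML file.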
metaxy/data_versioning/__init__.py ADDED
@@ -0,0 +1,24 @@
+ """Data versioning module for sample-level data version calculation."""
+
+ from metaxy.data_versioning.calculators import (
+     DataVersionCalculator,
+     PolarsDataVersionCalculator,
+ )
+ from metaxy.data_versioning.diff import (
+     DiffResult,
+     MetadataDiffResolver,
+     NarwhalsDiffResolver,
+ )
+ from metaxy.data_versioning.hash_algorithms import HashAlgorithm
+ from metaxy.data_versioning.joiners import NarwhalsJoiner, UpstreamJoiner
+
+ __all__ = [
+     "HashAlgorithm",
+     "UpstreamJoiner",
+     "NarwhalsJoiner",
+     "DataVersionCalculator",
+     "PolarsDataVersionCalculator",
+     "DiffResult",
+     "MetadataDiffResolver",
+     "NarwhalsDiffResolver",
+ ]
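
The `__all__` re-exports above define the package's public surface, so callers can import from the package root rather than the submodules:

```python
# Names re-exported by metaxy/data_versioning/__init__.py resolve directly:
from metaxy.data_versioning import HashAlgorithm, NarwhalsJoiner, NarwhalsDiffResolver
```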
metaxy/data_versioning/calculators/__init__.py ADDED
@@ -0,0 +1,13 @@
+ """Data version calculators for computing hash from upstream data."""
+
+ from metaxy.data_versioning.calculators.base import DataVersionCalculator
+ from metaxy.data_versioning.calculators.duckdb import DuckDBDataVersionCalculator
+ from metaxy.data_versioning.calculators.ibis import IbisDataVersionCalculator
+ from metaxy.data_versioning.calculators.polars import PolarsDataVersionCalculator
+
+ __all__ = [
+     "DataVersionCalculator",
+     "DuckDBDataVersionCalculator",
+     "IbisDataVersionCalculator",
+     "PolarsDataVersionCalculator",
+ ]
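
A short sketch of picking a calculator and checking algorithm support, based on the supported_algorithms/default_algorithm contract and the doctest in base.py below; nothing beyond those properties is assumed:

```python
# Pick a calculator and confirm it supports the store's configured algorithm.
from metaxy.data_versioning.calculators import PolarsDataVersionCalculator
from metaxy.data_versioning.hash_algorithms import HashAlgorithm

calc = PolarsDataVersionCalculator()
algo = HashAlgorithm.XXHASH64
if algo not in calc.supported_algorithms:
    algo = calc.default_algorithm  # every calculator must supply a default
```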
metaxy/data_versioning/calculators/base.py ADDED
@@ -0,0 +1,97 @@
+ """Abstract base class for data version calculators."""
+
+ from abc import ABC, abstractmethod
+ from typing import TYPE_CHECKING, Any
+
+ import narwhals as nw
+
+ from metaxy.data_versioning.hash_algorithms import HashAlgorithm
+
+ if TYPE_CHECKING:
+     from metaxy.models.feature_spec import FeatureSpec
+     from metaxy.models.plan import FeaturePlan
+
+
+ class DataVersionCalculator(ABC):
+     """Calculates data_version hash from joined upstream data.
+
+     The calculator takes joined upstream data (output from UpstreamJoiner)
+     and computes the data_version hash for each sample.
+
+     This is Step 2 in the data versioning process:
+     1. Join upstream features → unified upstream view
+     2. Calculate data_version from upstream → target versions ← THIS STEP
+     3. Diff with current metadata → identify changes
+
+     All calculators work with Narwhals LazyFrames for backend compatibility.
+
+     Examples:
+         - PolarsDataVersionCalculator: Uses polars-hash for in-memory hashing
+         - NarwhalsDataVersionCalculator: Uses native SQL hash functions in the database
+     """
+
+     @property
+     @abstractmethod
+     def supported_algorithms(self) -> list[HashAlgorithm]:
+         """List of hash algorithms this calculator supports.
+
+         Returns:
+             List of supported HashAlgorithm values
+
+         Example:
+             >>> calc = PolarsDataVersionCalculator()
+             >>> HashAlgorithm.XXHASH64 in calc.supported_algorithms
+             True
+         """
+         pass
+
+     @property
+     @abstractmethod
+     def default_algorithm(self) -> HashAlgorithm:
+         """Default hash algorithm for this calculator.
+
+         Should be the most performant algorithm that's widely compatible.
+         Typically xxHash64 for cross-database compatibility.
+
+         Returns:
+             Default HashAlgorithm
+         """
+         pass
+
+     @abstractmethod
+     def calculate_data_versions(
+         self,
+         joined_upstream: nw.LazyFrame[Any],
+         feature_spec: "FeatureSpec",
+         feature_plan: "FeaturePlan",
+         upstream_column_mapping: dict[str, str],
+         hash_algorithm: HashAlgorithm | None = None,
+     ) -> nw.LazyFrame[Any]:
+         """Calculate data_version column from joined upstream data.
+
+         Computes a Merkle tree hash for each sample by:
+         1. For each field in the feature:
+            a. Concatenate: field_key | code_version | upstream hashes
+            b. Hash the concatenated string
+         2. Create struct with all field hashes
+         3. Add as data_version column
+
+         Args:
+             joined_upstream: Narwhals LazyFrame with all upstream data_version
+                 columns joined (output from UpstreamJoiner.join_upstream)
+             feature_spec: Specification of the feature being computed
+             feature_plan: Resolved feature plan with dependencies
+             upstream_column_mapping: Maps upstream feature key -> column name
+                 where its data_version struct is located in joined_upstream.
+                 Example: {"video": "__upstream_video__data_version"}
+             hash_algorithm: Hash algorithm to use. If None, uses self.default_algorithm.
+                 Must be in self.supported_algorithms.
+
+         Returns:
+             Narwhals LazyFrame with data_version column added.
+             Shape: [sample_uid, __upstream_*__data_version columns, data_version (new)]
+
+         Raises:
+             ValueError: If hash_algorithm not in supported_algorithms
+         """
+         pass
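
The calculate_data_versions docstring describes a Merkle-style hash per sample. Below is a dependency-free sketch of that recipe in plain Python; the real calculators run it columnar over LazyFrames and default to xxHash64, so sha256, the `|` separator, and all field/column names here are illustrative only:

```python
import hashlib


def field_hash(field_key: str, code_version: str, upstream_hashes: list[str]) -> str:
    # Steps 1a/1b: concatenate field_key | code_version | upstream hashes, then hash.
    payload = "|".join([field_key, code_version, *upstream_hashes])
    return hashlib.sha256(payload.encode()).hexdigest()


# One sample whose feature depends on a single "video" upstream; the value is
# what would sit in the __upstream_video__data_version column after the join.
upstream = {"video": "9c5b2f0e41d8a7aa"}

# Step 2: the struct of per-field hashes becomes the sample's data_version.
data_version = {
    "embedding": field_hash("embedding", "v2", [upstream["video"]]),
}
```

Because each field hash folds in the field key, its code version, and every upstream hash, a change in any of those inputs propagates to a new data_version for the sample, which is what the diff step (step 3) detects.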