metaxy-0.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of metaxy might be problematic.

Files changed (75)
  1. metaxy/__init__.py +61 -0
  2. metaxy/_testing.py +542 -0
  3. metaxy/_utils.py +16 -0
  4. metaxy/_version.py +1 -0
  5. metaxy/cli/app.py +76 -0
  6. metaxy/cli/context.py +71 -0
  7. metaxy/cli/graph.py +576 -0
  8. metaxy/cli/graph_diff.py +290 -0
  9. metaxy/cli/list.py +42 -0
  10. metaxy/cli/metadata.py +271 -0
  11. metaxy/cli/migrations.py +862 -0
  12. metaxy/cli/push.py +55 -0
  13. metaxy/config.py +450 -0
  14. metaxy/data_versioning/__init__.py +24 -0
  15. metaxy/data_versioning/calculators/__init__.py +13 -0
  16. metaxy/data_versioning/calculators/base.py +97 -0
  17. metaxy/data_versioning/calculators/duckdb.py +186 -0
  18. metaxy/data_versioning/calculators/ibis.py +225 -0
  19. metaxy/data_versioning/calculators/polars.py +135 -0
  20. metaxy/data_versioning/diff/__init__.py +15 -0
  21. metaxy/data_versioning/diff/base.py +150 -0
  22. metaxy/data_versioning/diff/narwhals.py +108 -0
  23. metaxy/data_versioning/hash_algorithms.py +19 -0
  24. metaxy/data_versioning/joiners/__init__.py +9 -0
  25. metaxy/data_versioning/joiners/base.py +70 -0
  26. metaxy/data_versioning/joiners/narwhals.py +235 -0
  27. metaxy/entrypoints.py +309 -0
  28. metaxy/ext/__init__.py +1 -0
  29. metaxy/ext/alembic.py +326 -0
  30. metaxy/ext/sqlmodel.py +172 -0
  31. metaxy/ext/sqlmodel_system_tables.py +139 -0
  32. metaxy/graph/__init__.py +21 -0
  33. metaxy/graph/diff/__init__.py +21 -0
  34. metaxy/graph/diff/diff_models.py +399 -0
  35. metaxy/graph/diff/differ.py +740 -0
  36. metaxy/graph/diff/models.py +418 -0
  37. metaxy/graph/diff/rendering/__init__.py +18 -0
  38. metaxy/graph/diff/rendering/base.py +274 -0
  39. metaxy/graph/diff/rendering/cards.py +188 -0
  40. metaxy/graph/diff/rendering/formatter.py +805 -0
  41. metaxy/graph/diff/rendering/graphviz.py +246 -0
  42. metaxy/graph/diff/rendering/mermaid.py +320 -0
  43. metaxy/graph/diff/rendering/rich.py +165 -0
  44. metaxy/graph/diff/rendering/theme.py +48 -0
  45. metaxy/graph/diff/traversal.py +247 -0
  46. metaxy/graph/utils.py +58 -0
  47. metaxy/metadata_store/__init__.py +31 -0
  48. metaxy/metadata_store/_protocols.py +38 -0
  49. metaxy/metadata_store/base.py +1676 -0
  50. metaxy/metadata_store/clickhouse.py +161 -0
  51. metaxy/metadata_store/duckdb.py +167 -0
  52. metaxy/metadata_store/exceptions.py +43 -0
  53. metaxy/metadata_store/ibis.py +451 -0
  54. metaxy/metadata_store/memory.py +228 -0
  55. metaxy/metadata_store/sqlite.py +187 -0
  56. metaxy/metadata_store/system_tables.py +257 -0
  57. metaxy/migrations/__init__.py +34 -0
  58. metaxy/migrations/detector.py +153 -0
  59. metaxy/migrations/executor.py +208 -0
  60. metaxy/migrations/loader.py +260 -0
  61. metaxy/migrations/models.py +718 -0
  62. metaxy/migrations/ops.py +390 -0
  63. metaxy/models/__init__.py +0 -0
  64. metaxy/models/bases.py +6 -0
  65. metaxy/models/constants.py +24 -0
  66. metaxy/models/feature.py +665 -0
  67. metaxy/models/feature_spec.py +105 -0
  68. metaxy/models/field.py +25 -0
  69. metaxy/models/plan.py +155 -0
  70. metaxy/models/types.py +157 -0
  71. metaxy/py.typed +0 -0
  72. metaxy-0.0.0.dist-info/METADATA +247 -0
  73. metaxy-0.0.0.dist-info/RECORD +75 -0
  74. metaxy-0.0.0.dist-info/WHEEL +4 -0
  75. metaxy-0.0.0.dist-info/entry_points.txt +3 -0
metaxy/migrations/ops.py ADDED
@@ -0,0 +1,390 @@
+ """Migration operation types."""
+
+ import hashlib
+ from abc import ABC, abstractmethod
+ from typing import TYPE_CHECKING, Literal
+
+ import pydantic
+
+ if TYPE_CHECKING:
+     from metaxy.metadata_store.base import MetadataStore
+
+
+ class BaseOperation(pydantic.BaseModel, ABC):  # pyright: ignore[reportUnsafeMultipleInheritance]
+     """Base class for all migration operations.
+
+     All operations must have:
+     - id: Unique identifier within migration
+     - type: Full class path for polymorphic deserialization (must be Literal in subclasses)
+     - feature_key: Root feature this operation affects
+     - reason: Human-readable explanation
+
+     Subclasses implement execute() to perform the actual migration logic.
+
+     Note: The 'type' field must be defined as a Literal in each subclass
+     for Pydantic discriminated unions to work.
+     """
+
+     id: str  # Required, user-provided or auto-generated
+     # type field must be Literal in subclasses for discriminated unions
+     feature_key: list[str]
+     reason: str
+
+     @abstractmethod
+     def execute(
+         self,
+         store: "MetadataStore",
+         *,
+         from_snapshot_version: str,
+         to_snapshot_version: str,
+         dry_run: bool = False,
+     ) -> int:
+         """Execute the operation.
+
+         Args:
+             store: Metadata store to operate on
+             from_snapshot_version: Source snapshot version (old state)
+             to_snapshot_version: Target snapshot version (new state)
+             dry_run: If True, only validate and return count without executing
+
+         Returns:
+             Number of rows affected
+
+         Raises:
+             Exception: If operation fails
+         """
+         pass
+
+     def operation_config_hash(self) -> str:
+         """Generate hash of operation config (excluding id).
+
+         Used to detect if operation content changed after partial migration.
+
+         Returns:
+             16-character hex hash
+         """
+         content = self.model_dump_json(exclude={"id"}, by_alias=True)
+         return hashlib.sha256(content.encode()).hexdigest()
+
+
+ class DataVersionReconciliation(pydantic.BaseModel):
+     """Reconcile data versions when feature definition changes BUT computation is unchanged.
+
+     This operation applies to ALL affected features in the migration.
+     Feature keys are deducible from snapshot changes, so they're not specified here.
+
+     This operation:
+     1. For each affected feature, derives old/new feature_versions from snapshots
+     2. Finds rows with old feature_version
+     3. Recalculates data_versions based on new feature definition
+     4. Writes new rows with updated feature_version and data_version
+     5. Preserves all user metadata columns (immutable)
+
+     Use ONLY when code changed but computation results would be identical:
+     - Dependency graph refactoring (more precise field dependencies)
+     - Field structure changes (renaming, splitting, better schema)
+     - Code organization improvements (imports, typing, refactoring)
+
+     Do NOT use when computation actually changed:
+     - Different algorithm/model → re-run pipeline instead
+     - Bug fixes that affect output → re-run pipeline instead
+     - New model version → re-run pipeline instead
+
+     Feature versions are automatically derived from the migration's snapshot versions.
+     Affected features are determined from the snapshot diff.
+
+     Example YAML:
+         operations:
+           - type: metaxy.migrations.ops.DataVersionReconciliation
+     """
+
+     type: Literal["metaxy.migrations.ops.DataVersionReconciliation"] = (
+         "metaxy.migrations.ops.DataVersionReconciliation"
+     )
+
+     def execute_for_feature(
+         self,
+         store: "MetadataStore",
+         feature_key: str,
+         *,
+         from_snapshot_version: str,
+         to_snapshot_version: str,
+         dry_run: bool = False,
+     ) -> int:
+         """Execute data version reconciliation for a single feature.
+
+         Only works for features with upstream dependencies. For root features
+         (no upstream), data_versions are user-defined and cannot be automatically
+         reconciled - user must re-run their computation pipeline.
+
+         Process:
+         1. Verify feature has upstream dependencies
+         2. Query old and new feature_versions from snapshot metadata
+         3. Load existing metadata with old feature_version
+         4. Use resolve_update() to calculate expected data_versions based on current upstream
+         5. Join existing user metadata with new data_versions
+         6. Write with new feature_version and snapshot_version
+
+         Args:
+             store: Metadata store
+             feature_key: Feature key string (e.g., "examples/child")
+             from_snapshot_version: Source snapshot version (old state)
+             to_snapshot_version: Target snapshot version (new state)
+             dry_run: If True, return row count without executing
+
+         Returns:
+             Number of rows affected
+
+         Raises:
+             ValueError: If feature has no upstream dependencies (root feature)
+         """
+         import narwhals as nw
+
+         from metaxy.metadata_store.base import (
+             FEATURE_VERSIONS_KEY,
+             allow_feature_version_override,
+         )
+         from metaxy.metadata_store.exceptions import FeatureNotFoundError
+         from metaxy.models.feature import FeatureGraph
+         from metaxy.models.types import FeatureKey
+
+         feature_key_obj = FeatureKey(feature_key.split("/"))
+         feature_key_str = feature_key_obj.to_string()
+         graph = FeatureGraph.get_active()
+         feature_cls = graph.features_by_key[feature_key_obj]
+
+         # 1. Verify feature has upstream dependencies
+         plan = graph.get_feature_plan(feature_key_obj)
+         has_upstream = plan.deps is not None and len(plan.deps) > 0
+
+         if not has_upstream:
+             raise ValueError(
+                 f"DataVersionReconciliation cannot be used for root feature {feature_key_str}. "
+                 f"Root features have user-defined data_versions that cannot be automatically reconciled. "
+                 f"User must re-run their computation pipeline to generate new data."
+             )
+
+         # 2. Query feature versions from snapshot metadata
+         try:
+             from_version_data = store.read_metadata(
+                 FEATURE_VERSIONS_KEY,
+                 current_only=False,
+                 allow_fallback=False,
+                 filters=[
+                     (nw.col("snapshot_version") == from_snapshot_version)
+                     & (nw.col("feature_key") == feature_key_str)
+                 ],
+             )
+         except FeatureNotFoundError:
+             from_version_data = None
+
+         try:
+             to_version_data = store.read_metadata(
+                 FEATURE_VERSIONS_KEY,
+                 current_only=False,
+                 allow_fallback=False,
+                 filters=[
+                     (nw.col("snapshot_version") == to_snapshot_version)
+                     & (nw.col("feature_key") == feature_key_str)
+                 ],
+             )
+         except FeatureNotFoundError:
+             to_version_data = None
+
+         # Extract feature versions from lazy frames
+         from_feature_version: str | None = None
+         to_feature_version: str | None = None
+
+         if from_version_data is not None:
+             from_version_df = from_version_data.head(1).collect()
+             if from_version_df.shape[0] > 0:
+                 from_feature_version = str(from_version_df["feature_version"][0])
+             else:
+                 from_version_data = None
+
+         if to_version_data is not None:
+             to_version_df = to_version_data.head(1).collect()
+             if to_version_df.shape[0] > 0:
+                 to_feature_version = str(to_version_df["feature_version"][0])
+             else:
+                 to_version_data = None
+
+         if from_version_data is None:
+             raise ValueError(
+                 f"Feature {feature_key_str} not found in from_snapshot {from_snapshot_version}"
+             )
+         if to_version_data is None:
+             raise ValueError(
+                 f"Feature {feature_key_str} not found in to_snapshot {to_snapshot_version}"
+             )
+
+         assert from_feature_version is not None
+         assert to_feature_version is not None
+
+         # 3. Load existing metadata with old feature_version
+         try:
+             existing_metadata = store.read_metadata(
+                 feature_cls,
+                 current_only=False,
+                 filters=[nw.col("feature_version") == from_feature_version],
+                 allow_fallback=False,
+             )
+         except FeatureNotFoundError:
+             # Feature doesn't exist yet - nothing to migrate
+             return 0
+
+         # Collect to check existence and get row count
+         existing_metadata_df = existing_metadata.collect()
+         if existing_metadata_df.shape[0] == 0:
+             # Already migrated (idempotent)
+             return 0
+
+         if dry_run:
+             return existing_metadata_df.shape[0]
+
+         # 4. Get sample metadata (exclude system columns)
+         user_columns = [
+             c
+             for c in existing_metadata_df.columns
+             if c not in ["data_version", "feature_version", "snapshot_version"]
+         ]
+         sample_metadata = existing_metadata_df.select(user_columns)
+
+         # 5. Use resolve_update to calculate data_versions based on current upstream
+         # Convert to Polars for the join to avoid cross-backend issues
+         sample_metadata_pl = nw.from_native(sample_metadata.to_native()).to_polars()
+
+         diff_result = store.resolve_update(feature_cls, sample_df=sample_metadata_pl)
+
+         # Use 'changed' for reconciliation (data_versions changed due to upstream)
+         # Use 'added' for new feature materialization
+         # Convert results to Polars for consistent joining
+         if len(diff_result.changed) > 0:
+             changed_pl = nw.from_native(diff_result.changed.to_native()).to_polars()
+             new_data_versions = changed_pl.select(["sample_uid", "data_version"])
+             df_to_write = sample_metadata_pl.join(
+                 new_data_versions, on="sample_uid", how="inner"
+             )
+         elif len(diff_result.added) > 0:
+             df_to_write = nw.from_native(diff_result.added.to_native()).to_polars()
+         else:
+             return 0
+
+         # 6. Write with new feature_version and snapshot_version
+         # Wrap in Narwhals for write_metadata
+         df_to_write_nw = nw.from_native(df_to_write)
+         df_to_write_nw = df_to_write_nw.with_columns(
+             nw.lit(to_feature_version).alias("feature_version"),
+             nw.lit(to_snapshot_version).alias("snapshot_version"),
+         )
+
+         with allow_feature_version_override():
+             store.write_metadata(feature_cls, df_to_write_nw)
+
+         return len(df_to_write)
+
+
+ class MetadataBackfill(BaseOperation, ABC):
+     """Base class for metadata backfill operations.
+
+     Users subclass this to implement custom backfill logic with complete
+     control over the entire process: loading, transforming, joining, filtering,
+     and writing metadata.
+
+     The user implements execute() and can:
+     - Load metadata from any external source (S3, database, API, etc.)
+     - Perform custom transformations and filtering
+     - Join with Metaxy's calculated data_versions however they want
+     - Write results to the store
+
+     Example Subclass:
+         class S3VideoBackfill(MetadataBackfill):
+             type: Literal["myproject.migrations.S3VideoBackfill"]
+             s3_bucket: str
+             s3_prefix: str
+             min_size_mb: int = 10
+
+             def execute(self, store, *, dry_run=False):
+                 import boto3
+
+                 # Load from S3
+                 s3 = boto3.client('s3')
+                 objects = s3.list_objects_v2(
+                     Bucket=self.s3_bucket,
+                     Prefix=self.s3_prefix
+                 )
+
+                 external_df = pl.DataFrame([
+                     {
+                         "sample_uid": obj['Key'],
+                         "path": f"s3://{self.s3_bucket}/{obj['Key']}",
+                         "size_bytes": obj['Size']
+                     }
+                     for obj in objects['Contents']
+                 ])
+
+                 # Filter
+                 external_df = external_df.filter(
+                     pl.col("size_bytes") > self.min_size_mb * 1024 * 1024
+                 )
+
+                 if dry_run:
+                     return len(external_df)
+
+                 # Get data versions from Metaxy
+                 feature_cls = graph.features_by_key[FeatureKey(self.feature_key)]
+                 diff = store.resolve_update(
+                     feature_cls,
+                     sample_df=external_df.select(["sample_uid"])
+                 )
+
+                 # Join external metadata with calculated data_versions
+                 to_write = external_df.join(diff.added, on="sample_uid", how="inner")
+
+                 # Write
+                 store.write_metadata(feature_cls, to_write)
+                 return len(to_write)
+
+     Example YAML:
+         - id: "backfill_videos_from_s3"
+           type: "myproject.migrations.S3VideoBackfill"
+           feature_key: ["video", "files"]
+           s3_bucket: "prod-videos"
+           s3_prefix: "processed/"
+           min_size_mb: 10
+           reason: "Initial backfill from production S3 bucket"
+     """
+
+     # No additional required fields - user subclasses add their own
+
+     @abstractmethod
+     def execute(
+         self,
+         store: "MetadataStore",
+         *,
+         from_snapshot_version: str,
+         to_snapshot_version: str,
+         dry_run: bool = False,
+         **kwargs,
+     ) -> int:
+         """User implements full backfill logic.
+
+         User has complete control over:
+         - Loading external metadata (S3, database, API, files, etc.)
+         - Transforming and filtering data
+         - Joining with Metaxy's data_versions
+         - Writing to store
+
+         Args:
+             store: Metadata store to write to
+             from_snapshot_version: Source snapshot version (old state)
+             to_snapshot_version: Target snapshot version (new state)
+             dry_run: If True, validate and return count without writing
+
+         Returns:
+             Number of rows written (or would be written if dry_run)
+
+         Raises:
+             Exception: If backfill fails (will be recorded in migration progress)
+         """
+         pass
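
For orientation, a minimal self-contained sketch of the discriminated-union pattern the BaseOperation docstring requires (a Literal "type" field on every subclass). OpA, OpB, and MigrationDoc are illustrative names, not metaxy APIs, and pydantic v2 syntax is assumed:

    from typing import Annotated, Literal, Union

    import pydantic


    class OpA(pydantic.BaseModel):
        type: Literal["ops.OpA"] = "ops.OpA"
        reason: str


    class OpB(pydantic.BaseModel):
        type: Literal["ops.OpB"] = "ops.OpB"
        reason: str


    # The Literal "type" field is the discriminator: Pydantic selects the subclass
    # by matching the "type" value, mirroring how operations are declared in YAML.
    Operation = Annotated[Union[OpA, OpB], pydantic.Field(discriminator="type")]


    class MigrationDoc(pydantic.BaseModel):
        operations: list[Operation]


    doc = MigrationDoc.model_validate(
        {"operations": [{"type": "ops.OpB", "reason": "backfill"}]}
    )
    assert type(doc.operations[0]) is OpB  # dispatched on the "type" value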
metaxy/models/__init__.py File without changes
metaxy/models/bases.py ADDED
@@ -0,0 +1,6 @@
+ import pydantic
+
+
+ class FrozenBaseModel(pydantic.BaseModel):
+     class Config:
+         frozen = True
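
A short sketch (not from the wheel) of what the frozen config gives subclasses of FrozenBaseModel; the Point model is hypothetical:

    import pydantic


    class Point(pydantic.BaseModel):
        # same configuration as metaxy's FrozenBaseModel
        class Config:
            frozen = True

        x: int
        y: int


    p = Point(x=1, y=2)
    try:
        p.x = 3  # frozen models reject mutation after construction
    except (TypeError, pydantic.ValidationError):  # TypeError on pydantic v1, ValidationError on v2
        pass
    _ = hash(p)  # frozen models are also hashable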
metaxy/models/constants.py ADDED
@@ -0,0 +1,24 @@
+ """Shared constants for system column names."""
+
+ # Essential system columns that must always be preserved for joining/versioning
+ ESSENTIAL_SYSTEM_COLUMNS = frozenset(
+     {
+         "sample_uid",
+         "data_version",
+     }
+ )
+
+ # System columns that should be dropped to avoid conflicts when joining upstream features
+ # These will be recalculated for the target feature, so keeping them from upstream causes conflicts
+ DROPPABLE_SYSTEM_COLUMNS = frozenset(
+     {
+         "feature_version",
+         "snapshot_version",
+         "metaxy_feature_version",
+         "metaxy_snapshot_version",
+         "metaxy_data_version",
+     }
+ )
+
+ # All system columns (essential + droppable)
+ ALL_SYSTEM_COLUMNS = ESSENTIAL_SYSTEM_COLUMNS | DROPPABLE_SYSTEM_COLUMNS
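
A sketch of how these column sets might be used when preparing an upstream frame for a join; drop_conflicting_system_columns is a hypothetical helper written for illustration, not a metaxy function:

    import polars as pl

    from metaxy.models.constants import DROPPABLE_SYSTEM_COLUMNS


    def drop_conflicting_system_columns(df: pl.DataFrame) -> pl.DataFrame:
        # Remove columns the target feature will recalculate anyway,
        # so they do not collide when joining upstream metadata.
        return df.drop([c for c in df.columns if c in DROPPABLE_SYSTEM_COLUMNS])


    upstream = pl.DataFrame(
        {
            "sample_uid": ["a", "b"],
            "data_version": ["v1", "v2"],
            "feature_version": ["f1", "f1"],
        }
    )
    print(drop_conflicting_system_columns(upstream).columns)  # ['sample_uid', 'data_version']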