metaxy 0.0.1.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. metaxy/__init__.py +170 -0
  2. metaxy/_packaging.py +96 -0
  3. metaxy/_testing/__init__.py +55 -0
  4. metaxy/_testing/config.py +43 -0
  5. metaxy/_testing/metaxy_project.py +780 -0
  6. metaxy/_testing/models.py +111 -0
  7. metaxy/_testing/parametric/__init__.py +13 -0
  8. metaxy/_testing/parametric/metadata.py +664 -0
  9. metaxy/_testing/pytest_helpers.py +74 -0
  10. metaxy/_testing/runbook.py +533 -0
  11. metaxy/_utils.py +35 -0
  12. metaxy/_version.py +1 -0
  13. metaxy/cli/app.py +97 -0
  14. metaxy/cli/console.py +13 -0
  15. metaxy/cli/context.py +167 -0
  16. metaxy/cli/graph.py +610 -0
  17. metaxy/cli/graph_diff.py +290 -0
  18. metaxy/cli/list.py +46 -0
  19. metaxy/cli/metadata.py +317 -0
  20. metaxy/cli/migrations.py +999 -0
  21. metaxy/cli/utils.py +268 -0
  22. metaxy/config.py +680 -0
  23. metaxy/entrypoints.py +296 -0
  24. metaxy/ext/__init__.py +1 -0
  25. metaxy/ext/dagster/__init__.py +54 -0
  26. metaxy/ext/dagster/constants.py +10 -0
  27. metaxy/ext/dagster/dagster_type.py +156 -0
  28. metaxy/ext/dagster/io_manager.py +200 -0
  29. metaxy/ext/dagster/metaxify.py +512 -0
  30. metaxy/ext/dagster/observable.py +115 -0
  31. metaxy/ext/dagster/resources.py +27 -0
  32. metaxy/ext/dagster/selection.py +73 -0
  33. metaxy/ext/dagster/table_metadata.py +417 -0
  34. metaxy/ext/dagster/utils.py +462 -0
  35. metaxy/ext/sqlalchemy/__init__.py +23 -0
  36. metaxy/ext/sqlalchemy/config.py +29 -0
  37. metaxy/ext/sqlalchemy/plugin.py +353 -0
  38. metaxy/ext/sqlmodel/__init__.py +13 -0
  39. metaxy/ext/sqlmodel/config.py +29 -0
  40. metaxy/ext/sqlmodel/plugin.py +499 -0
  41. metaxy/graph/__init__.py +29 -0
  42. metaxy/graph/describe.py +325 -0
  43. metaxy/graph/diff/__init__.py +21 -0
  44. metaxy/graph/diff/diff_models.py +446 -0
  45. metaxy/graph/diff/differ.py +769 -0
  46. metaxy/graph/diff/models.py +443 -0
  47. metaxy/graph/diff/rendering/__init__.py +18 -0
  48. metaxy/graph/diff/rendering/base.py +323 -0
  49. metaxy/graph/diff/rendering/cards.py +188 -0
  50. metaxy/graph/diff/rendering/formatter.py +805 -0
  51. metaxy/graph/diff/rendering/graphviz.py +246 -0
  52. metaxy/graph/diff/rendering/mermaid.py +326 -0
  53. metaxy/graph/diff/rendering/rich.py +169 -0
  54. metaxy/graph/diff/rendering/theme.py +48 -0
  55. metaxy/graph/diff/traversal.py +247 -0
  56. metaxy/graph/status.py +329 -0
  57. metaxy/graph/utils.py +58 -0
  58. metaxy/metadata_store/__init__.py +32 -0
  59. metaxy/metadata_store/_ducklake_support.py +419 -0
  60. metaxy/metadata_store/base.py +1792 -0
  61. metaxy/metadata_store/bigquery.py +354 -0
  62. metaxy/metadata_store/clickhouse.py +184 -0
  63. metaxy/metadata_store/delta.py +371 -0
  64. metaxy/metadata_store/duckdb.py +446 -0
  65. metaxy/metadata_store/exceptions.py +61 -0
  66. metaxy/metadata_store/ibis.py +542 -0
  67. metaxy/metadata_store/lancedb.py +391 -0
  68. metaxy/metadata_store/memory.py +292 -0
  69. metaxy/metadata_store/system/__init__.py +57 -0
  70. metaxy/metadata_store/system/events.py +264 -0
  71. metaxy/metadata_store/system/keys.py +9 -0
  72. metaxy/metadata_store/system/models.py +129 -0
  73. metaxy/metadata_store/system/storage.py +957 -0
  74. metaxy/metadata_store/types.py +10 -0
  75. metaxy/metadata_store/utils.py +104 -0
  76. metaxy/metadata_store/warnings.py +36 -0
  77. metaxy/migrations/__init__.py +32 -0
  78. metaxy/migrations/detector.py +291 -0
  79. metaxy/migrations/executor.py +516 -0
  80. metaxy/migrations/generator.py +319 -0
  81. metaxy/migrations/loader.py +231 -0
  82. metaxy/migrations/models.py +528 -0
  83. metaxy/migrations/ops.py +447 -0
  84. metaxy/models/__init__.py +0 -0
  85. metaxy/models/bases.py +12 -0
  86. metaxy/models/constants.py +139 -0
  87. metaxy/models/feature.py +1335 -0
  88. metaxy/models/feature_spec.py +338 -0
  89. metaxy/models/field.py +263 -0
  90. metaxy/models/fields_mapping.py +307 -0
  91. metaxy/models/filter_expression.py +297 -0
  92. metaxy/models/lineage.py +285 -0
  93. metaxy/models/plan.py +232 -0
  94. metaxy/models/types.py +475 -0
  95. metaxy/py.typed +0 -0
  96. metaxy/utils/__init__.py +1 -0
  97. metaxy/utils/constants.py +2 -0
  98. metaxy/utils/exceptions.py +23 -0
  99. metaxy/utils/hashing.py +230 -0
  100. metaxy/versioning/__init__.py +31 -0
  101. metaxy/versioning/engine.py +656 -0
  102. metaxy/versioning/feature_dep_transformer.py +151 -0
  103. metaxy/versioning/ibis.py +249 -0
  104. metaxy/versioning/lineage_handler.py +205 -0
  105. metaxy/versioning/polars.py +189 -0
  106. metaxy/versioning/renamed_df.py +35 -0
  107. metaxy/versioning/types.py +63 -0
  108. metaxy-0.0.1.dev3.dist-info/METADATA +96 -0
  109. metaxy-0.0.1.dev3.dist-info/RECORD +111 -0
  110. metaxy-0.0.1.dev3.dist-info/WHEEL +4 -0
  111. metaxy-0.0.1.dev3.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,516 @@
1
+ """Migration executor using event-based tracking.
2
+
3
+ This is the new executor that replaces the old 3-table system with a single
4
+ event-based system stored in system tables via SystemTableStorage.
5
+ """
6
+
7
+ from typing import TYPE_CHECKING
8
+
9
+ if TYPE_CHECKING:
10
+ from metaxy.metadata_store.base import MetadataStore
11
+ from metaxy.metadata_store.system import SystemTableStorage
12
+ from metaxy.migrations.models import (
13
+ DiffMigration,
14
+ FullGraphMigration,
15
+ Migration,
16
+ MigrationResult,
17
+ )
18
+
19
+ from metaxy.migrations.models import OperationConfig
20
+
21
+
22
class MigrationExecutor:
    """Runs migrations while recording progress as an event stream.

    Traversal order comes from ``FeatureGraph.topological_sort_features()``;
    every state transition is persisted through ``SystemTableStorage``, which
    is what makes interrupted runs resumable.
    """

    def __init__(self, storage: "SystemTableStorage"):
        """Create an executor bound to an event store.

        Args:
            storage: System table storage used to persist migration events.
        """
        self.storage = storage
37
+ def _find_root_causes(
38
+ self,
39
+ failed_deps: list[str],
40
+ errors: dict[str, str],
41
+ ) -> list[str]:
42
+ """Find the root cause features (features with actual errors, not skipped).
43
+
44
+ Args:
45
+ failed_deps: List of direct failed dependencies
46
+ errors: Dict mapping feature keys to error messages
47
+
48
+ Returns:
49
+ List of root cause feature keys (features that had actual errors)
50
+ """
51
+ root_causes = []
52
+ for dep in failed_deps:
53
+ if dep not in errors:
54
+ continue
55
+ error_msg = errors[dep]
56
+ # If this dependency was skipped, recursively find its root causes
57
+ if error_msg.startswith("Skipped due to failed dependencies:"):
58
+ # Extract the dependencies from the error message
59
+ deps_part = error_msg.split(":", 1)[1].strip()
60
+ transitive_deps = [d.strip() for d in deps_part.split(",")]
61
+ # Recursively find root causes
62
+ root_causes.extend(self._find_root_causes(transitive_deps, errors))
63
+ else:
64
+ # This is an actual error, not a skip
65
+ root_causes.append(dep)
66
+ return list(
67
+ dict.fromkeys(root_causes)
68
+ ) # Remove duplicates while preserving order
69
+
70
+ def execute(
71
+ self,
72
+ migration: "Migration",
73
+ store: "MetadataStore",
74
+ project: str,
75
+ *,
76
+ dry_run: bool = False,
77
+ ) -> "MigrationResult":
78
+ """Execute migration with event logging and resumability.
79
+
80
+ Process:
81
+ 1. Log migration_started event
82
+ 2. Get features to process from migration
83
+ 3. Sort features topologically using FeatureGraph.topological_sort_features()
84
+ 4. For each feature:
85
+ - Check if already completed (resume support)
86
+ - Log feature_started
87
+ - Execute migration logic
88
+ - Log feature_completed/failed
89
+ 5. Log migration_completed/failed
90
+
91
+ Args:
92
+ migration: Migration to execute
93
+ store: Metadata store to operate on
94
+ project: Project name for event tracking
95
+ dry_run: If True, only validate without executing
96
+
97
+ Returns:
98
+ MigrationResult with execution details
99
+
100
+ Raises:
101
+ Exception: If migration fails and cannot continue
102
+ """
103
+ # Import here to avoid circular dependency
104
+ from metaxy.migrations.models import DiffMigration, FullGraphMigration
105
+
106
+ # Delegate to migration's execute method (which uses this executor internally)
107
+ if isinstance(migration, DiffMigration):
108
+ return self._execute_diff_migration(
109
+ migration, store, project, dry_run=dry_run
110
+ )
111
+ elif isinstance(migration, FullGraphMigration):
112
+ return self._execute_full_graph_migration(
113
+ migration, store, project, dry_run=dry_run
114
+ )
115
+ else:
116
+ # Custom migration subclass - call its execute method directly
117
+ return migration.execute(store, project, dry_run=dry_run)
118
+
119
    def _execute_diff_migration(
        self,
        migration: "DiffMigration",
        store: "MetadataStore",
        project: str,
        dry_run: bool,
    ) -> "MigrationResult":
        """Execute a DiffMigration feature-by-feature with event logging.

        Features are processed in topological order so upstream features are
        reconciled before their dependents. Per-feature completion events make
        the run resumable: features already marked completed in storage are
        skipped on retry. A feature whose upstream failed (or was itself
        skipped) is recorded as failed with a "Skipped due to failed
        dependencies" message instead of being executed.

        Args:
            migration: DiffMigration to execute
            store: Metadata store
            project: Project name for event tracking
            dry_run: If True, only validate; no events are written and the
                operation runs in dry-run mode

        Returns:
            MigrationResult summarizing completed/failed/skipped features
        """
        import logging
        from datetime import datetime, timezone

        from metaxy.metadata_store.system import Event
        from metaxy.migrations.ops import DataVersionReconciliation
        from metaxy.models.feature import FeatureGraph
        from metaxy.models.types import FeatureKey

        logger = logging.getLogger(__name__)

        start_time = datetime.now(timezone.utc)

        # Log migration started
        if not dry_run:
            self.storage.write_event(
                Event.migration_started(
                    project=project, migration_id=migration.migration_id
                )
            )

        affected_features_list: list[str] = []
        errors: dict[str, str] = {}
        skipped: dict[str, str] = {}  # Track skipped features separately
        rows_affected_total: int = 0

        # Get graph for topological sorting
        graph = FeatureGraph.get_active()

        # Execute operations (currently only DataVersionReconciliation is supported)
        if len(migration.operations) == 1 and isinstance(
            migration.operations[0], DataVersionReconciliation
        ):
            # Get features from operation config
            op = migration.operations[0]
            # migration.ops[0] is the raw config dict backing operations[0]
            op_config = OperationConfig.model_validate(migration.ops[0])

            # Determine which features to process:
            # - If features explicitly listed in operation config, use those
            # - Otherwise, use all affected features from graph diff
            if op_config.features:
                # Sort features topologically
                feature_keys = [FeatureKey(fk.split("/")) for fk in op_config.features]
                sorted_features = graph.topological_sort_features(feature_keys)
                affected_features_to_process = [
                    fk.to_string() for fk in sorted_features
                ]
            else:
                # Fall back to graph diff (all affected features)
                # NOTE(review): this path's ordering comes from
                # get_affected_features — presumably already topological;
                # confirm against that method.
                affected_features_to_process = migration.get_affected_features(
                    store, project
                )

            for feature_key_str in affected_features_to_process:
                # Check if already completed (resume support)
                if not dry_run and self.storage.is_feature_completed(
                    migration.migration_id, feature_key_str, project
                ):
                    affected_features_list.append(feature_key_str)
                    continue

                # Check if any upstream dependencies failed in this migration run
                feature_key_obj = FeatureKey(feature_key_str.split("/"))
                plan = graph.get_feature_plan(feature_key_obj)

                if plan.deps:
                    failed_deps = [
                        dep.key.to_string()
                        for dep in plan.deps
                        if dep.key.to_string() in errors
                        or dep.key.to_string() in skipped
                    ]

                    if failed_deps:
                        # Find root causes (features with actual errors, not just skipped)
                        root_causes = self._find_root_causes(
                            failed_deps, {**errors, **skipped}
                        )
                        error_msg = f"Skipped due to failed dependencies: {', '.join(root_causes)}"
                        skipped[feature_key_str] = error_msg

                        # Log as failed (skips are recorded as feature_failed events)
                        if not dry_run:
                            self.storage.write_event(
                                Event.feature_failed(
                                    project=project,
                                    migration_id=migration.migration_id,
                                    feature_key=feature_key_str,
                                    error_message=error_msg,
                                )
                            )
                        continue

                # Log feature started
                if not dry_run:
                    self.storage.write_event(
                        Event.feature_started(
                            project=project,
                            migration_id=migration.migration_id,
                            feature_key=feature_key_str,
                        )
                    )

                try:
                    # Execute operation for this feature
                    rows_affected = op.execute_for_feature(
                        store,
                        feature_key_str,
                        snapshot_version=migration.to_snapshot_version,
                        from_snapshot_version=migration.from_snapshot_version,
                        dry_run=dry_run,
                    )

                    # Log feature completed
                    if not dry_run:
                        self.storage.write_event(
                            Event.feature_completed(
                                project=project,
                                migration_id=migration.migration_id,
                                feature_key=feature_key_str,
                                rows_affected=rows_affected,
                            )
                        )

                    affected_features_list.append(feature_key_str)
                    rows_affected_total += rows_affected

                except Exception as e:
                    # Get full error message (repr as fallback for empty str(e))
                    error_msg = str(e) if str(e) else repr(e)
                    errors[feature_key_str] = error_msg

                    # Log exception with full traceback
                    logger.exception(f"Error in feature {feature_key_str}")

                    # Log feature failed
                    if not dry_run:
                        self.storage.write_event(
                            Event.feature_failed(
                                project=project,
                                migration_id=migration.migration_id,
                                feature_key=feature_key_str,
                                error_message=error_msg,
                            )
                        )

                    # Continue with remaining features; dependents will be
                    # skipped via the failed-dependency check above.
                    continue
        else:
            # Future: Support other operation types here
            raise NotImplementedError(
                "Only DataVersionReconciliation is currently supported"
            )

        # Determine status
        if dry_run:
            status = "skipped"
        elif len(errors) == 0:
            status = "completed"
            # (dry_run is already False on this branch; guard is defensive.)
            if not dry_run:
                self.storage.write_event(
                    Event.migration_completed(
                        project=project, migration_id=migration.migration_id
                    )
                )
        else:
            status = "failed"
            if not dry_run:
                self.storage.write_event(
                    Event.migration_failed(
                        project=project,
                        migration_id=migration.migration_id,
                        # Empty here: per-feature errors were already logged
                        # as feature_failed events.
                        error_message="",
                    )
                )

        duration = (datetime.now(timezone.utc) - start_time).total_seconds()

        from metaxy.migrations.models import MigrationResult

        return MigrationResult(
            migration_id=migration.migration_id,
            status=status,
            features_completed=len(affected_features_list),
            features_failed=len(errors),
            features_skipped=len(skipped),
            affected_features=affected_features_list,
            errors={**errors, **skipped},  # Combine for display
            rows_affected=rows_affected_total,
            duration_seconds=duration,
            timestamp=start_time,
        )
327
+
328
    def _execute_full_graph_migration(
        self,
        migration: "FullGraphMigration",
        store: "MetadataStore",
        project: str,
        dry_run: bool,
    ) -> "MigrationResult":
        """Execute a FullGraphMigration with event logging.

        Runs every configured operation over its explicitly listed features,
        each in topological order, so upstream features are processed before
        their dependents. Per-feature completion events make the run
        resumable: features already marked completed in storage are skipped
        on retry. A feature whose upstream failed (or was itself skipped) is
        recorded as failed with a "Skipped due to failed dependencies"
        message instead of being executed.

        Args:
            migration: FullGraphMigration to execute
            store: Metadata store
            project: Project name for event tracking
            dry_run: If True, only validate; no events are written and
                operations run in dry-run mode

        Returns:
            MigrationResult summarizing completed/failed/skipped features
        """
        import logging
        from datetime import datetime, timezone

        from metaxy.metadata_store.system import Event
        from metaxy.models.feature import FeatureGraph
        from metaxy.models.types import FeatureKey

        logger = logging.getLogger(__name__)

        start_time = datetime.now(timezone.utc)

        # Log migration started
        if not dry_run:
            self.storage.write_event(
                Event.migration_started(
                    project=project, migration_id=migration.migration_id
                )
            )

        affected_features_list: list[str] = []
        errors: dict[str, str] = {}
        skipped: dict[str, str] = {}  # Track skipped features separately
        rows_affected_total: int = 0

        # Get graph for topological sorting
        graph = FeatureGraph.get_active()

        # Execute each operation (already instantiated by migration.operations property)
        # Zip with ops to maintain correspondence between operation instance and its config
        for operation, op_dict in zip(migration.operations, migration.ops):
            op_config = OperationConfig.model_validate(op_dict)

            # Sort features topologically
            feature_keys = [FeatureKey(fk.split("/")) for fk in op_config.features]
            sorted_features = graph.topological_sort_features(feature_keys)

            # Execute for each feature in topological order
            for feature_key_obj in sorted_features:
                feature_key_str = feature_key_obj.to_string()

                # Check if already completed (resume support)
                if not dry_run and self.storage.is_feature_completed(
                    migration.migration_id, feature_key_str, project
                ):
                    affected_features_list.append(feature_key_str)
                    continue

                # Check if any upstream dependencies failed in this migration run
                plan = graph.get_feature_plan(feature_key_obj)

                if plan.deps:
                    failed_deps = [
                        dep.key.to_string()
                        for dep in plan.deps
                        if dep.key.to_string() in errors
                        or dep.key.to_string() in skipped
                    ]

                    if failed_deps:
                        # Find root causes (features with actual errors, not just skipped)
                        root_causes = self._find_root_causes(
                            failed_deps, {**errors, **skipped}
                        )
                        error_msg = f"Skipped due to failed dependencies: {', '.join(root_causes)}"
                        skipped[feature_key_str] = error_msg

                        # Log as failed (skips are recorded as feature_failed events)
                        if not dry_run:
                            self.storage.write_event(
                                Event.feature_failed(
                                    project=project,
                                    migration_id=migration.migration_id,
                                    feature_key=feature_key_str,
                                    error_message=error_msg,
                                )
                            )
                        continue

                # Log feature started
                if not dry_run:
                    self.storage.write_event(
                        Event.feature_started(
                            project=project,
                            migration_id=migration.migration_id,
                            feature_key=feature_key_str,
                        )
                    )

                try:
                    # Execute operation for this feature
                    rows_affected = operation.execute_for_feature(
                        store,
                        feature_key_str,
                        snapshot_version=migration.snapshot_version,
                        from_snapshot_version=migration.from_snapshot_version,
                        dry_run=dry_run,
                    )

                    # Log feature completed
                    if not dry_run:
                        self.storage.write_event(
                            Event.feature_completed(
                                project=project,
                                migration_id=migration.migration_id,
                                feature_key=feature_key_str,
                                rows_affected=rows_affected,
                            )
                        )

                    affected_features_list.append(feature_key_str)
                    rows_affected_total += rows_affected

                except Exception as e:
                    # Get full error message (repr as fallback for empty str(e))
                    error_msg = str(e) if str(e) else repr(e)
                    errors[feature_key_str] = error_msg

                    # Log exception with full traceback
                    logger.exception(f"Error in feature {feature_key_str}")

                    # Log feature failed
                    if not dry_run:
                        self.storage.write_event(
                            Event.feature_failed(
                                project=project,
                                migration_id=migration.migration_id,
                                feature_key=feature_key_str,
                                error_message=error_msg,
                            )
                        )

                    # Continue with remaining features; dependents will be
                    # skipped via the failed-dependency check above.
                    continue

        # Determine status
        if dry_run:
            status = "skipped"
        elif len(errors) == 0:
            status = "completed"
            # (dry_run is already False on this branch; guard is defensive.)
            if not dry_run:
                self.storage.write_event(
                    Event.migration_completed(
                        project=project, migration_id=migration.migration_id
                    )
                )
        else:
            status = "failed"
            if not dry_run:
                self.storage.write_event(
                    Event.migration_failed(
                        project=project,
                        migration_id=migration.migration_id,
                        # Empty here: per-feature errors were already logged
                        # as feature_failed events.
                        error_message="",
                    )
                )

        duration = (datetime.now(timezone.utc) - start_time).total_seconds()

        from metaxy.migrations.models import MigrationResult

        return MigrationResult(
            migration_id=migration.migration_id,
            status=status,
            features_completed=len(affected_features_list),
            features_failed=len(errors),
            features_skipped=len(skipped),
            affected_features=affected_features_list,
            errors={**errors, **skipped},  # Combine for display
            rows_affected=rows_affected_total,
            duration_seconds=duration,
            timestamp=start_time,
        )