odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/pipeline.py ADDED
@@ -0,0 +1,2382 @@
"""Pipeline executor and orchestration."""

import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

if TYPE_CHECKING:
    import pandas as pd

from odibi.config import AlertConfig, ErrorStrategy, PipelineConfig, ProjectConfig, RetryConfig
from odibi.context import create_context
from odibi.engine.registry import get_engine_class
from odibi.exceptions import DependencyError
from odibi.graph import DependencyGraph
from odibi.lineage import OpenLineageAdapter
from odibi.node import Node, NodeResult
from odibi.plugins import get_connection_factory, load_plugins
from odibi.registry import FunctionRegistry
from odibi.state import StateManager, create_state_backend
from odibi.story import StoryGenerator
from odibi.story.lineage_utils import generate_lineage
from odibi.transformers import register_standard_library
from odibi.utils import load_yaml_with_env
from odibi.utils.alerting import send_alert
from odibi.utils.logging import configure_logging, logger
from odibi.utils.logging_context import (
    create_logging_context,
    set_logging_context,
)
from odibi.utils.progress import NodeStatus, PipelineProgress


@dataclass
class PipelineResults:
    """Results from pipeline execution."""

    pipeline_name: str
    completed: List[str] = field(default_factory=list)
    failed: List[str] = field(default_factory=list)
    skipped: List[str] = field(default_factory=list)
    node_results: Dict[str, NodeResult] = field(default_factory=dict)
    duration: float = 0.0
    start_time: Optional[str] = None
    end_time: Optional[str] = None
    story_path: Optional[str] = None

    def get_node_result(self, name: str) -> Optional[NodeResult]:
        """Get result for specific node.

        Args:
            name: Node name

        Returns:
            NodeResult if available, None otherwise
        """
        return self.node_results.get(name)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary.

        Returns:
            Dictionary representation
        """
        return {
            "pipeline_name": self.pipeline_name,
            "completed": self.completed,
            "failed": self.failed,
            "skipped": self.skipped,
            "duration": self.duration,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "node_count": len(self.node_results),
        }

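# Illustrative sketch (not part of the packaged source): how a caller might consume
# a PipelineResults object returned by Pipeline.run(); the variable names are
# hypothetical.
#
#   results = pipeline.run()
#   if results.failed:
#       for name in results.failed:
#           print(name, results.get_node_result(name).error)
#   summary = results.to_dict()  # compact dict for logging or alert payloads
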
class Pipeline:
    """Pipeline executor and orchestrator."""

    def __init__(
        self,
        pipeline_config: PipelineConfig,
        engine: str = "pandas",
        connections: Optional[Dict[str, Any]] = None,
        generate_story: bool = True,
        story_config: Optional[Dict[str, Any]] = None,
        retry_config: Optional[RetryConfig] = None,
        alerts: Optional[List[AlertConfig]] = None,
        performance_config: Optional[Any] = None,
        catalog_manager: Optional[Any] = None,
        lineage_adapter: Optional[Any] = None,
    ):
        """Initialize pipeline.

        Args:
            pipeline_config: Pipeline configuration
            engine: Engine type ('pandas' or 'spark')
            connections: Available connections
            generate_story: Whether to generate execution stories
            story_config: Story generator configuration
            retry_config: Retry configuration
            alerts: Alert configurations
            performance_config: Performance tuning configuration
            catalog_manager: System Catalog Manager (Phase 1)
            lineage_adapter: OpenLineage Adapter
        """
        self.config = pipeline_config
        self.project_config = None  # Set by PipelineManager if available
        self.engine_type = engine
        self.connections = connections or {}
        self.generate_story = generate_story
        self.retry_config = retry_config
        self.alerts = alerts or []
        self.performance_config = performance_config
        self.catalog_manager = catalog_manager
        self.lineage = lineage_adapter

        # Batch write buffers to collect catalog writes during execution
        # These are flushed at pipeline end to eliminate concurrency conflicts
        self._pending_lineage_records: List[Dict[str, Any]] = []
        self._pending_asset_records: List[Dict[str, Any]] = []
        self._pending_hwm_updates: List[Dict[str, Any]] = []
        self._batch_mode_enabled: bool = True  # Enable batch mode by default

        # Track async story futures for flush_stories()
        self._story_future = None
        self._story_executor = None

        # Create logging context for this pipeline
        self._ctx = create_logging_context(
            pipeline_id=pipeline_config.pipeline,
            engine=engine,
        )

        self._ctx.info(
            f"Initializing pipeline: {pipeline_config.pipeline}",
            engine=engine,
            node_count=len(pipeline_config.nodes),
            connections=list(self.connections.keys()) if self.connections else [],
        )

        # Initialize story generator
        story_config = story_config or {}
        self.story_config = story_config  # Store for async_generation check

        self.story_generator = StoryGenerator(
            pipeline_name=pipeline_config.pipeline,
            max_sample_rows=story_config.get("max_sample_rows", 10),
            output_path=story_config.get("output_path", "stories/"),
            storage_options=story_config.get("storage_options", {}),
            catalog_manager=catalog_manager,
        )

        # Initialize engine
        engine_config = {}
        if performance_config:
            if hasattr(performance_config, "model_dump"):
                engine_config["performance"] = performance_config.model_dump()
            elif hasattr(performance_config, "dict"):
                engine_config["performance"] = performance_config.model_dump()
            else:
                engine_config["performance"] = performance_config

        try:
            EngineClass = get_engine_class(engine)
        except ValueError as e:
            # Handle Spark special case message
            if engine == "spark":
                raise ImportError(
                    "Spark engine not available. "
                    "Install with 'pip install odibi[spark]' or ensure pyspark is installed."
                )
            raise e

        if engine == "spark":
            # SparkEngine can take existing session if needed, but here we let it create/get one
            # We might need to pass connections to it for ADLS auth config
            self.engine = EngineClass(connections=connections, config=engine_config)
        else:
            self.engine = EngineClass(config=engine_config)

        self._ctx.debug(f"Engine initialized: {engine}")

        # Initialize context
        spark_session = getattr(self.engine, "spark", None)
        self.context = create_context(engine, spark_session=spark_session)

        # Build dependency graph
        self.graph = DependencyGraph(pipeline_config.nodes)

        # Log graph structure
        layers = self.graph.get_execution_layers()
        edge_count = sum(len(n.depends_on) for n in pipeline_config.nodes)
        self._ctx.log_graph_operation(
            operation="build",
            node_count=len(pipeline_config.nodes),
            edge_count=edge_count,
            layer_count=len(layers),
        )

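    # Illustrative sketch (not part of the packaged source): constructing a Pipeline
    # directly, assuming you already hold a validated PipelineConfig and instantiated
    # connection objects; the YAML-driven route via Pipeline.from_yaml() /
    # PipelineManager is the one shown in the docstrings below.
    #
    #   pipeline = Pipeline(
    #       pipeline_config=cfg,              # a PipelineConfig instance
    #       engine="pandas",                  # or "spark"
    #       connections={"lake": lake_conn},  # hypothetical connection mapping
    #       generate_story=False,
    #   )
    #   results = pipeline.run()
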
    def __enter__(self) -> "Pipeline":
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit - cleanup connections."""
        self._cleanup_connections()

    def _cleanup_connections(self) -> None:
        """Clean up all connection resources."""
        if not self.connections:
            return

        for name, conn in self.connections.items():
            if hasattr(conn, "close"):
                try:
                    conn.close()
                    self._ctx.debug(f"Closed connection: {name}")
                except Exception as e:
                    self._ctx.warning(f"Failed to close connection {name}: {e}", exc_info=True)

    @classmethod
    def from_yaml(cls, yaml_path: str) -> "PipelineManager":
        """Create PipelineManager from YAML file (recommended).

        This method now returns a PipelineManager that can run all or specific pipelines.

        Args:
            yaml_path: Path to YAML configuration file

        Returns:
            PipelineManager instance (use .run() to execute)

        Example:
            >>> from odibi.pipeline import Pipeline
            >>> manager = Pipeline.from_yaml("config.yaml")
            >>> results = manager.run()  # Run all pipelines
            >>> results = manager.run('bronze_to_silver')  # Run specific pipeline

        Note:
            For direct access to PipelineManager class:
            >>> from odibi.pipeline import PipelineManager
            >>> manager = PipelineManager.from_yaml("config.yaml")
        """
        # Delegate to PipelineManager
        return PipelineManager.from_yaml(yaml_path)

    def register_outputs(self) -> int:
        """
        Pre-register node outputs from pipeline config without running the pipeline.

        Scans pipeline nodes for output locations (write blocks, merge/scd2 params)
        and registers them to meta_outputs. This enables cross-pipeline references
        without requiring the source pipeline to have run first.

        Returns:
            Number of outputs registered

        Example:
            >>> pipeline = Pipeline(config, engine="spark", catalog_manager=catalog)
            >>> count = pipeline.register_outputs()
            >>> print(f"Registered {count} outputs")
        """
        if not self.catalog_manager:
            self._ctx.warning("No catalog_manager configured, cannot register outputs")
            return 0

        count = self.catalog_manager.register_outputs_from_config(self.config)
        self._ctx.info(f"Pre-registered {count} outputs from pipeline config")
        return count

    def run(
        self,
        parallel: bool = False,
        dry_run: bool = False,
        resume_from_failure: bool = False,
        max_workers: int = 4,
        on_error: Optional[str] = None,
        tag: Optional[str] = None,
        node: Optional[Union[str, List[str]]] = None,
        console: bool = False,
    ) -> PipelineResults:
        """Execute the pipeline.

        Args:
            parallel: Whether to use parallel execution
            dry_run: Whether to simulate execution without running operations
            resume_from_failure: Whether to skip successfully completed nodes from last run
            max_workers: Maximum number of parallel threads (default: 4)
            on_error: Override error handling strategy
            tag: Filter nodes by tag (only nodes with this tag will run)
            node: Run only specific node(s) by name - can be a string or list of strings
            console: Whether to show rich console output with progress

        Returns:
            PipelineResults with execution details
        """
        start_time = time.time()
        start_timestamp = datetime.now().isoformat()

        results = PipelineResults(pipeline_name=self.config.pipeline, start_time=start_timestamp)

        # Set global logging context for this pipeline run
        set_logging_context(self._ctx)

        # Pre-register outputs so cross-pipeline references can resolve on first run
        if self.catalog_manager:
            try:
                count = self.register_outputs()
                if count > 0:
                    self._ctx.debug(f"Pre-registered {count} outputs for reference resolution")
            except Exception as e:
                self._ctx.debug(f"Output pre-registration skipped: {e}")

        # Get execution plan info for logging
        layers = self.graph.get_execution_layers()
        execution_order = self.graph.topological_sort()

        # Apply node filters (--tag, --node)
        filtered_nodes = set(execution_order)
        if tag:
            filtered_nodes = {name for name in filtered_nodes if tag in self.graph.nodes[name].tags}
            self._ctx.info(f"Filtering by tag '{tag}': {len(filtered_nodes)} nodes match")
        if node:
            # Normalize to list
            node_list = [node] if isinstance(node, str) else node
            # Validate all nodes exist
            missing = [n for n in node_list if n not in self.graph.nodes]
            if missing:
                available = ", ".join(self.graph.nodes.keys())
                raise ValueError(f"Node(s) not found: {missing}. Available: {available}")
            # Auto-include all upstream dependencies
            filtered_nodes = set(node_list)
            for n in node_list:
                deps = self.graph.get_dependencies(n)
                filtered_nodes.update(deps)
            if len(filtered_nodes) > len(node_list):
                dep_count = len(filtered_nodes) - len(node_list)
                self._ctx.info(f"Running node(s): {node_list} (+ {dep_count} dependencies)")
            else:
                self._ctx.info(f"Running specific node(s): {node_list}")

        # Update execution order to only include filtered nodes
        execution_order = [n for n in execution_order if n in filtered_nodes]
        layers = [[n for n in layer if n in filtered_nodes] for layer in layers]
        layers = [layer for layer in layers if layer]  # Remove empty layers

        self._ctx.info(
            f"Starting pipeline: {self.config.pipeline}",
            mode="parallel" if parallel else "serial",
            dry_run=dry_run,
            resume_from_failure=resume_from_failure,
            node_count=len(self.graph.nodes),
            layer_count=len(layers),
            max_workers=max_workers if parallel else 1,
        )

        if parallel:
            self._ctx.debug(
                f"Parallel execution plan: {len(layers)} layers",
                layers=[list(layer) for layer in layers],
            )
        else:
            self._ctx.debug(
                f"Serial execution order: {len(execution_order)} nodes",
                order=execution_order,
            )

        # Initialize progress tracker for console output
        progress: Optional[PipelineProgress] = None
        if console:
            progress = PipelineProgress(
                pipeline_name=self.config.pipeline,
                node_names=execution_order,
                engine=self.engine_type,
            )
            progress.start()

        # Alert: on_start
        self._send_alerts("on_start", results)

        # Lineage: Start
        parent_run_id = None
        if self.lineage:
            parent_run_id = self.lineage.emit_pipeline_start(self.config)

        # Drift Detection (Governance)
        if self.catalog_manager:
            try:
                import hashlib
                import json

                # Calculate Local Hash
                if hasattr(self.config, "model_dump"):
                    dump = self.config.model_dump(mode="json")
                else:
                    dump = self.config.model_dump()
                dump_str = json.dumps(dump, sort_keys=True)
                local_hash = hashlib.md5(dump_str.encode("utf-8")).hexdigest()

                # Get Remote Hash
                remote_hash = self.catalog_manager.get_pipeline_hash(self.config.pipeline)

                if remote_hash and remote_hash != local_hash:
                    self._ctx.warning(
                        "DRIFT DETECTED: Local pipeline differs from Catalog",
                        local_hash=local_hash[:8],
                        catalog_hash=remote_hash[:8],
                        suggestion="Deploy changes using 'odibi deploy' before production",
                    )
                elif not remote_hash:
                    self._ctx.info(
                        "Pipeline not found in Catalog (Running un-deployed code)",
                        catalog_status="not_deployed",
                    )
                else:
                    self._ctx.debug(
                        "Drift check passed",
                        hash=local_hash[:8],
                    )
            except Exception as e:
                self._ctx.debug(f"Drift detection check failed: {e}")

        state_manager = None
        if resume_from_failure:
            self._ctx.info("Resume from failure enabled - checking previous run state")
            if self.project_config:
                try:
                    backend = create_state_backend(
                        config=self.project_config,
                        project_root=".",
                        spark_session=getattr(self.engine, "spark", None),
                    )
                    state_manager = StateManager(backend=backend)
                    self._ctx.debug("StateManager initialized for resume capability")
                except Exception as e:
                    self._ctx.warning(
                        f"Could not initialize StateManager: {e}",
                        suggestion="Check state backend configuration",
                    )
            else:
                self._ctx.warning(
                    "Resume capability unavailable: Project configuration missing",
                    suggestion="Ensure project config is set for resume support",
                )

        # Define node processing function (inner function to capture self/context)
        def process_node(node_name: str) -> NodeResult:
            node_ctx = self._ctx.with_context(node_id=node_name)

            node_config = self.graph.nodes[node_name]
            deps_failed_list = [dep for dep in node_config.depends_on if dep in results.failed]
            deps_failed = len(deps_failed_list) > 0

            if deps_failed:
                node_ctx.warning(
                    "Skipping node due to dependency failure",
                    skipped=True,
                    failed_dependencies=deps_failed_list,
                    suggestion="Fix upstream node failures first",
                )
                return NodeResult(
                    node_name=node_name,
                    success=False,
                    duration=0.0,
                    metadata={"skipped": True, "reason": "dependency_failed"},
                )

            # Check for resume capability
            if resume_from_failure and state_manager:
                last_info = state_manager.get_last_run_info(self.config.pipeline, node_name)

                can_resume = False
                resume_reason = ""

                if last_info and last_info.get("success"):
                    last_hash = last_info.get("metadata", {}).get("version_hash")

                    from odibi.utils.hashing import calculate_node_hash

                    node_cfg = self.graph.nodes[node_name]
                    current_hash = calculate_node_hash(node_cfg)

                    if last_hash == current_hash:
                        deps_ran = False
                        for dep in node_config.depends_on:
                            if dep in results.completed and dep not in results.skipped:
                                deps_ran = True
                                break

                        if not deps_ran:
                            can_resume = True
                            resume_reason = "Previously succeeded and restored from storage"
                        else:
                            resume_reason = "Upstream dependency executed"
                    else:
                        resume_reason = (
                            f"Configuration changed (Hash: {str(last_hash)[:7]}... "
                            f"!= {str(current_hash)[:7]}...)"
                        )
                else:
                    resume_reason = "No successful previous run found"

                if can_resume:
                    if node_config.write:
                        try:
                            temp_node = Node(
                                config=node_config,
                                context=self.context,
                                engine=self.engine,
                                connections=self.connections,
                                performance_config=self.performance_config,
                                pipeline_name=self.config.pipeline,
                            )
                            if temp_node.restore():
                                node_ctx.info(
                                    "Skipping node (restored from previous run)",
                                    skipped=True,
                                    reason="resume_from_failure",
                                    version_hash=current_hash[:8],
                                )
                                result = NodeResult(
                                    node_name=node_name,
                                    success=True,
                                    duration=0.0,
                                    metadata={
                                        "skipped": True,
                                        "reason": "resume_from_failure",
                                        "version_hash": current_hash,
                                    },
                                )
                                return result
                            else:
                                node_ctx.debug(
                                    "Re-running node: Restore failed",
                                    reason="restore_failed",
                                )
                        except Exception as e:
                            node_ctx.warning(
                                f"Could not restore node: {e}",
                                reason="restore_error",
                            )
                    else:
                        node_ctx.debug(
                            "Re-running node: In-memory transform (cannot be restored)",
                            reason="no_write_config",
                        )
                else:
                    node_ctx.debug(f"Re-running node: {resume_reason}")

            # Lineage: Node Start
            node_run_id = None
            if self.lineage and parent_run_id:
                node_run_id = self.lineage.emit_node_start(node_config, parent_run_id)

            # Execute node with operation context
            result = None
            node_start = time.time()
            node_ctx.debug(
                "Executing node",
                transformer=node_config.transformer,
                has_read=bool(node_config.read),
                has_write=bool(node_config.write),
            )

            try:
                # Prepare batch write buffers for eliminating concurrency conflicts
                batch_buffers = None
                if self._batch_mode_enabled:
                    batch_buffers = {
                        "lineage": self._pending_lineage_records,
                        "assets": self._pending_asset_records,
                        "hwm": self._pending_hwm_updates,
                    }

                node = Node(
                    config=node_config,
                    context=self.context,
                    engine=self.engine,
                    connections=self.connections,
                    dry_run=dry_run,
                    retry_config=self.retry_config,
                    catalog_manager=self.catalog_manager,
                    performance_config=self.performance_config,
                    pipeline_name=self.config.pipeline,
                    batch_write_buffers=batch_buffers,
                    config_file=node_config.source_yaml,
                )
                result = node.execute()

                node_duration = time.time() - node_start
                if result.success:
                    node_ctx.info(
                        "Node completed successfully",
                        duration_ms=round(node_duration * 1000, 2),
                        rows_processed=result.rows_processed,
                    )
                else:
                    node_ctx.error(
                        "Node execution failed",
                        duration_ms=round(node_duration * 1000, 2),
                        error=result.error,
                    )

            except Exception as e:
                node_duration = time.time() - node_start
                node_ctx.error(
                    f"Node raised exception: {e}",
                    duration_ms=round(node_duration * 1000, 2),
                    error_type=type(e).__name__,
                    suggestion="Check node configuration and input data",
                )
                result = NodeResult(node_name=node_name, success=False, duration=0.0, error=str(e))

            # Lineage: Node Complete
            if self.lineage and node_run_id:
                self.lineage.emit_node_complete(node_config, result, node_run_id)

            return result

        if parallel:
            from concurrent.futures import ThreadPoolExecutor, as_completed

            # NOTE: 'layers' already filtered by node/tag above - don't re-fetch from graph
            self._ctx.info(
                f"Starting parallel execution with {max_workers} workers",
                total_layers=len(layers),
                max_workers=max_workers,
            )

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                for layer_idx, layer in enumerate(layers):
                    layer_start = time.time()
                    self._ctx.debug(
                        f"Executing layer {layer_idx + 1}/{len(layers)}",
                        layer_index=layer_idx,
                        nodes_in_layer=list(layer),
                        node_count=len(layer),
                    )

                    future_to_node = {
                        executor.submit(process_node, node_name): node_name for node_name in layer
                    }

                    layer_failed = False
                    for future in as_completed(future_to_node):
                        node_name = future_to_node[future]
                        try:
                            result = future.result()
                            results.node_results[node_name] = result

                            if result.success:
                                if result.metadata.get("skipped"):
                                    if result.metadata.get("reason") == "dependency_failed":
                                        results.skipped.append(node_name)
                                        if progress:
                                            progress.update_node(
                                                node_name,
                                                NodeStatus.SKIPPED,
                                                result.duration,
                                                result.rows_processed,
                                            )
                                    else:
                                        results.completed.append(node_name)
                                        if progress:
                                            progress.update_node(
                                                node_name,
                                                NodeStatus.SKIPPED,
                                                result.duration,
                                                result.rows_processed,
                                            )
                                else:
                                    results.completed.append(node_name)
                                    if progress:
                                        progress.update_node(
                                            node_name,
                                            NodeStatus.SUCCESS,
                                            result.duration,
                                            result.rows_processed,
                                            result.metadata.get("phase_timings_ms"),
                                        )
                            else:
                                if result.metadata.get("skipped"):
                                    results.skipped.append(node_name)
                                    if progress:
                                        progress.update_node(
                                            node_name,
                                            NodeStatus.SKIPPED,
                                            result.duration,
                                            result.rows_processed,
                                        )
                                else:
                                    results.failed.append(node_name)
                                    layer_failed = True
                                    if progress:
                                        progress.update_node(
                                            node_name,
                                            NodeStatus.FAILED,
                                            result.duration,
                                            result.rows_processed,
                                        )

                                    node_config = self.graph.nodes[node_name]
                                    strategy = (
                                        ErrorStrategy(on_error)
                                        if on_error
                                        else node_config.on_error
                                    )

                                    if strategy == ErrorStrategy.FAIL_FAST:
                                        self._ctx.error(
                                            "FAIL_FAST triggered: Stopping pipeline",
                                            failed_node=node_name,
                                            error=result.error,
                                            remaining_nodes=len(future_to_node) - 1,
                                        )
                                        executor.shutdown(cancel_futures=True, wait=False)
                                        break

                        except Exception as exc:
                            self._ctx.error(
                                "Node generated exception",
                                node=node_name,
                                error=str(exc),
                                error_type=type(exc).__name__,
                            )
                            results.failed.append(node_name)
                            layer_failed = True
                            if progress:
                                progress.update_node(node_name, NodeStatus.FAILED)

                            node_config = self.graph.nodes[node_name]
                            strategy = ErrorStrategy(on_error) if on_error else node_config.on_error
                            if strategy == ErrorStrategy.FAIL_FAST:
                                self._ctx.error(
                                    "FAIL_FAST triggered: Stopping pipeline",
                                    failed_node=node_name,
                                )
                                executor.shutdown(cancel_futures=True, wait=False)
                                break

                    layer_duration = time.time() - layer_start
                    self._ctx.debug(
                        f"Layer {layer_idx + 1} completed",
                        layer_index=layer_idx,
                        duration_ms=round(layer_duration * 1000, 2),
                        layer_failed=layer_failed,
                    )

                    if layer_failed:
                        for failed_node in results.failed:
                            if self.graph.nodes[failed_node].on_error == ErrorStrategy.FAIL_FAST:
                                return results

        else:
            self._ctx.info("Starting serial execution")
            execution_order = self.graph.topological_sort()
            for idx, node_name in enumerate(execution_order):
                self._ctx.debug(
                    f"Executing node {idx + 1}/{len(execution_order)}",
                    node=node_name,
                    order=idx + 1,
                    total=len(execution_order),
                )

                result = process_node(node_name)
                results.node_results[node_name] = result

                if result.success:
                    if (
                        result.metadata.get("skipped")
                        and result.metadata.get("reason") == "dependency_failed"
                    ):
                        results.skipped.append(node_name)
                        results.failed.append(node_name)
                        if progress:
                            progress.update_node(
                                node_name,
                                NodeStatus.SKIPPED,
                                result.duration,
                                result.rows_processed,
                            )
                    else:
                        results.completed.append(node_name)
                        if progress:
                            status = (
                                NodeStatus.SKIPPED
                                if result.metadata.get("skipped")
                                else NodeStatus.SUCCESS
                            )
                            progress.update_node(
                                node_name,
                                status,
                                result.duration,
                                result.rows_processed,
                            )
                else:
                    if result.metadata.get("skipped"):
                        results.skipped.append(node_name)
                        results.failed.append(node_name)
                        if progress:
                            progress.update_node(
                                node_name,
                                NodeStatus.SKIPPED,
                                result.duration,
                                result.rows_processed,
                            )
                    else:
                        results.failed.append(node_name)
                        if progress:
                            progress.update_node(
                                node_name,
                                NodeStatus.FAILED,
                                result.duration,
                                result.rows_processed,
                            )

                    node_config = self.graph.nodes[node_name]
                    strategy = ErrorStrategy(on_error) if on_error else node_config.on_error

                    if strategy == ErrorStrategy.FAIL_FAST:
                        self._ctx.error(
                            "FAIL_FAST triggered: Stopping pipeline",
                            failed_node=node_name,
                            error=result.error,
                            remaining_nodes=len(execution_order) - idx - 1,
                        )
                        break

        # Calculate duration
        results.duration = time.time() - start_time
        results.end_time = datetime.now().isoformat()

        # Batch write run records to catalog (much faster than per-node writes)
        # Skip if performance.skip_run_logging is enabled
        skip_run_logging = self.performance_config and getattr(
            self.performance_config, "skip_run_logging", False
        )
        if self.catalog_manager and not skip_run_logging:
            run_records = []
            for node_result in results.node_results.values():
                if node_result.metadata and "_run_record" in node_result.metadata:
                    run_records.append(node_result.metadata.pop("_run_record"))
            if run_records:
                self.catalog_manager.log_runs_batch(run_records)
                self._ctx.debug(
                    f"Batch logged {len(run_records)} run records",
                    record_count=len(run_records),
                )

            # Batch write output metadata for cross-pipeline dependencies
            output_records = []
            for node_result in results.node_results.values():
                if node_result.metadata and "_output_record" in node_result.metadata:
                    output_records.append(node_result.metadata.pop("_output_record"))
            if output_records:
                try:
                    self.catalog_manager.register_outputs_batch(output_records)
                    self._ctx.debug(
                        f"Batch registered {len(output_records)} output(s)",
                        output_count=len(output_records),
                    )
                except Exception as e:
                    self._ctx.warning(
                        f"Failed to register outputs (non-fatal): {e}",
                        error_type=type(e).__name__,
                    )

            # Flush buffered catalog writes (lineage, assets, HWM)
            self._flush_batch_writes()

        elif skip_run_logging:
            self._ctx.debug("Skipping run logging (skip_run_logging=true)")

        # Finish progress display
        if progress:
            progress.finish(
                completed=len(results.completed),
                failed=len(results.failed),
                skipped=len(results.skipped),
                duration=results.duration,
            )
            # Print phase timing breakdown for performance analysis
            progress.print_phase_timing_report(pipeline_duration_s=results.duration)

        # Log pipeline completion summary
        status = "SUCCESS" if not results.failed else "FAILED"
        self._ctx.info(
            f"Pipeline {status}: {self.config.pipeline}",
            status=status,
            duration_s=round(results.duration, 2),
            completed=len(results.completed),
            failed=len(results.failed),
            skipped=len(results.skipped),
            total_nodes=len(self.graph.nodes),
        )

        # Start story generation in background thread (pure Python/file I/O, safe to parallelize)
        # This runs concurrently with state saving below
        story_future = None
        story_executor = None
        async_story = self.story_config.get("async_generation", False)

        if self.generate_story:
            from concurrent.futures import ThreadPoolExecutor

            if hasattr(self.config, "model_dump"):
                config_dump = self.config.model_dump(mode="json")
            else:
                config_dump = self.config.model_dump()

            if self.project_config:
                project_dump = (
                    self.project_config.model_dump(mode="json")
                    if hasattr(self.project_config, "model_dump")
                    else self.project_config.model_dump()
                )
                for field in ["project", "plant", "asset", "business_unit", "layer"]:
                    if field in project_dump and project_dump[field]:
                        config_dump[field] = project_dump[field]

            def generate_story():
                try:
                    # Get graph data for interactive DAG visualization
                    graph_data_dict = self.graph.to_dict() if self.graph else None

                    return self.story_generator.generate(
                        node_results=results.node_results,
                        completed=results.completed,
                        failed=results.failed,
                        skipped=results.skipped,
                        duration=results.duration,
                        start_time=results.start_time,
                        end_time=results.end_time,
                        context=self.context,
                        config=config_dump,
                        graph_data=graph_data_dict,
                    )
                except Exception as e:
                    self._ctx.warning(f"Story generation failed: {e}")
                    return None

            story_executor = ThreadPoolExecutor(max_workers=1)
            story_future = story_executor.submit(generate_story)

        # Save state if running normally (not dry run)
        # This runs while story generation happens in background
        if not dry_run:
            if not state_manager and self.project_config:
                try:
                    backend = create_state_backend(
                        config=self.project_config,
                        project_root=".",
                        spark_session=getattr(self.engine, "spark", None),
                    )
                    state_manager = StateManager(backend=backend)
                except Exception as e:
                    self._ctx.warning(
                        f"Could not initialize StateManager for saving run: {e}",
                        suggestion="Check state backend configuration",
                    )

            if state_manager:
                state_manager.save_pipeline_run(self.config.pipeline, results)
                self._ctx.debug("Pipeline run state saved")

        # Handle story completion based on async_generation setting
        if story_future:
            if async_story:
                # Store future and executor for flush_stories()
                self._story_future = story_future
                self._story_executor = story_executor
                self._ctx.debug("Story generation running async (can be flushed later)")
            else:
                # Wait for story generation to complete
                try:
                    story_path = story_future.result(timeout=60)
                    if story_path:
                        results.story_path = story_path
                        self._ctx.info("Story generated", story_path=story_path)
                except Exception as e:
                    self._ctx.warning(f"Story generation failed: {e}")
                finally:
                    if story_executor:
                        story_executor.shutdown(wait=False)

        # Alert: on_success / on_failure
        if results.failed:
            self._send_alerts("on_failure", results)
        else:
            self._send_alerts("on_success", results)

        # Catalog optimization (optional - can be slow, ~15-20s)
        # Only run if explicitly enabled via optimize_catalog flag
        if self.catalog_manager and getattr(self, "optimize_catalog", False):
            self.catalog_manager.optimize()
            self._ctx.debug("Catalog optimized")

        # Lineage: Complete
        if self.lineage:
            self.lineage.emit_pipeline_complete(self.config, results)

        return results

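    # Illustrative usage sketch for run() (not part of the packaged source); the tag
    # and node names are hypothetical.
    #
    #   results = pipeline.run(parallel=True, max_workers=8, console=True)
    #   results = pipeline.run(tag="silver")        # only nodes carrying this tag
    #   results = pipeline.run(node="orders_fact")  # one node plus its upstream dependencies
    #   results = pipeline.run(dry_run=True)        # simulate without running operations
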
    def flush_stories(self, timeout: float = 60.0) -> Optional[str]:
        """Wait for any pending async story generation to complete.

        Call this before operations that need story files to be written,
        such as lineage generation.

        Args:
            timeout: Maximum seconds to wait for story generation

        Returns:
            Story path if generated, None otherwise
        """
        if self._story_future is None:
            return None

        try:
            story_path = self._story_future.result(timeout=timeout)
            self._ctx.info("Async story generation completed", story_path=story_path)
            return story_path
        except Exception as e:
            self._ctx.warning(f"Async story generation failed: {e}")
            return None
        finally:
            if self._story_executor:
                self._story_executor.shutdown(wait=False)
            self._story_future = None
            self._story_executor = None

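    # Illustrative sketch of the async story flow (not part of the packaged source).
    # With async_generation enabled in the story config, run() leaves the story
    # rendering in a background thread; flush_stories() blocks until the file is
    # written, which matters before lineage generation or process exit.
    #
    #   results = pipeline.run()            # may return before the story is written
    #   story_path = pipeline.flush_stories(timeout=120)
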
    def _send_alerts(self, event: str, results: PipelineResults) -> None:
        """Send alerts for a specific event.

        Args:
            event: Event name (on_start, on_success, on_failure)
            results: Pipeline results
        """
        for alert_config in self.alerts:
            event_values = [e.value if hasattr(e, "value") else e for e in alert_config.on_events]
            if event in event_values:
                status = "FAILED" if results.failed else "SUCCESS"
                if event == "on_start":
                    status = "STARTED"

                context = {
                    "pipeline": self.config.pipeline,
                    "status": status,
                    "duration": results.duration,
                    "timestamp": datetime.now().isoformat(),
                    "project_config": self.project_config,
                    "event_type": event,
                }

                # Enrich with story summary (row counts, story URL)
                if event != "on_start" and self.generate_story:
                    story_summary = self.story_generator.get_alert_summary()
                    context.update(story_summary)

                msg = f"Pipeline '{self.config.pipeline}' {status}"
                if results.failed:
                    msg += f". Failed nodes: {', '.join(results.failed)}"

                send_alert(alert_config, msg, context)

    def buffer_lineage_record(self, record: Dict[str, Any]) -> None:
        """Buffer a lineage record for batch write at pipeline end.

        Args:
            record: Dict with keys: source_table, target_table, target_pipeline,
                target_node, run_id, and optional source_pipeline, source_node
        """
        self._pending_lineage_records.append(record)

    def buffer_asset_record(self, record: Dict[str, Any]) -> None:
        """Buffer an asset registration record for batch write at pipeline end.

        Args:
            record: Dict with keys: project_name, table_name, path, format,
                pattern_type, and optional schema_hash
        """
        self._pending_asset_records.append(record)

    def buffer_hwm_update(self, key: str, value: Any) -> None:
        """Buffer a HWM update for batch write at pipeline end.

        Args:
            key: HWM state key
            value: HWM value
        """
        self._pending_hwm_updates.append({"key": key, "value": value})

    def _flush_batch_writes(self) -> None:
        """Flush all buffered catalog writes in single batch operations.

        This eliminates concurrency conflicts when running 35+ parallel nodes
        by writing all lineage, assets, and HWM updates at once.
        """
        if not self.catalog_manager:
            return

        # Flush lineage records
        if self._pending_lineage_records:
            try:
                self.catalog_manager.record_lineage_batch(self._pending_lineage_records)
                self._ctx.debug(
                    f"Batch recorded {len(self._pending_lineage_records)} lineage relationship(s)",
                    lineage_count=len(self._pending_lineage_records),
                )
            except Exception as e:
                self._ctx.warning(
                    f"Failed to batch record lineage (non-fatal): {e}",
                    error_type=type(e).__name__,
                )
            finally:
                self._pending_lineage_records = []

        # Flush asset records
        if self._pending_asset_records:
            try:
                self.catalog_manager.register_assets_batch(self._pending_asset_records)
                self._ctx.debug(
                    f"Batch registered {len(self._pending_asset_records)} asset(s)",
                    asset_count=len(self._pending_asset_records),
                )
            except Exception as e:
                self._ctx.warning(
                    f"Failed to batch register assets (non-fatal): {e}",
                    error_type=type(e).__name__,
                )
            finally:
                self._pending_asset_records = []

        # Flush HWM updates
        if self._pending_hwm_updates:
            try:
                if self.project_config:
                    backend = create_state_backend(
                        config=self.project_config,
                        project_root=".",
                        spark_session=getattr(self.engine, "spark", None),
                    )
                    state_manager = StateManager(backend=backend)
                    state_manager.set_hwm_batch(self._pending_hwm_updates)
                    self._ctx.debug(
                        f"Batch updated {len(self._pending_hwm_updates)} HWM value(s)",
                        hwm_count=len(self._pending_hwm_updates),
                    )
            except Exception as e:
                self._ctx.warning(
                    f"Failed to batch update HWM (non-fatal): {e}",
                    error_type=type(e).__name__,
                )
            finally:
                self._pending_hwm_updates = []

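    # Illustrative sketch of the batch-write pattern (not part of the packaged
    # source). Nodes append catalog records to the shared buffers during execution
    # and run() performs one flush at the end, so parallel nodes do not contend on
    # catalog writes. Table, pipeline, and node names below are hypothetical.
    #
    #   pipeline.buffer_lineage_record({
    #       "source_table": "bronze.orders",
    #       "target_table": "silver.orders",
    #       "target_pipeline": "bronze_to_silver",
    #       "target_node": "orders_clean",
    #       "run_id": run_id,
    #   })
    #   pipeline.buffer_hwm_update("orders_clean.last_loaded_at", "2024-01-01T00:00:00")
    #   pipeline._flush_batch_writes()  # normally invoked by run() itself
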
    def run_node(self, node_name: str, mock_data: Optional[Dict[str, Any]] = None) -> NodeResult:
        """Execute a single node (for testing/debugging).

        Args:
            node_name: Name of node to execute
            mock_data: Optional mock data to register in context

        Returns:
            NodeResult
        """
        if node_name not in self.graph.nodes:
            available = ", ".join(self.graph.nodes.keys()) or "none"
            raise ValueError(
                f"Node '{node_name}' not found in pipeline. " f"Available nodes: {available}"
            )

        # Register mock data if provided
        if mock_data:
            for name, data in mock_data.items():
                self.context.register(name, data)

        # Execute the node
        node_config = self.graph.nodes[node_name]
        node = Node(
            config=node_config,
            context=self.context,
            engine=self.engine,
            connections=self.connections,
            performance_config=self.performance_config,
            pipeline_name=self.config.pipeline,
            config_file=node_config.source_yaml,
        )

        return node.execute()

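    # Illustrative testing sketch for run_node() (not part of the packaged source);
    # the node name and mock DataFrame are hypothetical.
    #
    #   import pandas as pd
    #   mock = {"raw_orders": pd.DataFrame({"id": [1, 2], "amount": [10.0, 20.0]})}
    #   result = pipeline.run_node("orders_clean", mock_data=mock)
    #   assert result.success, result.error
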
    def validate(self) -> Dict[str, Any]:
        """Validate pipeline without executing.

        Returns:
            Validation results
        """
        self._ctx.info("Validating pipeline configuration")

        validation = {
            "valid": True,
            "errors": [],
            "warnings": [],
            "node_count": len(self.graph.nodes),
            "execution_order": [],
        }

        try:
            execution_order = self.graph.topological_sort()
            validation["execution_order"] = execution_order
            self._ctx.debug(
                "Dependency graph validated",
                execution_order=execution_order,
            )

            for node_name, node in self.graph.nodes.items():
                if node.transformer:
                    try:
                        FunctionRegistry.validate_params(node.transformer, node.params)
                    except ValueError as e:
                        validation["errors"].append(f"Node '{node_name}' transformer error: {e}")
                        validation["valid"] = False
                        self._ctx.log_validation_result(
                            passed=False,
                            rule_name=f"transformer_params:{node_name}",
                            failures=[str(e)],
                        )

                if node.transform and node.transform.steps:
                    for i, step in enumerate(node.transform.steps):
                        if isinstance(step, str):
                            continue

                        if hasattr(step, "function") and step.function:
                            try:
                                FunctionRegistry.validate_params(step.function, step.params)
                            except ValueError as e:
                                validation["errors"].append(
                                    f"Node '{node_name}' step {i + 1} error: {e}"
                                )
                                validation["valid"] = False
                                self._ctx.log_validation_result(
                                    passed=False,
                                    rule_name=f"step_params:{node_name}:step_{i + 1}",
                                    failures=[str(e)],
                                )

        except DependencyError as e:
            validation["valid"] = False
            validation["errors"].append(str(e))
            self._ctx.error(
                "Dependency graph validation failed",
                error=str(e),
            )

        for node in self.config.nodes:
            if node.read and node.read.connection not in self.connections:
                validation["warnings"].append(
                    f"Node '{node.name}': connection '{node.read.connection}' not configured"
                )
            if node.write and node.write.connection not in self.connections:
                validation["warnings"].append(
                    f"Node '{node.name}': connection '{node.write.connection}' not configured"
                )

        self._ctx.info(
            f"Validation {'passed' if validation['valid'] else 'failed'}",
            valid=validation["valid"],
            errors=len(validation["errors"]),
            warnings=len(validation["warnings"]),
        )

        return validation

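    # Illustrative pre-flight check (not part of the packaged source): gating
    # execution on validate() before calling run().
    #
    #   report = pipeline.validate()
    #   if not report["valid"]:
    #       raise SystemExit("\n".join(report["errors"]))
    #   for warning in report["warnings"]:
    #       print("WARN:", warning)
    #   results = pipeline.run()
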
1277
+ def get_execution_layers(self) -> List[List[str]]:
1278
+ """Get nodes grouped by execution layers.
1279
+
1280
+ Returns:
1281
+ List of layers, where each layer is a list of node names
1282
+ """
1283
+ return self.graph.get_execution_layers()
1284
+
1285
+ def visualize(self) -> str:
1286
+ """Get text visualization of pipeline.
1287
+
1288
+ Returns:
1289
+ String representation of pipeline graph
1290
+ """
1291
+ return self.graph.visualize()
1292
+
1293
+
1294
+ class PipelineManager:
1295
+ """Manages multiple pipelines from a YAML configuration."""
1296
+
1297
+ def __init__(
1298
+ self,
1299
+ project_config: ProjectConfig,
1300
+ connections: Dict[str, Any],
1301
+ ):
1302
+ """Initialize pipeline manager.
1303
+
1304
+ Args:
1305
+ project_config: Validated project configuration
1306
+ connections: Connection objects (already instantiated)
1307
+ """
1308
+ self.project_config = project_config
1309
+ self.connections = connections
1310
+ self._pipelines: Dict[str, Pipeline] = {}
1311
+ self.catalog_manager = None
1312
+ self.lineage_adapter = None
1313
+
1314
+ # Configure logging
1315
+ configure_logging(
1316
+ structured=project_config.logging.structured, level=project_config.logging.level.value
1317
+ )
1318
+
1319
+ # Create manager-level logging context
1320
+ self._ctx = create_logging_context(engine=project_config.engine)
1321
+
1322
+ self._ctx.info(
1323
+ "Initializing PipelineManager",
1324
+ project=project_config.project,
1325
+ engine=project_config.engine,
1326
+ pipeline_count=len(project_config.pipelines),
1327
+ connection_count=len(connections),
1328
+ )
1329
+
1330
+ # Initialize Lineage Adapter
1331
+ self.lineage_adapter = OpenLineageAdapter(project_config.lineage)
1332
+
1333
+ # Initialize CatalogManager if configured
1334
+ if project_config.system:
1335
+ from odibi.catalog import CatalogManager
1336
+
1337
+ spark = None
1338
+ engine_instance = None
1339
+
1340
+ if project_config.engine == "spark":
1341
+ try:
1342
+ from odibi.engine.spark_engine import SparkEngine
1343
+
1344
+ temp_engine = SparkEngine(connections=connections, config={})
1345
+ spark = temp_engine.spark
1346
+ self._ctx.debug("Spark session initialized for System Catalog")
1347
+ except Exception as e:
1348
+ self._ctx.warning(
1349
+ f"Failed to initialize Spark for System Catalog: {e}",
1350
+ suggestion="Check Spark configuration",
1351
+ )
1352
+
1353
+ sys_conn = connections.get(project_config.system.connection)
1354
+ if sys_conn:
1355
+ base_path = sys_conn.get_path(project_config.system.path)
1356
+
1357
+ if not spark:
1358
+ try:
1359
+ from odibi.engine.pandas_engine import PandasEngine
1360
+
1361
+ engine_instance = PandasEngine(config={})
1362
+ self._ctx.debug("PandasEngine initialized for System Catalog")
1363
+ except Exception as e:
1364
+ self._ctx.warning(
1365
+ f"Failed to initialize PandasEngine for System Catalog: {e}"
1366
+ )
1367
+
1368
+ if spark or engine_instance:
1369
+ self.catalog_manager = CatalogManager(
1370
+ spark=spark,
1371
+ config=project_config.system,
1372
+ base_path=base_path,
1373
+ engine=engine_instance,
1374
+ connection=sys_conn,
1375
+ )
1376
+ self.catalog_manager.bootstrap()
1377
+ self._ctx.info("System Catalog initialized", path=base_path)
1378
+ else:
1379
+ self._ctx.warning(
1380
+ f"System connection '{project_config.system.connection}' not found",
1381
+ suggestion="Configure the system connection in your config",
1382
+ )
1383
+
1384
+ # Get story configuration
1385
+ story_config = self._get_story_config()
1386
+
1387
+ # Create all pipeline instances
1388
+ self._ctx.debug(
1389
+ "Creating pipeline instances",
1390
+ pipelines=[p.pipeline for p in project_config.pipelines],
1391
+ )
1392
+ for pipeline_config in project_config.pipelines:
1393
+ pipeline_name = pipeline_config.pipeline
1394
+
1395
+ self._pipelines[pipeline_name] = Pipeline(
1396
+ pipeline_config=pipeline_config,
1397
+ engine=project_config.engine,
1398
+ connections=connections,
1399
+ generate_story=story_config.get("auto_generate", True),
1400
+ story_config=story_config,
1401
+ retry_config=project_config.retry,
1402
+ alerts=project_config.alerts,
1403
+ performance_config=project_config.performance,
1404
+ catalog_manager=self.catalog_manager,
1405
+ lineage_adapter=self.lineage_adapter,
1406
+ )
1407
+ self._pipelines[pipeline_name].project_config = project_config
1408
+
1409
+ self._ctx.info(
1410
+ "PipelineManager ready",
1411
+ pipelines=list(self._pipelines.keys()),
1412
+ )
1413
+
1414
+ def _get_story_config(self) -> Dict[str, Any]:
1415
+ """Build story config from project_config.story.
1416
+
1417
+ Resolves story output path using connection.
1418
+
1419
+ Returns:
1420
+ Dictionary for StoryGenerator initialization
1421
+ """
1422
+ story_cfg = self.project_config.story
1423
+
1424
+ # Resolve story path using connection
1425
+ story_conn = self.connections[story_cfg.connection]
1426
+ output_path = story_conn.get_path(story_cfg.path)
1427
+
1428
+ # Get storage options (e.g., credentials) from connection if available
1429
+ storage_options = {}
1430
+ if hasattr(story_conn, "pandas_storage_options"):
1431
+ storage_options = story_conn.pandas_storage_options()
1432
+
1433
+ return {
1434
+ "auto_generate": story_cfg.auto_generate,
1435
+ "max_sample_rows": story_cfg.max_sample_rows,
1436
+ "output_path": output_path,
1437
+ "storage_options": storage_options,
1438
+ "async_generation": story_cfg.async_generation,
1439
+ }
1440
+
1441
+ @classmethod
1442
+ def from_yaml(cls, yaml_path: str, env: str = None) -> "PipelineManager":
1443
+ """Create PipelineManager from YAML file.
1444
+
1445
+ Args:
1446
+ yaml_path: Path to YAML configuration file
1447
+ env: Environment name to apply overrides (e.g. 'prod')
1448
+
1449
+ Returns:
1450
+ PipelineManager instance ready to run pipelines
1451
+
1452
+ Example:
1453
+ >>> manager = PipelineManager.from_yaml("config.yaml", env="prod")
1454
+ >>> results = manager.run() # Run all pipelines
1455
+ """
1456
+ logger.info(f"Loading configuration from: {yaml_path}")
1457
+
1458
+ register_standard_library()
1459
+
1460
+ yaml_path_obj = Path(yaml_path)
1461
+ config_dir = yaml_path_obj.parent.absolute()
1462
+
1463
+ import importlib.util
1464
+ import os
1465
+ import sys
1466
+
1467
+ def load_transforms_module(path):
1468
+ if os.path.exists(path):
1469
+ try:
1470
+ spec = importlib.util.spec_from_file_location("transforms_autodiscovered", path)
1471
+ if spec and spec.loader:
1472
+ module = importlib.util.module_from_spec(spec)
1473
+ sys.modules["transforms_autodiscovered"] = module
1474
+ spec.loader.exec_module(module)
1475
+ logger.info(f"Auto-loaded transforms from: {path}")
1476
+ except Exception as e:
1477
+ logger.warning(f"Failed to auto-load transforms from {path}: {e}")
1478
+
1479
+ load_transforms_module(os.path.join(config_dir, "transforms.py"))
1480
+
1481
+ cwd = os.getcwd()
1482
+ if os.path.abspath(cwd) != str(config_dir):
1483
+ load_transforms_module(os.path.join(cwd, "transforms.py"))
1484
+
1485
+ try:
1486
+ config = load_yaml_with_env(str(yaml_path_obj), env=env)
1487
+ logger.debug("Configuration loaded successfully")
1488
+ except FileNotFoundError:
1489
+ logger.error(f"YAML file not found: {yaml_path}")
1490
+ raise FileNotFoundError(
1491
+ f"YAML file not found: {yaml_path}. "
1492
+ f"Verify the file exists and consider using an absolute path."
1493
+ )
1494
+
1495
+ project_config = ProjectConfig(**config)
1496
+ logger.debug(
1497
+ "Project config validated",
1498
+ project=project_config.project,
1499
+ pipelines=len(project_config.pipelines),
1500
+ )
1501
+
1502
+ connections = cls._build_connections(project_config.connections)
1503
+
1504
+ return cls(
1505
+ project_config=project_config,
1506
+ connections=connections,
1507
+ )
1508
+
1509
+ @staticmethod
+ def _build_connections(conn_configs: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
+ """Convert connection configs to connection objects.
+
+ Args:
+ conn_configs: Connection configurations from ProjectConfig
+
+ Returns:
+ Dictionary of connection name -> connection object
+
+ Raises:
+ ValueError: If connection type is not supported
+ """
+ from odibi.connections.factory import register_builtins
+
+ logger.debug(f"Building {len(conn_configs)} connections")
+
+ connections = {}
+
+ register_builtins()
+ load_plugins()
+
+ for conn_name, conn_config in conn_configs.items():
+ if hasattr(conn_config, "model_dump"):
+ conn_config = conn_config.model_dump()
+ elif hasattr(conn_config, "dict"):
+ conn_config = conn_config.dict()
+
+ conn_type = conn_config.get("type", "local")
+
+ factory = get_connection_factory(conn_type)
+ if factory:
+ try:
+ connections[conn_name] = factory(conn_name, conn_config)
+ logger.debug(
+ f"Connection created: {conn_name}",
+ type=conn_type,
+ )
+ except Exception as e:
+ logger.error(
+ f"Failed to create connection '{conn_name}'",
+ type=conn_type,
+ error=str(e),
+ )
+ raise ValueError(
+ f"Failed to create connection '{conn_name}' (type={conn_type}): {e}"
+ ) from e
+ else:
+ logger.error(
+ f"Unsupported connection type: {conn_type}",
+ connection=conn_name,
+ suggestion="Check supported connection types in docs",
+ )
+ raise ValueError(
+ f"Unsupported connection type: {conn_type}. "
+ f"Supported types: local, azure_adls, azure_sql, delta, etc. "
+ f"See docs for connection setup."
+ )
+
+ try:
+ from odibi.utils import configure_connections_parallel
+
+ connections, errors = configure_connections_parallel(connections, verbose=False)
+ if errors:
+ for error in errors:
+ logger.warning(error)
+ except ImportError:
+ pass
+
+ logger.info(f"Built {len(connections)} connections successfully")
+
+ return connections
+
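For orientation, a minimal sketch (editorial, not part of the package source) of the mapping _build_connections consumes. The method itself only relies on the "type" key, defaulting to "local"; the "base_path" key and the import path shown here are assumptions for illustration.

from odibi.pipeline import PipelineManager  # assumed import path

conn_configs = {
    # "type" selects the registered factory; "base_path" is a hypothetical factory-specific option
    "raw": {"type": "local", "base_path": "./data"},
}
connections = PipelineManager._build_connections(conn_configs)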
1582
+ def register_outputs(
+ self,
+ pipelines: Optional[Union[str, List[str]]] = None,
+ ) -> Dict[str, int]:
+ """
+ Pre-register node outputs from pipeline configs without running them.
+
+ Scans pipeline nodes for output locations (write blocks, merge/scd2 params)
+ and registers them to meta_outputs. This enables cross-pipeline references
+ without requiring the source pipelines to have run first.
+
+ Args:
+ pipelines: Pipeline name(s) to register. If None, registers all pipelines.
+
+ Returns:
+ Dict mapping pipeline name to number of outputs registered
+
+ Example:
+ >>> manager = PipelineManager.from_yaml("pipelines.yaml")
+ >>> counts = manager.register_outputs("silver") # Register just silver
+ >>> counts = manager.register_outputs() # Register all pipelines
+ """
+ if pipelines is None:
+ pipeline_names = list(self._pipelines.keys())
+ elif isinstance(pipelines, str):
+ pipeline_names = [pipelines]
+ else:
+ pipeline_names = pipelines
+
+ results = {}
+ for name in pipeline_names:
+ if name not in self._pipelines:
+ self._ctx.warning(f"Pipeline not found: {name}")
+ continue
+
+ pipeline = self._pipelines[name]
+ count = pipeline.register_outputs()
+ results[name] = count
+
+ total = sum(results.values())
+ self._ctx.info(f"Pre-registered {total} outputs from {len(results)} pipelines")
+ return results
+
1625
+ def run(
+ self,
+ pipelines: Optional[Union[str, List[str]]] = None,
+ dry_run: bool = False,
+ resume_from_failure: bool = False,
+ parallel: bool = False,
+ max_workers: int = 4,
+ on_error: Optional[str] = None,
+ tag: Optional[str] = None,
+ node: Optional[Union[str, List[str]]] = None,
+ console: bool = False,
+ ) -> Union[PipelineResults, Dict[str, PipelineResults]]:
+ """Run one, multiple, or all pipelines.
+
+ Args:
+ pipelines: Pipeline name(s) to run.
+ dry_run: Whether to simulate execution.
+ resume_from_failure: Whether to skip successfully completed nodes from last run.
+ parallel: Whether to run nodes in parallel.
+ max_workers: Maximum number of worker threads for parallel execution.
+ on_error: Override error handling strategy (fail_fast, fail_later, ignore).
+ tag: Filter nodes by tag (only nodes with this tag will run).
+ node: Run only specific node(s) by name - can be a string or list of strings.
+ console: Whether to show rich console output with progress.
+
+ Returns:
+ PipelineResults or Dict of results
+ """
+ if pipelines is None:
+ pipeline_names = list(self._pipelines.keys())
+ elif isinstance(pipelines, str):
+ pipeline_names = [pipelines]
+ else:
+ pipeline_names = pipelines
+
+ for name in pipeline_names:
+ if name not in self._pipelines:
+ available = ", ".join(self._pipelines.keys())
+ self._ctx.error(
+ f"Pipeline not found: {name}",
+ available=list(self._pipelines.keys()),
+ )
+ raise ValueError(f"Pipeline '{name}' not found. Available pipelines: {available}")
+
+ # Phase 2: Auto-register pipelines and nodes before execution
+ if self.catalog_manager:
+ self._auto_register_pipelines(pipeline_names)
+
+ self._ctx.info(
+ f"Running {len(pipeline_names)} pipeline(s)",
+ pipelines=pipeline_names,
+ dry_run=dry_run,
+ parallel=parallel,
+ )
+
+ results = {}
+ for idx, name in enumerate(pipeline_names):
+ # Invalidate cache before each pipeline so it sees latest outputs
+ if self.catalog_manager:
+ self.catalog_manager.invalidate_cache()
+
+ self._ctx.info(
+ f"Executing pipeline {idx + 1}/{len(pipeline_names)}: {name}",
+ pipeline=name,
+ order=idx + 1,
+ )
+
+ results[name] = self._pipelines[name].run(
+ dry_run=dry_run,
+ resume_from_failure=resume_from_failure,
+ parallel=parallel,
+ max_workers=max_workers,
+ on_error=on_error,
+ tag=tag,
+ node=node,
+ console=console,
+ )
+
+ result = results[name]
+ status = "SUCCESS" if not result.failed else "FAILED"
+ self._ctx.info(
+ f"Pipeline {status}: {name}",
+ status=status,
+ duration_s=round(result.duration, 2),
+ completed=len(result.completed),
+ failed=len(result.failed),
+ )
+
+ if result.story_path:
+ self._ctx.debug(f"Story generated: {result.story_path}")
+
+ # Generate combined lineage if configured
+ has_story = hasattr(self.project_config, "story") and self.project_config.story
+ generate_lineage_enabled = has_story and self.project_config.story.generate_lineage
+
+ self._ctx.debug(
+ "Lineage check",
+ has_story=has_story,
+ generate_lineage_enabled=generate_lineage_enabled,
+ )
+
+ if generate_lineage_enabled:
+ # Flush any pending async story writes before generating lineage
+ self._ctx.info("Generating combined lineage...")
+ self.flush_stories()
+
+ try:
+ lineage_result = generate_lineage(self.project_config)
+ if lineage_result:
+ self._ctx.info(
+ "Combined lineage generated",
+ nodes=len(lineage_result.nodes),
+ edges=len(lineage_result.edges),
+ json_path=lineage_result.json_path,
+ )
+ else:
+ self._ctx.warning("Lineage generation returned None")
+ except Exception as e:
+ self._ctx.warning(f"Failed to generate combined lineage: {e}")
+
+ if len(pipeline_names) == 1:
+ return results[pipeline_names[0]]
+ else:
+ return results
+
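A usage sketch (editorial, not part of the diff) combining the run() options documented above; the pipeline and node names are placeholders.

manager = PipelineManager.from_yaml("odibi.yaml", env="prod")

# Run two pipelines sequentially, stopping at the first failure
results = manager.run(pipelines=["bronze", "silver"], on_error="fail_fast")

# Re-run one node of one pipeline with parallel workers and console progress
result = manager.run(
    pipelines="silver",
    node="orders_clean",   # placeholder node name
    parallel=True,
    max_workers=8,
    console=True,
)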
1750
+ def list_pipelines(self) -> List[str]:
+ """Get list of available pipeline names.
+
+ Returns:
+ List of pipeline names
+ """
+ return list(self._pipelines.keys())
+
1758
+ def flush_stories(self, timeout: float = 60.0) -> Dict[str, Optional[str]]:
+ """Wait for all pending async story generation to complete.
+
+ Call this before operations that need story files to be written,
+ such as lineage generation with SemanticLayerRunner.
+
+ Args:
+ timeout: Maximum seconds to wait per pipeline
+
+ Returns:
+ Dict mapping pipeline name to flushed story path (pipelines with no pending story are omitted)
+
+ Example:
+ >>> manager.run(pipelines=['bronze', 'silver', 'gold'])
+ >>> manager.flush_stories() # Wait for all stories to be written
+ >>> semantic_runner.run() # Now lineage can read the stories
+ """
+ results = {}
+ for name, pipeline in self._pipelines.items():
+ story_path = pipeline.flush_stories(timeout=timeout)
+ if story_path:
+ results[name] = story_path
+ self._ctx.debug(f"Story flushed for {name}", path=story_path)
+ if results:
+ self._ctx.info(f"Flushed {len(results)} pending story writes")
+ return results
+
1785
+ def get_pipeline(self, name: str) -> Pipeline:
+ """Get a specific pipeline instance.
+
+ Args:
+ name: Pipeline name
+
+ Returns:
+ Pipeline instance
+
+ Raises:
+ ValueError: If pipeline not found
+ """
+ if name not in self._pipelines:
+ available = ", ".join(self._pipelines.keys())
+ raise ValueError(f"Pipeline '{name}' not found. Available: {available}")
+ return self._pipelines[name]
+
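A short sketch of enumerating pipelines with the accessors above; the node count comes from pipeline.config.nodes, which this module uses elsewhere.

manager = PipelineManager.from_yaml("odibi.yaml")
for name in manager.list_pipelines():
    pipeline = manager.get_pipeline(name)
    print(f"{name}: {len(pipeline.config.nodes)} nodes")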
1802
+ def deploy(self, pipelines: Optional[Union[str, List[str]]] = None) -> bool:
+ """Deploy pipeline definitions to the System Catalog.
+
+ This registers pipeline and node configurations in the catalog,
+ enabling drift detection and governance features.
+
+ Args:
+ pipelines: Optional pipeline name(s) to deploy. If None, deploys all.
+
+ Returns:
+ True if deployment succeeded, False otherwise.
+
+ Example:
+ >>> manager = PipelineManager.from_yaml("odibi.yaml")
+ >>> manager.deploy() # Deploy all pipelines
+ >>> manager.deploy("sales_daily") # Deploy specific pipeline
+ """
+ if not self.catalog_manager:
+ self._ctx.warning(
+ "System Catalog not configured. Cannot deploy.",
+ suggestion="Configure system catalog in your YAML config",
+ )
+ return False
+
+ if pipelines is None:
+ to_deploy = self.project_config.pipelines
+ elif isinstance(pipelines, str):
+ to_deploy = [p for p in self.project_config.pipelines if p.pipeline == pipelines]
+ else:
+ to_deploy = [p for p in self.project_config.pipelines if p.pipeline in pipelines]
+
+ if not to_deploy:
+ self._ctx.warning("No matching pipelines found to deploy.")
+ return False
+
+ self._ctx.info(
+ f"Deploying {len(to_deploy)} pipeline(s) to System Catalog",
+ pipelines=[p.pipeline for p in to_deploy],
+ )
+
+ try:
+ self.catalog_manager.bootstrap()
+
+ for pipeline_config in to_deploy:
+ self._ctx.debug(
+ f"Deploying pipeline: {pipeline_config.pipeline}",
+ node_count=len(pipeline_config.nodes),
+ )
+ self.catalog_manager.register_pipeline(pipeline_config, self.project_config)
+
+ for node in pipeline_config.nodes:
+ self.catalog_manager.register_node(pipeline_config.pipeline, node)
+
+ self._ctx.info(
+ f"Deployment complete: {len(to_deploy)} pipeline(s)",
+ deployed=[p.pipeline for p in to_deploy],
+ )
+ return True
+
+ except Exception as e:
+ self._ctx.error(
+ f"Deployment failed: {e}",
+ error_type=type(e).__name__,
+ suggestion="Check catalog configuration and permissions",
+ )
+ return False
+
1869
+ def _auto_register_pipelines(self, pipeline_names: List[str]) -> None:
+ """Auto-register pipelines and nodes before execution.
+
+ This ensures meta_pipelines and meta_nodes are populated automatically
+ when running pipelines, without requiring explicit deploy() calls.
+
+ Uses "check-before-write" pattern with batch writes for performance:
+ - Reads existing hashes in one read
+ - Compares version_hash to skip unchanged records
+ - Batch writes only changed/new records
+
+ Args:
+ pipeline_names: List of pipeline names to register
+ """
+ if not self.catalog_manager:
+ return
+
+ try:
+ import hashlib
+ import json
+
+ existing_pipelines = self.catalog_manager.get_all_registered_pipelines()
+ existing_nodes = self.catalog_manager.get_all_registered_nodes(pipeline_names)
+
+ pipeline_records = []
+ node_records = []
+
+ for name in pipeline_names:
+ pipeline = self._pipelines[name]
+ config = pipeline.config
+
+ if hasattr(config, "model_dump"):
+ dump = config.model_dump(mode="json")
+ else:
+ dump = config.dict()
+ dump_str = json.dumps(dump, sort_keys=True)
+ pipeline_hash = hashlib.md5(dump_str.encode("utf-8")).hexdigest()
+
+ if existing_pipelines.get(name) != pipeline_hash:
+ all_tags = set()
+ for node in config.nodes:
+ if node.tags:
+ all_tags.update(node.tags)
+
+ pipeline_records.append(
+ {
+ "pipeline_name": name,
+ "version_hash": pipeline_hash,
+ "description": config.description or "",
+ "layer": config.layer or "",
+ "schedule": "",
+ "tags_json": json.dumps(list(all_tags)),
+ }
+ )
+
+ pipeline_existing_nodes = existing_nodes.get(name, {})
+ for node in config.nodes:
+ if hasattr(node, "model_dump"):
+ node_dump = node.model_dump(
+ mode="json", exclude={"description", "tags", "log_level"}
+ )
+ else:
+ node_dump = node.dict(exclude={"description", "tags", "log_level"})
+ node_dump_str = json.dumps(node_dump, sort_keys=True)
+ node_hash = hashlib.md5(node_dump_str.encode("utf-8")).hexdigest()
+
+ if pipeline_existing_nodes.get(node.name) != node_hash:
+ node_type = "transform"
+ if node.read:
+ node_type = "read"
+ if node.write:
+ node_type = "write"
+
+ node_records.append(
+ {
+ "pipeline_name": name,
+ "node_name": node.name,
+ "version_hash": node_hash,
+ "type": node_type,
+ "config_json": json.dumps(node_dump),
+ }
+ )
+
+ if pipeline_records:
+ self.catalog_manager.register_pipelines_batch(pipeline_records)
+ self._ctx.debug(
+ f"Batch registered {len(pipeline_records)} changed pipeline(s)",
+ pipelines=[r["pipeline_name"] for r in pipeline_records],
+ )
+ else:
+ self._ctx.debug("All pipelines unchanged - skipping registration")
+
+ if node_records:
+ self.catalog_manager.register_nodes_batch(node_records)
+ self._ctx.debug(
+ f"Batch registered {len(node_records)} changed node(s)",
+ nodes=[r["node_name"] for r in node_records],
+ )
+ else:
+ self._ctx.debug("All nodes unchanged - skipping registration")
+
+ except Exception as e:
+ self._ctx.warning(
+ f"Auto-registration failed (non-fatal): {e}",
+ error_type=type(e).__name__,
+ )
+
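The check-before-write versioning boils down to hashing a canonical JSON dump of each config; a standalone sketch of that hash, matching the hashlib/json calls above:

import hashlib
import json

def version_hash(config_dump: dict) -> str:
    # Sorted keys make the hash stable regardless of key ordering
    dump_str = json.dumps(config_dump, sort_keys=True)
    return hashlib.md5(dump_str.encode("utf-8")).hexdigest()

# A record is re-registered only when its hash differs from the stored one
changed = version_hash({"pipeline": "silver"}) != "previously-stored-hash"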
1976
+ # -------------------------------------------------------------------------
+ # Phase 5: List/Query Methods
+ # -------------------------------------------------------------------------
+
+ def list_registered_pipelines(self) -> "pd.DataFrame":
+ """List all registered pipelines from the system catalog.
+
+ Returns:
+ DataFrame with pipeline metadata from meta_pipelines
+ """
+ import pandas as pd
+
+ if not self.catalog_manager:
+ self._ctx.warning("Catalog manager not configured")
+ return pd.DataFrame()
+
+ try:
+ df = self.catalog_manager._read_local_table(
+ self.catalog_manager.tables["meta_pipelines"]
+ )
+ return df
+ except Exception as e:
+ self._ctx.warning(f"Failed to list pipelines: {e}")
+ return pd.DataFrame()
+
+ def list_registered_nodes(self, pipeline: Optional[str] = None) -> "pd.DataFrame":
+ """List nodes from the system catalog.
+
+ Args:
+ pipeline: Optional pipeline name to filter by
+
+ Returns:
+ DataFrame with node metadata from meta_nodes
+ """
+ import pandas as pd
+
+ if not self.catalog_manager:
+ self._ctx.warning("Catalog manager not configured")
+ return pd.DataFrame()
+
+ try:
+ df = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_nodes"])
+ if not df.empty and pipeline:
+ df = df[df["pipeline_name"] == pipeline]
+ return df
+ except Exception as e:
+ self._ctx.warning(f"Failed to list nodes: {e}")
+ return pd.DataFrame()
+
+ def list_runs(
+ self,
+ pipeline: Optional[str] = None,
+ node: Optional[str] = None,
+ status: Optional[str] = None,
+ limit: int = 10,
+ ) -> "pd.DataFrame":
+ """List recent runs with optional filters.
+
+ Args:
+ pipeline: Optional pipeline name to filter by
+ node: Optional node name to filter by
+ status: Optional status to filter by (SUCCESS, FAILURE)
+ limit: Maximum number of runs to return
+
+ Returns:
+ DataFrame with run history from meta_runs
+ """
+ import pandas as pd
+
+ if not self.catalog_manager:
+ self._ctx.warning("Catalog manager not configured")
+ return pd.DataFrame()
+
+ try:
+ df = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_runs"])
+ if df.empty:
+ return df
+
+ if pipeline:
+ df = df[df["pipeline_name"] == pipeline]
+ if node:
+ df = df[df["node_name"] == node]
+ if status:
+ df = df[df["status"] == status]
+
+ if "timestamp" in df.columns:
+ df = df.sort_values("timestamp", ascending=False)
+
+ return df.head(limit)
+ except Exception as e:
+ self._ctx.warning(f"Failed to list runs: {e}")
+ return pd.DataFrame()
+
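A query sketch (names are placeholders) for run history; the status values and columns mirror those used in the filters above.

failures = manager.list_runs(pipeline="silver", status="FAILURE", limit=20)
if not failures.empty:
    print(failures[["node_name", "timestamp", "status"]])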
2069
+ def list_tables(self) -> "pd.DataFrame":
+ """List registered assets from meta_tables.
+
+ Returns:
+ DataFrame with table/asset metadata
+ """
+ import pandas as pd
+
+ if not self.catalog_manager:
+ self._ctx.warning("Catalog manager not configured")
+ return pd.DataFrame()
+
+ try:
+ df = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_tables"])
+ return df
+ except Exception as e:
+ self._ctx.warning(f"Failed to list tables: {e}")
+ return pd.DataFrame()
+
2088
+ # -------------------------------------------------------------------------
+ # Phase 5.2: State Methods
+ # -------------------------------------------------------------------------
+
+ def get_state(self, key: str) -> Optional[Dict[str, Any]]:
+ """Get a specific state entry (high-water mark/HWM, content hash, etc.).
+
+ Args:
+ key: The state key to look up
+
+ Returns:
+ Dictionary with state data or None if not found
+ """
+
+ if not self.catalog_manager:
+ return None
+
+ try:
+ df = self.catalog_manager._read_table(self.catalog_manager.tables["meta_state"])
+ if df.empty or "key" not in df.columns:
+ return None
+
+ row = df[df["key"] == key]
+ if row.empty:
+ return None
+
+ return row.iloc[0].to_dict()
+ except Exception:
+ return None
+
2118
+ def get_all_state(self, prefix: Optional[str] = None) -> "pd.DataFrame":
+ """Get all state entries, optionally filtered by key prefix.
+
+ Args:
+ prefix: Optional key prefix to filter by
+
+ Returns:
+ DataFrame with state entries
+ """
+ import pandas as pd
+
+ if not self.catalog_manager:
+ return pd.DataFrame()
+
+ try:
+ df = self.catalog_manager._read_table(self.catalog_manager.tables["meta_state"])
+ if not df.empty and prefix and "key" in df.columns:
+ df = df[df["key"].str.startswith(prefix)]
+ return df
+ except Exception as e:
+ self._ctx.warning(f"Failed to get state: {e}")
+ return pd.DataFrame()
+
+ def clear_state(self, key: str) -> bool:
+ """Remove a state entry.
+
+ Args:
+ key: The state key to remove
+
+ Returns:
+ True if deleted, False otherwise
+ """
+ if not self.catalog_manager:
+ return False
+
+ try:
+ return self.catalog_manager.clear_state_key(key)
+ except Exception as e:
+ self._ctx.warning(f"Failed to clear state: {e}")
+ return False
+
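A sketch of inspecting and resetting incremental state with these helpers; the key format shown is an assumption, since key naming is owned by the state layer.

hwms = manager.get_all_state(prefix="silver/")        # hypothetical key prefix
entry = manager.get_state("silver/orders_clean/hwm")  # hypothetical key
if entry is not None:
    manager.clear_state("silver/orders_clean/hwm")    # remove the stored entry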
2159
+ # -------------------------------------------------------------------------
+ # Phase 5.3-5.4: Schema/Lineage and Stats Methods
+ # -------------------------------------------------------------------------
+
+ def get_schema_history(
+ self,
+ table: str,
+ limit: int = 5,
+ ) -> "pd.DataFrame":
+ """Get schema version history for a table.
+
+ Args:
+ table: Table identifier (supports smart path resolution)
+ limit: Maximum number of versions to return
+
+ Returns:
+ DataFrame with schema history
+ """
+ import pandas as pd
+
+ if not self.catalog_manager:
+ return pd.DataFrame()
+
+ try:
+ resolved_path = self._resolve_table_path(table)
+ history = self.catalog_manager.get_schema_history(resolved_path, limit)
+ return pd.DataFrame(history)
+ except Exception as e:
+ self._ctx.warning(f"Failed to get schema history: {e}")
+ return pd.DataFrame()
+
+ def get_lineage(
+ self,
+ table: str,
+ direction: str = "both",
+ ) -> "pd.DataFrame":
+ """Get lineage for a table.
+
+ Args:
+ table: Table identifier (supports smart path resolution)
+ direction: "upstream", "downstream", or "both"
+
+ Returns:
+ DataFrame with lineage relationships
+ """
+ import pandas as pd
+
+ if not self.catalog_manager:
+ return pd.DataFrame()
+
+ try:
+ resolved_path = self._resolve_table_path(table)
+
+ results = []
+ if direction in ("upstream", "both"):
+ upstream = self.catalog_manager.get_upstream(resolved_path)
+ for r in upstream:
+ r["direction"] = "upstream"
+ results.extend(upstream)
+
+ if direction in ("downstream", "both"):
+ downstream = self.catalog_manager.get_downstream(resolved_path)
+ for r in downstream:
+ r["direction"] = "downstream"
+ results.extend(downstream)
+
+ return pd.DataFrame(results)
+ except Exception as e:
+ self._ctx.warning(f"Failed to get lineage: {e}")
+ return pd.DataFrame()
+
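A lineage query sketch; the identifier can take any of the forms accepted by _resolve_table_path, and only the direction column added above is assumed in the output.

edges = manager.get_lineage("test.vw_OSMPerformanceOEE", direction="upstream")
if not edges.empty:
    print(edges["direction"].value_counts())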
2230
+ def get_pipeline_status(self, pipeline: str) -> Dict[str, Any]:
+ """Get last run status, duration, timestamp for a pipeline.
+
+ Args:
+ pipeline: Pipeline name
+
+ Returns:
+ Dict with status info
+ """
+ if not self.catalog_manager:
+ return {}
+
+ try:
+ runs = self.list_runs(pipeline=pipeline, limit=1)
+ if runs.empty:
+ return {"status": "never_run", "pipeline": pipeline}
+
+ last_run = runs.iloc[0].to_dict()
+ return {
+ "pipeline": pipeline,
+ "last_status": last_run.get("status"),
+ "last_run_at": last_run.get("timestamp"),
+ "last_duration_ms": last_run.get("duration_ms"),
+ "last_node": last_run.get("node_name"),
+ }
+ except Exception as e:
+ self._ctx.warning(f"Failed to get pipeline status: {e}")
+ return {}
+
2259
+ def get_node_stats(self, node: str, days: int = 7) -> Dict[str, Any]:
+ """Get average duration, row counts, and success rate for a node over a lookback period.
+
+ Args:
+ node: Node name
+ days: Number of days to look back
+
+ Returns:
+ Dict with node statistics
+ """
+ import pandas as pd
+
+ if not self.catalog_manager:
+ return {}
+
+ try:
+ avg_duration = self.catalog_manager.get_average_duration(node, days)
+
+ df = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_runs"])
+ if df.empty:
+ return {"node": node, "runs": 0}
+
+ if "timestamp" in df.columns:
+ cutoff = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=days)
+ if not pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
+ df["timestamp"] = pd.to_datetime(df["timestamp"])
+ if df["timestamp"].dt.tz is None:
+ df["timestamp"] = df["timestamp"].dt.tz_localize("UTC")
+ df = df[df["timestamp"] >= cutoff]
+
+ node_runs = df[df["node_name"] == node]
+ if node_runs.empty:
+ return {"node": node, "runs": 0}
+
+ total = len(node_runs)
+ success = len(node_runs[node_runs["status"] == "SUCCESS"])
+ avg_rows = node_runs["rows_processed"].mean() if "rows_processed" in node_runs else None
+
+ return {
+ "node": node,
+ "runs": total,
+ "success_rate": success / total if total > 0 else 0,
+ "avg_duration_s": avg_duration,
+ "avg_rows": avg_rows,
+ "period_days": days,
+ }
+ except Exception as e:
+ self._ctx.warning(f"Failed to get node stats: {e}")
+ return {}
+
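A monitoring sketch built on get_pipeline_status and get_node_stats; the dictionary keys come from the return values above, while the pipeline name is a placeholder.

status = manager.get_pipeline_status("silver")
if status.get("last_status") != "SUCCESS":
    stats = manager.get_node_stats(status.get("last_node", ""), days=14)
    print(f"14-day success rate: {stats.get('success_rate', 0):.0%}")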
2309
+ # -------------------------------------------------------------------------
+ # Phase 6: Smart Path Resolution
+ # -------------------------------------------------------------------------
+
+ def _resolve_table_path(self, identifier: str) -> str:
+ """Resolve a user-friendly identifier to a full table path.
+
+ Accepts:
+ - Relative path: "bronze/OEE/vw_OSMPerformanceOEE"
+ - Registered table: "test.vw_OSMPerformanceOEE"
+ - Node name: "opsvisdata_vw_OSMPerformanceOEE"
+ - Full path: "abfss://..." (used as-is)
+
+ Args:
+ identifier: User-friendly table identifier
+
+ Returns:
+ Full table path
+ """
+ if self._is_full_path(identifier):
+ return identifier
+
+ if self.catalog_manager:
+ resolved = self._lookup_in_catalog(identifier)
+ if resolved:
+ return resolved
+
+ for pipeline in self._pipelines.values():
+ for node in pipeline.config.nodes:
+ if node.name == identifier and node.write:
+ conn = self.connections.get(node.write.connection)
+ if conn:
+ return conn.get_path(node.write.path or node.write.table)
+
+ sys_conn_name = (
+ self.project_config.system.connection if self.project_config.system else None
+ )
+ if sys_conn_name:
+ sys_conn = self.connections.get(sys_conn_name)
+ if sys_conn:
+ return sys_conn.get_path(identifier)
+
+ return identifier
+
+ def _is_full_path(self, identifier: str) -> bool:
+ """Check if identifier is already a full path."""
+ full_path_prefixes = ("abfss://", "s3://", "gs://", "hdfs://", "/", "C:", "D:")
+ return identifier.startswith(full_path_prefixes)
+
2358
+ def _lookup_in_catalog(self, identifier: str) -> Optional[str]:
+ """Look up identifier in meta_tables catalog."""
+ if not self.catalog_manager:
+ return None
+
+ try:
+ df = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_tables"])
+ if df.empty or "table_name" not in df.columns:
+ return None
+
+ match = df[df["table_name"] == identifier]
+ if not match.empty and "path" in match.columns:
+ return match.iloc[0]["path"]
+
+ if "." in identifier:
+ parts = identifier.split(".", 1)
+ if len(parts) == 2:
+ match = df[df["table_name"] == parts[1]]
+ if not match.empty and "path" in match.columns:
+ return match.iloc[0]["path"]
+
+ except Exception:
+ pass
+
+ return None
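Resolution order in the helpers above: full paths pass through unchanged, then the meta_tables catalog is consulted, then a matching node's write target, then the system connection, before falling back to the raw identifier. A minimal illustration using identifiers from the docstring:

manager._resolve_table_path("abfss://...")                      # full paths are returned as-is
manager._resolve_table_path("opsvisdata_vw_OSMPerformanceOEE")  # resolved via catalog or node write target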