odibi 2.5.0__py3-none-any.whl
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/pipeline.py
ADDED
@@ -0,0 +1,2382 @@
"""Pipeline executor and orchestration."""

import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

if TYPE_CHECKING:
    import pandas as pd

from odibi.config import AlertConfig, ErrorStrategy, PipelineConfig, ProjectConfig, RetryConfig
from odibi.context import create_context
from odibi.engine.registry import get_engine_class
from odibi.exceptions import DependencyError
from odibi.graph import DependencyGraph
from odibi.lineage import OpenLineageAdapter
from odibi.node import Node, NodeResult
from odibi.plugins import get_connection_factory, load_plugins
from odibi.registry import FunctionRegistry
from odibi.state import StateManager, create_state_backend
from odibi.story import StoryGenerator
from odibi.story.lineage_utils import generate_lineage
from odibi.transformers import register_standard_library
from odibi.utils import load_yaml_with_env
from odibi.utils.alerting import send_alert
from odibi.utils.logging import configure_logging, logger
from odibi.utils.logging_context import (
    create_logging_context,
    set_logging_context,
)
from odibi.utils.progress import NodeStatus, PipelineProgress

@dataclass
class PipelineResults:
    """Results from pipeline execution."""

    pipeline_name: str
    completed: List[str] = field(default_factory=list)
    failed: List[str] = field(default_factory=list)
    skipped: List[str] = field(default_factory=list)
    node_results: Dict[str, NodeResult] = field(default_factory=dict)
    duration: float = 0.0
    start_time: Optional[str] = None
    end_time: Optional[str] = None
    story_path: Optional[str] = None

    def get_node_result(self, name: str) -> Optional[NodeResult]:
        """Get result for specific node.

        Args:
            name: Node name

        Returns:
            NodeResult if available, None otherwise
        """
        return self.node_results.get(name)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary.

        Returns:
            Dictionary representation
        """
        return {
            "pipeline_name": self.pipeline_name,
            "completed": self.completed,
            "failed": self.failed,
            "skipped": self.skipped,
            "duration": self.duration,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "node_count": len(self.node_results),
        }
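
# A consumption sketch (illustrative, not part of the file's API surface):
# `results` is whatever Pipeline.run() returned, and the node name "orders"
# is hypothetical.
#
#     summary = results.to_dict()                 # plain-dict summary of the run
#     orders = results.get_node_result("orders")  # None if the node never ran
#     if orders is not None and not orders.success:
#         print(orders.error)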

class Pipeline:
    """Pipeline executor and orchestrator."""

    def __init__(
        self,
        pipeline_config: PipelineConfig,
        engine: str = "pandas",
        connections: Optional[Dict[str, Any]] = None,
        generate_story: bool = True,
        story_config: Optional[Dict[str, Any]] = None,
        retry_config: Optional[RetryConfig] = None,
        alerts: Optional[List[AlertConfig]] = None,
        performance_config: Optional[Any] = None,
        catalog_manager: Optional[Any] = None,
        lineage_adapter: Optional[Any] = None,
    ):
        """Initialize pipeline.

        Args:
            pipeline_config: Pipeline configuration
            engine: Engine type ('pandas' or 'spark')
            connections: Available connections
            generate_story: Whether to generate execution stories
            story_config: Story generator configuration
            retry_config: Retry configuration
            alerts: Alert configurations
            performance_config: Performance tuning configuration
            catalog_manager: System Catalog Manager (Phase 1)
            lineage_adapter: OpenLineage Adapter
        """
        self.config = pipeline_config
        self.project_config = None  # Set by PipelineManager if available
        self.engine_type = engine
        self.connections = connections or {}
        self.generate_story = generate_story
        self.retry_config = retry_config
        self.alerts = alerts or []
        self.performance_config = performance_config
        self.catalog_manager = catalog_manager
        self.lineage = lineage_adapter

        # Batch write buffers to collect catalog writes during execution.
        # These are flushed at pipeline end to eliminate concurrency conflicts.
        self._pending_lineage_records: List[Dict[str, Any]] = []
        self._pending_asset_records: List[Dict[str, Any]] = []
        self._pending_hwm_updates: List[Dict[str, Any]] = []
        self._batch_mode_enabled: bool = True  # Enable batch mode by default

        # Track async story futures for flush_stories()
        self._story_future = None
        self._story_executor = None

        # Create logging context for this pipeline
        self._ctx = create_logging_context(
            pipeline_id=pipeline_config.pipeline,
            engine=engine,
        )

        self._ctx.info(
            f"Initializing pipeline: {pipeline_config.pipeline}",
            engine=engine,
            node_count=len(pipeline_config.nodes),
            connections=list(self.connections.keys()) if self.connections else [],
        )

        # Initialize story generator
        story_config = story_config or {}
        self.story_config = story_config  # Store for async_generation check

        self.story_generator = StoryGenerator(
            pipeline_name=pipeline_config.pipeline,
            max_sample_rows=story_config.get("max_sample_rows", 10),
            output_path=story_config.get("output_path", "stories/"),
            storage_options=story_config.get("storage_options", {}),
            catalog_manager=catalog_manager,
        )

        # Initialize engine
        engine_config = {}
        if performance_config:
            if hasattr(performance_config, "model_dump"):
                engine_config["performance"] = performance_config.model_dump()
            elif hasattr(performance_config, "dict"):
                engine_config["performance"] = performance_config.dict()
            else:
                engine_config["performance"] = performance_config

        try:
            EngineClass = get_engine_class(engine)
        except ValueError as e:
            # Handle Spark special case message
            if engine == "spark":
                raise ImportError(
                    "Spark engine not available. "
                    "Install with 'pip install odibi[spark]' or ensure pyspark is installed."
                )
            raise e

        if engine == "spark":
            # SparkEngine can take an existing session if needed, but here we let it create/get one.
            # We might need to pass connections to it for ADLS auth config.
            self.engine = EngineClass(connections=connections, config=engine_config)
        else:
            self.engine = EngineClass(config=engine_config)

        self._ctx.debug(f"Engine initialized: {engine}")

        # Initialize context
        spark_session = getattr(self.engine, "spark", None)
        self.context = create_context(engine, spark_session=spark_session)

        # Build dependency graph
        self.graph = DependencyGraph(pipeline_config.nodes)

        # Log graph structure
        layers = self.graph.get_execution_layers()
        edge_count = sum(len(n.depends_on) for n in pipeline_config.nodes)
        self._ctx.log_graph_operation(
            operation="build",
            node_count=len(pipeline_config.nodes),
            edge_count=edge_count,
            layer_count=len(layers),
        )
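
    # A minimal programmatic construction sketch; `cfg` is assumed to be an
    # already-validated PipelineConfig. Most callers should prefer
    # Pipeline.from_yaml(), which wires up connections and project config.
    #
    #     pipeline = Pipeline(cfg, engine="pandas")
    #     with pipeline as p:  # the context manager closes connections on exit
    #         results = p.run(console=True)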

    def __enter__(self) -> "Pipeline":
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit - cleanup connections."""
        self._cleanup_connections()

    def _cleanup_connections(self) -> None:
        """Clean up all connection resources."""
        if not self.connections:
            return

        for name, conn in self.connections.items():
            if hasattr(conn, "close"):
                try:
                    conn.close()
                    self._ctx.debug(f"Closed connection: {name}")
                except Exception as e:
                    self._ctx.warning(f"Failed to close connection {name}: {e}", exc_info=True)

    @classmethod
    def from_yaml(cls, yaml_path: str) -> "PipelineManager":
        """Create PipelineManager from YAML file (recommended).

        This method now returns a PipelineManager that can run all or specific pipelines.

        Args:
            yaml_path: Path to YAML configuration file

        Returns:
            PipelineManager instance (use .run() to execute)

        Example:
            >>> from odibi.pipeline import Pipeline
            >>> manager = Pipeline.from_yaml("config.yaml")
            >>> results = manager.run()  # Run all pipelines
            >>> results = manager.run('bronze_to_silver')  # Run specific pipeline

        Note:
            For direct access to PipelineManager class:
            >>> from odibi.pipeline import PipelineManager
            >>> manager = PipelineManager.from_yaml("config.yaml")
        """
        # Delegate to PipelineManager
        return PipelineManager.from_yaml(yaml_path)

    def register_outputs(self) -> int:
        """Pre-register node outputs from pipeline config without running the pipeline.

        Scans pipeline nodes for output locations (write blocks, merge/scd2 params)
        and registers them to meta_outputs. This enables cross-pipeline references
        without requiring the source pipeline to have run first.

        Returns:
            Number of outputs registered

        Example:
            >>> pipeline = Pipeline(config, engine="spark", catalog_manager=catalog)
            >>> count = pipeline.register_outputs()
            >>> print(f"Registered {count} outputs")
        """
        if not self.catalog_manager:
            self._ctx.warning("No catalog_manager configured, cannot register outputs")
            return 0

        count = self.catalog_manager.register_outputs_from_config(self.config)
        self._ctx.info(f"Pre-registered {count} outputs from pipeline config")
        return count

    def run(
        self,
        parallel: bool = False,
        dry_run: bool = False,
        resume_from_failure: bool = False,
        max_workers: int = 4,
        on_error: Optional[str] = None,
        tag: Optional[str] = None,
        node: Optional[Union[str, List[str]]] = None,
        console: bool = False,
    ) -> PipelineResults:
        """Execute the pipeline.

        Args:
            parallel: Whether to use parallel execution
            dry_run: Whether to simulate execution without running operations
            resume_from_failure: Whether to skip successfully completed nodes from the last run
            max_workers: Maximum number of parallel threads (default: 4)
            on_error: Override error handling strategy
            tag: Filter nodes by tag (only nodes with this tag will run)
            node: Run only specific node(s) by name - can be a string or list of strings
            console: Whether to show rich console output with progress

        Returns:
            PipelineResults with execution details
        """
        start_time = time.time()
        start_timestamp = datetime.now().isoformat()

        results = PipelineResults(pipeline_name=self.config.pipeline, start_time=start_timestamp)

        # Set global logging context for this pipeline run
        set_logging_context(self._ctx)

        # Pre-register outputs so cross-pipeline references can resolve on first run
        if self.catalog_manager:
            try:
                count = self.register_outputs()
                if count > 0:
                    self._ctx.debug(f"Pre-registered {count} outputs for reference resolution")
            except Exception as e:
                self._ctx.debug(f"Output pre-registration skipped: {e}")

        # Get execution plan info for logging
        layers = self.graph.get_execution_layers()
        execution_order = self.graph.topological_sort()

        # Apply node filters (--tag, --node)
        filtered_nodes = set(execution_order)
        if tag:
            filtered_nodes = {name for name in filtered_nodes if tag in self.graph.nodes[name].tags}
            self._ctx.info(f"Filtering by tag '{tag}': {len(filtered_nodes)} nodes match")
        if node:
            # Normalize to list
            node_list = [node] if isinstance(node, str) else node
            # Validate all nodes exist
            missing = [n for n in node_list if n not in self.graph.nodes]
            if missing:
                available = ", ".join(self.graph.nodes.keys())
                raise ValueError(f"Node(s) not found: {missing}. Available: {available}")
            # Auto-include all upstream dependencies
            filtered_nodes = set(node_list)
            for n in node_list:
                deps = self.graph.get_dependencies(n)
                filtered_nodes.update(deps)
            if len(filtered_nodes) > len(node_list):
                dep_count = len(filtered_nodes) - len(node_list)
                self._ctx.info(f"Running node(s): {node_list} (+ {dep_count} dependencies)")
            else:
                self._ctx.info(f"Running specific node(s): {node_list}")

        # Update execution order to only include filtered nodes
        execution_order = [n for n in execution_order if n in filtered_nodes]
        layers = [[n for n in layer if n in filtered_nodes] for layer in layers]
        layers = [layer for layer in layers if layer]  # Remove empty layers

        self._ctx.info(
            f"Starting pipeline: {self.config.pipeline}",
            mode="parallel" if parallel else "serial",
            dry_run=dry_run,
            resume_from_failure=resume_from_failure,
            node_count=len(self.graph.nodes),
            layer_count=len(layers),
            max_workers=max_workers if parallel else 1,
        )

        if parallel:
            self._ctx.debug(
                f"Parallel execution plan: {len(layers)} layers",
                layers=[list(layer) for layer in layers],
            )
        else:
            self._ctx.debug(
                f"Serial execution order: {len(execution_order)} nodes",
                order=execution_order,
            )

        # Initialize progress tracker for console output
        progress: Optional[PipelineProgress] = None
        if console:
            progress = PipelineProgress(
                pipeline_name=self.config.pipeline,
                node_names=execution_order,
                engine=self.engine_type,
            )
            progress.start()

        # Alert: on_start
        self._send_alerts("on_start", results)

        # Lineage: Start
        parent_run_id = None
        if self.lineage:
            parent_run_id = self.lineage.emit_pipeline_start(self.config)
        # Drift Detection (Governance)
        if self.catalog_manager:
            try:
                import hashlib
                import json

                # Calculate Local Hash
                if hasattr(self.config, "model_dump"):
                    dump = self.config.model_dump(mode="json")
                else:
                    dump = self.config.dict()
                dump_str = json.dumps(dump, sort_keys=True)
                local_hash = hashlib.md5(dump_str.encode("utf-8")).hexdigest()

                # Get Remote Hash
                remote_hash = self.catalog_manager.get_pipeline_hash(self.config.pipeline)

                if remote_hash and remote_hash != local_hash:
                    self._ctx.warning(
                        "DRIFT DETECTED: Local pipeline differs from Catalog",
                        local_hash=local_hash[:8],
                        catalog_hash=remote_hash[:8],
                        suggestion="Deploy changes using 'odibi deploy' before production",
                    )
                elif not remote_hash:
                    self._ctx.info(
                        "Pipeline not found in Catalog (Running un-deployed code)",
                        catalog_status="not_deployed",
                    )
                else:
                    self._ctx.debug(
                        "Drift check passed",
                        hash=local_hash[:8],
                    )
            except Exception as e:
                self._ctx.debug(f"Drift detection check failed: {e}")

        state_manager = None
        if resume_from_failure:
            self._ctx.info("Resume from failure enabled - checking previous run state")
            if self.project_config:
                try:
                    backend = create_state_backend(
                        config=self.project_config,
                        project_root=".",
                        spark_session=getattr(self.engine, "spark", None),
                    )
                    state_manager = StateManager(backend=backend)
                    self._ctx.debug("StateManager initialized for resume capability")
                except Exception as e:
                    self._ctx.warning(
                        f"Could not initialize StateManager: {e}",
                        suggestion="Check state backend configuration",
                    )
            else:
                self._ctx.warning(
                    "Resume capability unavailable: Project configuration missing",
                    suggestion="Ensure project config is set for resume support",
                )
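
        # The drift check above boils down to a deterministic fingerprint of the
        # config; a standalone sketch of the same idea, assuming `config_dict` is
        # any JSON-serializable dict:
        #
        #     import hashlib, json
        #     fingerprint = hashlib.md5(
        #         json.dumps(config_dict, sort_keys=True).encode("utf-8")
        #     ).hexdigest()
        #
        # sort_keys=True makes the hash insensitive to key ordering, so only a
        # real configuration change produces a new fingerprint.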

        # Define node processing function (inner function to capture self/context)
        def process_node(node_name: str) -> NodeResult:
            node_ctx = self._ctx.with_context(node_id=node_name)

            node_config = self.graph.nodes[node_name]
            deps_failed_list = [dep for dep in node_config.depends_on if dep in results.failed]
            deps_failed = len(deps_failed_list) > 0

            if deps_failed:
                node_ctx.warning(
                    "Skipping node due to dependency failure",
                    skipped=True,
                    failed_dependencies=deps_failed_list,
                    suggestion="Fix upstream node failures first",
                )
                return NodeResult(
                    node_name=node_name,
                    success=False,
                    duration=0.0,
                    metadata={"skipped": True, "reason": "dependency_failed"},
                )

            # Check for resume capability
            if resume_from_failure and state_manager:
                last_info = state_manager.get_last_run_info(self.config.pipeline, node_name)

                can_resume = False
                resume_reason = ""

                if last_info and last_info.get("success"):
                    last_hash = last_info.get("metadata", {}).get("version_hash")

                    from odibi.utils.hashing import calculate_node_hash

                    node_cfg = self.graph.nodes[node_name]
                    current_hash = calculate_node_hash(node_cfg)

                    if last_hash == current_hash:
                        deps_ran = False
                        for dep in node_config.depends_on:
                            if dep in results.completed and dep not in results.skipped:
                                deps_ran = True
                                break

                        if not deps_ran:
                            can_resume = True
                            resume_reason = "Previously succeeded and restored from storage"
                        else:
                            resume_reason = "Upstream dependency executed"
                    else:
                        resume_reason = (
                            f"Configuration changed (Hash: {str(last_hash)[:7]}... "
                            f"!= {str(current_hash)[:7]}...)"
                        )
                else:
                    resume_reason = "No successful previous run found"

                if can_resume:
                    if node_config.write:
                        try:
                            temp_node = Node(
                                config=node_config,
                                context=self.context,
                                engine=self.engine,
                                connections=self.connections,
                                performance_config=self.performance_config,
                                pipeline_name=self.config.pipeline,
                            )
                            if temp_node.restore():
                                node_ctx.info(
                                    "Skipping node (restored from previous run)",
                                    skipped=True,
                                    reason="resume_from_failure",
                                    version_hash=current_hash[:8],
                                )
                                result = NodeResult(
                                    node_name=node_name,
                                    success=True,
                                    duration=0.0,
                                    metadata={
                                        "skipped": True,
                                        "reason": "resume_from_failure",
                                        "version_hash": current_hash,
                                    },
                                )
                                return result
                            else:
                                node_ctx.debug(
                                    "Re-running node: Restore failed",
                                    reason="restore_failed",
                                )
                        except Exception as e:
                            node_ctx.warning(
                                f"Could not restore node: {e}",
                                reason="restore_error",
                            )
                    else:
                        node_ctx.debug(
                            "Re-running node: In-memory transform (cannot be restored)",
                            reason="no_write_config",
                        )
                else:
                    node_ctx.debug(f"Re-running node: {resume_reason}")

            # Lineage: Node Start
            node_run_id = None
            if self.lineage and parent_run_id:
                node_run_id = self.lineage.emit_node_start(node_config, parent_run_id)

            # Execute node with operation context
            result = None
            node_start = time.time()
            node_ctx.debug(
                "Executing node",
                transformer=node_config.transformer,
                has_read=bool(node_config.read),
                has_write=bool(node_config.write),
            )

            try:
                # Prepare batch write buffers for eliminating concurrency conflicts
                batch_buffers = None
                if self._batch_mode_enabled:
                    batch_buffers = {
                        "lineage": self._pending_lineage_records,
                        "assets": self._pending_asset_records,
                        "hwm": self._pending_hwm_updates,
                    }

                node = Node(
                    config=node_config,
                    context=self.context,
                    engine=self.engine,
                    connections=self.connections,
                    dry_run=dry_run,
                    retry_config=self.retry_config,
                    catalog_manager=self.catalog_manager,
                    performance_config=self.performance_config,
                    pipeline_name=self.config.pipeline,
                    batch_write_buffers=batch_buffers,
                    config_file=node_config.source_yaml,
                )
                result = node.execute()

                node_duration = time.time() - node_start
                if result.success:
                    node_ctx.info(
                        "Node completed successfully",
                        duration_ms=round(node_duration * 1000, 2),
                        rows_processed=result.rows_processed,
                    )
                else:
                    node_ctx.error(
                        "Node execution failed",
                        duration_ms=round(node_duration * 1000, 2),
                        error=result.error,
                    )

            except Exception as e:
                node_duration = time.time() - node_start
                node_ctx.error(
                    f"Node raised exception: {e}",
                    duration_ms=round(node_duration * 1000, 2),
                    error_type=type(e).__name__,
                    suggestion="Check node configuration and input data",
                )
                result = NodeResult(node_name=node_name, success=False, duration=0.0, error=str(e))

            # Lineage: Node Complete
            if self.lineage and node_run_id:
                self.lineage.emit_node_complete(node_config, result, node_run_id)

            return result

        if parallel:
            from concurrent.futures import ThreadPoolExecutor, as_completed

            # NOTE: 'layers' already filtered by node/tag above - don't re-fetch from graph
            self._ctx.info(
                f"Starting parallel execution with {max_workers} workers",
                total_layers=len(layers),
                max_workers=max_workers,
            )

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                for layer_idx, layer in enumerate(layers):
                    layer_start = time.time()
                    self._ctx.debug(
                        f"Executing layer {layer_idx + 1}/{len(layers)}",
                        layer_index=layer_idx,
                        nodes_in_layer=list(layer),
                        node_count=len(layer),
                    )

                    future_to_node = {
                        executor.submit(process_node, node_name): node_name for node_name in layer
                    }

                    layer_failed = False
                    for future in as_completed(future_to_node):
                        node_name = future_to_node[future]
                        try:
                            result = future.result()
                            results.node_results[node_name] = result

                            if result.success:
                                if result.metadata.get("skipped"):
                                    if result.metadata.get("reason") == "dependency_failed":
                                        results.skipped.append(node_name)
                                        if progress:
                                            progress.update_node(
                                                node_name,
                                                NodeStatus.SKIPPED,
                                                result.duration,
                                                result.rows_processed,
                                            )
                                    else:
                                        results.completed.append(node_name)
                                        if progress:
                                            progress.update_node(
                                                node_name,
                                                NodeStatus.SKIPPED,
                                                result.duration,
                                                result.rows_processed,
                                            )
                                else:
                                    results.completed.append(node_name)
                                    if progress:
                                        progress.update_node(
                                            node_name,
                                            NodeStatus.SUCCESS,
                                            result.duration,
                                            result.rows_processed,
                                            result.metadata.get("phase_timings_ms"),
                                        )
                            else:
                                if result.metadata.get("skipped"):
                                    results.skipped.append(node_name)
                                    if progress:
                                        progress.update_node(
                                            node_name,
                                            NodeStatus.SKIPPED,
                                            result.duration,
                                            result.rows_processed,
                                        )
                                else:
                                    results.failed.append(node_name)
                                    layer_failed = True
                                    if progress:
                                        progress.update_node(
                                            node_name,
                                            NodeStatus.FAILED,
                                            result.duration,
                                            result.rows_processed,
                                        )

                                    node_config = self.graph.nodes[node_name]
                                    strategy = (
                                        ErrorStrategy(on_error)
                                        if on_error
                                        else node_config.on_error
                                    )

                                    if strategy == ErrorStrategy.FAIL_FAST:
                                        self._ctx.error(
                                            "FAIL_FAST triggered: Stopping pipeline",
                                            failed_node=node_name,
                                            error=result.error,
                                            remaining_nodes=len(future_to_node) - 1,
                                        )
                                        executor.shutdown(cancel_futures=True, wait=False)
                                        break

                        except Exception as exc:
                            self._ctx.error(
                                "Node generated exception",
                                node=node_name,
                                error=str(exc),
                                error_type=type(exc).__name__,
                            )
                            results.failed.append(node_name)
                            layer_failed = True
                            if progress:
                                progress.update_node(node_name, NodeStatus.FAILED)

                            node_config = self.graph.nodes[node_name]
                            strategy = ErrorStrategy(on_error) if on_error else node_config.on_error
                            if strategy == ErrorStrategy.FAIL_FAST:
                                self._ctx.error(
                                    "FAIL_FAST triggered: Stopping pipeline",
                                    failed_node=node_name,
                                )
                                executor.shutdown(cancel_futures=True, wait=False)
                                break

                    layer_duration = time.time() - layer_start
                    self._ctx.debug(
                        f"Layer {layer_idx + 1} completed",
                        layer_index=layer_idx,
                        duration_ms=round(layer_duration * 1000, 2),
                        layer_failed=layer_failed,
                    )

                    if layer_failed:
                        for failed_node in results.failed:
                            if self.graph.nodes[failed_node].on_error == ErrorStrategy.FAIL_FAST:
                                return results

        else:
            self._ctx.info("Starting serial execution")
            # 'execution_order' was already filtered by node/tag above - don't re-fetch
            for idx, node_name in enumerate(execution_order):
                self._ctx.debug(
                    f"Executing node {idx + 1}/{len(execution_order)}",
                    node=node_name,
                    order=idx + 1,
                    total=len(execution_order),
                )

                result = process_node(node_name)
                results.node_results[node_name] = result

                if result.success:
                    if (
                        result.metadata.get("skipped")
                        and result.metadata.get("reason") == "dependency_failed"
                    ):
                        results.skipped.append(node_name)
                        results.failed.append(node_name)
                        if progress:
                            progress.update_node(
                                node_name,
                                NodeStatus.SKIPPED,
                                result.duration,
                                result.rows_processed,
                            )
                    else:
                        results.completed.append(node_name)
                        if progress:
                            status = (
                                NodeStatus.SKIPPED
                                if result.metadata.get("skipped")
                                else NodeStatus.SUCCESS
                            )
                            progress.update_node(
                                node_name,
                                status,
                                result.duration,
                                result.rows_processed,
                            )
                else:
                    if result.metadata.get("skipped"):
                        results.skipped.append(node_name)
                        results.failed.append(node_name)
                        if progress:
                            progress.update_node(
                                node_name,
                                NodeStatus.SKIPPED,
                                result.duration,
                                result.rows_processed,
                            )
                    else:
                        results.failed.append(node_name)
                        if progress:
                            progress.update_node(
                                node_name,
                                NodeStatus.FAILED,
                                result.duration,
                                result.rows_processed,
                            )

                    node_config = self.graph.nodes[node_name]
                    strategy = ErrorStrategy(on_error) if on_error else node_config.on_error

                    if strategy == ErrorStrategy.FAIL_FAST:
                        self._ctx.error(
                            "FAIL_FAST triggered: Stopping pipeline",
                            failed_node=node_name,
                            error=result.error,
                            remaining_nodes=len(execution_order) - idx - 1,
                        )
                        break

        # Calculate duration
        results.duration = time.time() - start_time
        results.end_time = datetime.now().isoformat()

        # Batch write run records to catalog (much faster than per-node writes).
        # Skip if performance.skip_run_logging is enabled.
        skip_run_logging = self.performance_config and getattr(
            self.performance_config, "skip_run_logging", False
        )
        if self.catalog_manager and not skip_run_logging:
            run_records = []
            for node_result in results.node_results.values():
                if node_result.metadata and "_run_record" in node_result.metadata:
                    run_records.append(node_result.metadata.pop("_run_record"))
            if run_records:
                self.catalog_manager.log_runs_batch(run_records)
                self._ctx.debug(
                    f"Batch logged {len(run_records)} run records",
                    record_count=len(run_records),
                )

            # Batch write output metadata for cross-pipeline dependencies
            output_records = []
            for node_result in results.node_results.values():
                if node_result.metadata and "_output_record" in node_result.metadata:
                    output_records.append(node_result.metadata.pop("_output_record"))
            if output_records:
                try:
                    self.catalog_manager.register_outputs_batch(output_records)
                    self._ctx.debug(
                        f"Batch registered {len(output_records)} output(s)",
                        output_count=len(output_records),
                    )
                except Exception as e:
                    self._ctx.warning(
                        f"Failed to register outputs (non-fatal): {e}",
                        error_type=type(e).__name__,
                    )

            # Flush buffered catalog writes (lineage, assets, HWM)
            self._flush_batch_writes()

        elif skip_run_logging:
            self._ctx.debug("Skipping run logging (skip_run_logging=true)")

        # Finish progress display
        if progress:
            progress.finish(
                completed=len(results.completed),
                failed=len(results.failed),
                skipped=len(results.skipped),
                duration=results.duration,
            )
            # Print phase timing breakdown for performance analysis
            progress.print_phase_timing_report(pipeline_duration_s=results.duration)

        # Log pipeline completion summary
        status = "SUCCESS" if not results.failed else "FAILED"
        self._ctx.info(
            f"Pipeline {status}: {self.config.pipeline}",
            status=status,
            duration_s=round(results.duration, 2),
            completed=len(results.completed),
            failed=len(results.failed),
            skipped=len(results.skipped),
            total_nodes=len(self.graph.nodes),
        )

        # Start story generation in a background thread (pure Python/file I/O, safe
        # to parallelize). This runs concurrently with state saving below.
        story_future = None
        story_executor = None
        async_story = self.story_config.get("async_generation", False)

        if self.generate_story:
            from concurrent.futures import ThreadPoolExecutor

            if hasattr(self.config, "model_dump"):
                config_dump = self.config.model_dump(mode="json")
            else:
                config_dump = self.config.dict()

            if self.project_config:
                project_dump = (
                    self.project_config.model_dump(mode="json")
                    if hasattr(self.project_config, "model_dump")
                    else self.project_config.dict()
                )
                for field in ["project", "plant", "asset", "business_unit", "layer"]:
                    if field in project_dump and project_dump[field]:
                        config_dump[field] = project_dump[field]

            def generate_story():
                try:
                    # Get graph data for interactive DAG visualization
                    graph_data_dict = self.graph.to_dict() if self.graph else None

                    return self.story_generator.generate(
                        node_results=results.node_results,
                        completed=results.completed,
                        failed=results.failed,
                        skipped=results.skipped,
                        duration=results.duration,
                        start_time=results.start_time,
                        end_time=results.end_time,
                        context=self.context,
                        config=config_dump,
                        graph_data=graph_data_dict,
                    )
                except Exception as e:
                    self._ctx.warning(f"Story generation failed: {e}")
                    return None

            story_executor = ThreadPoolExecutor(max_workers=1)
            story_future = story_executor.submit(generate_story)

        # Save state if running normally (not dry run).
        # This runs while story generation happens in the background.
        if not dry_run:
            if not state_manager and self.project_config:
                try:
                    backend = create_state_backend(
                        config=self.project_config,
                        project_root=".",
                        spark_session=getattr(self.engine, "spark", None),
                    )
                    state_manager = StateManager(backend=backend)
                except Exception as e:
                    self._ctx.warning(
                        f"Could not initialize StateManager for saving run: {e}",
                        suggestion="Check state backend configuration",
                    )

            if state_manager:
                state_manager.save_pipeline_run(self.config.pipeline, results)
                self._ctx.debug("Pipeline run state saved")

        # Handle story completion based on async_generation setting
        if story_future:
            if async_story:
                # Store future and executor for flush_stories()
                self._story_future = story_future
                self._story_executor = story_executor
                self._ctx.debug("Story generation running async (can be flushed later)")
            else:
                # Wait for story generation to complete
                try:
                    story_path = story_future.result(timeout=60)
                    if story_path:
                        results.story_path = story_path
                        self._ctx.info("Story generated", story_path=story_path)
                except Exception as e:
                    self._ctx.warning(f"Story generation failed: {e}")
                finally:
                    if story_executor:
                        story_executor.shutdown(wait=False)

        # Alert: on_success / on_failure
        if results.failed:
            self._send_alerts("on_failure", results)
        else:
            self._send_alerts("on_success", results)

        # Catalog optimization (optional - can be slow, ~15-20s).
        # Only run if explicitly enabled via the optimize_catalog flag.
        if self.catalog_manager and getattr(self, "optimize_catalog", False):
            self.catalog_manager.optimize()
            self._ctx.debug("Catalog optimized")

        # Lineage: Complete
        if self.lineage:
            self.lineage.emit_pipeline_complete(self.config, results)

        return results
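
    # Usage sketches for run() (illustrative; node and tag names are hypothetical,
    # and the on_error string must match an ErrorStrategy value):
    #
    #     results = pipeline.run(parallel=True, max_workers=8, console=True)
    #     results = pipeline.run(node="silver_orders")  # also pulls in upstream deps
    #     results = pipeline.run(tag="bronze", resume_from_failure=True)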

    def flush_stories(self, timeout: float = 60.0) -> Optional[str]:
        """Wait for any pending async story generation to complete.

        Call this before operations that need story files to be written,
        such as lineage generation.

        Args:
            timeout: Maximum seconds to wait for story generation

        Returns:
            Story path if generated, None otherwise
        """
        if self._story_future is None:
            return None

        try:
            story_path = self._story_future.result(timeout=timeout)
            self._ctx.info("Async story generation completed", story_path=story_path)
            return story_path
        except Exception as e:
            self._ctx.warning(f"Async story generation failed: {e}")
            return None
        finally:
            if self._story_executor:
                self._story_executor.shutdown(wait=False)
            self._story_future = None
            self._story_executor = None
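
    # Async story sketch; assumes the pipeline was built with
    # story_config={"async_generation": True}:
    #
    #     results = pipeline.run()                      # returns without waiting
    #     story_path = pipeline.flush_stories(timeout=120.0)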

    def _send_alerts(self, event: str, results: PipelineResults) -> None:
        """Send alerts for a specific event.

        Args:
            event: Event name (on_start, on_success, on_failure)
            results: Pipeline results
        """
        for alert_config in self.alerts:
            event_values = [e.value if hasattr(e, "value") else e for e in alert_config.on_events]
            if event in event_values:
                status = "FAILED" if results.failed else "SUCCESS"
                if event == "on_start":
                    status = "STARTED"

                context = {
                    "pipeline": self.config.pipeline,
                    "status": status,
                    "duration": results.duration,
                    "timestamp": datetime.now().isoformat(),
                    "project_config": self.project_config,
                    "event_type": event,
                }

                # Enrich with story summary (row counts, story URL)
                if event != "on_start" and self.generate_story:
                    story_summary = self.story_generator.get_alert_summary()
                    context.update(story_summary)

                msg = f"Pipeline '{self.config.pipeline}' {status}"
                if results.failed:
                    msg += f". Failed nodes: {', '.join(results.failed)}"

                send_alert(alert_config, msg, context)

    def buffer_lineage_record(self, record: Dict[str, Any]) -> None:
        """Buffer a lineage record for batch write at pipeline end.

        Args:
            record: Dict with keys: source_table, target_table, target_pipeline,
                target_node, run_id, and optional source_pipeline, source_node
        """
        self._pending_lineage_records.append(record)

    def buffer_asset_record(self, record: Dict[str, Any]) -> None:
        """Buffer an asset registration record for batch write at pipeline end.

        Args:
            record: Dict with keys: project_name, table_name, path, format,
                pattern_type, and optional schema_hash
        """
        self._pending_asset_records.append(record)

    def buffer_hwm_update(self, key: str, value: Any) -> None:
        """Buffer an HWM update for batch write at pipeline end.

        Args:
            key: HWM state key
            value: HWM value
        """
        self._pending_hwm_updates.append({"key": key, "value": value})
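
    # Shape of a buffered lineage record (illustrative values only; the required
    # keys are the ones listed in the docstring above):
    #
    #     pipeline.buffer_lineage_record({
    #         "source_table": "bronze.orders",
    #         "target_table": "silver.orders",
    #         "target_pipeline": "bronze_to_silver",
    #         "target_node": "orders",
    #         "run_id": run_id,
    #     })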

    def _flush_batch_writes(self) -> None:
        """Flush all buffered catalog writes in single batch operations.

        This eliminates concurrency conflicts when running 35+ parallel nodes
        by writing all lineage, assets, and HWM updates at once.
        """
        if not self.catalog_manager:
            return

        # Flush lineage records
        if self._pending_lineage_records:
            try:
                self.catalog_manager.record_lineage_batch(self._pending_lineage_records)
                self._ctx.debug(
                    f"Batch recorded {len(self._pending_lineage_records)} lineage relationship(s)",
                    lineage_count=len(self._pending_lineage_records),
                )
            except Exception as e:
                self._ctx.warning(
                    f"Failed to batch record lineage (non-fatal): {e}",
                    error_type=type(e).__name__,
                )
            finally:
                self._pending_lineage_records = []

        # Flush asset records
        if self._pending_asset_records:
            try:
                self.catalog_manager.register_assets_batch(self._pending_asset_records)
                self._ctx.debug(
                    f"Batch registered {len(self._pending_asset_records)} asset(s)",
                    asset_count=len(self._pending_asset_records),
                )
            except Exception as e:
                self._ctx.warning(
                    f"Failed to batch register assets (non-fatal): {e}",
                    error_type=type(e).__name__,
                )
            finally:
                self._pending_asset_records = []

        # Flush HWM updates
        if self._pending_hwm_updates:
            try:
                if self.project_config:
                    backend = create_state_backend(
                        config=self.project_config,
                        project_root=".",
                        spark_session=getattr(self.engine, "spark", None),
                    )
                    state_manager = StateManager(backend=backend)
                    state_manager.set_hwm_batch(self._pending_hwm_updates)
                    self._ctx.debug(
                        f"Batch updated {len(self._pending_hwm_updates)} HWM value(s)",
                        hwm_count=len(self._pending_hwm_updates),
                    )
            except Exception as e:
                self._ctx.warning(
                    f"Failed to batch update HWM (non-fatal): {e}",
                    error_type=type(e).__name__,
                )
            finally:
                self._pending_hwm_updates = []

    def run_node(self, node_name: str, mock_data: Optional[Dict[str, Any]] = None) -> NodeResult:
        """Execute a single node (for testing/debugging).

        Args:
            node_name: Name of node to execute
            mock_data: Optional mock data to register in context

        Returns:
            NodeResult
        """
        if node_name not in self.graph.nodes:
            available = ", ".join(self.graph.nodes.keys()) or "none"
            raise ValueError(
                f"Node '{node_name}' not found in pipeline. Available nodes: {available}"
            )

        # Register mock data if provided
        if mock_data:
            for name, data in mock_data.items():
                self.context.register(name, data)

        # Execute the node
        node_config = self.graph.nodes[node_name]
        node = Node(
            config=node_config,
            context=self.context,
            engine=self.engine,
            connections=self.connections,
            performance_config=self.performance_config,
            pipeline_name=self.config.pipeline,
            config_file=node_config.source_yaml,
        )

        return node.execute()
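
    # Testing sketch for run_node(); the node and input names are hypothetical,
    # and the mock value should match the configured engine (a pandas DataFrame
    # for the pandas engine):
    #
    #     import pandas as pd
    #     mock = {"raw_orders": pd.DataFrame({"id": [1, 2]})}
    #     result = pipeline.run_node("clean_orders", mock_data=mock)
    #     assert result.success, result.error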

    def validate(self) -> Dict[str, Any]:
        """Validate pipeline without executing.

        Returns:
            Validation results
        """
        self._ctx.info("Validating pipeline configuration")

        validation = {
            "valid": True,
            "errors": [],
            "warnings": [],
            "node_count": len(self.graph.nodes),
            "execution_order": [],
        }

        try:
            execution_order = self.graph.topological_sort()
            validation["execution_order"] = execution_order
            self._ctx.debug(
                "Dependency graph validated",
                execution_order=execution_order,
            )

            for node_name, node in self.graph.nodes.items():
                if node.transformer:
                    try:
                        FunctionRegistry.validate_params(node.transformer, node.params)
                    except ValueError as e:
                        validation["errors"].append(f"Node '{node_name}' transformer error: {e}")
                        validation["valid"] = False
                        self._ctx.log_validation_result(
                            passed=False,
                            rule_name=f"transformer_params:{node_name}",
                            failures=[str(e)],
                        )

                if node.transform and node.transform.steps:
                    for i, step in enumerate(node.transform.steps):
                        if isinstance(step, str):
                            continue

                        if hasattr(step, "function") and step.function:
                            try:
                                FunctionRegistry.validate_params(step.function, step.params)
                            except ValueError as e:
                                validation["errors"].append(
                                    f"Node '{node_name}' step {i + 1} error: {e}"
                                )
                                validation["valid"] = False
                                self._ctx.log_validation_result(
                                    passed=False,
                                    rule_name=f"step_params:{node_name}:step_{i + 1}",
                                    failures=[str(e)],
                                )

        except DependencyError as e:
            validation["valid"] = False
            validation["errors"].append(str(e))
            self._ctx.error(
                "Dependency graph validation failed",
                error=str(e),
            )

        for node in self.config.nodes:
            if node.read and node.read.connection not in self.connections:
                validation["warnings"].append(
                    f"Node '{node.name}': connection '{node.read.connection}' not configured"
                )
            if node.write and node.write.connection not in self.connections:
                validation["warnings"].append(
                    f"Node '{node.name}': connection '{node.write.connection}' not configured"
                )

        self._ctx.info(
            f"Validation {'passed' if validation['valid'] else 'failed'}",
            valid=validation["valid"],
            errors=len(validation["errors"]),
            warnings=len(validation["warnings"]),
        )

        return validation
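
    # Validation sketch (no nodes execute; the keys mirror the dict built above):
    #
    #     report = pipeline.validate()
    #     if not report["valid"]:
    #         for err in report["errors"]:
    #             print(err)
    #     print(report["execution_order"])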

    def get_execution_layers(self) -> List[List[str]]:
        """Get nodes grouped by execution layers.

        Returns:
            List of layers, where each layer is a list of node names
        """
        return self.graph.get_execution_layers()

    def visualize(self) -> str:
        """Get text visualization of pipeline.

        Returns:
            String representation of pipeline graph
        """
        return self.graph.visualize()
1294
|
+
class PipelineManager:
|
|
1295
|
+
"""Manages multiple pipelines from a YAML configuration."""
|
|
1296
|
+
|
|
1297
|
+
def __init__(
|
|
1298
|
+
self,
|
|
1299
|
+
project_config: ProjectConfig,
|
|
1300
|
+
connections: Dict[str, Any],
|
|
1301
|
+
):
|
|
1302
|
+
"""Initialize pipeline manager.
|
|
1303
|
+
|
|
1304
|
+
Args:
|
|
1305
|
+
project_config: Validated project configuration
|
|
1306
|
+
connections: Connection objects (already instantiated)
|
|
1307
|
+
"""
|
|
1308
|
+
self.project_config = project_config
|
|
1309
|
+
self.connections = connections
|
|
1310
|
+
self._pipelines: Dict[str, Pipeline] = {}
|
|
1311
|
+
self.catalog_manager = None
|
|
1312
|
+
self.lineage_adapter = None
|
|
1313
|
+
|
|
1314
|
+
# Configure logging
|
|
1315
|
+
configure_logging(
|
|
1316
|
+
structured=project_config.logging.structured, level=project_config.logging.level.value
|
|
1317
|
+
)
|
|
1318
|
+
|
|
1319
|
+
# Create manager-level logging context
|
|
1320
|
+
self._ctx = create_logging_context(engine=project_config.engine)
|
|
1321
|
+
|
|
1322
|
+
self._ctx.info(
|
|
1323
|
+
"Initializing PipelineManager",
|
|
1324
|
+
project=project_config.project,
|
|
1325
|
+
engine=project_config.engine,
|
|
1326
|
+
pipeline_count=len(project_config.pipelines),
|
|
1327
|
+
connection_count=len(connections),
|
|
1328
|
+
)
|
|
1329
|
+
|
|
1330
|
+
# Initialize Lineage Adapter
|
|
1331
|
+
self.lineage_adapter = OpenLineageAdapter(project_config.lineage)
|
|
1332
|
+
|
|
1333
|
+
# Initialize CatalogManager if configured
|
|
1334
|
+
if project_config.system:
|
|
1335
|
+
from odibi.catalog import CatalogManager
|
|
1336
|
+
|
|
1337
|
+
spark = None
|
|
1338
|
+
engine_instance = None
|
|
1339
|
+
|
|
1340
|
+
if project_config.engine == "spark":
|
|
1341
|
+
try:
|
|
1342
|
+
from odibi.engine.spark_engine import SparkEngine
|
|
1343
|
+
|
|
1344
|
+
temp_engine = SparkEngine(connections=connections, config={})
|
|
1345
|
+
spark = temp_engine.spark
|
|
1346
|
+
self._ctx.debug("Spark session initialized for System Catalog")
|
|
1347
|
+
except Exception as e:
|
|
1348
|
+
self._ctx.warning(
|
|
1349
|
+
f"Failed to initialize Spark for System Catalog: {e}",
|
|
1350
|
+
suggestion="Check Spark configuration",
|
|
1351
|
+
)
|
|
1352
|
+
|
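
            # Resolve the catalog's storage connection; if Spark was not
            # available above, fall back to a pandas-backed engine below so
            # the System Catalog still works in non-Spark environments.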
            sys_conn = connections.get(project_config.system.connection)
            if sys_conn:
                base_path = sys_conn.get_path(project_config.system.path)

                if not spark:
                    try:
                        from odibi.engine.pandas_engine import PandasEngine

                        engine_instance = PandasEngine(config={})
                        self._ctx.debug("PandasEngine initialized for System Catalog")
                    except Exception as e:
                        self._ctx.warning(
                            f"Failed to initialize PandasEngine for System Catalog: {e}"
                        )

                if spark or engine_instance:
                    self.catalog_manager = CatalogManager(
                        spark=spark,
                        config=project_config.system,
                        base_path=base_path,
                        engine=engine_instance,
                        connection=sys_conn,
                    )
                    self.catalog_manager.bootstrap()
                    self._ctx.info("System Catalog initialized", path=base_path)
            else:
                self._ctx.warning(
                    f"System connection '{project_config.system.connection}' not found",
                    suggestion="Configure the system connection in your config",
                )

        # Get story configuration
        story_config = self._get_story_config()

        # Create all pipeline instances
        self._ctx.debug(
            "Creating pipeline instances",
            pipelines=[p.pipeline for p in project_config.pipelines],
        )
        for pipeline_config in project_config.pipelines:
            pipeline_name = pipeline_config.pipeline

            self._pipelines[pipeline_name] = Pipeline(
                pipeline_config=pipeline_config,
                engine=project_config.engine,
                connections=connections,
                generate_story=story_config.get("auto_generate", True),
                story_config=story_config,
                retry_config=project_config.retry,
                alerts=project_config.alerts,
                performance_config=project_config.performance,
                catalog_manager=self.catalog_manager,
                lineage_adapter=self.lineage_adapter,
            )
            self._pipelines[pipeline_name].project_config = project_config

        self._ctx.info(
            "PipelineManager ready",
            pipelines=list(self._pipelines.keys()),
        )

    def _get_story_config(self) -> Dict[str, Any]:
        """Build story config from project_config.story.

        Resolves story output path using connection.

        Returns:
            Dictionary for StoryGenerator initialization
        """
        story_cfg = self.project_config.story

        # Resolve story path using connection
        story_conn = self.connections[story_cfg.connection]
        output_path = story_conn.get_path(story_cfg.path)

        # Get storage options (e.g., credentials) from connection if available
        storage_options = {}
        if hasattr(story_conn, "pandas_storage_options"):
            storage_options = story_conn.pandas_storage_options()
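
        # Resulting dict (values illustrative): {"auto_generate": True,
        #   "max_sample_rows": 10, "output_path": "<resolved via connection>",
        #   "storage_options": {...}, "async_generation": True}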
        return {
            "auto_generate": story_cfg.auto_generate,
            "max_sample_rows": story_cfg.max_sample_rows,
            "output_path": output_path,
            "storage_options": storage_options,
            "async_generation": story_cfg.async_generation,
        }

    @classmethod
    def from_yaml(cls, yaml_path: str, env: Optional[str] = None) -> "PipelineManager":
        """Create PipelineManager from YAML file.

        Args:
            yaml_path: Path to YAML configuration file
            env: Environment name to apply overrides (e.g. 'prod')

        Returns:
            PipelineManager instance ready to run pipelines

        Example:
            >>> manager = PipelineManager.from_yaml("config.yaml", env="prod")
            >>> results = manager.run()  # Run all pipelines
        """
        logger.info(f"Loading configuration from: {yaml_path}")

        register_standard_library()

        yaml_path_obj = Path(yaml_path)
        config_dir = yaml_path_obj.parent.absolute()

        import importlib.util
        import os
        import sys
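
        # Auto-discover user transforms: a transforms.py sitting next to the
        # YAML config (or in the current working directory) is imported so the
        # functions it registers are available before the config is validated.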
        def load_transforms_module(path):
            if os.path.exists(path):
                try:
                    spec = importlib.util.spec_from_file_location("transforms_autodiscovered", path)
                    if spec and spec.loader:
                        module = importlib.util.module_from_spec(spec)
                        sys.modules["transforms_autodiscovered"] = module
                        spec.loader.exec_module(module)
                        logger.info(f"Auto-loaded transforms from: {path}")
                except Exception as e:
                    logger.warning(f"Failed to auto-load transforms from {path}: {e}")

        load_transforms_module(os.path.join(config_dir, "transforms.py"))

        cwd = os.getcwd()
        if os.path.abspath(cwd) != str(config_dir):
            load_transforms_module(os.path.join(cwd, "transforms.py"))

        try:
            config = load_yaml_with_env(str(yaml_path_obj), env=env)
            logger.debug("Configuration loaded successfully")
        except FileNotFoundError:
            logger.error(f"YAML file not found: {yaml_path}")
            raise FileNotFoundError(
                f"YAML file not found: {yaml_path}. "
                f"Verify the file exists and consider using an absolute path."
            )

        project_config = ProjectConfig(**config)
        logger.debug(
            "Project config validated",
            project=project_config.project,
            pipelines=len(project_config.pipelines),
        )

        connections = cls._build_connections(project_config.connections)

        return cls(
            project_config=project_config,
            connections=connections,
        )

    @staticmethod
    def _build_connections(conn_configs: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
        """Convert connection configs to connection objects.

        Args:
            conn_configs: Connection configurations from ProjectConfig

        Returns:
            Dictionary of connection name -> connection object

        Raises:
            ValueError: If connection type is not supported
        """
        from odibi.connections.factory import register_builtins

        logger.debug(f"Building {len(conn_configs)} connections")

        connections = {}

        register_builtins()
        load_plugins()
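
        # A typical conn_configs entry looks like (name and fields are
        # illustrative; valid fields depend on the connection type):
        #   {"lake": {"type": "azure_adls", ...}, "scratch": {"type": "local"}}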
        for conn_name, conn_config in conn_configs.items():
            if hasattr(conn_config, "model_dump"):
                conn_config = conn_config.model_dump()
            elif hasattr(conn_config, "dict"):
                # Pydantic v1 models expose .dict() rather than .model_dump()
                conn_config = conn_config.dict()

            conn_type = conn_config.get("type", "local")

            factory = get_connection_factory(conn_type)
            if factory:
                try:
                    connections[conn_name] = factory(conn_name, conn_config)
                    logger.debug(
                        f"Connection created: {conn_name}",
                        type=conn_type,
                    )
                except Exception as e:
                    logger.error(
                        f"Failed to create connection '{conn_name}'",
                        type=conn_type,
                        error=str(e),
                    )
                    raise ValueError(
                        f"Failed to create connection '{conn_name}' (type={conn_type}): {e}"
                    ) from e
            else:
                logger.error(
                    f"Unsupported connection type: {conn_type}",
                    connection=conn_name,
                    suggestion="Check supported connection types in docs",
                )
                raise ValueError(
                    f"Unsupported connection type: {conn_type}. "
                    f"Supported types: local, azure_adls, azure_sql, delta, etc. "
                    f"See docs for connection setup."
                )

        try:
            from odibi.utils import configure_connections_parallel

            connections, errors = configure_connections_parallel(connections, verbose=False)
            if errors:
                for error in errors:
                    logger.warning(error)
        except ImportError:
            pass

        logger.info(f"Built {len(connections)} connections successfully")

        return connections

    def register_outputs(
        self,
        pipelines: Optional[Union[str, List[str]]] = None,
    ) -> Dict[str, int]:
        """
        Pre-register node outputs from pipeline configs without running them.

        Scans pipeline nodes for output locations (write blocks, merge/scd2 params)
        and registers them to meta_outputs. This enables cross-pipeline references
        without requiring the source pipelines to have run first.

        Args:
            pipelines: Pipeline name(s) to register. If None, registers all pipelines.

        Returns:
            Dict mapping pipeline name to number of outputs registered

        Example:
            >>> manager = PipelineManager.from_yaml("pipelines.yaml")
            >>> counts = manager.register_outputs("silver")  # Register just silver
            >>> counts = manager.register_outputs()  # Register all pipelines
        """
        if pipelines is None:
            pipeline_names = list(self._pipelines.keys())
        elif isinstance(pipelines, str):
            pipeline_names = [pipelines]
        else:
            pipeline_names = pipelines

        results = {}
        for name in pipeline_names:
            if name not in self._pipelines:
                self._ctx.warning(f"Pipeline not found: {name}")
                continue

            pipeline = self._pipelines[name]
            count = pipeline.register_outputs()
            results[name] = count

        total = sum(results.values())
        self._ctx.info(f"Pre-registered {total} outputs from {len(results)} pipelines")
        return results

    def run(
        self,
        pipelines: Optional[Union[str, List[str]]] = None,
        dry_run: bool = False,
        resume_from_failure: bool = False,
        parallel: bool = False,
        max_workers: int = 4,
        on_error: Optional[str] = None,
        tag: Optional[str] = None,
        node: Optional[Union[str, List[str]]] = None,
        console: bool = False,
    ) -> Union[PipelineResults, Dict[str, PipelineResults]]:
        """Run one, multiple, or all pipelines.

        Args:
            pipelines: Pipeline name(s) to run.
            dry_run: Whether to simulate execution.
            resume_from_failure: Whether to skip successfully completed nodes from last run.
            parallel: Whether to run nodes in parallel.
            max_workers: Maximum number of worker threads for parallel execution.
            on_error: Override error handling strategy (fail_fast, fail_later, ignore).
            tag: Filter nodes by tag (only nodes with this tag will run).
            node: Run only specific node(s) by name - can be a string or list of strings.
            console: Whether to show rich console output with progress.

        Returns:
            PipelineResults or Dict of results
        """
        if pipelines is None:
            pipeline_names = list(self._pipelines.keys())
        elif isinstance(pipelines, str):
            pipeline_names = [pipelines]
        else:
            pipeline_names = pipelines

        for name in pipeline_names:
            if name not in self._pipelines:
                available = ", ".join(self._pipelines.keys())
                self._ctx.error(
                    f"Pipeline not found: {name}",
                    available=list(self._pipelines.keys()),
                )
                raise ValueError(f"Pipeline '{name}' not found. Available pipelines: {available}")

        # Phase 2: Auto-register pipelines and nodes before execution
        if self.catalog_manager:
            self._auto_register_pipelines(pipeline_names)

        self._ctx.info(
            f"Running {len(pipeline_names)} pipeline(s)",
            pipelines=pipeline_names,
            dry_run=dry_run,
            parallel=parallel,
        )

        results = {}
        for idx, name in enumerate(pipeline_names):
            # Invalidate cache before each pipeline so it sees latest outputs
            if self.catalog_manager:
                self.catalog_manager.invalidate_cache()

            self._ctx.info(
                f"Executing pipeline {idx + 1}/{len(pipeline_names)}: {name}",
                pipeline=name,
                order=idx + 1,
            )

            results[name] = self._pipelines[name].run(
                dry_run=dry_run,
                resume_from_failure=resume_from_failure,
                parallel=parallel,
                max_workers=max_workers,
                on_error=on_error,
                tag=tag,
                node=node,
                console=console,
            )

            result = results[name]
            status = "SUCCESS" if not result.failed else "FAILED"
            self._ctx.info(
                f"Pipeline {status}: {name}",
                status=status,
                duration_s=round(result.duration, 2),
                completed=len(result.completed),
                failed=len(result.failed),
            )

            if result.story_path:
                self._ctx.debug(f"Story generated: {result.story_path}")

        # Generate combined lineage if configured
        has_story = hasattr(self.project_config, "story") and self.project_config.story
        generate_lineage_enabled = has_story and self.project_config.story.generate_lineage

        self._ctx.debug(
            "Lineage check",
            has_story=has_story,
            generate_lineage_enabled=generate_lineage_enabled,
        )

        if generate_lineage_enabled:
            # Flush any pending async story writes before generating lineage
            self._ctx.info("Generating combined lineage...")
            self.flush_stories()

            try:
                lineage_result = generate_lineage(self.project_config)
                if lineage_result:
                    self._ctx.info(
                        "Combined lineage generated",
                        nodes=len(lineage_result.nodes),
                        edges=len(lineage_result.edges),
                        json_path=lineage_result.json_path,
                    )
                else:
                    self._ctx.warning("Lineage generation returned None")
            except Exception as e:
                self._ctx.warning(f"Failed to generate combined lineage: {e}")

        if len(pipeline_names) == 1:
            return results[pipeline_names[0]]
        else:
            return results
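
    # Illustrative call patterns (a single name returns one PipelineResults,
    # per the branch above; multiple or no names return a dict keyed by name):
    #   manager.run()                            # all pipelines
    #   manager.run("bronze", parallel=True)     # one pipeline, parallel nodes
    #   manager.run(tag="daily", dry_run=True)   # simulate only tagged nodes
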
    def list_pipelines(self) -> List[str]:
        """Get list of available pipeline names.

        Returns:
            List of pipeline names
        """
        return list(self._pipelines.keys())

    def flush_stories(self, timeout: float = 60.0) -> Dict[str, Optional[str]]:
        """Wait for all pending async story generation to complete.

        Call this before operations that need story files to be written,
        such as lineage generation with SemanticLayerRunner.

        Args:
            timeout: Maximum seconds to wait per pipeline

        Returns:
            Dict mapping pipeline name to story path (or None if no pending story)

        Example:
            >>> manager.run(pipelines=['bronze', 'silver', 'gold'])
            >>> manager.flush_stories()  # Wait for all stories to be written
            >>> semantic_runner.run()  # Now lineage can read the stories
        """
        results = {}
        for name, pipeline in self._pipelines.items():
            story_path = pipeline.flush_stories(timeout=timeout)
            if story_path:
                results[name] = story_path
                self._ctx.debug(f"Story flushed for {name}", path=story_path)
        if results:
            self._ctx.info(f"Flushed {len(results)} pending story writes")
        return results

    def get_pipeline(self, name: str) -> Pipeline:
        """Get a specific pipeline instance.

        Args:
            name: Pipeline name

        Returns:
            Pipeline instance

        Raises:
            ValueError: If pipeline not found
        """
        if name not in self._pipelines:
            available = ", ".join(self._pipelines.keys())
            raise ValueError(f"Pipeline '{name}' not found. Available: {available}")
        return self._pipelines[name]

    def deploy(self, pipelines: Optional[Union[str, List[str]]] = None) -> bool:
        """Deploy pipeline definitions to the System Catalog.

        This registers pipeline and node configurations in the catalog,
        enabling drift detection and governance features.

        Args:
            pipelines: Optional pipeline name(s) to deploy. If None, deploys all.

        Returns:
            True if deployment succeeded, False otherwise.

        Example:
            >>> manager = PipelineManager.from_yaml("odibi.yaml")
            >>> manager.deploy()  # Deploy all pipelines
            >>> manager.deploy("sales_daily")  # Deploy specific pipeline
        """
        if not self.catalog_manager:
            self._ctx.warning(
                "System Catalog not configured. Cannot deploy.",
                suggestion="Configure system catalog in your YAML config",
            )
            return False

        if pipelines is None:
            to_deploy = self.project_config.pipelines
        elif isinstance(pipelines, str):
            to_deploy = [p for p in self.project_config.pipelines if p.pipeline == pipelines]
        else:
            to_deploy = [p for p in self.project_config.pipelines if p.pipeline in pipelines]

        if not to_deploy:
            self._ctx.warning("No matching pipelines found to deploy.")
            return False

        self._ctx.info(
            f"Deploying {len(to_deploy)} pipeline(s) to System Catalog",
            pipelines=[p.pipeline for p in to_deploy],
        )

        try:
            self.catalog_manager.bootstrap()

            for pipeline_config in to_deploy:
                self._ctx.debug(
                    f"Deploying pipeline: {pipeline_config.pipeline}",
                    node_count=len(pipeline_config.nodes),
                )
                self.catalog_manager.register_pipeline(pipeline_config, self.project_config)

                for node in pipeline_config.nodes:
                    self.catalog_manager.register_node(pipeline_config.pipeline, node)

            self._ctx.info(
                f"Deployment complete: {len(to_deploy)} pipeline(s)",
                deployed=[p.pipeline for p in to_deploy],
            )
            return True

        except Exception as e:
            self._ctx.error(
                f"Deployment failed: {e}",
                error_type=type(e).__name__,
                suggestion="Check catalog configuration and permissions",
            )
            return False

    def _auto_register_pipelines(self, pipeline_names: List[str]) -> None:
        """Auto-register pipelines and nodes before execution.

        This ensures meta_pipelines and meta_nodes are populated automatically
        when running pipelines, without requiring explicit deploy() calls.

        Uses "check-before-write" pattern with batch writes for performance:
        - Reads existing hashes in one read
        - Compares version_hash to skip unchanged records
        - Batch writes only changed/new records

        Args:
            pipeline_names: List of pipeline names to register
        """
        if not self.catalog_manager:
            return

        try:
            import hashlib
            import json
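
            # Read the already-registered hashes once up front; MD5 here is a
            # cheap change-detection fingerprint of the serialized config, not
            # a security measure.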
            existing_pipelines = self.catalog_manager.get_all_registered_pipelines()
            existing_nodes = self.catalog_manager.get_all_registered_nodes(pipeline_names)

            pipeline_records = []
            node_records = []

            for name in pipeline_names:
                pipeline = self._pipelines[name]
                config = pipeline.config

                if hasattr(config, "model_dump"):
                    dump = config.model_dump(mode="json")
                else:
                    # Pydantic v1 fallback
                    dump = config.dict()
                dump_str = json.dumps(dump, sort_keys=True)
                pipeline_hash = hashlib.md5(dump_str.encode("utf-8")).hexdigest()

                if existing_pipelines.get(name) != pipeline_hash:
                    all_tags = set()
                    for node in config.nodes:
                        if node.tags:
                            all_tags.update(node.tags)

                    pipeline_records.append(
                        {
                            "pipeline_name": name,
                            "version_hash": pipeline_hash,
                            "description": config.description or "",
                            "layer": config.layer or "",
                            "schedule": "",
                            "tags_json": json.dumps(list(all_tags)),
                        }
                    )

                pipeline_existing_nodes = existing_nodes.get(name, {})
                for node in config.nodes:
                    if hasattr(node, "model_dump"):
                        node_dump = node.model_dump(
                            mode="json", exclude={"description", "tags", "log_level"}
                        )
                    else:
                        # Pydantic v1 fallback
                        node_dump = node.dict(exclude={"description", "tags", "log_level"})
                    node_dump_str = json.dumps(node_dump, sort_keys=True)
                    node_hash = hashlib.md5(node_dump_str.encode("utf-8")).hexdigest()

                    if pipeline_existing_nodes.get(node.name) != node_hash:
                        node_type = "transform"
                        if node.read:
                            node_type = "read"
                        if node.write:
                            node_type = "write"

                        node_records.append(
                            {
                                "pipeline_name": name,
                                "node_name": node.name,
                                "version_hash": node_hash,
                                "type": node_type,
                                "config_json": json.dumps(node_dump),
                            }
                        )

            if pipeline_records:
                self.catalog_manager.register_pipelines_batch(pipeline_records)
                self._ctx.debug(
                    f"Batch registered {len(pipeline_records)} changed pipeline(s)",
                    pipelines=[r["pipeline_name"] for r in pipeline_records],
                )
            else:
                self._ctx.debug("All pipelines unchanged - skipping registration")

            if node_records:
                self.catalog_manager.register_nodes_batch(node_records)
                self._ctx.debug(
                    f"Batch registered {len(node_records)} changed node(s)",
                    nodes=[r["node_name"] for r in node_records],
                )
            else:
                self._ctx.debug("All nodes unchanged - skipping registration")

        except Exception as e:
            self._ctx.warning(
                f"Auto-registration failed (non-fatal): {e}",
                error_type=type(e).__name__,
            )

    # -------------------------------------------------------------------------
    # Phase 5: List/Query Methods
    # -------------------------------------------------------------------------

    def list_registered_pipelines(self) -> "pd.DataFrame":
        """List all registered pipelines from the system catalog.

        Returns:
            DataFrame with pipeline metadata from meta_pipelines
        """
        import pandas as pd

        if not self.catalog_manager:
            self._ctx.warning("Catalog manager not configured")
            return pd.DataFrame()

        try:
            df = self.catalog_manager._read_local_table(
                self.catalog_manager.tables["meta_pipelines"]
            )
            return df
        except Exception as e:
            self._ctx.warning(f"Failed to list pipelines: {e}")
            return pd.DataFrame()

    def list_registered_nodes(self, pipeline: Optional[str] = None) -> "pd.DataFrame":
        """List nodes from the system catalog.

        Args:
            pipeline: Optional pipeline name to filter by

        Returns:
            DataFrame with node metadata from meta_nodes
        """
        import pandas as pd

        if not self.catalog_manager:
            self._ctx.warning("Catalog manager not configured")
            return pd.DataFrame()

        try:
            df = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_nodes"])
            if not df.empty and pipeline:
                df = df[df["pipeline_name"] == pipeline]
            return df
        except Exception as e:
            self._ctx.warning(f"Failed to list nodes: {e}")
            return pd.DataFrame()

    def list_runs(
        self,
        pipeline: Optional[str] = None,
        node: Optional[str] = None,
        status: Optional[str] = None,
        limit: int = 10,
    ) -> "pd.DataFrame":
        """List recent runs with optional filters.

        Args:
            pipeline: Optional pipeline name to filter by
            node: Optional node name to filter by
            status: Optional status to filter by (SUCCESS, FAILURE)
            limit: Maximum number of runs to return

        Returns:
            DataFrame with run history from meta_runs
        """
        import pandas as pd

        if not self.catalog_manager:
            self._ctx.warning("Catalog manager not configured")
            return pd.DataFrame()

        try:
            df = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_runs"])
            if df.empty:
                return df

            if pipeline:
                df = df[df["pipeline_name"] == pipeline]
            if node:
                df = df[df["node_name"] == node]
            if status:
                df = df[df["status"] == status]

            if "timestamp" in df.columns:
                df = df.sort_values("timestamp", ascending=False)

            return df.head(limit)
        except Exception as e:
            self._ctx.warning(f"Failed to list runs: {e}")
            return pd.DataFrame()
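
    # e.g. manager.list_runs(pipeline="bronze", status="FAILURE", limit=5)
    # returns the five most recent failed runs for the bronze pipeline
    # (pipeline name illustrative).
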
    def list_tables(self) -> "pd.DataFrame":
        """List registered assets from meta_tables.

        Returns:
            DataFrame with table/asset metadata
        """
        import pandas as pd

        if not self.catalog_manager:
            self._ctx.warning("Catalog manager not configured")
            return pd.DataFrame()

        try:
            df = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_tables"])
            return df
        except Exception as e:
            self._ctx.warning(f"Failed to list tables: {e}")
            return pd.DataFrame()

    # -------------------------------------------------------------------------
    # Phase 5.2: State Methods
    # -------------------------------------------------------------------------

    def get_state(self, key: str) -> Optional[Dict[str, Any]]:
        """Get a specific state entry (HWM, content hash, etc.).

        Args:
            key: The state key to look up

        Returns:
            Dictionary with state data or None if not found
        """
        if not self.catalog_manager:
            return None

        try:
            df = self.catalog_manager._read_table(self.catalog_manager.tables["meta_state"])
            if df.empty or "key" not in df.columns:
                return None

            row = df[df["key"] == key]
            if row.empty:
                return None

            return row.iloc[0].to_dict()
        except Exception:
            return None
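
    # State keys are free-form strings; a high-water-mark entry might be keyed
    # something like "hwm:<pipeline>:<node>" (purely illustrative - the actual
    # format is whatever the code that wrote the entry used).
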
    def get_all_state(self, prefix: Optional[str] = None) -> "pd.DataFrame":
        """Get all state entries, optionally filtered by key prefix.

        Args:
            prefix: Optional key prefix to filter by

        Returns:
            DataFrame with state entries
        """
        import pandas as pd

        if not self.catalog_manager:
            return pd.DataFrame()

        try:
            df = self.catalog_manager._read_table(self.catalog_manager.tables["meta_state"])
            if not df.empty and prefix and "key" in df.columns:
                df = df[df["key"].str.startswith(prefix)]
            return df
        except Exception as e:
            self._ctx.warning(f"Failed to get state: {e}")
            return pd.DataFrame()

    def clear_state(self, key: str) -> bool:
        """Remove a state entry.

        Args:
            key: The state key to remove

        Returns:
            True if deleted, False otherwise
        """
        if not self.catalog_manager:
            return False

        try:
            return self.catalog_manager.clear_state_key(key)
        except Exception as e:
            self._ctx.warning(f"Failed to clear state: {e}")
            return False

    # -------------------------------------------------------------------------
    # Phase 5.3-5.4: Schema/Lineage and Stats Methods
    # -------------------------------------------------------------------------

    def get_schema_history(
        self,
        table: str,
        limit: int = 5,
    ) -> "pd.DataFrame":
        """Get schema version history for a table.

        Args:
            table: Table identifier (supports smart path resolution)
            limit: Maximum number of versions to return

        Returns:
            DataFrame with schema history
        """
        import pandas as pd

        if not self.catalog_manager:
            return pd.DataFrame()

        try:
            resolved_path = self._resolve_table_path(table)
            history = self.catalog_manager.get_schema_history(resolved_path, limit)
            return pd.DataFrame(history)
        except Exception as e:
            self._ctx.warning(f"Failed to get schema history: {e}")
            return pd.DataFrame()

    def get_lineage(
        self,
        table: str,
        direction: str = "both",
    ) -> "pd.DataFrame":
        """Get lineage for a table.

        Args:
            table: Table identifier (supports smart path resolution)
            direction: "upstream", "downstream", or "both"

        Returns:
            DataFrame with lineage relationships
        """
        import pandas as pd

        if not self.catalog_manager:
            return pd.DataFrame()

        try:
            resolved_path = self._resolve_table_path(table)

            results = []
            if direction in ("upstream", "both"):
                upstream = self.catalog_manager.get_upstream(resolved_path)
                for r in upstream:
                    r["direction"] = "upstream"
                results.extend(upstream)

            if direction in ("downstream", "both"):
                downstream = self.catalog_manager.get_downstream(resolved_path)
                for r in downstream:
                    r["direction"] = "downstream"
                results.extend(downstream)

            return pd.DataFrame(results)
        except Exception as e:
            self._ctx.warning(f"Failed to get lineage: {e}")
            return pd.DataFrame()
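
    # e.g. manager.get_lineage("test.vw_OSMPerformanceOEE", direction="upstream")
    # returns the upstream relationships as rows tagged with a "direction"
    # column (identifier reused from the path-resolution docstring below).
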
    def get_pipeline_status(self, pipeline: str) -> Dict[str, Any]:
        """Get last run status, duration, timestamp for a pipeline.

        Args:
            pipeline: Pipeline name

        Returns:
            Dict with status info
        """
        if not self.catalog_manager:
            return {}

        try:
            runs = self.list_runs(pipeline=pipeline, limit=1)
            if runs.empty:
                return {"status": "never_run", "pipeline": pipeline}

            last_run = runs.iloc[0].to_dict()
            return {
                "pipeline": pipeline,
                "last_status": last_run.get("status"),
                "last_run_at": last_run.get("timestamp"),
                "last_duration_ms": last_run.get("duration_ms"),
                "last_node": last_run.get("node_name"),
            }
        except Exception as e:
            self._ctx.warning(f"Failed to get pipeline status: {e}")
            return {}

    def get_node_stats(self, node: str, days: int = 7) -> Dict[str, Any]:
        """Get average duration, row counts, success rate over period.

        Args:
            node: Node name
            days: Number of days to look back

        Returns:
            Dict with node statistics
        """
        import pandas as pd

        if not self.catalog_manager:
            return {}

        try:
            avg_duration = self.catalog_manager.get_average_duration(node, days)

            df = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_runs"])
            if df.empty:
                return {"node": node, "runs": 0}

            if "timestamp" in df.columns:
                cutoff = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=days)
                if not pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
                    df["timestamp"] = pd.to_datetime(df["timestamp"])
                if df["timestamp"].dt.tz is None:
                    df["timestamp"] = df["timestamp"].dt.tz_localize("UTC")
                df = df[df["timestamp"] >= cutoff]

            node_runs = df[df["node_name"] == node]
            if node_runs.empty:
                return {"node": node, "runs": 0}

            total = len(node_runs)
            success = len(node_runs[node_runs["status"] == "SUCCESS"])
            avg_rows = node_runs["rows_processed"].mean() if "rows_processed" in node_runs else None

            return {
                "node": node,
                "runs": total,
                "success_rate": success / total if total > 0 else 0,
                "avg_duration_s": avg_duration,
                "avg_rows": avg_rows,
                "period_days": days,
            }
        except Exception as e:
            self._ctx.warning(f"Failed to get node stats: {e}")
            return {}
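
    # Example result (values illustrative): {"node": "orders", "runs": 14,
    #   "success_rate": 0.93, "avg_duration_s": 42.1, "avg_rows": 120000.0,
    #   "period_days": 7}
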
    # -------------------------------------------------------------------------
    # Phase 6: Smart Path Resolution
    # -------------------------------------------------------------------------

    def _resolve_table_path(self, identifier: str) -> str:
        """Resolve a user-friendly identifier to a full table path.

        Accepts:
            - Relative path: "bronze/OEE/vw_OSMPerformanceOEE"
            - Registered table: "test.vw_OSMPerformanceOEE"
            - Node name: "opsvisdata_vw_OSMPerformanceOEE"
            - Full path: "abfss://..." (used as-is)

        Args:
            identifier: User-friendly table identifier

        Returns:
            Full table path
        """
        if self._is_full_path(identifier):
            return identifier

        if self.catalog_manager:
            resolved = self._lookup_in_catalog(identifier)
            if resolved:
                return resolved

        for pipeline in self._pipelines.values():
            for node in pipeline.config.nodes:
                if node.name == identifier and node.write:
                    conn = self.connections.get(node.write.connection)
                    if conn:
                        return conn.get_path(node.write.path or node.write.table)

        sys_conn_name = (
            self.project_config.system.connection if self.project_config.system else None
        )
        if sys_conn_name:
            sys_conn = self.connections.get(sys_conn_name)
            if sys_conn:
                return sys_conn.get_path(identifier)

        return identifier

    def _is_full_path(self, identifier: str) -> bool:
        """Check if identifier is already a full path."""
        full_path_prefixes = ("abfss://", "s3://", "gs://", "hdfs://", "/", "C:", "D:")
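        # Note: Windows detection is deliberately minimal - only "C:" and "D:"
        # drive prefixes are recognized; anything else falls through to the
        # catalog/node lookups in _resolve_table_path.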
        return identifier.startswith(full_path_prefixes)

    def _lookup_in_catalog(self, identifier: str) -> Optional[str]:
        """Look up identifier in meta_tables catalog."""
        if not self.catalog_manager:
            return None

        try:
            df = self.catalog_manager._read_local_table(self.catalog_manager.tables["meta_tables"])
            if df.empty or "table_name" not in df.columns:
                return None

            match = df[df["table_name"] == identifier]
            if not match.empty and "path" in match.columns:
                return match.iloc[0]["path"]

            if "." in identifier:
                parts = identifier.split(".", 1)
                if len(parts) == 2:
                    match = df[df["table_name"] == parts[1]]
                    if not match.empty and "path" in match.columns:
                        return match.iloc[0]["path"]

        except Exception:
            pass

        return None