odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/node.py
ADDED
@@ -0,0 +1,3341 @@
"""Node execution engine."""

import hashlib
import inspect
import logging
import re
import time
import traceback
from contextlib import contextmanager
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from pydantic import BaseModel, Field

from odibi.config import IncrementalConfig, IncrementalMode, NodeConfig, RetryConfig, WriteMode
from odibi.context import Context, EngineContext, _get_unique_view_name
from odibi.enums import EngineType
from odibi.exceptions import ExecutionContext, NodeExecutionError, TransformError, ValidationError
from odibi.registry import FunctionRegistry
from odibi.state import (
    CatalogStateBackend,
    StateManager,
)
from odibi.utils.duration import parse_duration
from odibi.utils.logging_context import (
    LoggingContext,
    OperationType,
    create_logging_context,
    get_logging_context,
)


class PhaseTimer:
    """Track timing for individual execution phases.

    Usage:
        timer = PhaseTimer()
        with timer.phase("read"):
            # do read
        with timer.phase("transform"):
            # do transform
        print(timer.summary())  # {"read": 1.23, "transform": 0.45, ...}
    """

    def __init__(self):
        self._timings: Dict[str, float] = {}
        self._current_phase: Optional[str] = None
        self._phase_start: Optional[float] = None

    @contextmanager
    def phase(self, name: str):
        """Context manager to time a phase."""
        start = time.time()
        try:
            yield
        finally:
            elapsed = time.time() - start
            self._timings[name] = self._timings.get(name, 0) + elapsed

    def record(self, name: str, duration: float):
        """Manually record a phase duration."""
        self._timings[name] = self._timings.get(name, 0) + duration

    def get(self, name: str) -> float:
        """Get duration for a specific phase."""
        return self._timings.get(name, 0)

    def summary(self) -> Dict[str, float]:
        """Get all phase timings rounded to 3 decimal places."""
        return {k: round(v, 3) for k, v in self._timings.items()}

    def summary_ms(self) -> Dict[str, float]:
        """Get all phase timings in milliseconds."""
        return {k: round(v * 1000, 2) for k, v in self._timings.items()}

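# Illustrative sketch (not part of the package source): how PhaseTimer accumulates
# timings. The phase names and durations below are invented for the example.
#
#     timer = PhaseTimer()
#     with timer.phase("read"):
#         ...                      # do the read; elapsed time is added under "read"
#     with timer.phase("read"):
#         ...                      # repeated phases accumulate into the same key
#     timer.record("write", 0.2)   # durations can also be recorded manually
#     timer.summary()              # e.g. {"read": 0.153, "write": 0.2}
#     timer.summary_ms()           # same figures in milliseconds
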
class NodeResult(BaseModel):
    """Result of node execution."""

    model_config = {"arbitrary_types_allowed": True}  # Allow Exception type

    node_name: str
    success: bool
    duration: float
    rows_processed: Optional[int] = None
    rows_read: Optional[int] = None
    rows_written: Optional[int] = None
    result_schema: Optional[Any] = Field(default=None, alias="schema")  # Renamed to avoid shadowing
    error: Optional[Exception] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)

@contextmanager
def _override_log_level(log_level: Optional[str]):
    """Temporarily override the logging level for a node execution."""
    if not log_level:
        yield
        return

    from odibi.utils.logging import logger as odibi_logger

    original_level = odibi_logger.level
    new_level = getattr(logging, log_level.upper(), original_level)
    odibi_logger.level = new_level
    odibi_logger.logger.setLevel(new_level)

    try:
        yield
    finally:
        odibi_logger.level = original_level
        odibi_logger.logger.setLevel(original_level)


class NodeExecutor:
    """Handles the execution logic (read, transform, write) of a node."""

    def __init__(
        self,
        context: Context,
        engine: Any,
        connections: Dict[str, Any],
        catalog_manager: Optional[Any] = None,
        config_file: Optional[str] = None,
        max_sample_rows: int = 10,
        performance_config: Optional[Any] = None,
        state_manager: Optional[Any] = None,
        pipeline_name: Optional[str] = None,
        batch_write_buffers: Optional[Dict[str, List]] = None,
    ):
        self.context = context
        self.engine = engine
        self.connections = connections
        self.catalog_manager = catalog_manager
        self.config_file = config_file
        self.max_sample_rows = max_sample_rows
        self.performance_config = performance_config
        self.state_manager = state_manager
        self.pipeline_name = pipeline_name
        self.batch_write_buffers = batch_write_buffers

        # Ephemeral state per execution
        self._execution_steps: List[str] = []
        self._executed_sql: List[str] = []
        self._delta_write_info: Optional[Dict[str, Any]] = None
        self._validation_warnings: List[str] = []
        self._read_row_count: Optional[int] = None  # Cache row count from read phase
        self._table_exists_cache: Dict[str, bool] = {}  # Cache table existence checks

    def _cached_table_exists(
        self,
        connection: Any,
        table: Optional[str] = None,
        path: Optional[str] = None,
    ) -> bool:
        """Check if table exists with caching to avoid repeated Delta operations.

        Performance: Table existence checks involve Delta table open + limit(0).collect()
        which can take 3-5s. Caching saves significant time for nodes that check
        existence multiple times (incremental filter, write phase, etc.).
        """
        cache_key = f"{id(connection)}:{table}:{path}"
        if cache_key not in self._table_exists_cache:
            self._table_exists_cache[cache_key] = self.engine.table_exists(connection, table, path)
        return self._table_exists_cache[cache_key]

    def execute(
        self,
        config: NodeConfig,
        input_df: Optional[Any] = None,
        dry_run: bool = False,
        hwm_state: Optional[Tuple[str, Any]] = None,
        suppress_error_log: bool = False,
        current_pipeline: Optional[str] = None,
    ) -> NodeResult:
        """Execute the node logic.

        Args:
            config: Node configuration
            input_df: Optional input dataframe (e.g. from dependencies)
            dry_run: Whether to simulate execution
            hwm_state: Current High Water Mark state (key, value)
            suppress_error_log: If True, suppress error logging (used during retries)
            current_pipeline: Name of current pipeline (for same-pipeline cache lookup)

        Returns:
            NodeResult
        """
        self._current_pipeline = current_pipeline
        start_time = time.time()

        # Reset ephemeral state
        self._execution_steps = []
        self._executed_sql = []
        self._delta_write_info = None
        self._validation_warnings = []
        self._read_row_count = None
        self._table_exists_cache = {}  # Reset cache per execution

        ctx = create_logging_context(
            node_id=config.name,
            engine=self.engine.__class__.__name__,
        )

        # Handle materialized field - controls output as table/view/incremental
        if config.materialized:
            ctx.info(
                f"Materialization strategy: {config.materialized}",
                materialized=config.materialized,
            )

        if dry_run:
            ctx.debug("Executing node in dry-run mode")
            return self._execute_dry_run(config)

        with ctx.operation(OperationType.EXECUTE, f"node:{config.name}") as metrics:
            try:
                input_schema = None
                input_sample = None
                pending_hwm_update = None
                rows_in = None
                phase_timer = PhaseTimer()

                # 0. Pre-SQL Phase
                with phase_timer.phase("pre_sql"):
                    self._execute_pre_sql(config, ctx)

                # 1. Read Phase (either single read, multi-input, or dependency)
                input_dataframes: Dict[str, Any] = {}

                if config.inputs:
                    # Multi-input mode for cross-pipeline dependencies
                    with phase_timer.phase("inputs"):
                        input_dataframes = self._execute_inputs_phase(
                            config, ctx, current_pipeline=self._current_pipeline
                        )
                    # For transform phase, use first input as primary (or "df" if named)
                    if "df" in input_dataframes:
                        result_df = input_dataframes["df"]
                    elif input_dataframes:
                        first_key = next(iter(input_dataframes))
                        result_df = input_dataframes[first_key]
                    input_df = result_df
                else:
                    # Standard single read or dependency
                    with phase_timer.phase("read"):
                        result_df, pending_hwm_update = self._execute_read_phase(
                            config, hwm_state, ctx
                        )

                # If no direct read, check dependencies or use passed input_df
                if result_df is None:
                    if input_df is not None:
                        result_df = input_df
                        ctx.debug(
                            "Using provided input_df",
                            rows=self._count_rows(input_df) if input_df is not None else 0,
                        )
                    elif config.depends_on:
                        result_df = self.context.get(config.depends_on[0])
                        if input_df is None:
                            input_df = result_df
                        ctx.debug(
                            f"Using data from dependency: {config.depends_on[0]}",
                            rows=self._count_rows(result_df) if result_df is not None else 0,
                        )

                if config.read:
                    input_df = result_df

                # Capture input schema before transformation
                with phase_timer.phase("schema_capture"):
                    if input_df is not None:
                        input_schema = self._get_schema(input_df)
                        # Reuse row count from read phase if available (avoids redundant count)
                        rows_in = (
                            self._read_row_count
                            if self._read_row_count is not None
                            else self._count_rows(input_df)
                        )
                        metrics.rows_in = rows_in
                        metrics.schema_before = (
                            input_schema if isinstance(input_schema, dict) else None
                        )
                        if self.max_sample_rows > 0:
                            try:
                                input_sample = self.engine.get_sample(
                                    input_df, n=self.max_sample_rows
                                )
                            except Exception:
                                pass

                # 1.5 Contracts Phase (Pre-conditions)
                with phase_timer.phase("contracts"):
                    self._execute_contracts_phase(config, input_df, ctx)

                # 2. Transform Phase
                with phase_timer.phase("transform"):
                    result_df = self._execute_transform_phase(
                        config, result_df, input_df, ctx, input_dataframes
                    )

                # 3. Validation Phase (returns filtered df if quarantine is used)
                with phase_timer.phase("validation"):
                    result_df = self._execute_validation_phase(config, result_df, ctx)

                # 4. Write Phase
                with phase_timer.phase("write"):
                    override_mode = self._determine_write_mode(config)
                    self._execute_write_phase(config, result_df, override_mode, ctx)

                # 4.5 Post-SQL Phase
                with phase_timer.phase("post_sql"):
                    self._execute_post_sql(config, ctx)

                # 5. Register & Cache
                with phase_timer.phase("register"):
                    if result_df is not None:
                        pii_meta = self._calculate_pii(config)
                        self.context.register(
                            config.name, result_df, metadata={"pii_columns": pii_meta}
                        )

                # 6. Metadata Collection
                with phase_timer.phase("metadata"):
                    duration = time.time() - start_time
                    metadata = self._collect_metadata(config, result_df, input_schema, input_sample)

                rows_out = metadata.get("rows")
                metrics.rows_out = rows_out

                # Log schema changes
                if input_schema and metadata.get("schema"):
                    output_schema = metadata["schema"]
                    if isinstance(input_schema, dict) and isinstance(output_schema, dict):
                        ctx.log_schema_change(
                            input_schema, output_schema, operation="node_execution"
                        )
                    cols_added = metadata.get("columns_added", [])
                    cols_removed = metadata.get("columns_removed", [])
                    if cols_added or cols_removed:
                        ctx.debug(
                            "Schema modified",
                            columns_added=cols_added,
                            columns_removed=cols_removed,
                        )

                # Log row count delta
                if isinstance(rows_in, (int, float)) and isinstance(rows_out, (int, float)):
                    delta = rows_out - rows_in
                    if delta != 0:
                        ctx.log_row_count_change(rows_in, rows_out, operation="node_execution")

                # Pass back HWM update if any
                if pending_hwm_update:
                    key, value = pending_hwm_update
                    metadata["hwm_update"] = {"key": key, "value": value}
                    metadata["hwm_pending"] = True
                    ctx.debug(f"HWM pending update: {key}={value}")

                # Add phase timings to metadata
                metadata["phase_timings_ms"] = phase_timer.summary_ms()

                ctx.info(
                    "Node execution completed successfully",
                    rows_in=rows_in,
                    rows_out=rows_out,
                    elapsed_ms=round((time.time() - start_time) * 1000, 2),
                    phase_timings_ms=phase_timer.summary_ms(),
                )

                return NodeResult(
                    node_name=config.name,
                    success=True,
                    duration=duration,
                    rows_processed=metadata.get("rows"),
                    rows_read=metadata.get("rows_read"),
                    rows_written=metadata.get("rows_written"),
                    schema=metadata.get("schema"),
                    metadata=metadata,
                )

            except Exception as e:
                duration = time.time() - start_time
                suggestions = self._generate_suggestions(e, config)

                # Capture traceback
                raw_traceback = traceback.format_exc()
                cleaned_traceback = self._clean_spark_traceback(raw_traceback)

                # Log error with full context (suppress during retries)
                if not suppress_error_log:
                    ctx.error(
                        f"Node execution failed: {type(e).__name__}: {e}",
                        elapsed_ms=round(duration * 1000, 2),
                        steps_completed=self._execution_steps.copy(),
                    )
                    if suggestions:
                        ctx.info(f"Suggestions: {'; '.join(suggestions)}")

                # Wrap error
                if not isinstance(e, NodeExecutionError):
                    exec_context = ExecutionContext(
                        node_name=config.name,
                        config_file=self.config_file,
                        previous_steps=self._execution_steps,
                    )
                    error = NodeExecutionError(
                        message=str(e),
                        context=exec_context,
                        original_error=e,
                        suggestions=suggestions,
                    )
                else:
                    error = e

                return NodeResult(
                    node_name=config.name,
                    success=False,
                    duration=duration,
                    error=error,
                    metadata={
                        "steps": self._execution_steps.copy(),
                        "error_traceback": raw_traceback,
                        "error_traceback_cleaned": cleaned_traceback,
                    },
                )

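    # Illustrative sketch (not part of the package source): driving execute() and
    # inspecting the result. `executor` and `node_config` are assumed to be a
    # NodeExecutor and NodeConfig built elsewhere; the attributes read off the
    # result are the NodeResult fields defined above.
    #
    #     result = executor.execute(node_config)
    #     if result.success:
    #         print(result.rows_processed, result.metadata.get("phase_timings_ms"))
    #     else:
    #         print(f"{result.node_name} failed after {result.duration:.1f}s: {result.error}")
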
    def _execute_dry_run(self, config: NodeConfig) -> NodeResult:
        """Simulate execution."""
        self._execution_steps.append("Dry run: Skipping actual execution")

        if config.read:
            self._execution_steps.append(f"Dry run: Would read from {config.read.connection}")

        if config.transform:
            self._execution_steps.append(
                f"Dry run: Would apply {len(config.transform.steps)} transform steps"
            )

        if config.write:
            self._execution_steps.append(f"Dry run: Would write to {config.write.connection}")

        return NodeResult(
            node_name=config.name,
            success=True,
            duration=0.0,
            rows_processed=0,
            metadata={"dry_run": True, "steps": self._execution_steps},
        )

    def _execute_read_phase(
        self,
        config: NodeConfig,
        hwm_state: Optional[Tuple[str, Any]],
        ctx: Optional["LoggingContext"] = None,
    ) -> Tuple[Optional[Any], Optional[Tuple[str, Any]]]:
        """Execute read operation. Returns (df, pending_hwm_update)."""
        if ctx is None:
            ctx = get_logging_context()

        if not config.read:
            return None, None

        read_config = config.read
        connection = self.connections.get(read_config.connection)

        if connection is None:
            available = ", ".join(sorted(self.connections.keys())) or "(none defined)"
            raise ValueError(
                f"Read phase failed: Connection '{read_config.connection}' not found in configured connections. "
                f"Available connections: [{available}]. "
                f"Check your read.connection value in the node configuration or add the missing connection to project.yaml."
            )

        with ctx.operation(
            OperationType.READ,
            f"source:{read_config.connection}",
            format=read_config.format,
            table=read_config.table,
            path=read_config.path,
        ) as metrics:
            # Time Travel
            as_of_version = None
            as_of_timestamp = None
            if read_config.time_travel:
                as_of_version = read_config.time_travel.as_of_version
                as_of_timestamp = read_config.time_travel.as_of_timestamp
                ctx.debug(
                    "Time travel read",
                    as_of_version=as_of_version,
                    as_of_timestamp=str(as_of_timestamp) if as_of_timestamp else None,
                )

            # Legacy HWM: First Run Query Logic
            read_options = read_config.options.copy() if read_config.options else {}

            if config.write and config.write.first_run_query:
                write_config = config.write
                target_conn = self.connections.get(write_config.connection)
                if target_conn:
                    if not self._cached_table_exists(
                        target_conn, write_config.table, write_config.path
                    ):
                        read_options["query"] = config.write.first_run_query
                        ctx.debug("Using first_run_query (target table does not exist)")

            # Merge archive_options into read_options (e.g., badRecordsPath for Spark)
            if read_config.archive_options:
                read_options.update(read_config.archive_options)
                ctx.debug(
                    "Applied archive_options",
                    archive_options=list(read_config.archive_options.keys()),
                )
                self._execution_steps.append(
                    f"Applied archive_options: {list(read_config.archive_options.keys())}"
                )

            # Incremental SQL Pushdown: Generate filter for SQL sources
            if read_config.incremental and read_config.format in [
                "sql",
                "sql_server",
                "azure_sql",
            ]:
                incremental_filter = self._generate_incremental_sql_filter(
                    read_config.incremental, config, ctx
                )
                if incremental_filter:
                    # Combine with existing filter if present
                    existing_filter = read_options.get("filter")
                    if existing_filter:
                        read_options["filter"] = f"({existing_filter}) AND ({incremental_filter})"
                    else:
                        read_options["filter"] = incremental_filter
                    ctx.debug(
                        "Added incremental SQL pushdown filter",
                        filter=read_options["filter"],
                    )
                    self._execution_steps.append(f"Incremental SQL pushdown: {incremental_filter}")

            # Execute Read
            df = self.engine.read(
                connection=connection,
                format=read_config.format,
                table=read_config.table,
                path=read_config.path,
                streaming=read_config.streaming,
                schema=getattr(read_config, "schema_ddl", None),
                options=read_options,
                as_of_version=as_of_version,
                as_of_timestamp=as_of_timestamp,
            )

            if read_config.streaming:
                ctx.info("Streaming read enabled")
                self._execution_steps.append("Streaming read enabled")

            row_count = self._count_rows(df) if df is not None else 0
            metrics.rows_out = row_count
            # Cache row count to avoid redundant counting in schema_capture phase
            self._read_row_count = row_count

            ctx.info(
                f"Read completed from {read_config.connection}",
                format=read_config.format,
                table=read_config.table,
                path=read_config.path,
                rows=row_count,
            )

            # Apply Incremental Logic
            pending_hwm = None
            if config.read.incremental:
                df, pending_hwm = self._apply_incremental_filtering(df, config, hwm_state)
                if pending_hwm:
                    ctx.debug(
                        "Incremental filtering applied",
                        hwm_key=pending_hwm[0],
                        hwm_value=str(pending_hwm[1]),
                    )

            self._execution_steps.append(f"Read from {config.read.connection}")
            return df, pending_hwm

    def _execute_inputs_phase(
        self,
        config: NodeConfig,
        ctx: Optional["LoggingContext"] = None,
        current_pipeline: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Execute inputs block for cross-pipeline dependencies.

        Returns a dict of {input_name: DataFrame} for use in transforms.

        For same-pipeline references, falls back to the context cache when the
        catalog lookup fails. This enables first-run scenarios where Delta tables
        don't exist yet.
        """
        if ctx is None:
            ctx = get_logging_context()

        if not config.inputs:
            return {}

        from odibi.references import is_pipeline_reference, resolve_input_reference

        dataframes = {}

        for name, ref in config.inputs.items():
            if is_pipeline_reference(ref):
                # Parse the reference to check if it's same-pipeline
                parts = ref[1:].split(".", 1)  # Remove $ and split
                ref_pipeline = parts[0] if len(parts) == 2 else None
                ref_node = parts[1] if len(parts) == 2 else None

                # Try catalog lookup first (read from Delta table - the canonical source)
                df = None
                read_from_catalog = False

                if self.catalog_manager:
                    try:
                        read_config = resolve_input_reference(ref, self.catalog_manager)
                        ctx.debug(
                            f"Resolved reference '{ref}'",
                            input_name=name,
                            resolved_config=read_config,
                        )

                        connection = None
                        if "connection" in read_config and read_config["connection"]:
                            connection = self.connections.get(read_config["connection"])
                            if connection is None:
                                available = (
                                    ", ".join(sorted(self.connections.keys())) or "(none defined)"
                                )
                                raise ValueError(
                                    f"Input '{name}' failed: Connection '{read_config['connection']}' not found. "
                                    f"Available connections: [{available}]. "
                                    f"Check the connection name in your input reference or add it to project.yaml connections."
                                )

                        # Check if table/path exists before reading
                        table_or_path = read_config.get("table") or read_config.get("path")
                        if table_or_path and self.engine.table_exists(
                            connection, read_config.get("table"), read_config.get("path")
                        ):
                            df = self.engine.read(
                                connection=connection,
                                format=read_config.get("format"),
                                table=read_config.get("table"),
                                path=read_config.get("path"),
                            )
                            read_from_catalog = True
                    except Exception as e:
                        # Catalog lookup failed - will try cache fallback
                        ctx.debug(
                            f"Catalog lookup failed for '{ref}': {e}",
                            input_name=name,
                        )

                # Fallback to context cache for same-pipeline refs (first run scenario)
                if (
                    df is None
                    and ref_node
                    and current_pipeline
                    and ref_pipeline == current_pipeline
                ):
                    cached_df = self.context.get(ref_node)
                    if cached_df is not None:
                        ctx.debug(
                            f"Using cached data for same-pipeline reference '{ref}' (Delta not available)",
                            input_name=name,
                            source_node=ref_node,
                        )
                        df = cached_df

                if df is None:
                    raise ValueError(
                        f"Input '{name}' failed: Cannot resolve reference '{ref}'. "
                        f"The referenced data was not found in the catalog or context cache. "
                        f"Ensure the referenced node has run successfully and written its output before this node executes. "
                        f"Check: 1) The node name is spelled correctly. 2) The referenced pipeline ran first. 3) depends_on is configured if same-pipeline."
                    )

                # Store input source path for transforms that need it (e.g., detect_deletes)
                # Only if we read from catalog (read_config was set)
                if read_from_catalog:
                    input_path = read_config.get("path") or read_config.get("table")
                    if input_path:
                        if connection and hasattr(connection, "get_path"):
                            input_path = connection.get_path(input_path)
                        self.engine._current_input_path = input_path

            elif isinstance(ref, dict):
                conn_name = ref.get("connection")
                connection = self.connections.get(conn_name) if conn_name else None

                if conn_name and connection is None:
                    available = ", ".join(sorted(self.connections.keys())) or "(none defined)"
                    raise ValueError(
                        f"Input '{name}' failed: Connection '{conn_name}' not found. "
                        f"Available connections: [{available}]. "
                        f"Check your input configuration or add the missing connection to project.yaml."
                    )

                df = self.engine.read(
                    connection=connection,
                    format=ref.get("format"),
                    table=ref.get("table"),
                    path=ref.get("path"),
                )

            else:
                raise ValueError(
                    f"Input '{name}' failed: Invalid input format. Got: {type(ref).__name__} = {repr(ref)[:100]}. "
                    f"Expected either: 1) A pipeline reference string like '$pipeline_name.node_name', or "
                    f"2) A read config dict with 'connection', 'format', and 'table'/'path' keys."
                )

            dataframes[name] = df
            row_count = self._count_rows(df) if df is not None else 0
            ctx.info(
                f"Loaded input '{name}'",
                rows=row_count,
                source=ref if isinstance(ref, str) else ref.get("path") or ref.get("table"),
            )
            self._execution_steps.append(f"Loaded input '{name}' ({row_count} rows)")

        return dataframes

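    # Illustrative sketch (not part of the package source): how a pipeline reference
    # string is parsed in the method above. The reference value is invented.
    #
    #     ref = "$sales_pipeline.orders_clean"
    #     parts = ref[1:].split(".", 1)      # -> ["sales_pipeline", "orders_clean"]
    #     ref_pipeline, ref_node = parts     # pipeline name, node name
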
    def _quote_sql_column(self, column: str, format: Optional[str] = None) -> str:
        """Quote a column name for SQL to handle spaces and special characters.

        Uses [] for SQL Server dialects, backticks for others.
        """
        if format in ("sql_server", "azure_sql"):
            return f"[{column}]"
        else:
            return f"`{column}`"

    def _get_date_expr(
        self, quoted_col: str, cutoff: datetime, date_format: Optional[str]
    ) -> Tuple[str, str]:
        """Get SQL expressions for date column and cutoff value.

        Args:
            quoted_col: The quoted column name
            cutoff: The cutoff datetime value
            date_format: The source date format

        Returns:
            Tuple of (column_expression, cutoff_expression)

        Supported date_format values:
        - None: Default ISO format (YYYY-MM-DD HH:MM:SS)
        - "oracle": DD-MON-YY format (e.g., 20-APR-24 07:11:01.0)
        - "oracle_sqlserver": Oracle-style DD-MON-YY strings parsed on SQL Server via TRY_CAST
        - "sql_server": SQL Server CONVERT with style 120
        - "us": MM/DD/YYYY format
        - "eu": DD/MM/YYYY format
        - "iso": Explicit ISO format with T separator
        """
        if date_format == "oracle":
            cutoff_str = cutoff.strftime("%d-%b-%y %H:%M:%S").upper()
            col_expr = f"TO_TIMESTAMP({quoted_col}, 'DD-MON-RR HH24:MI:SS.FF')"
            cutoff_expr = f"TO_TIMESTAMP('{cutoff_str}', 'DD-MON-RR HH24:MI:SS')"
        elif date_format == "oracle_sqlserver":
            cutoff_str = cutoff.strftime("%Y-%m-%d %H:%M:%S")
            col_expr = (
                f"TRY_CAST("
                f"RIGHT('20' + SUBSTRING({quoted_col}, 8, 2), 4) + '-' + "
                f"CASE SUBSTRING({quoted_col}, 4, 3) "
                f"WHEN 'JAN' THEN '01' WHEN 'FEB' THEN '02' WHEN 'MAR' THEN '03' "
                f"WHEN 'APR' THEN '04' WHEN 'MAY' THEN '05' WHEN 'JUN' THEN '06' "
                f"WHEN 'JUL' THEN '07' WHEN 'AUG' THEN '08' WHEN 'SEP' THEN '09' "
                f"WHEN 'OCT' THEN '10' WHEN 'NOV' THEN '11' WHEN 'DEC' THEN '12' END + '-' + "
                f"SUBSTRING({quoted_col}, 1, 2) + ' ' + "
                f"SUBSTRING({quoted_col}, 11, 8) AS DATETIME)"
            )
            cutoff_expr = f"'{cutoff_str}'"
        elif date_format == "sql_server":
            cutoff_str = cutoff.strftime("%Y-%m-%d %H:%M:%S")
            col_expr = f"CONVERT(DATETIME, {quoted_col}, 120)"
            cutoff_expr = f"'{cutoff_str}'"
        elif date_format == "us":
            cutoff_str = cutoff.strftime("%m/%d/%Y %H:%M:%S")
            col_expr = f"TO_TIMESTAMP({quoted_col}, 'MM/DD/YYYY HH24:MI:SS')"
            cutoff_expr = f"TO_TIMESTAMP('{cutoff_str}', 'MM/DD/YYYY HH24:MI:SS')"
        elif date_format == "eu":
            cutoff_str = cutoff.strftime("%d/%m/%Y %H:%M:%S")
            col_expr = f"TO_TIMESTAMP({quoted_col}, 'DD/MM/YYYY HH24:MI:SS')"
            cutoff_expr = f"TO_TIMESTAMP('{cutoff_str}', 'DD/MM/YYYY HH24:MI:SS')"
        elif date_format == "iso":
            cutoff_str = cutoff.strftime("%Y-%m-%dT%H:%M:%S")
            col_expr = f"TO_TIMESTAMP({quoted_col}, 'YYYY-MM-DD\"T\"HH24:MI:SS')"
            cutoff_expr = f"TO_TIMESTAMP('{cutoff_str}', 'YYYY-MM-DD\"T\"HH24:MI:SS')"
        else:
            cutoff_str = cutoff.strftime("%Y-%m-%d %H:%M:%S")
            col_expr = quoted_col
            cutoff_expr = f"'{cutoff_str}'"

        return col_expr, cutoff_expr

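    # Illustrative sketch (not part of the package source): the expressions the two
    # helpers above produce. The column name and cutoff are invented.
    #
    #     self._quote_sql_column("Last Modified", "azure_sql")
    #     # -> "[Last Modified]"
    #     self._get_date_expr("[Last Modified]", datetime(2024, 4, 20, 7, 0, 0), "sql_server")
    #     # -> ("CONVERT(DATETIME, [Last Modified], 120)", "'2024-04-20 07:00:00'")
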
    def _generate_incremental_sql_filter(
        self,
        inc: IncrementalConfig,
        config: NodeConfig,
        ctx: Optional["LoggingContext"] = None,
    ) -> Optional[str]:
        """Generate SQL WHERE clause for incremental filtering (pushdown to SQL source).

        Returns a SQL filter string or None if no filter should be applied.
        """
        if ctx is None:
            ctx = get_logging_context()

        # Check if target table exists - if not, this is first run (full load)
        if config.write:
            target_conn = self.connections.get(config.write.connection)
            # Use register_table if table is not set (path-based Delta with registration)
            table_to_check = config.write.table or config.write.register_table
            if target_conn and not self._cached_table_exists(
                target_conn, table_to_check, config.write.path
            ):
                ctx.debug("First run detected - skipping incremental SQL pushdown")
                return None

        # Get the SQL format for proper column quoting
        sql_format = config.read.format if config.read else None

        if inc.mode == IncrementalMode.ROLLING_WINDOW:
            if not inc.lookback or not inc.unit:
                return None

            # Calculate cutoff
            now = datetime.now()

            delta = None
            if inc.unit == "hour":
                delta = timedelta(hours=inc.lookback)
            elif inc.unit == "day":
                delta = timedelta(days=inc.lookback)
            elif inc.unit == "month":
                delta = timedelta(days=inc.lookback * 30)
            elif inc.unit == "year":
                delta = timedelta(days=inc.lookback * 365)

            if delta:
                cutoff = now - delta
                quoted_col = self._quote_sql_column(inc.column, sql_format)
                col_expr, cutoff_expr = self._get_date_expr(quoted_col, cutoff, inc.date_format)

                if inc.fallback_column:
                    quoted_fallback = self._quote_sql_column(inc.fallback_column, sql_format)
                    fallback_expr, _ = self._get_date_expr(quoted_fallback, cutoff, inc.date_format)
                    return f"COALESCE({col_expr}, {fallback_expr}) >= {cutoff_expr}"
                else:
                    return f"{col_expr} >= {cutoff_expr}"

        elif inc.mode == IncrementalMode.STATEFUL:
            # For stateful, we need to get the HWM from state
            state_key = inc.state_key or f"{config.name}_hwm"

            if self.state_manager:
                last_hwm = self.state_manager.get_hwm(state_key)
                if last_hwm is not None:
                    # Apply watermark_lag if configured
                    if inc.watermark_lag:
                        from odibi.utils.duration import parse_duration

                        lag_delta = parse_duration(inc.watermark_lag)
                        if lag_delta and isinstance(last_hwm, str):
                            try:
                                hwm_dt = datetime.fromisoformat(last_hwm)
                                last_hwm = (hwm_dt - lag_delta).isoformat()
                            except ValueError:
                                pass

                    # Format HWM for SQL compatibility (SQL Server doesn't like ISO 'T')
                    hwm_str = str(last_hwm)
                    if isinstance(last_hwm, str) and "T" in last_hwm:
                        try:
                            hwm_dt = datetime.fromisoformat(last_hwm)
                            hwm_str = hwm_dt.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
                        except ValueError:
                            hwm_str = last_hwm.replace("T", " ")

                    quoted_col = self._quote_sql_column(inc.column, sql_format)
                    if inc.fallback_column:
                        quoted_fallback = self._quote_sql_column(inc.fallback_column, sql_format)
                        return f"COALESCE({quoted_col}, {quoted_fallback}) > '{hwm_str}'"
                    else:
                        return f"{quoted_col} > '{hwm_str}'"

        return None

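    # Illustrative sketch (not part of the package source): the WHERE fragment the
    # stateful branch above produces against a SQL Server source, assuming a stored
    # HWM of "2024-05-01T08:30:00" and column "UpdatedAt" (values invented).
    #
    #     [UpdatedAt] > '2024-05-01 08:30:00.000'
    #
    # With fallback_column "CreatedAt" configured:
    #
    #     COALESCE([UpdatedAt], [CreatedAt]) > '2024-05-01 08:30:00.000'
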
    def _apply_incremental_filtering(
        self, df: Any, config: NodeConfig, hwm_state: Optional[Tuple[str, Any]]
    ) -> Tuple[Any, Optional[Tuple[str, Any]]]:
        """Apply incremental filtering and capture new HWM.

        Note: For SQL sources, filtering is done via SQL pushdown in _generate_incremental_sql_filter.
        This method handles non-SQL sources and HWM capture for stateful mode.
        """
        inc = config.read.incremental
        if not inc:
            return df, None

        # Skip in-memory filtering for SQL sources (already pushed down)
        if config.read.format in ["sql", "sql_server", "azure_sql"]:
            # Still need to capture HWM for stateful mode
            if inc.mode == IncrementalMode.STATEFUL:
                state_key = inc.state_key or f"{config.name}_hwm"
                new_max = self._get_column_max(df, inc.column)
                if new_max is not None:
                    return df, (state_key, new_max)
            return df, None

        # Smart Read Pattern: If target table doesn't exist, skip filtering (Full Load)
        if config.write:
            target_conn = self.connections.get(config.write.connection)
            # Use register_table if table is not set (path-based Delta with registration)
            table_to_check = config.write.table or config.write.register_table
            if target_conn and not self._cached_table_exists(
                target_conn, table_to_check, config.write.path
            ):
                # First Run detected -> Full Load
                # We still need to capture HWM if stateful!
                if inc.mode == IncrementalMode.STATEFUL:
                    state_key = inc.state_key or f"{config.name}_hwm"
                    new_max = self._get_column_max(df, inc.column)
                    if new_max is not None:
                        return df, (state_key, new_max)

                return df, None

        if inc.mode == IncrementalMode.ROLLING_WINDOW:
            if not inc.lookback or not inc.unit:
                return df, None

            # Calculate cutoff
            now = datetime.now()

            delta = None
            if inc.unit == "hour":
                delta = timedelta(hours=inc.lookback)
            elif inc.unit == "day":
                delta = timedelta(days=inc.lookback)
            elif inc.unit == "month":
                delta = timedelta(days=inc.lookback * 30)
            elif inc.unit == "year":
                delta = timedelta(days=inc.lookback * 365)

            if delta:
                cutoff = now - delta

                if inc.fallback_column:
                    if hasattr(self.engine, "filter_coalesce"):
                        # Inclusive comparison: keep rows within the last X units (>= cutoff).
                        df = self.engine.filter_coalesce(
                            df, inc.column, inc.fallback_column, ">=", cutoff
                        )
                    elif hasattr(self.engine, "filter_greater_than"):
                        df = self.engine.filter_greater_than(df, inc.column, cutoff)
                else:
                    if hasattr(self.engine, "filter_greater_than"):
                        # Note: engine.filter_greater_than is strictly >, while a rolling
                        # window conceptually wants >= cutoff. Existing tests only require
                        # rows newer than the cutoff to be kept, so strict > is acceptable.
                        df = self.engine.filter_greater_than(df, inc.column, cutoff)

        elif inc.mode == IncrementalMode.STATEFUL:
            # Check if we have state
            # hwm_state is (key, value)

            last_hwm = None
            state_key = inc.state_key or f"{config.name}_hwm"

            if hwm_state and hwm_state[0] == state_key:
                last_hwm = hwm_state[1]

            # Apply watermark_lag: subtract lag duration from HWM for late-arriving data
            if last_hwm is not None and inc.watermark_lag:
                lag_delta = parse_duration(inc.watermark_lag)
                if lag_delta:
                    ctx = get_logging_context()
                    ctx.debug(
                        f"Applying watermark_lag: {inc.watermark_lag}",
                        original_hwm=str(last_hwm),
                    )
                    # Parse string HWM to datetime if needed (HWM is stored as JSON string)
                    if isinstance(last_hwm, str):
                        try:
                            last_hwm = datetime.fromisoformat(last_hwm)
                        except ValueError:
                            ctx.warning(
                                f"Could not parse HWM '{last_hwm}' as datetime for watermark_lag"
                            )
                    # Subtract lag from HWM to handle late-arriving data
                    if hasattr(last_hwm, "__sub__"):
                        last_hwm = last_hwm - lag_delta
                        ctx.info(
                            "Watermark lag applied",
                            lag=inc.watermark_lag,
                            adjusted_hwm=str(last_hwm),
                        )
                        self._execution_steps.append(f"Applied watermark_lag: {inc.watermark_lag}")

            # Filter
            if last_hwm is not None:
                # Apply filter: col > last_hwm (with fallback if configured)
                if inc.fallback_column and hasattr(self.engine, "filter_coalesce"):
                    df = self.engine.filter_coalesce(
                        df, inc.column, inc.fallback_column, ">", last_hwm
                    )
                    self._execution_steps.append(
                        f"Incremental: Filtered COALESCE({inc.column}, "
                        f"{inc.fallback_column}) > {last_hwm}"
                    )
                else:
                    df = self.engine.filter_greater_than(df, inc.column, last_hwm)
                    self._execution_steps.append(f"Incremental: Filtered {inc.column} > {last_hwm}")

            # Capture new HWM (use fallback column if configured)
            new_max = self._get_column_max(df, inc.column, inc.fallback_column)

            if new_max is not None:
                return df, (state_key, new_max)

        return df, None

    def _execute_pre_sql(
        self,
        config: NodeConfig,
        ctx: Optional["LoggingContext"] = None,
    ) -> None:
        """Execute pre-SQL statements before node runs."""
        if ctx is None:
            ctx = get_logging_context()

        if not config.pre_sql:
            return

        ctx.info(f"Executing {len(config.pre_sql)} pre-SQL statement(s)")

        for i, sql in enumerate(config.pre_sql, 1):
            ctx.debug(f"Executing pre_sql [{i}/{len(config.pre_sql)}]", sql_preview=sql[:100])
            try:
                self.engine.execute_sql(sql, self.context)
                self._executed_sql.append(f"pre_sql[{i}]: {sql[:50]}...")
            except Exception as e:
                ctx.error(
                    "Pre-SQL statement failed",
                    statement_index=i,
                    error=str(e),
                )
                raise

        self._execution_steps.append(f"Executed {len(config.pre_sql)} pre-SQL statement(s)")

    def _execute_post_sql(
        self,
        config: NodeConfig,
        ctx: Optional["LoggingContext"] = None,
    ) -> None:
        """Execute post-SQL statements after node completes."""
        if ctx is None:
            ctx = get_logging_context()

        if not config.post_sql:
            return

        ctx.info(f"Executing {len(config.post_sql)} post-SQL statement(s)")

        for i, sql in enumerate(config.post_sql, 1):
            ctx.debug(f"Executing post_sql [{i}/{len(config.post_sql)}]", sql_preview=sql[:100])
            try:
                self.engine.execute_sql(sql, self.context)
                self._executed_sql.append(f"post_sql[{i}]: {sql[:50]}...")
            except Exception as e:
                ctx.error(
                    "Post-SQL statement failed",
                    statement_index=i,
                    error=str(e),
                )
                raise

        self._execution_steps.append(f"Executed {len(config.post_sql)} post-SQL statement(s)")

    def _execute_contracts_phase(
        self,
        config: NodeConfig,
        df: Any,
        ctx: Optional["LoggingContext"] = None,
    ) -> None:
        """Execute pre-condition contracts."""
        if ctx is None:
            ctx = get_logging_context()

        if config.contracts and df is not None:
            ctx.debug(
                "Starting contract validation",
                contract_count=len(config.contracts),
            )

            df = self.engine.materialize(df)

            from odibi.config import ValidationAction, ValidationConfig
            from odibi.validation.engine import Validator

            contract_config = ValidationConfig(mode=ValidationAction.FAIL, tests=config.contracts)

            validator = Validator()
            failures = validator.validate(df, contract_config, context={"columns": config.columns})

            if failures:
                ctx.error(
                    "Contract validation failed",
                    failures=failures,
                    contract_count=len(config.contracts),
                )
                failure_summary = "; ".join(
                    f"{f.get('test', 'unknown')}: {f.get('message', 'failed')}"
                    for f in failures[:3]
                )
                if len(failures) > 3:
                    failure_summary += f"; ... and {len(failures) - 3} more"
                raise ValidationError(
                    f"Node '{config.name}' contract validation failed with {len(failures)} error(s): {failure_summary}",
                    failures,
                )

            ctx.info(
                "Contract validation passed",
                contract_count=len(config.contracts),
            )
            self._execution_steps.append(f"Passed {len(config.contracts)} contract checks")

def _execute_transform_phase(
|
|
1148
|
+
self,
|
|
1149
|
+
config: NodeConfig,
|
|
1150
|
+
result_df: Optional[Any],
|
|
1151
|
+
input_df: Optional[Any],
|
|
1152
|
+
ctx: Optional["LoggingContext"] = None,
|
|
1153
|
+
input_dataframes: Optional[Dict[str, Any]] = None,
|
|
1154
|
+
) -> Optional[Any]:
|
|
1155
|
+
"""
|
|
1156
|
+
Execute transformer and transform steps.
|
|
1157
|
+
|
|
1158
|
+
Args:
|
|
1159
|
+
config: Node configuration
|
|
1160
|
+
result_df: Current result DataFrame
|
|
1161
|
+
input_df: Input DataFrame (for single-input nodes)
|
|
1162
|
+
ctx: Logging context
|
|
1163
|
+
input_dataframes: Dict of named DataFrames for multi-input nodes (inputs block)
|
|
1164
|
+
"""
|
|
1165
|
+
if ctx is None:
|
|
1166
|
+
ctx = get_logging_context()
|
|
1167
|
+
|
|
1168
|
+
input_dataframes = input_dataframes or {}
|
|
1169
|
+
|
|
1170
|
+
pii_meta = self._calculate_pii(config)
|
|
1171
|
+
        rows_before = self._count_rows(result_df) if result_df is not None else None
        schema_before = self._get_schema(result_df) if result_df is not None else None

        # Register named inputs in context for SQL access
        if input_dataframes:
            for name, df in input_dataframes.items():
                self.context.register(name, df)
            ctx.debug(
                f"Registered {len(input_dataframes)} named inputs for transforms",
                inputs=list(input_dataframes.keys()),
            )

        # Pattern Engine
        if config.transformer:
            if result_df is None and input_df is not None:
                result_df = input_df
                rows_before = self._count_rows(result_df)
                schema_before = self._get_schema(result_df)

            with ctx.operation(
                OperationType.PATTERN,
                f"transformer:{config.transformer}",
            ) as metrics:
                metrics.rows_in = rows_before
                if isinstance(schema_before, dict):
                    metrics.schema_before = schema_before

                is_pattern = False
                try:
                    from odibi.patterns import get_pattern_class

                    pattern_cls = get_pattern_class(config.transformer)
                    is_pattern = True

                    # Inject delta_table_properties into config.params for patterns that write Delta
                    pattern_config = config
                    delta_patterns = ("merge", "scd2", "dimension", "aggregation", "fact")
                    if self.performance_config and config.transformer in delta_patterns:
                        global_props = (
                            getattr(self.performance_config, "delta_table_properties", None) or {}
                        )
                        if global_props:
                            merged_params = dict(config.params) if config.params else {}
                            node_props = merged_params.get("table_properties") or {}
                            merged_params["table_properties"] = {**global_props, **node_props}
                            pattern_config = config.model_copy(update={"params": merged_params})

                    pattern = pattern_cls(self.engine, pattern_config)
                    pattern.validate()

                    engine_ctx = EngineContext(
                        context=self.context,
                        df=result_df,
                        engine_type=self.engine.name,
                        sql_executor=self.engine.execute_sql,
                        engine=self.engine,
                        pii_metadata=pii_meta,
                    )

                    result_df = pattern.execute(engine_ctx)
                    self._execution_steps.append(f"Applied pattern '{config.transformer}'")

                    if self.catalog_manager and config.write:
                        self.catalog_manager.log_pattern(
                            table_name=config.write.table or config.write.path,
                            pattern_type=config.transformer,
                            configuration=str(config.params),
                            compliance_score=1.0,
                        )

                except ValueError:
                    pass

                if not is_pattern:
                    result_df = self._execute_transformer_node(config, result_df, pii_meta)
                    self._execution_steps.append(f"Applied transformer '{config.transformer}'")

                    if self.catalog_manager and config.write:
                        self.catalog_manager.log_pattern(
                            table_name=config.write.table or config.write.path,
                            pattern_type=config.transformer,
                            configuration=str(config.params),
                            compliance_score=1.0,
                        )

                rows_after = self._count_rows(result_df) if result_df is not None else None
                schema_after = self._get_schema(result_df) if result_df is not None else None
                metrics.rows_out = rows_after
                if isinstance(schema_after, dict):
                    metrics.schema_after = schema_after

                if (
                    isinstance(rows_before, (int, float))
                    and isinstance(rows_after, (int, float))
                    and rows_before != rows_after
                ):
                    ctx.log_row_count_change(
                        rows_before, rows_after, operation=f"transformer:{config.transformer}"
                    )
                if (
                    isinstance(schema_before, dict)
                    and isinstance(schema_after, dict)
                    and schema_before != schema_after
                ):
                    ctx.log_schema_change(
                        schema_before, schema_after, operation=f"transformer:{config.transformer}"
                    )

        # Transform Steps
        if config.transform:
            if result_df is None and input_df is not None:
                result_df = input_df

            step_count = len(config.transform.steps)
            ctx.debug(f"Executing {step_count} transform steps")

            # Set current write path on engine for transforms that need it (e.g., detect_deletes)
            if config.write and config.write.path:
                self.engine._current_write_path = config.write.path
            elif config.write and config.write.table:
                self.engine._current_write_path = config.write.table

            result_df = self._execute_transform(config, result_df, pii_meta, ctx)
            self._execution_steps.append(f"Applied {step_count} transform steps")

        # Privacy Suite
        if config.privacy:
            pii_cols = [name for name, is_pii in pii_meta.items() if is_pii]
            if pii_cols:
                ctx.debug(f"Anonymizing {len(pii_cols)} PII columns", columns=pii_cols)
                result_df = self.engine.anonymize(
                    result_df,
                    pii_cols,
                    config.privacy.method,
                    config.privacy.salt,
                )
                self._execution_steps.append(f"Anonymized {len(pii_cols)} PII columns")

        return result_df

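    # NOTE (editorial, illustrative): a node-level `transformer:` is resolved in two
    # stages above -- the name is first looked up as a pattern via
    # odibi.patterns.get_pattern_class(); if that raises ValueError, execution falls
    # back to _execute_transformer_node(), which resolves it through FunctionRegistry.
    # A hypothetical node sketch (key names and params assumed, not taken from the
    # package docs):
    #
    #   name: orders_merge
    #   transformer: merge            # tried as a pattern first, then as a function
    #   params:
    #     table_properties: {"delta.autoOptimize.optimizeWrite": "true"}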
    def _execute_transformer_node(
        self, config: NodeConfig, df: Optional[Any], pii_metadata: Optional[Dict[str, bool]] = None
    ) -> Any:
        """Execute a top-level transformer (legacy)."""
        if df is not None:
            df = self.engine.materialize(df)

        func_name = config.transformer
        params = dict(config.params) if config.params else {}

        # Merge global delta_table_properties into merge transformer params
        if func_name == "merge" and self.performance_config:
            global_props = getattr(self.performance_config, "delta_table_properties", None) or {}
            node_props = params.get("table_properties") or {}
            merged_props = {**global_props, **node_props}
            if merged_props:
                params["table_properties"] = merged_props

        FunctionRegistry.validate_params(func_name, params)
        func = FunctionRegistry.get(func_name)
        sig = inspect.signature(func)

        engine_type = EngineType.PANDAS if self.engine.name == "pandas" else EngineType.SPARK
        engine_ctx = EngineContext(
            context=self.context,
            df=df,
            engine_type=engine_type,
            sql_executor=self.engine.execute_sql,
            engine=self.engine,
            pii_metadata=pii_metadata,
        )

        param_model = FunctionRegistry.get_param_model(func_name)
        call_kwargs = {}
        if "current" in sig.parameters:
            call_kwargs["current"] = df

        if param_model:
            params_obj = param_model(**params)
            result = func(engine_ctx, params_obj, **call_kwargs)
        else:
            result = func(engine_ctx, **params, **call_kwargs)

        if engine_ctx._sql_history:
            self._executed_sql.extend(engine_ctx._sql_history)

        if isinstance(result, EngineContext):
            return result.df
        return result

    def _execute_transform(
        self,
        config: NodeConfig,
        df: Any,
        pii_metadata: Optional[Dict[str, bool]] = None,
        ctx: Optional["LoggingContext"] = None,
    ) -> Any:
        """Execute transform steps."""
        if ctx is None:
            ctx = get_logging_context()

        current_df = df
        transform_config = config.transform

        if transform_config:
            total_steps = len(transform_config.steps)
            for step_idx, step in enumerate(transform_config.steps):
                step_name = self._get_step_name(step)
                rows_before = self._count_rows(current_df) if current_df is not None else None
                schema_before = self._get_schema(current_df) if current_df is not None else None

                try:
                    exec_context = ExecutionContext(
                        node_name=config.name,
                        config_file=self.config_file,
                        step_index=step_idx,
                        total_steps=total_steps,
                        previous_steps=self._execution_steps,
                    )

                    with ctx.operation(
                        OperationType.TRANSFORM,
                        f"step[{step_idx + 1}/{total_steps}]:{step_name}",
                    ) as metrics:
                        metrics.rows_in = rows_before
                        if isinstance(schema_before, dict):
                            metrics.schema_before = schema_before

                        if current_df is not None:
                            self.context.register("current_df", current_df)
                            self.context.register("df", current_df)

                        if isinstance(step, str):
                            current_df = self._execute_sql_step(step, current_df)
                        else:
                            if step.function:
                                current_df = self._execute_function_step(
                                    step.function, step.params, current_df, pii_metadata
                                )
                            elif step.operation:
                                current_df = self._execute_operation_step(
                                    step.operation, step.params, current_df
                                )
                            elif step.sql:
                                current_df = self._execute_sql_step(step.sql, current_df)
                            elif step.sql_file:
                                sql_content = self._resolve_sql_file(step.sql_file)
                                current_df = self._execute_sql_step(sql_content, current_df)
                            else:
                                step_repr = repr(step)[:100] if step else "None"
                                raise TransformError(
                                    f"Transform step {step_idx + 1}/{total_steps} is invalid. "
                                    f"Step config: {step_repr}. "
                                    f"Each step must have exactly one of: 'sql', 'sql_file', 'function', or 'operation'."
                                )

                        rows_after = (
                            self._count_rows(current_df) if current_df is not None else None
                        )
                        schema_after = (
                            self._get_schema(current_df) if current_df is not None else None
                        )
                        metrics.rows_out = rows_after
                        if isinstance(schema_after, dict):
                            metrics.schema_after = schema_after

                        if (
                            isinstance(rows_before, (int, float))
                            and isinstance(rows_after, (int, float))
                            and rows_before != rows_after
                        ):
                            ctx.log_row_count_change(rows_before, rows_after, operation=step_name)

                        if (
                            isinstance(schema_before, dict)
                            and isinstance(schema_after, dict)
                            and schema_before != schema_after
                        ):
                            ctx.log_schema_change(schema_before, schema_after, operation=step_name)

                except Exception as e:
                    schema_dict = self._get_schema(current_df) if current_df is not None else {}
                    schema = (
                        list(schema_dict.keys()) if isinstance(schema_dict, dict) else schema_dict
                    )
                    shape = self._get_shape(current_df) if current_df is not None else None

                    exec_context.input_schema = schema
                    exec_context.input_shape = shape

                    suggestions = self._generate_suggestions(e, config)

                    ctx.error(
                        f"Transform step failed: {step_name}",
                        step_index=step_idx,
                        total_steps=total_steps,
                        error_type=type(e).__name__,
                        error_message=str(e),
                    )
                    if suggestions:
                        ctx.info(f"Suggestions: {'; '.join(suggestions)}")

                    raise NodeExecutionError(
                        message=str(e),
                        context=exec_context,
                        original_error=e,
                        suggestions=suggestions,
                    )

        return current_df

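    # NOTE (editorial, illustrative): _execute_transform() accepts four step shapes --
    # a bare SQL string, or an object with exactly one of `sql`, `sql_file`,
    # `function`, or `operation`. A hypothetical steps block (YAML spelling and the
    # function/operation names are assumptions, not taken from the package docs):
    #
    #   transform:
    #     steps:
    #       - sql: "SELECT * FROM df WHERE status = 'active'"
    #       - sql_file: sql/cleanup.sql        # resolved relative to the pipeline YAML
    #       - function: my_custom_transform    # resolved via FunctionRegistry
    #         params: {}
    #       - operation: drop_duplicates       # dispatched to engine.execute_operation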
    def _get_step_name(self, step: Any) -> str:
        """Get human-readable name for a transform step."""
        if isinstance(step, str):
            return f"sql:{step[:50]}..." if len(step) > 50 else f"sql:{step}"
        if hasattr(step, "function") and step.function:
            return f"function:{step.function}"
        if hasattr(step, "operation") and step.operation:
            return f"operation:{step.operation}"
        if hasattr(step, "sql") and step.sql:
            sql_preview = step.sql[:50] + "..." if len(step.sql) > 50 else step.sql
            return f"sql:{sql_preview}"
        if hasattr(step, "sql_file") and step.sql_file:
            return f"sql_file:{step.sql_file}"
        return "unknown"

    def _execute_sql_step(self, sql: str, current_df: Any = None) -> Any:
        """Execute SQL transformation with thread-safe view names.

        Uses unique temp view names to avoid race conditions when
        multiple nodes execute SQL steps in parallel.

        Args:
            sql: SQL query string (references to 'df' are replaced with unique view)
            current_df: DataFrame to register as the source for 'df' references

        Returns:
            Result DataFrame from SQL execution
        """
        self._executed_sql.append(sql)

        if current_df is not None:
            view_name = _get_unique_view_name()
            self.context.register(view_name, current_df)
            try:
                safe_sql = re.sub(r"\bdf\b", view_name, sql)
                return self.engine.execute_sql(safe_sql, self.context)
            finally:
                self.context.unregister(view_name)
        else:
            return self.engine.execute_sql(sql, self.context)

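    # NOTE (editorial, illustrative): _execute_sql_step() registers the incoming
    # DataFrame under a unique temp view and rewrites whole-word `df` references to
    # that name before executing, so parallel nodes never collide on a shared view.
    # Hypothetical before/after (the real view name comes from _get_unique_view_name()
    # and will differ):
    #
    #   in:  SELECT id, amount FROM df WHERE amount > 0
    #   out: SELECT id, amount FROM tmp_view_3f9c WHERE amount > 0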
    def _resolve_sql_file(self, sql_file_path: str) -> str:
        """Load SQL content from external file.

        Args:
            sql_file_path: Path to .sql file, relative to main config file.

        Returns:
            SQL content as string.

        Raises:
            FileNotFoundError: If the SQL file does not exist.
            ValueError: If the file cannot be read.
        """
        if not self.config_file:
            raise ValueError(
                f"Cannot resolve sql_file '{sql_file_path}': The config_file path is not available. "
                f"This happens when a pipeline is created programmatically without a YAML source. "
                f"Solutions: 1) Load pipeline from YAML using load_config_from_file(), or 2) Use inline 'sql:' instead of 'sql_file:'."
            )

        config_dir = Path(self.config_file).parent
        file_path = config_dir / sql_file_path

        if not file_path.exists():
            raise FileNotFoundError(
                f"SQL file not found: '{sql_file_path}'. "
                f"Looked in: {file_path.absolute()}. "
                f"The path is resolved relative to the YAML config file at: {config_dir.absolute()}. "
                f"Check: 1) The file exists at the expected location. 2) The path is relative to your pipeline YAML, not project.yaml."
            )

        try:
            return file_path.read_text(encoding="utf-8")
        except Exception as e:
            raise ValueError(
                f"Failed to read SQL file '{sql_file_path}' at {file_path.absolute()}. "
                f"Error: {type(e).__name__}: {e}. "
                f"Check file permissions and encoding (must be UTF-8)."
            ) from e

    def _execute_function_step(
        self,
        function_name: str,
        params: Dict[str, Any],
        current_df: Optional[Any],
        pii_metadata: Optional[Dict[str, bool]] = None,
    ) -> Any:
        """Execute Python function transformation."""
        if current_df is not None:
            current_df = self.engine.materialize(current_df)

        # Merge global delta_table_properties into merge transformer params
        if function_name == "merge" and self.performance_config:
            global_props = getattr(self.performance_config, "delta_table_properties", None) or {}
            node_props = params.get("table_properties") or {}
            merged_props = {**global_props, **node_props}
            if merged_props:
                params = dict(params)  # Don't mutate original
                params["table_properties"] = merged_props

        FunctionRegistry.validate_params(function_name, params)
        func = FunctionRegistry.get(function_name)
        sig = inspect.signature(func)

        engine_type = EngineType.PANDAS if self.engine.name == "pandas" else EngineType.SPARK
        engine_ctx = EngineContext(
            context=self.context,
            df=current_df,
            engine_type=engine_type,
            sql_executor=self.engine.execute_sql,
            engine=self.engine,
            pii_metadata=pii_metadata,
        )

        param_model = FunctionRegistry.get_param_model(function_name)
        call_kwargs = {}

        if "current" in sig.parameters:
            call_kwargs["current"] = current_df

        if param_model:
            try:
                params_obj = param_model(**params)
            except Exception as e:
                raise ValueError(f"Invalid parameters for '{function_name}': {e}")

            result = func(engine_ctx, params_obj, **call_kwargs)
        else:
            result = func(engine_ctx, **params, **call_kwargs)

        if engine_ctx._sql_history:
            self._executed_sql.extend(engine_ctx._sql_history)

        if isinstance(result, EngineContext):
            return result.df

        return result

    def _execute_operation_step(
        self, operation: str, params: Dict[str, Any], current_df: Any
    ) -> Any:
        """Execute built-in operation."""
        if current_df is not None:
            current_df = self.engine.materialize(current_df)
        return self.engine.execute_operation(operation, params, current_df)

    def _execute_validation_phase(
        self,
        config: NodeConfig,
        result_df: Any,
        ctx: Optional["LoggingContext"] = None,
    ) -> Any:
        """Execute validation with quarantine and gate support.

        Returns:
            DataFrame (valid rows only if quarantine is used)
        """
        if ctx is None:
            ctx = get_logging_context()

        if not config.validation or result_df is None:
            return result_df

        test_count = len(config.validation.tests)
        ctx.debug("Starting validation phase", test_count=test_count)

        with ctx.operation(OperationType.VALIDATE, f"validation:{config.name}") as metrics:
            rows_before = self._count_rows(result_df)
            metrics.rows_in = rows_before

            result_df = self.engine.materialize(result_df)

            for test in config.validation.tests:
                if test.type == "volume_drop" and self.catalog_manager:
                    avg_rows = self.catalog_manager.get_average_volume(
                        config.name, days=test.lookback_days
                    )
                    if avg_rows and avg_rows > 0:
                        current_rows = self._count_rows(result_df)
                        drop_pct = (avg_rows - current_rows) / avg_rows
                        if drop_pct > test.threshold:
                            ctx.error(
                                "Volume drop validation failed",
                                drop_percentage=f"{drop_pct:.1%}",
                                threshold=f"{test.threshold:.1%}",
                                current_rows=current_rows,
                                average_rows=avg_rows,
                            )
                            raise ValidationError(
                                config.name,
                                [
                                    f"Volume dropped by {drop_pct:.1%} "
                                    f"(Threshold: {test.threshold:.1%})"
                                ],
                            )

            from odibi.validation.quarantine import (
                add_quarantine_metadata,
                has_quarantine_tests,
                split_valid_invalid,
                write_quarantine,
            )

            validation_config = config.validation
            quarantine_config = validation_config.quarantine
            has_quarantine = has_quarantine_tests(validation_config.tests)

            test_results: dict = {}

            if has_quarantine and quarantine_config:
                quarantine_result = split_valid_invalid(
                    result_df,
                    validation_config.tests,
                    self.engine,
                )

                if quarantine_result.rows_quarantined > 0:
                    import uuid

                    run_id = str(uuid.uuid4())
                    invalid_with_meta = add_quarantine_metadata(
                        quarantine_result.invalid_df,
                        quarantine_result.test_results,
                        quarantine_config.add_columns,
                        self.engine,
                        config.name,
                        run_id,
                        validation_config.tests,
                    )

                    write_quarantine(
                        invalid_with_meta,
                        quarantine_config,
                        self.engine,
                        self.connections,
                    )

                    ctx.warning(
                        f"Quarantined {quarantine_result.rows_quarantined} rows",
                        quarantine_path=quarantine_config.path or quarantine_config.table,
                        rows_quarantined=quarantine_result.rows_quarantined,
                    )

                    self._execution_steps.append(
                        f"Quarantined {quarantine_result.rows_quarantined} rows to "
                        f"{quarantine_config.path or quarantine_config.table}"
                    )

                result_df = quarantine_result.valid_df
                test_results = quarantine_result.test_results

            # Run standard validation on remaining rows
            self._execute_validation(config, result_df)

            # Check quality gate
            if validation_config.gate:
                result_df = self._check_gate(config, result_df, test_results, validation_config.gate)

        return result_df

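    # NOTE (editorial): the validation phase above runs in three layers -- an optional
    # volume_drop check against catalog history, a quarantine split that writes failing
    # rows aside via odibi.validation.quarantine, and a quality gate evaluated over the
    # per-test results. When quarantine is configured, only the valid rows continue to
    # the write phase.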
    def _execute_validation(self, config: NodeConfig, df: Any) -> None:
        """Execute validation rules."""
        from odibi.config import ValidationAction
        from odibi.validation.engine import Validator

        validation_config = config.validation
        validator = Validator()
        failures = validator.validate(df, validation_config)

        # Observability: Log metrics (validation failures)
        if self.catalog_manager:
            # We can register these tests as metrics if we want, or just log failures.
            # For now, we rely on logging validation failures to meta_runs metrics_json
            # which is done via result metadata.
            pass

        if failures:
            if validation_config.mode == ValidationAction.FAIL:
                raise ValidationError(config.name, failures)
            elif validation_config.mode == ValidationAction.WARN:
                import logging

                logger = logging.getLogger(__name__)
                for fail in failures:
                    logger.warning(f"Validation Warning (Node {config.name}): {fail}")
                    self._execution_steps.append(f"Warning: {fail}")
                    self._validation_warnings.append(fail)

    def _check_gate(
        self,
        config: NodeConfig,
        df: Any,
        test_results: dict,
        gate_config: Any,
    ) -> Any:
        """Check quality gate and take action if failed.

        Args:
            config: Node configuration
            df: DataFrame to check
            test_results: Dict of test_name -> per-row boolean results
            gate_config: GateConfig

        Returns:
            DataFrame (potentially filtered if gate action is WRITE_VALID_ONLY)

        Raises:
            GateFailedError: If gate fails and action is ABORT
        """
        from odibi.config import GateOnFail
        from odibi.exceptions import GateFailedError
        from odibi.validation.gate import evaluate_gate

        gate_result = evaluate_gate(
            df,
            test_results,
            gate_config,
            self.engine,
            catalog=self.catalog_manager,
            node_name=config.name,
        )

        if gate_result.passed:
            self._execution_steps.append(f"Gate passed: {gate_result.pass_rate:.1%} pass rate")
            return df

        self._execution_steps.append(
            f"Gate failed: {gate_result.pass_rate:.1%} pass rate "
            f"(required: {gate_config.require_pass_rate:.1%})"
        )

        if gate_result.action == GateOnFail.ABORT:
            raise GateFailedError(
                node_name=config.name,
                pass_rate=gate_result.pass_rate,
                required_rate=gate_config.require_pass_rate,
                failed_rows=gate_result.failed_rows,
                total_rows=gate_result.total_rows,
                failure_reasons=gate_result.failure_reasons,
            )

        elif gate_result.action == GateOnFail.WARN_AND_WRITE:
            import logging

            logger = logging.getLogger(__name__)
            for reason in gate_result.failure_reasons:
                logger.warning(f"Gate Warning (Node {config.name}): {reason}")
                self._validation_warnings.append(f"Gate: {reason}")
            return df

        elif gate_result.action == GateOnFail.WRITE_VALID_ONLY:
            self._execution_steps.append(
                f"Writing only valid rows ({gate_result.passed_rows} of {gate_result.total_rows})"
            )
            return df

        return df

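    # NOTE (editorial, illustrative): gate outcomes map to GateOnFail.ABORT (raise
    # GateFailedError), WARN_AND_WRITE (log and keep all rows), and WRITE_VALID_ONLY
    # (proceed with the already-filtered rows). A hypothetical gate block (YAML key
    # spelling assumed, not taken from the package docs):
    #
    #   validation:
    #     gate:
    #       require_pass_rate: 0.95
    #       on_fail: warn_and_write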
    def _determine_write_mode(self, config: NodeConfig) -> Optional[WriteMode]:
        """Determine write mode."""
        if not config.write or config.write.first_run_query is None:
            return None

        write_config = config.write
        target_connection = self.connections.get(write_config.connection)

        if target_connection is None:
            return None

        table_exists = self._cached_table_exists(
            target_connection, table=write_config.table, path=write_config.path
        )

        if not table_exists:
            return WriteMode.OVERWRITE

        return None

    def _execute_write_phase(
        self,
        config: NodeConfig,
        df: Any,
        override_mode: Optional[WriteMode] = None,
        ctx: Optional[LoggingContext] = None,
    ) -> None:
        """Execute write operation."""
        if ctx is None:
            ctx = get_logging_context()

        if not config.write:
            return

        write_config = config.write
        connection = self.connections.get(write_config.connection)

        if connection is None:
            raise ValueError(f"Connection '{write_config.connection}' not found.")

        # For Delta writes, defer row count to avoid double DAG execution.
        # We'll extract row count from Delta commit metadata after write.
        # For non-Delta formats, count upfront as before.
        defer_row_count = write_config.format == "delta" and df is not None
        row_count = None if defer_row_count else (self._count_rows(df) if df is not None else 0)
        mode = override_mode if override_mode is not None else write_config.mode

        with ctx.operation(
            OperationType.WRITE,
            f"target:{write_config.connection}",
            format=write_config.format,
            table=write_config.table,
            path=write_config.path,
            mode=str(mode) if mode else None,
        ) as metrics:
            metrics.rows_in = row_count

            if write_config.skip_if_unchanged and df is not None:
                skip_result = self._check_skip_if_unchanged(config, df, connection)
                if skip_result["should_skip"]:
                    self._execution_steps.append(
                        f"Skipped write: content unchanged (hash: {skip_result['hash'][:12]}...)"
                    )
                    ctx.info(
                        "Skipping write - content unchanged",
                        content_hash=skip_result["hash"][:12],
                    )
                    return

            if config.schema_policy and df is not None:
                target_schema = self.engine.get_table_schema(
                    connection=connection,
                    table=write_config.table,
                    path=write_config.path,
                    format=write_config.format,
                )
                if target_schema:
                    df = self.engine.harmonize_schema(df, target_schema, config.schema_policy)
                    ctx.debug("Applied schema harmonization")
                    self._execution_steps.append("Applied Schema Policy (Harmonization)")

            if write_config.add_metadata and df is not None:
                df = self._add_write_metadata(config, df)
                self._execution_steps.append("Added Bronze metadata columns")

            write_options = write_config.options.copy() if write_config.options else {}
            deep_diag = write_options.pop("deep_diagnostics", False)
            diff_keys = write_options.pop("diff_keys", None)

            # Extract partition_by from WriteConfig and add to write_options
            if write_config.partition_by:
                write_options["partition_by"] = write_config.partition_by
                ctx.debug("Partitioning by", columns=write_config.partition_by)
                self._execution_steps.append(f"Partition by: {write_config.partition_by}")

            # Extract zorder_by from WriteConfig and add to write_options (Delta only)
            if write_config.zorder_by:
                if write_config.format == "delta":
                    write_options["zorder_by"] = write_config.zorder_by
                    ctx.debug("Z-Ordering by", columns=write_config.zorder_by)
                    self._execution_steps.append(f"Z-Order by: {write_config.zorder_by}")
                else:
                    ctx.warning(
                        "zorder_by is only supported for Delta format, ignoring",
                        format=write_config.format,
                    )

            # Extract merge_schema from WriteConfig (Delta schema evolution)
            if write_config.merge_schema:
                if write_config.format == "delta":
                    write_options["mergeSchema"] = True
                    ctx.debug("Schema evolution enabled (mergeSchema=true)")
                    self._execution_steps.append("Schema evolution enabled (mergeSchema)")
                else:
                    # For Spark with other formats, use schema_mode if applicable
                    write_options["schema_mode"] = "merge"
                    ctx.debug("Schema merge mode enabled")
                    self._execution_steps.append("Schema merge mode enabled")

            # Extract merge_keys and merge_options from WriteConfig (SQL Server MERGE)
            if write_config.merge_keys:
                write_options["merge_keys"] = write_config.merge_keys
                ctx.debug("Merge keys configured", keys=write_config.merge_keys)
            if write_config.merge_options:
                write_options["merge_options"] = write_config.merge_options
                ctx.debug("Merge options configured")

            if write_config.format == "delta":
                merged_props = {}
                if self.performance_config and hasattr(
                    self.performance_config, "delta_table_properties"
                ):
                    merged_props.update(self.performance_config.delta_table_properties or {})
                if write_config.table_properties:
                    merged_props.update(write_config.table_properties)
                if merged_props:
                    write_options["table_properties"] = merged_props

            # Handle materialized strategy
            if config.materialized:
                if config.materialized == "view":
                    # Create a view instead of writing to table
                    if write_config.table and hasattr(self.engine, "create_view"):
                        ctx.info(f"Creating view: {write_config.table}")
                        self.engine.create_view(
                            df=df,
                            view_name=write_config.table,
                            connection=connection,
                        )
                        self._execution_steps.append(f"Created view: {write_config.table}")
                        ctx.info(
                            f"View created: {write_config.table}",
                            materialized="view",
                            rows=row_count,
                        )
                        return
                    else:
                        ctx.warning(
                            "View materialization requires table name and engine support",
                            table=write_config.table,
                        )
                elif config.materialized == "incremental":
                    # Use append mode for incremental materialization
                    mode = WriteMode.APPEND
                    ctx.debug("Using append mode for incremental materialization")
                    self._execution_steps.append("Materialized: incremental (append mode)")
                elif config.materialized == "table":
                    # Default table write behavior
                    ctx.debug("Using table materialization (default write)")
                    self._execution_steps.append("Materialized: table")

            delta_info = self.engine.write(
                df=df,
                connection=connection,
                format=write_config.format,
                table=write_config.table,
                path=write_config.path,
                register_table=write_config.register_table,
                mode=mode,
                options=write_options,
                streaming_config=write_config.streaming,
            )

            # Extract row count from Delta commit metadata if deferred
            if defer_row_count:
                if delta_info:
                    op_metrics = delta_info.get("operation_metrics") or {}
                    # Delta returns numOutputRows for most operations
                    row_count = op_metrics.get("numOutputRows") or op_metrics.get(
                        "numTargetRowsInserted"
                    )
                    if row_count is not None:
                        try:
                            row_count = int(row_count)
                        except (ValueError, TypeError):
                            row_count = None
                # Fallback: count if Delta metrics unavailable (e.g., older Delta versions)
                if row_count is None:
                    ctx.debug("Delta commit metrics unavailable, falling back to count")
                    row_count = self._count_rows(df) if df is not None else 0

            metrics.rows_out = row_count

            ctx.info(
                f"Write completed to {write_config.connection}",
                format=write_config.format,
                table=write_config.table,
                path=write_config.path,
                mode=str(mode) if mode else None,
                rows=row_count,
            )

            if write_config.auto_optimize and write_config.format == "delta":
                opt_config = write_config.auto_optimize
                if isinstance(opt_config, bool):
                    if opt_config:
                        from odibi.config import AutoOptimizeConfig

                        opt_config = AutoOptimizeConfig(enabled=True)
                    else:
                        opt_config = None

                if opt_config:
                    ctx.debug("Running auto-optimize on Delta table")
                    self.engine.maintain_table(
                        connection=connection,
                        format=write_config.format,
                        table=write_config.table,
                        path=write_config.path,
                        config=opt_config,
                    )

            if delta_info:
                self._delta_write_info = delta_info
                self._calculate_delta_diagnostics(
                    delta_info, connection, write_config, deep_diag, diff_keys
                )

            # Store row count from write phase to avoid redundant counting in metadata
            if self._delta_write_info is None:
                self._delta_write_info = {}
            self._delta_write_info["_cached_row_count"] = row_count

            if write_config.skip_if_unchanged and write_config.format == "delta":
                self._store_content_hash_after_write(config, connection)

            # Phase 3: Catalog integration after successful write
            # Skip if performance config disables catalog writes
            skip_catalog = self.performance_config and getattr(
                self.performance_config, "skip_catalog_writes", False
            )
            if not skip_catalog:
                self._register_catalog_entries(config, df, connection, write_config, ctx)
            else:
                ctx.debug("Skipping catalog writes (skip_catalog_writes=true)")

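    # NOTE (editorial, illustrative): the write phase honours several WriteConfig
    # switches referenced above. A hypothetical Delta write block follows; the key
    # names come from the attributes used in _execute_write_phase, but the exact YAML
    # schema and all values are assumptions:
    #
    #   write:
    #     connection: adls_bronze
    #     format: delta
    #     path: bronze/orders
    #     mode: append
    #     partition_by: [ingest_date]
    #     zorder_by: [order_id]        # Delta only; ignored with a warning otherwise
    #     merge_schema: true           # becomes options["mergeSchema"] = True
    #     skip_if_unchanged: true      # hash-compare against the state backend
    #     auto_optimize: true          # runs engine.maintain_table after the write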
    def _register_catalog_entries(
        self,
        config: NodeConfig,
        df: Any,
        connection: Any,
        write_config: Any,
        ctx: Optional["LoggingContext"] = None,
    ) -> None:
        """Register catalog entries after successful write.

        Handles Phase 3.2-3.5: register_asset, track_schema, log_pattern, record_lineage

        When batch_write_buffers is provided, records are buffered for batch write
        at the end of pipeline execution to eliminate concurrency conflicts.
        """
        if not self.catalog_manager:
            return

        if ctx is None:
            ctx = get_logging_context()

        import uuid

        run_id = str(uuid.uuid4())

        # Check if we should buffer writes for batch processing
        use_batch_mode = (
            self.batch_write_buffers is not None
            and "lineage" in self.batch_write_buffers
            and "assets" in self.batch_write_buffers
        )

        # Determine table path
        table_path = None
        if hasattr(connection, "get_path"):
            table_path = connection.get_path(write_config.path or write_config.table)
        else:
            table_path = write_config.path or write_config.table

        # 3.2: Register asset (meta_tables)
        try:
            project_name = "unknown"
            if hasattr(self, "project_config") and self.project_config:
                project_name = getattr(self.project_config, "project", "unknown")

            table_name = write_config.table or config.name
            pattern_type = config.materialized or "table"

            schema_hash = ""
            if df is not None:
                schema = self._get_schema(df)
                if isinstance(schema, dict):
                    import hashlib
                    import json

                    schema_hash = hashlib.md5(
                        json.dumps(schema, sort_keys=True).encode()
                    ).hexdigest()

            asset_record = {
                "project_name": project_name,
                "table_name": table_name,
                "path": table_path or "",
                "format": write_config.format or "delta",
                "pattern_type": pattern_type,
                "schema_hash": schema_hash,
            }

            if use_batch_mode:
                self.batch_write_buffers["assets"].append(asset_record)
                ctx.debug(f"Buffered asset for batch write: {table_name}")
            else:
                self.catalog_manager.register_asset(**asset_record)
                ctx.debug(f"Registered asset: {table_name}")

        except Exception as e:
            ctx.debug(f"Failed to register asset: {e}")

        # 3.3: Track schema changes (meta_schemas)
        try:
            if df is not None and table_path:
                schema = self._get_schema(df)
                if isinstance(schema, dict):
                    pipeline_name = self.pipeline_name or (
                        config.tags[0] if config.tags else "unknown"
                    )
                    self.catalog_manager.track_schema(
                        table_path=table_path,
                        schema=schema,
                        pipeline=pipeline_name,
                        node=config.name,
                        run_id=run_id,
                    )
                    ctx.debug(f"Tracked schema for: {table_path}")

        except Exception as e:
            ctx.debug(f"Failed to track schema: {e}")

        # 3.4: Log pattern usage (meta_patterns)
        try:
            if config.materialized:
                import json

                pattern_config = {
                    "materialized": config.materialized,
                    "format": write_config.format,
                    "mode": str(write_config.mode) if write_config.mode else None,
                }
                table_name = write_config.table or config.name
                self.catalog_manager.log_pattern(
                    table_name=table_name,
                    pattern_type=config.materialized,
                    configuration=json.dumps(pattern_config),
                    compliance_score=1.0,
                )
                ctx.debug(f"Logged pattern: {config.materialized}")

        except Exception as e:
            ctx.debug(f"Failed to log pattern: {e}")

        # 3.5: Record lineage (meta_lineage)
        try:
            if config.read and table_path:
                source_path = None
                read_config = config.read
                read_conn = self.connections.get(read_config.connection)
                if read_conn and hasattr(read_conn, "get_path"):
                    source_path = read_conn.get_path(read_config.path or read_config.table)
                else:
                    source_path = read_config.path or read_config.table

                if source_path:
                    pipeline_name = self.pipeline_name or (
                        config.tags[0] if config.tags else "unknown"
                    )
                    lineage_record = {
                        "source_table": source_path,
                        "target_table": table_path,
                        "target_pipeline": pipeline_name,
                        "target_node": config.name,
                        "run_id": run_id,
                    }

                    if use_batch_mode:
                        self.batch_write_buffers["lineage"].append(lineage_record)
                        ctx.debug(
                            f"Buffered lineage for batch write: {source_path} -> {table_path}"
                        )
                    else:
                        self.catalog_manager.record_lineage(**lineage_record)
                        ctx.debug(f"Recorded lineage: {source_path} -> {table_path}")

        except Exception as e:
            ctx.debug(f"Failed to record lineage: {e}")

    def _add_write_metadata(self, config: NodeConfig, df: Any) -> Any:
        """Add Bronze metadata columns to DataFrame before writing.

        Args:
            config: Node configuration containing read/write settings
            df: DataFrame to add metadata to

        Returns:
            DataFrame with metadata columns added
        """
        write_config = config.write
        read_config = config.read

        # Determine source info from read config
        source_connection = None
        source_table = None
        source_path = None
        is_file_source = False

        if read_config:
            source_connection = read_config.connection
            source_table = read_config.table

            # Determine if file source based on format
            read_format = str(read_config.format).lower()
            file_formats = {"csv", "parquet", "json", "avro", "excel"}
            is_file_source = read_format in file_formats

            if is_file_source:
                source_path = read_config.path

        # Call engine's metadata helper
        return self.engine.add_write_metadata(
            df=df,
            metadata_config=write_config.add_metadata,
            source_connection=source_connection,
            source_table=source_table,
            source_path=source_path,
            is_file_source=is_file_source,
        )

    def _check_skip_if_unchanged(
        self,
        config: NodeConfig,
        df: Any,
        connection: Any,
    ) -> Dict[str, Any]:
        """Check if write should be skipped due to unchanged content.

        Args:
            config: Node configuration
            df: DataFrame to check
            connection: Target connection

        Returns:
            Dict with 'should_skip' (bool) and 'hash' (str)
        """
        write_config = config.write
        format_str = str(write_config.format).lower()

        if format_str != "delta":
            from odibi.utils.logging import logger

            logger.warning(
                f"[{config.name}] skip_if_unchanged only supported for Delta format, "
                f"got '{format_str}'. Proceeding with write."
            )
            return {"should_skip": False, "hash": None}

        from odibi.enums import EngineType
        from odibi.utils.content_hash import get_content_hash_from_state

        engine_type = EngineType.SPARK if self.engine.name == "spark" else EngineType.PANDAS
        if engine_type == EngineType.SPARK:
            from odibi.utils.content_hash import compute_spark_dataframe_hash

            current_hash = compute_spark_dataframe_hash(
                df,
                columns=write_config.skip_hash_columns,
                sort_columns=write_config.skip_hash_sort_columns,
            )
        else:
            from odibi.utils.content_hash import compute_dataframe_hash

            pandas_df = df
            if hasattr(df, "to_pandas"):
                pandas_df = df.to_pandas()

            current_hash = compute_dataframe_hash(
                pandas_df,
                columns=write_config.skip_hash_columns,
                sort_columns=write_config.skip_hash_sort_columns,
            )

        table_name = write_config.table or write_config.path
        state_backend = (
            getattr(self.state_manager, "backend", None) if hasattr(self, "state_manager") else None
        )
        previous_hash = get_content_hash_from_state(state_backend, config.name, table_name)

        if previous_hash and current_hash == previous_hash:
            # Before skipping, verify the target actually exists
            # If target was deleted, we must write even if hash matches
            target_exists = self._check_target_exists(write_config, connection)
            if not target_exists:
                from odibi.utils.logging_context import get_logging_context

                ctx = get_logging_context()
                ctx.warning(
                    f"[{config.name}] Target does not exist despite matching hash, "
                    "proceeding with write"
                )
                self._pending_content_hash = current_hash
                return {"should_skip": False, "hash": current_hash}
            return {"should_skip": True, "hash": current_hash}

        self._pending_content_hash = current_hash
        return {"should_skip": False, "hash": current_hash}

    def _store_content_hash_after_write(
        self,
        config: NodeConfig,
        connection: Any,
    ) -> None:
        """Store content hash in state catalog after successful write."""
        if not hasattr(self, "_pending_content_hash") or not self._pending_content_hash:
            return

        write_config = config.write
        content_hash = self._pending_content_hash

        from odibi.utils.content_hash import set_content_hash_in_state

        try:
            table_name = write_config.table or write_config.path
            state_backend = (
                getattr(self.state_manager, "backend", None)
                if hasattr(self, "state_manager")
                else None
            )

            set_content_hash_in_state(state_backend, config.name, table_name, content_hash)

            from odibi.utils.logging import logger

            logger.debug(f"[{config.name}] Stored content hash: {content_hash[:12]}...")
        except Exception as e:
            from odibi.utils.logging import logger

            logger.warning(f"[{config.name}] Failed to store content hash: {e}")
        finally:
            self._pending_content_hash = None

    def _check_target_exists(self, write_config: Any, connection: Any) -> bool:
        """Check if the target table or path exists.

        Used by skip_if_unchanged to verify target wasn't deleted.

        Args:
            write_config: Write configuration with table/path info
            connection: Target connection

        Returns:
            True if target exists, False otherwise
        """
        try:
            if write_config.table:
                # Table-based target
                if hasattr(self.engine, "spark"):
                    return self.engine.spark.catalog.tableExists(write_config.table)
                return True  # Assume exists for non-Spark engines

            if write_config.path:
                # Path-based Delta target
                full_path = connection.get_path(write_config.path)
                if hasattr(self.engine, "spark"):
                    try:
                        from delta.tables import DeltaTable

                        return DeltaTable.isDeltaTable(self.engine.spark, full_path)
                    except Exception:
                        # Fallback: check if path exists
                        try:
                            self.engine.spark.read.format("delta").load(full_path).limit(0)
                            return True
                        except Exception:
                            return False
                return True  # Assume exists for non-Spark engines

            return True  # No table or path specified, assume exists
        except Exception:
            return False  # On any error, assume doesn't exist (safer to write)

    def _calculate_delta_diagnostics(
        self,
        delta_info: Dict[str, Any],
        connection: Any,
        write_config: Any,
        deep_diag: bool,
        diff_keys: Optional[List[str]],
    ) -> None:
        """Calculate Delta Lake diagnostics/diff."""
        ver = delta_info.get("version", 0)
        if isinstance(ver, int) and ver > 0:
            try:
                from odibi.diagnostics import get_delta_diff

                full_path = connection.get_path(write_config.path) if write_config.path else None

                if full_path:
                    spark_session = getattr(self.engine, "spark", None)
                    curr_ver = delta_info["version"]
                    prev_ver = curr_ver - 1

                    if deep_diag:
                        diff = get_delta_diff(
                            table_path=full_path,
                            version_a=prev_ver,
                            version_b=curr_ver,
                            spark=spark_session,
                            deep=True,
                            keys=diff_keys,
                        )
                        self._delta_write_info["data_diff"] = {
                            "rows_change": diff.rows_change,
                            "rows_added": diff.rows_added,
                            "rows_removed": diff.rows_removed,
                            "rows_updated": diff.rows_updated,
                            "schema_added": diff.schema_added,
                            "schema_removed": diff.schema_removed,
                            "schema_previous": diff.schema_previous,
                            "sample_added": diff.sample_added,
                            "sample_removed": diff.sample_removed,
                            "sample_updated": diff.sample_updated,
                        }
                    else:
                        metrics = delta_info.get("operation_metrics", {})
                        rows_inserted = int(
                            metrics.get("numTargetRowsInserted", 0)
                            or metrics.get("numOutputRows", 0)
                        )
                        rows_deleted = int(metrics.get("numTargetRowsDeleted", 0))
                        net_change = rows_inserted - rows_deleted
                        self._delta_write_info["data_diff"] = {
                            "rows_change": net_change,
                            "sample_added": None,
                            "sample_removed": None,
                        }
            except Exception as e:
                import logging

                logger = logging.getLogger(__name__)
                logger.warning(f"Failed to calculate data diff: {e}")

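    # NOTE (editorial): when deep_diagnostics is off, the shallow diff above derives
    # the net row change directly from the Delta commit metrics. For example, with
    # numTargetRowsInserted=120 and numTargetRowsDeleted=20, the recorded rows_change
    # is 120 - 20 = 100, and sample_added / sample_removed stay None.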
def _collect_metadata(
|
|
2507
|
+
self,
|
|
2508
|
+
config: NodeConfig,
|
|
2509
|
+
df: Optional[Any],
|
|
2510
|
+
input_schema: Optional[Any] = None,
|
|
2511
|
+
input_sample: Optional[List[Dict[str, Any]]] = None,
|
|
2512
|
+
) -> Dict[str, Any]:
|
|
2513
|
+
"""Collect metadata."""
|
|
2514
|
+
import getpass
|
|
2515
|
+
import platform
|
|
2516
|
+
import socket
|
|
2517
|
+
import sys
|
|
2518
|
+
|
|
2519
|
+
try:
|
|
2520
|
+
import pandas as pd
|
|
2521
|
+
|
|
2522
|
+
pandas_version = getattr(pd, "__version__", None)
|
|
2523
|
+
except ImportError:
|
|
2524
|
+
pandas_version = None
|
|
2525
|
+
|
|
2526
|
+
try:
|
|
2527
|
+
import pyspark
|
|
2528
|
+
|
|
2529
|
+
pyspark_version = getattr(pyspark, "__version__", None)
|
|
2530
|
+
except ImportError:
|
|
2531
|
+
pyspark_version = None
|
|
2532
|
+
|
|
2533
|
+
sql_hash = None
|
|
2534
|
+
if self._executed_sql:
|
|
2535
|
+
normalized_sql = " ".join(self._executed_sql).lower().strip()
|
|
2536
|
+
sql_hash = hashlib.md5(normalized_sql.encode("utf-8")).hexdigest()
|
|
2537
|
+
|
|
2538
|
+
config_snapshot = (
|
|
2539
|
+
config.model_dump(mode="json") if hasattr(config, "model_dump") else config.model_dump()
|
|
2540
|
+
)
|
|
2541
|
+
|
|
2542
|
+
metadata = {
|
|
2543
|
+
"timestamp": datetime.now().isoformat(),
|
|
2544
|
+
"environment": {
|
|
2545
|
+
"user": getpass.getuser(),
|
|
2546
|
+
"host": socket.gethostname(),
|
|
2547
|
+
"platform": platform.platform(),
|
|
2548
|
+
"python": sys.version.split()[0],
|
|
2549
|
+
"pandas": pandas_version,
|
|
2550
|
+
"pyspark": pyspark_version,
|
|
2551
|
+
"odibi": __import__("odibi").__version__,
|
|
2552
|
+
},
|
|
2553
|
+
"steps": self._execution_steps.copy(),
|
|
2554
|
+
"executed_sql": self._executed_sql.copy(),
|
|
2555
|
+
"sql_hash": sql_hash,
|
|
2556
|
+
"transformation_stack": [
|
|
2557
|
+
step.function if hasattr(step, "function") else str(step)
|
|
2558
|
+
for step in (config.transform.steps if config.transform else [])
|
|
2559
|
+
],
|
|
2560
|
+
"validation_warnings": self._validation_warnings.copy(),
|
|
2561
|
+
"config_snapshot": config_snapshot,
|
|
2562
|
+
}
|
|
2563
|
+
|
|
2564
|
+
if self._delta_write_info and "version" in self._delta_write_info:
|
|
2565
|
+
if self._delta_write_info.get("streaming"):
|
|
2566
|
+
metadata["streaming_info"] = {
|
|
2567
|
+
"query_id": self._delta_write_info.get("query_id"),
|
|
2568
|
+
"query_name": self._delta_write_info.get("query_name"),
|
|
2569
|
+
"status": self._delta_write_info.get("status"),
|
|
2570
|
+
"target": self._delta_write_info.get("target"),
|
|
2571
|
+
"output_mode": self._delta_write_info.get("output_mode"),
|
|
2572
|
+
"checkpoint_location": self._delta_write_info.get("checkpoint_location"),
|
|
2573
|
+
}
|
|
2574
|
+
else:
|
|
2575
|
+
ts = self._delta_write_info.get("timestamp")
|
|
2576
|
+
metadata["delta_info"] = {
|
|
2577
|
+
"version": self._delta_write_info["version"],
|
|
2578
|
+
"timestamp": (
|
|
2579
|
+
ts.isoformat() if hasattr(ts, "isoformat") else str(ts) if ts else None
|
|
2580
|
+
),
|
|
2581
|
+
"operation": self._delta_write_info.get("operation"),
|
|
2582
|
+
"operation_metrics": self._delta_write_info.get("operation_metrics", {}),
|
|
2583
|
+
"read_version": self._delta_write_info.get("read_version"),
|
|
2584
|
+
}
|
|
2585
|
+
if "data_diff" in self._delta_write_info:
|
|
2586
|
+
metadata["data_diff"] = self._delta_write_info["data_diff"]
|
|
2587
|
+
|
|
2588
|
+
if df is not None:
|
|
2589
|
+
# Reuse row count from write phase if available (avoids redundant count)
|
|
2590
|
+
cached_row_count = None
|
|
2591
|
+
rows_written = None
|
|
2592
|
+
if self._delta_write_info:
|
|
2593
|
+
cached_row_count = self._delta_write_info.get("_cached_row_count")
|
|
2594
|
+
rows_written = self._delta_write_info.get("_cached_row_count")
|
|
2595
|
+
metadata["rows"] = (
|
|
2596
|
+
cached_row_count if cached_row_count is not None else self._count_rows(df)
|
|
2597
|
+
)
|
|
2598
|
+
# Track rows read vs rows written for story metrics
|
|
2599
|
+
metadata["rows_read"] = self._read_row_count
|
|
2600
|
+
metadata["rows_written"] = rows_written
|
|
2601
|
+
metadata["schema"] = self._get_schema(df)
|
|
2602
|
+
metadata["source_files"] = self.engine.get_source_files(df)
|
|
2603
|
+
# Skip null profiling if configured (expensive for large Spark DataFrames)
|
|
2604
|
+
skip_null_profiling = self.performance_config and getattr(
|
|
2605
|
+
self.performance_config, "skip_null_profiling", False
|
|
2606
|
+
)
|
|
2607
|
+
if skip_null_profiling:
|
|
2608
|
+
metadata["null_profile"] = {}
|
|
2609
|
+
else:
|
|
2610
|
+
try:
|
|
2611
|
+
metadata["null_profile"] = self.engine.profile_nulls(df)
|
|
2612
|
+
except Exception:
|
|
2613
|
+
metadata["null_profile"] = {}
|
|
2614
|
+
|
|
2615
|
+
if input_schema and metadata.get("schema"):
|
|
2616
|
+
output_schema = metadata["schema"]
|
|
2617
|
+
set_in = set(input_schema)
|
|
2618
|
+
set_out = set(output_schema)
|
|
2619
|
+
metadata["schema_in"] = input_schema
|
|
2620
|
+
metadata["columns_added"] = list(set_out - set_in)
|
|
2621
|
+
metadata["columns_removed"] = list(set_in - set_out)
|
|
2622
|
+
if input_sample:
|
|
2623
|
+
metadata["sample_data_in"] = input_sample
|
|
2624
|
+
|
|
2625
|
+
if df is not None and self.max_sample_rows > 0:
|
|
2626
|
+
metadata["sample_data"] = self._get_redacted_sample(df, config.sensitive, self.engine)
|
|
2627
|
+
|
|
2628
|
+
if "sample_data_in" in metadata:
|
|
2629
|
+
metadata["sample_data_in"] = self._redact_sample_list(
|
|
2630
|
+
metadata["sample_data_in"], config.sensitive
|
|
2631
|
+
)
|
|
2632
|
+
|
|
2633
|
+
# Create output record for cross-pipeline dependencies (batch written at end of pipeline)
|
|
2634
|
+
# Supports both explicit write blocks and merge/scd2 function outputs
|
|
2635
|
+
output_record = self._create_output_record(config, metadata.get("rows"))
|
|
2636
|
+
if output_record:
|
|
2637
|
+
metadata["_output_record"] = output_record
|
|
2638
|
+
|
|
2639
|
+
return metadata
|
|
2640
|
+
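
For orientation, a batch Delta write that goes through the branches above yields a metadata payload roughly like the sketch below. This is a minimal illustration with invented values; the exact keys present depend on which branches run (streaming vs. batch, null profiling on or off, samples enabled).

```python
# Hypothetical illustration of the metadata dict assembled above (values invented).
example_metadata = {
    "rows": 10_000,
    "rows_read": 10_250,
    "rows_written": 10_000,
    "delta_info": {
        "version": 42,
        "timestamp": "2024-01-01T00:00:00",
        "operation": "WRITE",
        "operation_metrics": {"numOutputRows": "10000"},
        "read_version": 41,
    },
    "columns_added": ["net_weight"],
    "columns_removed": [],
    "sample_data": [{"plant": "A12", "net_weight": 1.5}],
}
```
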
+
+    def _get_redacted_sample(
+        self, df: Any, sensitive_config: Any, engine: Any
+    ) -> List[Dict[str, Any]]:
+        """Get sample data with redaction."""
+        if sensitive_config is True:
+            return [{"message": "[REDACTED: Sensitive Data]"}]
+        try:
+            sample = engine.get_sample(df, n=self.max_sample_rows)
+            return self._redact_sample_list(sample, sensitive_config)
+        except Exception:
+            return []
+
+    def _redact_sample_list(
+        self, sample: List[Dict[str, Any]], sensitive_config: Any
+    ) -> List[Dict[str, Any]]:
+        """Redact list of rows."""
+        if not sample:
+            return []
+        if sensitive_config is True:
+            return [{"message": "[REDACTED: Sensitive Data]"}]
+        if isinstance(sensitive_config, list):
+            for row in sample:
+                for col in sensitive_config:
+                    if col in row:
+                        row[col] = "[REDACTED]"
+        return sample
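
A minimal standalone sketch of the redaction rules above, re-implemented outside the class for illustration: `sensitive: true` masks the entire sample, while a column list masks only the named fields in place.

```python
# Minimal standalone sketch of the redaction rules implemented above.
from typing import Any, Dict, List


def redact(sample: List[Dict[str, Any]], sensitive: Any) -> List[Dict[str, Any]]:
    if not sample:
        return []
    if sensitive is True:
        return [{"message": "[REDACTED: Sensitive Data]"}]
    if isinstance(sensitive, list):
        for row in sample:
            for col in sensitive:
                if col in row:
                    row[col] = "[REDACTED]"
    return sample


rows = [{"operator": "jane.doe", "temp_c": 77.2}]
assert redact(rows, ["operator"]) == [{"operator": "[REDACTED]", "temp_c": 77.2}]
assert redact(rows, True) == [{"message": "[REDACTED: Sensitive Data]"}]
```
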
+
+    def _create_output_record(
+        self, config: NodeConfig, row_count: Optional[int]
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Create an output record for cross-pipeline dependency tracking.
+
+        This record is collected during execution and batch-written to meta_outputs
+        at the end of pipeline execution for performance.
+
+        Extracts output info from:
+        1. Explicit write block (preferred)
+        2. merge/scd2 function params in transform steps (fallback)
+
+        Args:
+            config: Node configuration
+            row_count: Number of rows written
+
+        Returns:
+            Dict with output metadata or None if no output location found
+        """
+        if config.write:
+            write_cfg = config.write
+            output_type = (
+                "managed_table" if write_cfg.table and not write_cfg.path else "external_table"
+            )
+            return {
+                "pipeline_name": self.pipeline_name,
+                "node_name": config.name,
+                "output_type": output_type,
+                "connection_name": write_cfg.connection,
+                "path": write_cfg.path,
+                "format": write_cfg.format,
+                "table_name": write_cfg.register_table or write_cfg.table,
+                "last_run": datetime.now(),
+                "row_count": row_count,
+            }
+
+        output_info = self._extract_output_from_transform_steps(config)
+        if output_info:
+            return {
+                "pipeline_name": self.pipeline_name,
+                "node_name": config.name,
+                "output_type": output_info.get("output_type", "external_table"),
+                "connection_name": output_info.get("connection"),
+                "path": output_info.get("path"),
+                "format": output_info.get("format", "delta"),
+                "table_name": output_info.get("register_table"),
+                "last_run": datetime.now(),
+                "row_count": row_count,
+            }
+
+        return None
+
+    def _extract_output_from_transform_steps(self, config: NodeConfig) -> Optional[Dict[str, Any]]:
+        """
+        Extract output location from merge/scd2 used as transformer or in transform steps.
+
+        These functions write data internally but don't use a write block,
+        so we need to extract their output info for cross-pipeline references.
+
+        Checks in order:
+        1. Transform steps (last merge/scd2 in chain)
+        2. Top-level transformer with params
+
+        Args:
+            config: Node configuration
+
+        Returns:
+            Dict with connection, path, format, register_table or None
+        """
+        output_functions = {"merge", "scd2"}
+
+        if config.transform and config.transform.steps:
+            for step in reversed(config.transform.steps):
+                if isinstance(step, str):
+                    continue
+
+                if hasattr(step, "function") and step.function in output_functions:
+                    params = step.params or {}
+                    connection = params.get("connection")
+                    path = params.get("path") or params.get("target")
+                    register_table = params.get("register_table")
+
+                    if connection and path:
+                        return {
+                            "connection": connection,
+                            "path": path,
+                            "format": "delta",
+                            "register_table": register_table,
+                            "output_type": "managed_table" if register_table else "external_table",
+                        }
+
+        if config.transformer in output_functions and config.params:
+            params = config.params
+            connection = params.get("connection")
+            path = params.get("path") or params.get("target")
+            register_table = params.get("register_table")
+
+            if connection and path:
+                return {
+                    "connection": connection,
+                    "path": path,
+                    "format": "delta",
+                    "register_table": register_table,
+                    "output_type": "managed_table" if register_table else "external_table",
+                }
+
+        return None
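
For illustration, the sketch below mimics the decision the extractor makes for a final `merge` step. The step object and its params are hypothetical stand-ins (a `SimpleNamespace` instead of the real step model).

```python
# Hypothetical merge-step params as they might appear at the end of a transform chain.
from types import SimpleNamespace

step = SimpleNamespace(
    function="merge",
    params={
        "connection": "adls_gold",
        "path": "gold/dim_customer",
        "register_table": "gold.dim_customer",
    },
)

# Same decision the extractor above makes for this step.
params = step.params or {}
output_info = {
    "connection": params.get("connection"),
    "path": params.get("path") or params.get("target"),
    "format": "delta",
    "register_table": params.get("register_table"),
    "output_type": "managed_table" if params.get("register_table") else "external_table",
}
print(output_info["output_type"])  # managed_table
```
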
+
+    def _get_schema(self, df: Any) -> Any:
+        return self.engine.get_schema(df)
+
+    def _get_shape(self, df: Any) -> tuple:
+        return self.engine.get_shape(df)
+
+    def _count_rows(self, df: Any) -> Optional[int]:
+        if df is not None and getattr(df, "isStreaming", False):
+            return None
+        return self.engine.count_rows(df)
+
+    def _get_column_max(self, df: Any, column: str, fallback_column: Optional[str] = None) -> Any:
+        """Get maximum value of a column, with optional fallback for NULL values."""
+        if df is not None and getattr(df, "isStreaming", False):
+            return None
+        if hasattr(self.engine, "spark"):
+            from pyspark.sql import functions as F
+
+            try:
+                if fallback_column:
+                    coalesce_col = F.coalesce(F.col(column), F.col(fallback_column))
+                    row = df.select(F.max(coalesce_col)).first()
+                else:
+                    row = df.select(F.max(column)).first()
+                return row[0] if row else None
+            except Exception:
+                return None
+        else:
+            try:
+                import numpy as np
+                import pandas as pd
+
+                if fallback_column and fallback_column in df.columns:
+                    combined = df[column].combine_first(df[fallback_column])
+                    val = combined.max()
+                elif column in df.columns:
+                    val = df[column].max()
+                else:
+                    return None
+
+                if pd.isna(val):
+                    return None
+                if isinstance(val, (np.integer, np.floating)):
+                    return val.item()
+                if isinstance(val, np.datetime64):
+                    return str(val)
+                return val
+            except Exception:
+                return None
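
The pandas branch above is essentially coalesce-then-max, with NumPy scalars converted back to plain Python values; a minimal standalone sketch with hypothetical column names:

```python
# Standalone sketch of the pandas fallback path: coalesce the primary column
# with the fallback column, then take the max.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "modified_at": [pd.Timestamp("2024-01-02"), pd.NaT],
        "created_at": [pd.Timestamp("2024-01-01"), pd.Timestamp("2024-01-03")],
    }
)

combined = df["modified_at"].combine_first(df["created_at"])
val = combined.max()
if pd.isna(val):
    val = None
elif isinstance(val, (np.integer, np.floating)):
    val = val.item()
print(val)  # 2024-01-03 00:00:00
```
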
+
+    def _generate_suggestions(self, error: Exception, config: NodeConfig) -> List[str]:
+        """Generate suggestions."""
+        suggestions = []
+        error_str = str(error).lower()
+
+        if "column" in error_str and "not found" in error_str:
+            suggestions.append("Check that previous nodes output the expected columns")
+            suggestions.append(f"Use 'odibi run-node {config.name} --show-schema' to debug")
+
+        if "validation failed" in error_str:
+            suggestions.append("Check your validation rules against the input data")
+            suggestions.append("Inspect the sample data in the generated story")
+
+        if "keyerror" in error.__class__.__name__.lower():
+            suggestions.append("Verify that all referenced DataFrames are registered in context")
+            suggestions.append("Check node dependencies in 'depends_on' list")
+
+        if "function" in error_str and "not" in error_str:
+            suggestions.append("Ensure the transform function is decorated with @transform")
+            suggestions.append("Import the module containing the transform function")
+
+        if "connection" in error_str:
+            suggestions.append("Verify connection configuration in project.yaml")
+            suggestions.append("Check network connectivity and credentials")
+
+        return suggestions
+
+    def _clean_spark_traceback(self, raw_traceback: str) -> str:
+        """Clean Spark/Py4J traceback to show only relevant Python info.
+
+        Removes Java stack traces and Py4J noise to make errors more readable.
+
+        Args:
+            raw_traceback: Full traceback string
+
+        Returns:
+            Cleaned traceback with Java/Py4J details removed
+        """
+        import re
+
+        lines = raw_traceback.split("\n")
+        cleaned_lines = []
+        skip_until_python = False
+
+        for line in lines:
+            # Skip Java stack trace lines
+            if re.match(r"\s+at (org\.|java\.|scala\.|py4j\.)", line):
+                skip_until_python = True
+                continue
+
+            # Skip Py4J internal lines
+            if "py4j.protocol" in line or "Py4JJavaError" in line:
+                continue
+
+            # Skip lines that are just "..."
+            if line.strip() == "...":
+                continue
+
+            # If we hit a Python traceback line, resume capturing
+            if line.strip().startswith("File ") or line.strip().startswith("Traceback"):
+                skip_until_python = False
+
+            if not skip_until_python:
+                # Clean up common Spark error prefixes
+                cleaned_line = re.sub(r"org\.apache\.spark\.[a-zA-Z.]+Exception: ", "", line)
+                cleaned_lines.append(cleaned_line)
+
+        # Remove duplicate empty lines
+        result_lines = []
+        prev_empty = False
+        for line in cleaned_lines:
+            is_empty = not line.strip()
+            if is_empty and prev_empty:
+                continue
+            result_lines.append(line)
+            prev_empty = is_empty
+
+        return "\n".join(result_lines).strip()
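
A short demonstration of the drop rules above on an invented traceback: Java `at org./java./scala./py4j.` frames and Py4J protocol noise are removed, Python frames are kept (the prefix-stripping `re.sub` is omitted here for brevity).

```python
# Illustrative input for the traceback-cleaning rules above (not a real trace).
import re

raw = "\n".join(
    [
        "Traceback (most recent call last):",
        '  File "node.py", line 10, in run',
        "py4j.protocol.Py4JJavaError: An error occurred",
        "\tat org.apache.spark.sql.Dataset.collect(Dataset.scala:100)",
        "\tat java.base/java.lang.Thread.run(Thread.java:833)",
        "org.apache.spark.SparkException: Job aborted.",
    ]
)

kept = [
    line
    for line in raw.split("\n")
    if not re.match(r"\s+at (org\.|java\.|scala\.|py4j\.)", line)
    and "py4j.protocol" not in line
    and "Py4JJavaError" not in line
]
print("\n".join(kept))
```
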
+
+    def _calculate_pii(self, config: NodeConfig) -> Dict[str, bool]:
+        """Calculate effective PII metadata (Inheritance + Local - Declassify)."""
+        # 1. Collect Upstream PII
+        inherited_pii = {}
+        if config.depends_on:
+            for dep in config.depends_on:
+                meta = self.context.get_metadata(dep)
+                if meta and "pii_columns" in meta:
+                    inherited_pii.update(meta["pii_columns"])
+
+        # 2. Merge with Local PII
+        local_pii = {name: True for name, meta in config.columns.items() if meta.pii}
+        merged_pii = {**inherited_pii, **local_pii}
+
+        # 3. Apply Declassification
+        if config.privacy and config.privacy.declassify:
+            for col in config.privacy.declassify:
+                merged_pii.pop(col, None)
+
+        return merged_pii
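
The effective PII set is a plain dict merge followed by removal of declassified keys; a minimal sketch with hypothetical column names:

```python
# Sketch of the three-step PII resolution: inherit, merge local, then declassify.
inherited_pii = {"email": True, "badge_id": True}  # from upstream node metadata
local_pii = {"phone": True}                        # columns flagged pii on this node
declassify = ["badge_id"]                          # explicitly cleared on this node

merged_pii = {**inherited_pii, **local_pii}
for col in declassify:
    merged_pii.pop(col, None)

print(merged_pii)  # {'email': True, 'phone': True}
```
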
+
+
+class Node:
+    """Base node execution orchestrator."""
+
+    def __init__(
+        self,
+        config: NodeConfig,
+        context: Context,
+        engine: Any,
+        connections: Dict[str, Any],
+        config_file: Optional[str] = None,
+        max_sample_rows: int = 10,
+        dry_run: bool = False,
+        retry_config: Optional[RetryConfig] = None,
+        catalog_manager: Optional[Any] = None,
+        performance_config: Optional[Any] = None,
+        pipeline_name: Optional[str] = None,
+        batch_write_buffers: Optional[Dict[str, List]] = None,
+    ):
+        """Initialize node."""
+        self.config = config
+        self.context = context
+        self.engine = engine
+        self.connections = connections
+        self.config_file = config_file
+        self.max_sample_rows = max_sample_rows
+        self.dry_run = dry_run
+        self.retry_config = retry_config or RetryConfig(enabled=False)
+        self.catalog_manager = catalog_manager
+        self.performance_config = performance_config
+        self.pipeline_name = pipeline_name
+        self.batch_write_buffers = batch_write_buffers
+
+        self._cached_result: Optional[Any] = None
+
+        # Initialize State Manager
+        spark_session = None
+        if hasattr(self.engine, "spark"):
+            spark_session = self.engine.spark
+
+        if self.catalog_manager and self.catalog_manager.tables:
+            storage_opts = self.catalog_manager._get_storage_options()
+            environment = getattr(self.catalog_manager.config, "environment", None)
+            backend = CatalogStateBackend(
+                spark_session=spark_session,
+                meta_state_path=self.catalog_manager.tables.get("meta_state"),
+                meta_runs_path=self.catalog_manager.tables.get("meta_runs"),
+                storage_options=storage_opts if storage_opts else None,
+                environment=environment,
+            )
+        else:
+            # Fallback to default local paths (Unified Catalog default)
+            backend = CatalogStateBackend(
+                spark_session=spark_session,
+                meta_state_path=".odibi/system/meta_state",
+                meta_runs_path=".odibi/system/meta_runs",
+            )
+
+        self.state_manager = StateManager(backend=backend)
+
+        # Initialize Executor
+        self.executor = NodeExecutor(
+            context=context,
+            engine=engine,
+            connections=connections,
+            catalog_manager=catalog_manager,
+            config_file=config_file,
+            max_sample_rows=max_sample_rows,
+            performance_config=performance_config,
+            state_manager=self.state_manager,
+            pipeline_name=pipeline_name,
+            batch_write_buffers=batch_write_buffers,
+        )
+
+    def restore(self) -> bool:
+        """Restore node state from previous execution (if persisted)."""
+        ctx = create_logging_context(
+            node_id=self.config.name,
+            engine=self.engine.__class__.__name__,
+        )
+
+        if not self.config.write:
+            ctx.debug("No write config, skipping restore")
+            return False
+
+        write_config = self.config.write
+        connection = self.connections.get(write_config.connection)
+
+        if connection is None:
+            ctx.debug(f"Connection '{write_config.connection}' not found, skipping restore")
+            return False
+
+        try:
+            ctx.debug(
+                "Attempting to restore node from persisted state",
+                table=write_config.table,
+                path=write_config.path,
+            )
+
+            df = self.engine.read(
+                connection=connection,
+                format=write_config.format,
+                table=write_config.table,
+                path=write_config.path,
+                options={},
+            )
+
+            if df is not None:
+                row_count = self.engine.count_rows(df) if df is not None else 0
+                self.context.register(self.config.name, df)
+                if self.config.cache:
+                    self._cached_result = df
+                ctx.info(
+                    "Node state restored successfully",
+                    rows=row_count,
+                    table=write_config.table,
+                    path=write_config.path,
+                )
+                return True
+
+        except Exception as e:
+            ctx.warning(
+                f"Failed to restore node state: {e}",
+                error_type=type(e).__name__,
+            )
+            return False
+
+        return False
+
+    def get_version_hash(self) -> str:
+        """Calculate a deterministic hash of the node's configuration."""
+        import json
+
+        # Serialize the config consistently, excluding fields that do not change
+        # node logic (description, tags, log_level). model_dump(mode="json") keeps
+        # the serialization stable regardless of how defaults are represented.
+        dump = (
+            self.config.model_dump(mode="json", exclude={"description", "tags", "log_level"})
+            if hasattr(self.config, "model_dump")
+            else self.config.model_dump(exclude={"description", "tags", "log_level"})
+        )
+
+        # Sort keys to ensure determinism
+        dump_str = json.dumps(dump, sort_keys=True)
+        return hashlib.md5(dump_str.encode("utf-8")).hexdigest()
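
The version hash depends on canonical JSON (sorted keys), so two configs that differ only in key order hash identically; a minimal sketch:

```python
# Sketch of the deterministic-hash idea: sorted-key JSON makes the digest
# independent of dict ordering.
import hashlib
import json

a = {"name": "orders", "read": {"format": "delta"}}
b = {"read": {"format": "delta"}, "name": "orders"}


def digest(d):
    return hashlib.md5(json.dumps(d, sort_keys=True).encode("utf-8")).hexdigest()


assert digest(a) == digest(b)
```
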
+
+    def execute(self) -> NodeResult:
+        """Execute the node with telemetry and retry logic."""
+        import json
+        import uuid
+
+        from odibi.utils.telemetry import (
+            Status,
+            StatusCode,
+            node_duration,
+            nodes_executed,
+            rows_processed,
+            tracer,
+        )
+
+        ctx = create_logging_context(
+            node_id=self.config.name,
+            engine=self.engine.__class__.__name__,
+        )
+
+        node_log_level = self.config.log_level.value if self.config.log_level else None
+
+        result_for_log = NodeResult(node_name=self.config.name, success=False, duration=0.0)
+        start_time = time.time()
+
+        ctx.info(
+            f"Starting node execution: {self.config.name}",
+            engine=self.engine.__class__.__name__,
+            dry_run=self.dry_run,
+            retry_enabled=self.retry_config.enabled if self.retry_config else False,
+        )
+
+        with (
+            _override_log_level(node_log_level),
+            tracer.start_as_current_span("node_execution") as span,
+        ):
+            span.set_attribute("node.name", self.config.name)
+            span.set_attribute("node.engine", self.engine.__class__.__name__)
+
+            try:
+                try:
+                    result = self._execute_with_retries()
+                    result_for_log = result
+                except Exception as e:
+                    span.record_exception(e)
+                    span.set_status(Status(StatusCode.ERROR))
+                    nodes_executed.add(1, {"status": "failure", "node": self.config.name})
+
+                    result_for_log.duration = time.time() - start_time
+                    result_for_log.error = e
+                    result_for_log.metadata = {"error": str(e), "catastrophic": True}
+
+                    ctx.error(
+                        "Catastrophic failure in node execution",
+                        error_type=type(e).__name__,
+                        error_message=str(e),
+                        elapsed_ms=round(result_for_log.duration * 1000, 2),
+                    )
+
+                    raise e
+
+                if result.success:
+                    span.set_status(Status(StatusCode.OK))
+                    nodes_executed.add(1, {"status": "success", "node": self.config.name})
+                    ctx.info(
+                        "Node execution succeeded",
+                        rows_processed=result.rows_processed,
+                        elapsed_ms=round(result.duration * 1000, 2),
+                        attempts=result.metadata.get("attempts", 1),
+                    )
+                else:
+                    span.set_status(Status(StatusCode.ERROR))
+                    if result.error:
+                        span.record_exception(result.error)
+                    nodes_executed.add(1, {"status": "failure", "node": self.config.name})
+                    ctx.error(
+                        "Node execution failed",
+                        error_type=type(result.error).__name__ if result.error else "Unknown",
+                        elapsed_ms=round(result.duration * 1000, 2),
+                    )
+
+                if result.rows_processed is not None:
+                    rows_processed.add(result.rows_processed, {"node": self.config.name})
+
+                node_duration.record(result.duration, {"node": self.config.name})
+
+                result.metadata["version_hash"] = self.get_version_hash()
+
+                return result
+
+            finally:
+
+                def safe_default(o):
+                    return str(o)
+
+                try:
+                    metrics_json = json.dumps(result_for_log.metadata, default=safe_default)
+                except Exception:
+                    metrics_json = "{}"
+
+                run_record = {
+                    "run_id": str(uuid.uuid4()),
+                    "pipeline_name": self.pipeline_name
+                    or (self.config.tags[0] if self.config.tags else "unknown"),
+                    "node_name": self.config.name,
+                    "status": "SUCCESS" if result_for_log.success else "FAILURE",
+                    "rows_processed": result_for_log.rows_processed or 0,
+                    "duration_ms": int(result_for_log.duration * 1000),
+                    "metrics_json": metrics_json,
+                }
+                result_for_log.metadata["_run_record"] = run_record
+
+    def _execute_with_retries(self) -> NodeResult:
+        """Execute with internal retry logic."""
+        ctx = create_logging_context(
+            node_id=self.config.name,
+            engine=self.engine.__class__.__name__,
+        )
+
+        start_time = time.time()
+        attempts = 0
+        max_attempts = self.retry_config.max_attempts if self.retry_config.enabled else 1
+        last_error = None
+        retry_history: List[Dict[str, Any]] = []
+
+        if max_attempts > 1:
+            ctx.debug(
+                "Retry logic enabled",
+                max_attempts=max_attempts,
+                backoff=self.retry_config.backoff,
+            )
+
+        while attempts < max_attempts:
+            attempts += 1
+            attempt_start = time.time()
+
+            if attempts > 1:
+                ctx.info(
+                    f"Retry attempt {attempts}/{max_attempts}",
+                    previous_error=str(last_error) if last_error else None,
+                )
+
+            try:
+                hwm_state = None
+                if (
+                    self.config.read
+                    and self.config.read.incremental
+                    and self.config.read.incremental.mode == IncrementalMode.STATEFUL
+                ):
+                    key = self.config.read.incremental.state_key or f"{self.config.name}_hwm"
+                    val = self.state_manager.get_hwm(key)
+                    hwm_state = (key, val)
+
+                # Suppress error logs on non-final attempts
+                is_last_attempt = attempts >= max_attempts
+                result = self.executor.execute(
+                    self.config,
+                    dry_run=self.dry_run,
+                    hwm_state=hwm_state,
+                    suppress_error_log=not is_last_attempt,
+                    current_pipeline=self.pipeline_name,
+                )
+
+                attempt_duration = time.time() - attempt_start
+
+                if result.success:
+                    retry_history.append(
+                        {
+                            "attempt": attempts,
+                            "success": True,
+                            "duration": round(attempt_duration, 3),
+                        }
+                    )
+                    result.metadata["attempts"] = attempts
+                    result.metadata["retry_history"] = retry_history
+                    result.duration = time.time() - start_time
+
+                    if self.config.cache and self.context.get(self.config.name) is not None:
+                        self._cached_result = self.context.get(self.config.name)
+
+                    if result.metadata.get("hwm_pending"):
+                        hwm_update = result.metadata.get("hwm_update")
+                        if hwm_update:
+                            try:
+                                self.state_manager.set_hwm(hwm_update["key"], hwm_update["value"])
+                                ctx.debug(
+                                    "HWM state updated",
+                                    hwm_key=hwm_update["key"],
+                                    hwm_value=str(hwm_update["value"]),
+                                )
+                            except Exception as e:
+                                result.metadata["hwm_error"] = str(e)
+                                ctx.warning(f"Failed to update HWM state: {e}")
+
+                    return result
+
+                last_error = result.error
+                retry_history.append(
+                    {
+                        "attempt": attempts,
+                        "success": False,
+                        "error": str(last_error) if last_error else "Unknown error",
+                        "error_type": type(last_error).__name__ if last_error else "Unknown",
+                        "error_traceback": result.metadata.get("error_traceback_cleaned")
+                        or result.metadata.get("error_traceback"),
+                        "duration": round(attempt_duration, 3),
+                    }
+                )
+
+            except Exception as e:
+                attempt_duration = time.time() - attempt_start
+                last_error = e
+                retry_history.append(
+                    {
+                        "attempt": attempts,
+                        "success": False,
+                        "error": str(e),
+                        "error_type": type(e).__name__,
+                        "error_traceback": traceback.format_exc(),
+                        "duration": round(attempt_duration, 3),
+                    }
+                )
+
+                if attempts < max_attempts:
+                    sleep_time = 1
+                    if self.retry_config.backoff == "exponential":
+                        sleep_time = 2 ** (attempts - 1)
+                    elif self.retry_config.backoff == "linear":
+                        sleep_time = attempts
+                    elif self.retry_config.backoff == "constant":
+                        sleep_time = 1
+
+                    ctx.warning(
+                        f"Attempt {attempts} failed, retrying in {sleep_time}s",
+                        error_type=type(e).__name__,
+                        error_message=str(e),
+                        backoff_seconds=sleep_time,
+                    )
+                    time.sleep(sleep_time)
+
+        duration = time.time() - start_time
+
+        ctx.error(
+            "All retry attempts exhausted",
+            attempts=attempts,
+            max_attempts=max_attempts,
+            elapsed_ms=round(duration * 1000, 2),
+        )
+
+        if not isinstance(last_error, NodeExecutionError) and last_error:
+            error = NodeExecutionError(
+                message=str(last_error),
+                context=ExecutionContext(node_name=self.config.name, config_file=self.config_file),
+                original_error=last_error,
+            )
+        else:
+            error = last_error
+
+        return NodeResult(
+            node_name=self.config.name,
+            success=False,
+            duration=duration,
+            error=error,
+            metadata={"attempts": attempts, "retry_history": retry_history},
+        )
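
For reference, a minimal sketch of the sleep schedule these backoff settings produce when `max_attempts` is 4 (sleeps occur only after failed, non-final attempts):

```python
# Sleep schedule produced by the backoff rules above for max_attempts = 4.
def backoff_delay(backoff: str, attempt: int) -> int:
    if backoff == "exponential":
        return 2 ** (attempt - 1)
    if backoff == "linear":
        return attempt
    return 1  # "constant" and any unrecognized value


print([backoff_delay("exponential", n) for n in range(1, 4)])  # [1, 2, 4]
print([backoff_delay("linear", n) for n in range(1, 4)])       # [1, 2, 3]
print([backoff_delay("constant", n) for n in range(1, 4)])     # [1, 1, 1]
```
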