odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/node.py ADDED
@@ -0,0 +1,3341 @@
1
+ """Node execution engine."""
2
+
3
+ import hashlib
4
+ import inspect
5
+ import logging
6
+ import re
7
+ import time
8
+ import traceback
9
+ from contextlib import contextmanager
10
+ from datetime import datetime, timedelta
11
+ from pathlib import Path
12
+ from typing import Any, Dict, List, Optional, Tuple
13
+
14
+ from pydantic import BaseModel, Field
15
+
16
+ from odibi.config import IncrementalConfig, IncrementalMode, NodeConfig, RetryConfig, WriteMode
17
+ from odibi.context import Context, EngineContext, _get_unique_view_name
18
+ from odibi.enums import EngineType
19
+ from odibi.exceptions import ExecutionContext, NodeExecutionError, TransformError, ValidationError
20
+ from odibi.registry import FunctionRegistry
21
+ from odibi.state import (
22
+ CatalogStateBackend,
23
+ StateManager,
24
+ )
25
+ from odibi.utils.duration import parse_duration
26
+ from odibi.utils.logging_context import (
27
+ LoggingContext,
28
+ OperationType,
29
+ create_logging_context,
30
+ get_logging_context,
31
+ )
32
+
33
+
34
+ class PhaseTimer:
35
+ """Track timing for individual execution phases.
36
+
37
+ Usage:
38
+ timer = PhaseTimer()
39
+ with timer.phase("read"):
40
+ # do read
41
+ with timer.phase("transform"):
42
+ # do transform
43
+ print(timer.summary()) # {"read": 1.23, "transform": 0.45, ...}
44
+ """
45
+
46
+ def __init__(self):
47
+ self._timings: Dict[str, float] = {}
48
+ self._current_phase: Optional[str] = None
49
+ self._phase_start: Optional[float] = None
50
+
51
+ @contextmanager
52
+ def phase(self, name: str):
53
+ """Context manager to time a phase."""
54
+ start = time.time()
55
+ try:
56
+ yield
57
+ finally:
58
+ elapsed = time.time() - start
59
+ self._timings[name] = self._timings.get(name, 0) + elapsed
60
+
61
+ def record(self, name: str, duration: float):
62
+ """Manually record a phase duration."""
63
+ self._timings[name] = self._timings.get(name, 0) + duration
64
+
65
+ def get(self, name: str) -> float:
66
+ """Get duration for a specific phase."""
67
+ return self._timings.get(name, 0)
68
+
69
+ def summary(self) -> Dict[str, float]:
70
+ """Get all phase timings rounded to 3 decimal places."""
71
+ return {k: round(v, 3) for k, v in self._timings.items()}
72
+
73
+ def summary_ms(self) -> Dict[str, float]:
74
+ """Get all phase timings in milliseconds."""
75
+ return {k: round(v * 1000, 2) for k, v in self._timings.items()}
76
+
77
+
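# Illustrative usage sketch (annotation, not part of the packaged node.py):
# PhaseTimer simply accumulates wall-clock seconds per named phase, so the
# docstring example can be exercised directly. Relies only on the class and
# the `time` import above.
timer = PhaseTimer()
with timer.phase("read"):
    time.sleep(0.1)              # stand-in for a real read
timer.record("transform", 0.25)  # durations can also be recorded manually
print(timer.summary())           # e.g. {"read": 0.1, "transform": 0.25}
print(timer.summary_ms())        # e.g. {"read": 100.0, "transform": 250.0}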
78
+ class NodeResult(BaseModel):
79
+ """Result of node execution."""
80
+
81
+ model_config = {"arbitrary_types_allowed": True} # Allow Exception type
82
+
83
+ node_name: str
84
+ success: bool
85
+ duration: float
86
+ rows_processed: Optional[int] = None
87
+ rows_read: Optional[int] = None
88
+ rows_written: Optional[int] = None
89
+ result_schema: Optional[Any] = Field(default=None, alias="schema") # Renamed to avoid shadowing
90
+ error: Optional[Exception] = None
91
+ metadata: Dict[str, Any] = Field(default_factory=dict)
92
+
93
+
94
+ @contextmanager
95
+ def _override_log_level(log_level: Optional[str]):
96
+ """Temporarily override the logging level for a node execution."""
97
+ if not log_level:
98
+ yield
99
+ return
100
+
101
+ from odibi.utils.logging import logger as odibi_logger
102
+
103
+ original_level = odibi_logger.level
104
+ new_level = getattr(logging, log_level.upper(), original_level)
105
+ odibi_logger.level = new_level
106
+ odibi_logger.logger.setLevel(new_level)
107
+
108
+ try:
109
+ yield
110
+ finally:
111
+ odibi_logger.level = original_level
112
+ odibi_logger.logger.setLevel(original_level)
113
+
114
+
115
+ class NodeExecutor:
116
+ """Handles the execution logic (read, transform, write) of a node."""
117
+
118
+ def __init__(
119
+ self,
120
+ context: Context,
121
+ engine: Any,
122
+ connections: Dict[str, Any],
123
+ catalog_manager: Optional[Any] = None,
124
+ config_file: Optional[str] = None,
125
+ max_sample_rows: int = 10,
126
+ performance_config: Optional[Any] = None,
127
+ state_manager: Optional[Any] = None,
128
+ pipeline_name: Optional[str] = None,
129
+ batch_write_buffers: Optional[Dict[str, List]] = None,
130
+ ):
131
+ self.context = context
132
+ self.engine = engine
133
+ self.connections = connections
134
+ self.catalog_manager = catalog_manager
135
+ self.config_file = config_file
136
+ self.max_sample_rows = max_sample_rows
137
+ self.performance_config = performance_config
138
+ self.state_manager = state_manager
139
+ self.pipeline_name = pipeline_name
140
+ self.batch_write_buffers = batch_write_buffers
141
+
142
+ # Ephemeral state per execution
143
+ self._execution_steps: List[str] = []
144
+ self._executed_sql: List[str] = []
145
+ self._delta_write_info: Optional[Dict[str, Any]] = None
146
+ self._validation_warnings: List[str] = []
147
+ self._read_row_count: Optional[int] = None # Cache row count from read phase
148
+ self._table_exists_cache: Dict[str, bool] = {} # Cache table existence checks
149
+
150
+ def _cached_table_exists(
151
+ self,
152
+ connection: Any,
153
+ table: Optional[str] = None,
154
+ path: Optional[str] = None,
155
+ ) -> bool:
156
+ """Check if table exists with caching to avoid repeated Delta operations.
157
+
158
+ Performance: Table existence checks involve Delta table open + limit(0).collect()
159
+ which can take 3-5s. Caching saves significant time for nodes that check
160
+ existence multiple times (incremental filter, write phase, etc.).
161
+ """
162
+ cache_key = f"{id(connection)}:{table}:{path}"
163
+ if cache_key not in self._table_exists_cache:
164
+ self._table_exists_cache[cache_key] = self.engine.table_exists(connection, table, path)
165
+ return self._table_exists_cache[cache_key]
166
+
167
+ def execute(
168
+ self,
169
+ config: NodeConfig,
170
+ input_df: Optional[Any] = None,
171
+ dry_run: bool = False,
172
+ hwm_state: Optional[Tuple[str, Any]] = None,
173
+ suppress_error_log: bool = False,
174
+ current_pipeline: Optional[str] = None,
175
+ ) -> NodeResult:
176
+ """Execute the node logic.
177
+
178
+ Args:
179
+ config: Node configuration
180
+ input_df: Optional input dataframe (e.g. from dependencies)
181
+ dry_run: Whether to simulate execution
182
+ hwm_state: Current High Water Mark state (key, value)
183
+ suppress_error_log: If True, suppress error logging (used during retries)
184
+ current_pipeline: Name of current pipeline (for same-pipeline cache lookup)
185
+
186
+ Returns:
187
+ NodeResult
188
+ """
189
+ self._current_pipeline = current_pipeline
190
+ start_time = time.time()
191
+
192
+ # Reset ephemeral state
193
+ self._execution_steps = []
194
+ self._executed_sql = []
195
+ self._delta_write_info = None
196
+ self._validation_warnings = []
197
+ self._read_row_count = None
198
+ self._table_exists_cache = {} # Reset cache per execution
199
+
200
+ ctx = create_logging_context(
201
+ node_id=config.name,
202
+ engine=self.engine.__class__.__name__,
203
+ )
204
+
205
+ # Handle materialized field - controls output as table/view/incremental
206
+ if config.materialized:
207
+ ctx.info(
208
+ f"Materialization strategy: {config.materialized}",
209
+ materialized=config.materialized,
210
+ )
211
+
212
+ if dry_run:
213
+ ctx.debug("Executing node in dry-run mode")
214
+ return self._execute_dry_run(config)
215
+
216
+ with ctx.operation(OperationType.EXECUTE, f"node:{config.name}") as metrics:
217
+ try:
218
+ input_schema = None
219
+ input_sample = None
220
+ pending_hwm_update = None
221
+ rows_in = None
222
+ phase_timer = PhaseTimer()
223
+
224
+ # 0. Pre-SQL Phase
225
+ with phase_timer.phase("pre_sql"):
226
+ self._execute_pre_sql(config, ctx)
227
+
228
+ # 1. Read Phase (either single read, multi-input, or dependency)
229
+ input_dataframes: Dict[str, Any] = {}
230
+
231
+ if config.inputs:
232
+ # Multi-input mode for cross-pipeline dependencies
233
+ with phase_timer.phase("inputs"):
234
+ input_dataframes = self._execute_inputs_phase(
235
+ config, ctx, current_pipeline=self._current_pipeline
236
+ )
237
+ # For transform phase, use first input as primary (or "df" if named)
238
+ if "df" in input_dataframes:
239
+ result_df = input_dataframes["df"]
240
+ elif input_dataframes:
241
+ first_key = next(iter(input_dataframes))
242
+ result_df = input_dataframes[first_key]
243
+ input_df = result_df
244
+ else:
245
+ # Standard single read or dependency
246
+ with phase_timer.phase("read"):
247
+ result_df, pending_hwm_update = self._execute_read_phase(
248
+ config, hwm_state, ctx
249
+ )
250
+
251
+ # If no direct read, check dependencies or use passed input_df
252
+ if result_df is None:
253
+ if input_df is not None:
254
+ result_df = input_df
255
+ ctx.debug(
256
+ "Using provided input_df",
257
+ rows=self._count_rows(input_df) if input_df is not None else 0,
258
+ )
259
+ elif config.depends_on:
260
+ result_df = self.context.get(config.depends_on[0])
261
+ if input_df is None:
262
+ input_df = result_df
263
+ ctx.debug(
264
+ f"Using data from dependency: {config.depends_on[0]}",
265
+ rows=self._count_rows(result_df) if result_df is not None else 0,
266
+ )
267
+
268
+ if config.read:
269
+ input_df = result_df
270
+
271
+ # Capture input schema before transformation
272
+ with phase_timer.phase("schema_capture"):
273
+ if input_df is not None:
274
+ input_schema = self._get_schema(input_df)
275
+ # Reuse row count from read phase if available (avoids redundant count)
276
+ rows_in = (
277
+ self._read_row_count
278
+ if self._read_row_count is not None
279
+ else self._count_rows(input_df)
280
+ )
281
+ metrics.rows_in = rows_in
282
+ metrics.schema_before = (
283
+ input_schema if isinstance(input_schema, dict) else None
284
+ )
285
+ if self.max_sample_rows > 0:
286
+ try:
287
+ input_sample = self.engine.get_sample(
288
+ input_df, n=self.max_sample_rows
289
+ )
290
+ except Exception:
291
+ pass
292
+
293
+ # 1.5 Contracts Phase (Pre-conditions)
294
+ with phase_timer.phase("contracts"):
295
+ self._execute_contracts_phase(config, input_df, ctx)
296
+
297
+ # 2. Transform Phase
298
+ with phase_timer.phase("transform"):
299
+ result_df = self._execute_transform_phase(
300
+ config, result_df, input_df, ctx, input_dataframes
301
+ )
302
+
303
+ # 3. Validation Phase (returns filtered df if quarantine is used)
304
+ with phase_timer.phase("validation"):
305
+ result_df = self._execute_validation_phase(config, result_df, ctx)
306
+
307
+ # 4. Write Phase
308
+ with phase_timer.phase("write"):
309
+ override_mode = self._determine_write_mode(config)
310
+ self._execute_write_phase(config, result_df, override_mode, ctx)
311
+
312
+ # 4.5 Post-SQL Phase
313
+ with phase_timer.phase("post_sql"):
314
+ self._execute_post_sql(config, ctx)
315
+
316
+ # 5. Register & Cache
317
+ with phase_timer.phase("register"):
318
+ if result_df is not None:
319
+ pii_meta = self._calculate_pii(config)
320
+ self.context.register(
321
+ config.name, result_df, metadata={"pii_columns": pii_meta}
322
+ )
323
+
324
+ # 6. Metadata Collection
325
+ with phase_timer.phase("metadata"):
326
+ duration = time.time() - start_time
327
+ metadata = self._collect_metadata(config, result_df, input_schema, input_sample)
328
+
329
+ rows_out = metadata.get("rows")
330
+ metrics.rows_out = rows_out
331
+
332
+ # Log schema changes
333
+ if input_schema and metadata.get("schema"):
334
+ output_schema = metadata["schema"]
335
+ if isinstance(input_schema, dict) and isinstance(output_schema, dict):
336
+ ctx.log_schema_change(
337
+ input_schema, output_schema, operation="node_execution"
338
+ )
339
+ cols_added = metadata.get("columns_added", [])
340
+ cols_removed = metadata.get("columns_removed", [])
341
+ if cols_added or cols_removed:
342
+ ctx.debug(
343
+ "Schema modified",
344
+ columns_added=cols_added,
345
+ columns_removed=cols_removed,
346
+ )
347
+
348
+ # Log row count delta
349
+ if isinstance(rows_in, (int, float)) and isinstance(rows_out, (int, float)):
350
+ delta = rows_out - rows_in
351
+ if delta != 0:
352
+ ctx.log_row_count_change(rows_in, rows_out, operation="node_execution")
353
+
354
+ # Pass back HWM update if any
355
+ if pending_hwm_update:
356
+ key, value = pending_hwm_update
357
+ metadata["hwm_update"] = {"key": key, "value": value}
358
+ metadata["hwm_pending"] = True
359
+ ctx.debug(f"HWM pending update: {key}={value}")
360
+
361
+ # Add phase timings to metadata
362
+ metadata["phase_timings_ms"] = phase_timer.summary_ms()
363
+
364
+ ctx.info(
365
+ "Node execution completed successfully",
366
+ rows_in=rows_in,
367
+ rows_out=rows_out,
368
+ elapsed_ms=round((time.time() - start_time) * 1000, 2),
369
+ phase_timings_ms=phase_timer.summary_ms(),
370
+ )
371
+
372
+ return NodeResult(
373
+ node_name=config.name,
374
+ success=True,
375
+ duration=duration,
376
+ rows_processed=metadata.get("rows"),
377
+ rows_read=metadata.get("rows_read"),
378
+ rows_written=metadata.get("rows_written"),
379
+ schema=metadata.get("schema"),
380
+ metadata=metadata,
381
+ )
382
+
383
+ except Exception as e:
384
+ duration = time.time() - start_time
385
+ suggestions = self._generate_suggestions(e, config)
386
+
387
+ # Capture traceback
388
+ raw_traceback = traceback.format_exc()
389
+ cleaned_traceback = self._clean_spark_traceback(raw_traceback)
390
+
391
+ # Log error with full context (suppress during retries)
392
+ if not suppress_error_log:
393
+ ctx.error(
394
+ f"Node execution failed: {type(e).__name__}: {e}",
395
+ elapsed_ms=round(duration * 1000, 2),
396
+ steps_completed=self._execution_steps.copy(),
397
+ )
398
+ if suggestions:
399
+ ctx.info(f"Suggestions: {'; '.join(suggestions)}")
400
+
401
+ # Wrap error
402
+ if not isinstance(e, NodeExecutionError):
403
+ exec_context = ExecutionContext(
404
+ node_name=config.name,
405
+ config_file=self.config_file,
406
+ previous_steps=self._execution_steps,
407
+ )
408
+ error = NodeExecutionError(
409
+ message=str(e),
410
+ context=exec_context,
411
+ original_error=e,
412
+ suggestions=suggestions,
413
+ )
414
+ else:
415
+ error = e
416
+
417
+ return NodeResult(
418
+ node_name=config.name,
419
+ success=False,
420
+ duration=duration,
421
+ error=error,
422
+ metadata={
423
+ "steps": self._execution_steps.copy(),
424
+ "error_traceback": raw_traceback,
425
+ "error_traceback_cleaned": cleaned_traceback,
426
+ },
427
+ )
428
+
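# Illustrative sketch (annotation, not part of the packaged node.py): the shape
# of what a successful execute() hands back. NodeResult is the pydantic model
# defined above; all field values here are invented for illustration, and the
# real wiring of NodeExecutor (context, engine, connections) is done by the
# pipeline layer, not shown here.
example_result = NodeResult(
    node_name="orders_clean",
    success=True,
    duration=3.42,
    rows_processed=10_000,
    metadata={
        "phase_timings_ms": {"read": 2100.5, "transform": 910.2, "write": 400.1},
        "hwm_update": {"key": "orders_clean_hwm", "value": "2024-04-20T07:11:01"},
        "hwm_pending": True,
    },
)
assert example_result.success and example_result.error is None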
429
+ def _execute_dry_run(self, config: NodeConfig) -> NodeResult:
430
+ """Simulate execution."""
431
+ self._execution_steps.append("Dry run: Skipping actual execution")
432
+
433
+ if config.read:
434
+ self._execution_steps.append(f"Dry run: Would read from {config.read.connection}")
435
+
436
+ if config.transform:
437
+ self._execution_steps.append(
438
+ f"Dry run: Would apply {len(config.transform.steps)} transform steps"
439
+ )
440
+
441
+ if config.write:
442
+ self._execution_steps.append(f"Dry run: Would write to {config.write.connection}")
443
+
444
+ return NodeResult(
445
+ node_name=config.name,
446
+ success=True,
447
+ duration=0.0,
448
+ rows_processed=0,
449
+ metadata={"dry_run": True, "steps": self._execution_steps},
450
+ )
451
+
452
+ def _execute_read_phase(
453
+ self,
454
+ config: NodeConfig,
455
+ hwm_state: Optional[Tuple[str, Any]],
456
+ ctx: Optional["LoggingContext"] = None,
457
+ ) -> Tuple[Optional[Any], Optional[Tuple[str, Any]]]:
458
+ """Execute read operation. Returns (df, pending_hwm_update)."""
459
+ if ctx is None:
460
+ ctx = get_logging_context()
461
+
462
+ if not config.read:
463
+ return None, None
464
+
465
+ read_config = config.read
466
+ connection = self.connections.get(read_config.connection)
467
+
468
+ if connection is None:
469
+ available = ", ".join(sorted(self.connections.keys())) or "(none defined)"
470
+ raise ValueError(
471
+ f"Read phase failed: Connection '{read_config.connection}' not found in configured connections. "
472
+ f"Available connections: [{available}]. "
473
+ f"Check your read.connection value in the node configuration or add the missing connection to project.yaml."
474
+ )
475
+
476
+ with ctx.operation(
477
+ OperationType.READ,
478
+ f"source:{read_config.connection}",
479
+ format=read_config.format,
480
+ table=read_config.table,
481
+ path=read_config.path,
482
+ ) as metrics:
483
+ # Time Travel
484
+ as_of_version = None
485
+ as_of_timestamp = None
486
+ if read_config.time_travel:
487
+ as_of_version = read_config.time_travel.as_of_version
488
+ as_of_timestamp = read_config.time_travel.as_of_timestamp
489
+ ctx.debug(
490
+ "Time travel read",
491
+ as_of_version=as_of_version,
492
+ as_of_timestamp=str(as_of_timestamp) if as_of_timestamp else None,
493
+ )
494
+
495
+ # Legacy HWM: First Run Query Logic
496
+ read_options = read_config.options.copy() if read_config.options else {}
497
+
498
+ if config.write and config.write.first_run_query:
499
+ write_config = config.write
500
+ target_conn = self.connections.get(write_config.connection)
501
+ if target_conn:
502
+ if not self._cached_table_exists(
503
+ target_conn, write_config.table, write_config.path
504
+ ):
505
+ read_options["query"] = config.write.first_run_query
506
+ ctx.debug("Using first_run_query (target table does not exist)")
507
+
508
+ # Merge archive_options into read_options (e.g., badRecordsPath for Spark)
509
+ if read_config.archive_options:
510
+ read_options.update(read_config.archive_options)
511
+ ctx.debug(
512
+ "Applied archive_options",
513
+ archive_options=list(read_config.archive_options.keys()),
514
+ )
515
+ self._execution_steps.append(
516
+ f"Applied archive_options: {list(read_config.archive_options.keys())}"
517
+ )
518
+
519
+ # Incremental SQL Pushdown: Generate filter for SQL sources
520
+ if read_config.incremental and read_config.format in [
521
+ "sql",
522
+ "sql_server",
523
+ "azure_sql",
524
+ ]:
525
+ incremental_filter = self._generate_incremental_sql_filter(
526
+ read_config.incremental, config, ctx
527
+ )
528
+ if incremental_filter:
529
+ # Combine with existing filter if present
530
+ existing_filter = read_options.get("filter")
531
+ if existing_filter:
532
+ read_options["filter"] = f"({existing_filter}) AND ({incremental_filter})"
533
+ else:
534
+ read_options["filter"] = incremental_filter
535
+ ctx.debug(
536
+ "Added incremental SQL pushdown filter",
537
+ filter=read_options["filter"],
538
+ )
539
+ self._execution_steps.append(f"Incremental SQL pushdown: {incremental_filter}")
540
+
541
+ # Execute Read
542
+ df = self.engine.read(
543
+ connection=connection,
544
+ format=read_config.format,
545
+ table=read_config.table,
546
+ path=read_config.path,
547
+ streaming=read_config.streaming,
548
+ schema=getattr(read_config, "schema_ddl", None),
549
+ options=read_options,
550
+ as_of_version=as_of_version,
551
+ as_of_timestamp=as_of_timestamp,
552
+ )
553
+
554
+ if read_config.streaming:
555
+ ctx.info("Streaming read enabled")
556
+ self._execution_steps.append("Streaming read enabled")
557
+
558
+ row_count = self._count_rows(df) if df is not None else 0
559
+ metrics.rows_out = row_count
560
+ # Cache row count to avoid redundant counting in schema_capture phase
561
+ self._read_row_count = row_count
562
+
563
+ ctx.info(
564
+ f"Read completed from {read_config.connection}",
565
+ format=read_config.format,
566
+ table=read_config.table,
567
+ path=read_config.path,
568
+ rows=row_count,
569
+ )
570
+
571
+ # Apply Incremental Logic
572
+ pending_hwm = None
573
+ if config.read.incremental:
574
+ df, pending_hwm = self._apply_incremental_filtering(df, config, hwm_state)
575
+ if pending_hwm:
576
+ ctx.debug(
577
+ "Incremental filtering applied",
578
+ hwm_key=pending_hwm[0],
579
+ hwm_value=str(pending_hwm[1]),
580
+ )
581
+
582
+ self._execution_steps.append(f"Read from {config.read.connection}")
583
+ return df, pending_hwm
584
+
585
+ def _execute_inputs_phase(
586
+ self,
587
+ config: NodeConfig,
588
+ ctx: Optional["LoggingContext"] = None,
589
+ current_pipeline: Optional[str] = None,
590
+ ) -> Dict[str, Any]:
591
+ """
592
+ Execute inputs block for cross-pipeline dependencies.
593
+
594
+ Returns a dict of {input_name: DataFrame} for use in transforms.
595
+
596
+ For same-pipeline references, checks context cache first before catalog lookup.
597
+ This enables first-run scenarios where Delta tables don't exist yet.
598
+ """
599
+ if ctx is None:
600
+ ctx = get_logging_context()
601
+
602
+ if not config.inputs:
603
+ return {}
604
+
605
+ from odibi.references import is_pipeline_reference, resolve_input_reference
606
+
607
+ dataframes = {}
608
+
609
+ for name, ref in config.inputs.items():
610
+ if is_pipeline_reference(ref):
611
+ # Parse the reference to check if it's same-pipeline
612
+ parts = ref[1:].split(".", 1) # Remove $ and split
613
+ ref_pipeline = parts[0] if len(parts) == 2 else None
614
+ ref_node = parts[1] if len(parts) == 2 else None
615
+
616
+ # Try catalog lookup first (read from Delta table - the canonical source)
617
+ df = None
618
+ read_from_catalog = False
619
+
620
+ if self.catalog_manager:
621
+ try:
622
+ read_config = resolve_input_reference(ref, self.catalog_manager)
623
+ ctx.debug(
624
+ f"Resolved reference '{ref}'",
625
+ input_name=name,
626
+ resolved_config=read_config,
627
+ )
628
+
629
+ connection = None
630
+ if "connection" in read_config and read_config["connection"]:
631
+ connection = self.connections.get(read_config["connection"])
632
+ if connection is None:
633
+ available = (
634
+ ", ".join(sorted(self.connections.keys())) or "(none defined)"
635
+ )
636
+ raise ValueError(
637
+ f"Input '{name}' failed: Connection '{read_config['connection']}' not found. "
638
+ f"Available connections: [{available}]. "
639
+ f"Check the connection name in your input reference or add it to project.yaml connections."
640
+ )
641
+
642
+ # Check if table/path exists before reading
643
+ table_or_path = read_config.get("table") or read_config.get("path")
644
+ if table_or_path and self.engine.table_exists(
645
+ connection, read_config.get("table"), read_config.get("path")
646
+ ):
647
+ df = self.engine.read(
648
+ connection=connection,
649
+ format=read_config.get("format"),
650
+ table=read_config.get("table"),
651
+ path=read_config.get("path"),
652
+ )
653
+ read_from_catalog = True
654
+ except Exception as e:
655
+ # Catalog lookup failed - will try cache fallback
656
+ ctx.debug(
657
+ f"Catalog lookup failed for '{ref}': {e}",
658
+ input_name=name,
659
+ )
660
+
661
+ # Fallback to context cache for same-pipeline refs (first run scenario)
662
+ if (
663
+ df is None
664
+ and ref_node
665
+ and current_pipeline
666
+ and ref_pipeline == current_pipeline
667
+ ):
668
+ cached_df = self.context.get(ref_node)
669
+ if cached_df is not None:
670
+ ctx.debug(
671
+ f"Using cached data for same-pipeline reference '{ref}' (Delta not available)",
672
+ input_name=name,
673
+ source_node=ref_node,
674
+ )
675
+ df = cached_df
676
+
677
+ if df is None:
678
+ raise ValueError(
679
+ f"Input '{name}' failed: Cannot resolve reference '{ref}'. "
680
+ f"The referenced data was not found in the catalog or context cache. "
681
+ f"Ensure the referenced node has run successfully and written its output before this node executes. "
682
+ f"Check: 1) The node name is spelled correctly. 2) The referenced pipeline ran first. 3) depends_on is configured if same-pipeline."
683
+ )
684
+
685
+ # Store input source path for transforms that need it (e.g., detect_deletes)
686
+ # Only if we read from catalog (read_config was set)
687
+ if read_from_catalog:
688
+ input_path = read_config.get("path") or read_config.get("table")
689
+ if input_path:
690
+ if connection and hasattr(connection, "get_path"):
691
+ input_path = connection.get_path(input_path)
692
+ self.engine._current_input_path = input_path
693
+
694
+ elif isinstance(ref, dict):
695
+ conn_name = ref.get("connection")
696
+ connection = self.connections.get(conn_name) if conn_name else None
697
+
698
+ if conn_name and connection is None:
699
+ available = ", ".join(sorted(self.connections.keys())) or "(none defined)"
700
+ raise ValueError(
701
+ f"Input '{name}' failed: Connection '{conn_name}' not found. "
702
+ f"Available connections: [{available}]. "
703
+ f"Check your input configuration or add the missing connection to project.yaml."
704
+ )
705
+
706
+ df = self.engine.read(
707
+ connection=connection,
708
+ format=ref.get("format"),
709
+ table=ref.get("table"),
710
+ path=ref.get("path"),
711
+ )
712
+
713
+ else:
714
+ raise ValueError(
715
+ f"Input '{name}' failed: Invalid input format. Got: {type(ref).__name__} = {repr(ref)[:100]}. "
716
+ f"Expected either: 1) A pipeline reference string like '$pipeline_name.node_name', or "
717
+ f"2) A read config dict with 'connection', 'format', and 'table'/'path' keys."
718
+ )
719
+
720
+ dataframes[name] = df
721
+ row_count = self._count_rows(df) if df is not None else 0
722
+ ctx.info(
723
+ f"Loaded input '{name}'",
724
+ rows=row_count,
725
+ source=ref if isinstance(ref, str) else ref.get("path") or ref.get("table"),
726
+ )
727
+ self._execution_steps.append(f"Loaded input '{name}' ({row_count} rows)")
728
+
729
+ return dataframes
730
+
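# Illustrative sketch (annotation, not part of the packaged node.py): the two
# input forms this method accepts, and how a "$pipeline.node" reference string
# is split above. The connection and path names are hypothetical.
ref = "$sales.orders_clean"        # cross-pipeline reference form
parts = ref[1:].split(".", 1)      # -> ["sales", "orders_clean"]
ref_pipeline, ref_node = parts[0], parts[1]

inline_ref = {                     # inline read-config form
    "connection": "adls_bronze",
    "format": "delta",
    "path": "bronze/orders",
}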
731
+ def _quote_sql_column(self, column: str, format: Optional[str] = None) -> str:
732
+ """Quote a column name for SQL to handle spaces and special characters.
733
+
734
+ Uses [] for SQL Server dialects, backticks for others.
735
+ """
736
+ if format in ("sql_server", "azure_sql"):
737
+ return f"[{column}]"
738
+ else:
739
+ return f"`{column}`"
740
+
741
+ def _get_date_expr(
742
+ self, quoted_col: str, cutoff: datetime, date_format: Optional[str]
743
+ ) -> Tuple[str, str]:
744
+ """Get SQL expressions for date column and cutoff value.
745
+
746
+ Args:
747
+ quoted_col: The quoted column name
748
+ cutoff: The cutoff datetime value
749
+ date_format: The source date format
750
+
751
+ Returns:
752
+ Tuple of (column_expression, cutoff_expression)
753
+
754
+ Supported date_format values:
755
+ - None: Default ISO format (YYYY-MM-DD HH:MM:SS)
756
+ - "oracle": DD-MON-YY format (e.g., 20-APR-24 07:11:01.0)
757
+ - "sql_server": SQL Server CONVERT with style 120
758
+ - "us": MM/DD/YYYY format
759
+ - "eu": DD/MM/YYYY format
760
+ - "iso": Explicit ISO format with T separator
761
+ """
762
+ if date_format == "oracle":
763
+ cutoff_str = cutoff.strftime("%d-%b-%y %H:%M:%S").upper()
764
+ col_expr = f"TO_TIMESTAMP({quoted_col}, 'DD-MON-RR HH24:MI:SS.FF')"
765
+ cutoff_expr = f"TO_TIMESTAMP('{cutoff_str}', 'DD-MON-RR HH24:MI:SS')"
766
+ elif date_format == "oracle_sqlserver":
767
+ cutoff_str = cutoff.strftime("%Y-%m-%d %H:%M:%S")
768
+ col_expr = (
769
+ f"TRY_CAST("
770
+ f"RIGHT('20' + SUBSTRING({quoted_col}, 8, 2), 4) + '-' + "
771
+ f"CASE SUBSTRING({quoted_col}, 4, 3) "
772
+ f"WHEN 'JAN' THEN '01' WHEN 'FEB' THEN '02' WHEN 'MAR' THEN '03' "
773
+ f"WHEN 'APR' THEN '04' WHEN 'MAY' THEN '05' WHEN 'JUN' THEN '06' "
774
+ f"WHEN 'JUL' THEN '07' WHEN 'AUG' THEN '08' WHEN 'SEP' THEN '09' "
775
+ f"WHEN 'OCT' THEN '10' WHEN 'NOV' THEN '11' WHEN 'DEC' THEN '12' END + '-' + "
776
+ f"SUBSTRING({quoted_col}, 1, 2) + ' ' + "
777
+ f"SUBSTRING({quoted_col}, 11, 8) AS DATETIME)"
778
+ )
779
+ cutoff_expr = f"'{cutoff_str}'"
780
+ elif date_format == "sql_server":
781
+ cutoff_str = cutoff.strftime("%Y-%m-%d %H:%M:%S")
782
+ col_expr = f"CONVERT(DATETIME, {quoted_col}, 120)"
783
+ cutoff_expr = f"'{cutoff_str}'"
784
+ elif date_format == "us":
785
+ cutoff_str = cutoff.strftime("%m/%d/%Y %H:%M:%S")
786
+ col_expr = f"TO_TIMESTAMP({quoted_col}, 'MM/DD/YYYY HH24:MI:SS')"
787
+ cutoff_expr = f"TO_TIMESTAMP('{cutoff_str}', 'MM/DD/YYYY HH24:MI:SS')"
788
+ elif date_format == "eu":
789
+ cutoff_str = cutoff.strftime("%d/%m/%Y %H:%M:%S")
790
+ col_expr = f"TO_TIMESTAMP({quoted_col}, 'DD/MM/YYYY HH24:MI:SS')"
791
+ cutoff_expr = f"TO_TIMESTAMP('{cutoff_str}', 'DD/MM/YYYY HH24:MI:SS')"
792
+ elif date_format == "iso":
793
+ cutoff_str = cutoff.strftime("%Y-%m-%dT%H:%M:%S")
794
+ col_expr = f"TO_TIMESTAMP({quoted_col}, 'YYYY-MM-DD\"T\"HH24:MI:SS')"
795
+ cutoff_expr = f"TO_TIMESTAMP('{cutoff_str}', 'YYYY-MM-DD\"T\"HH24:MI:SS')"
796
+ else:
797
+ cutoff_str = cutoff.strftime("%Y-%m-%d %H:%M:%S")
798
+ col_expr = quoted_col
799
+ cutoff_expr = f"'{cutoff_str}'"
800
+
801
+ return col_expr, cutoff_expr
802
+
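# Illustrative sketch (annotation, not part of the packaged node.py): what
# _get_date_expr produces for date_format="sql_server", reproduced inline
# because the method needs an executor instance. Column name is hypothetical.
cutoff = datetime(2024, 4, 20, 7, 11, 1)
quoted_col = "[ChangeDT]"  # SQL Server style quoting from _quote_sql_column
col_expr = f"CONVERT(DATETIME, {quoted_col}, 120)"
cutoff_expr = f"'{cutoff.strftime('%Y-%m-%d %H:%M:%S')}'"
# -> ("CONVERT(DATETIME, [ChangeDT], 120)", "'2024-04-20 07:11:01'")
# With date_format=None the column is used as-is: ("[ChangeDT]", "'2024-04-20 07:11:01'")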
803
+ def _generate_incremental_sql_filter(
804
+ self,
805
+ inc: IncrementalConfig,
806
+ config: NodeConfig,
807
+ ctx: Optional["LoggingContext"] = None,
808
+ ) -> Optional[str]:
809
+ """Generate SQL WHERE clause for incremental filtering (pushdown to SQL source).
810
+
811
+ Returns a SQL filter string or None if no filter should be applied.
812
+ """
813
+ if ctx is None:
814
+ ctx = get_logging_context()
815
+
816
+ # Check if target table exists - if not, this is first run (full load)
817
+ if config.write:
818
+ target_conn = self.connections.get(config.write.connection)
819
+ # Use register_table if table is not set (path-based Delta with registration)
820
+ table_to_check = config.write.table or config.write.register_table
821
+ if target_conn and not self._cached_table_exists(
822
+ target_conn, table_to_check, config.write.path
823
+ ):
824
+ ctx.debug("First run detected - skipping incremental SQL pushdown")
825
+ return None
826
+
827
+ # Get the SQL format for proper column quoting
828
+ sql_format = config.read.format if config.read else None
829
+
830
+ if inc.mode == IncrementalMode.ROLLING_WINDOW:
831
+ if not inc.lookback or not inc.unit:
832
+ return None
833
+
834
+ # Calculate cutoff
835
+ now = datetime.now()
836
+
837
+ delta = None
838
+ if inc.unit == "hour":
839
+ delta = timedelta(hours=inc.lookback)
840
+ elif inc.unit == "day":
841
+ delta = timedelta(days=inc.lookback)
842
+ elif inc.unit == "month":
843
+ delta = timedelta(days=inc.lookback * 30)
844
+ elif inc.unit == "year":
845
+ delta = timedelta(days=inc.lookback * 365)
846
+
847
+ if delta:
848
+ cutoff = now - delta
849
+ quoted_col = self._quote_sql_column(inc.column, sql_format)
850
+ col_expr, cutoff_expr = self._get_date_expr(quoted_col, cutoff, inc.date_format)
851
+
852
+ if inc.fallback_column:
853
+ quoted_fallback = self._quote_sql_column(inc.fallback_column, sql_format)
854
+ fallback_expr, _ = self._get_date_expr(quoted_fallback, cutoff, inc.date_format)
855
+ return f"COALESCE({col_expr}, {fallback_expr}) >= {cutoff_expr}"
856
+ else:
857
+ return f"{col_expr} >= {cutoff_expr}"
858
+
859
+ elif inc.mode == IncrementalMode.STATEFUL:
860
+ # For stateful, we need to get the HWM from state
861
+ state_key = inc.state_key or f"{config.name}_hwm"
862
+
863
+ if self.state_manager:
864
+ last_hwm = self.state_manager.get_hwm(state_key)
865
+ if last_hwm is not None:
866
+ # Apply watermark_lag if configured
867
+ if inc.watermark_lag:
868
+ from odibi.utils.duration import parse_duration
869
+
870
+ lag_delta = parse_duration(inc.watermark_lag)
871
+ if lag_delta and isinstance(last_hwm, str):
872
+ try:
873
+ hwm_dt = datetime.fromisoformat(last_hwm)
874
+ last_hwm = (hwm_dt - lag_delta).isoformat()
875
+ except ValueError:
876
+ pass
877
+
878
+ # Format HWM for SQL compatibility (SQL Server doesn't like ISO 'T')
879
+ hwm_str = str(last_hwm)
880
+ if isinstance(last_hwm, str) and "T" in last_hwm:
881
+ try:
882
+ hwm_dt = datetime.fromisoformat(last_hwm)
883
+ hwm_str = hwm_dt.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
884
+ except ValueError:
885
+ hwm_str = last_hwm.replace("T", " ")
886
+
887
+ quoted_col = self._quote_sql_column(inc.column, sql_format)
888
+ if inc.fallback_column:
889
+ quoted_fallback = self._quote_sql_column(inc.fallback_column, sql_format)
890
+ return f"COALESCE({quoted_col}, {quoted_fallback}) > '{hwm_str}'"
891
+ else:
892
+ return f"{quoted_col} > '{hwm_str}'"
893
+
894
+ return None
895
+
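# Illustrative sketch (annotation, not part of the packaged node.py): the
# WHERE fragments this method returns, written out by hand for hypothetical
# [ChangeDT]/[CreatedDT] columns and default date_format.
rolling_filter = "COALESCE([ChangeDT], [CreatedDT]) >= '2024-04-13 07:00:00'"
stateful_filter = "[ChangeDT] > '2024-04-20 07:11:01.123'"
# The read phase then ANDs the result with any user-supplied filter option:
existing, incremental = "Plant = 'X1'", stateful_filter
combined = f"({existing}) AND ({incremental})"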
896
+ def _apply_incremental_filtering(
897
+ self, df: Any, config: NodeConfig, hwm_state: Optional[Tuple[str, Any]]
898
+ ) -> Tuple[Any, Optional[Tuple[str, Any]]]:
899
+ """Apply incremental filtering and capture new HWM.
900
+
901
+ Note: For SQL sources, filtering is done via SQL pushdown in _generate_incremental_sql_filter.
902
+ This method handles non-SQL sources and HWM capture for stateful mode.
903
+ """
904
+ inc = config.read.incremental
905
+ if not inc:
906
+ return df, None
907
+
908
+ # Skip in-memory filtering for SQL sources (already pushed down)
909
+ if config.read.format in ["sql", "sql_server", "azure_sql"]:
910
+ # Still need to capture HWM for stateful mode
911
+ if inc.mode == IncrementalMode.STATEFUL:
912
+ state_key = inc.state_key or f"{config.name}_hwm"
913
+ new_max = self._get_column_max(df, inc.column)
914
+ if new_max is not None:
915
+ return df, (state_key, new_max)
916
+ return df, None
917
+
918
+ # Smart Read Pattern: If target table doesn't exist, skip filtering (Full Load)
919
+ if config.write:
920
+ target_conn = self.connections.get(config.write.connection)
921
+ # Use register_table if table is not set (path-based Delta with registration)
922
+ table_to_check = config.write.table or config.write.register_table
923
+ if target_conn and not self._cached_table_exists(
924
+ target_conn, table_to_check, config.write.path
925
+ ):
926
+ # First Run detected -> Full Load
927
+ # We still need to capture HWM if stateful!
928
+ if inc.mode == IncrementalMode.STATEFUL:
929
+ state_key = inc.state_key or f"{config.name}_hwm"
930
+ new_max = self._get_column_max(df, inc.column)
931
+ if new_max is not None:
932
+ return df, (state_key, new_max)
933
+
934
+ return df, None
935
+
936
+ if inc.mode == IncrementalMode.ROLLING_WINDOW:
937
+ if not inc.lookback or not inc.unit:
938
+ return df, None
939
+
940
+ # Calculate cutoff
941
+ now = datetime.now()
942
+
943
+ delta = None
944
+ if inc.unit == "hour":
945
+ delta = timedelta(hours=inc.lookback)
946
+ elif inc.unit == "day":
947
+ delta = timedelta(days=inc.lookback)
948
+ elif inc.unit == "month":
949
+ delta = timedelta(days=inc.lookback * 30)
950
+ elif inc.unit == "year":
951
+ delta = timedelta(days=inc.lookback * 365)
952
+
953
+ if delta:
954
+ cutoff = now - delta
955
+
956
+ if inc.fallback_column:
957
+ if hasattr(self.engine, "filter_coalesce"):
958
+ # Use >= so the rolling window is inclusive of the cutoff ("within the last X days").
960
+ df = self.engine.filter_coalesce(
961
+ df, inc.column, inc.fallback_column, ">=", cutoff
962
+ )
963
+ elif hasattr(self.engine, "filter_greater_than"):
964
+ df = self.engine.filter_greater_than(df, inc.column, cutoff)
965
+ else:
966
+ if hasattr(self.engine, "filter_greater_than"):
967
+ # Note: engine.filter_greater_than is strictly >, while a rolling window
+ # conceptually wants >= cutoff. The cutoff is derived from wall-clock time,
+ # so rows landing exactly on it are rare, and the existing tests expect >
+ # (cutoff 2023-10-24 12:00:00 keeps the 2023-10-25 rows and filters out
+ # 2023-10-01), so strict > is used here.
978
+ df = self.engine.filter_greater_than(df, inc.column, cutoff)
979
+
980
+ elif inc.mode == IncrementalMode.STATEFUL:
981
+ # Check if we have state
982
+ # hwm_state is (key, value)
983
+
984
+ last_hwm = None
985
+ state_key = inc.state_key or f"{config.name}_hwm"
986
+
987
+ if hwm_state and hwm_state[0] == state_key:
988
+ last_hwm = hwm_state[1]
989
+
990
+ # Apply watermark_lag: subtract lag duration from HWM for late-arriving data
991
+ if last_hwm is not None and inc.watermark_lag:
992
+ lag_delta = parse_duration(inc.watermark_lag)
993
+ if lag_delta:
994
+ ctx = get_logging_context()
995
+ ctx.debug(
996
+ f"Applying watermark_lag: {inc.watermark_lag}",
997
+ original_hwm=str(last_hwm),
998
+ )
999
+ # Parse string HWM to datetime if needed (HWM is stored as JSON string)
1000
+ if isinstance(last_hwm, str):
1001
+ try:
1002
+ last_hwm = datetime.fromisoformat(last_hwm)
1003
+ except ValueError:
1004
+ ctx.warning(
1005
+ f"Could not parse HWM '{last_hwm}' as datetime for watermark_lag"
1006
+ )
1007
+ # Subtract lag from HWM to handle late-arriving data
1008
+ if hasattr(last_hwm, "__sub__"):
1009
+ last_hwm = last_hwm - lag_delta
1010
+ ctx.info(
1011
+ "Watermark lag applied",
1012
+ lag=inc.watermark_lag,
1013
+ adjusted_hwm=str(last_hwm),
1014
+ )
1015
+ self._execution_steps.append(f"Applied watermark_lag: {inc.watermark_lag}")
1016
+
1017
+ # Filter
1018
+ if last_hwm is not None:
1019
+ # Apply filter: col > last_hwm (with fallback if configured)
1020
+ if inc.fallback_column and hasattr(self.engine, "filter_coalesce"):
1021
+ df = self.engine.filter_coalesce(
1022
+ df, inc.column, inc.fallback_column, ">", last_hwm
1023
+ )
1024
+ self._execution_steps.append(
1025
+ f"Incremental: Filtered COALESCE({inc.column}, "
1026
+ f"{inc.fallback_column}) > {last_hwm}"
1027
+ )
1028
+ else:
1029
+ df = self.engine.filter_greater_than(df, inc.column, last_hwm)
1030
+ self._execution_steps.append(f"Incremental: Filtered {inc.column} > {last_hwm}")
1031
+
1032
+ # Capture new HWM (use fallback column if configured)
1033
+ new_max = self._get_column_max(df, inc.column, inc.fallback_column)
1034
+
1035
+ if new_max is not None:
1036
+ return df, (state_key, new_max)
1037
+
1038
+ return df, None
1039
+
1040
+ def _execute_pre_sql(
1041
+ self,
1042
+ config: NodeConfig,
1043
+ ctx: Optional["LoggingContext"] = None,
1044
+ ) -> None:
1045
+ """Execute pre-SQL statements before node runs."""
1046
+ if ctx is None:
1047
+ ctx = get_logging_context()
1048
+
1049
+ if not config.pre_sql:
1050
+ return
1051
+
1052
+ ctx.info(f"Executing {len(config.pre_sql)} pre-SQL statement(s)")
1053
+
1054
+ for i, sql in enumerate(config.pre_sql, 1):
1055
+ ctx.debug(f"Executing pre_sql [{i}/{len(config.pre_sql)}]", sql_preview=sql[:100])
1056
+ try:
1057
+ self.engine.execute_sql(sql, self.context)
1058
+ self._executed_sql.append(f"pre_sql[{i}]: {sql[:50]}...")
1059
+ except Exception as e:
1060
+ ctx.error(
1061
+ "Pre-SQL statement failed",
1062
+ statement_index=i,
1063
+ error=str(e),
1064
+ )
1065
+ raise
1066
+
1067
+ self._execution_steps.append(f"Executed {len(config.pre_sql)} pre-SQL statement(s)")
1068
+
1069
+ def _execute_post_sql(
1070
+ self,
1071
+ config: NodeConfig,
1072
+ ctx: Optional["LoggingContext"] = None,
1073
+ ) -> None:
1074
+ """Execute post-SQL statements after node completes."""
1075
+ if ctx is None:
1076
+ ctx = get_logging_context()
1077
+
1078
+ if not config.post_sql:
1079
+ return
1080
+
1081
+ ctx.info(f"Executing {len(config.post_sql)} post-SQL statement(s)")
1082
+
1083
+ for i, sql in enumerate(config.post_sql, 1):
1084
+ ctx.debug(f"Executing post_sql [{i}/{len(config.post_sql)}]", sql_preview=sql[:100])
1085
+ try:
1086
+ self.engine.execute_sql(sql, self.context)
1087
+ self._executed_sql.append(f"post_sql[{i}]: {sql[:50]}...")
1088
+ except Exception as e:
1089
+ ctx.error(
1090
+ "Post-SQL statement failed",
1091
+ statement_index=i,
1092
+ error=str(e),
1093
+ )
1094
+ raise
1095
+
1096
+ self._execution_steps.append(f"Executed {len(config.post_sql)} post-SQL statement(s)")
1097
+
1098
+ def _execute_contracts_phase(
1099
+ self,
1100
+ config: NodeConfig,
1101
+ df: Any,
1102
+ ctx: Optional["LoggingContext"] = None,
1103
+ ) -> None:
1104
+ """Execute pre-condition contracts."""
1105
+ if ctx is None:
1106
+ ctx = get_logging_context()
1107
+
1108
+ if config.contracts and df is not None:
1109
+ ctx.debug(
1110
+ "Starting contract validation",
1111
+ contract_count=len(config.contracts),
1112
+ )
1113
+
1114
+ df = self.engine.materialize(df)
1115
+
1116
+ from odibi.config import ValidationAction, ValidationConfig
1117
+ from odibi.validation.engine import Validator
1118
+
1119
+ contract_config = ValidationConfig(mode=ValidationAction.FAIL, tests=config.contracts)
1120
+
1121
+ validator = Validator()
1122
+ failures = validator.validate(df, contract_config, context={"columns": config.columns})
1123
+
1124
+ if failures:
1125
+ ctx.error(
1126
+ "Contract validation failed",
1127
+ failures=failures,
1128
+ contract_count=len(config.contracts),
1129
+ )
1130
+ failure_summary = "; ".join(
1131
+ f"{f.get('test', 'unknown')}: {f.get('message', 'failed')}"
1132
+ for f in failures[:3]
1133
+ )
1134
+ if len(failures) > 3:
1135
+ failure_summary += f"; ... and {len(failures) - 3} more"
1136
+ raise ValidationError(
1137
+ f"Node '{config.name}' contract validation failed with {len(failures)} error(s): {failure_summary}",
1138
+ failures,
1139
+ )
1140
+
1141
+ ctx.info(
1142
+ "Contract validation passed",
1143
+ contract_count=len(config.contracts),
1144
+ )
1145
+ self._execution_steps.append(f"Passed {len(config.contracts)} contract checks")
1146
+
1147
+ def _execute_transform_phase(
1148
+ self,
1149
+ config: NodeConfig,
1150
+ result_df: Optional[Any],
1151
+ input_df: Optional[Any],
1152
+ ctx: Optional["LoggingContext"] = None,
1153
+ input_dataframes: Optional[Dict[str, Any]] = None,
1154
+ ) -> Optional[Any]:
1155
+ """
1156
+ Execute transformer and transform steps.
1157
+
1158
+ Args:
1159
+ config: Node configuration
1160
+ result_df: Current result DataFrame
1161
+ input_df: Input DataFrame (for single-input nodes)
1162
+ ctx: Logging context
1163
+ input_dataframes: Dict of named DataFrames for multi-input nodes (inputs block)
1164
+ """
1165
+ if ctx is None:
1166
+ ctx = get_logging_context()
1167
+
1168
+ input_dataframes = input_dataframes or {}
1169
+
1170
+ pii_meta = self._calculate_pii(config)
1171
+ rows_before = self._count_rows(result_df) if result_df is not None else None
1172
+ schema_before = self._get_schema(result_df) if result_df is not None else None
1173
+
1174
+ # Register named inputs in context for SQL access
1175
+ if input_dataframes:
1176
+ for name, df in input_dataframes.items():
1177
+ self.context.register(name, df)
1178
+ ctx.debug(
1179
+ f"Registered {len(input_dataframes)} named inputs for transforms",
1180
+ inputs=list(input_dataframes.keys()),
1181
+ )
1182
+
1183
+ # Pattern Engine
1184
+ if config.transformer:
1185
+ if result_df is None and input_df is not None:
1186
+ result_df = input_df
1187
+ rows_before = self._count_rows(result_df)
1188
+ schema_before = self._get_schema(result_df)
1189
+
1190
+ with ctx.operation(
1191
+ OperationType.PATTERN,
1192
+ f"transformer:{config.transformer}",
1193
+ ) as metrics:
1194
+ metrics.rows_in = rows_before
1195
+ if isinstance(schema_before, dict):
1196
+ metrics.schema_before = schema_before
1197
+
1198
+ is_pattern = False
1199
+ try:
1200
+ from odibi.patterns import get_pattern_class
1201
+
1202
+ pattern_cls = get_pattern_class(config.transformer)
1203
+ is_pattern = True
1204
+
1205
+ # Inject delta_table_properties into config.params for patterns that write Delta
1206
+ pattern_config = config
1207
+ delta_patterns = ("merge", "scd2", "dimension", "aggregation", "fact")
1208
+ if self.performance_config and config.transformer in delta_patterns:
1209
+ global_props = (
1210
+ getattr(self.performance_config, "delta_table_properties", None) or {}
1211
+ )
1212
+ if global_props:
1213
+ merged_params = dict(config.params) if config.params else {}
1214
+ node_props = merged_params.get("table_properties") or {}
1215
+ merged_params["table_properties"] = {**global_props, **node_props}
1216
+ pattern_config = config.model_copy(update={"params": merged_params})
1217
+
1218
+ pattern = pattern_cls(self.engine, pattern_config)
1219
+ pattern.validate()
1220
+
1221
+ engine_ctx = EngineContext(
1222
+ context=self.context,
1223
+ df=result_df,
1224
+ engine_type=self.engine.name,
1225
+ sql_executor=self.engine.execute_sql,
1226
+ engine=self.engine,
1227
+ pii_metadata=pii_meta,
1228
+ )
1229
+
1230
+ result_df = pattern.execute(engine_ctx)
1231
+ self._execution_steps.append(f"Applied pattern '{config.transformer}'")
1232
+
1233
+ if self.catalog_manager and config.write:
1234
+ self.catalog_manager.log_pattern(
1235
+ table_name=config.write.table or config.write.path,
1236
+ pattern_type=config.transformer,
1237
+ configuration=str(config.params),
1238
+ compliance_score=1.0,
1239
+ )
1240
+
1241
+ except ValueError:
1242
+ pass
1243
+
1244
+ if not is_pattern:
1245
+ result_df = self._execute_transformer_node(config, result_df, pii_meta)
1246
+ self._execution_steps.append(f"Applied transformer '{config.transformer}'")
1247
+
1248
+ if self.catalog_manager and config.write:
1249
+ self.catalog_manager.log_pattern(
1250
+ table_name=config.write.table or config.write.path,
1251
+ pattern_type=config.transformer,
1252
+ configuration=str(config.params),
1253
+ compliance_score=1.0,
1254
+ )
1255
+
1256
+ rows_after = self._count_rows(result_df) if result_df is not None else None
1257
+ schema_after = self._get_schema(result_df) if result_df is not None else None
1258
+ metrics.rows_out = rows_after
1259
+ if isinstance(schema_after, dict):
1260
+ metrics.schema_after = schema_after
1261
+
1262
+ if (
1263
+ isinstance(rows_before, (int, float))
1264
+ and isinstance(rows_after, (int, float))
1265
+ and rows_before != rows_after
1266
+ ):
1267
+ ctx.log_row_count_change(
1268
+ rows_before, rows_after, operation=f"transformer:{config.transformer}"
1269
+ )
1270
+ if (
1271
+ isinstance(schema_before, dict)
1272
+ and isinstance(schema_after, dict)
1273
+ and schema_before != schema_after
1274
+ ):
1275
+ ctx.log_schema_change(
1276
+ schema_before, schema_after, operation=f"transformer:{config.transformer}"
1277
+ )
1278
+
1279
+ # Transform Steps
1280
+ if config.transform:
1281
+ if result_df is None and input_df is not None:
1282
+ result_df = input_df
1283
+
1284
+ step_count = len(config.transform.steps)
1285
+ ctx.debug(f"Executing {step_count} transform steps")
1286
+
1287
+ # Set current write path on engine for transforms that need it (e.g., detect_deletes)
1288
+ if config.write and config.write.path:
1289
+ self.engine._current_write_path = config.write.path
1290
+ elif config.write and config.write.table:
1291
+ self.engine._current_write_path = config.write.table
1292
+
1293
+ result_df = self._execute_transform(config, result_df, pii_meta, ctx)
1294
+ self._execution_steps.append(f"Applied {step_count} transform steps")
1295
+
1296
+ # Privacy Suite
1297
+ if config.privacy:
1298
+ pii_cols = [name for name, is_pii in pii_meta.items() if is_pii]
1299
+ if pii_cols:
1300
+ ctx.debug(f"Anonymizing {len(pii_cols)} PII columns", columns=pii_cols)
1301
+ result_df = self.engine.anonymize(
1302
+ result_df,
1303
+ pii_cols,
1304
+ config.privacy.method,
1305
+ config.privacy.salt,
1306
+ )
1307
+ self._execution_steps.append(f"Anonymized {len(pii_cols)} PII columns")
1308
+
1309
+ return result_df
1310
+
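# Illustrative sketch (annotation, not part of the packaged node.py): how
# global delta_table_properties are merged with node-level table_properties
# above -- node-level keys win. Property names are only examples.
global_props = {
    "delta.autoOptimize.optimizeWrite": "true",
    "delta.tuneFileSizesForRewrites": "true",
}
node_props = {"delta.autoOptimize.optimizeWrite": "false"}
merged = {**global_props, **node_props}
# -> {"delta.autoOptimize.optimizeWrite": "false", "delta.tuneFileSizesForRewrites": "true"}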
1311
+ def _execute_transformer_node(
1312
+ self, config: NodeConfig, df: Optional[Any], pii_metadata: Optional[Dict[str, bool]] = None
1313
+ ) -> Any:
1314
+ """Execute a top-level transformer (legacy)."""
1315
+ if df is not None:
1316
+ df = self.engine.materialize(df)
1317
+
1318
+ func_name = config.transformer
1319
+ params = dict(config.params) if config.params else {}
1320
+
1321
+ # Merge global delta_table_properties into merge transformer params
1322
+ if func_name == "merge" and self.performance_config:
1323
+ global_props = getattr(self.performance_config, "delta_table_properties", None) or {}
1324
+ node_props = params.get("table_properties") or {}
1325
+ merged_props = {**global_props, **node_props}
1326
+ if merged_props:
1327
+ params["table_properties"] = merged_props
1328
+
1329
+ FunctionRegistry.validate_params(func_name, params)
1330
+ func = FunctionRegistry.get(func_name)
1331
+ sig = inspect.signature(func)
1332
+
1333
+ engine_type = EngineType.PANDAS if self.engine.name == "pandas" else EngineType.SPARK
1334
+ engine_ctx = EngineContext(
1335
+ context=self.context,
1336
+ df=df,
1337
+ engine_type=engine_type,
1338
+ sql_executor=self.engine.execute_sql,
1339
+ engine=self.engine,
1340
+ pii_metadata=pii_metadata,
1341
+ )
1342
+
1343
+ param_model = FunctionRegistry.get_param_model(func_name)
1344
+ call_kwargs = {}
1345
+ if "current" in sig.parameters:
1346
+ call_kwargs["current"] = df
1347
+
1348
+ if param_model:
1349
+ params_obj = param_model(**params)
1350
+ result = func(engine_ctx, params_obj, **call_kwargs)
1351
+ else:
1352
+ result = func(engine_ctx, **params, **call_kwargs)
1353
+
1354
+ if engine_ctx._sql_history:
1355
+ self._executed_sql.extend(engine_ctx._sql_history)
1356
+
1357
+ if isinstance(result, EngineContext):
1358
+ return result.df
1359
+ return result
1360
+
1361
+ def _execute_transform(
1362
+ self,
1363
+ config: NodeConfig,
1364
+ df: Any,
1365
+ pii_metadata: Optional[Dict[str, bool]] = None,
1366
+ ctx: Optional["LoggingContext"] = None,
1367
+ ) -> Any:
1368
+ """Execute transform steps."""
1369
+ if ctx is None:
1370
+ ctx = get_logging_context()
1371
+
1372
+ current_df = df
1373
+ transform_config = config.transform
1374
+
1375
+ if transform_config:
1376
+ total_steps = len(transform_config.steps)
1377
+ for step_idx, step in enumerate(transform_config.steps):
1378
+ step_name = self._get_step_name(step)
1379
+ rows_before = self._count_rows(current_df) if current_df is not None else None
1380
+ schema_before = self._get_schema(current_df) if current_df is not None else None
1381
+
1382
+ try:
1383
+ exec_context = ExecutionContext(
1384
+ node_name=config.name,
1385
+ config_file=self.config_file,
1386
+ step_index=step_idx,
1387
+ total_steps=total_steps,
1388
+ previous_steps=self._execution_steps,
1389
+ )
1390
+
1391
+ with ctx.operation(
1392
+ OperationType.TRANSFORM,
1393
+ f"step[{step_idx + 1}/{total_steps}]:{step_name}",
1394
+ ) as metrics:
1395
+ metrics.rows_in = rows_before
1396
+ if isinstance(schema_before, dict):
1397
+ metrics.schema_before = schema_before
1398
+
1399
+ if current_df is not None:
1400
+ self.context.register("current_df", current_df)
1401
+ self.context.register("df", current_df)
1402
+
1403
+ if isinstance(step, str):
1404
+ current_df = self._execute_sql_step(step, current_df)
1405
+ else:
1406
+ if step.function:
1407
+ current_df = self._execute_function_step(
1408
+ step.function, step.params, current_df, pii_metadata
1409
+ )
1410
+ elif step.operation:
1411
+ current_df = self._execute_operation_step(
1412
+ step.operation, step.params, current_df
1413
+ )
1414
+ elif step.sql:
1415
+ current_df = self._execute_sql_step(step.sql, current_df)
1416
+ elif step.sql_file:
1417
+ sql_content = self._resolve_sql_file(step.sql_file)
1418
+ current_df = self._execute_sql_step(sql_content, current_df)
1419
+ else:
1420
+ step_repr = repr(step)[:100] if step else "None"
1421
+ raise TransformError(
1422
+ f"Transform step {step_idx + 1}/{total_steps} is invalid. "
1423
+ f"Step config: {step_repr}. "
1424
+ f"Each step must have exactly one of: 'sql', 'sql_file', 'function', or 'operation'."
1425
+ )
1426
+
1427
+ rows_after = (
1428
+ self._count_rows(current_df) if current_df is not None else None
1429
+ )
1430
+ schema_after = (
1431
+ self._get_schema(current_df) if current_df is not None else None
1432
+ )
1433
+ metrics.rows_out = rows_after
1434
+ if isinstance(schema_after, dict):
1435
+ metrics.schema_after = schema_after
1436
+
1437
+ if (
1438
+ isinstance(rows_before, (int, float))
1439
+ and isinstance(rows_after, (int, float))
1440
+ and rows_before != rows_after
1441
+ ):
1442
+ ctx.log_row_count_change(rows_before, rows_after, operation=step_name)
1443
+
1444
+ if (
1445
+ isinstance(schema_before, dict)
1446
+ and isinstance(schema_after, dict)
1447
+ and schema_before != schema_after
1448
+ ):
1449
+ ctx.log_schema_change(schema_before, schema_after, operation=step_name)
1450
+
1451
+ except Exception as e:
1452
+ schema_dict = self._get_schema(current_df) if current_df is not None else {}
1453
+ schema = (
1454
+ list(schema_dict.keys()) if isinstance(schema_dict, dict) else schema_dict
1455
+ )
1456
+ shape = self._get_shape(current_df) if current_df is not None else None
1457
+
1458
+ exec_context.input_schema = schema
1459
+ exec_context.input_shape = shape
1460
+
1461
+ suggestions = self._generate_suggestions(e, config)
1462
+
1463
+ ctx.error(
1464
+ f"Transform step failed: {step_name}",
1465
+ step_index=step_idx,
1466
+ total_steps=total_steps,
1467
+ error_type=type(e).__name__,
1468
+ error_message=str(e),
1469
+ )
1470
+ if suggestions:
1471
+ ctx.info(f"Suggestions: {'; '.join(suggestions)}")
1472
+
1473
+ raise NodeExecutionError(
1474
+ message=str(e),
1475
+ context=exec_context,
1476
+ original_error=e,
1477
+ suggestions=suggestions,
1478
+ )
1479
+
1480
+ return current_df
1481
+
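A minimal sketch of the step dispatch implemented in _execute_transform above, assuming a simplified Step stand-in for the real step model; the precedence mirrors the function / operation / sql / sql_file checks in the method.

from dataclasses import dataclass
from typing import Optional

@dataclass
class Step:  # hypothetical stand-in for the real step config model
    function: Optional[str] = None
    operation: Optional[str] = None
    sql: Optional[str] = None
    sql_file: Optional[str] = None

def dispatch(step) -> str:
    """Return which executor a step would be routed to."""
    if isinstance(step, str):
        return "sql"            # bare string steps are treated as SQL
    if step.function:
        return "function"
    if step.operation:
        return "operation"
    if step.sql:
        return "sql"
    if step.sql_file:
        return "sql_file"
    raise ValueError("step must define one of: sql, sql_file, function, operation")

assert dispatch("SELECT * FROM df WHERE qty > 0") == "sql"
assert dispatch(Step(function="merge")) == "function"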
1482
+ def _get_step_name(self, step: Any) -> str:
1483
+ """Get human-readable name for a transform step."""
1484
+ if isinstance(step, str):
1485
+ return f"sql:{step[:50]}..." if len(step) > 50 else f"sql:{step}"
1486
+ if hasattr(step, "function") and step.function:
1487
+ return f"function:{step.function}"
1488
+ if hasattr(step, "operation") and step.operation:
1489
+ return f"operation:{step.operation}"
1490
+ if hasattr(step, "sql") and step.sql:
1491
+ sql_preview = step.sql[:50] + "..." if len(step.sql) > 50 else step.sql
1492
+ return f"sql:{sql_preview}"
1493
+ if hasattr(step, "sql_file") and step.sql_file:
1494
+ return f"sql_file:{step.sql_file}"
1495
+ return "unknown"
1496
+
1497
+ def _execute_sql_step(self, sql: str, current_df: Any = None) -> Any:
1498
+ """Execute SQL transformation with thread-safe view names.
1499
+
1500
+ Uses unique temp view names to avoid race conditions when
1501
+ multiple nodes execute SQL steps in parallel.
1502
+
1503
+ Args:
1504
+ sql: SQL query string (references to 'df' are replaced with unique view)
1505
+ current_df: DataFrame to register as the source for 'df' references
1506
+
1507
+ Returns:
1508
+ Result DataFrame from SQL execution
1509
+ """
1510
+ self._executed_sql.append(sql)
1511
+
1512
+ if current_df is not None:
1513
+ view_name = _get_unique_view_name()
1514
+ self.context.register(view_name, current_df)
1515
+ try:
1516
+ safe_sql = re.sub(r"\bdf\b", view_name, sql)
1517
+ return self.engine.execute_sql(safe_sql, self.context)
1518
+ finally:
1519
+ self.context.unregister(view_name)
1520
+ else:
1521
+ return self.engine.execute_sql(sql, self.context)
1522
+
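A small illustration of the word-boundary substitution _execute_sql_step performs; unique_view_name here is a hypothetical stand-in for the module's _get_unique_view_name().

import re
import uuid

def unique_view_name() -> str:
    # hypothetical helper: one unique view per step avoids clashes when
    # multiple nodes run SQL steps in parallel
    return f"df_{uuid.uuid4().hex[:8]}"

sql = "SELECT id, value FROM df WHERE df.value > 0"
view = unique_view_name()
safe_sql = re.sub(r"\bdf\b", view, sql)
# e.g. "SELECT id, value FROM df_1a2b3c4d WHERE df_1a2b3c4d.value > 0"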
1523
+ def _resolve_sql_file(self, sql_file_path: str) -> str:
1524
+ """Load SQL content from external file.
1525
+
1526
+ Args:
1527
+ sql_file_path: Path to .sql file, relative to main config file.
1528
+
1529
+ Returns:
1530
+ SQL content as string.
1531
+
1532
+ Raises:
1533
+ FileNotFoundError: If the SQL file does not exist.
1534
+ ValueError: If the file cannot be read.
1535
+ """
1536
+ if not self.config_file:
1537
+ raise ValueError(
1538
+ f"Cannot resolve sql_file '{sql_file_path}': The config_file path is not available. "
1539
+ f"This happens when a pipeline is created programmatically without a YAML source. "
1540
+ f"Solutions: 1) Load pipeline from YAML using load_config_from_file(), or 2) Use inline 'sql:' instead of 'sql_file:'."
1541
+ )
1542
+
1543
+ config_dir = Path(self.config_file).parent
1544
+ file_path = config_dir / sql_file_path
1545
+
1546
+ if not file_path.exists():
1547
+ raise FileNotFoundError(
1548
+ f"SQL file not found: '{sql_file_path}'. "
1549
+ f"Looked in: {file_path.absolute()}. "
1550
+ f"The path is resolved relative to the YAML config file at: {config_dir.absolute()}. "
1551
+ f"Check: 1) The file exists at the expected location. 2) The path is relative to your pipeline YAML, not project.yaml."
1552
+ )
1553
+
1554
+ try:
1555
+ return file_path.read_text(encoding="utf-8")
1556
+ except Exception as e:
1557
+ raise ValueError(
1558
+ f"Failed to read SQL file '{sql_file_path}' at {file_path.absolute()}. "
1559
+ f"Error: {type(e).__name__}: {e}. "
1560
+ f"Check file permissions and encoding (must be UTF-8)."
1561
+ ) from e
1562
+
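A sketch of the path resolution rule described in _resolve_sql_file, with illustrative paths: sql_file entries resolve relative to the directory of the pipeline YAML, not the project root.

from pathlib import Path

config_file = Path("pipelines/sales/pipeline.yaml")  # hypothetical layout
sql_file = "sql/clean_orders.sql"                    # value from a step's sql_file
resolved = config_file.parent / sql_file             # pipelines/sales/sql/clean_orders.sql
sql_text = resolved.read_text(encoding="utf-8") if resolved.exists() else None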
1563
+ def _execute_function_step(
1564
+ self,
1565
+ function_name: str,
1566
+ params: Dict[str, Any],
1567
+ current_df: Optional[Any],
1568
+ pii_metadata: Optional[Dict[str, bool]] = None,
1569
+ ) -> Any:
1570
+ """Execute Python function transformation."""
1571
+ if current_df is not None:
1572
+ current_df = self.engine.materialize(current_df)
1573
+
1574
+ # Merge global delta_table_properties into merge transformer params
1575
+ if function_name == "merge" and self.performance_config:
1576
+ global_props = getattr(self.performance_config, "delta_table_properties", None) or {}
1577
+ node_props = params.get("table_properties") or {}
1578
+ merged_props = {**global_props, **node_props}
1579
+ if merged_props:
1580
+ params = dict(params) # Don't mutate original
1581
+ params["table_properties"] = merged_props
1582
+
1583
+ FunctionRegistry.validate_params(function_name, params)
1584
+ func = FunctionRegistry.get(function_name)
1585
+ sig = inspect.signature(func)
1586
+
1587
+ engine_type = EngineType.PANDAS if self.engine.name == "pandas" else EngineType.SPARK
1588
+ engine_ctx = EngineContext(
1589
+ context=self.context,
1590
+ df=current_df,
1591
+ engine_type=engine_type,
1592
+ sql_executor=self.engine.execute_sql,
1593
+ engine=self.engine,
1594
+ pii_metadata=pii_metadata,
1595
+ )
1596
+
1597
+ param_model = FunctionRegistry.get_param_model(function_name)
1598
+ call_kwargs = {}
1599
+
1600
+ if "current" in sig.parameters:
1601
+ call_kwargs["current"] = current_df
1602
+
1603
+ if param_model:
1604
+ try:
1605
+ params_obj = param_model(**params)
1606
+ except Exception as e:
1607
+ raise ValueError(f"Invalid parameters for '{function_name}': {e}")
1608
+
1609
+ result = func(engine_ctx, params_obj, **call_kwargs)
1610
+ else:
1611
+ result = func(engine_ctx, **params, **call_kwargs)
1612
+
1613
+ if engine_ctx._sql_history:
1614
+ self._executed_sql.extend(engine_ctx._sql_history)
1615
+
1616
+ if isinstance(result, EngineContext):
1617
+ return result.df
1618
+
1619
+ return result
1620
+
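A rough sketch of the calling convention _execute_function_step follows, using stand-in types rather than odibi's real registry, EngineContext, or param models: the function receives the engine context, then its validated params object, and 'current' only when its signature declares it.

from dataclasses import dataclass
from typing import Any, Optional
import pandas as pd

@dataclass
class FakeEngineContext:  # illustrative stand-in only
    df: Any

@dataclass
class DropNullsParams:    # hypothetical param model
    subset: Optional[list] = None

def drop_nulls(ctx: FakeEngineContext, params: DropNullsParams, current=None):
    # engine context first, validated params second, 'current' by keyword
    return current.dropna(subset=params.subset)

df = pd.DataFrame({"a": [1, None, 3]})
out = drop_nulls(FakeEngineContext(df), DropNullsParams(subset=["a"]), current=df)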
1621
+ def _execute_operation_step(
1622
+ self, operation: str, params: Dict[str, Any], current_df: Any
1623
+ ) -> Any:
1624
+ """Execute built-in operation."""
1625
+ if current_df is not None:
1626
+ current_df = self.engine.materialize(current_df)
1627
+ return self.engine.execute_operation(operation, params, current_df)
1628
+
1629
+ def _execute_validation_phase(
1630
+ self,
1631
+ config: NodeConfig,
1632
+ result_df: Any,
1633
+ ctx: Optional["LoggingContext"] = None,
1634
+ ) -> Any:
1635
+ """Execute validation with quarantine and gate support.
1636
+
1637
+ Returns:
1638
+ DataFrame (valid rows only if quarantine is used)
1639
+ """
1640
+ if ctx is None:
1641
+ ctx = get_logging_context()
1642
+
1643
+ if not config.validation or result_df is None:
1644
+ return result_df
1645
+
1646
+ test_count = len(config.validation.tests)
1647
+ ctx.debug("Starting validation phase", test_count=test_count)
1648
+
1649
+ with ctx.operation(OperationType.VALIDATE, f"validation:{config.name}") as metrics:
1650
+ rows_before = self._count_rows(result_df)
1651
+ metrics.rows_in = rows_before
1652
+
1653
+ result_df = self.engine.materialize(result_df)
1654
+
1655
+ for test in config.validation.tests:
1656
+ if test.type == "volume_drop" and self.catalog_manager:
1657
+ avg_rows = self.catalog_manager.get_average_volume(
1658
+ config.name, days=test.lookback_days
1659
+ )
1660
+ if avg_rows and avg_rows > 0:
1661
+ current_rows = self._count_rows(result_df)
1662
+ drop_pct = (avg_rows - current_rows) / avg_rows
1663
+ if drop_pct > test.threshold:
1664
+ ctx.error(
1665
+ "Volume drop validation failed",
1666
+ drop_percentage=f"{drop_pct:.1%}",
1667
+ threshold=f"{test.threshold:.1%}",
1668
+ current_rows=current_rows,
1669
+ average_rows=avg_rows,
1670
+ )
1671
+ raise ValidationError(
1672
+ config.name,
1673
+ [
1674
+ f"Volume dropped by {drop_pct:.1%} "
1675
+ f"(Threshold: {test.threshold:.1%})"
1676
+ ],
1677
+ )
1678
+
1679
+ from odibi.validation.quarantine import (
1680
+ add_quarantine_metadata,
1681
+ has_quarantine_tests,
1682
+ split_valid_invalid,
1683
+ write_quarantine,
1684
+ )
1685
+
1686
+ validation_config = config.validation
1687
+ quarantine_config = validation_config.quarantine
1688
+ has_quarantine = has_quarantine_tests(validation_config.tests)
1689
+
1690
+ test_results: dict = {}
1691
+
1692
+ if has_quarantine and quarantine_config:
1693
+ quarantine_result = split_valid_invalid(
1694
+ result_df,
1695
+ validation_config.tests,
1696
+ self.engine,
1697
+ )
1698
+
1699
+ if quarantine_result.rows_quarantined > 0:
1700
+ import uuid
1701
+
1702
+ run_id = str(uuid.uuid4())
1703
+ invalid_with_meta = add_quarantine_metadata(
1704
+ quarantine_result.invalid_df,
1705
+ quarantine_result.test_results,
1706
+ quarantine_config.add_columns,
1707
+ self.engine,
1708
+ config.name,
1709
+ run_id,
1710
+ validation_config.tests,
1711
+ )
1712
+
1713
+ write_quarantine(
1714
+ invalid_with_meta,
1715
+ quarantine_config,
1716
+ self.engine,
1717
+ self.connections,
1718
+ )
1719
+
1720
+ ctx.warning(
1721
+ f"Quarantined {quarantine_result.rows_quarantined} rows",
1722
+ quarantine_path=quarantine_config.path or quarantine_config.table,
1723
+ rows_quarantined=quarantine_result.rows_quarantined,
1724
+ )
1725
+
1726
+ self._execution_steps.append(
1727
+ f"Quarantined {quarantine_result.rows_quarantined} rows to "
1728
+ f"{quarantine_config.path or quarantine_config.table}"
1729
+ )
1730
+
1731
+ result_df = quarantine_result.valid_df
1732
+ test_results = quarantine_result.test_results
1733
+
1734
+ # Run standard validation on remaining rows
1735
+ self._execute_validation(config, result_df)
1736
+
1737
+ # Check quality gate
1738
+ if validation_config.gate:
1739
+ result_df = self._check_gate(config, result_df, test_results, validation_config.gate)
1740
+
1741
+ return result_df
1742
+
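A toy pandas illustration of the valid/invalid split the quarantine path performs for a single rule; odibi's split_valid_invalid evaluates each configured test and carries per-test results alongside the two frames.

import pandas as pd

df = pd.DataFrame({"order_id": [1, 2, None], "amount": [10.0, -5.0, 3.0]})
mask = df["order_id"].notna() & (df["amount"] > 0)
valid_df, invalid_df = df[mask], df[~mask]
# valid rows continue through the pipeline; invalid rows are written to the
# quarantine target with metadata columns (node name, run id, failed tests)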
1743
+ def _execute_validation(self, config: NodeConfig, df: Any) -> None:
1744
+ """Execute validation rules."""
1745
+ from odibi.config import ValidationAction
1746
+ from odibi.validation.engine import Validator
1747
+
1748
+ validation_config = config.validation
1749
+ validator = Validator()
1750
+ failures = validator.validate(df, validation_config)
1751
+
1752
+ # Observability: validation failures are not registered as individual catalog metrics here.
1753
+ if self.catalog_manager:
1754
+ # Failures are surfaced through the node's result metadata and recorded in
1755
+ # meta_runs metrics_json; per-test metric registration may be added later.
1756
+ # This branch is intentionally a no-op for now.
1757
+ pass
1758
+
1759
+ if failures:
1760
+ if validation_config.mode == ValidationAction.FAIL:
1761
+ raise ValidationError(config.name, failures)
1762
+ elif validation_config.mode == ValidationAction.WARN:
1763
+ import logging
1764
+
1765
+ logger = logging.getLogger(__name__)
1766
+ for fail in failures:
1767
+ logger.warning(f"Validation Warning (Node {config.name}): {fail}")
1768
+ self._execution_steps.append(f"Warning: {fail}")
1769
+ self._validation_warnings.append(fail)
1770
+
1771
+ def _check_gate(
1772
+ self,
1773
+ config: NodeConfig,
1774
+ df: Any,
1775
+ test_results: dict,
1776
+ gate_config: Any,
1777
+ ) -> Any:
1778
+ """Check quality gate and take action if failed.
1779
+
1780
+ Args:
1781
+ config: Node configuration
1782
+ df: DataFrame to check
1783
+ test_results: Dict of test_name -> per-row boolean results
1784
+ gate_config: GateConfig
1785
+
1786
+ Returns:
1787
+ DataFrame (potentially filtered if gate action is WRITE_VALID_ONLY)
1788
+
1789
+ Raises:
1790
+ GateFailedError: If gate fails and action is ABORT
1791
+ """
1792
+ from odibi.config import GateOnFail
1793
+ from odibi.exceptions import GateFailedError
1794
+ from odibi.validation.gate import evaluate_gate
1795
+
1796
+ gate_result = evaluate_gate(
1797
+ df,
1798
+ test_results,
1799
+ gate_config,
1800
+ self.engine,
1801
+ catalog=self.catalog_manager,
1802
+ node_name=config.name,
1803
+ )
1804
+
1805
+ if gate_result.passed:
1806
+ self._execution_steps.append(f"Gate passed: {gate_result.pass_rate:.1%} pass rate")
1807
+ return df
1808
+
1809
+ self._execution_steps.append(
1810
+ f"Gate failed: {gate_result.pass_rate:.1%} pass rate "
1811
+ f"(required: {gate_config.require_pass_rate:.1%})"
1812
+ )
1813
+
1814
+ if gate_result.action == GateOnFail.ABORT:
1815
+ raise GateFailedError(
1816
+ node_name=config.name,
1817
+ pass_rate=gate_result.pass_rate,
1818
+ required_rate=gate_config.require_pass_rate,
1819
+ failed_rows=gate_result.failed_rows,
1820
+ total_rows=gate_result.total_rows,
1821
+ failure_reasons=gate_result.failure_reasons,
1822
+ )
1823
+
1824
+ elif gate_result.action == GateOnFail.WARN_AND_WRITE:
1825
+ import logging
1826
+
1827
+ logger = logging.getLogger(__name__)
1828
+ for reason in gate_result.failure_reasons:
1829
+ logger.warning(f"Gate Warning (Node {config.name}): {reason}")
1830
+ self._validation_warnings.append(f"Gate: {reason}")
1831
+ return df
1832
+
1833
+ elif gate_result.action == GateOnFail.WRITE_VALID_ONLY:
1834
+ self._execution_steps.append(
1835
+ f"Writing only valid rows ({gate_result.passed_rows} of {gate_result.total_rows})"
1836
+ )
1837
+ return df
1838
+
1839
+ return df
1840
+
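The pass-rate arithmetic behind _check_gate, with illustrative numbers.

total_rows = 1_000
failed_rows = 37
pass_rate = (total_rows - failed_rows) / total_rows   # 0.963
require_pass_rate = 0.95
gate_passed = pass_rate >= require_pass_rate          # True -> write proceeds
# If the gate fails, the on-fail action decides the outcome: abort, warn and
# write everything, or write only the rows that passed.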
1841
+ def _determine_write_mode(self, config: NodeConfig) -> Optional[WriteMode]:
1842
+ """Determine write mode."""
1843
+ if not config.write or config.write.first_run_query is None:
1844
+ return None
1845
+
1846
+ write_config = config.write
1847
+ target_connection = self.connections.get(write_config.connection)
1848
+
1849
+ if target_connection is None:
1850
+ return None
1851
+
1852
+ table_exists = self._cached_table_exists(
1853
+ target_connection, table=write_config.table, path=write_config.path
1854
+ )
1855
+
1856
+ if not table_exists:
1857
+ return WriteMode.OVERWRITE
1858
+
1859
+ return None
1860
+
1861
+ def _execute_write_phase(
1862
+ self,
1863
+ config: NodeConfig,
1864
+ df: Any,
1865
+ override_mode: Optional[WriteMode] = None,
1866
+ ctx: Optional[LoggingContext] = None,
1867
+ ) -> None:
1868
+ """Execute write operation."""
1869
+ if ctx is None:
1870
+ ctx = get_logging_context()
1871
+
1872
+ if not config.write:
1873
+ return
1874
+
1875
+ write_config = config.write
1876
+ connection = self.connections.get(write_config.connection)
1877
+
1878
+ if connection is None:
1879
+ raise ValueError(f"Connection '{write_config.connection}' not found.")
1880
+
1881
+ # For Delta writes, defer row count to avoid double DAG execution.
1882
+ # We'll extract row count from Delta commit metadata after write.
1883
+ # For non-Delta formats, count upfront as before.
1884
+ defer_row_count = write_config.format == "delta" and df is not None
1885
+ row_count = None if defer_row_count else (self._count_rows(df) if df is not None else 0)
1886
+ mode = override_mode if override_mode is not None else write_config.mode
1887
+
1888
+ with ctx.operation(
1889
+ OperationType.WRITE,
1890
+ f"target:{write_config.connection}",
1891
+ format=write_config.format,
1892
+ table=write_config.table,
1893
+ path=write_config.path,
1894
+ mode=str(mode) if mode else None,
1895
+ ) as metrics:
1896
+ metrics.rows_in = row_count
1897
+
1898
+ if write_config.skip_if_unchanged and df is not None:
1899
+ skip_result = self._check_skip_if_unchanged(config, df, connection)
1900
+ if skip_result["should_skip"]:
1901
+ self._execution_steps.append(
1902
+ f"Skipped write: content unchanged (hash: {skip_result['hash'][:12]}...)"
1903
+ )
1904
+ ctx.info(
1905
+ "Skipping write - content unchanged",
1906
+ content_hash=skip_result["hash"][:12],
1907
+ )
1908
+ return
1909
+
1910
+ if config.schema_policy and df is not None:
1911
+ target_schema = self.engine.get_table_schema(
1912
+ connection=connection,
1913
+ table=write_config.table,
1914
+ path=write_config.path,
1915
+ format=write_config.format,
1916
+ )
1917
+ if target_schema:
1918
+ df = self.engine.harmonize_schema(df, target_schema, config.schema_policy)
1919
+ ctx.debug("Applied schema harmonization")
1920
+ self._execution_steps.append("Applied Schema Policy (Harmonization)")
1921
+
1922
+ if write_config.add_metadata and df is not None:
1923
+ df = self._add_write_metadata(config, df)
1924
+ self._execution_steps.append("Added Bronze metadata columns")
1925
+
1926
+ write_options = write_config.options.copy() if write_config.options else {}
1927
+ deep_diag = write_options.pop("deep_diagnostics", False)
1928
+ diff_keys = write_options.pop("diff_keys", None)
1929
+
1930
+ # Extract partition_by from WriteConfig and add to write_options
1931
+ if write_config.partition_by:
1932
+ write_options["partition_by"] = write_config.partition_by
1933
+ ctx.debug("Partitioning by", columns=write_config.partition_by)
1934
+ self._execution_steps.append(f"Partition by: {write_config.partition_by}")
1935
+
1936
+ # Extract zorder_by from WriteConfig and add to write_options (Delta only)
1937
+ if write_config.zorder_by:
1938
+ if write_config.format == "delta":
1939
+ write_options["zorder_by"] = write_config.zorder_by
1940
+ ctx.debug("Z-Ordering by", columns=write_config.zorder_by)
1941
+ self._execution_steps.append(f"Z-Order by: {write_config.zorder_by}")
1942
+ else:
1943
+ ctx.warning(
1944
+ "zorder_by is only supported for Delta format, ignoring",
1945
+ format=write_config.format,
1946
+ )
1947
+
1948
+ # Extract merge_schema from WriteConfig (Delta schema evolution)
1949
+ if write_config.merge_schema:
1950
+ if write_config.format == "delta":
1951
+ write_options["mergeSchema"] = True
1952
+ ctx.debug("Schema evolution enabled (mergeSchema=true)")
1953
+ self._execution_steps.append("Schema evolution enabled (mergeSchema)")
1954
+ else:
1955
+ # For Spark with other formats, use schema_mode if applicable
1956
+ write_options["schema_mode"] = "merge"
1957
+ ctx.debug("Schema merge mode enabled")
1958
+ self._execution_steps.append("Schema merge mode enabled")
1959
+
1960
+ # Extract merge_keys and merge_options from WriteConfig (SQL Server MERGE)
1961
+ if write_config.merge_keys:
1962
+ write_options["merge_keys"] = write_config.merge_keys
1963
+ ctx.debug("Merge keys configured", keys=write_config.merge_keys)
1964
+ if write_config.merge_options:
1965
+ write_options["merge_options"] = write_config.merge_options
1966
+ ctx.debug("Merge options configured")
1967
+
1968
+ if write_config.format == "delta":
1969
+ merged_props = {}
1970
+ if self.performance_config and hasattr(
1971
+ self.performance_config, "delta_table_properties"
1972
+ ):
1973
+ merged_props.update(self.performance_config.delta_table_properties or {})
1974
+ if write_config.table_properties:
1975
+ merged_props.update(write_config.table_properties)
1976
+ if merged_props:
1977
+ write_options["table_properties"] = merged_props
1978
+
1979
+ # Handle materialized strategy
1980
+ if config.materialized:
1981
+ if config.materialized == "view":
1982
+ # Create a view instead of writing to table
1983
+ if write_config.table and hasattr(self.engine, "create_view"):
1984
+ ctx.info(f"Creating view: {write_config.table}")
1985
+ self.engine.create_view(
1986
+ df=df,
1987
+ view_name=write_config.table,
1988
+ connection=connection,
1989
+ )
1990
+ self._execution_steps.append(f"Created view: {write_config.table}")
1991
+ ctx.info(
1992
+ f"View created: {write_config.table}",
1993
+ materialized="view",
1994
+ rows=row_count,
1995
+ )
1996
+ return
1997
+ else:
1998
+ ctx.warning(
1999
+ "View materialization requires table name and engine support",
2000
+ table=write_config.table,
2001
+ )
2002
+ elif config.materialized == "incremental":
2003
+ # Use append mode for incremental materialization
2004
+ mode = WriteMode.APPEND
2005
+ ctx.debug("Using append mode for incremental materialization")
2006
+ self._execution_steps.append("Materialized: incremental (append mode)")
2007
+ elif config.materialized == "table":
2008
+ # Default table write behavior
2009
+ ctx.debug("Using table materialization (default write)")
2010
+ self._execution_steps.append("Materialized: table")
2011
+
2012
+ delta_info = self.engine.write(
2013
+ df=df,
2014
+ connection=connection,
2015
+ format=write_config.format,
2016
+ table=write_config.table,
2017
+ path=write_config.path,
2018
+ register_table=write_config.register_table,
2019
+ mode=mode,
2020
+ options=write_options,
2021
+ streaming_config=write_config.streaming,
2022
+ )
2023
+
2024
+ # Extract row count from Delta commit metadata if deferred
2025
+ if defer_row_count:
2026
+ if delta_info:
2027
+ op_metrics = delta_info.get("operation_metrics") or {}
2028
+ # Delta returns numOutputRows for most operations
2029
+ row_count = op_metrics.get("numOutputRows") or op_metrics.get(
2030
+ "numTargetRowsInserted"
2031
+ )
2032
+ if row_count is not None:
2033
+ try:
2034
+ row_count = int(row_count)
2035
+ except (ValueError, TypeError):
2036
+ row_count = None
2037
+ # Fallback: count if Delta metrics unavailable (e.g., older Delta versions)
2038
+ if row_count is None:
2039
+ ctx.debug("Delta commit metrics unavailable, falling back to count")
2040
+ row_count = self._count_rows(df) if df is not None else 0
2041
+
2042
+ metrics.rows_out = row_count
2043
+
2044
+ ctx.info(
2045
+ f"Write completed to {write_config.connection}",
2046
+ format=write_config.format,
2047
+ table=write_config.table,
2048
+ path=write_config.path,
2049
+ mode=str(mode) if mode else None,
2050
+ rows=row_count,
2051
+ )
2052
+
2053
+ if write_config.auto_optimize and write_config.format == "delta":
2054
+ opt_config = write_config.auto_optimize
2055
+ if isinstance(opt_config, bool):
2056
+ if opt_config:
2057
+ from odibi.config import AutoOptimizeConfig
2058
+
2059
+ opt_config = AutoOptimizeConfig(enabled=True)
2060
+ else:
2061
+ opt_config = None
2062
+
2063
+ if opt_config:
2064
+ ctx.debug("Running auto-optimize on Delta table")
2065
+ self.engine.maintain_table(
2066
+ connection=connection,
2067
+ format=write_config.format,
2068
+ table=write_config.table,
2069
+ path=write_config.path,
2070
+ config=opt_config,
2071
+ )
2072
+
2073
+ if delta_info:
2074
+ self._delta_write_info = delta_info
2075
+ self._calculate_delta_diagnostics(
2076
+ delta_info, connection, write_config, deep_diag, diff_keys
2077
+ )
2078
+
2079
+ # Store row count from write phase to avoid redundant counting in metadata
2080
+ if self._delta_write_info is None:
2081
+ self._delta_write_info = {}
2082
+ self._delta_write_info["_cached_row_count"] = row_count
2083
+
2084
+ if write_config.skip_if_unchanged and write_config.format == "delta":
2085
+ self._store_content_hash_after_write(config, connection)
2086
+
2087
+ # Phase 3: Catalog integration after successful write
2088
+ # Skip if performance config disables catalog writes
2089
+ skip_catalog = self.performance_config and getattr(
2090
+ self.performance_config, "skip_catalog_writes", False
2091
+ )
2092
+ if not skip_catalog:
2093
+ self._register_catalog_entries(config, df, connection, write_config, ctx)
2094
+ else:
2095
+ ctx.debug("Skipping catalog writes (skip_catalog_writes=true)")
2096
+
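A sketch of the deferred row-count extraction used in the write phase above: Delta commit metrics are preferred, and an explicit count is only the fallback. The metrics dict is illustrative of what a commit can report.

operation_metrics = {"numOutputRows": "1523"}  # illustrative commit metrics
row_count = operation_metrics.get("numOutputRows") or operation_metrics.get("numTargetRowsInserted")
try:
    row_count = int(row_count) if row_count is not None else None
except (ValueError, TypeError):
    row_count = None
if row_count is None:
    row_count = 0  # the real code falls back to engine.count_rows(df) here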
2097
+ def _register_catalog_entries(
2098
+ self,
2099
+ config: NodeConfig,
2100
+ df: Any,
2101
+ connection: Any,
2102
+ write_config: Any,
2103
+ ctx: Optional["LoggingContext"] = None,
2104
+ ) -> None:
2105
+ """Register catalog entries after successful write.
2106
+
2107
+ Handles Phase 3.2-3.5: register_asset, track_schema, log_pattern, record_lineage
2108
+
2109
+ When batch_write_buffers is provided, records are buffered for batch write
2110
+ at the end of pipeline execution to eliminate concurrency conflicts.
2111
+ """
2112
+ if not self.catalog_manager:
2113
+ return
2114
+
2115
+ if ctx is None:
2116
+ ctx = get_logging_context()
2117
+
2118
+ import uuid
2119
+
2120
+ run_id = str(uuid.uuid4())
2121
+
2122
+ # Check if we should buffer writes for batch processing
2123
+ use_batch_mode = (
2124
+ self.batch_write_buffers is not None
2125
+ and "lineage" in self.batch_write_buffers
2126
+ and "assets" in self.batch_write_buffers
2127
+ )
2128
+
2129
+ # Determine table path
2130
+ table_path = None
2131
+ if hasattr(connection, "get_path"):
2132
+ table_path = connection.get_path(write_config.path or write_config.table)
2133
+ else:
2134
+ table_path = write_config.path or write_config.table
2135
+
2136
+ # 3.2: Register asset (meta_tables)
2137
+ try:
2138
+ project_name = "unknown"
2139
+ if hasattr(self, "project_config") and self.project_config:
2140
+ project_name = getattr(self.project_config, "project", "unknown")
2141
+
2142
+ table_name = write_config.table or config.name
2143
+ pattern_type = config.materialized or "table"
2144
+
2145
+ schema_hash = ""
2146
+ if df is not None:
2147
+ schema = self._get_schema(df)
2148
+ if isinstance(schema, dict):
2149
+ import hashlib
2150
+ import json
2151
+
2152
+ schema_hash = hashlib.md5(
2153
+ json.dumps(schema, sort_keys=True).encode()
2154
+ ).hexdigest()
2155
+
2156
+ asset_record = {
2157
+ "project_name": project_name,
2158
+ "table_name": table_name,
2159
+ "path": table_path or "",
2160
+ "format": write_config.format or "delta",
2161
+ "pattern_type": pattern_type,
2162
+ "schema_hash": schema_hash,
2163
+ }
2164
+
2165
+ if use_batch_mode:
2166
+ self.batch_write_buffers["assets"].append(asset_record)
2167
+ ctx.debug(f"Buffered asset for batch write: {table_name}")
2168
+ else:
2169
+ self.catalog_manager.register_asset(**asset_record)
2170
+ ctx.debug(f"Registered asset: {table_name}")
2171
+
2172
+ except Exception as e:
2173
+ ctx.debug(f"Failed to register asset: {e}")
2174
+
2175
+ # 3.3: Track schema changes (meta_schemas)
2176
+ try:
2177
+ if df is not None and table_path:
2178
+ schema = self._get_schema(df)
2179
+ if isinstance(schema, dict):
2180
+ pipeline_name = self.pipeline_name or (
2181
+ config.tags[0] if config.tags else "unknown"
2182
+ )
2183
+ self.catalog_manager.track_schema(
2184
+ table_path=table_path,
2185
+ schema=schema,
2186
+ pipeline=pipeline_name,
2187
+ node=config.name,
2188
+ run_id=run_id,
2189
+ )
2190
+ ctx.debug(f"Tracked schema for: {table_path}")
2191
+
2192
+ except Exception as e:
2193
+ ctx.debug(f"Failed to track schema: {e}")
2194
+
2195
+ # 3.4: Log pattern usage (meta_patterns)
2196
+ try:
2197
+ if config.materialized:
2198
+ import json
2199
+
2200
+ pattern_config = {
2201
+ "materialized": config.materialized,
2202
+ "format": write_config.format,
2203
+ "mode": str(write_config.mode) if write_config.mode else None,
2204
+ }
2205
+ table_name = write_config.table or config.name
2206
+ self.catalog_manager.log_pattern(
2207
+ table_name=table_name,
2208
+ pattern_type=config.materialized,
2209
+ configuration=json.dumps(pattern_config),
2210
+ compliance_score=1.0,
2211
+ )
2212
+ ctx.debug(f"Logged pattern: {config.materialized}")
2213
+
2214
+ except Exception as e:
2215
+ ctx.debug(f"Failed to log pattern: {e}")
2216
+
2217
+ # 3.5: Record lineage (meta_lineage)
2218
+ try:
2219
+ if config.read and table_path:
2220
+ source_path = None
2221
+ read_config = config.read
2222
+ read_conn = self.connections.get(read_config.connection)
2223
+ if read_conn and hasattr(read_conn, "get_path"):
2224
+ source_path = read_conn.get_path(read_config.path or read_config.table)
2225
+ else:
2226
+ source_path = read_config.path or read_config.table
2227
+
2228
+ if source_path:
2229
+ pipeline_name = self.pipeline_name or (
2230
+ config.tags[0] if config.tags else "unknown"
2231
+ )
2232
+ lineage_record = {
2233
+ "source_table": source_path,
2234
+ "target_table": table_path,
2235
+ "target_pipeline": pipeline_name,
2236
+ "target_node": config.name,
2237
+ "run_id": run_id,
2238
+ }
2239
+
2240
+ if use_batch_mode:
2241
+ self.batch_write_buffers["lineage"].append(lineage_record)
2242
+ ctx.debug(
2243
+ f"Buffered lineage for batch write: {source_path} -> {table_path}"
2244
+ )
2245
+ else:
2246
+ self.catalog_manager.record_lineage(**lineage_record)
2247
+ ctx.debug(f"Recorded lineage: {source_path} -> {table_path}")
2248
+
2249
+ except Exception as e:
2250
+ ctx.debug(f"Failed to record lineage: {e}")
2251
+
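A sketch of the batch-versus-direct decision for catalog records shown above; record values are illustrative. Buffered records are flushed once at the end of the pipeline to avoid concurrent writes to the catalog tables.

batch_write_buffers = {"assets": [], "lineage": []}   # provided by the pipeline
asset_record = {
    "project_name": "demo", "table_name": "orders", "path": "/bronze/orders",
    "format": "delta", "pattern_type": "table", "schema_hash": "",
}
use_batch_mode = batch_write_buffers is not None and {"assets", "lineage"} <= batch_write_buffers.keys()
if use_batch_mode:
    batch_write_buffers["assets"].append(asset_record)  # flushed in one write later
# else: catalog_manager.register_asset(**asset_record)  # immediate per-node write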
2252
+ def _add_write_metadata(self, config: NodeConfig, df: Any) -> Any:
2253
+ """Add Bronze metadata columns to DataFrame before writing.
2254
+
2255
+ Args:
2256
+ config: Node configuration containing read/write settings
2257
+ df: DataFrame to add metadata to
2258
+
2259
+ Returns:
2260
+ DataFrame with metadata columns added
2261
+ """
2262
+ write_config = config.write
2263
+ read_config = config.read
2264
+
2265
+ # Determine source info from read config
2266
+ source_connection = None
2267
+ source_table = None
2268
+ source_path = None
2269
+ is_file_source = False
2270
+
2271
+ if read_config:
2272
+ source_connection = read_config.connection
2273
+ source_table = read_config.table
2274
+
2275
+ # Determine if file source based on format
2276
+ read_format = str(read_config.format).lower()
2277
+ file_formats = {"csv", "parquet", "json", "avro", "excel"}
2278
+ is_file_source = read_format in file_formats
2279
+
2280
+ if is_file_source:
2281
+ source_path = read_config.path
2282
+
2283
+ # Call engine's metadata helper
2284
+ return self.engine.add_write_metadata(
2285
+ df=df,
2286
+ metadata_config=write_config.add_metadata,
2287
+ source_connection=source_connection,
2288
+ source_table=source_table,
2289
+ source_path=source_path,
2290
+ is_file_source=is_file_source,
2291
+ )
2292
+
2293
+ def _check_skip_if_unchanged(
2294
+ self,
2295
+ config: NodeConfig,
2296
+ df: Any,
2297
+ connection: Any,
2298
+ ) -> Dict[str, Any]:
2299
+ """Check if write should be skipped due to unchanged content.
2300
+
2301
+ Args:
2302
+ config: Node configuration
2303
+ df: DataFrame to check
2304
+ connection: Target connection
2305
+
2306
+ Returns:
2307
+ Dict with 'should_skip' (bool) and 'hash' (str)
2308
+ """
2309
+ write_config = config.write
2310
+ format_str = str(write_config.format).lower()
2311
+
2312
+ if format_str != "delta":
2313
+ from odibi.utils.logging import logger
2314
+
2315
+ logger.warning(
2316
+ f"[{config.name}] skip_if_unchanged only supported for Delta format, "
2317
+ f"got '{format_str}'. Proceeding with write."
2318
+ )
2319
+ return {"should_skip": False, "hash": None}
2320
+
2321
+ from odibi.enums import EngineType
2322
+ from odibi.utils.content_hash import get_content_hash_from_state
2323
+
2324
+ engine_type = EngineType.SPARK if self.engine.name == "spark" else EngineType.PANDAS
2325
+ if engine_type == EngineType.SPARK:
2326
+ from odibi.utils.content_hash import compute_spark_dataframe_hash
2327
+
2328
+ current_hash = compute_spark_dataframe_hash(
2329
+ df,
2330
+ columns=write_config.skip_hash_columns,
2331
+ sort_columns=write_config.skip_hash_sort_columns,
2332
+ )
2333
+ else:
2334
+ from odibi.utils.content_hash import compute_dataframe_hash
2335
+
2336
+ pandas_df = df
2337
+ if hasattr(df, "to_pandas"):
2338
+ pandas_df = df.to_pandas()
2339
+
2340
+ current_hash = compute_dataframe_hash(
2341
+ pandas_df,
2342
+ columns=write_config.skip_hash_columns,
2343
+ sort_columns=write_config.skip_hash_sort_columns,
2344
+ )
2345
+
2346
+ table_name = write_config.table or write_config.path
2347
+ state_backend = (
2348
+ getattr(self.state_manager, "backend", None) if hasattr(self, "state_manager") else None
2349
+ )
2350
+ previous_hash = get_content_hash_from_state(state_backend, config.name, table_name)
2351
+
2352
+ if previous_hash and current_hash == previous_hash:
2353
+ # Before skipping, verify the target actually exists
2354
+ # If target was deleted, we must write even if hash matches
2355
+ target_exists = self._check_target_exists(write_config, connection)
2356
+ if not target_exists:
2357
+ from odibi.utils.logging_context import get_logging_context
2358
+
2359
+ ctx = get_logging_context()
2360
+ ctx.warning(
2361
+ f"[{config.name}] Target does not exist despite matching hash, "
2362
+ "proceeding with write"
2363
+ )
2364
+ self._pending_content_hash = current_hash
2365
+ return {"should_skip": False, "hash": current_hash}
2366
+ return {"should_skip": True, "hash": current_hash}
2367
+
2368
+ self._pending_content_hash = current_hash
2369
+ return {"should_skip": False, "hash": current_hash}
2370
+
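A toy version of the content-hash skip decision in _check_skip_if_unchanged; odibi's real helpers normalize column selection and optionally sort rows before hashing, and re-verify that the Delta target still exists.

import hashlib
import pandas as pd

def toy_hash(df: pd.DataFrame) -> str:
    # simplified stand-in for compute_dataframe_hash
    return hashlib.md5(df.to_csv(index=False).encode("utf-8")).hexdigest()

current_hash = toy_hash(pd.DataFrame({"a": [1, 2]}))
previous_hash = current_hash          # value read back from the state backend
target_still_exists = True            # the real code re-checks the table/path
should_skip = bool(previous_hash) and current_hash == previous_hash and target_still_exists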
2371
+ def _store_content_hash_after_write(
2372
+ self,
2373
+ config: NodeConfig,
2374
+ connection: Any,
2375
+ ) -> None:
2376
+ """Store content hash in state catalog after successful write."""
2377
+ if not hasattr(self, "_pending_content_hash") or not self._pending_content_hash:
2378
+ return
2379
+
2380
+ write_config = config.write
2381
+ content_hash = self._pending_content_hash
2382
+
2383
+ from odibi.utils.content_hash import set_content_hash_in_state
2384
+
2385
+ try:
2386
+ table_name = write_config.table or write_config.path
2387
+ state_backend = (
2388
+ getattr(self.state_manager, "backend", None)
2389
+ if hasattr(self, "state_manager")
2390
+ else None
2391
+ )
2392
+
2393
+ set_content_hash_in_state(state_backend, config.name, table_name, content_hash)
2394
+
2395
+ from odibi.utils.logging import logger
2396
+
2397
+ logger.debug(f"[{config.name}] Stored content hash: {content_hash[:12]}...")
2398
+ except Exception as e:
2399
+ from odibi.utils.logging import logger
2400
+
2401
+ logger.warning(f"[{config.name}] Failed to store content hash: {e}")
2402
+ finally:
2403
+ self._pending_content_hash = None
2404
+
2405
+ def _check_target_exists(self, write_config: Any, connection: Any) -> bool:
2406
+ """Check if the target table or path exists.
2407
+
2408
+ Used by skip_if_unchanged to verify target wasn't deleted.
2409
+
2410
+ Args:
2411
+ write_config: Write configuration with table/path info
2412
+ connection: Target connection
2413
+
2414
+ Returns:
2415
+ True if target exists, False otherwise
2416
+ """
2417
+ try:
2418
+ if write_config.table:
2419
+ # Table-based target
2420
+ if hasattr(self.engine, "spark"):
2421
+ return self.engine.spark.catalog.tableExists(write_config.table)
2422
+ return True # Assume exists for non-Spark engines
2423
+
2424
+ if write_config.path:
2425
+ # Path-based Delta target
2426
+ full_path = connection.get_path(write_config.path)
2427
+ if hasattr(self.engine, "spark"):
2428
+ try:
2429
+ from delta.tables import DeltaTable
2430
+
2431
+ return DeltaTable.isDeltaTable(self.engine.spark, full_path)
2432
+ except Exception:
2433
+ # Fallback: check if path exists
2434
+ try:
2435
+ self.engine.spark.read.format("delta").load(full_path).limit(0)
2436
+ return True
2437
+ except Exception:
2438
+ return False
2439
+ return True # Assume exists for non-Spark engines
2440
+
2441
+ return True # No table or path specified, assume exists
2442
+ except Exception:
2443
+ return False # On any error, assume doesn't exist (safer to write)
2444
+
2445
+ def _calculate_delta_diagnostics(
2446
+ self,
2447
+ delta_info: Dict[str, Any],
2448
+ connection: Any,
2449
+ write_config: Any,
2450
+ deep_diag: bool,
2451
+ diff_keys: Optional[List[str]],
2452
+ ) -> None:
2453
+ """Calculate Delta Lake diagnostics/diff."""
2454
+ ver = delta_info.get("version", 0)
2455
+ if isinstance(ver, int) and ver > 0:
2456
+ try:
2457
+ from odibi.diagnostics import get_delta_diff
2458
+
2459
+ full_path = connection.get_path(write_config.path) if write_config.path else None
2460
+
2461
+ if full_path:
2462
+ spark_session = getattr(self.engine, "spark", None)
2463
+ curr_ver = delta_info["version"]
2464
+ prev_ver = curr_ver - 1
2465
+
2466
+ if deep_diag:
2467
+ diff = get_delta_diff(
2468
+ table_path=full_path,
2469
+ version_a=prev_ver,
2470
+ version_b=curr_ver,
2471
+ spark=spark_session,
2472
+ deep=True,
2473
+ keys=diff_keys,
2474
+ )
2475
+ self._delta_write_info["data_diff"] = {
2476
+ "rows_change": diff.rows_change,
2477
+ "rows_added": diff.rows_added,
2478
+ "rows_removed": diff.rows_removed,
2479
+ "rows_updated": diff.rows_updated,
2480
+ "schema_added": diff.schema_added,
2481
+ "schema_removed": diff.schema_removed,
2482
+ "schema_previous": diff.schema_previous,
2483
+ "sample_added": diff.sample_added,
2484
+ "sample_removed": diff.sample_removed,
2485
+ "sample_updated": diff.sample_updated,
2486
+ }
2487
+ else:
2488
+ metrics = delta_info.get("operation_metrics", {})
2489
+ rows_inserted = int(
2490
+ metrics.get("numTargetRowsInserted", 0)
2491
+ or metrics.get("numOutputRows", 0)
2492
+ )
2493
+ rows_deleted = int(metrics.get("numTargetRowsDeleted", 0))
2494
+ net_change = rows_inserted - rows_deleted
2495
+ self._delta_write_info["data_diff"] = {
2496
+ "rows_change": net_change,
2497
+ "sample_added": None,
2498
+ "sample_removed": None,
2499
+ }
2500
+ except Exception as e:
2501
+ import logging
2502
+
2503
+ logger = logging.getLogger(__name__)
2504
+ logger.warning(f"Failed to calculate data diff: {e}")
2505
+
2506
+ def _collect_metadata(
2507
+ self,
2508
+ config: NodeConfig,
2509
+ df: Optional[Any],
2510
+ input_schema: Optional[Any] = None,
2511
+ input_sample: Optional[List[Dict[str, Any]]] = None,
2512
+ ) -> Dict[str, Any]:
2513
+ """Collect metadata."""
2514
+ import getpass
2515
+ import platform
2516
+ import socket
2517
+ import sys
2518
+
2519
+ try:
2520
+ import pandas as pd
2521
+
2522
+ pandas_version = getattr(pd, "__version__", None)
2523
+ except ImportError:
2524
+ pandas_version = None
2525
+
2526
+ try:
2527
+ import pyspark
2528
+
2529
+ pyspark_version = getattr(pyspark, "__version__", None)
2530
+ except ImportError:
2531
+ pyspark_version = None
2532
+
2533
+ sql_hash = None
2534
+ if self._executed_sql:
2535
+ normalized_sql = " ".join(self._executed_sql).lower().strip()
2536
+ sql_hash = hashlib.md5(normalized_sql.encode("utf-8")).hexdigest()
2537
+
2538
+ config_snapshot = (
2539
+ config.model_dump(mode="json") if hasattr(config, "model_dump") else config.model_dump()
2540
+ )
2541
+
2542
+ metadata = {
2543
+ "timestamp": datetime.now().isoformat(),
2544
+ "environment": {
2545
+ "user": getpass.getuser(),
2546
+ "host": socket.gethostname(),
2547
+ "platform": platform.platform(),
2548
+ "python": sys.version.split()[0],
2549
+ "pandas": pandas_version,
2550
+ "pyspark": pyspark_version,
2551
+ "odibi": __import__("odibi").__version__,
2552
+ },
2553
+ "steps": self._execution_steps.copy(),
2554
+ "executed_sql": self._executed_sql.copy(),
2555
+ "sql_hash": sql_hash,
2556
+ "transformation_stack": [
2557
+ step.function if getattr(step, "function", None) else str(step)
2558
+ for step in (config.transform.steps if config.transform else [])
2559
+ ],
2560
+ "validation_warnings": self._validation_warnings.copy(),
2561
+ "config_snapshot": config_snapshot,
2562
+ }
2563
+
2564
+ if self._delta_write_info and "version" in self._delta_write_info:
2565
+ if self._delta_write_info.get("streaming"):
2566
+ metadata["streaming_info"] = {
2567
+ "query_id": self._delta_write_info.get("query_id"),
2568
+ "query_name": self._delta_write_info.get("query_name"),
2569
+ "status": self._delta_write_info.get("status"),
2570
+ "target": self._delta_write_info.get("target"),
2571
+ "output_mode": self._delta_write_info.get("output_mode"),
2572
+ "checkpoint_location": self._delta_write_info.get("checkpoint_location"),
2573
+ }
2574
+ else:
2575
+ ts = self._delta_write_info.get("timestamp")
2576
+ metadata["delta_info"] = {
2577
+ "version": self._delta_write_info["version"],
2578
+ "timestamp": (
2579
+ ts.isoformat() if hasattr(ts, "isoformat") else str(ts) if ts else None
2580
+ ),
2581
+ "operation": self._delta_write_info.get("operation"),
2582
+ "operation_metrics": self._delta_write_info.get("operation_metrics", {}),
2583
+ "read_version": self._delta_write_info.get("read_version"),
2584
+ }
2585
+ if "data_diff" in self._delta_write_info:
2586
+ metadata["data_diff"] = self._delta_write_info["data_diff"]
2587
+
2588
+ if df is not None:
2589
+ # Reuse row count from write phase if available (avoids redundant count)
2590
+ cached_row_count = None
2591
+ rows_written = None
2592
+ if self._delta_write_info:
2593
+ cached_row_count = self._delta_write_info.get("_cached_row_count")
2594
+ rows_written = cached_row_count
2595
+ metadata["rows"] = (
2596
+ cached_row_count if cached_row_count is not None else self._count_rows(df)
2597
+ )
2598
+ # Track rows read vs rows written for story metrics
2599
+ metadata["rows_read"] = self._read_row_count
2600
+ metadata["rows_written"] = rows_written
2601
+ metadata["schema"] = self._get_schema(df)
2602
+ metadata["source_files"] = self.engine.get_source_files(df)
2603
+ # Skip null profiling if configured (expensive for large Spark DataFrames)
2604
+ skip_null_profiling = self.performance_config and getattr(
2605
+ self.performance_config, "skip_null_profiling", False
2606
+ )
2607
+ if skip_null_profiling:
2608
+ metadata["null_profile"] = {}
2609
+ else:
2610
+ try:
2611
+ metadata["null_profile"] = self.engine.profile_nulls(df)
2612
+ except Exception:
2613
+ metadata["null_profile"] = {}
2614
+
2615
+ if input_schema and metadata.get("schema"):
2616
+ output_schema = metadata["schema"]
2617
+ set_in = set(input_schema)
2618
+ set_out = set(output_schema)
2619
+ metadata["schema_in"] = input_schema
2620
+ metadata["columns_added"] = list(set_out - set_in)
2621
+ metadata["columns_removed"] = list(set_in - set_out)
2622
+ if input_sample:
2623
+ metadata["sample_data_in"] = input_sample
2624
+
2625
+ if df is not None and self.max_sample_rows > 0:
2626
+ metadata["sample_data"] = self._get_redacted_sample(df, config.sensitive, self.engine)
2627
+
2628
+ if "sample_data_in" in metadata:
2629
+ metadata["sample_data_in"] = self._redact_sample_list(
2630
+ metadata["sample_data_in"], config.sensitive
2631
+ )
2632
+
2633
+ # Create output record for cross-pipeline dependencies (batch written at end of pipeline)
2634
+ # Supports both explicit write blocks and merge/scd2 function outputs
2635
+ output_record = self._create_output_record(config, metadata.get("rows"))
2636
+ if output_record:
2637
+ metadata["_output_record"] = output_record
2638
+
2639
+ return metadata
2640
+
2641
+ def _get_redacted_sample(
2642
+ self, df: Any, sensitive_config: Any, engine: Any
2643
+ ) -> List[Dict[str, Any]]:
2644
+ """Get sample data with redaction."""
2645
+ if sensitive_config is True:
2646
+ return [{"message": "[REDACTED: Sensitive Data]"}]
2647
+ try:
2648
+ sample = engine.get_sample(df, n=self.max_sample_rows)
2649
+ return self._redact_sample_list(sample, sensitive_config)
2650
+ except Exception:
2651
+ return []
2652
+
2653
+ def _redact_sample_list(
2654
+ self, sample: List[Dict[str, Any]], sensitive_config: Any
2655
+ ) -> List[Dict[str, Any]]:
2656
+ """Redact list of rows."""
2657
+ if not sample:
2658
+ return []
2659
+ if sensitive_config is True:
2660
+ return [{"message": "[REDACTED: Sensitive Data]"}]
2661
+ if isinstance(sensitive_config, list):
2662
+ for row in sample:
2663
+ for col in sensitive_config:
2664
+ if col in row:
2665
+ row[col] = "[REDACTED]"
2666
+ return sample
2667
+
2668
+ def _create_output_record(
2669
+ self, config: NodeConfig, row_count: Optional[int]
2670
+ ) -> Optional[Dict[str, Any]]:
2671
+ """
2672
+ Create an output record for cross-pipeline dependency tracking.
2673
+
2674
+ This record is collected during execution and batch-written to meta_outputs
2675
+ at the end of pipeline execution for performance.
2676
+
2677
+ Extracts output info from:
2678
+ 1. Explicit write block (preferred)
2679
+ 2. merge/scd2 function params in transform steps (fallback)
2680
+
2681
+ Args:
2682
+ config: Node configuration
2683
+ row_count: Number of rows written
2684
+
2685
+ Returns:
2686
+ Dict with output metadata or None if no output location found
2687
+ """
2688
+ if config.write:
2689
+ write_cfg = config.write
2690
+ output_type = (
2691
+ "managed_table" if write_cfg.table and not write_cfg.path else "external_table"
2692
+ )
2693
+ return {
2694
+ "pipeline_name": self.pipeline_name,
2695
+ "node_name": config.name,
2696
+ "output_type": output_type,
2697
+ "connection_name": write_cfg.connection,
2698
+ "path": write_cfg.path,
2699
+ "format": write_cfg.format,
2700
+ "table_name": write_cfg.register_table or write_cfg.table,
2701
+ "last_run": datetime.now(),
2702
+ "row_count": row_count,
2703
+ }
2704
+
2705
+ output_info = self._extract_output_from_transform_steps(config)
2706
+ if output_info:
2707
+ return {
2708
+ "pipeline_name": self.pipeline_name,
2709
+ "node_name": config.name,
2710
+ "output_type": output_info.get("output_type", "external_table"),
2711
+ "connection_name": output_info.get("connection"),
2712
+ "path": output_info.get("path"),
2713
+ "format": output_info.get("format", "delta"),
2714
+ "table_name": output_info.get("register_table"),
2715
+ "last_run": datetime.now(),
2716
+ "row_count": row_count,
2717
+ }
2718
+
2719
+ return None
2720
+
2721
+ def _extract_output_from_transform_steps(self, config: NodeConfig) -> Optional[Dict[str, Any]]:
2722
+ """
2723
+ Extract output location from merge/scd2 used as transformer or in transform steps.
2724
+
2725
+ These functions write data internally but don't use a write block,
2726
+ so we need to extract their output info for cross-pipeline references.
2727
+
2728
+ Checks in order:
2729
+ 1. Transform steps (last merge/scd2 in chain)
2730
+ 2. Top-level transformer with params
2731
+
2732
+ Args:
2733
+ config: Node configuration
2734
+
2735
+ Returns:
2736
+ Dict with connection, path, format, register_table or None
2737
+ """
2738
+ output_functions = {"merge", "scd2"}
2739
+
2740
+ if config.transform and config.transform.steps:
2741
+ for step in reversed(config.transform.steps):
2742
+ if isinstance(step, str):
2743
+ continue
2744
+
2745
+ if hasattr(step, "function") and step.function in output_functions:
2746
+ params = step.params or {}
2747
+ connection = params.get("connection")
2748
+ path = params.get("path") or params.get("target")
2749
+ register_table = params.get("register_table")
2750
+
2751
+ if connection and path:
2752
+ return {
2753
+ "connection": connection,
2754
+ "path": path,
2755
+ "format": "delta",
2756
+ "register_table": register_table,
2757
+ "output_type": "managed_table" if register_table else "external_table",
2758
+ }
2759
+
2760
+ if config.transformer in output_functions and config.params:
2761
+ params = config.params
2762
+ connection = params.get("connection")
2763
+ path = params.get("path") or params.get("target")
2764
+ register_table = params.get("register_table")
2765
+
2766
+ if connection and path:
2767
+ return {
2768
+ "connection": connection,
2769
+ "path": path,
2770
+ "format": "delta",
2771
+ "register_table": register_table,
2772
+ "output_type": "managed_table" if register_table else "external_table",
2773
+ }
2774
+
2775
+ return None
2776
+
2777
+ def _get_schema(self, df: Any) -> Any:
2778
+ return self.engine.get_schema(df)
2779
+
2780
+ def _get_shape(self, df: Any) -> tuple:
2781
+ return self.engine.get_shape(df)
2782
+
2783
+ def _count_rows(self, df: Any) -> Optional[int]:
2784
+ if df is not None and getattr(df, "isStreaming", False):
2785
+ return None
2786
+ return self.engine.count_rows(df)
2787
+
2788
+ def _get_column_max(self, df: Any, column: str, fallback_column: Optional[str] = None) -> Any:
2789
+ """Get maximum value of a column, with optional fallback for NULL values."""
2790
+ if df is not None and getattr(df, "isStreaming", False):
2791
+ return None
2792
+ if hasattr(self.engine, "spark"):
2793
+ from pyspark.sql import functions as F
2794
+
2795
+ try:
2796
+ if fallback_column:
2797
+ coalesce_col = F.coalesce(F.col(column), F.col(fallback_column))
2798
+ row = df.select(F.max(coalesce_col)).first()
2799
+ else:
2800
+ row = df.select(F.max(column)).first()
2801
+ return row[0] if row else None
2802
+ except Exception:
2803
+ return None
2804
+ else:
2805
+ try:
2806
+ import numpy as np
2807
+ import pandas as pd
2808
+
2809
+ if fallback_column and fallback_column in df.columns:
2810
+ combined = df[column].combine_first(df[fallback_column])
2811
+ val = combined.max()
2812
+ elif column in df.columns:
2813
+ val = df[column].max()
2814
+ else:
2815
+ return None
2816
+
2817
+ if pd.isna(val):
2818
+ return None
2819
+ if isinstance(val, (np.integer, np.floating)):
2820
+ return val.item()
2821
+ if isinstance(val, np.datetime64):
2822
+ return str(val)
2823
+ return val
2824
+ except Exception:
2825
+ return None
2826
+
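A pandas illustration of the coalesce-then-max lookup in _get_column_max; column names and values are illustrative.

import pandas as pd

df = pd.DataFrame({
    "updated_at": [None, "2024-01-03"],
    "created_at": ["2024-01-01", "2024-01-02"],
})
# the fallback column fills nulls in the primary column before taking the max
watermark = df["updated_at"].combine_first(df["created_at"]).max()  # "2024-01-03"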
2827
+ def _generate_suggestions(self, error: Exception, config: NodeConfig) -> List[str]:
2828
+ """Generate suggestions."""
2829
+ suggestions = []
2830
+ error_str = str(error).lower()
2831
+
2832
+ if "column" in error_str and "not found" in error_str:
2833
+ suggestions.append("Check that previous nodes output the expected columns")
2834
+ suggestions.append(f"Use 'odibi run-node {config.name} --show-schema' to debug")
2835
+
2836
+ if "validation failed" in error_str:
2837
+ suggestions.append("Check your validation rules against the input data")
2838
+ suggestions.append("Inspect the sample data in the generated story")
2839
+
2840
+ if "keyerror" in error.__class__.__name__.lower():
2841
+ suggestions.append("Verify that all referenced DataFrames are registered in context")
2842
+ suggestions.append("Check node dependencies in 'depends_on' list")
2843
+
2844
+ if "function" in error_str and "not" in error_str:
2845
+ suggestions.append("Ensure the transform function is decorated with @transform")
2846
+ suggestions.append("Import the module containing the transform function")
2847
+
2848
+ if "connection" in error_str:
2849
+ suggestions.append("Verify connection configuration in project.yaml")
2850
+ suggestions.append("Check network connectivity and credentials")
2851
+
2852
+ return suggestions
2853
+
2854
+ def _clean_spark_traceback(self, raw_traceback: str) -> str:
2855
+ """Clean Spark/Py4J traceback to show only relevant Python info.
2856
+
2857
+ Removes Java stack traces and Py4J noise to make errors more readable.
2858
+
2859
+ Args:
2860
+ raw_traceback: Full traceback string
2861
+
2862
+ Returns:
2863
+ Cleaned traceback with Java/Py4J details removed
2864
+ """
2865
+ import re
2866
+
2867
+ lines = raw_traceback.split("\n")
2868
+ cleaned_lines = []
2869
+ skip_until_python = False
2870
+
2871
+ for line in lines:
2872
+ # Skip Java stack trace lines
2873
+ if re.match(r"\s+at (org\.|java\.|scala\.|py4j\.)", line):
2874
+ skip_until_python = True
2875
+ continue
2876
+
2877
+ # Skip Py4J internal lines
2878
+ if "py4j.protocol" in line or "Py4JJavaError" in line:
2879
+ continue
2880
+
2881
+ # Skip lines that are just "..."
2882
+ if line.strip() == "...":
2883
+ continue
2884
+
2885
+ # If we hit a Python traceback line, resume capturing
2886
+ if line.strip().startswith("File ") or line.strip().startswith("Traceback"):
2887
+ skip_until_python = False
2888
+
2889
+ if not skip_until_python:
2890
+ # Clean up common Spark error prefixes
2891
+ cleaned_line = re.sub(r"org\.apache\.spark\.[a-zA-Z.]+Exception: ", "", line)
2892
+ cleaned_lines.append(cleaned_line)
2893
+
2894
+ # Remove duplicate empty lines
2895
+ result_lines = []
2896
+ prev_empty = False
2897
+ for line in cleaned_lines:
2898
+ is_empty = not line.strip()
2899
+ if is_empty and prev_empty:
2900
+ continue
2901
+ result_lines.append(line)
2902
+ prev_empty = is_empty
2903
+
2904
+ return "\n".join(result_lines).strip()
2905
+
2906
+ def _calculate_pii(self, config: NodeConfig) -> Dict[str, bool]:
2907
+ """Calculate effective PII metadata (Inheritance + Local - Declassify)."""
2908
+ # 1. Collect Upstream PII
2909
+ inherited_pii = {}
2910
+ if config.depends_on:
2911
+ for dep in config.depends_on:
2912
+ meta = self.context.get_metadata(dep)
2913
+ if meta and "pii_columns" in meta:
2914
+ inherited_pii.update(meta["pii_columns"])
2915
+
2916
+ # 2. Merge with Local PII
2917
+ local_pii = {name: True for name, meta in config.columns.items() if meta.pii}
2918
+ merged_pii = {**inherited_pii, **local_pii}
2919
+
2920
+ # 3. Apply Declassification
2921
+ if config.privacy and config.privacy.declassify:
2922
+ for col in config.privacy.declassify:
2923
+ merged_pii.pop(col, None)
2924
+
2925
+ return merged_pii
2926
+
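A compact illustration of the PII merge in _calculate_pii: upstream flags are inherited, local column metadata is layered on top, and declassified columns are removed. Values are illustrative.

inherited_pii = {"email": True, "ssn": True}   # from depends_on metadata
local_pii = {"phone": True}                    # columns marked pii: true on this node
declassify = ["ssn"]                           # privacy.declassify
effective = {**inherited_pii, **local_pii}
for col in declassify:
    effective.pop(col, None)
# effective == {"email": True, "phone": True}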
2927
+
2928
+ class Node:
2929
+ """Base node execution orchestrator."""
2930
+
2931
+ def __init__(
2932
+ self,
2933
+ config: NodeConfig,
2934
+ context: Context,
2935
+ engine: Any,
2936
+ connections: Dict[str, Any],
2937
+ config_file: Optional[str] = None,
2938
+ max_sample_rows: int = 10,
2939
+ dry_run: bool = False,
2940
+ retry_config: Optional[RetryConfig] = None,
2941
+ catalog_manager: Optional[Any] = None,
2942
+ performance_config: Optional[Any] = None,
2943
+ pipeline_name: Optional[str] = None,
2944
+ batch_write_buffers: Optional[Dict[str, List]] = None,
2945
+ ):
2946
+ """Initialize node."""
2947
+ self.config = config
2948
+ self.context = context
2949
+ self.engine = engine
2950
+ self.connections = connections
2951
+ self.config_file = config_file
2952
+ self.max_sample_rows = max_sample_rows
2953
+ self.dry_run = dry_run
2954
+ self.retry_config = retry_config or RetryConfig(enabled=False)
2955
+ self.catalog_manager = catalog_manager
2956
+ self.performance_config = performance_config
2957
+ self.pipeline_name = pipeline_name
2958
+ self.batch_write_buffers = batch_write_buffers
2959
+
2960
+ self._cached_result: Optional[Any] = None
2961
+
2962
+ # Initialize State Manager
2963
+ spark_session = None
2964
+ if hasattr(self.engine, "spark"):
2965
+ spark_session = self.engine.spark
2966
+
2967
+ if self.catalog_manager and self.catalog_manager.tables:
2968
+ storage_opts = self.catalog_manager._get_storage_options()
2969
+ environment = getattr(self.catalog_manager.config, "environment", None)
2970
+ backend = CatalogStateBackend(
2971
+ spark_session=spark_session,
2972
+ meta_state_path=self.catalog_manager.tables.get("meta_state"),
2973
+ meta_runs_path=self.catalog_manager.tables.get("meta_runs"),
2974
+ storage_options=storage_opts if storage_opts else None,
2975
+ environment=environment,
2976
+ )
2977
+ else:
2978
+ # Fallback to default local paths (Unified Catalog default)
2979
+ backend = CatalogStateBackend(
2980
+ spark_session=spark_session,
2981
+ meta_state_path=".odibi/system/meta_state",
2982
+ meta_runs_path=".odibi/system/meta_runs",
2983
+ )
2984
+
2985
+ self.state_manager = StateManager(backend=backend)
2986
+
2987
+ # Initialize Executor
2988
+ self.executor = NodeExecutor(
2989
+ context=context,
2990
+ engine=engine,
2991
+ connections=connections,
2992
+ catalog_manager=catalog_manager,
2993
+ config_file=config_file,
2994
+ max_sample_rows=max_sample_rows,
2995
+ performance_config=performance_config,
2996
+ state_manager=self.state_manager,
2997
+ pipeline_name=pipeline_name,
2998
+ batch_write_buffers=batch_write_buffers,
2999
+ )
3000
+
3001
+ def restore(self) -> bool:
3002
+ """Restore node state from previous execution (if persisted)."""
3003
+ ctx = create_logging_context(
3004
+ node_id=self.config.name,
3005
+ engine=self.engine.__class__.__name__,
3006
+ )
3007
+
3008
+ if not self.config.write:
3009
+ ctx.debug("No write config, skipping restore")
3010
+ return False
3011
+
3012
+ write_config = self.config.write
3013
+ connection = self.connections.get(write_config.connection)
3014
+
3015
+ if connection is None:
3016
+ ctx.debug(f"Connection '{write_config.connection}' not found, skipping restore")
3017
+ return False
3018
+
3019
+ try:
3020
+ ctx.debug(
3021
+ "Attempting to restore node from persisted state",
3022
+ table=write_config.table,
3023
+ path=write_config.path,
3024
+ )
3025
+
3026
+ df = self.engine.read(
3027
+ connection=connection,
3028
+ format=write_config.format,
3029
+ table=write_config.table,
3030
+ path=write_config.path,
3031
+ options={},
3032
+ )
3033
+
3034
+ if df is not None:
3035
+ row_count = self.engine.count_rows(df)
3036
+ self.context.register(self.config.name, df)
3037
+ if self.config.cache:
3038
+ self._cached_result = df
3039
+ ctx.info(
3040
+ "Node state restored successfully",
3041
+ rows=row_count,
3042
+ table=write_config.table,
3043
+ path=write_config.path,
3044
+ )
3045
+ return True
3046
+
3047
+ except Exception as e:
3048
+ ctx.warning(
3049
+ f"Failed to restore node state: {e}",
3050
+ error_type=type(e).__name__,
3051
+ )
3052
+ return False
3053
+
3054
+ return False
3055
+
3056
+ def get_version_hash(self) -> str:
3057
+ """Calculate a deterministic hash of the node's configuration."""
3058
+ import json
3059
+
3060
+ # Serialize the config deterministically and exclude fields that do not
3061
+ # affect node logic (description, tags, log_level), so cosmetic edits do
3062
+ # not change the version hash. Tags may influence scheduling, but they do
3063
+ # not alter what the node computes.
3064
+
3065
+ # model_dump(mode="json") gives a stable, JSON-compatible representation;
3066
+ # fall back to pydantic v1's .dict() when model_dump is unavailable.
3067
+
3068
+ dump = (
3069
+ self.config.model_dump(mode="json", exclude={"description", "tags", "log_level"})
3070
+ if hasattr(self.config, "model_dump")
3071
+ else self.config.dict(exclude={"description", "tags", "log_level"})
3072
+ )
3073
+
3074
+ # Sort keys to ensure determinism
3075
+ dump_str = json.dumps(dump, sort_keys=True)
3076
+ return hashlib.md5(dump_str.encode("utf-8")).hexdigest()
3077
+
3078
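The version hash is an MD5 digest of a key-sorted JSON dump of the functional config fields, so two configurations that differ only in description, tags, or log_level hash identically. The same recipe applied to plain dictionaries (the sample configs below are made up for illustration):

import hashlib
import json

cfg_a = {"name": "orders", "read": {"table": "raw.orders"}, "description": "v1"}
cfg_b = {"name": "orders", "read": {"table": "raw.orders"}, "description": "v2"}

def config_hash(cfg: dict, exclude=("description", "tags", "log_level")) -> str:
    # Drop non-functional fields, then hash a key-sorted JSON dump.
    functional = {k: v for k, v in cfg.items() if k not in exclude}
    return hashlib.md5(json.dumps(functional, sort_keys=True).encode("utf-8")).hexdigest()

assert config_hash(cfg_a) == config_hash(cfg_b)  # only an excluded field differs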
+    def execute(self) -> NodeResult:
+        """Execute the node with telemetry and retry logic."""
+        import json
+        import uuid
+
+        from odibi.utils.telemetry import (
+            Status,
+            StatusCode,
+            node_duration,
+            nodes_executed,
+            rows_processed,
+            tracer,
+        )
+
+        ctx = create_logging_context(
+            node_id=self.config.name,
+            engine=self.engine.__class__.__name__,
+        )
+
+        node_log_level = self.config.log_level.value if self.config.log_level else None
+
+        result_for_log = NodeResult(node_name=self.config.name, success=False, duration=0.0)
+        start_time = time.time()
+
+        ctx.info(
+            f"Starting node execution: {self.config.name}",
+            engine=self.engine.__class__.__name__,
+            dry_run=self.dry_run,
+            retry_enabled=self.retry_config.enabled if self.retry_config else False,
+        )
+
+        with (
+            _override_log_level(node_log_level),
+            tracer.start_as_current_span("node_execution") as span,
+        ):
+            span.set_attribute("node.name", self.config.name)
+            span.set_attribute("node.engine", self.engine.__class__.__name__)
+
+            try:
+                try:
+                    result = self._execute_with_retries()
+                    result_for_log = result
+                except Exception as e:
+                    span.record_exception(e)
+                    span.set_status(Status(StatusCode.ERROR))
+                    nodes_executed.add(1, {"status": "failure", "node": self.config.name})
+
+                    result_for_log.duration = time.time() - start_time
+                    result_for_log.error = e
+                    result_for_log.metadata = {"error": str(e), "catastrophic": True}
+
+                    ctx.error(
+                        "Catastrophic failure in node execution",
+                        error_type=type(e).__name__,
+                        error_message=str(e),
+                        elapsed_ms=round(result_for_log.duration * 1000, 2),
+                    )
+
+                    raise e
+
+                if result.success:
+                    span.set_status(Status(StatusCode.OK))
+                    nodes_executed.add(1, {"status": "success", "node": self.config.name})
+                    ctx.info(
+                        "Node execution succeeded",
+                        rows_processed=result.rows_processed,
+                        elapsed_ms=round(result.duration * 1000, 2),
+                        attempts=result.metadata.get("attempts", 1),
+                    )
+                else:
+                    span.set_status(Status(StatusCode.ERROR))
+                    if result.error:
+                        span.record_exception(result.error)
+                    nodes_executed.add(1, {"status": "failure", "node": self.config.name})
+                    ctx.error(
+                        "Node execution failed",
+                        error_type=type(result.error).__name__ if result.error else "Unknown",
+                        elapsed_ms=round(result.duration * 1000, 2),
+                    )
+
+                if result.rows_processed is not None:
+                    rows_processed.add(result.rows_processed, {"node": self.config.name})
+
+                node_duration.record(result.duration, {"node": self.config.name})
+
+                result.metadata["version_hash"] = self.get_version_hash()
+
+                return result
+
+            finally:
+
+                def safe_default(o):
+                    return str(o)
+
+                try:
+                    metrics_json = json.dumps(result_for_log.metadata, default=safe_default)
+                except Exception:
+                    metrics_json = "{}"
+
+                run_record = {
+                    "run_id": str(uuid.uuid4()),
+                    "pipeline_name": self.pipeline_name
+                    or (self.config.tags[0] if self.config.tags else "unknown"),
+                    "node_name": self.config.name,
+                    "status": "SUCCESS" if result_for_log.success else "FAILURE",
+                    "rows_processed": result_for_log.rows_processed or 0,
+                    "duration_ms": int(result_for_log.duration * 1000),
+                    "metrics_json": metrics_json,
+                }
+                result_for_log.metadata["_run_record"] = run_record
+
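Note that the finally block always attaches a run record under metadata["_run_record"], whether the node succeeded, failed, or raised. A sketch of how a caller might inspect it, assuming an already-constructed Node instance named `node`:

# Illustrative only: `node` is assumed to be a fully constructed Node.
result = node.execute()
record = result.metadata.get("_run_record", {})
print(record.get("status"), record.get("duration_ms"), record.get("rows_processed"))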
+    def _execute_with_retries(self) -> NodeResult:
+        """Execute with internal retry logic."""
+        ctx = create_logging_context(
+            node_id=self.config.name,
+            engine=self.engine.__class__.__name__,
+        )
+
+        start_time = time.time()
+        attempts = 0
+        max_attempts = self.retry_config.max_attempts if self.retry_config.enabled else 1
+        last_error = None
+        retry_history: List[Dict[str, Any]] = []
+
+        if max_attempts > 1:
+            ctx.debug(
+                "Retry logic enabled",
+                max_attempts=max_attempts,
+                backoff=self.retry_config.backoff,
+            )
+
+        while attempts < max_attempts:
+            attempts += 1
+            attempt_start = time.time()
+
+            if attempts > 1:
+                ctx.info(
+                    f"Retry attempt {attempts}/{max_attempts}",
+                    previous_error=str(last_error) if last_error else None,
+                )
+
+            try:
+                hwm_state = None
+                if (
+                    self.config.read
+                    and self.config.read.incremental
+                    and self.config.read.incremental.mode == IncrementalMode.STATEFUL
+                ):
+                    key = self.config.read.incremental.state_key or f"{self.config.name}_hwm"
+                    val = self.state_manager.get_hwm(key)
+                    hwm_state = (key, val)
+
+                # Suppress error logs on non-final attempts
+                is_last_attempt = attempts >= max_attempts
+                result = self.executor.execute(
+                    self.config,
+                    dry_run=self.dry_run,
+                    hwm_state=hwm_state,
+                    suppress_error_log=not is_last_attempt,
+                    current_pipeline=self.pipeline_name,
+                )
+
+                attempt_duration = time.time() - attempt_start
+
+                if result.success:
+                    retry_history.append(
+                        {
+                            "attempt": attempts,
+                            "success": True,
+                            "duration": round(attempt_duration, 3),
+                        }
+                    )
+                    result.metadata["attempts"] = attempts
+                    result.metadata["retry_history"] = retry_history
+                    result.duration = time.time() - start_time
+
+                    if self.config.cache and self.context.get(self.config.name) is not None:
+                        self._cached_result = self.context.get(self.config.name)
+
+                    if result.metadata.get("hwm_pending"):
+                        hwm_update = result.metadata.get("hwm_update")
+                        if hwm_update:
+                            try:
+                                self.state_manager.set_hwm(hwm_update["key"], hwm_update["value"])
+                                ctx.debug(
+                                    "HWM state updated",
+                                    hwm_key=hwm_update["key"],
+                                    hwm_value=str(hwm_update["value"]),
+                                )
+                            except Exception as e:
+                                result.metadata["hwm_error"] = str(e)
+                                ctx.warning(f"Failed to update HWM state: {e}")
+
+                    return result
+
+                last_error = result.error
+                retry_history.append(
+                    {
+                        "attempt": attempts,
+                        "success": False,
+                        "error": str(last_error) if last_error else "Unknown error",
+                        "error_type": type(last_error).__name__ if last_error else "Unknown",
+                        "error_traceback": result.metadata.get("error_traceback_cleaned")
+                        or result.metadata.get("error_traceback"),
+                        "duration": round(attempt_duration, 3),
+                    }
+                )
+
+            except Exception as e:
+                attempt_duration = time.time() - attempt_start
+                last_error = e
+                retry_history.append(
+                    {
+                        "attempt": attempts,
+                        "success": False,
+                        "error": str(e),
+                        "error_type": type(e).__name__,
+                        "error_traceback": traceback.format_exc(),
+                        "duration": round(attempt_duration, 3),
+                    }
+                )
+
+                if attempts < max_attempts:
+                    sleep_time = 1
+                    if self.retry_config.backoff == "exponential":
+                        sleep_time = 2 ** (attempts - 1)
+                    elif self.retry_config.backoff == "linear":
+                        sleep_time = attempts
+                    elif self.retry_config.backoff == "constant":
+                        sleep_time = 1
+
+                    ctx.warning(
+                        f"Attempt {attempts} failed, retrying in {sleep_time}s",
+                        error_type=type(e).__name__,
+                        error_message=str(e),
+                        backoff_seconds=sleep_time,
+                    )
+                    time.sleep(sleep_time)
+
+        duration = time.time() - start_time
+
+        ctx.error(
+            "All retry attempts exhausted",
+            attempts=attempts,
+            max_attempts=max_attempts,
+            elapsed_ms=round(duration * 1000, 2),
+        )
+
+        if not isinstance(last_error, NodeExecutionError) and last_error:
+            error = NodeExecutionError(
+                message=str(last_error),
+                context=ExecutionContext(node_name=self.config.name, config_file=self.config_file),
+                original_error=last_error,
+            )
+        else:
+            error = last_error
+
+        return NodeResult(
+            node_name=self.config.name,
+            success=False,
+            duration=duration,
+            error=error,
+            metadata={"attempts": attempts, "retry_history": retry_history},
+        )
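The retry loop sleeps only between failed attempts, and the delay depends on retry_config.backoff: exponential doubles from 1s, linear grows by 1s per attempt, and constant (or any unrecognized value) stays at 1s. A small standalone sketch of that schedule (the helper name backoff_schedule is illustrative, not part of the package):

def backoff_schedule(max_attempts: int, backoff: str) -> list[int]:
    # Delays slept before attempts 2..max_attempts, mirroring the loop above.
    delays = []
    for attempt in range(1, max_attempts):
        if backoff == "exponential":
            delays.append(2 ** (attempt - 1))
        elif backoff == "linear":
            delays.append(attempt)
        else:  # "constant" or anything else falls back to 1s
            delays.append(1)
    return delays

print(backoff_schedule(4, "exponential"))  # [1, 2, 4]
print(backoff_schedule(4, "linear"))       # [1, 2, 3]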