odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/engine/spark_engine.py
@@ -0,0 +1,2362 @@
1
+ """Spark execution engine (Phase 2B: Delta Lake support).
2
+
3
+ Status: Phase 2B implemented - Delta Lake read/write, VACUUM, history, restore
4
+ """
5
+
6
+ import re
7
+ import time
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+ from odibi.enums import EngineType
11
+ from odibi.exceptions import TransformError
12
+ from odibi.utils.logging_context import get_logging_context
13
+
14
+ from .base import Engine
15
+
16
+
17
+ def _extract_spark_error_message(error: Exception) -> str:
18
+ """Extract a clean, user-friendly error message from Spark/Py4J exceptions.
19
+
20
+ Removes Java stack traces and Py4J noise, keeping only the useful error info.
21
+
22
+ Args:
23
+ error: The exception to clean
24
+
25
+ Returns:
26
+ Clean error message without Java stack traces
27
+ """
28
+ error_str = str(error)
29
+
30
+ # For AnalysisException, extract the error class and message up to SQLSTATE or line info
31
+ # Format: [ERROR_CLASS] message. Did you mean...? SQLSTATE: xxx; line X pos Y;\n'Plan...
32
+ match = re.match(
33
+ r"(\[[\w._]+\])\s*(.+?)(?:\s*SQLSTATE|\s*;\s*line|\n'|\n\tat|$)",
34
+ error_str,
35
+ re.DOTALL,
36
+ )
37
+ if match:
38
+ error_class = match.group(1)
39
+ message = match.group(2).strip().rstrip(".")
40
+ return f"{error_class} {message}"
41
+
42
+ # For other Spark errors, try to extract the first meaningful line
43
+ lines = error_str.split("\n")
44
+ for line in lines:
45
+ line = line.strip()
46
+ # Skip Java stack trace lines
47
+ if re.match(r"at (org\.|java\.|scala\.|py4j\.)", line):
48
+ continue
49
+ # Skip empty or noise lines
50
+ if not line or line.startswith("Py4JJavaError") or line == ":":
51
+ continue
52
+ # Return first meaningful line
53
+ if len(line) > 10:
54
+ # Truncate very long messages
55
+ if len(line) > 200:
56
+ return line[:200] + "..."
57
+ return line
58
+
59
+ # Fallback: return first 200 chars
60
+ return error_str[:200] + "..." if len(error_str) > 200 else error_str
61
+
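Editor's illustration of the regex branch above (the error text is a representative example, not taken from this diff):

# Given a typical AnalysisException string such as
#   "[TABLE_OR_VIEW_NOT_FOUND] The table or view `orders` cannot be found. SQLSTATE: 42P01; line 1 pos 14;\n'Project ..."
# the first branch matches and returns
#   "[TABLE_OR_VIEW_NOT_FOUND] The table or view `orders` cannot be found"
# i.e. the bracketed error class plus the message, with the SQLSTATE, plan text and
# Java/Py4J stack-trace lines stripped. Other errors fall through to the line scan
# and return the first meaningful line, truncated to 200 characters.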
62
+
63
+ class SparkEngine(Engine):
64
+ """Spark execution engine with PySpark backend.
65
+
66
+ Phase 2A: Basic read/write + ADLS multi-account support
67
+ Phase 2B: Delta Lake support
68
+ """
69
+
70
+ name = "spark"
71
+ engine_type = EngineType.SPARK
72
+
73
+ def __init__(
74
+ self,
75
+ connections: Optional[Dict[str, Any]] = None,
76
+ spark_session: Any = None,
77
+ config: Optional[Dict[str, Any]] = None,
78
+ ):
79
+ """Initialize Spark engine with import guard.
80
+
81
+ Args:
82
+ connections: Dictionary of connection objects (for multi-account config)
83
+ spark_session: Existing SparkSession (optional, creates new if None)
84
+ config: Engine configuration (optional)
85
+
86
+ Raises:
87
+ ImportError: If pyspark not installed
88
+ """
89
+ ctx = get_logging_context().with_context(engine="spark")
90
+ ctx.debug("Initializing SparkEngine", connections_count=len(connections or {}))
91
+
92
+ try:
93
+ from pyspark.sql import SparkSession
94
+ except ImportError as e:
95
+ ctx.error(
96
+ "PySpark not installed",
97
+ error_type="ImportError",
98
+ suggestion="pip install odibi[spark]",
99
+ )
100
+ raise ImportError(
101
+ "Spark support requires 'pip install odibi[spark]'. "
102
+ "See docs/setup_databricks.md for setup instructions."
103
+ ) from e
104
+
105
+ start_time = time.time()
106
+
107
+ # Configure Delta Lake support
108
+ try:
109
+ from delta import configure_spark_with_delta_pip
110
+
111
+ builder = SparkSession.builder.appName("odibi").config(
112
+ "spark.sql.sources.partitionOverwriteMode", "dynamic"
113
+ )
114
+
115
+ # Performance Optimizations
116
+ builder = builder.config("spark.sql.execution.arrow.pyspark.enabled", "true")
117
+ builder = builder.config("spark.sql.adaptive.enabled", "true")
118
+
119
+ # Reduce Verbosity
120
+ builder = builder.config(
121
+ "spark.driver.extraJavaOptions", "-Dlog4j.rootCategory=ERROR, console"
122
+ )
123
+ builder = builder.config(
124
+ "spark.executor.extraJavaOptions", "-Dlog4j.rootCategory=ERROR, console"
125
+ )
126
+
127
+ self.spark = spark_session or configure_spark_with_delta_pip(builder).getOrCreate()
128
+ self.spark.sparkContext.setLogLevel("ERROR")
129
+
130
+ ctx.debug("Delta Lake support enabled")
131
+
132
+ except ImportError:
133
+ ctx.debug("Delta Lake not available, using standard Spark")
134
+ builder = SparkSession.builder.appName("odibi").config(
135
+ "spark.sql.sources.partitionOverwriteMode", "dynamic"
136
+ )
137
+
138
+ # Performance Optimizations
139
+ builder = builder.config("spark.sql.execution.arrow.pyspark.enabled", "true")
140
+ builder = builder.config("spark.sql.adaptive.enabled", "true")
141
+
142
+ # Reduce Verbosity
143
+ builder = builder.config(
144
+ "spark.driver.extraJavaOptions", "-Dlog4j.rootCategory=ERROR, console"
145
+ )
146
+
147
+ self.spark = spark_session or builder.getOrCreate()
148
+ self.spark.sparkContext.setLogLevel("ERROR")
149
+
150
+ self.config = config or {}
151
+ self.connections = connections or {}
152
+
153
+ # Configure all ADLS connections upfront
154
+ self._configure_all_connections()
155
+
156
+ # Apply user-defined Spark configs from performance settings
157
+ self._apply_spark_config()
158
+
159
+ elapsed = (time.time() - start_time) * 1000
160
+ ctx.info(
161
+ "SparkEngine initialized",
162
+ elapsed_ms=round(elapsed, 2),
163
+ app_name=self.spark.sparkContext.appName,
164
+ spark_version=self.spark.version,
165
+ connections_configured=len(self.connections),
166
+ using_existing_session=spark_session is not None,
167
+ )
168
+
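A minimal construction sketch (the import path, connection object, and config keys are assumptions, not taken from this diff):

from odibi.engine.spark_engine import SparkEngine  # assumed import path

engine = SparkEngine(
    connections={"lake": adls_connection},  # hypothetical: any object exposing configure_spark(spark)
    spark_session=spark,                    # reuse an existing session (e.g. on Databricks)
    config={"performance": {"spark_config": {"spark.sql.shuffle.partitions": "64"}}},
)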
169
+ def _configure_all_connections(self) -> None:
170
+ """Configure Spark with all ADLS connection credentials.
171
+
172
+ This sets all storage account keys upfront so Spark can access
173
+ multiple accounts. Keys are scoped by account name, so no conflicts.
174
+ """
175
+ ctx = get_logging_context().with_context(engine="spark")
176
+
177
+ for conn_name, connection in self.connections.items():
178
+ if hasattr(connection, "configure_spark"):
179
+ ctx.log_connection(
180
+ connection_type=type(connection).__name__,
181
+ connection_name=conn_name,
182
+ action="configure_spark",
183
+ )
184
+ try:
185
+ connection.configure_spark(self.spark)
186
+ ctx.debug(f"Configured ADLS connection: {conn_name}")
187
+ except Exception as e:
188
+ ctx.error(
189
+ f"Failed to configure ADLS connection: {conn_name}",
190
+ error_type=type(e).__name__,
191
+ error_message=str(e),
192
+ )
193
+ raise
194
+
195
+ def _apply_spark_config(self) -> None:
196
+ """Apply user-defined Spark configurations from performance settings.
197
+
198
+ Applies configs via spark.conf.set() for runtime-settable options.
199
+ For existing sessions (e.g., Databricks), only modifiable configs take effect.
200
+ """
201
+ ctx = get_logging_context().with_context(engine="spark")
202
+
203
+ performance = self.config.get("performance", {})
204
+ spark_config = performance.get("spark_config", {})
205
+
206
+ if not spark_config:
207
+ return
208
+
209
+ ctx.debug("Applying Spark configuration", config_count=len(spark_config))
210
+
211
+ for key, value in spark_config.items():
212
+ try:
213
+ self.spark.conf.set(key, value)
214
+ ctx.debug(
215
+ f"Applied Spark config: {key}={value}", config_key=key, config_value=value
216
+ )
217
+ except Exception as e:
218
+ ctx.warning(
219
+ f"Failed to set Spark config '{key}'",
220
+ config_key=key,
221
+ error_message=str(e),
222
+ suggestion="This config may require session restart",
223
+ )
224
+
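For reference, the config shape this method reads (the specific Spark keys below are illustrative):

engine_config = {
    "performance": {
        "spark_config": {
            "spark.sql.shuffle.partitions": "200",
            "spark.sql.adaptive.coalescePartitions.enabled": "true",
        }
    }
}
# Each pair is applied with spark.conf.set(key, value); keys that cannot be changed
# on a running session only produce a warning, not an error.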
225
+ def _apply_table_properties(
226
+ self, target: str, properties: Dict[str, str], is_table: bool = False
227
+ ) -> None:
228
+ """Apply table properties to a Delta table.
229
+
230
+ Performance: Batches all properties into a single ALTER TABLE statement
231
+ to avoid multiple round-trips to the catalog.
232
+ """
233
+ if not properties:
234
+ return
235
+
236
+ ctx = get_logging_context().with_context(engine="spark")
237
+
238
+ try:
239
+ table_ref = target if is_table else f"delta.`{target}`"
240
+ ctx.debug(
241
+ f"Applying table properties to {target}",
242
+ properties_count=len(properties),
243
+ is_table=is_table,
244
+ )
245
+
246
+ props_list = [f"'{k}' = '{v}'" for k, v in properties.items()]
247
+ props_str = ", ".join(props_list)
248
+ sql = f"ALTER TABLE {table_ref} SET TBLPROPERTIES ({props_str})"
249
+ self.spark.sql(sql)
250
+ ctx.debug(f"Set {len(properties)} table properties in single statement")
251
+
252
+ except Exception as e:
253
+ ctx.warning(
254
+ f"Failed to set table properties on {target}",
255
+ error_type=type(e).__name__,
256
+ error_message=str(e),
257
+ )
258
+
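For example (path and property names are illustrative), two properties against a path target are batched into a single statement of the form:

# engine._apply_table_properties("/mnt/lake/orders",
#     {"delta.enableChangeDataFeed": "true",
#      "delta.logRetentionDuration": "interval 30 days"})
# -> ALTER TABLE delta.`/mnt/lake/orders` SET TBLPROPERTIES (
#        'delta.enableChangeDataFeed' = 'true', 'delta.logRetentionDuration' = 'interval 30 days')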
259
+ def _optimize_delta_write(
260
+ self, target: str, options: Dict[str, Any], is_table: bool = False
261
+ ) -> None:
262
+ """Run Delta Lake optimization (OPTIMIZE / ZORDER)."""
263
+ should_optimize = options.get("optimize_write", False)
264
+ zorder_by = options.get("zorder_by")
265
+
266
+ if not should_optimize and not zorder_by:
267
+ return
268
+
269
+ ctx = get_logging_context().with_context(engine="spark")
270
+ start_time = time.time()
271
+
272
+ try:
273
+ if is_table:
274
+ sql = f"OPTIMIZE {target}"
275
+ else:
276
+ sql = f"OPTIMIZE delta.`{target}`"
277
+
278
+ if zorder_by:
279
+ if isinstance(zorder_by, str):
280
+ zorder_by = [zorder_by]
281
+ cols = ", ".join(zorder_by)
282
+ sql += f" ZORDER BY ({cols})"
283
+
284
+ ctx.debug("Running Delta optimization", sql=sql, target=target)
285
+ self.spark.sql(sql)
286
+
287
+ elapsed = (time.time() - start_time) * 1000
288
+ ctx.info(
289
+ "Delta optimization completed",
290
+ target=target,
291
+ zorder_by=zorder_by,
292
+ elapsed_ms=round(elapsed, 2),
293
+ )
294
+
295
+ except Exception as e:
296
+ elapsed = (time.time() - start_time) * 1000
297
+ ctx.warning(
298
+ f"Optimization failed for {target}",
299
+ error_type=type(e).__name__,
300
+ error_message=str(e),
301
+ elapsed_ms=round(elapsed, 2),
302
+ )
303
+
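A sketch of the SQL this generates (target path and columns are illustrative):

# options = {"optimize_write": True, "zorder_by": ["site", "day"]} on a path target runs
#   OPTIMIZE delta.`/mnt/lake/orders` ZORDER BY (site, day)
# Failures here are logged as warnings, so a failed OPTIMIZE never fails the write itself.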
304
+ def _get_last_delta_commit_info(
305
+ self, target: str, is_table: bool = False
306
+ ) -> Optional[Dict[str, Any]]:
307
+ """Get metadata for the most recent Delta commit."""
308
+ ctx = get_logging_context().with_context(engine="spark")
309
+
310
+ try:
311
+ from delta.tables import DeltaTable
312
+
313
+ if is_table:
314
+ dt = DeltaTable.forName(self.spark, target)
315
+ else:
316
+ dt = DeltaTable.forPath(self.spark, target)
317
+
318
+ last_commit = dt.history(1).collect()[0]
319
+
320
+ def safe_get(row, field):
321
+ if hasattr(row, field):
322
+ return getattr(row, field)
323
+ if hasattr(row, "__getitem__"):
324
+ try:
325
+ return row[field]
326
+ except (KeyError, ValueError):
327
+ return None
328
+ return None
329
+
330
+ commit_info = {
331
+ "version": safe_get(last_commit, "version"),
332
+ "timestamp": safe_get(last_commit, "timestamp"),
333
+ "operation": safe_get(last_commit, "operation"),
334
+ "operation_metrics": safe_get(last_commit, "operationMetrics"),
335
+ "read_version": safe_get(last_commit, "readVersion"),
336
+ }
337
+
338
+ ctx.debug(
339
+ "Delta commit metadata retrieved",
340
+ target=target,
341
+ version=commit_info.get("version"),
342
+ operation=commit_info.get("operation"),
343
+ )
344
+
345
+ return commit_info
346
+
347
+ except Exception as e:
348
+ ctx.warning(
349
+ f"Failed to fetch Delta commit info for {target}",
350
+ error_type=type(e).__name__,
351
+ error_message=str(e),
352
+ )
353
+ return None
354
+
355
+ def harmonize_schema(self, df, target_schema: Dict[str, str], policy: Any):
356
+ """Harmonize DataFrame schema with target schema according to policy."""
357
+ from pyspark.sql.functions import col, lit
358
+
359
+ from odibi.config import OnMissingColumns, OnNewColumns, SchemaMode
360
+
361
+ ctx = get_logging_context().with_context(engine="spark")
362
+
363
+ target_cols = list(target_schema.keys())
364
+ current_cols = df.columns
365
+
366
+ missing = set(target_cols) - set(current_cols)
367
+ new_cols = set(current_cols) - set(target_cols)
368
+
369
+ ctx.debug(
370
+ "Schema harmonization",
371
+ target_columns=len(target_cols),
372
+ current_columns=len(current_cols),
373
+ missing_columns=list(missing) if missing else None,
374
+ new_columns=list(new_cols) if new_cols else None,
375
+ policy_mode=policy.mode.value if hasattr(policy.mode, "value") else str(policy.mode),
376
+ )
377
+
378
+ # Check Validations
379
+ if missing and policy.on_missing_columns == OnMissingColumns.FAIL:
380
+ ctx.error(
381
+ f"Schema Policy Violation: Missing columns {missing}",
382
+ missing_columns=list(missing),
383
+ )
384
+ raise ValueError(f"Schema Policy Violation: Missing columns {missing}")
385
+
386
+ if new_cols and policy.on_new_columns == OnNewColumns.FAIL:
387
+ ctx.error(
388
+ f"Schema Policy Violation: New columns {new_cols}",
389
+ new_columns=list(new_cols),
390
+ )
391
+ raise ValueError(f"Schema Policy Violation: New columns {new_cols}")
392
+
393
+ # Apply Transformations
394
+ if policy.mode == SchemaMode.EVOLVE and policy.on_new_columns == OnNewColumns.ADD_NULLABLE:
395
+ res = df
396
+ for c in missing:
397
+ res = res.withColumn(c, lit(None))
398
+ ctx.debug("Schema evolved: added missing columns as null")
399
+ return res
400
+ else:
401
+ select_exprs = []
402
+ for c in target_cols:
403
+ if c in current_cols:
404
+ select_exprs.append(col(c))
405
+ else:
406
+ select_exprs.append(lit(None).alias(c))
407
+
408
+ ctx.debug("Schema enforced: projected to target schema")
409
+ return df.select(*select_exprs)
410
+
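A behavioural sketch, assuming a policy object exposing the attributes read above (mode, on_missing_columns, on_new_columns):

# target_schema = {"id": "bigint", "name": "string", "loaded_at": "timestamp"}
# df columns    = ["id", "name", "extra"]
#
# - on_missing_columns=FAIL                   -> raises (loaded_at is missing)
# - on_new_columns=FAIL                       -> raises (extra is unexpected)
# - mode=EVOLVE + on_new_columns=ADD_NULLABLE -> loaded_at added as null, extra kept
# - any other combination                     -> projected to exactly [id, name, loaded_at], extra dropped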
411
+ def anonymize(self, df, columns: List[str], method: str, salt: Optional[str] = None):
412
+ """Anonymize columns using Spark functions."""
413
+ from pyspark.sql.functions import col, concat, lit, regexp_replace, sha2
414
+
415
+ ctx = get_logging_context().with_context(engine="spark")
416
+ ctx.debug(
417
+ "Anonymizing columns",
418
+ columns=columns,
419
+ method=method,
420
+ has_salt=salt is not None,
421
+ )
422
+
423
+ res = df
424
+ for c in columns:
425
+ if c not in df.columns:
426
+ ctx.warning(f"Column '{c}' not found for anonymization, skipping", column=c)
427
+ continue
428
+
429
+ if method == "hash":
430
+ if salt:
431
+ res = res.withColumn(c, sha2(concat(col(c), lit(salt)), 256))
432
+ else:
433
+ res = res.withColumn(c, sha2(col(c), 256))
434
+
435
+ elif method == "mask":
436
+ res = res.withColumn(c, regexp_replace(col(c), ".(?=.{4})", "*"))
437
+
438
+ elif method == "redact":
439
+ res = res.withColumn(c, lit("[REDACTED]"))
440
+
441
+ ctx.debug(f"Anonymization completed using {method}")
442
+ return res
443
+
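In practice (column name illustrative):

# engine.anonymize(df, ["email"], method="mask")
#   replaces every character that is followed by at least four more, so only the
#   last four characters of each value survive the mask.
# method="hash"   -> sha2(col, 256), or sha2(concat(col, salt), 256) when a salt is given
# method="redact" -> the literal "[REDACTED]"
# Columns missing from df are skipped with a warning rather than raising.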
444
+ def get_schema(self, df) -> Dict[str, str]:
445
+ """Get DataFrame schema with types."""
446
+ return {f.name: f.dataType.simpleString() for f in df.schema}
447
+
448
+ def get_shape(self, df) -> Tuple[int, int]:
449
+ """Get DataFrame shape as (rows, columns)."""
450
+ return (df.count(), len(df.columns))
451
+
452
+ def count_rows(self, df) -> int:
453
+ """Count rows in DataFrame."""
454
+ return df.count()
455
+
456
+ def read(
457
+ self,
458
+ connection: Any,
459
+ format: str,
460
+ table: Optional[str] = None,
461
+ path: Optional[str] = None,
462
+ streaming: bool = False,
463
+ schema: Optional[str] = None,
464
+ options: Optional[Dict[str, Any]] = None,
465
+ as_of_version: Optional[int] = None,
466
+ as_of_timestamp: Optional[str] = None,
467
+ ) -> Any:
468
+ """Read data using Spark.
469
+
470
+ Args:
471
+ connection: Connection object (with get_path method)
472
+ format: Data format (csv, parquet, json, delta, sql_server)
473
+ table: Table name
474
+ path: File path
475
+ streaming: Whether to read as a stream (readStream)
476
+ schema: Schema string in DDL format (required for streaming file sources)
477
+ options: Format-specific options (including versionAsOf for Delta time travel)
478
+ as_of_version: Time travel version
479
+ as_of_timestamp: Time travel timestamp
480
+
481
+ Returns:
482
+ Spark DataFrame (or Streaming DataFrame)
483
+ """
484
+ ctx = get_logging_context().with_context(engine="spark")
485
+ start_time = time.time()
486
+ options = options or {}
487
+
488
+ source_identifier = table or path or "unknown"
489
+ ctx.debug(
490
+ "Starting Spark read",
491
+ format=format,
492
+ source=source_identifier,
493
+ streaming=streaming,
494
+ as_of_version=as_of_version,
495
+ as_of_timestamp=as_of_timestamp,
496
+ )
497
+
498
+ # Handle Time Travel options
499
+ if as_of_version is not None:
500
+ options["versionAsOf"] = as_of_version
501
+ ctx.debug(f"Time travel enabled: version {as_of_version}")
502
+ if as_of_timestamp is not None:
503
+ options["timestampAsOf"] = as_of_timestamp
504
+ ctx.debug(f"Time travel enabled: timestamp {as_of_timestamp}")
505
+
506
+ # SQL Server / Azure SQL Support
507
+ if format in ["sql", "sql_server", "azure_sql"]:
508
+ if streaming:
509
+ ctx.error("Streaming not supported for SQL Server / Azure SQL")
510
+ raise ValueError("Streaming not supported for SQL Server / Azure SQL yet.")
511
+
512
+ if not hasattr(connection, "get_spark_options"):
513
+ conn_type = type(connection).__name__
514
+ msg = f"Connection type '{conn_type}' does not support Spark SQL read"
515
+ ctx.error(msg, connection_type=conn_type)
516
+ raise ValueError(msg)
517
+
518
+ jdbc_options = connection.get_spark_options()
519
+ merged_options = {**jdbc_options, **options}
520
+
521
+ # Extract filter for SQL pushdown
522
+ sql_filter = merged_options.pop("filter", None)
523
+
524
+ if "query" in merged_options:
525
+ merged_options.pop("dbtable", None)
526
+ # If filter provided with query, append to WHERE clause
527
+ if sql_filter:
528
+ existing_query = merged_options["query"]
529
+ # Wrap existing query and add filter
530
+ if "WHERE" in existing_query.upper():
531
+ merged_options["query"] = f"({existing_query}) AND ({sql_filter})"
532
+ else:
533
+ subquery = f"SELECT * FROM ({existing_query}) AS _subq WHERE {sql_filter}"
534
+ merged_options["query"] = subquery
535
+ ctx.debug(f"Applied SQL pushdown filter to query: {sql_filter}")
536
+ elif table:
537
+ # Build query with filter pushdown instead of using dbtable
538
+ if sql_filter:
539
+ merged_options.pop("dbtable", None)
540
+ merged_options["query"] = f"SELECT * FROM {table} WHERE {sql_filter}"
541
+ ctx.debug(f"Applied SQL pushdown filter: {sql_filter}")
542
+ else:
543
+ merged_options["dbtable"] = table
544
+ elif "dbtable" not in merged_options:
545
+ ctx.error("SQL format requires 'table' config or 'query' option")
546
+ raise ValueError("SQL format requires 'table' config or 'query' option")
547
+
548
+ ctx.debug("Executing JDBC read", has_query="query" in merged_options)
549
+
550
+ try:
551
+ df = self.spark.read.format("jdbc").options(**merged_options).load()
552
+ elapsed = (time.time() - start_time) * 1000
553
+ partition_count = df.rdd.getNumPartitions()
554
+
555
+ ctx.log_file_io(path=source_identifier, format=format, mode="read")
556
+ ctx.log_spark_metrics(partition_count=partition_count)
557
+ ctx.info(
558
+ "JDBC read completed",
559
+ source=source_identifier,
560
+ elapsed_ms=round(elapsed, 2),
561
+ partitions=partition_count,
562
+ )
563
+ return df
564
+
565
+ except Exception as e:
566
+ elapsed = (time.time() - start_time) * 1000
567
+ ctx.error(
568
+ "JDBC read failed",
569
+ source=source_identifier,
570
+ error_type=type(e).__name__,
571
+ error_message=str(e),
572
+ elapsed_ms=round(elapsed, 2),
573
+ )
574
+ raise
575
+
576
+ # Read based on format
577
+ if table:
578
+ # Managed/External Table (Catalog)
579
+ ctx.debug(f"Reading from catalog table: {table}")
580
+
581
+ if streaming:
582
+ reader = self.spark.readStream.format(format)
583
+ else:
584
+ reader = self.spark.read.format(format)
585
+
586
+ for key, value in options.items():
587
+ reader = reader.option(key, value)
588
+
589
+ try:
590
+ df = reader.table(table)
591
+
592
+ if "filter" in options:
593
+ df = df.filter(options["filter"])
594
+ ctx.debug(f"Applied filter: {options['filter']}")
595
+
596
+ elapsed = (time.time() - start_time) * 1000
597
+
598
+ if not streaming:
599
+ partition_count = df.rdd.getNumPartitions()
600
+ ctx.log_spark_metrics(partition_count=partition_count)
601
+ ctx.log_file_io(path=table, format=format, mode="read")
602
+ ctx.info(
603
+ f"Table read completed: {table}",
604
+ elapsed_ms=round(elapsed, 2),
605
+ partitions=partition_count,
606
+ )
607
+ else:
608
+ ctx.info(f"Streaming read started: {table}", elapsed_ms=round(elapsed, 2))
609
+
610
+ return df
611
+
612
+ except Exception as e:
613
+ elapsed = (time.time() - start_time) * 1000
614
+ ctx.error(
615
+ f"Table read failed: {table}",
616
+ error_type=type(e).__name__,
617
+ error_message=str(e),
618
+ elapsed_ms=round(elapsed, 2),
619
+ )
620
+ raise
621
+
622
+ elif path:
623
+ # File Path
624
+ full_path = connection.get_path(path)
625
+ ctx.debug(f"Reading from path: {full_path}")
626
+
627
+ # Auto-detect encoding for CSV (Batch only)
628
+ if not streaming and format == "csv" and options.get("auto_encoding"):
629
+ options = options.copy()
630
+ options.pop("auto_encoding")
631
+
632
+ if "encoding" not in options:
633
+ try:
634
+ from odibi.utils.encoding import detect_encoding
635
+
636
+ detected = detect_encoding(connection, path)
637
+ if detected:
638
+ options["encoding"] = detected
639
+ ctx.debug(f"Detected encoding: {detected}", path=path)
640
+ except ImportError:
641
+ pass
642
+ except Exception as e:
643
+ ctx.warning(
644
+ f"Encoding detection failed for {path}",
645
+ error_message=str(e),
646
+ )
647
+
648
+ if streaming:
649
+ reader = self.spark.readStream.format(format)
650
+ if schema:
651
+ reader = reader.schema(schema)
652
+ ctx.debug(f"Applied schema for streaming read: {schema[:100]}...")
653
+ else:
654
+ # Determine if we should warn about missing schema
655
+ # Formats that can infer schema: delta, parquet, avro (embedded schema)
656
+ # cloudFiles with schemaLocation or self-describing formats (avro, parquet) are fine
657
+ should_warn = True
658
+
659
+ if format in ["delta", "parquet"]:
660
+ should_warn = False
661
+ elif format == "cloudFiles":
662
+ cloud_format = options.get("cloudFiles.format", "")
663
+ has_schema_location = "cloudFiles.schemaLocation" in options
664
+ # avro and parquet have embedded schemas
665
+ if cloud_format in ["avro", "parquet"] or has_schema_location:
666
+ should_warn = False
667
+
668
+ if should_warn:
669
+ ctx.warning(
670
+ f"Streaming read from '{format}' format without schema. "
671
+ "Schema inference is not supported for streaming sources. "
672
+ "Consider adding 'schema' to your read config."
673
+ )
674
+ else:
675
+ reader = self.spark.read.format(format)
676
+ if schema:
677
+ reader = reader.schema(schema)
678
+
679
+ for key, value in options.items():
680
+ if key == "header" and isinstance(value, bool):
681
+ value = str(value).lower()
682
+ reader = reader.option(key, value)
683
+
684
+ try:
685
+ df = reader.load(full_path)
686
+
687
+ if "filter" in options:
688
+ df = df.filter(options["filter"])
689
+ ctx.debug(f"Applied filter: {options['filter']}")
690
+
691
+ elapsed = (time.time() - start_time) * 1000
692
+
693
+ if not streaming:
694
+ partition_count = df.rdd.getNumPartitions()
695
+ ctx.log_spark_metrics(partition_count=partition_count)
696
+ ctx.log_file_io(path=path, format=format, mode="read")
697
+ ctx.info(
698
+ f"File read completed: {path}",
699
+ elapsed_ms=round(elapsed, 2),
700
+ partitions=partition_count,
701
+ format=format,
702
+ )
703
+ else:
704
+ ctx.info(f"Streaming read started: {path}", elapsed_ms=round(elapsed, 2))
705
+
706
+ return df
707
+
708
+ except Exception as e:
709
+ elapsed = (time.time() - start_time) * 1000
710
+ ctx.error(
711
+ f"File read failed: {path}",
712
+ error_type=type(e).__name__,
713
+ error_message=str(e),
714
+ elapsed_ms=round(elapsed, 2),
715
+ format=format,
716
+ )
717
+ raise
718
+ else:
719
+ ctx.error("Either path or table must be provided")
720
+ raise ValueError("Either path or table must be provided")
721
+
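Two illustrative calls (connection objects, paths, and table names are assumptions, not from this diff):

# Delta time travel: as_of_version is forwarded as options["versionAsOf"].
orders_v42 = engine.read(connection=lake_conn, format="delta",
                         path="silver/orders", as_of_version=42)

# JDBC with filter pushdown: the filter is rewritten into
#   SELECT * FROM dbo.Orders WHERE OrderDate >= '2024-01-01'
recent = engine.read(connection=azure_sql_conn, format="sql_server",
                     table="dbo.Orders",
                     options={"filter": "OrderDate >= '2024-01-01'"})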
722
+ def write(
723
+ self,
724
+ df: Any,
725
+ connection: Any,
726
+ format: str,
727
+ table: Optional[str] = None,
728
+ path: Optional[str] = None,
729
+ register_table: Optional[str] = None,
730
+ mode: str = "overwrite",
731
+ options: Optional[Dict[str, Any]] = None,
732
+ streaming_config: Optional[Any] = None,
733
+ ) -> Optional[Dict[str, Any]]:
734
+ """Write data using Spark.
735
+
736
+ Args:
737
+ df: Spark DataFrame to write
738
+ connection: Connection object
739
+ format: Output format (csv, parquet, json, delta)
740
+ table: Table name
741
+ path: File path
742
+ register_table: Name to register as external table (if path is used)
743
+ mode: Write mode (overwrite, append, error, ignore, upsert, append_once)
744
+ options: Format-specific options (including partition_by for partitioning)
745
+ streaming_config: StreamingWriteConfig for streaming DataFrames
746
+
747
+ Returns:
748
+ Optional dictionary containing Delta commit metadata (if format=delta),
749
+ or streaming query info (if streaming)
750
+ """
751
+ ctx = get_logging_context().with_context(engine="spark")
752
+ start_time = time.time()
753
+ options = options or {}
754
+
755
+ if getattr(df, "isStreaming", False) is True:
756
+ return self._write_streaming(
757
+ df=df,
758
+ connection=connection,
759
+ format=format,
760
+ table=table,
761
+ path=path,
762
+ register_table=register_table,
763
+ options=options,
764
+ streaming_config=streaming_config,
765
+ )
766
+
767
+ target_identifier = table or path or "unknown"
768
+ try:
769
+ partition_count = df.rdd.getNumPartitions()
770
+ except Exception:
771
+ partition_count = 1 # Fallback for mocks or unsupported DataFrames
772
+
773
+ # Auto-coalesce DataFrames for Delta writes to reduce file overhead
774
+ # Use coalesce_partitions option to explicitly set target partitions
775
+ # NOTE: We avoid df.count() here as it would trigger double-evaluation of lazy DataFrames
776
+ coalesce_partitions = options.pop("coalesce_partitions", None)
777
+ if (
778
+ coalesce_partitions
779
+ and isinstance(partition_count, int)
780
+ and partition_count > coalesce_partitions
781
+ ):
782
+ df = df.coalesce(coalesce_partitions)
783
+ ctx.debug(
784
+ f"Coalesced DataFrame to {coalesce_partitions} partition(s)",
785
+ original_partitions=partition_count,
786
+ )
787
+ partition_count = coalesce_partitions
788
+
789
+ ctx.debug(
790
+ "Starting Spark write",
791
+ format=format,
792
+ target=target_identifier,
793
+ mode=mode,
794
+ partitions=partition_count,
795
+ )
796
+
797
+ # SQL Server / Azure SQL Support
798
+ if format in ["sql", "sql_server", "azure_sql"]:
799
+ if not hasattr(connection, "get_spark_options"):
800
+ conn_type = type(connection).__name__
801
+ msg = f"Connection type '{conn_type}' does not support Spark SQL write"
802
+ ctx.error(msg, connection_type=conn_type)
803
+ raise ValueError(msg)
804
+
805
+ jdbc_options = connection.get_spark_options()
806
+ merged_options = {**jdbc_options, **options}
807
+
808
+ if table:
809
+ merged_options["dbtable"] = table
810
+ elif "dbtable" not in merged_options:
811
+ ctx.error("SQL format requires 'table' config or 'dbtable' option")
812
+ raise ValueError("SQL format requires 'table' config or 'dbtable' option")
813
+
814
+ # Handle MERGE mode for SQL Server
815
+ if mode == "merge":
816
+ merge_keys = options.get("merge_keys")
817
+ merge_options = options.get("merge_options")
818
+
819
+ if not merge_keys:
820
+ ctx.error("MERGE mode requires 'merge_keys' in options")
821
+ raise ValueError(
822
+ "MERGE mode requires 'merge_keys' in options. "
823
+ "Specify the key columns for the MERGE ON clause."
824
+ )
825
+
826
+ from odibi.writers.sql_server_writer import SqlServerMergeWriter
827
+
828
+ writer = SqlServerMergeWriter(connection)
829
+ ctx.debug(
830
+ "Executing SQL Server MERGE",
831
+ target=table,
832
+ merge_keys=merge_keys,
833
+ )
834
+
835
+ try:
836
+ result = writer.merge(
837
+ df=df,
838
+ spark_engine=self,
839
+ target_table=table,
840
+ merge_keys=merge_keys,
841
+ options=merge_options,
842
+ jdbc_options=jdbc_options,
843
+ )
844
+ elapsed = (time.time() - start_time) * 1000
845
+ ctx.log_file_io(path=target_identifier, format=format, mode="write")
846
+ ctx.info(
847
+ "SQL Server MERGE completed",
848
+ target=target_identifier,
849
+ mode=mode,
850
+ inserted=result.inserted,
851
+ updated=result.updated,
852
+ deleted=result.deleted,
853
+ elapsed_ms=round(elapsed, 2),
854
+ )
855
+ return {
856
+ "mode": "merge",
857
+ "inserted": result.inserted,
858
+ "updated": result.updated,
859
+ "deleted": result.deleted,
860
+ "total_affected": result.total_affected,
861
+ }
862
+
863
+ except Exception as e:
864
+ elapsed = (time.time() - start_time) * 1000
865
+ ctx.error(
866
+ "SQL Server MERGE failed",
867
+ target=target_identifier,
868
+ error_type=type(e).__name__,
869
+ error_message=str(e),
870
+ elapsed_ms=round(elapsed, 2),
871
+ )
872
+ raise
873
+
874
+ # Handle enhanced overwrite with strategies
875
+ if mode == "overwrite" and options.get("overwrite_options"):
876
+ from odibi.writers.sql_server_writer import SqlServerMergeWriter
877
+
878
+ overwrite_options = options.get("overwrite_options")
879
+ writer = SqlServerMergeWriter(connection)
880
+
881
+ ctx.debug(
882
+ "Executing SQL Server enhanced overwrite",
883
+ target=table,
884
+ strategy=(
885
+ overwrite_options.strategy.value
886
+ if hasattr(overwrite_options, "strategy")
887
+ else "truncate_insert"
888
+ ),
889
+ )
890
+
891
+ try:
892
+ result = writer.overwrite_spark(
893
+ df=df,
894
+ target_table=table,
895
+ options=overwrite_options,
896
+ jdbc_options=jdbc_options,
897
+ )
898
+ elapsed = (time.time() - start_time) * 1000
899
+ ctx.log_file_io(path=target_identifier, format=format, mode="write")
900
+ ctx.info(
901
+ "SQL Server enhanced overwrite completed",
902
+ target=target_identifier,
903
+ strategy=result.strategy,
904
+ rows_written=result.rows_written,
905
+ elapsed_ms=round(elapsed, 2),
906
+ )
907
+ return {
908
+ "mode": "overwrite",
909
+ "strategy": result.strategy,
910
+ "rows_written": result.rows_written,
911
+ }
912
+
913
+ except Exception as e:
914
+ elapsed = (time.time() - start_time) * 1000
915
+ ctx.error(
916
+ "SQL Server enhanced overwrite failed",
917
+ target=target_identifier,
918
+ error_type=type(e).__name__,
919
+ error_message=str(e),
920
+ elapsed_ms=round(elapsed, 2),
921
+ )
922
+ raise
923
+
924
+ if mode not in ["overwrite", "append", "ignore", "error"]:
925
+ if mode == "fail":
926
+ mode = "error"
927
+ else:
928
+ ctx.error(f"Write mode '{mode}' not supported for Spark SQL write")
929
+ raise ValueError(f"Write mode '{mode}' not supported for Spark SQL write")
930
+
931
+ ctx.debug("Executing JDBC write", target=table or merged_options.get("dbtable"))
932
+
933
+ try:
934
+ df.write.format("jdbc").options(**merged_options).mode(mode).save()
935
+ elapsed = (time.time() - start_time) * 1000
936
+ ctx.log_file_io(path=target_identifier, format=format, mode="write")
937
+ ctx.info(
938
+ "JDBC write completed",
939
+ target=target_identifier,
940
+ mode=mode,
941
+ elapsed_ms=round(elapsed, 2),
942
+ )
943
+ return None
944
+
945
+ except Exception as e:
946
+ elapsed = (time.time() - start_time) * 1000
947
+ ctx.error(
948
+ "JDBC write failed",
949
+ target=target_identifier,
950
+ error_type=type(e).__name__,
951
+ error_message=str(e),
952
+ elapsed_ms=round(elapsed, 2),
953
+ )
954
+ raise
955
+
956
+ # Handle Upsert/AppendOnce (Delta Only)
957
+ if mode in ["upsert", "append_once"]:
958
+ if format != "delta":
959
+ ctx.error(f"Mode '{mode}' only supported for Delta format")
960
+ raise NotImplementedError(
961
+ f"Mode '{mode}' only supported for Delta format in Spark engine."
962
+ )
963
+
964
+ keys = options.get("keys")
965
+ if not keys:
966
+ ctx.error(f"Mode '{mode}' requires 'keys' list in options")
967
+ raise ValueError(f"Mode '{mode}' requires 'keys' list in options")
968
+
969
+ if isinstance(keys, str):
970
+ keys = [keys]
971
+
972
+ exists = self.table_exists(connection, table, path)
973
+ ctx.debug("Table existence check for merge", target=target_identifier, exists=exists)
974
+
975
+ if not exists:
976
+ mode = "overwrite"
977
+ ctx.debug("Target does not exist, falling back to overwrite mode")
978
+ else:
979
+ from delta.tables import DeltaTable
980
+
981
+ target_dt = None
982
+ target_name = ""
983
+ is_table_target = False
984
+
985
+ if table:
986
+ target_dt = DeltaTable.forName(self.spark, table)
987
+ target_name = table
988
+ is_table_target = True
989
+ elif path:
990
+ full_path = connection.get_path(path)
991
+ target_dt = DeltaTable.forPath(self.spark, full_path)
992
+ target_name = full_path
993
+ is_table_target = False
994
+
995
+ condition = " AND ".join([f"target.`{k}` = source.`{k}`" for k in keys])
996
+ ctx.debug("Executing Delta merge", merge_mode=mode, keys=keys, condition=condition)
997
+
998
+ merge_builder = target_dt.alias("target").merge(df.alias("source"), condition)
999
+
1000
+ try:
1001
+ if mode == "upsert":
1002
+ merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
1003
+ elif mode == "append_once":
1004
+ merge_builder.whenNotMatchedInsertAll().execute()
1005
+
1006
+ elapsed = (time.time() - start_time) * 1000
1007
+ ctx.info(
1008
+ "Delta merge completed",
1009
+ target=target_name,
1010
+ mode=mode,
1011
+ elapsed_ms=round(elapsed, 2),
1012
+ )
1013
+
1014
+ self._optimize_delta_write(target_name, options, is_table=is_table_target)
1015
+ commit_info = self._get_last_delta_commit_info(
1016
+ target_name, is_table=is_table_target
1017
+ )
1018
+
1019
+ if commit_info:
1020
+ ctx.debug(
1021
+ "Delta commit info",
1022
+ version=commit_info.get("version"),
1023
+ operation=commit_info.get("operation"),
1024
+ )
1025
+
1026
+ return commit_info
1027
+
1028
+ except Exception as e:
1029
+ elapsed = (time.time() - start_time) * 1000
1030
+ ctx.error(
1031
+ "Delta merge failed",
1032
+ target=target_name,
1033
+ error_type=type(e).__name__,
1034
+ error_message=str(e),
1035
+ elapsed_ms=round(elapsed, 2),
1036
+ )
1037
+ raise
1038
+
1039
+ # Get output location
1040
+ if table:
1041
+ # Managed/External Table (Catalog)
1042
+ ctx.debug(f"Writing to catalog table: {table}")
1043
+ writer = df.write.format(format).mode(mode)
1044
+
1045
+ partition_by = options.get("partition_by")
1046
+ if partition_by:
1047
+ if isinstance(partition_by, str):
1048
+ partition_by = [partition_by]
1049
+ writer = writer.partitionBy(*partition_by)
1050
+ ctx.debug(f"Partitioning by: {partition_by}")
1051
+
1052
+ for key, value in options.items():
1053
+ writer = writer.option(key, value)
1054
+
1055
+ try:
1056
+ writer.saveAsTable(table)
1057
+ elapsed = (time.time() - start_time) * 1000
1058
+
1059
+ ctx.log_file_io(
1060
+ path=table,
1061
+ format=format,
1062
+ mode=mode,
1063
+ partitions=partition_by,
1064
+ )
1065
+ ctx.info(
1066
+ f"Table write completed: {table}",
1067
+ mode=mode,
1068
+ elapsed_ms=round(elapsed, 2),
1069
+ )
1070
+
1071
+ if format == "delta":
1072
+ self._optimize_delta_write(table, options, is_table=True)
1073
+ return self._get_last_delta_commit_info(table, is_table=True)
1074
+ return None
1075
+
1076
+ except Exception as e:
1077
+ elapsed = (time.time() - start_time) * 1000
1078
+ ctx.error(
1079
+ f"Table write failed: {table}",
1080
+ error_type=type(e).__name__,
1081
+ error_message=str(e),
1082
+ elapsed_ms=round(elapsed, 2),
1083
+ )
1084
+ raise
1085
+
1086
+ elif path:
1087
+ full_path = connection.get_path(path)
1088
+ else:
1089
+ ctx.error("Either path or table must be provided")
1090
+ raise ValueError("Either path or table must be provided")
1091
+
1092
+ # Extract partition_by option
1093
+ partition_by = options.pop("partition_by", None) or options.pop("partitionBy", None)
1094
+
1095
+ # Extract cluster_by option (Liquid Clustering)
1096
+ cluster_by = options.pop("cluster_by", None)
1097
+
1098
+ # Warn about partitioning anti-patterns
1099
+ if partition_by and cluster_by:
1100
+ import warnings
1101
+
1102
+ ctx.warning(
1103
+ "Conflict: Both 'partition_by' and 'cluster_by' are set",
1104
+ partition_by=partition_by,
1105
+ cluster_by=cluster_by,
1106
+ )
1107
+ warnings.warn(
1108
+ "⚠️ Conflict: Both 'partition_by' and 'cluster_by' (Liquid Clustering) are set. "
1109
+ "Liquid Clustering supersedes partitioning. 'partition_by' will be ignored "
1110
+ "if the table is being created now.",
1111
+ UserWarning,
1112
+ )
1113
+
1114
+ elif partition_by:
1115
+ import warnings
1116
+
1117
+ ctx.warning(
1118
+ "Partitioning warning: ensure low-cardinality columns",
1119
+ partition_by=partition_by,
1120
+ )
1121
+ warnings.warn(
1122
+ "⚠️ Partitioning can cause performance issues if misused. "
1123
+ "Only partition on low-cardinality columns (< 1000 unique values) "
1124
+ "and ensure each partition has > 1000 rows.",
1125
+ UserWarning,
1126
+ )
1127
+
1128
+ # Handle Upsert/Append-Once for Delta Lake (Path-based only for now)
1129
+ if format == "delta" and mode in ["upsert", "append_once"]:
1130
+ try:
1131
+ from delta.tables import DeltaTable
1132
+ except ImportError:
1133
+ ctx.error("Delta Lake support requires 'delta-spark'")
1134
+ raise ImportError("Delta Lake support requires 'delta-spark'")
1135
+
1136
+ if "keys" not in options:
1137
+ ctx.error(f"Mode '{mode}' requires 'keys' list in options")
1138
+ raise ValueError(f"Mode '{mode}' requires 'keys' list in options")
1139
+
1140
+ if DeltaTable.isDeltaTable(self.spark, full_path):
1141
+ ctx.debug(f"Performing Delta merge at path: {full_path}")
1142
+ delta_table = DeltaTable.forPath(self.spark, full_path)
1143
+ keys = options["keys"]
1144
+ if isinstance(keys, str):
1145
+ keys = [keys]
1146
+
1147
+ condition = " AND ".join([f"target.{k} = source.{k}" for k in keys])
1148
+ merger = delta_table.alias("target").merge(df.alias("source"), condition)
1149
+
1150
+ try:
1151
+ if mode == "upsert":
1152
+ merger.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
1153
+ else:
1154
+ merger.whenNotMatchedInsertAll().execute()
1155
+
1156
+ elapsed = (time.time() - start_time) * 1000
1157
+ ctx.info(
1158
+ "Delta merge completed at path",
1159
+ path=path,
1160
+ mode=mode,
1161
+ elapsed_ms=round(elapsed, 2),
1162
+ )
1163
+
1164
+ if register_table:
1165
+ try:
1166
+ table_in_catalog = self.spark.catalog.tableExists(register_table)
1167
+ needs_registration = not table_in_catalog
1168
+
1169
+ # Handle orphan catalog entries (only for path-not-found errors)
1170
+ if table_in_catalog:
1171
+ try:
1172
+ self.spark.table(register_table).limit(0).collect()
1173
+ ctx.debug(
1174
+ f"Table '{register_table}' already registered and valid"
1175
+ )
1176
+ except Exception as verify_err:
1177
+ error_str = str(verify_err)
1178
+ is_orphan = (
1179
+ "DELTA_PATH_DOES_NOT_EXIST" in error_str
1180
+ or "Path does not exist" in error_str
1181
+ or "FileNotFoundException" in error_str
1182
+ )
1183
+ if is_orphan:
1184
+ ctx.warning(
1185
+ f"Table '{register_table}' is orphan, re-registering"
1186
+ )
1187
+ try:
1188
+ self.spark.sql(f"DROP TABLE IF EXISTS {register_table}")
1189
+ except Exception:
1190
+ pass
1191
+ needs_registration = True
1192
+ else:
1193
+ ctx.debug(
1194
+ f"Table '{register_table}' verify failed, "
1195
+ "skipping registration"
1196
+ )
1197
+
1198
+ if needs_registration:
1199
+ create_sql = (
1200
+ f"CREATE TABLE IF NOT EXISTS {register_table} "
1201
+ f"USING DELTA LOCATION '{full_path}'"
1202
+ )
1203
+ self.spark.sql(create_sql)
1204
+ ctx.info(f"Registered table: {register_table}", path=full_path)
1205
+ except Exception as e:
1206
+ ctx.error(
1207
+ f"Failed to register external table '{register_table}'",
1208
+ error_message=str(e),
1209
+ )
1210
+
1211
+ self._optimize_delta_write(full_path, options, is_table=False)
1212
+ return self._get_last_delta_commit_info(full_path, is_table=False)
1213
+
1214
+ except Exception as e:
1215
+ elapsed = (time.time() - start_time) * 1000
1216
+ ctx.error(
1217
+ "Delta merge failed at path",
1218
+ path=path,
1219
+ error_type=type(e).__name__,
1220
+ error_message=str(e),
1221
+ elapsed_ms=round(elapsed, 2),
1222
+ )
1223
+ raise
1224
+ else:
1225
+ mode = "overwrite"
1226
+ ctx.debug("Target does not exist, falling back to overwrite mode")
1227
+
1228
+ # Write based on format (Path-based)
1229
+ ctx.debug(f"Writing to path: {full_path}")
1230
+
1231
+ # Handle Liquid Clustering (New Table Creation via SQL)
1232
+ if format == "delta" and cluster_by:
1233
+ should_create = False
1234
+ target_name = None
1235
+
1236
+ if table:
1237
+ target_name = table
1238
+ if mode == "overwrite":
1239
+ should_create = True
1240
+ elif mode == "append":
1241
+ if not self.spark.catalog.tableExists(table):
1242
+ should_create = True
1243
+ elif path:
1244
+ full_path = connection.get_path(path)
1245
+ target_name = f"delta.`{full_path}`"
1246
+ if mode == "overwrite":
1247
+ should_create = True
1248
+ elif mode == "append":
1249
+ try:
1250
+ from delta.tables import DeltaTable
1251
+
1252
+ if not DeltaTable.isDeltaTable(self.spark, full_path):
1253
+ should_create = True
1254
+ except ImportError:
1255
+ pass
1256
+
1257
+ if should_create:
1258
+ if isinstance(cluster_by, str):
1259
+ cluster_by = [cluster_by]
1260
+
1261
+ cols = ", ".join(cluster_by)
1262
+ temp_view = f"odibi_temp_writer_{abs(hash(str(target_name)))}"
1263
+ df.createOrReplaceTempView(temp_view)
1264
+
1265
+ create_cmd = (
1266
+ "CREATE OR REPLACE TABLE"
1267
+ if mode == "overwrite"
1268
+ else "CREATE TABLE IF NOT EXISTS"
1269
+ )
1270
+
1271
+ sql = (
1272
+ f"{create_cmd} {target_name} USING DELTA CLUSTER BY ({cols}) "
1273
+ f"AS SELECT * FROM {temp_view}"
1274
+ )
1275
+
1276
+ ctx.debug("Creating clustered Delta table", sql=sql, cluster_by=cluster_by)
1277
+
1278
+ try:
1279
+ self.spark.sql(sql)
1280
+ self.spark.catalog.dropTempView(temp_view)
1281
+
1282
+ elapsed = (time.time() - start_time) * 1000
1283
+ ctx.info(
1284
+ "Clustered Delta table created",
1285
+ target=target_name,
1286
+ cluster_by=cluster_by,
1287
+ elapsed_ms=round(elapsed, 2),
1288
+ )
1289
+
1290
+ if register_table and path:
1291
+ try:
1292
+ reg_sql = (
1293
+ f"CREATE TABLE IF NOT EXISTS {register_table} "
1294
+ f"USING DELTA LOCATION '{full_path}'"
1295
+ )
1296
+ self.spark.sql(reg_sql)
1297
+ ctx.info(f"Registered table: {register_table}")
1298
+ except Exception:
1299
+ pass
1300
+
1301
+ if format == "delta":
1302
+ self._optimize_delta_write(
1303
+ target_name if table else full_path, options, is_table=bool(table)
1304
+ )
1305
+ return self._get_last_delta_commit_info(
1306
+ target_name if table else full_path, is_table=bool(table)
1307
+ )
1308
+ return None
1309
+
1310
+ except Exception as e:
1311
+ elapsed = (time.time() - start_time) * 1000
1312
+ ctx.error(
1313
+ "Failed to create clustered Delta table",
1314
+ error_type=type(e).__name__,
1315
+ error_message=str(e),
1316
+ elapsed_ms=round(elapsed, 2),
1317
+ )
1318
+ raise
1319
+
1320
+ # Extract table_properties from options
1321
+ table_properties = options.pop("table_properties", None)
1322
+
1323
+ # For column mapping and other properties that must be set BEFORE write
1324
+ original_configs = {}
1325
+ if table_properties and format == "delta":
1326
+ for prop_name, prop_value in table_properties.items():
1327
+ spark_conf_key = (
1328
+ f"spark.databricks.delta.properties.defaults.{prop_name.replace('delta.', '')}"
1329
+ )
1330
+ try:
1331
+ original_configs[spark_conf_key] = self.spark.conf.get(spark_conf_key, None)
1332
+ except Exception:
1333
+ original_configs[spark_conf_key] = None
1334
+ self.spark.conf.set(spark_conf_key, prop_value)
1335
+ ctx.debug(
1336
+ "Applied table properties as session defaults",
1337
+ properties=list(table_properties.keys()),
1338
+ )
1339
+
1340
+ writer = df.write.format(format).mode(mode)
1341
+
1342
+ if partition_by:
1343
+ if isinstance(partition_by, str):
1344
+ partition_by = [partition_by]
1345
+ writer = writer.partitionBy(*partition_by)
1346
+ ctx.debug(f"Partitioning by: {partition_by}")
1347
+
1348
+ for key, value in options.items():
1349
+ writer = writer.option(key, value)
1350
+
1351
+ try:
1352
+ writer.save(full_path)
1353
+ elapsed = (time.time() - start_time) * 1000
1354
+
1355
+ ctx.log_file_io(
1356
+ path=path,
1357
+ format=format,
1358
+ mode=mode,
1359
+ partitions=partition_by,
1360
+ )
1361
+ ctx.info(
1362
+ f"File write completed: {path}",
1363
+ format=format,
1364
+ mode=mode,
1365
+ elapsed_ms=round(elapsed, 2),
1366
+ )
1367
+
1368
+ except Exception as e:
1369
+ elapsed = (time.time() - start_time) * 1000
1370
+ ctx.error(
1371
+ f"File write failed: {path}",
1372
+ error_type=type(e).__name__,
1373
+ error_message=str(e),
1374
+ elapsed_ms=round(elapsed, 2),
1375
+ )
1376
+ raise
1377
+ finally:
1378
+ for conf_key, original_value in original_configs.items():
1379
+ if original_value is None:
1380
+ self.spark.conf.unset(conf_key)
1381
+ else:
1382
+ self.spark.conf.set(conf_key, original_value)
1383
+
1384
+ if format == "delta":
1385
+ self._optimize_delta_write(full_path, options, is_table=False)
1386
+
1387
+ if register_table and format == "delta":
1388
+ try:
1389
+ table_in_catalog = self.spark.catalog.tableExists(register_table)
1390
+ needs_registration = not table_in_catalog
1391
+
1392
+ # Handle orphan catalog entries: table exists but points to deleted path
1393
+ # Only treat as orphan if it's specifically a DELTA_PATH_DOES_NOT_EXIST error
1394
+ if table_in_catalog:
1395
+ try:
1396
+ self.spark.table(register_table).limit(0).collect()
1397
+ ctx.debug(
1398
+ f"Table '{register_table}' already registered and valid, "
1399
+ "skipping registration"
1400
+ )
1401
+ except Exception as verify_err:
1402
+ error_str = str(verify_err)
1403
+ is_orphan = (
1404
+ "DELTA_PATH_DOES_NOT_EXIST" in error_str
1405
+ or "Path does not exist" in error_str
1406
+ or "FileNotFoundException" in error_str
1407
+ )
1408
+
1409
+ if is_orphan:
1410
+ # Orphan entry - table in catalog but path was deleted
1411
+ ctx.warning(
1412
+ f"Table '{register_table}' is orphan (path deleted), "
1413
+ "dropping and re-registering",
1414
+ error_message=error_str[:200],
1415
+ )
1416
+ try:
1417
+ self.spark.sql(f"DROP TABLE IF EXISTS {register_table}")
1418
+ except Exception:
1419
+ pass # Best effort cleanup
1420
+ needs_registration = True
1421
+ else:
1422
+ # Other error (auth, network, etc.) - don't drop, just log
1423
+ ctx.debug(
1424
+ f"Table '{register_table}' exists but verify failed "
1425
+ "(not orphan), skipping registration",
1426
+ error_message=error_str[:200],
1427
+ )
1428
+
1429
+ if needs_registration:
1430
+ ctx.debug(f"Registering table '{register_table}' at '{full_path}'")
1431
+ reg_sql = (
1432
+ f"CREATE TABLE IF NOT EXISTS {register_table} "
1433
+ f"USING DELTA LOCATION '{full_path}'"
1434
+ )
1435
+ self.spark.sql(reg_sql)
1436
+ ctx.info(f"Registered table: {register_table}", path=full_path)
1437
+ except Exception as e:
1438
+ ctx.error(
1439
+ f"Failed to register table '{register_table}'",
1440
+ error_message=str(e),
1441
+ )
1442
+ raise RuntimeError(
1443
+ f"Failed to register external table '{register_table}': {e}"
1444
+ ) from e
1445
+
1446
+ if format == "delta":
1447
+ return self._get_last_delta_commit_info(full_path, is_table=False)
1448
+
1449
+ return None
1450
+
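An illustrative Delta upsert (names are assumptions): if the target does not exist the engine falls back to overwrite; otherwise it runs a MERGE keyed on the given columns and returns the latest Delta commit info.

commit = engine.write(
    df, connection=lake_conn, format="delta",
    path="gold/orders", mode="upsert",
    options={"keys": ["order_id"], "optimize_write": True, "zorder_by": "order_id"},
)
# commit -> {"version": ..., "timestamp": ..., "operation": "MERGE", "operation_metrics": {...}, ...}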
1451
+ def _write_streaming(
1452
+ self,
1453
+ df,
1454
+ connection: Any,
1455
+ format: str,
1456
+ table: Optional[str] = None,
1457
+ path: Optional[str] = None,
1458
+ register_table: Optional[str] = None,
1459
+ options: Optional[Dict[str, Any]] = None,
1460
+ streaming_config: Optional[Any] = None,
1461
+ ) -> Dict[str, Any]:
1462
+ """Write streaming DataFrame using Spark Structured Streaming.
1463
+
1464
+ Args:
1465
+ df: Streaming Spark DataFrame
1466
+ connection: Connection object
1467
+ format: Output format (delta, kafka, etc.)
1468
+ table: Table name
1469
+ path: File path
1470
+ register_table: Name to register as external table (if path is used)
1471
+ options: Format-specific options
1472
+ streaming_config: StreamingWriteConfig with streaming parameters
1473
+
1474
+ Returns:
1475
+ Dictionary with streaming query information
1476
+ """
1477
+ ctx = get_logging_context().with_context(engine="spark")
1478
+ start_time = time.time()
1479
+ options = options or {}
1480
+
1481
+ if streaming_config is None:
1482
+ ctx.error("Streaming DataFrame requires streaming_config")
1483
+ raise ValueError(
1484
+ "Streaming DataFrame detected but no streaming_config provided. "
1485
+ "Add a 'streaming' section to your write config with at least "
1486
+ "'checkpoint_location' specified."
1487
+ )
1488
+
1489
+ target_identifier = table or path or "unknown"
1490
+
1491
+ checkpoint_location = streaming_config.checkpoint_location
1492
+ if checkpoint_location and connection:
1493
+ if not checkpoint_location.startswith(
1494
+ ("abfss://", "s3://", "gs://", "dbfs://", "hdfs://", "wasbs://")
1495
+ ):
1496
+ checkpoint_location = connection.get_path(checkpoint_location)
1497
+ ctx.debug(
1498
+ "Resolved checkpoint location through connection",
1499
+ original=streaming_config.checkpoint_location,
1500
+ resolved=checkpoint_location,
1501
+ )
1502
+
1503
+ ctx.debug(
1504
+ "Starting streaming write",
1505
+ format=format,
1506
+ target=target_identifier,
1507
+ output_mode=streaming_config.output_mode,
1508
+ checkpoint=checkpoint_location,
1509
+ )
1510
+
1511
+ writer = df.writeStream.format(format)
1512
+ writer = writer.outputMode(streaming_config.output_mode)
1513
+ writer = writer.option("checkpointLocation", checkpoint_location)
1514
+
1515
+ if streaming_config.query_name:
1516
+ writer = writer.queryName(streaming_config.query_name)
1517
+
1518
+ if streaming_config.trigger:
1519
+ trigger = streaming_config.trigger
1520
+ if trigger.once:
1521
+ writer = writer.trigger(once=True)
1522
+ elif trigger.available_now:
1523
+ writer = writer.trigger(availableNow=True)
1524
+ elif trigger.processing_time:
1525
+ writer = writer.trigger(processingTime=trigger.processing_time)
1526
+ elif trigger.continuous:
1527
+ writer = writer.trigger(continuous=trigger.continuous)
1528
+
1529
+ partition_by = options.pop("partition_by", None) or options.pop("partitionBy", None)
1530
+ if partition_by:
1531
+ if isinstance(partition_by, str):
1532
+ partition_by = [partition_by]
1533
+ writer = writer.partitionBy(*partition_by)
1534
+ ctx.debug(f"Partitioning by: {partition_by}")
1535
+
1536
+ for key, value in options.items():
1537
+ writer = writer.option(key, value)
1538
+
1539
+ try:
1540
+ if table:
1541
+ query = writer.toTable(table)
1542
+ ctx.info(
1543
+ f"Streaming query started: writing to table {table}",
1544
+ query_id=str(query.id),
1545
+ query_name=query.name,
1546
+ )
1547
+ elif path:
1548
+ full_path = connection.get_path(path)
1549
+ query = writer.start(full_path)
1550
+ ctx.info(
1551
+ f"Streaming query started: writing to path {path}",
1552
+ query_id=str(query.id),
1553
+ query_name=query.name,
1554
+ )
1555
+ else:
1556
+ ctx.error("Either path or table must be provided for streaming write")
1557
+ raise ValueError(
1558
+ "Streaming write operation failed: neither 'path' nor 'table' was provided. "
1559
+ "Specify a file path or table name in your streaming configuration."
1560
+ )
1561
+
1562
+ elapsed = (time.time() - start_time) * 1000
1563
+
1564
+ result = {
1565
+ "streaming": True,
1566
+ "query_id": str(query.id),
1567
+ "query_name": query.name,
1568
+ "status": "running",
1569
+ "target": target_identifier,
1570
+ "output_mode": streaming_config.output_mode,
1571
+ "checkpoint_location": streaming_config.checkpoint_location,
1572
+ "elapsed_ms": round(elapsed, 2),
1573
+ }
1574
+
1575
+ should_wait = streaming_config.await_termination
1576
+ if streaming_config.trigger:
1577
+ trigger = streaming_config.trigger
1578
+ if trigger.once or trigger.available_now:
1579
+ should_wait = True
1580
+
1581
+ if should_wait:
1582
+ ctx.info(
1583
+ "Awaiting streaming query termination",
1584
+ timeout_seconds=streaming_config.timeout_seconds,
1585
+ )
1586
+ query.awaitTermination(streaming_config.timeout_seconds)
1587
+ result["status"] = "terminated"
1588
+ elapsed = (time.time() - start_time) * 1000
1589
+ result["elapsed_ms"] = round(elapsed, 2)
1590
+ ctx.info(
1591
+ "Streaming query terminated",
1592
+ query_id=str(query.id),
1593
+ elapsed_ms=round(elapsed, 2),
1594
+ )
1595
+
1596
+ if register_table and path and format == "delta":
1597
+ full_path = connection.get_path(path)
1598
+ try:
1599
+ self.spark.sql(
1600
+ f"CREATE TABLE IF NOT EXISTS {register_table} "
1601
+ f"USING DELTA LOCATION '{full_path}'"
1602
+ )
1603
+ ctx.info(
1604
+ f"Registered external table: {register_table}",
1605
+ path=full_path,
1606
+ )
1607
+ result["registered_table"] = register_table
1608
+ except Exception as reg_err:
1609
+ ctx.warning(
1610
+ f"Failed to register external table '{register_table}'",
1611
+ error=str(reg_err),
1612
+ )
1613
+ else:
1614
+ result["streaming_query"] = query
1615
+ if register_table:
1616
+ ctx.warning(
1617
+                             "register_table is ignored for continuous streaming queries; "
1618
+                             "register the table manually after the query terminates."
1619
+ )
1620
+
1621
+ return result
1622
+
1623
+ except Exception as e:
1624
+ elapsed = (time.time() - start_time) * 1000
1625
+ ctx.error(
1626
+ "Streaming write failed",
1627
+ target=target_identifier,
1628
+ error_type=type(e).__name__,
1629
+ error_message=str(e),
1630
+ elapsed_ms=round(elapsed, 2),
1631
+ )
1632
+ raise
1633
+
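A minimal sketch of the streaming parameters this write path consumes. The real StreamingWriteConfig model is defined elsewhere in the package and is not shown in this diff, so a SimpleNamespace stands in for it; the attribute names mirror exactly what the code above reads and the values are illustrative.

from types import SimpleNamespace

# Stand-in for StreamingWriteConfig; only the attributes read above are set.
streaming_config = SimpleNamespace(
    checkpoint_location="checkpoints/orders_stream",  # relative paths resolve via connection.get_path()
    output_mode="append",
    query_name="orders_stream",
    trigger=SimpleNamespace(once=False, available_now=True, processing_time=None, continuous=None),
    await_termination=False,  # forced to True for once/availableNow triggers
    timeout_seconds=None,     # passed to query.awaitTermination() when waiting
)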
1634
+ def execute_sql(self, sql: str, context: Any = None) -> Any:
1635
+ """Execute SQL query in Spark.
1636
+
1637
+ Args:
1638
+ sql: SQL query string
1639
+ context: Execution context (optional, not used for Spark)
1640
+
1641
+ Returns:
1642
+ Spark DataFrame with query results
1643
+ """
1644
+ ctx = get_logging_context().with_context(engine="spark")
1645
+ start_time = time.time()
1646
+
1647
+ ctx.debug("Executing Spark SQL", query_preview=sql[:200] if len(sql) > 200 else sql)
1648
+
1649
+ try:
1650
+ result = self.spark.sql(sql)
1651
+ elapsed = (time.time() - start_time) * 1000
1652
+ partition_count = result.rdd.getNumPartitions()
1653
+
1654
+ ctx.log_spark_metrics(partition_count=partition_count)
1655
+ ctx.info(
1656
+ "Spark SQL executed",
1657
+ elapsed_ms=round(elapsed, 2),
1658
+ partitions=partition_count,
1659
+ )
1660
+
1661
+ return result
1662
+
1663
+ except Exception as e:
1664
+ elapsed = (time.time() - start_time) * 1000
1665
+ error_type = type(e).__name__
1666
+ clean_message = _extract_spark_error_message(e)
1667
+
1668
+ if "AnalysisException" in error_type:
1669
+ ctx.error(
1670
+ "Spark SQL Analysis Error",
1671
+ error_type=error_type,
1672
+ error_message=clean_message,
1673
+ query_preview=sql[:200] if len(sql) > 200 else sql,
1674
+ elapsed_ms=round(elapsed, 2),
1675
+ )
1676
+ raise TransformError(f"Spark SQL Analysis Error: {clean_message}")
1677
+
1678
+ if "ParseException" in error_type:
1679
+ ctx.error(
1680
+ "Spark SQL Parse Error",
1681
+ error_type=error_type,
1682
+ error_message=clean_message,
1683
+ query_preview=sql[:200] if len(sql) > 200 else sql,
1684
+ elapsed_ms=round(elapsed, 2),
1685
+ )
1686
+ raise TransformError(f"Spark SQL Parse Error: {clean_message}")
1687
+
1688
+ ctx.error(
1689
+ "Spark SQL execution failed",
1690
+ error_type=error_type,
1691
+ error_message=clean_message,
1692
+ elapsed_ms=round(elapsed, 2),
1693
+ )
1694
+ raise TransformError(f"Spark SQL Error: {clean_message}")
1695
+
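A short usage sketch for execute_sql; the engine instance and table name are illustrative, and SparkEngine construction is not part of this diff.

result_df = engine.execute_sql("SELECT order_id, amount FROM orders WHERE amount > 100")
# Analysis and parse failures are re-raised as TransformError with a cleaned-up
# message, e.g. "Spark SQL Analysis Error: ..." or "Spark SQL Parse Error: ..."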
1696
+ def execute_transform(self, *args, **kwargs):
1697
+ raise NotImplementedError(
1698
+ "SparkEngine.execute_transform() will be implemented in Phase 2B. "
1699
+ "See PHASES.md for implementation plan."
1700
+ )
1701
+
1702
+ def execute_operation(self, operation: str, params: Dict[str, Any], df) -> Any:
1703
+ """Execute built-in operation on Spark DataFrame."""
1704
+ ctx = get_logging_context().with_context(engine="spark")
1705
+ params = params or {}
1706
+
1707
+ ctx.debug(f"Executing operation: {operation}", params=list(params.keys()))
1708
+
1709
+ if operation == "pivot":
1710
+ group_by = params.get("group_by", [])
1711
+ pivot_column = params.get("pivot_column")
1712
+ value_column = params.get("value_column")
1713
+ agg_func = params.get("agg_func", "first")
1714
+
1715
+ if not pivot_column or not value_column:
1716
+ ctx.error("Pivot requires 'pivot_column' and 'value_column'")
1717
+ raise ValueError("Pivot requires 'pivot_column' and 'value_column'")
1718
+
1719
+ if isinstance(group_by, str):
1720
+ group_by = [group_by]
1721
+
1722
+ agg_expr = {value_column: agg_func}
1723
+ return df.groupBy(*group_by).pivot(pivot_column).agg(agg_expr)
1724
+
1725
+ elif operation == "drop_duplicates":
1726
+ subset = params.get("subset")
1727
+ if subset:
1728
+ if isinstance(subset, str):
1729
+ subset = [subset]
1730
+ return df.dropDuplicates(subset=subset)
1731
+ return df.dropDuplicates()
1732
+
1733
+ elif operation == "fillna":
1734
+ value = params.get("value")
1735
+ subset = params.get("subset")
1736
+ return df.fillna(value, subset=subset)
1737
+
1738
+ elif operation == "drop":
1739
+ columns = params.get("columns")
1740
+ if not columns:
1741
+ return df
1742
+ if isinstance(columns, str):
1743
+ columns = [columns]
1744
+ return df.drop(*columns)
1745
+
1746
+ elif operation == "rename":
1747
+ columns = params.get("columns")
1748
+ if not columns:
1749
+ return df
1750
+
1751
+ res = df
1752
+ for old_name, new_name in columns.items():
1753
+ res = res.withColumnRenamed(old_name, new_name)
1754
+ return res
1755
+
1756
+ elif operation == "sort":
1757
+ by = params.get("by")
1758
+ ascending = params.get("ascending", True)
1759
+
1760
+ if not by:
1761
+ return df
1762
+
1763
+ if isinstance(by, str):
1764
+ by = [by]
1765
+
1766
+ if not ascending:
1767
+ from pyspark.sql.functions import desc
1768
+
1769
+ sort_cols = [desc(c) for c in by]
1770
+ return df.orderBy(*sort_cols)
1771
+
1772
+ return df.orderBy(*by)
1773
+
1774
+ elif operation == "sample":
1775
+ fraction = params.get("frac", 0.1)
1776
+ seed = params.get("random_state")
1777
+ with_replacement = params.get("replace", False)
1778
+ return df.sample(withReplacement=with_replacement, fraction=fraction, seed=seed)
1779
+
1780
+ else:
1781
+ # Fallback: check if operation is a registered transformer
1782
+ from odibi.context import EngineContext
1783
+ from odibi.registry import FunctionRegistry
1784
+
1785
+ ctx.debug(
1786
+ f"Checking registry for operation: {operation}",
1787
+ registered_functions=list(FunctionRegistry._functions.keys())[:10],
1788
+ has_function=FunctionRegistry.has_function(operation),
1789
+ )
1790
+
1791
+ if FunctionRegistry.has_function(operation):
1792
+ ctx.debug(f"Executing registered transformer as operation: {operation}")
1793
+ func = FunctionRegistry.get_function(operation)
1794
+ param_model = FunctionRegistry.get_param_model(operation)
1795
+
1796
+ # Create EngineContext from current df
1797
+ from odibi.context import SparkContext
1798
+
1799
+ engine_ctx = EngineContext(
1800
+ context=SparkContext(self.spark),
1801
+ df=df,
1802
+ engine=self,
1803
+ engine_type=self.engine_type,
1804
+ )
1805
+
1806
+ # Validate and instantiate params
1807
+ if param_model:
1808
+ validated_params = param_model(**params)
1809
+ result_ctx = func(engine_ctx, validated_params)
1810
+ else:
1811
+ result_ctx = func(engine_ctx, **params)
1812
+
1813
+ return result_ctx.df
1814
+
1815
+ ctx.error(f"Unsupported operation for Spark engine: {operation}")
1816
+ raise ValueError(f"Unsupported operation for Spark engine: {operation}")
1817
+
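Illustrative calls for a few of the built-in operations handled above; engine and df are assumed to already exist, and the column names are made up.

# Pivot: group_by may be a string or a list; agg_func defaults to "first"
pivoted = engine.execute_operation(
    "pivot",
    {"group_by": "region", "pivot_column": "year", "value_column": "sales", "agg_func": "sum"},
    df,
)

# Other operations follow the same (operation, params, df) shape
deduped = engine.execute_operation("drop_duplicates", {"subset": ["order_id"]}, df)
latest_first = engine.execute_operation("sort", {"by": "order_date", "ascending": False}, df)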
1818
+ def count_nulls(self, df, columns: List[str]) -> Dict[str, int]:
1819
+ """Count nulls in specified columns."""
1820
+ from pyspark.sql.functions import col, count, when
1821
+
1822
+ missing = set(columns) - set(df.columns)
1823
+ if missing:
1824
+ raise ValueError(f"Columns not found in DataFrame: {', '.join(missing)}")
1825
+
1826
+ aggs = [count(when(col(c).isNull(), c)).alias(c) for c in columns]
1827
+ result = df.select(*aggs).collect()[0].asDict()
1828
+ return result
1829
+
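A quick example of the count_nulls contract; the column names are illustrative.

null_counts = engine.count_nulls(df, ["customer_id", "email"])
# -> {"customer_id": 0, "email": 42}; unknown columns raise ValueError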
1830
+ def validate_schema(self, df, schema_rules: Dict[str, Any]) -> List[str]:
1831
+ """Validate DataFrame schema."""
1832
+ failures = []
1833
+
1834
+ if "required_columns" in schema_rules:
1835
+ required = schema_rules["required_columns"]
1836
+ missing = set(required) - set(df.columns)
1837
+ if missing:
1838
+ failures.append(f"Missing required columns: {', '.join(missing)}")
1839
+
1840
+ if "types" in schema_rules:
1841
+ type_map = {
1842
+                 "int": ["int", "bigint", "smallint", "tinyint", "integer", "long", "short", "byte"],  # df.dtypes reports simple type names (e.g. "int", "bigint")
1843
+ "float": ["double", "float"],
1844
+ "str": ["string"],
1845
+ "bool": ["boolean"],
1846
+ }
1847
+
1848
+ for col_name, expected_type in schema_rules["types"].items():
1849
+ if col_name not in df.columns:
1850
+ failures.append(f"Column '{col_name}' not found for type validation")
1851
+ continue
1852
+
1853
+ actual_type = dict(df.dtypes)[col_name]
1854
+ expected_dtypes = type_map.get(expected_type, [expected_type])
1855
+
1856
+ if actual_type not in expected_dtypes:
1857
+ failures.append(
1858
+ f"Column '{col_name}' has type '{actual_type}', expected '{expected_type}'"
1859
+ )
1860
+
1861
+ return failures
1862
+
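An example schema_rules dictionary covering the two keys the method understands; the column names are illustrative.

schema_rules = {
    "required_columns": ["order_id", "amount"],
    "types": {"order_id": "str", "amount": "float"},
}
failures = engine.validate_schema(df, schema_rules)
# e.g. ["Column 'amount' has type 'string', expected 'float'"]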
1863
+ def validate_data(self, df, validation_config: Any) -> List[str]:
1864
+ """Validate DataFrame against rules."""
1865
+ from pyspark.sql.functions import col
1866
+
1867
+ ctx = get_logging_context().with_context(engine="spark")
1868
+ failures = []
1869
+
1870
+ if validation_config.not_empty:
1871
+ if df.isEmpty():
1872
+ failures.append("DataFrame is empty")
1873
+
1874
+ if validation_config.no_nulls:
1875
+ null_counts = self.count_nulls(df, validation_config.no_nulls)
1876
+ for col_name, count in null_counts.items():
1877
+ if count > 0:
1878
+ failures.append(f"Column '{col_name}' has {count} null values")
1879
+
1880
+ if validation_config.schema_validation:
1881
+ schema_failures = self.validate_schema(df, validation_config.schema_validation)
1882
+ failures.extend(schema_failures)
1883
+
1884
+ if validation_config.ranges:
1885
+ for col_name, bounds in validation_config.ranges.items():
1886
+ if col_name in df.columns:
1887
+ min_val = bounds.get("min")
1888
+ max_val = bounds.get("max")
1889
+
1890
+ if min_val is not None:
1891
+ count = df.filter(col(col_name) < min_val).count()
1892
+ if count > 0:
1893
+ failures.append(f"Column '{col_name}' has values < {min_val}")
1894
+
1895
+ if max_val is not None:
1896
+ count = df.filter(col(col_name) > max_val).count()
1897
+ if count > 0:
1898
+ failures.append(f"Column '{col_name}' has values > {max_val}")
1899
+ else:
1900
+ failures.append(f"Column '{col_name}' not found for range validation")
1901
+
1902
+ if validation_config.allowed_values:
1903
+ for col_name, allowed in validation_config.allowed_values.items():
1904
+ if col_name in df.columns:
1905
+ count = df.filter(~col(col_name).isin(allowed)).count()
1906
+ if count > 0:
1907
+ failures.append(f"Column '{col_name}' has invalid values")
1908
+ else:
1909
+ failures.append(f"Column '{col_name}' not found for allowed values validation")
1910
+
1911
+ ctx.log_validation_result(
1912
+ passed=len(failures) == 0,
1913
+ rule_name="data_validation",
1914
+ failures=failures if failures else None,
1915
+ )
1916
+
1917
+ return failures
1918
+
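A sketch of the validation settings this method reads. The real validation config model is not shown in this diff, so a SimpleNamespace stands in for it; attribute names mirror what validate_data accesses.

from types import SimpleNamespace

validation_config = SimpleNamespace(
    not_empty=True,
    no_nulls=["order_id"],
    schema_validation={"required_columns": ["order_id", "amount"]},
    ranges={"amount": {"min": 0}},
    allowed_values={"status": ["NEW", "SHIPPED", "CANCELLED"]},
)
failures = engine.validate_data(df, validation_config)  # empty list means all checks passed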
1919
+ def get_sample(self, df, n: int = 10) -> List[Dict[str, Any]]:
1920
+ """Get sample rows as list of dictionaries."""
1921
+ return [row.asDict() for row in df.limit(n).collect()]
1922
+
1923
+ def table_exists(
1924
+ self, connection: Any, table: Optional[str] = None, path: Optional[str] = None
1925
+ ) -> bool:
1926
+ """Check if table or location exists.
1927
+
1928
+ Handles orphan catalog entries where the table is registered but
1929
+ the underlying Delta path no longer exists.
1930
+ """
1931
+ ctx = get_logging_context().with_context(engine="spark")
1932
+
1933
+ if table:
1934
+ try:
1935
+ if not self.spark.catalog.tableExists(table):
1936
+ ctx.debug(f"Table does not exist: {table}")
1937
+ return False
1938
+ # Table exists in catalog - verify it's actually readable
1939
+ # This catches orphan entries where path was deleted
1940
+ self.spark.table(table).limit(0).collect()
1941
+ ctx.debug(f"Table existence check: {table}", exists=True)
1942
+ return True
1943
+ except Exception as e:
1944
+ # Table exists in catalog but underlying data is gone (orphan entry)
1945
+ # This is expected during first-run detection - log at debug level
1946
+ ctx.debug(
1947
+ f"Table {table} exists in catalog but is not accessible (treating as first run)",
1948
+ error_message=str(e),
1949
+ )
1950
+ return False
1951
+ elif path:
1952
+ try:
1953
+ from delta.tables import DeltaTable
1954
+
1955
+ full_path = connection.get_path(path)
1956
+ exists = DeltaTable.isDeltaTable(self.spark, full_path)
1957
+ ctx.debug(f"Delta table existence check: {path}", exists=exists)
1958
+ return exists
1959
+ except ImportError:
1960
+ try:
1961
+ full_path = connection.get_path(path)
1962
+ exists = (
1963
+ self.spark.sparkContext._gateway.jvm.org.apache.hadoop.fs.FileSystem.get(
1964
+ self.spark.sparkContext._jsc.hadoopConfiguration()
1965
+ ).exists(
1966
+ self.spark.sparkContext._gateway.jvm.org.apache.hadoop.fs.Path(
1967
+ full_path
1968
+ )
1969
+ )
1970
+ )
1971
+ ctx.debug(f"Path existence check: {path}", exists=exists)
1972
+ return exists
1973
+ except Exception as e:
1974
+ ctx.warning(f"Path existence check failed: {path}", error_message=str(e))
1975
+ return False
1976
+ except Exception as e:
1977
+ ctx.warning(f"Table existence check failed: {path}", error_message=str(e))
1978
+ return False
1979
+ return False
1980
+
1981
+ def get_table_schema(
1982
+ self,
1983
+ connection: Any,
1984
+ table: Optional[str] = None,
1985
+ path: Optional[str] = None,
1986
+ format: Optional[str] = None,
1987
+ ) -> Optional[Dict[str, str]]:
1988
+ """Get schema of an existing table/file."""
1989
+ ctx = get_logging_context().with_context(engine="spark")
1990
+
1991
+ try:
1992
+ if table:
1993
+ if self.spark.catalog.tableExists(table):
1994
+ schema = self.get_schema(self.spark.table(table))
1995
+ ctx.debug(f"Retrieved schema for table: {table}", columns=len(schema))
1996
+ return schema
1997
+ elif path:
1998
+ full_path = connection.get_path(path)
1999
+ if format == "delta":
2000
+ from delta.tables import DeltaTable
2001
+
2002
+ if DeltaTable.isDeltaTable(self.spark, full_path):
2003
+ schema = self.get_schema(DeltaTable.forPath(self.spark, full_path).toDF())
2004
+ ctx.debug(f"Retrieved Delta schema: {path}", columns=len(schema))
2005
+ return schema
2006
+ elif format == "parquet":
2007
+ schema = self.get_schema(self.spark.read.parquet(full_path))
2008
+ ctx.debug(f"Retrieved Parquet schema: {path}", columns=len(schema))
2009
+ return schema
2010
+ elif format:
2011
+ schema = self.get_schema(self.spark.read.format(format).load(full_path))
2012
+ ctx.debug(f"Retrieved schema: {path}", format=format, columns=len(schema))
2013
+ return schema
2014
+ except Exception as e:
2015
+ ctx.warning(
2016
+ "Failed to get schema",
2017
+ table=table,
2018
+ path=path,
2019
+ error_message=str(e),
2020
+ )
2021
+ return None
2022
+
2023
+ def vacuum_delta(
2024
+ self,
2025
+ connection: Any,
2026
+ path: str,
2027
+ retention_hours: int = 168,
2028
+ ) -> None:
2029
+ """VACUUM a Delta table to remove old files."""
2030
+ ctx = get_logging_context().with_context(engine="spark")
2031
+ start_time = time.time()
2032
+
2033
+ ctx.debug(
2034
+ "Starting Delta VACUUM",
2035
+ path=path,
2036
+ retention_hours=retention_hours,
2037
+ )
2038
+
2039
+ try:
2040
+ from delta.tables import DeltaTable
2041
+ except ImportError:
2042
+ ctx.error("Delta Lake support requires 'delta-spark'")
2043
+ raise ImportError(
2044
+ "Delta Lake support requires 'pip install odibi[spark]' "
2045
+ "with delta-spark. "
2046
+ "See README.md for installation instructions."
2047
+ )
2048
+
2049
+ full_path = connection.get_path(path)
2050
+
2051
+ try:
2052
+ delta_table = DeltaTable.forPath(self.spark, full_path)
2053
+             delta_table.vacuum(retention_hours)  # DeltaTable.vacuum() expects hours, not days
2054
+
2055
+ elapsed = (time.time() - start_time) * 1000
2056
+ ctx.info(
2057
+ "Delta VACUUM completed",
2058
+ path=path,
2059
+ retention_hours=retention_hours,
2060
+ elapsed_ms=round(elapsed, 2),
2061
+ )
2062
+
2063
+ except Exception as e:
2064
+ elapsed = (time.time() - start_time) * 1000
2065
+ ctx.error(
2066
+ "Delta VACUUM failed",
2067
+ path=path,
2068
+ error_type=type(e).__name__,
2069
+ error_message=str(e),
2070
+ elapsed_ms=round(elapsed, 2),
2071
+ )
2072
+ raise
2073
+
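Example call; the connection and path are illustrative. The retention threshold is expressed in hours, matching DeltaTable.vacuum().

engine.vacuum_delta(connection, "silver/orders", retention_hours=168)  # keep 7 days of history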
2074
+ def get_delta_history(
2075
+ self, connection: Any, path: str, limit: Optional[int] = None
2076
+ ) -> List[Dict[str, Any]]:
2077
+ """Get Delta table history."""
2078
+ ctx = get_logging_context().with_context(engine="spark")
2079
+ start_time = time.time()
2080
+
2081
+ ctx.debug("Fetching Delta history", path=path, limit=limit)
2082
+
2083
+ try:
2084
+ from delta.tables import DeltaTable
2085
+ except ImportError:
2086
+ ctx.error("Delta Lake support requires 'delta-spark'")
2087
+ raise ImportError(
2088
+ "Delta Lake support requires 'pip install odibi[spark]' "
2089
+ "with delta-spark. "
2090
+ "See README.md for installation instructions."
2091
+ )
2092
+
2093
+ full_path = connection.get_path(path)
2094
+
2095
+ try:
2096
+ delta_table = DeltaTable.forPath(self.spark, full_path)
2097
+ history_df = delta_table.history(limit) if limit else delta_table.history()
2098
+ history = [row.asDict() for row in history_df.collect()]
2099
+
2100
+ elapsed = (time.time() - start_time) * 1000
2101
+ ctx.info(
2102
+ "Delta history retrieved",
2103
+ path=path,
2104
+ versions_returned=len(history),
2105
+ elapsed_ms=round(elapsed, 2),
2106
+ )
2107
+
2108
+ return history
2109
+
2110
+ except Exception as e:
2111
+ elapsed = (time.time() - start_time) * 1000
2112
+ ctx.error(
2113
+ "Failed to get Delta history",
2114
+ path=path,
2115
+ error_type=type(e).__name__,
2116
+ error_message=str(e),
2117
+ elapsed_ms=round(elapsed, 2),
2118
+ )
2119
+ raise
2120
+
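Example call; the path is illustrative, and the printed fields are standard columns of the Delta history DataFrame.

history = engine.get_delta_history(connection, "silver/orders", limit=5)
for entry in history:
    print(entry["version"], entry["timestamp"], entry["operation"])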
2121
+ def restore_delta(self, connection: Any, path: str, version: int) -> None:
2122
+ """Restore Delta table to a specific version."""
2123
+ ctx = get_logging_context().with_context(engine="spark")
2124
+ start_time = time.time()
2125
+
2126
+ ctx.debug("Restoring Delta table", path=path, version=version)
2127
+
2128
+ try:
2129
+ from delta.tables import DeltaTable
2130
+ except ImportError:
2131
+ ctx.error("Delta Lake support requires 'delta-spark'")
2132
+ raise ImportError(
2133
+ "Delta Lake support requires 'pip install odibi[spark]' "
2134
+ "with delta-spark. "
2135
+ "See README.md for installation instructions."
2136
+ )
2137
+
2138
+ full_path = connection.get_path(path)
2139
+
2140
+ try:
2141
+ delta_table = DeltaTable.forPath(self.spark, full_path)
2142
+ delta_table.restoreToVersion(version)
2143
+
2144
+ elapsed = (time.time() - start_time) * 1000
2145
+ ctx.info(
2146
+ "Delta table restored",
2147
+ path=path,
2148
+ version=version,
2149
+ elapsed_ms=round(elapsed, 2),
2150
+ )
2151
+
2152
+ except Exception as e:
2153
+ elapsed = (time.time() - start_time) * 1000
2154
+ ctx.error(
2155
+ "Delta restore failed",
2156
+ path=path,
2157
+ version=version,
2158
+ error_type=type(e).__name__,
2159
+ error_message=str(e),
2160
+ elapsed_ms=round(elapsed, 2),
2161
+ )
2162
+ raise
2163
+
2164
+ def maintain_table(
2165
+ self,
2166
+ connection: Any,
2167
+ format: str,
2168
+ table: Optional[str] = None,
2169
+ path: Optional[str] = None,
2170
+ config: Optional[Any] = None,
2171
+ ) -> None:
2172
+ """Run table maintenance operations (optimize, vacuum)."""
2173
+ if format != "delta" or not config or not config.enabled:
2174
+ return
2175
+
2176
+ ctx = get_logging_context().with_context(engine="spark")
2177
+ start_time = time.time()
2178
+
2179
+ if table:
2180
+ target = table
2181
+ elif path:
2182
+ full_path = connection.get_path(path)
2183
+ target = f"delta.`{full_path}`"
2184
+ else:
2185
+ return
2186
+
2187
+ ctx.debug("Starting table maintenance", target=target)
2188
+
2189
+ try:
2190
+ ctx.debug(f"Running OPTIMIZE on {target}")
2191
+ self.spark.sql(f"OPTIMIZE {target}")
2192
+
2193
+ retention = config.vacuum_retention_hours
2194
+ if retention is not None and retention > 0:
2195
+ ctx.debug(f"Running VACUUM on {target}", retention_hours=retention)
2196
+ self.spark.sql(f"VACUUM {target} RETAIN {retention} HOURS")
2197
+
2198
+ elapsed = (time.time() - start_time) * 1000
2199
+ ctx.info(
2200
+ "Table maintenance completed",
2201
+ target=target,
2202
+ vacuum_retention_hours=retention,
2203
+ elapsed_ms=round(elapsed, 2),
2204
+ )
2205
+
2206
+ except Exception as e:
2207
+ elapsed = (time.time() - start_time) * 1000
2208
+ ctx.warning(
2209
+ f"Auto-optimize failed for {target}",
2210
+ error_type=type(e).__name__,
2211
+ error_message=str(e),
2212
+ elapsed_ms=round(elapsed, 2),
2213
+ )
2214
+
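A sketch of the maintenance settings this method consults; the real config model is not shown here, so a SimpleNamespace stands in for it and the path is illustrative.

from types import SimpleNamespace

maintenance = SimpleNamespace(enabled=True, vacuum_retention_hours=168)
engine.maintain_table(connection, format="delta", path="silver/orders", config=maintenance)
# Runs OPTIMIZE, then VACUUM ... RETAIN 168 HOURS; failures are logged as warnings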
2215
+ def get_source_files(self, df) -> List[str]:
2216
+ """Get list of source files that generated this DataFrame."""
2217
+ try:
2218
+ return df.inputFiles()
2219
+ except Exception:
2220
+ return []
2221
+
2222
+ def profile_nulls(self, df) -> Dict[str, float]:
2223
+ """Calculate null percentage for each column."""
2224
+ from pyspark.sql.functions import col, mean, when
2225
+
2226
+ aggs = []
2227
+ for c in df.columns:
2228
+ aggs.append(mean(when(col(c).isNull(), 1).otherwise(0)).alias(c))
2229
+
2230
+ if not aggs:
2231
+ return {}
2232
+
2233
+ try:
2234
+ result = df.select(*aggs).collect()[0].asDict()
2235
+ return result
2236
+ except Exception:
2237
+ return {}
2238
+
2239
+ def filter_greater_than(self, df, column: str, value: Any) -> Any:
2240
+ """Filter DataFrame where column > value.
2241
+
2242
+ Automatically casts string columns to timestamp for proper comparison.
2243
+ Tries multiple date formats including Oracle-style (DD-MON-YY).
2244
+ """
2245
+ from pyspark.sql import functions as F
2246
+ from pyspark.sql.types import StringType
2247
+
2248
+ col_type = df.schema[column].dataType
2249
+ if isinstance(col_type, StringType):
2250
+ ts_col = self._parse_string_to_timestamp(F.col(column))
2251
+ return df.filter(ts_col > value)
2252
+ return df.filter(F.col(column) > value)
2253
+
2254
+ def _parse_string_to_timestamp(self, col):
2255
+ """Parse string column to timestamp, trying multiple formats.
2256
+
2257
+ Supports:
2258
+ - ISO format: 2024-04-20 07:11:01
2259
+ - Oracle format: 20-APR-24 07:11:01.0 (handles uppercase months)
2260
+ """
2261
+ from pyspark.sql import functions as F
2262
+
2263
+ result = F.to_timestamp(col)
2264
+
2265
+ result = F.coalesce(result, F.to_timestamp(col, "yyyy-MM-dd HH:mm:ss"))
2266
+ result = F.coalesce(result, F.to_timestamp(col, "yyyy-MM-dd'T'HH:mm:ss"))
2267
+ result = F.coalesce(result, F.to_timestamp(col, "MM/dd/yyyy HH:mm:ss"))
2268
+
2269
+ col_oracle = F.concat(
2270
+ F.substring(col, 1, 3),
2271
+ F.upper(F.substring(col, 4, 1)),
2272
+ F.lower(F.substring(col, 5, 2)),
2273
+ F.substring(col, 7, 100),
2274
+ )
2275
+ result = F.coalesce(result, F.to_timestamp(col_oracle, "dd-MMM-yy HH:mm:ss.S"))
2276
+ result = F.coalesce(result, F.to_timestamp(col_oracle, "dd-MMM-yy HH:mm:ss"))
2277
+
2278
+ return result
2279
+
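Plain-Python illustration of what the Oracle normalization above does to a single sample value (Spark applies the same transformation column-wise):

s = "20-APR-24 07:11:01.0"
normalized = s[0:3] + s[3:4].upper() + s[4:6].lower() + s[6:]
# "20-Apr-24 07:11:01.0" -- now parseable with to_timestamp(col, "dd-MMM-yy HH:mm:ss.S")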
2280
+ def filter_coalesce(self, df, col1: str, col2: str, op: str, value: Any) -> Any:
2281
+ """Filter using COALESCE(col1, col2) op value.
2282
+
2283
+ Automatically casts string columns to timestamp for proper comparison.
2284
+ Tries multiple date formats including Oracle-style (DD-MON-YY).
2285
+ """
2286
+ from pyspark.sql import functions as F
2287
+ from pyspark.sql.types import StringType
2288
+
2289
+ col1_type = df.schema[col1].dataType
2290
+ col2_type = df.schema[col2].dataType
2291
+
2292
+ if isinstance(col1_type, StringType):
2293
+ c1 = self._parse_string_to_timestamp(F.col(col1))
2294
+ else:
2295
+ c1 = F.col(col1)
2296
+
2297
+ if isinstance(col2_type, StringType):
2298
+ c2 = self._parse_string_to_timestamp(F.col(col2))
2299
+ else:
2300
+ c2 = F.col(col2)
2301
+
2302
+ coalesced = F.coalesce(c1, c2)
2303
+
2304
+ if op == ">":
2305
+ return df.filter(coalesced > value)
2306
+ elif op == ">=":
2307
+ return df.filter(coalesced >= value)
2308
+ elif op == "<":
2309
+ return df.filter(coalesced < value)
2310
+ elif op == "<=":
2311
+ return df.filter(coalesced <= value)
2312
+ elif op == "=":
2313
+ return df.filter(coalesced == value)
2314
+ else:
2315
+ return df.filter(f"COALESCE({col1}, {col2}) {op} '{value}'")
2316
+
2317
+ def add_write_metadata(
2318
+ self,
2319
+ df: Any,
2320
+ metadata_config: Any,
2321
+ source_connection: Optional[str] = None,
2322
+ source_table: Optional[str] = None,
2323
+ source_path: Optional[str] = None,
2324
+ is_file_source: bool = False,
2325
+ ) -> Any:
2326
+ """Add metadata columns to DataFrame before writing (Bronze layer lineage).
2327
+
2328
+ Args:
2329
+ df: Spark DataFrame
2330
+ metadata_config: WriteMetadataConfig or True (for all defaults)
2331
+ source_connection: Name of the source connection
2332
+ source_table: Name of the source table (SQL sources)
2333
+ source_path: Path of the source file (file sources)
2334
+ is_file_source: True if source is a file-based read
2335
+
2336
+ Returns:
2337
+ DataFrame with metadata columns added
2338
+ """
2339
+ from pyspark.sql import functions as F
2340
+
2341
+ from odibi.config import WriteMetadataConfig
2342
+
2343
+ if metadata_config is True:
2344
+ config = WriteMetadataConfig()
2345
+ elif isinstance(metadata_config, WriteMetadataConfig):
2346
+ config = metadata_config
2347
+ else:
2348
+ return df
2349
+
2350
+ if config.extracted_at:
2351
+ df = df.withColumn("_extracted_at", F.current_timestamp())
2352
+
2353
+ if config.source_file and is_file_source and source_path:
2354
+ df = df.withColumn("_source_file", F.lit(source_path))
2355
+
2356
+ if config.source_connection and source_connection:
2357
+ df = df.withColumn("_source_connection", F.lit(source_connection))
2358
+
2359
+ if config.source_table and source_table:
2360
+ df = df.withColumn("_source_table", F.lit(source_table))
2361
+
2362
+ return df
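Example call; the names are illustrative, and passing metadata_config=True assumes the default WriteMetadataConfig enables these flags (the model itself is not part of this diff).

df_out = engine.add_write_metadata(
    df,
    metadata_config=True,
    source_connection="azure_sql_prod",
    source_table="dbo.Orders",
)
# Typically adds _extracted_at, _source_connection and _source_table;
# _source_file is only added for file-based sources when source_path is given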