odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2362 @@
"""Spark execution engine (Phase 2B: Delta Lake support).

Status: Phase 2B implemented - Delta Lake read/write, VACUUM, history, restore
"""

import re
import time
from typing import Any, Dict, List, Optional, Tuple

from odibi.enums import EngineType
from odibi.exceptions import TransformError
from odibi.utils.logging_context import get_logging_context

from .base import Engine


def _extract_spark_error_message(error: Exception) -> str:
    """Extract a clean, user-friendly error message from Spark/Py4J exceptions.

    Removes Java stack traces and Py4J noise, keeping only the useful error info.

    Args:
        error: The exception to clean

    Returns:
        Clean error message without Java stack traces
    """
    error_str = str(error)

    # For AnalysisException, extract the error class and message up to SQLSTATE or line info
    # Format: [ERROR_CLASS] message. Did you mean...? SQLSTATE: xxx; line X pos Y;\n'Plan...
    match = re.match(
        r"(\[[\w._]+\])\s*(.+?)(?:\s*SQLSTATE|\s*;\s*line|\n'|\n\tat|$)",
        error_str,
        re.DOTALL,
    )
    if match:
        error_class = match.group(1)
        message = match.group(2).strip().rstrip(".")
        return f"{error_class} {message}"

    # For other Spark errors, try to extract the first meaningful line
    lines = error_str.split("\n")
    for line in lines:
        line = line.strip()
        # Skip Java stack trace lines
        if re.match(r"at (org\.|java\.|scala\.|py4j\.)", line):
            continue
        # Skip empty or noise lines
        if not line or line.startswith("Py4JJavaError") or line == ":":
            continue
        # Return first meaningful line
        if len(line) > 10:
            # Truncate very long messages
            if len(line) > 200:
                return line[:200] + "..."
            return line

    # Fallback: return first 200 chars
    return error_str[:200] + "..." if len(error_str) > 200 else error_str
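

# --- Illustrative usage (added for this review; not part of the package) ---
# A minimal sketch of how the helper above condenses a Spark/Py4J error.
# The failing query and table name are hypothetical.
def _example_clean_spark_error(spark) -> None:
    try:
        spark.sql("SELECT * FROM nonexistent_table")
    except Exception as exc:
        # Prints e.g. "[TABLE_OR_VIEW_NOT_FOUND] ..." instead of a Java stack trace.
        print(_extract_spark_error_message(exc))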


class SparkEngine(Engine):
    """Spark execution engine with PySpark backend.

    Phase 2A: Basic read/write + ADLS multi-account support
    Phase 2B: Delta Lake support
    """

    name = "spark"
    engine_type = EngineType.SPARK

    def __init__(
        self,
        connections: Optional[Dict[str, Any]] = None,
        spark_session: Any = None,
        config: Optional[Dict[str, Any]] = None,
    ):
        """Initialize Spark engine with import guard.

        Args:
            connections: Dictionary of connection objects (for multi-account config)
            spark_session: Existing SparkSession (optional, creates new if None)
            config: Engine configuration (optional)

        Raises:
            ImportError: If pyspark not installed
        """
        ctx = get_logging_context().with_context(engine="spark")
        ctx.debug("Initializing SparkEngine", connections_count=len(connections or {}))

        try:
            from pyspark.sql import SparkSession
        except ImportError as e:
            ctx.error(
                "PySpark not installed",
                error_type="ImportError",
                suggestion="pip install odibi[spark]",
            )
            raise ImportError(
                "Spark support requires 'pip install odibi[spark]'. "
                "See docs/setup_databricks.md for setup instructions."
            ) from e

        start_time = time.time()

        # Configure Delta Lake support
        try:
            from delta import configure_spark_with_delta_pip

            builder = SparkSession.builder.appName("odibi").config(
                "spark.sql.sources.partitionOverwriteMode", "dynamic"
            )

            # Performance Optimizations
            builder = builder.config("spark.sql.execution.arrow.pyspark.enabled", "true")
            builder = builder.config("spark.sql.adaptive.enabled", "true")

            # Reduce Verbosity
            builder = builder.config(
                "spark.driver.extraJavaOptions", "-Dlog4j.rootCategory=ERROR, console"
            )
            builder = builder.config(
                "spark.executor.extraJavaOptions", "-Dlog4j.rootCategory=ERROR, console"
            )

            self.spark = spark_session or configure_spark_with_delta_pip(builder).getOrCreate()
            self.spark.sparkContext.setLogLevel("ERROR")

            ctx.debug("Delta Lake support enabled")

        except ImportError:
            ctx.debug("Delta Lake not available, using standard Spark")
            builder = SparkSession.builder.appName("odibi").config(
                "spark.sql.sources.partitionOverwriteMode", "dynamic"
            )

            # Performance Optimizations
            builder = builder.config("spark.sql.execution.arrow.pyspark.enabled", "true")
            builder = builder.config("spark.sql.adaptive.enabled", "true")

            # Reduce Verbosity
            builder = builder.config(
                "spark.driver.extraJavaOptions", "-Dlog4j.rootCategory=ERROR, console"
            )

            self.spark = spark_session or builder.getOrCreate()
            self.spark.sparkContext.setLogLevel("ERROR")

        self.config = config or {}
        self.connections = connections or {}

        # Configure all ADLS connections upfront
        self._configure_all_connections()

        # Apply user-defined Spark configs from performance settings
        self._apply_spark_config()

        elapsed = (time.time() - start_time) * 1000
        ctx.info(
            "SparkEngine initialized",
            elapsed_ms=round(elapsed, 2),
            app_name=self.spark.sparkContext.appName,
            spark_version=self.spark.version,
            connections_configured=len(self.connections),
            using_existing_session=spark_session is not None,
        )
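
    # --- Illustrative usage (added for this review; not part of the package) ---
    # A minimal sketch of constructing the engine on Databricks, where a
    # SparkSession already exists. The connection objects and config values are
    # hypothetical; `connections` entries are expected to expose configure_spark().
    #
    #   engine = SparkEngine(
    #       connections={"adls_main": adls_connection},
    #       spark_session=spark,  # reuse the existing session
    #       config={"performance": {"spark_config": {"spark.sql.shuffle.partitions": "64"}}},
    #   )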

    def _configure_all_connections(self) -> None:
        """Configure Spark with all ADLS connection credentials.

        This sets all storage account keys upfront so Spark can access
        multiple accounts. Keys are scoped by account name, so no conflicts.
        """
        ctx = get_logging_context().with_context(engine="spark")

        for conn_name, connection in self.connections.items():
            if hasattr(connection, "configure_spark"):
                ctx.log_connection(
                    connection_type=type(connection).__name__,
                    connection_name=conn_name,
                    action="configure_spark",
                )
                try:
                    connection.configure_spark(self.spark)
                    ctx.debug(f"Configured ADLS connection: {conn_name}")
                except Exception as e:
                    ctx.error(
                        f"Failed to configure ADLS connection: {conn_name}",
                        error_type=type(e).__name__,
                        error_message=str(e),
                    )
                    raise

    def _apply_spark_config(self) -> None:
        """Apply user-defined Spark configurations from performance settings.

        Applies configs via spark.conf.set() for runtime-settable options.
        For existing sessions (e.g., Databricks), only modifiable configs take effect.
        """
        ctx = get_logging_context().with_context(engine="spark")

        performance = self.config.get("performance", {})
        spark_config = performance.get("spark_config", {})

        if not spark_config:
            return

        ctx.debug("Applying Spark configuration", config_count=len(spark_config))

        for key, value in spark_config.items():
            try:
                self.spark.conf.set(key, value)
                ctx.debug(
                    f"Applied Spark config: {key}={value}", config_key=key, config_value=value
                )
            except Exception as e:
                ctx.warning(
                    f"Failed to set Spark config '{key}'",
                    config_key=key,
                    error_message=str(e),
                    suggestion="This config may require session restart",
                )
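
    # --- Illustrative config shape (added for this review; not part of the package) ---
    # _apply_spark_config() reads performance.spark_config from the engine config.
    # A sketch of the expected dict shape; the keys and values below are examples only:
    #
    #   config = {
    #       "performance": {
    #           "spark_config": {
    #               "spark.sql.shuffle.partitions": "200",
    #               "spark.databricks.delta.optimizeWrite.enabled": "true",
    #           }
    #       }
    #   }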

    def _apply_table_properties(
        self, target: str, properties: Dict[str, str], is_table: bool = False
    ) -> None:
        """Apply table properties to a Delta table.

        Performance: Batches all properties into a single ALTER TABLE statement
        to avoid multiple round-trips to the catalog.
        """
        if not properties:
            return

        ctx = get_logging_context().with_context(engine="spark")

        try:
            table_ref = target if is_table else f"delta.`{target}`"
            ctx.debug(
                f"Applying table properties to {target}",
                properties_count=len(properties),
                is_table=is_table,
            )

            props_list = [f"'{k}' = '{v}'" for k, v in properties.items()]
            props_str = ", ".join(props_list)
            sql = f"ALTER TABLE {table_ref} SET TBLPROPERTIES ({props_str})"
            self.spark.sql(sql)
            ctx.debug(f"Set {len(properties)} table properties in single statement")

        except Exception as e:
            ctx.warning(
                f"Failed to set table properties on {target}",
                error_type=type(e).__name__,
                error_message=str(e),
            )

    def _optimize_delta_write(
        self, target: str, options: Dict[str, Any], is_table: bool = False
    ) -> None:
        """Run Delta Lake optimization (OPTIMIZE / ZORDER)."""
        should_optimize = options.get("optimize_write", False)
        zorder_by = options.get("zorder_by")

        if not should_optimize and not zorder_by:
            return

        ctx = get_logging_context().with_context(engine="spark")
        start_time = time.time()

        try:
            if is_table:
                sql = f"OPTIMIZE {target}"
            else:
                sql = f"OPTIMIZE delta.`{target}`"

            if zorder_by:
                if isinstance(zorder_by, str):
                    zorder_by = [zorder_by]
                cols = ", ".join(zorder_by)
                sql += f" ZORDER BY ({cols})"

            ctx.debug("Running Delta optimization", sql=sql, target=target)
            self.spark.sql(sql)

            elapsed = (time.time() - start_time) * 1000
            ctx.info(
                "Delta optimization completed",
                target=target,
                zorder_by=zorder_by,
                elapsed_ms=round(elapsed, 2),
            )

        except Exception as e:
            elapsed = (time.time() - start_time) * 1000
            ctx.warning(
                f"Optimization failed for {target}",
                error_type=type(e).__name__,
                error_message=str(e),
                elapsed_ms=round(elapsed, 2),
            )
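
    # --- Illustrative options (added for this review; not part of the package) ---
    # _optimize_delta_write() is driven by write options. A sketch of an options
    # dict that would trigger OPTIMIZE ... ZORDER BY (column names are hypothetical):
    #
    #   options = {"optimize_write": True, "zorder_by": ["site_id", "event_date"]}
    #   # -> OPTIMIZE delta.`<target>` ZORDER BY (site_id, event_date)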

    def _get_last_delta_commit_info(
        self, target: str, is_table: bool = False
    ) -> Optional[Dict[str, Any]]:
        """Get metadata for the most recent Delta commit."""
        ctx = get_logging_context().with_context(engine="spark")

        try:
            from delta.tables import DeltaTable

            if is_table:
                dt = DeltaTable.forName(self.spark, target)
            else:
                dt = DeltaTable.forPath(self.spark, target)

            last_commit = dt.history(1).collect()[0]

            def safe_get(row, field):
                if hasattr(row, field):
                    return getattr(row, field)
                if hasattr(row, "__getitem__"):
                    try:
                        return row[field]
                    except (KeyError, ValueError):
                        return None
                return None

            commit_info = {
                "version": safe_get(last_commit, "version"),
                "timestamp": safe_get(last_commit, "timestamp"),
                "operation": safe_get(last_commit, "operation"),
                "operation_metrics": safe_get(last_commit, "operationMetrics"),
                "read_version": safe_get(last_commit, "readVersion"),
            }

            ctx.debug(
                "Delta commit metadata retrieved",
                target=target,
                version=commit_info.get("version"),
                operation=commit_info.get("operation"),
            )

            return commit_info

        except Exception as e:
            ctx.warning(
                f"Failed to fetch Delta commit info for {target}",
                error_type=type(e).__name__,
                error_message=str(e),
            )
            return None

    def harmonize_schema(self, df, target_schema: Dict[str, str], policy: Any):
        """Harmonize DataFrame schema with target schema according to policy."""
        from pyspark.sql.functions import col, lit

        from odibi.config import OnMissingColumns, OnNewColumns, SchemaMode

        ctx = get_logging_context().with_context(engine="spark")

        target_cols = list(target_schema.keys())
        current_cols = df.columns

        missing = set(target_cols) - set(current_cols)
        new_cols = set(current_cols) - set(target_cols)

        ctx.debug(
            "Schema harmonization",
            target_columns=len(target_cols),
            current_columns=len(current_cols),
            missing_columns=list(missing) if missing else None,
            new_columns=list(new_cols) if new_cols else None,
            policy_mode=policy.mode.value if hasattr(policy.mode, "value") else str(policy.mode),
        )

        # Check Validations
        if missing and policy.on_missing_columns == OnMissingColumns.FAIL:
            ctx.error(
                f"Schema Policy Violation: Missing columns {missing}",
                missing_columns=list(missing),
            )
            raise ValueError(f"Schema Policy Violation: Missing columns {missing}")

        if new_cols and policy.on_new_columns == OnNewColumns.FAIL:
            ctx.error(
                f"Schema Policy Violation: New columns {new_cols}",
                new_columns=list(new_cols),
            )
            raise ValueError(f"Schema Policy Violation: New columns {new_cols}")

        # Apply Transformations
        if policy.mode == SchemaMode.EVOLVE and policy.on_new_columns == OnNewColumns.ADD_NULLABLE:
            res = df
            for c in missing:
                res = res.withColumn(c, lit(None))
            ctx.debug("Schema evolved: added missing columns as null")
            return res
        else:
            select_exprs = []
            for c in target_cols:
                if c in current_cols:
                    select_exprs.append(col(c))
                else:
                    select_exprs.append(lit(None).alias(c))

            ctx.debug("Schema enforced: projected to target schema")
            return df.select(*select_exprs)
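
    # --- Illustrative behavior (added for this review; not part of the package) ---
    # Sketch, with hypothetical column names: given
    #   target_schema = {"id": "int", "name": "string", "country": "string"}
    # and a DataFrame with columns ["id", "name", "extra"], harmonize_schema()
    # either adds "country" as null (EVOLVE + ADD_NULLABLE, keeping "extra") or
    # projects to exactly the target columns, dropping "extra" (enforce path).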

    def anonymize(self, df, columns: List[str], method: str, salt: Optional[str] = None):
        """Anonymize columns using Spark functions."""
        from pyspark.sql.functions import col, concat, lit, regexp_replace, sha2

        ctx = get_logging_context().with_context(engine="spark")
        ctx.debug(
            "Anonymizing columns",
            columns=columns,
            method=method,
            has_salt=salt is not None,
        )

        res = df
        for c in columns:
            if c not in df.columns:
                ctx.warning(f"Column '{c}' not found for anonymization, skipping", column=c)
                continue

            if method == "hash":
                if salt:
                    res = res.withColumn(c, sha2(concat(col(c), lit(salt)), 256))
                else:
                    res = res.withColumn(c, sha2(col(c), 256))

            elif method == "mask":
                res = res.withColumn(c, regexp_replace(col(c), ".(?=.{4})", "*"))

            elif method == "redact":
                res = res.withColumn(c, lit("[REDACTED]"))

        ctx.debug(f"Anonymization completed using {method}")
        return res
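
    # --- Illustrative usage (added for this review; not part of the package) ---
    # A sketch of salted hashing of PII columns; column names and salt are hypothetical.
    #
    #   masked = engine.anonymize(df, columns=["email", "ssn"], method="hash", salt="s3cr3t")
    #   # "mask" keeps only the last 4 characters visible; "redact" replaces values
    #   # with the literal "[REDACTED]".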

    def get_schema(self, df) -> Dict[str, str]:
        """Get DataFrame schema with types."""
        return {f.name: f.dataType.simpleString() for f in df.schema}

    def get_shape(self, df) -> Tuple[int, int]:
        """Get DataFrame shape as (rows, columns)."""
        return (df.count(), len(df.columns))

    def count_rows(self, df) -> int:
        """Count rows in DataFrame."""
        return df.count()

    def read(
        self,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        streaming: bool = False,
        schema: Optional[str] = None,
        options: Optional[Dict[str, Any]] = None,
        as_of_version: Optional[int] = None,
        as_of_timestamp: Optional[str] = None,
    ) -> Any:
        """Read data using Spark.

        Args:
            connection: Connection object (with get_path method)
            format: Data format (csv, parquet, json, delta, sql_server)
            table: Table name
            path: File path
            streaming: Whether to read as a stream (readStream)
            schema: Schema string in DDL format (required for streaming file sources)
            options: Format-specific options (including versionAsOf for Delta time travel)
            as_of_version: Time travel version
            as_of_timestamp: Time travel timestamp

        Returns:
            Spark DataFrame (or Streaming DataFrame)
        """
        ctx = get_logging_context().with_context(engine="spark")
        start_time = time.time()
        options = options or {}

        source_identifier = table or path or "unknown"
        ctx.debug(
            "Starting Spark read",
            format=format,
            source=source_identifier,
            streaming=streaming,
            as_of_version=as_of_version,
            as_of_timestamp=as_of_timestamp,
        )

        # Handle Time Travel options
        if as_of_version is not None:
            options["versionAsOf"] = as_of_version
            ctx.debug(f"Time travel enabled: version {as_of_version}")
        if as_of_timestamp is not None:
            options["timestampAsOf"] = as_of_timestamp
            ctx.debug(f"Time travel enabled: timestamp {as_of_timestamp}")

        # SQL Server / Azure SQL Support
        if format in ["sql", "sql_server", "azure_sql"]:
            if streaming:
                ctx.error("Streaming not supported for SQL Server / Azure SQL")
                raise ValueError("Streaming not supported for SQL Server / Azure SQL yet.")

            if not hasattr(connection, "get_spark_options"):
                conn_type = type(connection).__name__
                msg = f"Connection type '{conn_type}' does not support Spark SQL read"
                ctx.error(msg, connection_type=conn_type)
                raise ValueError(msg)

            jdbc_options = connection.get_spark_options()
            merged_options = {**jdbc_options, **options}

            # Extract filter for SQL pushdown
            sql_filter = merged_options.pop("filter", None)

            if "query" in merged_options:
                merged_options.pop("dbtable", None)
                # If filter provided with query, append to WHERE clause
                if sql_filter:
                    existing_query = merged_options["query"]
                    # Wrap existing query and add filter
                    if "WHERE" in existing_query.upper():
                        merged_options["query"] = f"({existing_query}) AND ({sql_filter})"
                    else:
                        subquery = f"SELECT * FROM ({existing_query}) AS _subq WHERE {sql_filter}"
                        merged_options["query"] = subquery
                    ctx.debug(f"Applied SQL pushdown filter to query: {sql_filter}")
            elif table:
                # Build query with filter pushdown instead of using dbtable
                if sql_filter:
                    merged_options.pop("dbtable", None)
                    merged_options["query"] = f"SELECT * FROM {table} WHERE {sql_filter}"
                    ctx.debug(f"Applied SQL pushdown filter: {sql_filter}")
                else:
                    merged_options["dbtable"] = table
            elif "dbtable" not in merged_options:
                ctx.error("SQL format requires 'table' config or 'query' option")
                raise ValueError("SQL format requires 'table' config or 'query' option")

            ctx.debug("Executing JDBC read", has_query="query" in merged_options)

            try:
                df = self.spark.read.format("jdbc").options(**merged_options).load()
                elapsed = (time.time() - start_time) * 1000
                partition_count = df.rdd.getNumPartitions()

                ctx.log_file_io(path=source_identifier, format=format, mode="read")
                ctx.log_spark_metrics(partition_count=partition_count)
                ctx.info(
                    "JDBC read completed",
                    source=source_identifier,
                    elapsed_ms=round(elapsed, 2),
                    partitions=partition_count,
                )
                return df

            except Exception as e:
                elapsed = (time.time() - start_time) * 1000
                ctx.error(
                    "JDBC read failed",
                    source=source_identifier,
                    error_type=type(e).__name__,
                    error_message=str(e),
                    elapsed_ms=round(elapsed, 2),
                )
                raise

        # Read based on format
        if table:
            # Managed/External Table (Catalog)
            ctx.debug(f"Reading from catalog table: {table}")

            if streaming:
                reader = self.spark.readStream.format(format)
            else:
                reader = self.spark.read.format(format)

            for key, value in options.items():
                reader = reader.option(key, value)

            try:
                df = reader.table(table)

                if "filter" in options:
                    df = df.filter(options["filter"])
                    ctx.debug(f"Applied filter: {options['filter']}")

                elapsed = (time.time() - start_time) * 1000

                if not streaming:
                    partition_count = df.rdd.getNumPartitions()
                    ctx.log_spark_metrics(partition_count=partition_count)
                    ctx.log_file_io(path=table, format=format, mode="read")
                    ctx.info(
                        f"Table read completed: {table}",
                        elapsed_ms=round(elapsed, 2),
                        partitions=partition_count,
                    )
                else:
                    ctx.info(f"Streaming read started: {table}", elapsed_ms=round(elapsed, 2))

                return df

            except Exception as e:
                elapsed = (time.time() - start_time) * 1000
                ctx.error(
                    f"Table read failed: {table}",
                    error_type=type(e).__name__,
                    error_message=str(e),
                    elapsed_ms=round(elapsed, 2),
                )
                raise

        elif path:
            # File Path
            full_path = connection.get_path(path)
            ctx.debug(f"Reading from path: {full_path}")

            # Auto-detect encoding for CSV (Batch only)
            if not streaming and format == "csv" and options.get("auto_encoding"):
                options = options.copy()
                options.pop("auto_encoding")

                if "encoding" not in options:
                    try:
                        from odibi.utils.encoding import detect_encoding

                        detected = detect_encoding(connection, path)
                        if detected:
                            options["encoding"] = detected
                            ctx.debug(f"Detected encoding: {detected}", path=path)
                    except ImportError:
                        pass
                    except Exception as e:
                        ctx.warning(
                            f"Encoding detection failed for {path}",
                            error_message=str(e),
                        )

            if streaming:
                reader = self.spark.readStream.format(format)
                if schema:
                    reader = reader.schema(schema)
                    ctx.debug(f"Applied schema for streaming read: {schema[:100]}...")
                else:
                    # Determine if we should warn about missing schema
                    # Formats that can infer schema: delta, parquet, avro (embedded schema)
                    # cloudFiles with schemaLocation or self-describing formats (avro, parquet) are fine
                    should_warn = True

                    if format in ["delta", "parquet"]:
                        should_warn = False
                    elif format == "cloudFiles":
                        cloud_format = options.get("cloudFiles.format", "")
                        has_schema_location = "cloudFiles.schemaLocation" in options
                        # avro and parquet have embedded schemas
                        if cloud_format in ["avro", "parquet"] or has_schema_location:
                            should_warn = False

                    if should_warn:
                        ctx.warning(
                            f"Streaming read from '{format}' format without schema. "
                            "Schema inference is not supported for streaming sources. "
                            "Consider adding 'schema' to your read config."
                        )
            else:
                reader = self.spark.read.format(format)
                if schema:
                    reader = reader.schema(schema)

            for key, value in options.items():
                if key == "header" and isinstance(value, bool):
                    value = str(value).lower()
                reader = reader.option(key, value)

            try:
                df = reader.load(full_path)

                if "filter" in options:
                    df = df.filter(options["filter"])
                    ctx.debug(f"Applied filter: {options['filter']}")

                elapsed = (time.time() - start_time) * 1000

                if not streaming:
                    partition_count = df.rdd.getNumPartitions()
                    ctx.log_spark_metrics(partition_count=partition_count)
                    ctx.log_file_io(path=path, format=format, mode="read")
                    ctx.info(
                        f"File read completed: {path}",
                        elapsed_ms=round(elapsed, 2),
                        partitions=partition_count,
                        format=format,
                    )
                else:
                    ctx.info(f"Streaming read started: {path}", elapsed_ms=round(elapsed, 2))

                return df

            except Exception as e:
                elapsed = (time.time() - start_time) * 1000
                ctx.error(
                    f"File read failed: {path}",
                    error_type=type(e).__name__,
                    error_message=str(e),
                    elapsed_ms=round(elapsed, 2),
                    format=format,
                )
                raise
        else:
            ctx.error("Either path or table must be provided")
            raise ValueError("Either path or table must be provided")

    def write(
        self,
        df: Any,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        register_table: Optional[str] = None,
        mode: str = "overwrite",
        options: Optional[Dict[str, Any]] = None,
        streaming_config: Optional[Any] = None,
    ) -> Optional[Dict[str, Any]]:
        """Write data using Spark.

        Args:
            df: Spark DataFrame to write
            connection: Connection object
            format: Output format (csv, parquet, json, delta)
            table: Table name
            path: File path
            register_table: Name to register as external table (if path is used)
            mode: Write mode (overwrite, append, error, ignore, upsert, append_once)
            options: Format-specific options (including partition_by for partitioning)
            streaming_config: StreamingWriteConfig for streaming DataFrames

        Returns:
            Optional dictionary containing Delta commit metadata (if format=delta),
            or streaming query info (if streaming)
        """
        ctx = get_logging_context().with_context(engine="spark")
        start_time = time.time()
        options = options or {}

        if getattr(df, "isStreaming", False) is True:
            return self._write_streaming(
                df=df,
                connection=connection,
                format=format,
                table=table,
                path=path,
                register_table=register_table,
                options=options,
                streaming_config=streaming_config,
            )

        target_identifier = table or path or "unknown"
        try:
            partition_count = df.rdd.getNumPartitions()
        except Exception:
            partition_count = 1  # Fallback for mocks or unsupported DataFrames

        # Auto-coalesce DataFrames for Delta writes to reduce file overhead
        # Use coalesce_partitions option to explicitly set target partitions
        # NOTE: We avoid df.count() here as it would trigger double-evaluation of lazy DataFrames
        coalesce_partitions = options.pop("coalesce_partitions", None)
        if (
            coalesce_partitions
            and isinstance(partition_count, int)
            and partition_count > coalesce_partitions
        ):
            df = df.coalesce(coalesce_partitions)
            ctx.debug(
                f"Coalesced DataFrame to {coalesce_partitions} partition(s)",
                original_partitions=partition_count,
            )
            partition_count = coalesce_partitions

        ctx.debug(
            "Starting Spark write",
            format=format,
            target=target_identifier,
            mode=mode,
            partitions=partition_count,
        )

        # SQL Server / Azure SQL Support
        if format in ["sql", "sql_server", "azure_sql"]:
            if not hasattr(connection, "get_spark_options"):
                conn_type = type(connection).__name__
                msg = f"Connection type '{conn_type}' does not support Spark SQL write"
                ctx.error(msg, connection_type=conn_type)
                raise ValueError(msg)

            jdbc_options = connection.get_spark_options()
            merged_options = {**jdbc_options, **options}

            if table:
                merged_options["dbtable"] = table
            elif "dbtable" not in merged_options:
                ctx.error("SQL format requires 'table' config or 'dbtable' option")
                raise ValueError("SQL format requires 'table' config or 'dbtable' option")

            # Handle MERGE mode for SQL Server
            if mode == "merge":
                merge_keys = options.get("merge_keys")
                merge_options = options.get("merge_options")

                if not merge_keys:
                    ctx.error("MERGE mode requires 'merge_keys' in options")
                    raise ValueError(
                        "MERGE mode requires 'merge_keys' in options. "
                        "Specify the key columns for the MERGE ON clause."
                    )

                from odibi.writers.sql_server_writer import SqlServerMergeWriter

                writer = SqlServerMergeWriter(connection)
                ctx.debug(
                    "Executing SQL Server MERGE",
                    target=table,
                    merge_keys=merge_keys,
                )

                try:
                    result = writer.merge(
                        df=df,
                        spark_engine=self,
                        target_table=table,
                        merge_keys=merge_keys,
                        options=merge_options,
                        jdbc_options=jdbc_options,
                    )
                    elapsed = (time.time() - start_time) * 1000
                    ctx.log_file_io(path=target_identifier, format=format, mode="write")
                    ctx.info(
                        "SQL Server MERGE completed",
                        target=target_identifier,
                        mode=mode,
                        inserted=result.inserted,
                        updated=result.updated,
                        deleted=result.deleted,
                        elapsed_ms=round(elapsed, 2),
                    )
                    return {
                        "mode": "merge",
                        "inserted": result.inserted,
                        "updated": result.updated,
                        "deleted": result.deleted,
                        "total_affected": result.total_affected,
                    }

                except Exception as e:
                    elapsed = (time.time() - start_time) * 1000
                    ctx.error(
                        "SQL Server MERGE failed",
                        target=target_identifier,
                        error_type=type(e).__name__,
                        error_message=str(e),
                        elapsed_ms=round(elapsed, 2),
                    )
                    raise

            # Handle enhanced overwrite with strategies
            if mode == "overwrite" and options.get("overwrite_options"):
                from odibi.writers.sql_server_writer import SqlServerMergeWriter

                overwrite_options = options.get("overwrite_options")
                writer = SqlServerMergeWriter(connection)

                ctx.debug(
                    "Executing SQL Server enhanced overwrite",
                    target=table,
                    strategy=(
                        overwrite_options.strategy.value
                        if hasattr(overwrite_options, "strategy")
                        else "truncate_insert"
                    ),
                )

                try:
                    result = writer.overwrite_spark(
                        df=df,
                        target_table=table,
                        options=overwrite_options,
                        jdbc_options=jdbc_options,
                    )
                    elapsed = (time.time() - start_time) * 1000
                    ctx.log_file_io(path=target_identifier, format=format, mode="write")
                    ctx.info(
                        "SQL Server enhanced overwrite completed",
                        target=target_identifier,
                        strategy=result.strategy,
                        rows_written=result.rows_written,
                        elapsed_ms=round(elapsed, 2),
                    )
                    return {
                        "mode": "overwrite",
                        "strategy": result.strategy,
                        "rows_written": result.rows_written,
                    }

                except Exception as e:
                    elapsed = (time.time() - start_time) * 1000
                    ctx.error(
                        "SQL Server enhanced overwrite failed",
                        target=target_identifier,
                        error_type=type(e).__name__,
                        error_message=str(e),
                        elapsed_ms=round(elapsed, 2),
                    )
                    raise

            if mode not in ["overwrite", "append", "ignore", "error"]:
                if mode == "fail":
                    mode = "error"
                else:
                    ctx.error(f"Write mode '{mode}' not supported for Spark SQL write")
                    raise ValueError(f"Write mode '{mode}' not supported for Spark SQL write")

            ctx.debug("Executing JDBC write", target=table or merged_options.get("dbtable"))

            try:
                df.write.format("jdbc").options(**merged_options).mode(mode).save()
                elapsed = (time.time() - start_time) * 1000
                ctx.log_file_io(path=target_identifier, format=format, mode="write")
                ctx.info(
                    "JDBC write completed",
                    target=target_identifier,
                    mode=mode,
                    elapsed_ms=round(elapsed, 2),
                )
                return None

            except Exception as e:
                elapsed = (time.time() - start_time) * 1000
                ctx.error(
                    "JDBC write failed",
                    target=target_identifier,
                    error_type=type(e).__name__,
                    error_message=str(e),
                    elapsed_ms=round(elapsed, 2),
                )
                raise

        # Handle Upsert/AppendOnce (Delta Only)
        if mode in ["upsert", "append_once"]:
            if format != "delta":
                ctx.error(f"Mode '{mode}' only supported for Delta format")
                raise NotImplementedError(
                    f"Mode '{mode}' only supported for Delta format in Spark engine."
                )

            keys = options.get("keys")
            if not keys:
                ctx.error(f"Mode '{mode}' requires 'keys' list in options")
                raise ValueError(f"Mode '{mode}' requires 'keys' list in options")

            if isinstance(keys, str):
                keys = [keys]

            exists = self.table_exists(connection, table, path)
            ctx.debug("Table existence check for merge", target=target_identifier, exists=exists)

            if not exists:
                mode = "overwrite"
                ctx.debug("Target does not exist, falling back to overwrite mode")
            else:
                from delta.tables import DeltaTable

                target_dt = None
                target_name = ""
                is_table_target = False

                if table:
                    target_dt = DeltaTable.forName(self.spark, table)
                    target_name = table
                    is_table_target = True
                elif path:
                    full_path = connection.get_path(path)
                    target_dt = DeltaTable.forPath(self.spark, full_path)
                    target_name = full_path
                    is_table_target = False

                condition = " AND ".join([f"target.`{k}` = source.`{k}`" for k in keys])
                ctx.debug("Executing Delta merge", merge_mode=mode, keys=keys, condition=condition)

                merge_builder = target_dt.alias("target").merge(df.alias("source"), condition)

                try:
                    if mode == "upsert":
                        merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
                    elif mode == "append_once":
                        merge_builder.whenNotMatchedInsertAll().execute()

                    elapsed = (time.time() - start_time) * 1000
                    ctx.info(
                        "Delta merge completed",
                        target=target_name,
                        mode=mode,
                        elapsed_ms=round(elapsed, 2),
                    )

                    self._optimize_delta_write(target_name, options, is_table=is_table_target)
                    commit_info = self._get_last_delta_commit_info(
                        target_name, is_table=is_table_target
                    )

                    if commit_info:
                        ctx.debug(
                            "Delta commit info",
                            version=commit_info.get("version"),
                            operation=commit_info.get("operation"),
                        )

                    return commit_info

                except Exception as e:
                    elapsed = (time.time() - start_time) * 1000
                    ctx.error(
                        "Delta merge failed",
                        target=target_name,
                        error_type=type(e).__name__,
                        error_message=str(e),
                        elapsed_ms=round(elapsed, 2),
                    )
                    raise
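
        # --- Illustrative usage (added for this review; not part of the package) ---
        # A sketch of the options that drive the Delta upsert path above; the table
        # and key column names are hypothetical.
        #
        #   engine.write(df, connection, format="delta", table="silver.orders",
        #                mode="upsert", options={"keys": ["order_id"]})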
|
|
1038
|
+
|
|
1039
|
+
# Get output location
|
|
1040
|
+
if table:
|
|
1041
|
+
# Managed/External Table (Catalog)
|
|
1042
|
+
ctx.debug(f"Writing to catalog table: {table}")
|
|
1043
|
+
writer = df.write.format(format).mode(mode)
|
|
1044
|
+
|
|
1045
|
+
partition_by = options.get("partition_by")
|
|
1046
|
+
if partition_by:
|
|
1047
|
+
if isinstance(partition_by, str):
|
|
1048
|
+
partition_by = [partition_by]
|
|
1049
|
+
writer = writer.partitionBy(*partition_by)
|
|
1050
|
+
ctx.debug(f"Partitioning by: {partition_by}")
|
|
1051
|
+
|
|
1052
|
+
for key, value in options.items():
|
|
1053
|
+
writer = writer.option(key, value)
|
|
1054
|
+
|
|
1055
|
+
try:
|
|
1056
|
+
writer.saveAsTable(table)
|
|
1057
|
+
elapsed = (time.time() - start_time) * 1000
|
|
1058
|
+
|
|
1059
|
+
ctx.log_file_io(
|
|
1060
|
+
path=table,
|
|
1061
|
+
format=format,
|
|
1062
|
+
mode=mode,
|
|
1063
|
+
partitions=partition_by,
|
|
1064
|
+
)
|
|
1065
|
+
ctx.info(
|
|
1066
|
+
f"Table write completed: {table}",
|
|
1067
|
+
mode=mode,
|
|
1068
|
+
elapsed_ms=round(elapsed, 2),
|
|
1069
|
+
)
|
|
1070
|
+
|
|
1071
|
+
if format == "delta":
|
|
1072
|
+
self._optimize_delta_write(table, options, is_table=True)
|
|
1073
|
+
return self._get_last_delta_commit_info(table, is_table=True)
|
|
1074
|
+
return None
|
|
1075
|
+
|
|
1076
|
+
except Exception as e:
|
|
1077
|
+
elapsed = (time.time() - start_time) * 1000
|
|
1078
|
+
ctx.error(
|
|
1079
|
+
f"Table write failed: {table}",
|
|
1080
|
+
error_type=type(e).__name__,
|
|
1081
|
+
error_message=str(e),
|
|
1082
|
+
elapsed_ms=round(elapsed, 2),
|
|
1083
|
+
)
|
|
1084
|
+
raise
|
|
1085
|
+
|
|
1086
|
+
elif path:
|
|
1087
|
+
full_path = connection.get_path(path)
|
|
1088
|
+
else:
|
|
1089
|
+
ctx.error("Either path or table must be provided")
|
|
1090
|
+
raise ValueError("Either path or table must be provided")
|
|
1091
|
+
|
|
1092
|
+
# Extract partition_by option
|
|
1093
|
+
partition_by = options.pop("partition_by", None) or options.pop("partitionBy", None)
|
|
1094
|
+
|
|
1095
|
+
# Extract cluster_by option (Liquid Clustering)
|
|
1096
|
+
cluster_by = options.pop("cluster_by", None)
|
|
1097
|
+
|
|
1098
|
+
# Warn about partitioning anti-patterns
|
|
1099
|
+
if partition_by and cluster_by:
|
|
1100
|
+
import warnings
|
|
1101
|
+
|
|
1102
|
+
ctx.warning(
|
|
1103
|
+
"Conflict: Both 'partition_by' and 'cluster_by' are set",
|
|
1104
|
+
partition_by=partition_by,
|
|
1105
|
+
cluster_by=cluster_by,
|
|
1106
|
+
)
|
|
1107
|
+
warnings.warn(
|
|
1108
|
+
"⚠️ Conflict: Both 'partition_by' and 'cluster_by' (Liquid Clustering) are set. "
|
|
1109
|
+
"Liquid Clustering supersedes partitioning. 'partition_by' will be ignored "
|
|
1110
|
+
"if the table is being created now.",
|
|
1111
|
+
UserWarning,
|
|
1112
|
+
)
|
|
1113
|
+
|
|
1114
|
+
elif partition_by:
|
|
1115
|
+
import warnings
|
|
1116
|
+
|
|
1117
|
+
ctx.warning(
|
|
1118
|
+
"Partitioning warning: ensure low-cardinality columns",
|
|
1119
|
+
partition_by=partition_by,
|
|
1120
|
+
)
|
|
1121
|
+
warnings.warn(
|
|
1122
|
+
"⚠️ Partitioning can cause performance issues if misused. "
|
|
1123
|
+
"Only partition on low-cardinality columns (< 1000 unique values) "
|
|
1124
|
+
"and ensure each partition has > 1000 rows.",
|
|
1125
|
+
UserWarning,
|
|
1126
|
+
)
|
|
1127
|
+
|
|
1128
|
+
# Handle Upsert/Append-Once for Delta Lake (Path-based only for now)
|
|
1129
|
+
if format == "delta" and mode in ["upsert", "append_once"]:
|
|
1130
|
+
try:
|
|
1131
|
+
from delta.tables import DeltaTable
|
|
1132
|
+
except ImportError:
|
|
1133
|
+
ctx.error("Delta Lake support requires 'delta-spark'")
|
|
1134
|
+
raise ImportError("Delta Lake support requires 'delta-spark'")
|
|
1135
|
+
|
|
1136
|
+
if "keys" not in options:
|
|
1137
|
+
ctx.error(f"Mode '{mode}' requires 'keys' list in options")
|
|
1138
|
+
raise ValueError(f"Mode '{mode}' requires 'keys' list in options")
|
|
1139
|
+
|
|
1140
|
+
if DeltaTable.isDeltaTable(self.spark, full_path):
|
|
1141
|
+
ctx.debug(f"Performing Delta merge at path: {full_path}")
|
|
1142
|
+
delta_table = DeltaTable.forPath(self.spark, full_path)
|
|
1143
|
+
keys = options["keys"]
|
|
1144
|
+
if isinstance(keys, str):
|
|
1145
|
+
keys = [keys]
|
|
1146
|
+
|
|
1147
|
+
condition = " AND ".join([f"target.{k} = source.{k}" for k in keys])
|
|
1148
|
+
merger = delta_table.alias("target").merge(df.alias("source"), condition)
|
|
1149
|
+
|
|
1150
|
+
try:
|
|
1151
|
+
if mode == "upsert":
|
|
1152
|
+
merger.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
|
|
1153
|
+
else:
|
|
1154
|
+
merger.whenNotMatchedInsertAll().execute()
|
|
1155
|
+
|
|
1156
|
+
elapsed = (time.time() - start_time) * 1000
|
|
1157
|
+
ctx.info(
|
|
1158
|
+
"Delta merge completed at path",
|
|
1159
|
+
path=path,
|
|
1160
|
+
mode=mode,
|
|
1161
|
+
elapsed_ms=round(elapsed, 2),
|
|
1162
|
+
)
|
|
1163
|
+
|
|
1164
|
+
if register_table:
|
|
1165
|
+
try:
|
|
1166
|
+
table_in_catalog = self.spark.catalog.tableExists(register_table)
|
|
1167
|
+
needs_registration = not table_in_catalog
|
|
1168
|
+
|
|
1169
|
+
# Handle orphan catalog entries (only for path-not-found errors)
|
|
1170
|
+
if table_in_catalog:
|
|
1171
|
+
try:
|
|
1172
|
+
self.spark.table(register_table).limit(0).collect()
|
|
1173
|
+
ctx.debug(
|
|
1174
|
+
f"Table '{register_table}' already registered and valid"
|
|
1175
|
+
)
|
|
1176
|
+
except Exception as verify_err:
|
|
1177
|
+
error_str = str(verify_err)
|
|
1178
|
+
is_orphan = (
|
|
1179
|
+
"DELTA_PATH_DOES_NOT_EXIST" in error_str
|
|
1180
|
+
or "Path does not exist" in error_str
|
|
1181
|
+
or "FileNotFoundException" in error_str
|
|
1182
|
+
)
|
|
1183
|
+
if is_orphan:
|
|
1184
|
+
ctx.warning(
|
|
1185
|
+
f"Table '{register_table}' is orphan, re-registering"
|
|
1186
|
+
)
|
|
1187
|
+
try:
|
|
1188
|
+
self.spark.sql(f"DROP TABLE IF EXISTS {register_table}")
|
|
1189
|
+
except Exception:
|
|
1190
|
+
pass
|
|
1191
|
+
needs_registration = True
|
|
1192
|
+
else:
|
|
1193
|
+
ctx.debug(
|
|
1194
|
+
f"Table '{register_table}' verify failed, "
|
|
1195
|
+
"skipping registration"
|
|
1196
|
+
)
|
|
1197
|
+
|
|
1198
|
+
if needs_registration:
|
|
1199
|
+
create_sql = (
|
|
1200
|
+
f"CREATE TABLE IF NOT EXISTS {register_table} "
|
|
1201
|
+
f"USING DELTA LOCATION '{full_path}'"
|
|
1202
|
+
)
|
|
1203
|
+
self.spark.sql(create_sql)
|
|
1204
|
+
ctx.info(f"Registered table: {register_table}", path=full_path)
|
|
1205
|
+
except Exception as e:
|
|
1206
|
+
ctx.error(
|
|
1207
|
+
f"Failed to register external table '{register_table}'",
|
|
1208
|
+
error_message=str(e),
|
|
1209
|
+
)
|
|
1210
|
+
|
|
1211
|
+
self._optimize_delta_write(full_path, options, is_table=False)
|
|
1212
|
+
return self._get_last_delta_commit_info(full_path, is_table=False)
|
|
1213
|
+
|
|
1214
|
+
except Exception as e:
|
|
1215
|
+
elapsed = (time.time() - start_time) * 1000
|
|
1216
|
+
ctx.error(
|
|
1217
|
+
"Delta merge failed at path",
|
|
1218
|
+
path=path,
|
|
1219
|
+
error_type=type(e).__name__,
|
|
1220
|
+
error_message=str(e),
|
|
1221
|
+
elapsed_ms=round(elapsed, 2),
|
|
1222
|
+
)
|
|
1223
|
+
raise
|
|
1224
|
+
else:
|
|
1225
|
+
mode = "overwrite"
|
|
1226
|
+
ctx.debug("Target does not exist, falling back to overwrite mode")
|
|
1227
|
+
|
|
1228
|
+
# Write based on format (Path-based)
|
|
1229
|
+
ctx.debug(f"Writing to path: {full_path}")
|
|
1230
|
+
|
|
1231
|
+
# Handle Liquid Clustering (New Table Creation via SQL)
|
|
1232
|
+
if format == "delta" and cluster_by:
|
|
1233
|
+
should_create = False
|
|
1234
|
+
target_name = None
|
|
1235
|
+
|
|
1236
|
+
if table:
|
|
1237
|
+
target_name = table
|
|
1238
|
+
if mode == "overwrite":
|
|
1239
|
+
should_create = True
|
|
1240
|
+
elif mode == "append":
|
|
1241
|
+
if not self.spark.catalog.tableExists(table):
|
|
1242
|
+
should_create = True
|
|
1243
|
+
elif path:
|
|
1244
|
+
full_path = connection.get_path(path)
|
|
1245
|
+
target_name = f"delta.`{full_path}`"
|
|
1246
|
+
if mode == "overwrite":
|
|
1247
|
+
should_create = True
|
|
1248
|
+
elif mode == "append":
|
|
1249
|
+
try:
|
|
1250
|
+
from delta.tables import DeltaTable
|
|
1251
|
+
|
|
1252
|
+
if not DeltaTable.isDeltaTable(self.spark, full_path):
|
|
1253
|
+
should_create = True
|
|
1254
|
+
except ImportError:
|
|
1255
|
+
pass
|
|
1256
|
+
|
|
1257
|
+
if should_create:
|
|
1258
|
+
if isinstance(cluster_by, str):
|
|
1259
|
+
cluster_by = [cluster_by]
|
|
1260
|
+
|
|
1261
|
+
cols = ", ".join(cluster_by)
|
|
1262
|
+
temp_view = f"odibi_temp_writer_{abs(hash(str(target_name)))}"
|
|
1263
|
+
df.createOrReplaceTempView(temp_view)
|
|
1264
|
+
|
|
1265
|
+
create_cmd = (
|
|
1266
|
+
"CREATE OR REPLACE TABLE"
|
|
1267
|
+
if mode == "overwrite"
|
|
1268
|
+
else "CREATE TABLE IF NOT EXISTS"
|
|
1269
|
+
)
|
|
1270
|
+
|
|
1271
|
+
sql = (
|
|
1272
|
+
f"{create_cmd} {target_name} USING DELTA CLUSTER BY ({cols}) "
|
|
1273
|
+
f"AS SELECT * FROM {temp_view}"
|
|
1274
|
+
)
|
|
1275
|
+
|
|
1276
|
+
ctx.debug("Creating clustered Delta table", sql=sql, cluster_by=cluster_by)
|
|
1277
|
+
|
|
1278
|
+
try:
|
|
1279
|
+
self.spark.sql(sql)
|
|
1280
|
+
self.spark.catalog.dropTempView(temp_view)
|
|
1281
|
+
|
|
1282
|
+
elapsed = (time.time() - start_time) * 1000
|
|
1283
|
+
ctx.info(
|
|
1284
|
+
"Clustered Delta table created",
|
|
1285
|
+
target=target_name,
|
|
1286
|
+
cluster_by=cluster_by,
|
|
1287
|
+
elapsed_ms=round(elapsed, 2),
|
|
1288
|
+
)
|
|
1289
|
+
|
|
1290
|
+
if register_table and path:
|
|
1291
|
+
try:
|
|
1292
|
+
reg_sql = (
|
|
1293
|
+
f"CREATE TABLE IF NOT EXISTS {register_table} "
|
|
1294
|
+
f"USING DELTA LOCATION '{full_path}'"
|
|
1295
|
+
)
|
|
1296
|
+
self.spark.sql(reg_sql)
|
|
1297
|
+
ctx.info(f"Registered table: {register_table}")
|
|
1298
|
+
except Exception:
|
|
1299
|
+
pass
|
|
1300
|
+
|
|
1301
|
+
if format == "delta":
|
|
1302
|
+
self._optimize_delta_write(
|
|
1303
|
+
target_name if table else full_path, options, is_table=bool(table)
|
|
1304
|
+
)
|
|
1305
|
+
return self._get_last_delta_commit_info(
|
|
1306
|
+
target_name if table else full_path, is_table=bool(table)
|
|
1307
|
+
)
|
|
1308
|
+
return None
|
|
1309
|
+
|
|
1310
|
+
except Exception as e:
|
|
1311
|
+
elapsed = (time.time() - start_time) * 1000
|
|
1312
|
+
ctx.error(
|
|
1313
|
+
"Failed to create clustered Delta table",
|
|
1314
|
+
error_type=type(e).__name__,
|
|
1315
|
+
error_message=str(e),
|
|
1316
|
+
elapsed_ms=round(elapsed, 2),
|
|
1317
|
+
)
|
|
1318
|
+
raise
|
|
1319
|
+
|
|
1320
|
+
# Extract table_properties from options
|
|
1321
|
+
table_properties = options.pop("table_properties", None)
|
|
1322
|
+
|
|
1323
|
+
# For column mapping and other properties that must be set BEFORE write
|
|
1324
|
+
original_configs = {}
|
|
1325
|
+
if table_properties and format == "delta":
|
|
1326
|
+
for prop_name, prop_value in table_properties.items():
|
|
1327
|
+
spark_conf_key = (
|
|
1328
|
+
f"spark.databricks.delta.properties.defaults.{prop_name.replace('delta.', '')}"
|
|
1329
|
+
)
|
|
1330
|
+
try:
|
|
1331
|
+
original_configs[spark_conf_key] = self.spark.conf.get(spark_conf_key, None)
|
|
1332
|
+
except Exception:
|
|
1333
|
+
original_configs[spark_conf_key] = None
|
|
1334
|
+
self.spark.conf.set(spark_conf_key, prop_value)
|
|
1335
|
+
ctx.debug(
|
|
1336
|
+
"Applied table properties as session defaults",
|
|
1337
|
+
properties=list(table_properties.keys()),
|
|
1338
|
+
)
|
|
1339
|
+
|
|
1340
|
+
        writer = df.write.format(format).mode(mode)

        if partition_by:
            if isinstance(partition_by, str):
                partition_by = [partition_by]
            writer = writer.partitionBy(*partition_by)
            ctx.debug(f"Partitioning by: {partition_by}")

        for key, value in options.items():
            writer = writer.option(key, value)

        try:
            writer.save(full_path)
            elapsed = (time.time() - start_time) * 1000

            ctx.log_file_io(
                path=path,
                format=format,
                mode=mode,
                partitions=partition_by,
            )
            ctx.info(
                f"File write completed: {path}",
                format=format,
                mode=mode,
                elapsed_ms=round(elapsed, 2),
            )

        except Exception as e:
            elapsed = (time.time() - start_time) * 1000
            ctx.error(
                f"File write failed: {path}",
                error_type=type(e).__name__,
                error_message=str(e),
                elapsed_ms=round(elapsed, 2),
            )
            raise
        finally:
            for conf_key, original_value in original_configs.items():
                if original_value is None:
                    self.spark.conf.unset(conf_key)
                else:
                    self.spark.conf.set(conf_key, original_value)

        if format == "delta":
            self._optimize_delta_write(full_path, options, is_table=False)

        if register_table and format == "delta":
            try:
                table_in_catalog = self.spark.catalog.tableExists(register_table)
                needs_registration = not table_in_catalog

                # Handle orphan catalog entries: table exists but points to deleted path
                # Only treat as orphan if it's specifically a DELTA_PATH_DOES_NOT_EXIST error
                if table_in_catalog:
                    try:
                        self.spark.table(register_table).limit(0).collect()
                        ctx.debug(
                            f"Table '{register_table}' already registered and valid, "
                            "skipping registration"
                        )
                    except Exception as verify_err:
                        error_str = str(verify_err)
                        is_orphan = (
                            "DELTA_PATH_DOES_NOT_EXIST" in error_str
                            or "Path does not exist" in error_str
                            or "FileNotFoundException" in error_str
                        )

                        if is_orphan:
                            # Orphan entry - table in catalog but path was deleted
                            ctx.warning(
                                f"Table '{register_table}' is orphan (path deleted), "
                                "dropping and re-registering",
                                error_message=error_str[:200],
                            )
                            try:
                                self.spark.sql(f"DROP TABLE IF EXISTS {register_table}")
                            except Exception:
                                pass  # Best effort cleanup
                            needs_registration = True
                        else:
                            # Other error (auth, network, etc.) - don't drop, just log
                            ctx.debug(
                                f"Table '{register_table}' exists but verify failed "
                                "(not orphan), skipping registration",
                                error_message=error_str[:200],
                            )

                if needs_registration:
                    ctx.debug(f"Registering table '{register_table}' at '{full_path}'")
                    reg_sql = (
                        f"CREATE TABLE IF NOT EXISTS {register_table} "
                        f"USING DELTA LOCATION '{full_path}'"
                    )
                    self.spark.sql(reg_sql)
                    ctx.info(f"Registered table: {register_table}", path=full_path)
            except Exception as e:
                ctx.error(
                    f"Failed to register table '{register_table}'",
                    error_message=str(e),
                )
                raise RuntimeError(
                    f"Failed to register external table '{register_table}': {e}"
                ) from e

        if format == "delta":
            return self._get_last_delta_commit_info(full_path, is_table=False)

        return None

    def _write_streaming(
        self,
        df,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        register_table: Optional[str] = None,
        options: Optional[Dict[str, Any]] = None,
        streaming_config: Optional[Any] = None,
    ) -> Dict[str, Any]:
        """Write streaming DataFrame using Spark Structured Streaming.

        Args:
            df: Streaming Spark DataFrame
            connection: Connection object
            format: Output format (delta, kafka, etc.)
            table: Table name
            path: File path
            register_table: Name to register as external table (if path is used)
            options: Format-specific options
            streaming_config: StreamingWriteConfig with streaming parameters

        Returns:
            Dictionary with streaming query information
        """
        ctx = get_logging_context().with_context(engine="spark")
        start_time = time.time()
        options = options or {}

        if streaming_config is None:
            ctx.error("Streaming DataFrame requires streaming_config")
            raise ValueError(
                "Streaming DataFrame detected but no streaming_config provided. "
                "Add a 'streaming' section to your write config with at least "
                "'checkpoint_location' specified."
            )

        target_identifier = table or path or "unknown"

        checkpoint_location = streaming_config.checkpoint_location
        if checkpoint_location and connection:
            if not checkpoint_location.startswith(
                ("abfss://", "s3://", "gs://", "dbfs://", "hdfs://", "wasbs://")
            ):
                checkpoint_location = connection.get_path(checkpoint_location)
                ctx.debug(
                    "Resolved checkpoint location through connection",
                    original=streaming_config.checkpoint_location,
                    resolved=checkpoint_location,
                )

        ctx.debug(
            "Starting streaming write",
            format=format,
            target=target_identifier,
            output_mode=streaming_config.output_mode,
            checkpoint=checkpoint_location,
        )

        writer = df.writeStream.format(format)
        writer = writer.outputMode(streaming_config.output_mode)
        writer = writer.option("checkpointLocation", checkpoint_location)

        if streaming_config.query_name:
            writer = writer.queryName(streaming_config.query_name)

        if streaming_config.trigger:
            trigger = streaming_config.trigger
            if trigger.once:
                writer = writer.trigger(once=True)
            elif trigger.available_now:
                writer = writer.trigger(availableNow=True)
            elif trigger.processing_time:
                writer = writer.trigger(processingTime=trigger.processing_time)
            elif trigger.continuous:
                writer = writer.trigger(continuous=trigger.continuous)

        partition_by = options.pop("partition_by", None) or options.pop("partitionBy", None)
        if partition_by:
            if isinstance(partition_by, str):
                partition_by = [partition_by]
            writer = writer.partitionBy(*partition_by)
            ctx.debug(f"Partitioning by: {partition_by}")

        for key, value in options.items():
            writer = writer.option(key, value)

        try:
            if table:
                query = writer.toTable(table)
                ctx.info(
                    f"Streaming query started: writing to table {table}",
                    query_id=str(query.id),
                    query_name=query.name,
                )
            elif path:
                full_path = connection.get_path(path)
                query = writer.start(full_path)
                ctx.info(
                    f"Streaming query started: writing to path {path}",
                    query_id=str(query.id),
                    query_name=query.name,
                )
            else:
                ctx.error("Either path or table must be provided for streaming write")
                raise ValueError(
                    "Streaming write operation failed: neither 'path' nor 'table' was provided. "
                    "Specify a file path or table name in your streaming configuration."
                )

            elapsed = (time.time() - start_time) * 1000

            result = {
                "streaming": True,
                "query_id": str(query.id),
                "query_name": query.name,
                "status": "running",
                "target": target_identifier,
                "output_mode": streaming_config.output_mode,
                "checkpoint_location": streaming_config.checkpoint_location,
                "elapsed_ms": round(elapsed, 2),
            }

            should_wait = streaming_config.await_termination
            if streaming_config.trigger:
                trigger = streaming_config.trigger
                if trigger.once or trigger.available_now:
                    should_wait = True

            if should_wait:
                ctx.info(
                    "Awaiting streaming query termination",
                    timeout_seconds=streaming_config.timeout_seconds,
                )
                query.awaitTermination(streaming_config.timeout_seconds)
                result["status"] = "terminated"
                elapsed = (time.time() - start_time) * 1000
                result["elapsed_ms"] = round(elapsed, 2)
                ctx.info(
                    "Streaming query terminated",
                    query_id=str(query.id),
                    elapsed_ms=round(elapsed, 2),
                )

                if register_table and path and format == "delta":
                    full_path = connection.get_path(path)
                    try:
                        self.spark.sql(
                            f"CREATE TABLE IF NOT EXISTS {register_table} "
                            f"USING DELTA LOCATION '{full_path}'"
                        )
                        ctx.info(
                            f"Registered external table: {register_table}",
                            path=full_path,
                        )
                        result["registered_table"] = register_table
                    except Exception as reg_err:
                        ctx.warning(
                            f"Failed to register external table '{register_table}'",
                            error=str(reg_err),
                        )
            else:
                result["streaming_query"] = query
                if register_table:
                    ctx.warning(
                        "register_table ignored for continuous streaming. "
                        "Table will be registered after query terminates or manually."
                    )

            return result

        except Exception as e:
            elapsed = (time.time() - start_time) * 1000
            ctx.error(
                "Streaming write failed",
                target=target_identifier,
                error_type=type(e).__name__,
                error_message=str(e),
                elapsed_ms=round(elapsed, 2),
            )
            raise

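    # Usage sketch for _write_streaming (hypothetical values; the config fields
    # are those read above: checkpoint_location, output_mode, query_name,
    # trigger, await_termination, timeout_seconds; TriggerConfig is a
    # placeholder name for whatever the trigger object is in odibi.config):
    #
    #     cfg = StreamingWriteConfig(
    #         checkpoint_location="checkpoints/orders_bronze",
    #         output_mode="append",
    #         trigger=TriggerConfig(available_now=True),
    #     )
    #     result = engine._write_streaming(
    #         streaming_df, connection=conn, format="delta",
    #         path="bronze/orders", streaming_config=cfg,
    #     )
    #     # With available_now (or once) the call blocks until the query drains
    #     # and returns {"streaming": True, "status": "terminated", ...}.
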
    def execute_sql(self, sql: str, context: Any = None) -> Any:
        """Execute SQL query in Spark.

        Args:
            sql: SQL query string
            context: Execution context (optional, not used for Spark)

        Returns:
            Spark DataFrame with query results
        """
        ctx = get_logging_context().with_context(engine="spark")
        start_time = time.time()

        ctx.debug("Executing Spark SQL", query_preview=sql[:200] if len(sql) > 200 else sql)

        try:
            result = self.spark.sql(sql)
            elapsed = (time.time() - start_time) * 1000
            partition_count = result.rdd.getNumPartitions()

            ctx.log_spark_metrics(partition_count=partition_count)
            ctx.info(
                "Spark SQL executed",
                elapsed_ms=round(elapsed, 2),
                partitions=partition_count,
            )

            return result

        except Exception as e:
            elapsed = (time.time() - start_time) * 1000
            error_type = type(e).__name__
            clean_message = _extract_spark_error_message(e)

            if "AnalysisException" in error_type:
                ctx.error(
                    "Spark SQL Analysis Error",
                    error_type=error_type,
                    error_message=clean_message,
                    query_preview=sql[:200] if len(sql) > 200 else sql,
                    elapsed_ms=round(elapsed, 2),
                )
                raise TransformError(f"Spark SQL Analysis Error: {clean_message}")

            if "ParseException" in error_type:
                ctx.error(
                    "Spark SQL Parse Error",
                    error_type=error_type,
                    error_message=clean_message,
                    query_preview=sql[:200] if len(sql) > 200 else sql,
                    elapsed_ms=round(elapsed, 2),
                )
                raise TransformError(f"Spark SQL Parse Error: {clean_message}")

            ctx.error(
                "Spark SQL execution failed",
                error_type=error_type,
                error_message=clean_message,
                elapsed_ms=round(elapsed, 2),
            )
            raise TransformError(f"Spark SQL Error: {clean_message}")

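    # Usage sketch (hypothetical view name): execute_sql returns a Spark
    # DataFrame and wraps analysis/parse failures in TransformError.
    #
    #     df.createOrReplaceTempView("orders")
    #     top = engine.execute_sql("SELECT site_id, COUNT(*) AS n FROM orders GROUP BY site_id")
    #     top.show()
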
    def execute_transform(self, *args, **kwargs):
        raise NotImplementedError(
            "SparkEngine.execute_transform() will be implemented in Phase 2B. "
            "See PHASES.md for implementation plan."
        )

    def execute_operation(self, operation: str, params: Dict[str, Any], df) -> Any:
        """Execute built-in operation on Spark DataFrame."""
        ctx = get_logging_context().with_context(engine="spark")
        params = params or {}

        ctx.debug(f"Executing operation: {operation}", params=list(params.keys()))

        if operation == "pivot":
            group_by = params.get("group_by", [])
            pivot_column = params.get("pivot_column")
            value_column = params.get("value_column")
            agg_func = params.get("agg_func", "first")

            if not pivot_column or not value_column:
                ctx.error("Pivot requires 'pivot_column' and 'value_column'")
                raise ValueError("Pivot requires 'pivot_column' and 'value_column'")

            if isinstance(group_by, str):
                group_by = [group_by]

            agg_expr = {value_column: agg_func}
            return df.groupBy(*group_by).pivot(pivot_column).agg(agg_expr)

        elif operation == "drop_duplicates":
            subset = params.get("subset")
            if subset:
                if isinstance(subset, str):
                    subset = [subset]
                return df.dropDuplicates(subset=subset)
            return df.dropDuplicates()

        elif operation == "fillna":
            value = params.get("value")
            subset = params.get("subset")
            return df.fillna(value, subset=subset)

        elif operation == "drop":
            columns = params.get("columns")
            if not columns:
                return df
            if isinstance(columns, str):
                columns = [columns]
            return df.drop(*columns)

        elif operation == "rename":
            columns = params.get("columns")
            if not columns:
                return df

            res = df
            for old_name, new_name in columns.items():
                res = res.withColumnRenamed(old_name, new_name)
            return res

        elif operation == "sort":
            by = params.get("by")
            ascending = params.get("ascending", True)

            if not by:
                return df

            if isinstance(by, str):
                by = [by]

            if not ascending:
                from pyspark.sql.functions import desc

                sort_cols = [desc(c) for c in by]
                return df.orderBy(*sort_cols)

            return df.orderBy(*by)

        elif operation == "sample":
            fraction = params.get("frac", 0.1)
            seed = params.get("random_state")
            with_replacement = params.get("replace", False)
            return df.sample(withReplacement=with_replacement, fraction=fraction, seed=seed)

        else:
            # Fallback: check if operation is a registered transformer
            from odibi.context import EngineContext
            from odibi.registry import FunctionRegistry

            ctx.debug(
                f"Checking registry for operation: {operation}",
                registered_functions=list(FunctionRegistry._functions.keys())[:10],
                has_function=FunctionRegistry.has_function(operation),
            )

            if FunctionRegistry.has_function(operation):
                ctx.debug(f"Executing registered transformer as operation: {operation}")
                func = FunctionRegistry.get_function(operation)
                param_model = FunctionRegistry.get_param_model(operation)

                # Create EngineContext from current df
                from odibi.context import SparkContext

                engine_ctx = EngineContext(
                    context=SparkContext(self.spark),
                    df=df,
                    engine=self,
                    engine_type=self.engine_type,
                )

                # Validate and instantiate params
                if param_model:
                    validated_params = param_model(**params)
                    result_ctx = func(engine_ctx, validated_params)
                else:
                    result_ctx = func(engine_ctx, **params)

                return result_ctx.df

            ctx.error(f"Unsupported operation for Spark engine: {operation}")
            raise ValueError(f"Unsupported operation for Spark engine: {operation}")

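    # Usage sketch for execute_operation (hypothetical column names), mirroring
    # the pivot branch above:
    #
    #     wide = engine.execute_operation(
    #         "pivot",
    #         {"group_by": "site_id", "pivot_column": "metric",
    #          "value_column": "value", "agg_func": "sum"},
    #         df,
    #     )
    #
    # Unknown operation names fall through to the FunctionRegistry lookup
    # before raising ValueError.
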
    def count_nulls(self, df, columns: List[str]) -> Dict[str, int]:
        """Count nulls in specified columns."""
        from pyspark.sql.functions import col, count, when

        missing = set(columns) - set(df.columns)
        if missing:
            raise ValueError(f"Columns not found in DataFrame: {', '.join(missing)}")

        aggs = [count(when(col(c).isNull(), c)).alias(c) for c in columns]
        result = df.select(*aggs).collect()[0].asDict()
        return result

    def validate_schema(self, df, schema_rules: Dict[str, Any]) -> List[str]:
        """Validate DataFrame schema."""
        failures = []

        if "required_columns" in schema_rules:
            required = schema_rules["required_columns"]
            missing = set(required) - set(df.columns)
            if missing:
                failures.append(f"Missing required columns: {', '.join(missing)}")

        if "types" in schema_rules:
            type_map = {
                "int": ["integer", "long", "short", "byte", "bigint"],
                "float": ["double", "float"],
                "str": ["string"],
                "bool": ["boolean"],
            }

            for col_name, expected_type in schema_rules["types"].items():
                if col_name not in df.columns:
                    failures.append(f"Column '{col_name}' not found for type validation")
                    continue

                actual_type = dict(df.dtypes)[col_name]
                expected_dtypes = type_map.get(expected_type, [expected_type])

                if actual_type not in expected_dtypes:
                    failures.append(
                        f"Column '{col_name}' has type '{actual_type}', expected '{expected_type}'"
                    )

        return failures

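    # Sketch of the schema_rules shape validate_schema expects (hypothetical
    # column names):
    #
    #     rules = {
    #         "required_columns": ["order_id", "site_id"],
    #         "types": {"order_id": "int", "amount": "float", "site_id": "str"},
    #     }
    #     problems = engine.validate_schema(df, rules)  # [] when the schema conforms
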
    def validate_data(self, df, validation_config: Any) -> List[str]:
        """Validate DataFrame against rules."""
        from pyspark.sql.functions import col

        ctx = get_logging_context().with_context(engine="spark")
        failures = []

        if validation_config.not_empty:
            if df.isEmpty():
                failures.append("DataFrame is empty")

        if validation_config.no_nulls:
            null_counts = self.count_nulls(df, validation_config.no_nulls)
            for col_name, count in null_counts.items():
                if count > 0:
                    failures.append(f"Column '{col_name}' has {count} null values")

        if validation_config.schema_validation:
            schema_failures = self.validate_schema(df, validation_config.schema_validation)
            failures.extend(schema_failures)

        if validation_config.ranges:
            for col_name, bounds in validation_config.ranges.items():
                if col_name in df.columns:
                    min_val = bounds.get("min")
                    max_val = bounds.get("max")

                    if min_val is not None:
                        count = df.filter(col(col_name) < min_val).count()
                        if count > 0:
                            failures.append(f"Column '{col_name}' has values < {min_val}")

                    if max_val is not None:
                        count = df.filter(col(col_name) > max_val).count()
                        if count > 0:
                            failures.append(f"Column '{col_name}' has values > {max_val}")
                else:
                    failures.append(f"Column '{col_name}' not found for range validation")

        if validation_config.allowed_values:
            for col_name, allowed in validation_config.allowed_values.items():
                if col_name in df.columns:
                    count = df.filter(~col(col_name).isin(allowed)).count()
                    if count > 0:
                        failures.append(f"Column '{col_name}' has invalid values")
                else:
                    failures.append(f"Column '{col_name}' not found for allowed values validation")

        ctx.log_validation_result(
            passed=len(failures) == 0,
            rule_name="data_validation",
            failures=failures if failures else None,
        )

        return failures

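    # Sketch of the fields validate_data reads from validation_config (the
    # concrete config class is defined elsewhere in the package; values here
    # are hypothetical):
    #
    #     validation_config.not_empty          -> True
    #     validation_config.no_nulls           -> ["order_id"]
    #     validation_config.schema_validation  -> {"required_columns": ["order_id"]}
    #     validation_config.ranges             -> {"amount": {"min": 0, "max": 1_000_000}}
    #     validation_config.allowed_values     -> {"status": ["OPEN", "CLOSED"]}
    #
    # Each failed rule contributes a human-readable message to the returned list.
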
    def get_sample(self, df, n: int = 10) -> List[Dict[str, Any]]:
        """Get sample rows as list of dictionaries."""
        return [row.asDict() for row in df.limit(n).collect()]

    def table_exists(
        self, connection: Any, table: Optional[str] = None, path: Optional[str] = None
    ) -> bool:
        """Check if table or location exists.

        Handles orphan catalog entries where the table is registered but
        the underlying Delta path no longer exists.
        """
        ctx = get_logging_context().with_context(engine="spark")

        if table:
            try:
                if not self.spark.catalog.tableExists(table):
                    ctx.debug(f"Table does not exist: {table}")
                    return False
                # Table exists in catalog - verify it's actually readable
                # This catches orphan entries where path was deleted
                self.spark.table(table).limit(0).collect()
                ctx.debug(f"Table existence check: {table}", exists=True)
                return True
            except Exception as e:
                # Table exists in catalog but underlying data is gone (orphan entry)
                # This is expected during first-run detection - log at debug level
                ctx.debug(
                    f"Table {table} exists in catalog but is not accessible (treating as first run)",
                    error_message=str(e),
                )
                return False
        elif path:
            try:
                from delta.tables import DeltaTable

                full_path = connection.get_path(path)
                exists = DeltaTable.isDeltaTable(self.spark, full_path)
                ctx.debug(f"Delta table existence check: {path}", exists=exists)
                return exists
            except ImportError:
                try:
                    full_path = connection.get_path(path)
                    exists = (
                        self.spark.sparkContext._gateway.jvm.org.apache.hadoop.fs.FileSystem.get(
                            self.spark.sparkContext._jsc.hadoopConfiguration()
                        ).exists(
                            self.spark.sparkContext._gateway.jvm.org.apache.hadoop.fs.Path(
                                full_path
                            )
                        )
                    )
                    ctx.debug(f"Path existence check: {path}", exists=exists)
                    return exists
                except Exception as e:
                    ctx.warning(f"Path existence check failed: {path}", error_message=str(e))
                    return False
            except Exception as e:
                ctx.warning(f"Table existence check failed: {path}", error_message=str(e))
                return False
        return False

    def get_table_schema(
        self,
        connection: Any,
        table: Optional[str] = None,
        path: Optional[str] = None,
        format: Optional[str] = None,
    ) -> Optional[Dict[str, str]]:
        """Get schema of an existing table/file."""
        ctx = get_logging_context().with_context(engine="spark")

        try:
            if table:
                if self.spark.catalog.tableExists(table):
                    schema = self.get_schema(self.spark.table(table))
                    ctx.debug(f"Retrieved schema for table: {table}", columns=len(schema))
                    return schema
            elif path:
                full_path = connection.get_path(path)
                if format == "delta":
                    from delta.tables import DeltaTable

                    if DeltaTable.isDeltaTable(self.spark, full_path):
                        schema = self.get_schema(DeltaTable.forPath(self.spark, full_path).toDF())
                        ctx.debug(f"Retrieved Delta schema: {path}", columns=len(schema))
                        return schema
                elif format == "parquet":
                    schema = self.get_schema(self.spark.read.parquet(full_path))
                    ctx.debug(f"Retrieved Parquet schema: {path}", columns=len(schema))
                    return schema
                elif format:
                    schema = self.get_schema(self.spark.read.format(format).load(full_path))
                    ctx.debug(f"Retrieved schema: {path}", format=format, columns=len(schema))
                    return schema
        except Exception as e:
            ctx.warning(
                "Failed to get schema",
                table=table,
                path=path,
                error_message=str(e),
            )
        return None

    def vacuum_delta(
        self,
        connection: Any,
        path: str,
        retention_hours: int = 168,
    ) -> None:
        """VACUUM a Delta table to remove old files."""
        ctx = get_logging_context().with_context(engine="spark")
        start_time = time.time()

        ctx.debug(
            "Starting Delta VACUUM",
            path=path,
            retention_hours=retention_hours,
        )

        try:
            from delta.tables import DeltaTable
        except ImportError:
            ctx.error("Delta Lake support requires 'delta-spark'")
            raise ImportError(
                "Delta Lake support requires 'pip install odibi[spark]' "
                "with delta-spark. "
                "See README.md for installation instructions."
            )

        full_path = connection.get_path(path)

        try:
            delta_table = DeltaTable.forPath(self.spark, full_path)
            delta_table.vacuum(retention_hours / 24.0)

            elapsed = (time.time() - start_time) * 1000
            ctx.info(
                "Delta VACUUM completed",
                path=path,
                retention_hours=retention_hours,
                elapsed_ms=round(elapsed, 2),
            )

        except Exception as e:
            elapsed = (time.time() - start_time) * 1000
            ctx.error(
                "Delta VACUUM failed",
                path=path,
                error_type=type(e).__name__,
                error_message=str(e),
                elapsed_ms=round(elapsed, 2),
            )
            raise

    def get_delta_history(
        self, connection: Any, path: str, limit: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """Get Delta table history."""
        ctx = get_logging_context().with_context(engine="spark")
        start_time = time.time()

        ctx.debug("Fetching Delta history", path=path, limit=limit)

        try:
            from delta.tables import DeltaTable
        except ImportError:
            ctx.error("Delta Lake support requires 'delta-spark'")
            raise ImportError(
                "Delta Lake support requires 'pip install odibi[spark]' "
                "with delta-spark. "
                "See README.md for installation instructions."
            )

        full_path = connection.get_path(path)

        try:
            delta_table = DeltaTable.forPath(self.spark, full_path)
            history_df = delta_table.history(limit) if limit else delta_table.history()
            history = [row.asDict() for row in history_df.collect()]

            elapsed = (time.time() - start_time) * 1000
            ctx.info(
                "Delta history retrieved",
                path=path,
                versions_returned=len(history),
                elapsed_ms=round(elapsed, 2),
            )

            return history

        except Exception as e:
            elapsed = (time.time() - start_time) * 1000
            ctx.error(
                "Failed to get Delta history",
                path=path,
                error_type=type(e).__name__,
                error_message=str(e),
                elapsed_ms=round(elapsed, 2),
            )
            raise

    def restore_delta(self, connection: Any, path: str, version: int) -> None:
        """Restore Delta table to a specific version."""
        ctx = get_logging_context().with_context(engine="spark")
        start_time = time.time()

        ctx.debug("Restoring Delta table", path=path, version=version)

        try:
            from delta.tables import DeltaTable
        except ImportError:
            ctx.error("Delta Lake support requires 'delta-spark'")
            raise ImportError(
                "Delta Lake support requires 'pip install odibi[spark]' "
                "with delta-spark. "
                "See README.md for installation instructions."
            )

        full_path = connection.get_path(path)

        try:
            delta_table = DeltaTable.forPath(self.spark, full_path)
            delta_table.restoreToVersion(version)

            elapsed = (time.time() - start_time) * 1000
            ctx.info(
                "Delta table restored",
                path=path,
                version=version,
                elapsed_ms=round(elapsed, 2),
            )

        except Exception as e:
            elapsed = (time.time() - start_time) * 1000
            ctx.error(
                "Delta restore failed",
                path=path,
                version=version,
                error_type=type(e).__name__,
                error_message=str(e),
                elapsed_ms=round(elapsed, 2),
            )
            raise

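    # Combined usage sketch for the Delta maintenance helpers above
    # (hypothetical path "silver/orders"; `conn` is an odibi connection object):
    #
    #     history = engine.get_delta_history(conn, "silver/orders", limit=5)
    #     latest = history[0]["version"]   # history entries arrive newest first
    #     engine.restore_delta(conn, "silver/orders", version=latest - 1)
    #     engine.vacuum_delta(conn, "silver/orders", retention_hours=168)
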
    def maintain_table(
        self,
        connection: Any,
        format: str,
        table: Optional[str] = None,
        path: Optional[str] = None,
        config: Optional[Any] = None,
    ) -> None:
        """Run table maintenance operations (optimize, vacuum)."""
        if format != "delta" or not config or not config.enabled:
            return

        ctx = get_logging_context().with_context(engine="spark")
        start_time = time.time()

        if table:
            target = table
        elif path:
            full_path = connection.get_path(path)
            target = f"delta.`{full_path}`"
        else:
            return

        ctx.debug("Starting table maintenance", target=target)

        try:
            ctx.debug(f"Running OPTIMIZE on {target}")
            self.spark.sql(f"OPTIMIZE {target}")

            retention = config.vacuum_retention_hours
            if retention is not None and retention > 0:
                ctx.debug(f"Running VACUUM on {target}", retention_hours=retention)
                self.spark.sql(f"VACUUM {target} RETAIN {retention} HOURS")

            elapsed = (time.time() - start_time) * 1000
            ctx.info(
                "Table maintenance completed",
                target=target,
                vacuum_retention_hours=retention,
                elapsed_ms=round(elapsed, 2),
            )

        except Exception as e:
            elapsed = (time.time() - start_time) * 1000
            ctx.warning(
                f"Auto-optimize failed for {target}",
                error_type=type(e).__name__,
                error_message=str(e),
                elapsed_ms=round(elapsed, 2),
            )

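    # maintain_table issues plain SQL against the resolved target; for a
    # hypothetical path-based target with a retention of 168 hours configured,
    # it runs, in order:
    #
    #     OPTIMIZE delta.`abfss://container@account/silver/orders`
    #     VACUUM delta.`abfss://container@account/silver/orders` RETAIN 168 HOURS
    #
    # Failures are logged as warnings rather than raised, so maintenance never
    # fails the write that triggered it.
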
    def get_source_files(self, df) -> List[str]:
        """Get list of source files that generated this DataFrame."""
        try:
            return df.inputFiles()
        except Exception:
            return []

    def profile_nulls(self, df) -> Dict[str, float]:
        """Calculate null percentage for each column."""
        from pyspark.sql.functions import col, mean, when

        aggs = []
        for c in df.columns:
            aggs.append(mean(when(col(c).isNull(), 1).otherwise(0)).alias(c))

        if not aggs:
            return {}

        try:
            result = df.select(*aggs).collect()[0].asDict()
            return result
        except Exception:
            return {}

    def filter_greater_than(self, df, column: str, value: Any) -> Any:
        """Filter DataFrame where column > value.

        Automatically casts string columns to timestamp for proper comparison.
        Tries multiple date formats including Oracle-style (DD-MON-YY).
        """
        from pyspark.sql import functions as F
        from pyspark.sql.types import StringType

        col_type = df.schema[column].dataType
        if isinstance(col_type, StringType):
            ts_col = self._parse_string_to_timestamp(F.col(column))
            return df.filter(ts_col > value)
        return df.filter(F.col(column) > value)

    def _parse_string_to_timestamp(self, col):
        """Parse string column to timestamp, trying multiple formats.

        Supports:
        - ISO format: 2024-04-20 07:11:01
        - Oracle format: 20-APR-24 07:11:01.0 (handles uppercase months)
        """
        from pyspark.sql import functions as F

        result = F.to_timestamp(col)

        result = F.coalesce(result, F.to_timestamp(col, "yyyy-MM-dd HH:mm:ss"))
        result = F.coalesce(result, F.to_timestamp(col, "yyyy-MM-dd'T'HH:mm:ss"))
        result = F.coalesce(result, F.to_timestamp(col, "MM/dd/yyyy HH:mm:ss"))

        col_oracle = F.concat(
            F.substring(col, 1, 3),
            F.upper(F.substring(col, 4, 1)),
            F.lower(F.substring(col, 5, 2)),
            F.substring(col, 7, 100),
        )
        result = F.coalesce(result, F.to_timestamp(col_oracle, "dd-MMM-yy HH:mm:ss.S"))
        result = F.coalesce(result, F.to_timestamp(col_oracle, "dd-MMM-yy HH:mm:ss"))

        return result

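    # Sketch of what _parse_string_to_timestamp does with an Oracle-style
    # value (hypothetical input): "20-APR-24 07:11:01.0" yields NULL for every
    # ISO pattern, is re-cased to "20-Apr-24 07:11:01.0" by the
    # substring/concat expression (Spark's MMM pattern expects mixed case),
    # and then parses with "dd-MMM-yy HH:mm:ss.S" to 2024-04-20 07:11:01.
    # ISO inputs such as "2024-04-20 07:11:01" short-circuit at the first
    # to_timestamp(col) attempt.
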
    def filter_coalesce(self, df, col1: str, col2: str, op: str, value: Any) -> Any:
        """Filter using COALESCE(col1, col2) op value.

        Automatically casts string columns to timestamp for proper comparison.
        Tries multiple date formats including Oracle-style (DD-MON-YY).
        """
        from pyspark.sql import functions as F
        from pyspark.sql.types import StringType

        col1_type = df.schema[col1].dataType
        col2_type = df.schema[col2].dataType

        if isinstance(col1_type, StringType):
            c1 = self._parse_string_to_timestamp(F.col(col1))
        else:
            c1 = F.col(col1)

        if isinstance(col2_type, StringType):
            c2 = self._parse_string_to_timestamp(F.col(col2))
        else:
            c2 = F.col(col2)

        coalesced = F.coalesce(c1, c2)

        if op == ">":
            return df.filter(coalesced > value)
        elif op == ">=":
            return df.filter(coalesced >= value)
        elif op == "<":
            return df.filter(coalesced < value)
        elif op == "<=":
            return df.filter(coalesced <= value)
        elif op == "=":
            return df.filter(coalesced == value)
        else:
            return df.filter(f"COALESCE({col1}, {col2}) {op} '{value}'")

    def add_write_metadata(
        self,
        df: Any,
        metadata_config: Any,
        source_connection: Optional[str] = None,
        source_table: Optional[str] = None,
        source_path: Optional[str] = None,
        is_file_source: bool = False,
    ) -> Any:
        """Add metadata columns to DataFrame before writing (Bronze layer lineage).

        Args:
            df: Spark DataFrame
            metadata_config: WriteMetadataConfig or True (for all defaults)
            source_connection: Name of the source connection
            source_table: Name of the source table (SQL sources)
            source_path: Path of the source file (file sources)
            is_file_source: True if source is a file-based read

        Returns:
            DataFrame with metadata columns added
        """
        from pyspark.sql import functions as F

        from odibi.config import WriteMetadataConfig

        if metadata_config is True:
            config = WriteMetadataConfig()
        elif isinstance(metadata_config, WriteMetadataConfig):
            config = metadata_config
        else:
            return df

        if config.extracted_at:
            df = df.withColumn("_extracted_at", F.current_timestamp())

        if config.source_file and is_file_source and source_path:
            df = df.withColumn("_source_file", F.lit(source_path))

        if config.source_connection and source_connection:
            df = df.withColumn("_source_connection", F.lit(source_connection))

        if config.source_table and source_table:
            df = df.withColumn("_source_table", F.lit(source_table))

        return df