odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2081 @@
"""SQL Server MERGE and overwrite writer for incremental sync operations.

Phase 1: Spark → SQL Server MERGE via staging table.
Phase 2: Enhanced overwrite strategies and validations.
Phase 3: Pandas engine support.
Phase 4: Polars engine support, auto schema/table creation, schema evolution, batch processing.
"""

from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from odibi.config import (
    SqlServerAuditColsConfig,
    SqlServerMergeOptions,
    SqlServerMergeValidationConfig,
    SqlServerOverwriteOptions,
    SqlServerOverwriteStrategy,
    SqlServerSchemaEvolutionMode,
)
from odibi.utils.logging_context import get_logging_context


# Type mapping for schema inference
POLARS_TO_SQL_TYPE_MAP: Dict[str, str] = {
    "Int8": "TINYINT",
    "Int16": "SMALLINT",
    "Int32": "INT",
    "Int64": "BIGINT",
    "UInt8": "TINYINT",
    "UInt16": "SMALLINT",
    "UInt32": "INT",
    "UInt64": "BIGINT",
    "Float32": "REAL",
    "Float64": "FLOAT",
    "Boolean": "BIT",
    "Utf8": "NVARCHAR(MAX)",
    "String": "NVARCHAR(MAX)",
    "Date": "DATE",
    "Datetime": "DATETIME2",
    "Time": "TIME",
    "Duration": "BIGINT",
    "Binary": "VARBINARY(MAX)",
    "Null": "NVARCHAR(1)",
}

PANDAS_TO_SQL_TYPE_MAP: Dict[str, str] = {
    "int8": "TINYINT",
    "int16": "SMALLINT",
    "int32": "INT",
    "int64": "BIGINT",
    "uint8": "TINYINT",
    "uint16": "SMALLINT",
    "uint32": "INT",
    "uint64": "BIGINT",
    "float16": "REAL",
    "float32": "REAL",
    "float64": "FLOAT",
    "bool": "BIT",
    "boolean": "BIT",
    "object": "NVARCHAR(MAX)",
    "string": "NVARCHAR(MAX)",
    "datetime64[ns]": "DATETIME2",
    "datetime64[us]": "DATETIME2",
    "timedelta64[ns]": "BIGINT",
    "category": "NVARCHAR(MAX)",
}


@dataclass
class MergeResult:
    """Result of a SQL Server MERGE operation."""

    inserted: int = 0
    updated: int = 0
    deleted: int = 0

    @property
    def total_affected(self) -> int:
        return self.inserted + self.updated + self.deleted


@dataclass
class OverwriteResult:
    """Result of a SQL Server overwrite operation."""

    rows_written: int = 0
    strategy: str = "truncate_insert"


@dataclass
class ValidationResult:
    """Result of data validation checks."""

    is_valid: bool = True
    null_key_count: int = 0
    duplicate_key_count: int = 0
    errors: List[str] = None

    def __post_init__(self):
        if self.errors is None:
            self.errors = []


class SqlServerMergeWriter:
    """
    Executes SQL Server MERGE and overwrite operations.

    Supports:
    - MERGE via staging table pattern
    - Enhanced overwrite with multiple strategies
    - Data validations (null keys, duplicate keys)
    - Both Spark and Pandas DataFrames
    """

    def __init__(self, connection: Any):
        """
        Initialize the writer with a SQL Server connection.

        Args:
            connection: Connection object with execute_sql and get_spark_options methods
        """
        self.connection = connection
        self.ctx = get_logging_context()

    def get_staging_table_name(self, target_table: str, staging_schema: str) -> str:
        """
        Generate staging table name from target table.

        Args:
            target_table: Target table name (e.g., 'sales.fact_orders')
            staging_schema: Schema for staging table

        Returns:
            Staging table name (e.g., '[staging].[fact_orders_staging]')
        """
        if "." in target_table:
            _, table_name = target_table.split(".", 1)
        else:
            table_name = target_table

        table_name = table_name.strip("[]")
        return f"[{staging_schema}].[{table_name}_staging]"

    def escape_column(self, col: str) -> str:
        """Escape column name for SQL Server."""
        col = col.strip("[]")
        return f"[{col}]"

    def parse_table_name(self, table: str) -> Tuple[str, str]:
        """
        Parse table name into schema and table parts.

        Args:
            table: Table name (e.g., 'sales.fact_orders' or 'fact_orders')

        Returns:
            Tuple of (schema, table_name)
        """
        if "." in table:
            schema, table_name = table.split(".", 1)
        else:
            schema = "dbo"
            table_name = table

        schema = schema.strip("[]")
        table_name = table_name.strip("[]")
        return schema, table_name

    def get_escaped_table_name(self, table: str) -> str:
        """Get fully escaped table name."""
        schema, table_name = self.parse_table_name(table)
        return f"[{schema}].[{table_name}]"
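Every identifier the writer emits is built from the three naming helpers above. A minimal sketch of their expected behaviour — illustration only, not part of the listed file, assuming the odibi package is importable; the fake connection below is a hypothetical stand-in:

from odibi.writers.sql_server_writer import SqlServerMergeWriter

class _FakeConnection:
    """Hypothetical stand-in so the writer can be constructed; never queried here."""
    def execute_sql(self, sql):
        return []

writer = SqlServerMergeWriter(_FakeConnection())
assert writer.parse_table_name("sales.fact_orders") == ("sales", "fact_orders")
assert writer.parse_table_name("fact_orders") == ("dbo", "fact_orders")
assert writer.get_escaped_table_name("sales.fact_orders") == "[sales].[fact_orders]"
assert writer.get_staging_table_name("sales.fact_orders", "staging") == "[staging].[fact_orders_staging]"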
    def check_table_exists(self, table: str) -> bool:
        """
        Check if a table exists in SQL Server.

        Args:
            table: Table name (e.g., 'sales.fact_orders')

        Returns:
            True if table exists
        """
        schema, table_name = self.parse_table_name(table)
        sql = f"""
            SELECT 1 FROM INFORMATION_SCHEMA.TABLES
            WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table_name}'
        """
        result = self.connection.execute_sql(sql)
        # Result is now a list of rows (fetchall already called in AzureSQL.execute)
        row = result[0] if result else None
        return row is not None

    def read_target_hashes(
        self,
        target_table: str,
        merge_keys: List[str],
        hash_column: str,
    ) -> List[Dict[str, Any]]:
        """
        Read merge keys and hash column from target table for incremental comparison.

        Args:
            target_table: Target table name
            merge_keys: Key columns
            hash_column: Hash column name

        Returns:
            List of dicts with keys and hash values, or empty list if hash column missing
        """
        escaped_table = self.get_escaped_table_name(target_table)

        # Check if hash column exists in target table before querying
        existing_columns = self.get_table_columns(target_table)
        if existing_columns and hash_column not in existing_columns:
            self.ctx.info(
                "Hash column not found in target table, skipping incremental comparison",
                hash_column=hash_column,
                target_table=target_table,
            )
            return []

        key_cols = ", ".join([self.escape_column(k) for k in merge_keys])
        hash_col = self.escape_column(hash_column)

        sql = f"SELECT {key_cols}, {hash_col} FROM {escaped_table}"
        self.ctx.debug("Reading target hashes for incremental merge", table=target_table)

        result = self.connection.execute_sql(sql)
        if not result:
            return []

        # Convert SQLAlchemy Row objects to dicts for Spark compatibility
        # Row objects have _mapping attribute or can be accessed via _asdict()
        dicts = []
        for row in result:
            if hasattr(row, "_asdict"):
                dicts.append(row._asdict())
            elif hasattr(row, "_mapping"):
                dicts.append(dict(row._mapping))
            else:
                # Fallback: assume row is dict-like or tuple with known columns
                columns = merge_keys + [hash_column]
                dicts.append(dict(zip(columns, row)))
        return dicts

    def get_hash_column_name(
        self,
        df_columns: List[str],
        options_hash_column: Optional[str],
    ) -> Optional[str]:
        """
        Determine which hash column to use for incremental merge.

        Args:
            df_columns: List of DataFrame column names
            options_hash_column: Explicitly configured hash column

        Returns:
            Hash column name or None if not available
        """
        if options_hash_column:
            if options_hash_column in df_columns:
                return options_hash_column
            else:
                self.ctx.warning(
                    f"Configured hash_column '{options_hash_column}' not found in DataFrame"
                )
                return None

        # Auto-detect common hash column names
        for candidate in ["_hash_diff", "_hash", "hash_diff", "row_hash"]:
            if candidate in df_columns:
                self.ctx.debug(f"Auto-detected hash column: {candidate}")
                return candidate

        return None

    def compute_hash_spark(
        self, df: Any, columns: List[str], hash_col_name: str = "_computed_hash"
    ):
        """
        Compute hash column for Spark DataFrame.

        Args:
            df: Spark DataFrame
            columns: Columns to include in hash
            hash_col_name: Name for the computed hash column

        Returns:
            DataFrame with hash column added
        """
        from pyspark.sql import functions as F

        # Concatenate columns and compute MD5 hash
        concat_expr = F.concat_ws(
            "||", *[F.coalesce(F.col(c).cast("string"), F.lit("NULL")) for c in columns]
        )
        return df.withColumn(hash_col_name, F.md5(concat_expr))

    def compute_hash_pandas(
        self, df: Any, columns: List[str], hash_col_name: str = "_computed_hash"
    ):
        """
        Compute hash column for Pandas DataFrame.

        Args:
            df: Pandas DataFrame
            columns: Columns to include in hash
            hash_col_name: Name for the computed hash column

        Returns:
            DataFrame with hash column added
        """
        import hashlib

        def row_hash(row):
            concat = "||".join(str(row[c]) if row[c] is not None else "NULL" for c in columns)
            return hashlib.md5(concat.encode()).hexdigest()

        df = df.copy()
        df[hash_col_name] = df.apply(row_hash, axis=1)
        return df

    def compute_hash_polars(
        self, df: Any, columns: List[str], hash_col_name: str = "_computed_hash"
    ):
        """
        Compute hash column for Polars DataFrame.

        Args:
            df: Polars DataFrame
            columns: Columns to include in hash
            hash_col_name: Name for the computed hash column

        Returns:
            DataFrame with hash column added
        """
        import polars as pl

        # Concatenate columns and compute hash
        concat_expr = pl.concat_str(
            [pl.col(c).cast(pl.Utf8).fill_null("NULL") for c in columns],
            separator="||",
        )
        return df.with_columns(concat_expr.hash().cast(pl.Utf8).alias(hash_col_name))
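The hash helpers above are what the incremental path keys on: tracked columns are joined with '||' (NULLs as the literal string 'NULL') and digested, and later only rows whose digest is missing from, or differs from, the target's stored hash are kept for staging. A standalone sketch of the Spark/pandas construction — illustration only, not part of the listed file; the column names are hypothetical:

import hashlib

import pandas as pd

df = pd.DataFrame({"order_id": [1, 2], "qty": ["10", None]})

def md5_of(row, columns):
    # Same "||"-joined, NULL-filled digest that compute_hash_spark/compute_hash_pandas build.
    concat = "||".join(str(row[c]) if row[c] is not None else "NULL" for c in columns)
    return hashlib.md5(concat.encode()).hexdigest()

df["_computed_hash"] = df.apply(lambda row: md5_of(row, ["order_id", "qty"]), axis=1)
print(df[["order_id", "_computed_hash"]])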
    def filter_changed_rows_spark(
        self,
        source_df: Any,
        target_hashes: List[Dict[str, Any]],
        merge_keys: List[str],
        hash_column: str,
    ):
        """
        Filter Spark DataFrame to only rows that are new or changed.

        Args:
            source_df: Source Spark DataFrame
            target_hashes: List of dicts with target keys and hashes
            merge_keys: Key columns
            hash_column: Hash column name

        Returns:
            Filtered DataFrame with only new/changed rows
        """
        from pyspark.sql import functions as F

        if not target_hashes:
            # No existing data, all rows are new
            return source_df

        # Get SparkSession from DataFrame
        spark = source_df.sparkSession

        # Create DataFrame from target hashes
        target_df = spark.createDataFrame(target_hashes)

        # Rename hash column in target to avoid collision
        target_hash_col = f"_target_{hash_column}"
        target_df = target_df.withColumnRenamed(hash_column, target_hash_col)

        # Left join source with target on merge keys
        join_condition = [source_df[k] == target_df[k] for k in merge_keys]
        joined = source_df.join(target_df, join_condition, "left")

        # Filter to rows where:
        # 1. No match in target (new rows) - target hash is null
        # 2. Hash differs (changed rows)
        changed = joined.filter(
            F.col(target_hash_col).isNull() | (F.col(hash_column) != F.col(target_hash_col))
        )

        # Drop the target columns
        for k in merge_keys:
            changed = changed.drop(target_df[k])
        changed = changed.drop(target_hash_col)

        return changed

    def filter_changed_rows_pandas(
        self,
        source_df: Any,
        target_hashes: List[Dict[str, Any]],
        merge_keys: List[str],
        hash_column: str,
    ):
        """
        Filter Pandas DataFrame to only rows that are new or changed.

        Args:
            source_df: Source Pandas DataFrame
            target_hashes: List of dicts with target keys and hashes
            merge_keys: Key columns
            hash_column: Hash column name

        Returns:
            Filtered DataFrame with only new/changed rows
        """
        import pandas as pd

        if not target_hashes:
            return source_df

        target_df = pd.DataFrame(target_hashes)
        target_hash_col = f"_target_{hash_column}"
        target_df = target_df.rename(columns={hash_column: target_hash_col})

        # Merge to find matching rows
        merged = source_df.merge(target_df, on=merge_keys, how="left")

        # Filter to new or changed rows
        is_new = merged[target_hash_col].isna()
        is_changed = merged[hash_column] != merged[target_hash_col]
        changed = merged[is_new | is_changed].copy()

        # Drop the target hash column
        changed = changed.drop(columns=[target_hash_col])

        return changed

    def filter_changed_rows_polars(
        self,
        source_df: Any,
        target_hashes: List[Dict[str, Any]],
        merge_keys: List[str],
        hash_column: str,
    ):
        """
        Filter Polars DataFrame to only rows that are new or changed.

        Args:
            source_df: Source Polars DataFrame
            target_hashes: List of dicts with target keys and hashes
            merge_keys: Key columns
            hash_column: Hash column name

        Returns:
            Filtered DataFrame with only new/changed rows
        """
        import polars as pl

        if not target_hashes:
            return source_df

        target_df = pl.DataFrame(target_hashes)
        target_hash_col = f"_target_{hash_column}"
        target_df = target_df.rename({hash_column: target_hash_col})

        # Join to find matching rows
        joined = source_df.join(target_df, on=merge_keys, how="left")

        # Filter to new or changed rows
        changed = joined.filter(
            pl.col(target_hash_col).is_null() | (pl.col(hash_column) != pl.col(target_hash_col))
        )

        # Drop the target hash column
        changed = changed.drop(target_hash_col)

        return changed

    def validate_keys_spark(
        self,
        df: Any,
        merge_keys: List[str],
        config: Optional[SqlServerMergeValidationConfig] = None,
    ) -> ValidationResult:
        """
        Validate merge keys in a Spark DataFrame.

        Args:
            df: Spark DataFrame
            merge_keys: Key columns to validate
            config: Validation configuration

        Returns:
            ValidationResult with validation status
        """
        config = config or SqlServerMergeValidationConfig()
        result = ValidationResult()

        if config.check_null_keys:
            from pyspark.sql import functions as F

            null_condition = F.lit(False)
            for key in merge_keys:
                null_condition = null_condition | F.col(key).isNull()

            null_count = df.filter(null_condition).count()
            if null_count > 0:
                result.null_key_count = null_count
                result.errors.append(
                    f"Found {null_count} rows with NULL values in merge keys: {merge_keys}"
                )
                result.is_valid = False

        if config.check_duplicate_keys:
            total_count = df.count()
            distinct_count = df.select(*merge_keys).distinct().count()
            duplicate_count = total_count - distinct_count

            if duplicate_count > 0:
                result.duplicate_key_count = duplicate_count
                result.errors.append(
                    f"Found {duplicate_count} duplicate key combinations in merge keys: {merge_keys}"
                )
                result.is_valid = False

        return result

    def validate_keys_pandas(
        self,
        df: Any,
        merge_keys: List[str],
        config: Optional[SqlServerMergeValidationConfig] = None,
    ) -> ValidationResult:
        """
        Validate merge keys in a Pandas DataFrame.

        Args:
            df: Pandas DataFrame
            merge_keys: Key columns to validate
            config: Validation configuration

        Returns:
            ValidationResult with validation status
        """
        config = config or SqlServerMergeValidationConfig()
        result = ValidationResult()

        if config.check_null_keys:
            null_mask = df[merge_keys].isnull().any(axis=1)
            null_count = null_mask.sum()

            if null_count > 0:
                result.null_key_count = int(null_count)
                result.errors.append(
                    f"Found {null_count} rows with NULL values in merge keys: {merge_keys}"
                )
                result.is_valid = False

        if config.check_duplicate_keys:
            duplicates = df.duplicated(subset=merge_keys, keep=False)
            duplicate_count = (
                duplicates.sum() - df.duplicated(subset=merge_keys, keep="first").sum()
            )

            if duplicate_count > 0:
                result.duplicate_key_count = int(duplicate_count)
                result.errors.append(
                    f"Found {duplicate_count} duplicate key combinations in merge keys: {merge_keys}"
                )
                result.is_valid = False

        return result

    def validate_keys_polars(
        self,
        df: Any,
        merge_keys: List[str],
        config: Optional[SqlServerMergeValidationConfig] = None,
    ) -> ValidationResult:
        """
        Validate merge keys in a Polars DataFrame/LazyFrame.

        Args:
            df: Polars DataFrame or LazyFrame
            merge_keys: Key columns to validate
            config: Validation configuration

        Returns:
            ValidationResult with validation status
        """
        try:
            import polars as pl
        except ImportError:
            raise ImportError("Polars not installed. Run 'pip install polars'.")

        config = config or SqlServerMergeValidationConfig()
        result = ValidationResult()

        is_lazy = isinstance(df, pl.LazyFrame)
        if is_lazy:
            df_materialized = df.collect()
        else:
            df_materialized = df

        if config.check_null_keys:
            null_condition = pl.lit(False)
            for key in merge_keys:
                null_condition = null_condition | pl.col(key).is_null()

            null_count = df_materialized.filter(null_condition).height

            if null_count > 0:
                result.null_key_count = null_count
                result.errors.append(
                    f"Found {null_count} rows with NULL values in merge keys: {merge_keys}"
                )
                result.is_valid = False

        if config.check_duplicate_keys:
            total_count = df_materialized.height
            distinct_count = df_materialized.select(merge_keys).unique().height
            duplicate_count = total_count - distinct_count

            if duplicate_count > 0:
                result.duplicate_key_count = duplicate_count
                result.errors.append(
                    f"Found {duplicate_count} duplicate key combinations in merge keys: {merge_keys}"
                )
                result.is_valid = False

        return result
    def check_schema_exists(self, schema: str) -> bool:
        """Check if a schema exists in SQL Server."""
        sql = f"SELECT 1 FROM sys.schemas WHERE name = '{schema}'"
        result = self.connection.execute_sql(sql)
        # Result is now a list of rows (fetchall already called in AzureSQL.execute)
        row = result[0] if result else None
        return row is not None

    def create_schema(self, schema: str) -> None:
        """Create a schema if it doesn't exist."""
        if not self.check_schema_exists(schema):
            sql = f"CREATE SCHEMA [{schema}]"
            self.ctx.info("Creating schema", schema=schema)
            self.connection.execute_sql(sql)

    def get_table_columns(self, table: str) -> Dict[str, str]:
        """
        Get column names and full types (with length/precision) for a table.

        Returns:
            Dictionary mapping column names to full SQL types (e.g., 'nvarchar(255)')
        """
        schema, table_name = self.parse_table_name(table)
        sql = f"""
            SELECT
                COLUMN_NAME,
                DATA_TYPE,
                CHARACTER_MAXIMUM_LENGTH,
                NUMERIC_PRECISION,
                NUMERIC_SCALE
            FROM INFORMATION_SCHEMA.COLUMNS
            WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table_name}'
            ORDER BY ORDINAL_POSITION
        """
        result = self.connection.execute_sql(sql)
        columns = {}
        for row in result:
            if isinstance(row, dict):
                col_name = row["COLUMN_NAME"]
                data_type = row["DATA_TYPE"]
                char_len = row.get("CHARACTER_MAXIMUM_LENGTH")
                num_prec = row.get("NUMERIC_PRECISION")
                num_scale = row.get("NUMERIC_SCALE")
            else:
                col_name = row[0]
                data_type = row[1]
                char_len = row[2] if len(row) > 2 else None
                num_prec = row[3] if len(row) > 3 else None
                num_scale = row[4] if len(row) > 4 else None

            # Build full type with length/precision
            if data_type.lower() in ("nvarchar", "varchar", "char", "nchar", "binary", "varbinary"):
                if char_len == -1:
                    full_type = f"{data_type}(MAX)"
                elif char_len:
                    full_type = f"{data_type}({char_len})"
                else:
                    full_type = f"{data_type}(MAX)"
            elif data_type.lower() in ("decimal", "numeric"):
                if num_prec and num_scale is not None:
                    full_type = f"{data_type}({num_prec},{num_scale})"
                else:
                    full_type = data_type
            else:
                full_type = data_type

            columns[col_name] = full_type
        return columns

    def infer_sql_type_pandas(self, dtype: Any) -> str:
        """Infer SQL Server type from Pandas dtype."""
        dtype_str = str(dtype).lower()
        for pattern, sql_type in PANDAS_TO_SQL_TYPE_MAP.items():
            if pattern in dtype_str:
                return sql_type
        return "NVARCHAR(MAX)"

    def infer_sql_type_polars(self, dtype: Any) -> str:
        """Infer SQL Server type from Polars dtype."""
        dtype_str = str(dtype)
        for pattern, sql_type in POLARS_TO_SQL_TYPE_MAP.items():
            if pattern in dtype_str:
                return sql_type
        return "NVARCHAR(MAX)"

    def create_table_from_pandas(
        self,
        df: Any,
        table: str,
        audit_cols: Optional[SqlServerAuditColsConfig] = None,
    ) -> None:
        """
        Create a SQL Server table from Pandas DataFrame schema.

        Args:
            df: Pandas DataFrame
            table: Target table name
            audit_cols: Optional audit column config to add created_ts/updated_ts columns
        """
        schema, table_name = self.parse_table_name(table)
        columns = []
        existing_cols = set()
        for col_name, dtype in df.dtypes.items():
            sql_type = self.infer_sql_type_pandas(dtype)
            escaped_col = self.escape_column(col_name)
            columns.append(f"{escaped_col} {sql_type} NULL")
            existing_cols.add(col_name)

        if audit_cols:
            if audit_cols.created_col and audit_cols.created_col not in existing_cols:
                escaped_col = self.escape_column(audit_cols.created_col)
                columns.append(f"{escaped_col} DATETIME2 NULL")
                self.ctx.debug(f"Adding audit column: {audit_cols.created_col}")
            if audit_cols.updated_col and audit_cols.updated_col not in existing_cols:
                escaped_col = self.escape_column(audit_cols.updated_col)
                columns.append(f"{escaped_col} DATETIME2 NULL")
                self.ctx.debug(f"Adding audit column: {audit_cols.updated_col}")

        columns_sql = ",\n ".join(columns)
        sql = f"CREATE TABLE [{schema}].[{table_name}] (\n {columns_sql}\n)"
        self.ctx.info("Creating table from DataFrame", table=table)
        self.connection.execute_sql(sql)

    def create_table_from_polars(
        self,
        df: Any,
        table: str,
        audit_cols: Optional[SqlServerAuditColsConfig] = None,
    ) -> None:
        """
        Create a SQL Server table from Polars DataFrame schema.

        Args:
            df: Polars DataFrame or LazyFrame
            table: Target table name
            audit_cols: Optional audit column config to add created_ts/updated_ts columns
        """
        try:
            import polars as pl
        except ImportError:
            raise ImportError("Polars not installed. Run 'pip install polars'.")

        schema_name, table_name = self.parse_table_name(table)

        if isinstance(df, pl.LazyFrame):
            df_schema = df.collect_schema()
        else:
            df_schema = df.schema

        columns = []
        existing_cols = set()
        for col_name, dtype in df_schema.items():
            sql_type = self.infer_sql_type_polars(dtype)
            escaped_col = self.escape_column(col_name)
            columns.append(f"{escaped_col} {sql_type} NULL")
            existing_cols.add(col_name)

        if audit_cols:
            if audit_cols.created_col and audit_cols.created_col not in existing_cols:
                escaped_col = self.escape_column(audit_cols.created_col)
                columns.append(f"{escaped_col} DATETIME2 NULL")
                self.ctx.debug(f"Adding audit column: {audit_cols.created_col}")
            if audit_cols.updated_col and audit_cols.updated_col not in existing_cols:
                escaped_col = self.escape_column(audit_cols.updated_col)
                columns.append(f"{escaped_col} DATETIME2 NULL")
                self.ctx.debug(f"Adding audit column: {audit_cols.updated_col}")

        columns_sql = ",\n ".join(columns)
        sql = f"CREATE TABLE [{schema_name}].[{table_name}] (\n {columns_sql}\n)"
        self.ctx.info("Creating table from Polars DataFrame", table=table)
        self.connection.execute_sql(sql)

    def add_columns(self, table: str, new_columns: Dict[str, str]) -> None:
        """Add new columns to an existing table."""
        if not new_columns:
            return

        escaped_table = self.get_escaped_table_name(table)
        for col_name, sql_type in new_columns.items():
            escaped_col = self.escape_column(col_name)
            sql = f"ALTER TABLE {escaped_table} ADD {escaped_col} {sql_type} NULL"
            self.ctx.info("Adding column to table", table=table, column=col_name)
            self.connection.execute_sql(sql)

    def _fix_max_columns_for_indexing(self, table: str, columns: List[str]) -> None:
        """
        Convert MAX columns to sized types for indexing compatibility.

        SQL Server cannot use nvarchar(MAX), varchar(MAX), or varbinary(MAX)
        columns in primary keys or indexes. This method converts them to
        sized equivalents (e.g., nvarchar(450) - max size for indexed columns).

        Args:
            table: Table name
            columns: Columns that will be used in index/primary key
        """
        escaped_table = self.get_escaped_table_name(table)
        existing_cols = self.get_table_columns(table)
        # Build case-insensitive lookup
        existing_cols_lower = {k.lower(): v for k, v in existing_cols.items()}

        for col in columns:
            col_type = existing_cols_lower.get(col.lower(), "")
            col_type_upper = col_type.upper()

            # Check if it's a MAX type that needs conversion
            if "(MAX)" in col_type_upper:
                # SQL Server max key length is 900 bytes for clustered index
                # nvarchar uses 2 bytes per char, so max is 450 chars
                if "NVARCHAR" in col_type_upper or "NCHAR" in col_type_upper:
                    new_type = "NVARCHAR(450)"
                elif "VARCHAR" in col_type_upper or "CHAR" in col_type_upper:
                    new_type = "VARCHAR(900)"
                elif "VARBINARY" in col_type_upper or "BINARY" in col_type_upper:
                    new_type = "VARBINARY(900)"
                else:
                    continue  # Unknown MAX type, skip

                escaped_col = self.escape_column(col)
                alter_sql = f"ALTER TABLE {escaped_table} ALTER COLUMN {escaped_col} {new_type}"
                self.ctx.info(
                    "Converting MAX column to sized type for indexing",
                    table=table,
                    column=col,
                    old_type=col_type,
                    new_type=new_type,
                )
                self.connection.execute_sql(alter_sql)

    def create_primary_key(self, table: str, columns: List[str]) -> None:
        """
        Create a clustered primary key on the specified columns.

        First makes columns NOT NULL (required for PK), then adds the constraint.

        Args:
            table: Table name (e.g., 'oee.oee_fact')
            columns: List of column names for the primary key
        """
        escaped_table = self.get_escaped_table_name(table)
        schema, table_name = self.parse_table_name(table)
        pk_name = f"PK_{table_name}"

        # Get column types so we can ALTER to NOT NULL
        existing_cols = self.get_table_columns(table)
        # Build case-insensitive lookup for column types
        existing_cols_lower = {k.lower(): v for k, v in existing_cols.items()}

        # First, make PK columns NOT NULL (required for primary key)
        for col in columns:
            escaped_col = self.escape_column(col)
            col_type = existing_cols_lower.get(col.lower())
            if col_type is None:
                raise ValueError(
                    f"Cannot create primary key: column '{col}' not found in table '{table}'. "
                    f"Available columns: {list(existing_cols.keys())}"
                )
            alter_sql = (
                f"ALTER TABLE {escaped_table} ALTER COLUMN {escaped_col} {col_type} NOT NULL"
            )
            self.ctx.debug(f"Setting column NOT NULL: {col}")
            self.connection.execute_sql(alter_sql)

        # Now create the primary key
        escaped_cols = ", ".join([self.escape_column(c) for c in columns])
        sql = f"""
            ALTER TABLE {escaped_table}
            ADD CONSTRAINT [{pk_name}] PRIMARY KEY CLUSTERED ({escaped_cols})
        """
        self.ctx.info(
            "Creating primary key",
            table=table,
            constraint=pk_name,
            columns=columns,
        )
        self.connection.execute_sql(sql)

    def create_index(self, table: str, columns: List[str], index_name: str = None) -> None:
        """
        Create a nonclustered index on the specified columns.

        Args:
            table: Table name (e.g., 'oee.oee_fact')
            columns: List of column names for the index
            index_name: Optional custom index name (auto-generated if not provided)
        """
        escaped_table = self.get_escaped_table_name(table)
        schema, table_name = self.parse_table_name(table)

        if index_name is None:
            col_suffix = "_".join(columns[:3])  # Use first 3 columns in name
            index_name = f"IX_{table_name}_{col_suffix}"

        escaped_cols = ", ".join([self.escape_column(c) for c in columns])

        sql = f"""
            CREATE NONCLUSTERED INDEX [{index_name}]
            ON {escaped_table} ({escaped_cols})
        """
        self.ctx.info(
            "Creating index",
            table=table,
            index=index_name,
            columns=columns,
        )
        self.connection.execute_sql(sql)

    def handle_schema_evolution_pandas(
        self, df: Any, table: str, evolution_config: Any
    ) -> List[str]:
        """
        Handle schema evolution for Pandas DataFrame.

        Returns list of columns to write (may be subset if mode=ignore).
        """
        if evolution_config is None:
            return list(df.columns)

        mode = evolution_config.mode
        existing_cols = self.get_table_columns(table)
        df_cols = set(df.columns)
        table_cols = set(existing_cols.keys())

        new_cols = df_cols - table_cols

        if mode == SqlServerSchemaEvolutionMode.STRICT:
            if new_cols:
                raise ValueError(
                    f"Schema evolution mode is 'strict' but DataFrame has new columns "
                    f"not in target table: {new_cols}"
                )
            return list(df.columns)

        elif mode == SqlServerSchemaEvolutionMode.EVOLVE:
            if new_cols and evolution_config.add_columns:
                new_cols_with_types = {}
                for col in new_cols:
                    new_cols_with_types[col] = self.infer_sql_type_pandas(df[col].dtype)
                self.add_columns(table, new_cols_with_types)
            return list(df.columns)

        elif mode == SqlServerSchemaEvolutionMode.IGNORE:
            return [c for c in df.columns if c in table_cols]

        return list(df.columns)

    def handle_schema_evolution_polars(
        self, df: Any, table: str, evolution_config: Any
    ) -> List[str]:
        """
        Handle schema evolution for Polars DataFrame.

        Returns list of columns to write (may be subset if mode=ignore).
        """
        try:
            import polars as pl
        except ImportError:
            raise ImportError("Polars not installed. Run 'pip install polars'.")

        if evolution_config is None:
            if isinstance(df, pl.LazyFrame):
                return list(df.collect_schema().names())
            return df.columns

        mode = evolution_config.mode
        existing_cols = self.get_table_columns(table)

        if isinstance(df, pl.LazyFrame):
            df_schema = df.collect_schema()
            df_cols = set(df_schema.names())
        else:
            df_schema = df.schema
            df_cols = set(df.columns)

        table_cols = set(existing_cols.keys())
        new_cols = df_cols - table_cols

        if mode == SqlServerSchemaEvolutionMode.STRICT:
            if new_cols:
                raise ValueError(
                    f"Schema evolution mode is 'strict' but DataFrame has new columns "
                    f"not in target table: {new_cols}"
                )
            return list(df_cols)

        elif mode == SqlServerSchemaEvolutionMode.EVOLVE:
            if new_cols and evolution_config.add_columns:
                new_cols_with_types = {}
                for col in new_cols:
                    new_cols_with_types[col] = self.infer_sql_type_polars(df_schema[col])
                self.add_columns(table, new_cols_with_types)
            return list(df_cols)

        elif mode == SqlServerSchemaEvolutionMode.IGNORE:
            return [c for c in df_cols if c in table_cols]

        return list(df_cols)
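The two schema-evolution handlers above reduce to a set comparison between the incoming DataFrame columns and the existing table columns. A plain-Python sketch of what each mode implies — illustration only, not part of the listed file; the column names are hypothetical:

df_cols = {"order_id", "qty", "discount"}   # columns in the incoming DataFrame
table_cols = {"order_id", "qty"}            # columns already present in SQL Server
new_cols = df_cols - table_cols             # {"discount"}

# strict -> raise ValueError because new_cols is non-empty
# evolve -> ALTER TABLE ... ADD [discount] <inferred type>, then write all df_cols
# ignore -> write only the overlap
ignore_write_cols = [c for c in df_cols if c in table_cols]
print(new_cols, ignore_write_cols)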
    def truncate_staging(self, staging_table: str) -> None:
        """
        Truncate staging table if it exists.

        Args:
            staging_table: Full staging table name (e.g., '[staging].[oee_fact_staging]')
        """
        sql = f"""
            IF OBJECT_ID('{staging_table}', 'U') IS NOT NULL
                TRUNCATE TABLE {staging_table}
        """
        self.ctx.debug("Truncating staging table", staging_table=staging_table)
        self.connection.execute_sql(sql)

    def truncate_table(self, table: str) -> None:
        """Truncate a table."""
        escaped = self.get_escaped_table_name(table)
        sql = f"TRUNCATE TABLE {escaped}"
        self.ctx.debug("Truncating table", table=table)
        self.connection.execute_sql(sql)

    def delete_from_table(self, table: str) -> int:
        """Delete all rows from a table and return count."""
        escaped = self.get_escaped_table_name(table)
        sql = f"DELETE FROM {escaped}; SELECT @@ROWCOUNT AS deleted_count;"
        self.ctx.debug("Deleting from table", table=table)
        result = self.connection.execute_sql(sql)
        # Result is now a list of rows (fetchall already called in AzureSQL.execute)
        row = result[0] if result else None
        if row:
            return row.get("deleted_count", 0) if isinstance(row, dict) else row[0]
        return 0

    def drop_table(self, table: str) -> None:
        """Drop a table if it exists."""
        escaped = self.get_escaped_table_name(table)
        sql = f"DROP TABLE IF EXISTS {escaped}"
        self.ctx.debug("Dropping table", table=table)
        self.connection.execute_sql(sql)

    def build_merge_sql(
        self,
        target_table: str,
        staging_table: str,
        merge_keys: List[str],
        columns: List[str],
        options: Optional[SqlServerMergeOptions] = None,
    ) -> str:
        """
        Build T-SQL MERGE statement.

        Args:
            target_table: Target table name
            staging_table: Staging table name
            merge_keys: Key columns for ON clause
            columns: All columns in the DataFrame
            options: Merge options (conditions, audit cols, etc.)

        Returns:
            T-SQL MERGE statement
        """
        options = options or SqlServerMergeOptions()

        exclude_cols = set(options.exclude_columns)
        audit_created = options.audit_cols.created_col if options.audit_cols else None
        audit_updated = options.audit_cols.updated_col if options.audit_cols else None

        merge_cols = [c for c in columns if c not in exclude_cols]

        update_cols = [c for c in merge_cols if c not in merge_keys and c != audit_created]
        insert_cols = [c for c in merge_cols]

        on_clause = " AND ".join(
            [f"target.{self.escape_column(k)} = source.{self.escape_column(k)}" for k in merge_keys]
        )

        update_set_parts = []
        for col in update_cols:
            if col == audit_updated:
                update_set_parts.append(f"{self.escape_column(col)} = GETUTCDATE()")
            else:
                update_set_parts.append(
                    f"{self.escape_column(col)} = source.{self.escape_column(col)}"
                )
        update_set = ",\n ".join(update_set_parts)

        insert_col_list = ", ".join([self.escape_column(c) for c in insert_cols])
        insert_value_parts = []
        for col in insert_cols:
            if col == audit_created or col == audit_updated:
                insert_value_parts.append("GETUTCDATE()")
            else:
                insert_value_parts.append(f"source.{self.escape_column(col)}")
        insert_values = ", ".join(insert_value_parts)

        target_escaped = self.get_escaped_table_name(target_table)

        sql_parts = [
            "DECLARE @MergeActions TABLE (action NVARCHAR(10));",
            "",
            f"MERGE {target_escaped} AS target",
            f"USING {staging_table} AS source",
            f"ON {on_clause}",
        ]

        if options.update_condition:
            sql_parts.append(f"WHEN MATCHED AND {options.update_condition} THEN")
        else:
            sql_parts.append("WHEN MATCHED THEN")

        sql_parts.append(" UPDATE SET")
        sql_parts.append(f" {update_set}")

        if options.delete_condition:
            sql_parts.append(f"WHEN MATCHED AND {options.delete_condition} THEN")
            sql_parts.append(" DELETE")

        if options.insert_condition:
            sql_parts.append(f"WHEN NOT MATCHED BY TARGET AND {options.insert_condition} THEN")
        else:
            sql_parts.append("WHEN NOT MATCHED BY TARGET THEN")

        sql_parts.append(f" INSERT ({insert_col_list})")
        sql_parts.append(f" VALUES ({insert_values})")

        sql_parts.append("OUTPUT $action INTO @MergeActions;")
        sql_parts.append("")
        sql_parts.append("SELECT")
        sql_parts.append(" SUM(CASE WHEN action = 'INSERT' THEN 1 ELSE 0 END) AS inserted,")
        sql_parts.append(" SUM(CASE WHEN action = 'UPDATE' THEN 1 ELSE 0 END) AS updated,")
        sql_parts.append(" SUM(CASE WHEN action = 'DELETE' THEN 1 ELSE 0 END) AS deleted")
        sql_parts.append("FROM @MergeActions;")

        return "\n".join(sql_parts)
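For orientation, the statement assembled by build_merge_sql above has roughly the following shape for a single merge key, assuming no audit columns, no excluded columns, and no conditional clauses are configured — a sketch of the expected output, not captured from a run; the table and column names are hypothetical:

# Approximate T-SQL for merge_keys=["order_id"], columns=["order_id", "qty"].
EXPECTED_MERGE_SHAPE = """
DECLARE @MergeActions TABLE (action NVARCHAR(10));

MERGE [sales].[fact_orders] AS target
USING [staging].[fact_orders_staging] AS source
ON target.[order_id] = source.[order_id]
WHEN MATCHED THEN
 UPDATE SET
 [qty] = source.[qty]
WHEN NOT MATCHED BY TARGET THEN
 INSERT ([order_id], [qty])
 VALUES (source.[order_id], source.[qty])
OUTPUT $action INTO @MergeActions;

SELECT
 SUM(CASE WHEN action = 'INSERT' THEN 1 ELSE 0 END) AS inserted,
 SUM(CASE WHEN action = 'UPDATE' THEN 1 ELSE 0 END) AS updated,
 SUM(CASE WHEN action = 'DELETE' THEN 1 ELSE 0 END) AS deleted
FROM @MergeActions;
"""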
    def execute_merge(
        self,
        target_table: str,
        staging_table: str,
        merge_keys: List[str],
        columns: List[str],
        options: Optional[SqlServerMergeOptions] = None,
    ) -> MergeResult:
        """
        Execute MERGE operation and return counts.

        Args:
            target_table: Target table name
            staging_table: Staging table name
            merge_keys: Key columns for ON clause
            columns: All columns in the DataFrame
            options: Merge options

        Returns:
            MergeResult with insert/update/delete counts
        """
        sql = self.build_merge_sql(
            target_table=target_table,
            staging_table=staging_table,
            merge_keys=merge_keys,
            columns=columns,
            options=options,
        )

        self.ctx.debug(
            "Executing MERGE",
            target_table=target_table,
            staging_table=staging_table,
            merge_keys=merge_keys,
        )

        try:
            result = self.connection.execute_sql(sql)

            # Result is now a list of rows (fetchall already called in AzureSQL.execute)
            row = result[0] if result else None
            if row:
                if isinstance(row, dict):
                    merge_result = MergeResult(
                        inserted=row.get("inserted", 0) or 0,
                        updated=row.get("updated", 0) or 0,
                        deleted=row.get("deleted", 0) or 0,
                    )
                else:
                    merge_result = MergeResult(
                        inserted=row[0] or 0,
                        updated=row[1] or 0,
                        deleted=row[2] or 0,
                    )
            else:
                merge_result = MergeResult()

            self.ctx.info(
                "MERGE completed",
                target_table=target_table,
                inserted=merge_result.inserted,
                updated=merge_result.updated,
                deleted=merge_result.deleted,
                total_affected=merge_result.total_affected,
            )

            return merge_result

        except Exception as e:
            self.ctx.error(
                "MERGE failed",
                target_table=target_table,
                error_type=type(e).__name__,
                error_message=str(e),
            )
            raise

    def merge(
        self,
        df: Any,
        spark_engine: Any,
        target_table: str,
        merge_keys: List[str],
        options: Optional[SqlServerMergeOptions] = None,
        jdbc_options: Optional[Dict[str, Any]] = None,
    ) -> MergeResult:
        """
        Execute full merge operation: validation + staging write + MERGE.

        Args:
            df: Spark DataFrame to merge
            spark_engine: SparkEngine instance for writing to staging
            target_table: Target table name (e.g., 'oee.oee_fact')
            merge_keys: Key columns for ON clause
            options: Merge options
            jdbc_options: JDBC connection options

        Returns:
            MergeResult with counts
        """
        options = options or SqlServerMergeOptions()
        jdbc_options = jdbc_options or {}

        # Auto-create schema if needed
        if options.auto_create_schema:
            schema, _ = self.parse_table_name(target_table)
            if not self.check_schema_exists(schema):
                self.create_schema(schema)

        # Check if table exists, auto-create if configured
        if not self.check_table_exists(target_table):
            if options.auto_create_table:
                self.ctx.info(
                    "Auto-creating target table from Spark DataFrame",
                    target_table=target_table,
                )

                # Create table using JDBC write with overwrite mode (initial load)
                staging_jdbc_options = {**jdbc_options, "dbtable": target_table}
                df.write.format("jdbc").options(**staging_jdbc_options).mode("overwrite").save()

                row_count = df.count()

                # Add audit columns if configured (JDBC doesn't create them automatically)
                if options.audit_cols:
                    audit_cols_to_add = {}
                    existing_cols = self.get_table_columns(target_table)
                    if (
                        options.audit_cols.created_col
                        and options.audit_cols.created_col not in existing_cols
                    ):
                        audit_cols_to_add[options.audit_cols.created_col] = "DATETIME2"
                    if (
                        options.audit_cols.updated_col
                        and options.audit_cols.updated_col not in existing_cols
                    ):
                        audit_cols_to_add[options.audit_cols.updated_col] = "DATETIME2"
                    if audit_cols_to_add:
                        self.add_columns(target_table, audit_cols_to_add)

                    # Populate audit columns for all rows on first load
                    escaped_table = self.get_escaped_table_name(target_table)
                    update_parts = []
                    if options.audit_cols.created_col:
                        escaped_col = self.escape_column(options.audit_cols.created_col)
                        update_parts.append(f"{escaped_col} = GETUTCDATE()")
                    if options.audit_cols.updated_col:
                        escaped_col = self.escape_column(options.audit_cols.updated_col)
                        update_parts.append(f"{escaped_col} = GETUTCDATE()")
                    if update_parts:
                        update_sql = f"UPDATE {escaped_table} SET {', '.join(update_parts)}"
                        self.ctx.debug("Populating audit columns on initial load")
                        self.connection.execute_sql(update_sql)

                # Create primary key or index on merge keys if configured
                if options.primary_key_on_merge_keys or options.index_on_merge_keys:
                    # Fix MAX columns in merge keys - SQL Server can't index MAX types
                    self._fix_max_columns_for_indexing(target_table, merge_keys)

                    if options.primary_key_on_merge_keys:
                        self.create_primary_key(target_table, merge_keys)
                    elif options.index_on_merge_keys:
                        self.create_index(target_table, merge_keys)

                self.ctx.info(
                    "Target table created and initial data loaded",
                    target_table=target_table,
                    rows=row_count,
                )
                # Return as if merge completed (all inserts)
                return MergeResult(inserted=row_count, updated=0, deleted=0)
            else:
                raise ValueError(
                    f"Target table '{target_table}' does not exist. "
                    "SQL Server MERGE mode requires the target table to exist. "
                    "Set auto_create_table=true or use mode='overwrite' for initial load."
                )

        if options.validations:
            validation_result = self.validate_keys_spark(df, merge_keys, options.validations)
            if not validation_result.is_valid:
                error_msg = "; ".join(validation_result.errors)
                if options.validations.fail_on_validation_error:
                    raise ValueError(f"Merge key validation failed: {error_msg}")
                else:
                    self.ctx.warning(f"Merge key validation warnings: {error_msg}")

        staging_table = self.get_staging_table_name(target_table, options.staging_schema)

        # Auto-create staging schema if needed
        if options.auto_create_schema:
            if not self.check_schema_exists(options.staging_schema):
                self.create_schema(options.staging_schema)

        self.ctx.info(
            "Starting SQL Server MERGE",
            target_table=target_table,
            staging_table=staging_table,
            merge_keys=merge_keys,
            incremental=options.incremental,
        )

        self.truncate_staging(staging_table)

        columns = list(df.columns)
        df_to_write = df

        if options.audit_cols:
            if options.audit_cols.created_col and options.audit_cols.created_col not in columns:
                columns.append(options.audit_cols.created_col)
            if options.audit_cols.updated_col and options.audit_cols.updated_col not in columns:
                columns.append(options.audit_cols.updated_col)

        # Incremental merge: filter to only changed rows before writing to staging
        if options.incremental:
            hash_column = self.get_hash_column_name(df.columns, options.hash_column)

            if hash_column is None and options.change_detection_columns:
                # Compute hash from specified columns
                hash_column = "_computed_hash"
                df_to_write = self.compute_hash_spark(
                    df, options.change_detection_columns, hash_column
                )
                columns.append(hash_column)
            elif hash_column is None:
                # Compute hash from all non-key columns
                non_key_cols = [c for c in df.columns if c not in merge_keys]
                if non_key_cols:
                    hash_column = "_computed_hash"
                    df_to_write = self.compute_hash_spark(df, non_key_cols, hash_column)
                    columns.append(hash_column)

            if hash_column:
                # Read target hashes and filter source
                target_hashes = self.read_target_hashes(target_table, merge_keys, hash_column)
                original_count = df_to_write.count()
1406
|
+
df_to_write = self.filter_changed_rows_spark(
|
|
1407
|
+
df_to_write, target_hashes, merge_keys, hash_column
|
|
1408
|
+
)
|
|
1409
|
+
filtered_count = df_to_write.count()
|
|
1410
|
+
self.ctx.info(
|
|
1411
|
+
"Incremental filter applied",
|
|
1412
|
+
original_rows=original_count,
|
|
1413
|
+
changed_rows=filtered_count,
|
|
1414
|
+
skipped_rows=original_count - filtered_count,
|
|
1415
|
+
)
|
|
1416
|
+
|
|
1417
|
+
if filtered_count == 0:
|
|
1418
|
+
self.ctx.info("No changed rows detected, skipping merge")
|
|
1419
|
+
return MergeResult(inserted=0, updated=0, deleted=0)
|
|
1420
|
+
|
|
1421
|
+
staging_jdbc_options = {**jdbc_options, "dbtable": staging_table}
|
|
1422
|
+
df_to_write.write.format("jdbc").options(**staging_jdbc_options).mode("overwrite").save()
|
|
1423
|
+
|
|
1424
|
+
self.ctx.debug("Staging write completed", staging_table=staging_table)
|
|
1425
|
+
|
|
1426
|
+
# Handle schema evolution before MERGE - add any new columns to target table
|
|
1427
|
+
if options.schema_evolution and options.schema_evolution.add_columns:
|
|
1428
|
+
existing_cols = self.get_table_columns(target_table)
|
|
1429
|
+
new_cols = [c for c in columns if c not in existing_cols]
|
|
1430
|
+
if new_cols:
|
|
1431
|
+
new_cols_with_types = {}
|
|
1432
|
+
staging_cols = self.get_table_columns(staging_table)
|
|
1433
|
+
for col in new_cols:
|
|
1434
|
+
# Use appropriate type for hash columns (SHA256 = 64 chars)
|
|
1435
|
+
if col in ("_computed_hash", "_hash", "_hash_diff"):
|
|
1436
|
+
new_cols_with_types[col] = "NVARCHAR(256)"
|
|
1437
|
+
elif col in staging_cols:
|
|
1438
|
+
new_cols_with_types[col] = staging_cols[col]
|
|
1439
|
+
else:
|
|
1440
|
+
new_cols_with_types[col] = "NVARCHAR(MAX)"
|
|
1441
|
+
self.ctx.info(
|
|
1442
|
+
"Adding new columns to target table via schema evolution",
|
|
1443
|
+
target_table=target_table,
|
|
1444
|
+
new_columns=list(new_cols_with_types.keys()),
|
|
1445
|
+
)
|
|
1446
|
+
self.add_columns(target_table, new_cols_with_types)
|
|
1447
|
+
|
|
1448
|
+
result = self.execute_merge(
|
|
1449
|
+
target_table=target_table,
|
|
1450
|
+
staging_table=staging_table,
|
|
1451
|
+
merge_keys=merge_keys,
|
|
1452
|
+
columns=columns,
|
|
1453
|
+
options=options,
|
|
1454
|
+
)
|
|
1455
|
+
|
|
1456
|
+
return result
|
|
1457
|
+
|
|
1458
|
+
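
    # --- Usage sketch (illustrative only, not part of the packaged module) ---
    # A minimal call into merge_spark as defined above. The owning class and its
    # construction are not shown in this diff, so `merger`, `engine`, `events_df`,
    # and the SqlServerMergeOptions keyword arguments are assumptions, mirroring
    # the option attributes this method reads (incremental, change_detection_columns, ...).
    #
    #     opts = SqlServerMergeOptions(
    #         auto_create_table=True,
    #         incremental=True,
    #         change_detection_columns=["status", "quantity"],
    #     )
    #     result = merger.merge_spark(
    #         df=events_df,
    #         spark_engine=engine,
    #         target_table="oee.oee_fact",
    #         merge_keys=["plant_id", "event_id"],
    #         options=opts,
    #         jdbc_options={"url": jdbc_url, "user": user, "password": password},
    #     )
    #     print(result.inserted, result.updated, result.deleted)
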

    def merge_pandas(
        self,
        df: Any,
        target_table: str,
        merge_keys: List[str],
        options: Optional[SqlServerMergeOptions] = None,
    ) -> MergeResult:
        """
        Execute full merge operation for Pandas DataFrame.

        Args:
            df: Pandas DataFrame to merge
            target_table: Target table name (e.g., 'oee.oee_fact')
            merge_keys: Key columns for ON clause
            options: Merge options

        Returns:
            MergeResult with counts
        """
        options = options or SqlServerMergeOptions()

        schema, _ = self.parse_table_name(target_table)
        if options.auto_create_schema:
            self.create_schema(schema)

        table_exists = self.check_table_exists(target_table)
        if not table_exists:
            if options.auto_create_table:
                self.create_table_from_pandas(df, target_table, audit_cols=options.audit_cols)
                if options.primary_key_on_merge_keys or options.index_on_merge_keys:
                    # Fix MAX columns in merge keys - SQL Server can't index MAX types
                    self._fix_max_columns_for_indexing(target_table, merge_keys)
                    if options.primary_key_on_merge_keys:
                        self.create_primary_key(target_table, merge_keys)
                    elif options.index_on_merge_keys:
                        self.create_index(target_table, merge_keys)
            else:
                raise ValueError(
                    f"Target table '{target_table}' does not exist. "
                    "SQL Server MERGE mode requires the target table to exist. "
                    "Set auto_create_table=true or use mode='overwrite' for initial load."
                )

        if options.validations:
            validation_result = self.validate_keys_pandas(df, merge_keys, options.validations)
            if not validation_result.is_valid:
                error_msg = "; ".join(validation_result.errors)
                if options.validations.fail_on_validation_error:
                    raise ValueError(f"Merge key validation failed: {error_msg}")
                else:
                    self.ctx.warning(f"Merge key validation warnings: {error_msg}")

        staging_table = self.get_staging_table_name(target_table, options.staging_schema)

        self.ctx.info(
            "Starting SQL Server MERGE (Pandas)",
            target_table=target_table,
            staging_table=staging_table,
            merge_keys=merge_keys,
            incremental=options.incremental,
        )

        columns = list(df.columns)
        df_to_write = df

        if options.audit_cols:
            if options.audit_cols.created_col and options.audit_cols.created_col not in columns:
                columns.append(options.audit_cols.created_col)
            if options.audit_cols.updated_col and options.audit_cols.updated_col not in columns:
                columns.append(options.audit_cols.updated_col)

        # Incremental merge: filter to only changed rows before writing to staging
        if options.incremental and table_exists:
            hash_column = self.get_hash_column_name(list(df.columns), options.hash_column)

            if hash_column is None and options.change_detection_columns:
                hash_column = "_computed_hash"
                df_to_write = self.compute_hash_pandas(
                    df, options.change_detection_columns, hash_column
                )
                columns.append(hash_column)
            elif hash_column is None:
                non_key_cols = [c for c in df.columns if c not in merge_keys]
                if non_key_cols:
                    hash_column = "_computed_hash"
                    df_to_write = self.compute_hash_pandas(df, list(non_key_cols), hash_column)
                    columns.append(hash_column)

            if hash_column:
                target_hashes = self.read_target_hashes(target_table, merge_keys, hash_column)
                original_count = len(df_to_write)
                df_to_write = self.filter_changed_rows_pandas(
                    df_to_write, target_hashes, merge_keys, hash_column
                )
                filtered_count = len(df_to_write)
                self.ctx.info(
                    "Incremental filter applied (Pandas)",
                    original_rows=original_count,
                    changed_rows=filtered_count,
                    skipped_rows=original_count - filtered_count,
                )

                if filtered_count == 0:
                    self.ctx.info("No changed rows detected, skipping merge")
                    return MergeResult(inserted=0, updated=0, deleted=0)

        schema, table_name = staging_table.strip("[]").split("].[")
        schema = schema.strip("[")
        table_name = table_name.strip("]")

        self.connection.write_table(
            df=df_to_write,
            table_name=table_name,
            schema=schema,
            if_exists="replace",
        )

        self.ctx.debug("Staging write completed (Pandas)", staging_table=staging_table)

        # Handle schema evolution before MERGE - add any new columns to target table
        if options.schema_evolution and options.schema_evolution.add_columns:
            existing_cols = self.get_table_columns(target_table)
            new_cols = [c for c in columns if c not in existing_cols]
            if new_cols:
                new_cols_with_types = {}
                staging_cols = self.get_table_columns(staging_table)
                for col in new_cols:
                    # Use appropriate type for hash columns (SHA256 = 64 chars)
                    if col in ("_computed_hash", "_hash", "_hash_diff"):
                        new_cols_with_types[col] = "NVARCHAR(256)"
                    elif col in staging_cols:
                        new_cols_with_types[col] = staging_cols[col]
                    else:
                        new_cols_with_types[col] = "NVARCHAR(MAX)"
                self.ctx.info(
                    "Adding new columns to target table via schema evolution",
                    target_table=target_table,
                    new_columns=list(new_cols_with_types.keys()),
                )
                self.add_columns(target_table, new_cols_with_types)

        result = self.execute_merge(
            target_table=target_table,
            staging_table=staging_table,
            merge_keys=merge_keys,
            columns=columns,
            options=options,
        )

        return result
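
    # Usage sketch for the Pandas path (illustrative; `merger` and the option
    # keyword arguments are assumptions, mirroring the attributes read above):
    #
    #     result = merger.merge_pandas(
    #         df=daily_df,
    #         target_table="oee.oee_fact",
    #         merge_keys=["plant_id", "shift_date"],
    #         options=SqlServerMergeOptions(auto_create_table=True, incremental=True),
    #     )
    #     # result.inserted / result.updated report what the MERGE statement touched
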

    def overwrite_spark(
        self,
        df: Any,
        target_table: str,
        options: Optional[SqlServerOverwriteOptions] = None,
        jdbc_options: Optional[Dict[str, Any]] = None,
    ) -> OverwriteResult:
        """
        Execute enhanced overwrite operation for Spark DataFrame.

        Args:
            df: Spark DataFrame to write
            target_table: Target table name
            options: Overwrite options
            jdbc_options: JDBC connection options

        Returns:
            OverwriteResult with row count
        """
        options = options or SqlServerOverwriteOptions()
        jdbc_options = jdbc_options or {}
        strategy = options.strategy

        self.ctx.info(
            "Starting SQL Server overwrite",
            target_table=target_table,
            strategy=strategy.value,
        )

        table_exists = self.check_table_exists(target_table)

        if strategy == SqlServerOverwriteStrategy.DROP_CREATE:
            if table_exists:
                self.drop_table(target_table)

            jdbc_options_with_table = {**jdbc_options, "dbtable": target_table}
            df.write.format("jdbc").options(**jdbc_options_with_table).mode("overwrite").save()

        elif strategy == SqlServerOverwriteStrategy.TRUNCATE_INSERT:
            if table_exists:
                self.truncate_table(target_table)
                jdbc_options_with_table = {**jdbc_options, "dbtable": target_table}
                df.write.format("jdbc").options(**jdbc_options_with_table).mode("append").save()
            else:
                jdbc_options_with_table = {**jdbc_options, "dbtable": target_table}
                df.write.format("jdbc").options(**jdbc_options_with_table).mode("overwrite").save()

        elif strategy == SqlServerOverwriteStrategy.DELETE_INSERT:
            if table_exists:
                self.delete_from_table(target_table)
                jdbc_options_with_table = {**jdbc_options, "dbtable": target_table}
                df.write.format("jdbc").options(**jdbc_options_with_table).mode("append").save()
            else:
                jdbc_options_with_table = {**jdbc_options, "dbtable": target_table}
                df.write.format("jdbc").options(**jdbc_options_with_table).mode("overwrite").save()

        row_count = df.count()

        self.ctx.info(
            "Overwrite completed",
            target_table=target_table,
            strategy=strategy.value,
            rows_written=row_count,
        )

        return OverwriteResult(rows_written=row_count, strategy=strategy.value)
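
    # Usage sketch for overwrite_spark (illustrative; assumes SqlServerOverwriteOptions
    # accepts `strategy` as a keyword, matching the `options.strategy` attribute read above):
    #
    #     opts = SqlServerOverwriteOptions(strategy=SqlServerOverwriteStrategy.TRUNCATE_INSERT)
    #     result = writer.overwrite_spark(
    #         df=snapshot_df,
    #         target_table="oee.daily_snapshot",
    #         options=opts,
    #         jdbc_options={"url": jdbc_url, "user": user, "password": password},
    #     )
    #     # TRUNCATE_INSERT keeps the existing table definition and indexes;
    #     # DROP_CREATE rebuilds the table from the DataFrame schema instead.
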

    def overwrite_pandas(
        self,
        df: Any,
        target_table: str,
        options: Optional[SqlServerOverwriteOptions] = None,
    ) -> OverwriteResult:
        """
        Execute enhanced overwrite operation for Pandas DataFrame.

        Args:
            df: Pandas DataFrame to write
            target_table: Target table name
            options: Overwrite options

        Returns:
            OverwriteResult with row count
        """
        options = options or SqlServerOverwriteOptions()
        strategy = options.strategy

        self.ctx.info(
            "Starting SQL Server overwrite (Pandas)",
            target_table=target_table,
            strategy=strategy.value,
        )

        table_exists = self.check_table_exists(target_table)
        schema, table_name = self.parse_table_name(target_table)

        if strategy == SqlServerOverwriteStrategy.DROP_CREATE:
            if table_exists:
                self.drop_table(target_table)
            self.connection.write_table(
                df=df,
                table_name=table_name,
                schema=schema,
                if_exists="replace",
            )

        elif strategy == SqlServerOverwriteStrategy.TRUNCATE_INSERT:
            if table_exists:
                self.truncate_table(target_table)
                self.connection.write_table(
                    df=df,
                    table_name=table_name,
                    schema=schema,
                    if_exists="append",
                )
            else:
                self.connection.write_table(
                    df=df,
                    table_name=table_name,
                    schema=schema,
                    if_exists="replace",
                )

        elif strategy == SqlServerOverwriteStrategy.DELETE_INSERT:
            if table_exists:
                self.delete_from_table(target_table)
                self.connection.write_table(
                    df=df,
                    table_name=table_name,
                    schema=schema,
                    if_exists="append",
                )
            else:
                self.connection.write_table(
                    df=df,
                    table_name=table_name,
                    schema=schema,
                    if_exists="replace",
                )

        row_count = len(df)

        self.ctx.info(
            "Overwrite completed (Pandas)",
            target_table=target_table,
            strategy=strategy.value,
            rows_written=row_count,
        )

        return OverwriteResult(rows_written=row_count, strategy=strategy.value)
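
    # Strategy notes for the Pandas overwrite path (general SQL Server behaviour,
    # not specific to this module): DELETE_INSERT keeps the table and works even
    # when other tables reference it via foreign keys, at the cost of fully logged
    # row deletes; TRUNCATE_INSERT is faster but SQL Server rejects TRUNCATE on a
    # table referenced by a foreign key. Illustrative call (`writer` is assumed):
    #
    #     result = writer.overwrite_pandas(
    #         df=ref_df,
    #         target_table="ref.plant_dim",
    #         options=SqlServerOverwriteOptions(strategy=SqlServerOverwriteStrategy.DELETE_INSERT),
    #     )
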

    def merge_polars(
        self,
        df: Any,
        target_table: str,
        merge_keys: List[str],
        options: Optional[SqlServerMergeOptions] = None,
    ) -> MergeResult:
        """
        Execute full merge operation for Polars DataFrame (Phase 4).

        Args:
            df: Polars DataFrame or LazyFrame to merge
            target_table: Target table name (e.g., 'oee.oee_fact')
            merge_keys: Key columns for ON clause
            options: Merge options

        Returns:
            MergeResult with counts
        """
        try:
            import polars as pl
        except ImportError:
            raise ImportError("Polars not installed. Run 'pip install polars'.")

        options = options or SqlServerMergeOptions()

        if isinstance(df, pl.LazyFrame):
            df = df.collect()

        schema, _ = self.parse_table_name(target_table)
        if options.auto_create_schema:
            self.create_schema(schema)

        table_exists = self.check_table_exists(target_table)
        if not table_exists:
            if options.auto_create_table:
                self.create_table_from_polars(df, target_table, audit_cols=options.audit_cols)
                if options.primary_key_on_merge_keys or options.index_on_merge_keys:
                    # Fix MAX columns in merge keys - SQL Server can't index MAX types
                    self._fix_max_columns_for_indexing(target_table, merge_keys)
                    if options.primary_key_on_merge_keys:
                        self.create_primary_key(target_table, merge_keys)
                    elif options.index_on_merge_keys:
                        self.create_index(target_table, merge_keys)
            else:
                raise ValueError(
                    f"Target table '{target_table}' does not exist. "
                    "SQL Server MERGE mode requires the target table to exist. "
                    "Set auto_create_table=true or use mode='overwrite' for initial load."
                )

        if options.schema_evolution and table_exists:
            columns = self.handle_schema_evolution_polars(
                df, target_table, options.schema_evolution
            )
        else:
            columns = list(df.columns)

        if options.audit_cols:
            if options.audit_cols.created_col and options.audit_cols.created_col not in columns:
                columns.append(options.audit_cols.created_col)
            if options.audit_cols.updated_col and options.audit_cols.updated_col not in columns:
                columns.append(options.audit_cols.updated_col)

        if options.validations:
            validation_result = self.validate_keys_polars(df, merge_keys, options.validations)
            if not validation_result.is_valid:
                error_msg = "; ".join(validation_result.errors)
                if options.validations.fail_on_validation_error:
                    raise ValueError(f"Merge key validation failed: {error_msg}")
                else:
                    self.ctx.warning(f"Merge key validation warnings: {error_msg}")

        staging_table = self.get_staging_table_name(target_table, options.staging_schema)
        staging_schema, staging_table_name = staging_table.strip("[]").split("].[")
        staging_schema = staging_schema.strip("[")
        staging_table_name = staging_table_name.strip("]")

        if options.auto_create_schema:
            self.create_schema(staging_schema)

        self.ctx.info(
            "Starting SQL Server MERGE (Polars)",
            target_table=target_table,
            staging_table=staging_table,
            merge_keys=merge_keys,
            incremental=options.incremental,
        )

        df_to_write = df

        # Incremental merge: filter to only changed rows before writing to staging
        if options.incremental and table_exists:
            hash_column = self.get_hash_column_name(df.columns, options.hash_column)

            if hash_column is None and options.change_detection_columns:
                hash_column = "_computed_hash"
                df_to_write = self.compute_hash_polars(
                    df, options.change_detection_columns, hash_column
                )
                columns.append(hash_column)
            elif hash_column is None:
                non_key_cols = [c for c in df.columns if c not in merge_keys]
                if non_key_cols:
                    hash_column = "_computed_hash"
                    df_to_write = self.compute_hash_polars(df, non_key_cols, hash_column)
                    columns.append(hash_column)

            if hash_column:
                target_hashes = self.read_target_hashes(target_table, merge_keys, hash_column)
                original_count = len(df_to_write)
                df_to_write = self.filter_changed_rows_polars(
                    df_to_write, target_hashes, merge_keys, hash_column
                )
                filtered_count = len(df_to_write)
                self.ctx.info(
                    "Incremental filter applied (Polars)",
                    original_rows=original_count,
                    changed_rows=filtered_count,
                    skipped_rows=original_count - filtered_count,
                )

                if filtered_count == 0:
                    self.ctx.info("No changed rows detected, skipping merge")
                    return MergeResult(inserted=0, updated=0, deleted=0)

        df_pandas = df_to_write.to_pandas()

        batch_size = options.batch_size
        if batch_size and len(df_pandas) > batch_size:
            for i in range(0, len(df_pandas), batch_size):
                chunk = df_pandas.iloc[i : i + batch_size]
                if_exists = "replace" if i == 0 else "append"
                self.connection.write_table(
                    df=chunk,
                    table_name=staging_table_name,
                    schema=staging_schema,
                    if_exists=if_exists,
                )
                self.ctx.debug(f"Wrote batch {i // batch_size + 1}", rows=len(chunk))
        else:
            self.connection.write_table(
                df=df_pandas,
                table_name=staging_table_name,
                schema=staging_schema,
                if_exists="replace",
            )

        self.ctx.debug("Staging write completed (Polars)", staging_table=staging_table)

        # Handle schema evolution before MERGE - add any new columns to target table
        if options.schema_evolution and options.schema_evolution.add_columns:
            existing_cols = self.get_table_columns(target_table)
            new_cols = [c for c in columns if c not in existing_cols]
            if new_cols:
                new_cols_with_types = {}
                staging_cols = self.get_table_columns(staging_table)
                for col in new_cols:
                    # Use appropriate type for hash columns (SHA256 = 64 chars)
                    if col in ("_computed_hash", "_hash", "_hash_diff"):
                        new_cols_with_types[col] = "NVARCHAR(256)"
                    elif col in staging_cols:
                        new_cols_with_types[col] = staging_cols[col]
                    else:
                        new_cols_with_types[col] = "NVARCHAR(MAX)"
                self.ctx.info(
                    "Adding new columns to target table via schema evolution",
                    target_table=target_table,
                    new_columns=list(new_cols_with_types.keys()),
                )
                self.add_columns(target_table, new_cols_with_types)

        result = self.execute_merge(
            target_table=target_table,
            staging_table=staging_table,
            merge_keys=merge_keys,
            columns=columns,
            options=options,
        )

        return result
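
    # Usage sketch for the Polars path (illustrative; `merger` is assumed). LazyFrames
    # are collected internally, and batch_size chunks the staging write through pandas:
    #
    #     lazy = pl.scan_parquet("daily_extract.parquet")  # hypothetical source file
    #     result = merger.merge_polars(
    #         df=lazy,
    #         target_table="oee.oee_fact",
    #         merge_keys=["plant_id", "event_id"],
    #         options=SqlServerMergeOptions(incremental=True, batch_size=50_000),
    #     )
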

    def overwrite_polars(
        self,
        df: Any,
        target_table: str,
        options: Optional[SqlServerOverwriteOptions] = None,
    ) -> OverwriteResult:
        """
        Execute enhanced overwrite operation for Polars DataFrame (Phase 4).

        Args:
            df: Polars DataFrame or LazyFrame to write
            target_table: Target table name
            options: Overwrite options

        Returns:
            OverwriteResult with row count
        """
        try:
            import polars as pl
        except ImportError:
            raise ImportError("Polars not installed. Run 'pip install polars'.")

        options = options or SqlServerOverwriteOptions()
        strategy = options.strategy

        if isinstance(df, pl.LazyFrame):
            df = df.collect()

        schema, table_name = self.parse_table_name(target_table)
        if options.auto_create_schema:
            self.create_schema(schema)

        self.ctx.info(
            "Starting SQL Server overwrite (Polars)",
            target_table=target_table,
            strategy=strategy.value,
        )

        table_exists = self.check_table_exists(target_table)

        if options.auto_create_table and not table_exists:
            self.create_table_from_polars(df, target_table)
            table_exists = True

        if options.schema_evolution and table_exists:
            columns_to_write = self.handle_schema_evolution_polars(
                df, target_table, options.schema_evolution
            )
            df_to_write = df.select(columns_to_write)
        else:
            df_to_write = df

        df_pandas = df_to_write.to_pandas()

        batch_size = options.batch_size
        if strategy == SqlServerOverwriteStrategy.DROP_CREATE:
            if table_exists:
                self.drop_table(target_table)
            if batch_size and len(df_pandas) > batch_size:
                for i in range(0, len(df_pandas), batch_size):
                    chunk = df_pandas.iloc[i : i + batch_size]
                    if_exists = "replace" if i == 0 else "append"
                    self.connection.write_table(
                        df=chunk,
                        table_name=table_name,
                        schema=schema,
                        if_exists=if_exists,
                    )
            else:
                self.connection.write_table(
                    df=df_pandas,
                    table_name=table_name,
                    schema=schema,
                    if_exists="replace",
                )

        elif strategy == SqlServerOverwriteStrategy.TRUNCATE_INSERT:
            if table_exists:
                self.truncate_table(target_table)
                if batch_size and len(df_pandas) > batch_size:
                    for i in range(0, len(df_pandas), batch_size):
                        chunk = df_pandas.iloc[i : i + batch_size]
                        self.connection.write_table(
                            df=chunk,
                            table_name=table_name,
                            schema=schema,
                            if_exists="append",
                        )
                else:
                    self.connection.write_table(
                        df=df_pandas,
                        table_name=table_name,
                        schema=schema,
                        if_exists="append",
                    )
            else:
                self.connection.write_table(
                    df=df_pandas,
                    table_name=table_name,
                    schema=schema,
                    if_exists="replace",
                )

        elif strategy == SqlServerOverwriteStrategy.DELETE_INSERT:
            if table_exists:
                self.delete_from_table(target_table)
                if batch_size and len(df_pandas) > batch_size:
                    for i in range(0, len(df_pandas), batch_size):
                        chunk = df_pandas.iloc[i : i + batch_size]
                        self.connection.write_table(
                            df=chunk,
                            table_name=table_name,
                            schema=schema,
                            if_exists="append",
                        )
                else:
                    self.connection.write_table(
                        df=df_pandas,
                        table_name=table_name,
                        schema=schema,
                        if_exists="append",
                    )
            else:
                self.connection.write_table(
                    df=df_pandas,
                    table_name=table_name,
                    schema=schema,
                    if_exists="replace",
                )

        row_count = len(df)

        self.ctx.info(
            "Overwrite completed (Polars)",
            target_table=target_table,
            strategy=strategy.value,
            rows_written=row_count,
        )

        return OverwriteResult(rows_written=row_count, strategy=strategy.value)
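
    # Usage sketch for the Polars overwrite path (illustrative; `writer` is assumed and
    # the keyword arguments mirror the option attributes read above). With DROP_CREATE
    # and a batch_size, the first chunk replaces the table and later chunks append:
    #
    #     result = writer.overwrite_polars(
    #         df=pl.DataFrame({"plant_id": [1, 2], "name": ["A", "B"]}),
    #         target_table="ref.plant_dim",
    #         options=SqlServerOverwriteOptions(
    #             strategy=SqlServerOverwriteStrategy.DROP_CREATE,
    #             batch_size=50_000,
    #         ),
    #     )
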