odibi 2.5.0 (odibi-2.5.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/writers/sql_server_writer.py
@@ -0,0 +1,2081 @@
+ """SQL Server MERGE and overwrite writer for incremental sync operations.
+
+ Phase 1: Spark → SQL Server MERGE via staging table.
+ Phase 2: Enhanced overwrite strategies and validations.
+ Phase 3: Pandas engine support.
+ Phase 4: Polars engine support, auto schema/table creation, schema evolution, batch processing.
+ """
+
+ from dataclasses import dataclass
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from odibi.config import (
+     SqlServerAuditColsConfig,
+     SqlServerMergeOptions,
+     SqlServerMergeValidationConfig,
+     SqlServerOverwriteOptions,
+     SqlServerOverwriteStrategy,
+     SqlServerSchemaEvolutionMode,
+ )
+ from odibi.utils.logging_context import get_logging_context
+
+
+ # Type mapping for schema inference
+ POLARS_TO_SQL_TYPE_MAP: Dict[str, str] = {
+     "Int8": "TINYINT",
+     "Int16": "SMALLINT",
+     "Int32": "INT",
+     "Int64": "BIGINT",
+     "UInt8": "TINYINT",
+     "UInt16": "SMALLINT",
+     "UInt32": "INT",
+     "UInt64": "BIGINT",
+     "Float32": "REAL",
+     "Float64": "FLOAT",
+     "Boolean": "BIT",
+     "Utf8": "NVARCHAR(MAX)",
+     "String": "NVARCHAR(MAX)",
+     "Date": "DATE",
+     "Datetime": "DATETIME2",
+     "Time": "TIME",
+     "Duration": "BIGINT",
+     "Binary": "VARBINARY(MAX)",
+     "Null": "NVARCHAR(1)",
+ }
+
+ PANDAS_TO_SQL_TYPE_MAP: Dict[str, str] = {
+     "int8": "TINYINT",
+     "int16": "SMALLINT",
+     "int32": "INT",
+     "int64": "BIGINT",
+     "uint8": "TINYINT",
+     "uint16": "SMALLINT",
+     "uint32": "INT",
+     "uint64": "BIGINT",
+     "float16": "REAL",
+     "float32": "REAL",
+     "float64": "FLOAT",
+     "bool": "BIT",
+     "boolean": "BIT",
+     "object": "NVARCHAR(MAX)",
+     "string": "NVARCHAR(MAX)",
+     "datetime64[ns]": "DATETIME2",
+     "datetime64[us]": "DATETIME2",
+     "timedelta64[ns]": "BIGINT",
+     "category": "NVARCHAR(MAX)",
+ }
+
+
+ @dataclass
70
+ class MergeResult:
71
+ """Result of a SQL Server MERGE operation."""
72
+
73
+ inserted: int = 0
74
+ updated: int = 0
75
+ deleted: int = 0
76
+
77
+ @property
78
+ def total_affected(self) -> int:
79
+ return self.inserted + self.updated + self.deleted
80
+
81
+
82
+ @dataclass
83
+ class OverwriteResult:
84
+ """Result of a SQL Server overwrite operation."""
85
+
86
+ rows_written: int = 0
87
+ strategy: str = "truncate_insert"
88
+
89
+
90
+ @dataclass
91
+ class ValidationResult:
92
+ """Result of data validation checks."""
93
+
94
+ is_valid: bool = True
95
+ null_key_count: int = 0
96
+ duplicate_key_count: int = 0
97
+ errors: List[str] = None
98
+
99
+ def __post_init__(self):
100
+ if self.errors is None:
101
+ self.errors = []
102
+
103
+
104
+ class SqlServerMergeWriter:
105
+ """
106
+ Executes SQL Server MERGE and overwrite operations.
107
+
108
+ Supports:
109
+ - MERGE via staging table pattern
110
+ - Enhanced overwrite with multiple strategies
111
+ - Data validations (null keys, duplicate keys)
112
+ - Both Spark and Pandas DataFrames
113
+ """
114
+
115
+ def __init__(self, connection: Any):
116
+ """
117
+ Initialize the writer with a SQL Server connection.
118
+
119
+ Args:
120
+ connection: Connection object with execute_sql and get_spark_options methods
121
+ """
122
+ self.connection = connection
123
+ self.ctx = get_logging_context()
124
+
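    # Editor's sketch (not part of the package): the connection contract above
    # only needs execute_sql (returning a list of rows) plus, for the Spark
    # path, get_spark_options. A hypothetical in-memory stub is enough to
    # exercise the SQL-building helpers offline:
    #
    #     class StubConnection:
    #         def __init__(self):
    #             self.statements = []
    #
    #         def execute_sql(self, sql):
    #             self.statements.append(sql)  # record the statement
    #             return []                    # pretend no rows matched
    #
    #         def get_spark_options(self):
    #             return {}
    #
    #     writer = SqlServerMergeWriter(StubConnection())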
125
+ def get_staging_table_name(self, target_table: str, staging_schema: str) -> str:
126
+ """
127
+ Generate staging table name from target table.
128
+
129
+ Args:
130
+ target_table: Target table name (e.g., 'sales.fact_orders')
131
+ staging_schema: Schema for staging table
132
+
133
+ Returns:
134
+ Staging table name (e.g., '[staging].[fact_orders_staging]')
135
+ """
136
+ if "." in target_table:
137
+ _, table_name = target_table.split(".", 1)
138
+ else:
139
+ table_name = target_table
140
+
141
+ table_name = table_name.strip("[]")
142
+ return f"[{staging_schema}].[{table_name}_staging]"
143
+
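    # Editor's sketch (not part of the package): expected behaviour, per the
    # docstring above.
    #
    #     writer.get_staging_table_name("sales.fact_orders", "staging")
    #     # -> '[staging].[fact_orders_staging]'
    #     writer.get_staging_table_name("[dbo].[dim_customer]", "staging")
    #     # -> '[staging].[dim_customer_staging]'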
144
+ def escape_column(self, col: str) -> str:
145
+ """Escape column name for SQL Server."""
146
+ col = col.strip("[]")
147
+ return f"[{col}]"
148
+
149
+ def parse_table_name(self, table: str) -> Tuple[str, str]:
150
+ """
151
+ Parse table name into schema and table parts.
152
+
153
+ Args:
154
+ table: Table name (e.g., 'sales.fact_orders' or 'fact_orders')
155
+
156
+ Returns:
157
+ Tuple of (schema, table_name)
158
+ """
159
+ if "." in table:
160
+ schema, table_name = table.split(".", 1)
161
+ else:
162
+ schema = "dbo"
163
+ table_name = table
164
+
165
+ schema = schema.strip("[]")
166
+ table_name = table_name.strip("[]")
167
+ return schema, table_name
168
+
169
+ def get_escaped_table_name(self, table: str) -> str:
170
+ """Get fully escaped table name."""
171
+ schema, table_name = self.parse_table_name(table)
172
+ return f"[{schema}].[{table_name}]"
173
+
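    # Editor's sketch (not part of the package): expected behaviour of the two
    # parsing helpers above.
    #
    #     writer.parse_table_name("sales.fact_orders")    # -> ("sales", "fact_orders")
    #     writer.parse_table_name("fact_orders")          # -> ("dbo", "fact_orders")
    #     writer.get_escaped_table_name("fact_orders")    # -> "[dbo].[fact_orders]"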
174
+ def check_table_exists(self, table: str) -> bool:
175
+ """
176
+ Check if a table exists in SQL Server.
177
+
178
+ Args:
179
+ table: Table name (e.g., 'sales.fact_orders')
180
+
181
+ Returns:
182
+ True if table exists
183
+ """
184
+ schema, table_name = self.parse_table_name(table)
185
+ sql = f"""
186
+ SELECT 1 FROM INFORMATION_SCHEMA.TABLES
187
+ WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table_name}'
188
+ """
189
+ result = self.connection.execute_sql(sql)
190
+ # Result is now a list of rows (fetchall already called in AzureSQL.execute)
191
+ row = result[0] if result else None
192
+ return row is not None
193
+
194
+ def read_target_hashes(
195
+ self,
196
+ target_table: str,
197
+ merge_keys: List[str],
198
+ hash_column: str,
199
+ ) -> List[Dict[str, Any]]:
200
+ """
201
+ Read merge keys and hash column from target table for incremental comparison.
202
+
203
+ Args:
204
+ target_table: Target table name
205
+ merge_keys: Key columns
206
+ hash_column: Hash column name
207
+
208
+ Returns:
209
+ List of dicts with keys and hash values, or empty list if hash column missing
210
+ """
211
+ escaped_table = self.get_escaped_table_name(target_table)
212
+
213
+ # Check if hash column exists in target table before querying
214
+ existing_columns = self.get_table_columns(target_table)
215
+ if existing_columns and hash_column not in existing_columns:
216
+ self.ctx.info(
217
+ "Hash column not found in target table, skipping incremental comparison",
218
+ hash_column=hash_column,
219
+ target_table=target_table,
220
+ )
221
+ return []
222
+
223
+ key_cols = ", ".join([self.escape_column(k) for k in merge_keys])
224
+ hash_col = self.escape_column(hash_column)
225
+
226
+ sql = f"SELECT {key_cols}, {hash_col} FROM {escaped_table}"
227
+ self.ctx.debug("Reading target hashes for incremental merge", table=target_table)
228
+
229
+ result = self.connection.execute_sql(sql)
230
+ if not result:
231
+ return []
232
+
233
+ # Convert SQLAlchemy Row objects to dicts for Spark compatibility
234
+ # Row objects have _mapping attribute or can be accessed via _asdict()
235
+ dicts = []
236
+ for row in result:
237
+ if hasattr(row, "_asdict"):
238
+ dicts.append(row._asdict())
239
+ elif hasattr(row, "_mapping"):
240
+ dicts.append(dict(row._mapping))
241
+ else:
242
+ # Fallback: assume row is dict-like or tuple with known columns
243
+ columns = merge_keys + [hash_column]
244
+ dicts.append(dict(zip(columns, row)))
245
+ return dicts
246
+
247
+ def get_hash_column_name(
248
+ self,
249
+ df_columns: List[str],
250
+ options_hash_column: Optional[str],
251
+ ) -> Optional[str]:
252
+ """
253
+ Determine which hash column to use for incremental merge.
254
+
255
+ Args:
256
+ df_columns: List of DataFrame column names
257
+ options_hash_column: Explicitly configured hash column
258
+
259
+ Returns:
260
+ Hash column name or None if not available
261
+ """
262
+ if options_hash_column:
263
+ if options_hash_column in df_columns:
264
+ return options_hash_column
265
+ else:
266
+ self.ctx.warning(
267
+ f"Configured hash_column '{options_hash_column}' not found in DataFrame"
268
+ )
269
+ return None
270
+
271
+ # Auto-detect common hash column names
272
+ for candidate in ["_hash_diff", "_hash", "hash_diff", "row_hash"]:
273
+ if candidate in df_columns:
274
+ self.ctx.debug(f"Auto-detected hash column: {candidate}")
275
+ return candidate
276
+
277
+ return None
278
+
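    # Editor's sketch (not part of the package): resolution order illustrated.
    #
    #     writer.get_hash_column_name(["id", "_hash_diff", "amount"], None)
    #     # -> "_hash_diff"   (auto-detected)
    #     writer.get_hash_column_name(["id", "amount"], "row_hash")
    #     # -> None           (configured column missing; a warning is logged)
    #     writer.get_hash_column_name(["id", "row_hash"], "row_hash")
    #     # -> "row_hash"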
279
+ def compute_hash_spark(
280
+ self, df: Any, columns: List[str], hash_col_name: str = "_computed_hash"
281
+ ):
282
+ """
283
+ Compute hash column for Spark DataFrame.
284
+
285
+ Args:
286
+ df: Spark DataFrame
287
+ columns: Columns to include in hash
288
+ hash_col_name: Name for the computed hash column
289
+
290
+ Returns:
291
+ DataFrame with hash column added
292
+ """
293
+ from pyspark.sql import functions as F
294
+
295
+ # Concatenate columns and compute MD5 hash
296
+ concat_expr = F.concat_ws(
297
+ "||", *[F.coalesce(F.col(c).cast("string"), F.lit("NULL")) for c in columns]
298
+ )
299
+ return df.withColumn(hash_col_name, F.md5(concat_expr))
300
+
301
+ def compute_hash_pandas(
302
+ self, df: Any, columns: List[str], hash_col_name: str = "_computed_hash"
303
+ ):
304
+ """
305
+ Compute hash column for Pandas DataFrame.
306
+
307
+ Args:
308
+ df: Pandas DataFrame
309
+ columns: Columns to include in hash
310
+ hash_col_name: Name for the computed hash column
311
+
312
+ Returns:
313
+ DataFrame with hash column added
314
+ """
315
+ import hashlib
316
+
317
+ def row_hash(row):
318
+ concat = "||".join(str(row[c]) if row[c] is not None else "NULL" for c in columns)
319
+ return hashlib.md5(concat.encode()).hexdigest()
320
+
321
+ df = df.copy()
322
+ df[hash_col_name] = df.apply(row_hash, axis=1)
323
+ return df
324
+
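    # Editor's sketch (not part of the package): compute_hash_pandas joins the
    # chosen columns with "||" (None rendered as "NULL") and stores the MD5 hex
    # digest, so for a row with plant="A" and qty=10:
    #
    #     import hashlib
    #     hashlib.md5("A||10".encode()).hexdigest()
    #     # -> the value written to the "_computed_hash" column for that row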
325
+ def compute_hash_polars(
326
+ self, df: Any, columns: List[str], hash_col_name: str = "_computed_hash"
327
+ ):
328
+ """
329
+ Compute hash column for Polars DataFrame.
330
+
331
+ Args:
332
+ df: Polars DataFrame
333
+ columns: Columns to include in hash
334
+ hash_col_name: Name for the computed hash column
335
+
336
+ Returns:
337
+ DataFrame with hash column added
338
+ """
339
+ import polars as pl
340
+
341
+ # Concatenate columns and compute hash
342
+ concat_expr = pl.concat_str(
343
+ [pl.col(c).cast(pl.Utf8).fill_null("NULL") for c in columns],
344
+ separator="||",
345
+ )
346
+ return df.with_columns(concat_expr.hash().cast(pl.Utf8).alias(hash_col_name))
347
+
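    # Editor's note (not part of the package): unlike the Spark and Pandas
    # helpers above, Polars' Expr.hash() is a native 64-bit hash rather than an
    # MD5 hex digest, and its value may differ across Polars versions. Hashes
    # written by the Polars path are therefore only comparable with hashes that
    # were also produced by the Polars path.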
348
+ def filter_changed_rows_spark(
349
+ self,
350
+ source_df: Any,
351
+ target_hashes: List[Dict[str, Any]],
352
+ merge_keys: List[str],
353
+ hash_column: str,
354
+ ):
355
+ """
356
+ Filter Spark DataFrame to only rows that are new or changed.
357
+
358
+ Args:
359
+ source_df: Source Spark DataFrame
360
+ target_hashes: List of dicts with target keys and hashes
361
+ merge_keys: Key columns
362
+ hash_column: Hash column name
363
+
364
+ Returns:
365
+ Filtered DataFrame with only new/changed rows
366
+ """
367
+ from pyspark.sql import functions as F
368
+
369
+ if not target_hashes:
370
+ # No existing data, all rows are new
371
+ return source_df
372
+
373
+ # Get SparkSession from DataFrame
374
+ spark = source_df.sparkSession
375
+
376
+ # Create DataFrame from target hashes
377
+ target_df = spark.createDataFrame(target_hashes)
378
+
379
+ # Rename hash column in target to avoid collision
380
+ target_hash_col = f"_target_{hash_column}"
381
+ target_df = target_df.withColumnRenamed(hash_column, target_hash_col)
382
+
383
+ # Left join source with target on merge keys
384
+ join_condition = [source_df[k] == target_df[k] for k in merge_keys]
385
+ joined = source_df.join(target_df, join_condition, "left")
386
+
387
+ # Filter to rows where:
388
+ # 1. No match in target (new rows) - target hash is null
389
+ # 2. Hash differs (changed rows)
390
+ changed = joined.filter(
391
+ F.col(target_hash_col).isNull() | (F.col(hash_column) != F.col(target_hash_col))
392
+ )
393
+
394
+ # Drop the target columns
395
+ for k in merge_keys:
396
+ changed = changed.drop(target_df[k])
397
+ changed = changed.drop(target_hash_col)
398
+
399
+ return changed
400
+
401
+ def filter_changed_rows_pandas(
402
+ self,
403
+ source_df: Any,
404
+ target_hashes: List[Dict[str, Any]],
405
+ merge_keys: List[str],
406
+ hash_column: str,
407
+ ):
408
+ """
409
+ Filter Pandas DataFrame to only rows that are new or changed.
410
+
411
+ Args:
412
+ source_df: Source Pandas DataFrame
413
+ target_hashes: List of dicts with target keys and hashes
414
+ merge_keys: Key columns
415
+ hash_column: Hash column name
416
+
417
+ Returns:
418
+ Filtered DataFrame with only new/changed rows
419
+ """
420
+ import pandas as pd
421
+
422
+ if not target_hashes:
423
+ return source_df
424
+
425
+ target_df = pd.DataFrame(target_hashes)
426
+ target_hash_col = f"_target_{hash_column}"
427
+ target_df = target_df.rename(columns={hash_column: target_hash_col})
428
+
429
+ # Merge to find matching rows
430
+ merged = source_df.merge(target_df, on=merge_keys, how="left")
431
+
432
+ # Filter to new or changed rows
433
+ is_new = merged[target_hash_col].isna()
434
+ is_changed = merged[hash_column] != merged[target_hash_col]
435
+ changed = merged[is_new | is_changed].copy()
436
+
437
+ # Drop the target hash column
438
+ changed = changed.drop(columns=[target_hash_col])
439
+
440
+ return changed
441
+
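    # Editor's sketch (not part of the package): toy behaviour of the filter
    # above. Row id=1 is unchanged (same hash as the target), id=2 changed,
    # id=3 is new, so only ids 2 and 3 survive.
    #
    #     import pandas as pd
    #     source = pd.DataFrame({"id": [1, 2, 3], "_hash_diff": ["aaa", "bbb", "ccc"]})
    #     target_hashes = [{"id": 1, "_hash_diff": "aaa"},
    #                      {"id": 2, "_hash_diff": "old"}]
    #     writer.filter_changed_rows_pandas(source, target_hashes, ["id"], "_hash_diff")
    #     # -> rows with id 2 and 3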
442
+ def filter_changed_rows_polars(
443
+ self,
444
+ source_df: Any,
445
+ target_hashes: List[Dict[str, Any]],
446
+ merge_keys: List[str],
447
+ hash_column: str,
448
+ ):
449
+ """
450
+ Filter Polars DataFrame to only rows that are new or changed.
451
+
452
+ Args:
453
+ source_df: Source Polars DataFrame
454
+ target_hashes: List of dicts with target keys and hashes
455
+ merge_keys: Key columns
456
+ hash_column: Hash column name
457
+
458
+ Returns:
459
+ Filtered DataFrame with only new/changed rows
460
+ """
461
+ import polars as pl
462
+
463
+ if not target_hashes:
464
+ return source_df
465
+
466
+ target_df = pl.DataFrame(target_hashes)
467
+ target_hash_col = f"_target_{hash_column}"
468
+ target_df = target_df.rename({hash_column: target_hash_col})
469
+
470
+ # Join to find matching rows
471
+ joined = source_df.join(target_df, on=merge_keys, how="left")
472
+
473
+ # Filter to new or changed rows
474
+ changed = joined.filter(
475
+ pl.col(target_hash_col).is_null() | (pl.col(hash_column) != pl.col(target_hash_col))
476
+ )
477
+
478
+ # Drop the target hash column
479
+ changed = changed.drop(target_hash_col)
480
+
481
+ return changed
482
+
483
+ def validate_keys_spark(
484
+ self,
485
+ df: Any,
486
+ merge_keys: List[str],
487
+ config: Optional[SqlServerMergeValidationConfig] = None,
488
+ ) -> ValidationResult:
489
+ """
490
+ Validate merge keys in a Spark DataFrame.
491
+
492
+ Args:
493
+ df: Spark DataFrame
494
+ merge_keys: Key columns to validate
495
+ config: Validation configuration
496
+
497
+ Returns:
498
+ ValidationResult with validation status
499
+ """
500
+ config = config or SqlServerMergeValidationConfig()
501
+ result = ValidationResult()
502
+
503
+ if config.check_null_keys:
504
+ from pyspark.sql import functions as F
505
+
506
+ null_condition = F.lit(False)
507
+ for key in merge_keys:
508
+ null_condition = null_condition | F.col(key).isNull()
509
+
510
+ null_count = df.filter(null_condition).count()
511
+ if null_count > 0:
512
+ result.null_key_count = null_count
513
+ result.errors.append(
514
+ f"Found {null_count} rows with NULL values in merge keys: {merge_keys}"
515
+ )
516
+ result.is_valid = False
517
+
518
+ if config.check_duplicate_keys:
519
+ total_count = df.count()
520
+ distinct_count = df.select(*merge_keys).distinct().count()
521
+ duplicate_count = total_count - distinct_count
522
+
523
+ if duplicate_count > 0:
524
+ result.duplicate_key_count = duplicate_count
525
+ result.errors.append(
526
+ f"Found {duplicate_count} duplicate key combinations in merge keys: {merge_keys}"
527
+ )
528
+ result.is_valid = False
529
+
530
+ return result
531
+
532
+ def validate_keys_pandas(
533
+ self,
534
+ df: Any,
535
+ merge_keys: List[str],
536
+ config: Optional[SqlServerMergeValidationConfig] = None,
537
+ ) -> ValidationResult:
538
+ """
539
+ Validate merge keys in a Pandas DataFrame.
540
+
541
+ Args:
542
+ df: Pandas DataFrame
543
+ merge_keys: Key columns to validate
544
+ config: Validation configuration
545
+
546
+ Returns:
547
+ ValidationResult with validation status
548
+ """
549
+ config = config or SqlServerMergeValidationConfig()
550
+ result = ValidationResult()
551
+
552
+ if config.check_null_keys:
553
+ null_mask = df[merge_keys].isnull().any(axis=1)
554
+ null_count = null_mask.sum()
555
+
556
+ if null_count > 0:
557
+ result.null_key_count = int(null_count)
558
+ result.errors.append(
559
+ f"Found {null_count} rows with NULL values in merge keys: {merge_keys}"
560
+ )
561
+ result.is_valid = False
562
+
563
+ if config.check_duplicate_keys:
564
+ duplicates = df.duplicated(subset=merge_keys, keep=False)
565
+ duplicate_count = (
566
+ duplicates.sum() - df.duplicated(subset=merge_keys, keep="first").sum()
567
+ )
568
+
569
+ if duplicate_count > 0:
570
+ result.duplicate_key_count = int(duplicate_count)
571
+ result.errors.append(
572
+ f"Found {duplicate_count} duplicate key combinations in merge keys: {merge_keys}"
573
+ )
574
+ result.is_valid = False
575
+
576
+ return result
577
+
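    # Editor's sketch (not part of the package): assuming check_null_keys and
    # check_duplicate_keys are enabled in SqlServerMergeValidationConfig, a
    # frame like this fails both checks.
    #
    #     import pandas as pd
    #     bad = pd.DataFrame({"id": [1, 1, None], "qty": [5, 6, 7]})
    #     result = writer.validate_keys_pandas(bad, ["id"])
    #     result.is_valid             # -> False
    #     result.null_key_count       # -> 1
    #     result.duplicate_key_count  # -> 1  (one extra row beyond the first id=1)
    #     result.errors               # -> two human-readable messages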
578
+ def validate_keys_polars(
579
+ self,
580
+ df: Any,
581
+ merge_keys: List[str],
582
+ config: Optional[SqlServerMergeValidationConfig] = None,
583
+ ) -> ValidationResult:
584
+ """
585
+ Validate merge keys in a Polars DataFrame/LazyFrame.
586
+
587
+ Args:
588
+ df: Polars DataFrame or LazyFrame
589
+ merge_keys: Key columns to validate
590
+ config: Validation configuration
591
+
592
+ Returns:
593
+ ValidationResult with validation status
594
+ """
595
+ try:
596
+ import polars as pl
597
+    errors: Optional[List[str]] = None
598
+ raise ImportError("Polars not installed. Run 'pip install polars'.")
599
+
600
+ config = config or SqlServerMergeValidationConfig()
601
+ result = ValidationResult()
602
+
603
+ is_lazy = isinstance(df, pl.LazyFrame)
604
+ if is_lazy:
605
+ df_materialized = df.collect()
606
+ else:
607
+ df_materialized = df
608
+
609
+ if config.check_null_keys:
610
+ null_condition = pl.lit(False)
611
+ for key in merge_keys:
612
+ null_condition = null_condition | pl.col(key).is_null()
613
+
614
+ null_count = df_materialized.filter(null_condition).height
615
+
616
+ if null_count > 0:
617
+ result.null_key_count = null_count
618
+ result.errors.append(
619
+ f"Found {null_count} rows with NULL values in merge keys: {merge_keys}"
620
+ )
621
+ result.is_valid = False
622
+
623
+ if config.check_duplicate_keys:
624
+ total_count = df_materialized.height
625
+ distinct_count = df_materialized.select(merge_keys).unique().height
626
+ duplicate_count = total_count - distinct_count
627
+
628
+ if duplicate_count > 0:
629
+ result.duplicate_key_count = duplicate_count
630
+ result.errors.append(
631
+ f"Found {duplicate_count} duplicate key combinations in merge keys: {merge_keys}"
632
+ )
633
+ result.is_valid = False
634
+
635
+ return result
636
+
637
+ def check_schema_exists(self, schema: str) -> bool:
638
+ """Check if a schema exists in SQL Server."""
639
+ sql = f"SELECT 1 FROM sys.schemas WHERE name = '{schema}'"
640
+ result = self.connection.execute_sql(sql)
641
+ # Result is now a list of rows (fetchall already called in AzureSQL.execute)
642
+ row = result[0] if result else None
643
+ return row is not None
644
+
645
+ def create_schema(self, schema: str) -> None:
646
+ """Create a schema if it doesn't exist."""
647
+ if not self.check_schema_exists(schema):
648
+ sql = f"CREATE SCHEMA [{schema}]"
649
+ self.ctx.info("Creating schema", schema=schema)
650
+ self.connection.execute_sql(sql)
651
+
652
+ def get_table_columns(self, table: str) -> Dict[str, str]:
653
+ """
654
+ Get column names and full types (with length/precision) for a table.
655
+
656
+ Returns:
657
+ Dictionary mapping column names to full SQL types (e.g., 'nvarchar(255)')
658
+ """
659
+ schema, table_name = self.parse_table_name(table)
660
+ sql = f"""
661
+ SELECT
662
+ COLUMN_NAME,
663
+ DATA_TYPE,
664
+ CHARACTER_MAXIMUM_LENGTH,
665
+ NUMERIC_PRECISION,
666
+ NUMERIC_SCALE
667
+ FROM INFORMATION_SCHEMA.COLUMNS
668
+ WHERE TABLE_SCHEMA = '{schema}' AND TABLE_NAME = '{table_name}'
669
+ ORDER BY ORDINAL_POSITION
670
+ """
671
+ result = self.connection.execute_sql(sql)
672
+ columns = {}
673
+ for row in result:
674
+ if isinstance(row, dict):
675
+ col_name = row["COLUMN_NAME"]
676
+ data_type = row["DATA_TYPE"]
677
+ char_len = row.get("CHARACTER_MAXIMUM_LENGTH")
678
+ num_prec = row.get("NUMERIC_PRECISION")
679
+ num_scale = row.get("NUMERIC_SCALE")
680
+ else:
681
+ col_name = row[0]
682
+ data_type = row[1]
683
+ char_len = row[2] if len(row) > 2 else None
684
+ num_prec = row[3] if len(row) > 3 else None
685
+ num_scale = row[4] if len(row) > 4 else None
686
+
687
+ # Build full type with length/precision
688
+ if data_type.lower() in ("nvarchar", "varchar", "char", "nchar", "binary", "varbinary"):
689
+ if char_len == -1:
690
+ full_type = f"{data_type}(MAX)"
691
+ elif char_len:
692
+ full_type = f"{data_type}({char_len})"
693
+ else:
694
+ full_type = f"{data_type}(MAX)"
695
+ elif data_type.lower() in ("decimal", "numeric"):
696
+ if num_prec and num_scale is not None:
697
+ full_type = f"{data_type}({num_prec},{num_scale})"
698
+ else:
699
+ full_type = data_type
700
+ else:
701
+ full_type = data_type
702
+
703
+ columns[col_name] = full_type
704
+ return columns
705
+
706
+ def infer_sql_type_pandas(self, dtype: Any) -> str:
707
+ """Infer SQL Server type from Pandas dtype."""
708
+ dtype_str = str(dtype).lower()
709
+ for pattern, sql_type in PANDAS_TO_SQL_TYPE_MAP.items():
710
+ if pattern in dtype_str:
711
+ return sql_type
712
+ return "NVARCHAR(MAX)"
713
+
714
+ def infer_sql_type_polars(self, dtype: Any) -> str:
715
+ """Infer SQL Server type from Polars dtype."""
716
+ dtype_str = str(dtype)
717
+ for pattern, sql_type in POLARS_TO_SQL_TYPE_MAP.items():
718
+ if pattern in dtype_str:
719
+ return sql_type
720
+ return "NVARCHAR(MAX)"
721
+
722
+ def create_table_from_pandas(
723
+ self,
724
+ df: Any,
725
+ table: str,
726
+ audit_cols: Optional[SqlServerAuditColsConfig] = None,
727
+ ) -> None:
728
+ """
729
+ Create a SQL Server table from Pandas DataFrame schema.
730
+
731
+ Args:
732
+ df: Pandas DataFrame
733
+ table: Target table name
734
+ audit_cols: Optional audit column config to add created_ts/updated_ts columns
735
+ """
736
+ schema, table_name = self.parse_table_name(table)
737
+ columns = []
738
+ existing_cols = set()
739
+ for col_name, dtype in df.dtypes.items():
740
+ sql_type = self.infer_sql_type_pandas(dtype)
741
+ escaped_col = self.escape_column(col_name)
742
+ columns.append(f"{escaped_col} {sql_type} NULL")
743
+ existing_cols.add(col_name)
744
+
745
+ if audit_cols:
746
+ if audit_cols.created_col and audit_cols.created_col not in existing_cols:
747
+ escaped_col = self.escape_column(audit_cols.created_col)
748
+ columns.append(f"{escaped_col} DATETIME2 NULL")
749
+ self.ctx.debug(f"Adding audit column: {audit_cols.created_col}")
750
+ if audit_cols.updated_col and audit_cols.updated_col not in existing_cols:
751
+ escaped_col = self.escape_column(audit_cols.updated_col)
752
+ columns.append(f"{escaped_col} DATETIME2 NULL")
753
+ self.ctx.debug(f"Adding audit column: {audit_cols.updated_col}")
754
+
755
+ columns_sql = ",\n ".join(columns)
756
+ sql = f"CREATE TABLE [{schema}].[{table_name}] (\n {columns_sql}\n)"
757
+ self.ctx.info("Creating table from DataFrame", table=table)
758
+ self.connection.execute_sql(sql)
759
+
760
+ def create_table_from_polars(
761
+ self,
762
+ df: Any,
763
+ table: str,
764
+ audit_cols: Optional[SqlServerAuditColsConfig] = None,
765
+ ) -> None:
766
+ """
767
+ Create a SQL Server table from Polars DataFrame schema.
768
+
769
+ Args:
770
+ df: Polars DataFrame or LazyFrame
771
+ table: Target table name
772
+ audit_cols: Optional audit column config to add created_ts/updated_ts columns
773
+ """
774
+ try:
775
+ import polars as pl
776
+ except ImportError:
777
+ raise ImportError("Polars not installed. Run 'pip install polars'.")
778
+
779
+ schema_name, table_name = self.parse_table_name(table)
780
+
781
+ if isinstance(df, pl.LazyFrame):
782
+ df_schema = df.collect_schema()
783
+ else:
784
+ df_schema = df.schema
785
+
786
+ columns = []
787
+ existing_cols = set()
788
+ for col_name, dtype in df_schema.items():
789
+ sql_type = self.infer_sql_type_polars(dtype)
790
+ escaped_col = self.escape_column(col_name)
791
+ columns.append(f"{escaped_col} {sql_type} NULL")
792
+ existing_cols.add(col_name)
793
+
794
+ if audit_cols:
795
+ if audit_cols.created_col and audit_cols.created_col not in existing_cols:
796
+ escaped_col = self.escape_column(audit_cols.created_col)
797
+ columns.append(f"{escaped_col} DATETIME2 NULL")
798
+ self.ctx.debug(f"Adding audit column: {audit_cols.created_col}")
799
+ if audit_cols.updated_col and audit_cols.updated_col not in existing_cols:
800
+ escaped_col = self.escape_column(audit_cols.updated_col)
801
+ columns.append(f"{escaped_col} DATETIME2 NULL")
802
+ self.ctx.debug(f"Adding audit column: {audit_cols.updated_col}")
803
+
804
+ columns_sql = ",\n ".join(columns)
805
+ sql = f"CREATE TABLE [{schema_name}].[{table_name}] (\n {columns_sql}\n)"
806
+ self.ctx.info("Creating table from Polars DataFrame", table=table)
807
+ self.connection.execute_sql(sql)
808
+
809
+ def add_columns(self, table: str, new_columns: Dict[str, str]) -> None:
810
+ """Add new columns to an existing table."""
811
+ if not new_columns:
812
+ return
813
+
814
+ escaped_table = self.get_escaped_table_name(table)
815
+ for col_name, sql_type in new_columns.items():
816
+ escaped_col = self.escape_column(col_name)
817
+ sql = f"ALTER TABLE {escaped_table} ADD {escaped_col} {sql_type} NULL"
818
+ self.ctx.info("Adding column to table", table=table, column=col_name)
819
+ self.connection.execute_sql(sql)
820
+
821
+ def _fix_max_columns_for_indexing(self, table: str, columns: List[str]) -> None:
822
+ """
823
+ Convert MAX columns to sized types for indexing compatibility.
824
+
825
+ SQL Server cannot use nvarchar(MAX), varchar(MAX), or varbinary(MAX)
826
+ columns in primary keys or indexes. This method converts them to
827
+ sized equivalents (e.g., nvarchar(450) - max size for indexed columns).
828
+
829
+ Args:
830
+ table: Table name
831
+ columns: Columns that will be used in index/primary key
832
+ """
833
+ escaped_table = self.get_escaped_table_name(table)
834
+ existing_cols = self.get_table_columns(table)
835
+ # Build case-insensitive lookup
836
+ existing_cols_lower = {k.lower(): v for k, v in existing_cols.items()}
837
+
838
+ for col in columns:
839
+ col_type = existing_cols_lower.get(col.lower(), "")
840
+ col_type_upper = col_type.upper()
841
+
842
+ # Check if it's a MAX type that needs conversion
843
+ if "(MAX)" in col_type_upper:
844
+ # SQL Server max key length is 900 bytes for clustered index
845
+ # nvarchar uses 2 bytes per char, so max is 450 chars
846
+ if "NVARCHAR" in col_type_upper or "NCHAR" in col_type_upper:
847
+ new_type = "NVARCHAR(450)"
848
+ elif "VARCHAR" in col_type_upper or "CHAR" in col_type_upper:
849
+ new_type = "VARCHAR(900)"
850
+ elif "VARBINARY" in col_type_upper or "BINARY" in col_type_upper:
851
+ new_type = "VARBINARY(900)"
852
+ else:
853
+ continue # Unknown MAX type, skip
854
+
855
+ escaped_col = self.escape_column(col)
856
+ alter_sql = f"ALTER TABLE {escaped_table} ALTER COLUMN {escaped_col} {new_type}"
857
+ self.ctx.info(
858
+ "Converting MAX column to sized type for indexing",
859
+ table=table,
860
+ column=col,
861
+ old_type=col_type,
862
+ new_type=new_type,
863
+ )
864
+ self.connection.execute_sql(alter_sql)
865
+
866
+ def create_primary_key(self, table: str, columns: List[str]) -> None:
867
+ """
868
+ Create a clustered primary key on the specified columns.
869
+
870
+ First makes columns NOT NULL (required for PK), then adds the constraint.
871
+
872
+ Args:
873
+ table: Table name (e.g., 'oee.oee_fact')
874
+ columns: List of column names for the primary key
875
+ """
876
+ escaped_table = self.get_escaped_table_name(table)
877
+ schema, table_name = self.parse_table_name(table)
878
+ pk_name = f"PK_{table_name}"
879
+
880
+ # Get column types so we can ALTER to NOT NULL
881
+ existing_cols = self.get_table_columns(table)
882
+ # Build case-insensitive lookup for column types
883
+ existing_cols_lower = {k.lower(): v for k, v in existing_cols.items()}
884
+
885
+ # First, make PK columns NOT NULL (required for primary key)
886
+ for col in columns:
887
+ escaped_col = self.escape_column(col)
888
+ col_type = existing_cols_lower.get(col.lower())
889
+ if col_type is None:
890
+ raise ValueError(
891
+ f"Cannot create primary key: column '{col}' not found in table '{table}'. "
892
+ f"Available columns: {list(existing_cols.keys())}"
893
+ )
894
+ alter_sql = (
895
+ f"ALTER TABLE {escaped_table} ALTER COLUMN {escaped_col} {col_type} NOT NULL"
896
+ )
897
+ self.ctx.debug(f"Setting column NOT NULL: {col}")
898
+ self.connection.execute_sql(alter_sql)
899
+
900
+ # Now create the primary key
901
+ escaped_cols = ", ".join([self.escape_column(c) for c in columns])
902
+ sql = f"""
903
+ ALTER TABLE {escaped_table}
904
+ ADD CONSTRAINT [{pk_name}] PRIMARY KEY CLUSTERED ({escaped_cols})
905
+ """
906
+ self.ctx.info(
907
+ "Creating primary key",
908
+ table=table,
909
+ constraint=pk_name,
910
+ columns=columns,
911
+ )
912
+ self.connection.execute_sql(sql)
913
+
914
+    def create_index(self, table: str, columns: List[str], index_name: Optional[str] = None) -> None:
915
+ """
916
+ Create a nonclustered index on the specified columns.
917
+
918
+ Args:
919
+ table: Table name (e.g., 'oee.oee_fact')
920
+ columns: List of column names for the index
921
+ index_name: Optional custom index name (auto-generated if not provided)
922
+ """
923
+ escaped_table = self.get_escaped_table_name(table)
924
+ schema, table_name = self.parse_table_name(table)
925
+
926
+ if index_name is None:
927
+ col_suffix = "_".join(columns[:3]) # Use first 3 columns in name
928
+ index_name = f"IX_{table_name}_{col_suffix}"
929
+
930
+ escaped_cols = ", ".join([self.escape_column(c) for c in columns])
931
+
932
+ sql = f"""
933
+ CREATE NONCLUSTERED INDEX [{index_name}]
934
+ ON {escaped_table} ({escaped_cols})
935
+ """
936
+ self.ctx.info(
937
+ "Creating index",
938
+ table=table,
939
+ index=index_name,
940
+ columns=columns,
941
+ )
942
+ self.connection.execute_sql(sql)
943
+
944
+ def handle_schema_evolution_pandas(
945
+ self, df: Any, table: str, evolution_config: Any
946
+ ) -> List[str]:
947
+ """
948
+ Handle schema evolution for Pandas DataFrame.
949
+
950
+ Returns list of columns to write (may be subset if mode=ignore).
951
+ """
952
+ if evolution_config is None:
953
+ return list(df.columns)
954
+
955
+ mode = evolution_config.mode
956
+ existing_cols = self.get_table_columns(table)
957
+ df_cols = set(df.columns)
958
+ table_cols = set(existing_cols.keys())
959
+
960
+ new_cols = df_cols - table_cols
961
+
962
+ if mode == SqlServerSchemaEvolutionMode.STRICT:
963
+ if new_cols:
964
+ raise ValueError(
965
+ f"Schema evolution mode is 'strict' but DataFrame has new columns "
966
+ f"not in target table: {new_cols}"
967
+ )
968
+ return list(df.columns)
969
+
970
+ elif mode == SqlServerSchemaEvolutionMode.EVOLVE:
971
+ if new_cols and evolution_config.add_columns:
972
+ new_cols_with_types = {}
973
+ for col in new_cols:
974
+ new_cols_with_types[col] = self.infer_sql_type_pandas(df[col].dtype)
975
+ self.add_columns(table, new_cols_with_types)
976
+ return list(df.columns)
977
+
978
+ elif mode == SqlServerSchemaEvolutionMode.IGNORE:
979
+ return [c for c in df.columns if c in table_cols]
980
+
981
+ return list(df.columns)
982
+
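    # Editor's note (not part of the package): the three modes handled above,
    # as implemented in handle_schema_evolution_pandas:
    #   strict -> raise if the DataFrame has columns the target table lacks
    #   evolve -> ALTER TABLE ... ADD the new columns (when add_columns is set),
    #             then write every DataFrame column
    #   ignore -> write only the columns that already exist in the table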
983
+ def handle_schema_evolution_polars(
984
+ self, df: Any, table: str, evolution_config: Any
985
+ ) -> List[str]:
986
+ """
987
+ Handle schema evolution for Polars DataFrame.
988
+
989
+ Returns list of columns to write (may be subset if mode=ignore).
990
+ """
991
+ try:
992
+ import polars as pl
993
+ except ImportError:
994
+ raise ImportError("Polars not installed. Run 'pip install polars'.")
995
+
996
+ if evolution_config is None:
997
+ if isinstance(df, pl.LazyFrame):
998
+ return list(df.collect_schema().names())
999
+ return df.columns
1000
+
1001
+ mode = evolution_config.mode
1002
+ existing_cols = self.get_table_columns(table)
1003
+
1004
+ if isinstance(df, pl.LazyFrame):
1005
+ df_schema = df.collect_schema()
1006
+ df_cols = set(df_schema.names())
1007
+ else:
1008
+ df_schema = df.schema
1009
+ df_cols = set(df.columns)
1010
+
1011
+ table_cols = set(existing_cols.keys())
1012
+ new_cols = df_cols - table_cols
1013
+
1014
+ if mode == SqlServerSchemaEvolutionMode.STRICT:
1015
+ if new_cols:
1016
+ raise ValueError(
1017
+ f"Schema evolution mode is 'strict' but DataFrame has new columns "
1018
+ f"not in target table: {new_cols}"
1019
+ )
1020
+ return list(df_cols)
1021
+
1022
+ elif mode == SqlServerSchemaEvolutionMode.EVOLVE:
1023
+ if new_cols and evolution_config.add_columns:
1024
+ new_cols_with_types = {}
1025
+ for col in new_cols:
1026
+ new_cols_with_types[col] = self.infer_sql_type_polars(df_schema[col])
1027
+ self.add_columns(table, new_cols_with_types)
1028
+ return list(df_cols)
1029
+
1030
+ elif mode == SqlServerSchemaEvolutionMode.IGNORE:
1031
+ return [c for c in df_cols if c in table_cols]
1032
+
1033
+ return list(df_cols)
1034
+
1035
+ def truncate_staging(self, staging_table: str) -> None:
1036
+ """
1037
+ Truncate staging table if it exists.
1038
+
1039
+ Args:
1040
+ staging_table: Full staging table name (e.g., '[staging].[oee_fact_staging]')
1041
+ """
1042
+ sql = f"""
1043
+ IF OBJECT_ID('{staging_table}', 'U') IS NOT NULL
1044
+ TRUNCATE TABLE {staging_table}
1045
+ """
1046
+ self.ctx.debug("Truncating staging table", staging_table=staging_table)
1047
+ self.connection.execute_sql(sql)
1048
+
1049
+ def truncate_table(self, table: str) -> None:
1050
+ """Truncate a table."""
1051
+ escaped = self.get_escaped_table_name(table)
1052
+ sql = f"TRUNCATE TABLE {escaped}"
1053
+ self.ctx.debug("Truncating table", table=table)
1054
+ self.connection.execute_sql(sql)
1055
+
1056
+ def delete_from_table(self, table: str) -> int:
1057
+ """Delete all rows from a table and return count."""
1058
+ escaped = self.get_escaped_table_name(table)
1059
+ sql = f"DELETE FROM {escaped}; SELECT @@ROWCOUNT AS deleted_count;"
1060
+ self.ctx.debug("Deleting from table", table=table)
1061
+ result = self.connection.execute_sql(sql)
1062
+ # Result is now a list of rows (fetchall already called in AzureSQL.execute)
1063
+ row = result[0] if result else None
1064
+ if row:
1065
+ return row.get("deleted_count", 0) if isinstance(row, dict) else row[0]
1066
+ return 0
1067
+
1068
+ def drop_table(self, table: str) -> None:
1069
+ """Drop a table if it exists."""
1070
+ escaped = self.get_escaped_table_name(table)
1071
+ sql = f"DROP TABLE IF EXISTS {escaped}"
1072
+ self.ctx.debug("Dropping table", table=table)
1073
+ self.connection.execute_sql(sql)
1074
+
1075
+ def build_merge_sql(
1076
+ self,
1077
+ target_table: str,
1078
+ staging_table: str,
1079
+ merge_keys: List[str],
1080
+ columns: List[str],
1081
+ options: Optional[SqlServerMergeOptions] = None,
1082
+ ) -> str:
1083
+ """
1084
+ Build T-SQL MERGE statement.
1085
+
1086
+ Args:
1087
+ target_table: Target table name
1088
+ staging_table: Staging table name
1089
+ merge_keys: Key columns for ON clause
1090
+ columns: All columns in the DataFrame
1091
+ options: Merge options (conditions, audit cols, etc.)
1092
+
1093
+ Returns:
1094
+ T-SQL MERGE statement
1095
+ """
1096
+ options = options or SqlServerMergeOptions()
1097
+
1098
+ exclude_cols = set(options.exclude_columns)
1099
+ audit_created = options.audit_cols.created_col if options.audit_cols else None
1100
+ audit_updated = options.audit_cols.updated_col if options.audit_cols else None
1101
+
1102
+ merge_cols = [c for c in columns if c not in exclude_cols]
1103
+
1104
+ update_cols = [c for c in merge_cols if c not in merge_keys and c != audit_created]
1105
+ insert_cols = [c for c in merge_cols]
1106
+
1107
+ on_clause = " AND ".join(
1108
+ [f"target.{self.escape_column(k)} = source.{self.escape_column(k)}" for k in merge_keys]
1109
+ )
1110
+
1111
+ update_set_parts = []
1112
+ for col in update_cols:
1113
+ if col == audit_updated:
1114
+ update_set_parts.append(f"{self.escape_column(col)} = GETUTCDATE()")
1115
+ else:
1116
+ update_set_parts.append(
1117
+ f"{self.escape_column(col)} = source.{self.escape_column(col)}"
1118
+ )
1119
+ update_set = ",\n ".join(update_set_parts)
1120
+
1121
+ insert_col_list = ", ".join([self.escape_column(c) for c in insert_cols])
1122
+ insert_value_parts = []
1123
+ for col in insert_cols:
1124
+ if col == audit_created or col == audit_updated:
1125
+ insert_value_parts.append("GETUTCDATE()")
1126
+ else:
1127
+ insert_value_parts.append(f"source.{self.escape_column(col)}")
1128
+ insert_values = ", ".join(insert_value_parts)
1129
+
1130
+ target_escaped = self.get_escaped_table_name(target_table)
1131
+
1132
+ sql_parts = [
1133
+ "DECLARE @MergeActions TABLE (action NVARCHAR(10));",
1134
+ "",
1135
+ f"MERGE {target_escaped} AS target",
1136
+ f"USING {staging_table} AS source",
1137
+ f"ON {on_clause}",
1138
+ ]
1139
+
1140
+ if options.update_condition:
1141
+ sql_parts.append(f"WHEN MATCHED AND {options.update_condition} THEN")
1142
+ else:
1143
+ sql_parts.append("WHEN MATCHED THEN")
1144
+
1145
+ sql_parts.append(" UPDATE SET")
1146
+ sql_parts.append(f" {update_set}")
1147
+
1148
+ if options.delete_condition:
1149
+ sql_parts.append(f"WHEN MATCHED AND {options.delete_condition} THEN")
1150
+ sql_parts.append(" DELETE")
1151
+
1152
+ if options.insert_condition:
1153
+ sql_parts.append(f"WHEN NOT MATCHED BY TARGET AND {options.insert_condition} THEN")
1154
+ else:
1155
+ sql_parts.append("WHEN NOT MATCHED BY TARGET THEN")
1156
+
1157
+ sql_parts.append(f" INSERT ({insert_col_list})")
1158
+ sql_parts.append(f" VALUES ({insert_values})")
1159
+
1160
+ sql_parts.append("OUTPUT $action INTO @MergeActions;")
1161
+ sql_parts.append("")
1162
+ sql_parts.append("SELECT")
1163
+ sql_parts.append(" SUM(CASE WHEN action = 'INSERT' THEN 1 ELSE 0 END) AS inserted,")
1164
+ sql_parts.append(" SUM(CASE WHEN action = 'UPDATE' THEN 1 ELSE 0 END) AS updated,")
1165
+ sql_parts.append(" SUM(CASE WHEN action = 'DELETE' THEN 1 ELSE 0 END) AS deleted")
1166
+ sql_parts.append("FROM @MergeActions;")
1167
+
1168
+ return "\n".join(sql_parts)
1169
+
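    # Editor's sketch (not part of the package): approximate shape of the
    # statement produced by build_merge_sql for a hypothetical target
    # 'oee.oee_fact' with merge_keys=['plant_id', 'shift_date'], one extra
    # column 'oee_pct', and no optional conditions:
    #
    #     DECLARE @MergeActions TABLE (action NVARCHAR(10));
    #
    #     MERGE [oee].[oee_fact] AS target
    #     USING [staging].[oee_fact_staging] AS source
    #     ON target.[plant_id] = source.[plant_id] AND target.[shift_date] = source.[shift_date]
    #     WHEN MATCHED THEN
    #         UPDATE SET
    #         [oee_pct] = source.[oee_pct]
    #     WHEN NOT MATCHED BY TARGET THEN
    #         INSERT ([plant_id], [shift_date], [oee_pct])
    #         VALUES (source.[plant_id], source.[shift_date], source.[oee_pct])
    #     OUTPUT $action INTO @MergeActions;
    #
    #     SELECT
    #         SUM(CASE WHEN action = 'INSERT' THEN 1 ELSE 0 END) AS inserted,
    #         SUM(CASE WHEN action = 'UPDATE' THEN 1 ELSE 0 END) AS updated,
    #         SUM(CASE WHEN action = 'DELETE' THEN 1 ELSE 0 END) AS deleted
    #     FROM @MergeActions;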
1170
+ def execute_merge(
1171
+ self,
1172
+ target_table: str,
1173
+ staging_table: str,
1174
+ merge_keys: List[str],
1175
+ columns: List[str],
1176
+ options: Optional[SqlServerMergeOptions] = None,
1177
+ ) -> MergeResult:
1178
+ """
1179
+ Execute MERGE operation and return counts.
1180
+
1181
+ Args:
1182
+ target_table: Target table name
1183
+ staging_table: Staging table name
1184
+ merge_keys: Key columns for ON clause
1185
+ columns: All columns in the DataFrame
1186
+ options: Merge options
1187
+
1188
+ Returns:
1189
+ MergeResult with insert/update/delete counts
1190
+ """
1191
+ sql = self.build_merge_sql(
1192
+ target_table=target_table,
1193
+ staging_table=staging_table,
1194
+ merge_keys=merge_keys,
1195
+ columns=columns,
1196
+ options=options,
1197
+ )
1198
+
1199
+ self.ctx.debug(
1200
+ "Executing MERGE",
1201
+ target_table=target_table,
1202
+ staging_table=staging_table,
1203
+ merge_keys=merge_keys,
1204
+ )
1205
+
1206
+ try:
1207
+ result = self.connection.execute_sql(sql)
1208
+
1209
+ # Result is now a list of rows (fetchall already called in AzureSQL.execute)
1210
+ row = result[0] if result else None
1211
+ if row:
1212
+ if isinstance(row, dict):
1213
+ merge_result = MergeResult(
1214
+ inserted=row.get("inserted", 0) or 0,
1215
+ updated=row.get("updated", 0) or 0,
1216
+ deleted=row.get("deleted", 0) or 0,
1217
+ )
1218
+ else:
1219
+ merge_result = MergeResult(
1220
+ inserted=row[0] or 0,
1221
+ updated=row[1] or 0,
1222
+ deleted=row[2] or 0,
1223
+ )
1224
+ else:
1225
+ merge_result = MergeResult()
1226
+
1227
+ self.ctx.info(
1228
+ "MERGE completed",
1229
+ target_table=target_table,
1230
+ inserted=merge_result.inserted,
1231
+ updated=merge_result.updated,
1232
+ deleted=merge_result.deleted,
1233
+ total_affected=merge_result.total_affected,
1234
+ )
1235
+
1236
+ return merge_result
1237
+
1238
+ except Exception as e:
1239
+ self.ctx.error(
1240
+ "MERGE failed",
1241
+ target_table=target_table,
1242
+ error_type=type(e).__name__,
1243
+ error_message=str(e),
1244
+ )
1245
+ raise
1246
+
1247
+ def merge(
1248
+ self,
1249
+ df: Any,
1250
+ spark_engine: Any,
1251
+ target_table: str,
1252
+ merge_keys: List[str],
1253
+ options: Optional[SqlServerMergeOptions] = None,
1254
+ jdbc_options: Optional[Dict[str, Any]] = None,
1255
+ ) -> MergeResult:
1256
+ """
1257
+ Execute full merge operation: validation + staging write + MERGE.
1258
+
1259
+ Args:
1260
+ df: Spark DataFrame to merge
1261
+ spark_engine: SparkEngine instance for writing to staging
1262
+ target_table: Target table name (e.g., 'oee.oee_fact')
1263
+ merge_keys: Key columns for ON clause
1264
+ options: Merge options
1265
+ jdbc_options: JDBC connection options
1266
+
1267
+ Returns:
1268
+ MergeResult with counts
1269
+ """
1270
+ options = options or SqlServerMergeOptions()
1271
+ jdbc_options = jdbc_options or {}
1272
+
1273
+ # Auto-create schema if needed
1274
+ if options.auto_create_schema:
1275
+ schema, _ = self.parse_table_name(target_table)
1276
+ if not self.check_schema_exists(schema):
1277
+ self.create_schema(schema)
1278
+
1279
+ # Check if table exists, auto-create if configured
1280
+ if not self.check_table_exists(target_table):
1281
+ if options.auto_create_table:
1282
+ self.ctx.info(
1283
+ "Auto-creating target table from Spark DataFrame",
1284
+ target_table=target_table,
1285
+ )
1286
+
1287
+ # Create table using JDBC write with overwrite mode (initial load)
1288
+ staging_jdbc_options = {**jdbc_options, "dbtable": target_table}
1289
+ df.write.format("jdbc").options(**staging_jdbc_options).mode("overwrite").save()
1290
+
1291
+ row_count = df.count()
1292
+
1293
+ # Add audit columns if configured (JDBC doesn't create them automatically)
1294
+ if options.audit_cols:
1295
+ audit_cols_to_add = {}
1296
+ existing_cols = self.get_table_columns(target_table)
1297
+ if (
1298
+ options.audit_cols.created_col
1299
+ and options.audit_cols.created_col not in existing_cols
1300
+ ):
1301
+ audit_cols_to_add[options.audit_cols.created_col] = "DATETIME2"
1302
+ if (
1303
+ options.audit_cols.updated_col
1304
+ and options.audit_cols.updated_col not in existing_cols
1305
+ ):
1306
+ audit_cols_to_add[options.audit_cols.updated_col] = "DATETIME2"
1307
+ if audit_cols_to_add:
1308
+ self.add_columns(target_table, audit_cols_to_add)
1309
+
1310
+ # Populate audit columns for all rows on first load
1311
+ escaped_table = self.get_escaped_table_name(target_table)
1312
+ update_parts = []
1313
+ if options.audit_cols.created_col:
1314
+ escaped_col = self.escape_column(options.audit_cols.created_col)
1315
+ update_parts.append(f"{escaped_col} = GETUTCDATE()")
1316
+ if options.audit_cols.updated_col:
1317
+ escaped_col = self.escape_column(options.audit_cols.updated_col)
1318
+ update_parts.append(f"{escaped_col} = GETUTCDATE()")
1319
+ if update_parts:
1320
+ update_sql = f"UPDATE {escaped_table} SET {', '.join(update_parts)}"
1321
+ self.ctx.debug("Populating audit columns on initial load")
1322
+ self.connection.execute_sql(update_sql)
1323
+
1324
+ # Create primary key or index on merge keys if configured
1325
+ if options.primary_key_on_merge_keys or options.index_on_merge_keys:
1326
+ # Fix MAX columns in merge keys - SQL Server can't index MAX types
1327
+ self._fix_max_columns_for_indexing(target_table, merge_keys)
1328
+
1329
+ if options.primary_key_on_merge_keys:
1330
+ self.create_primary_key(target_table, merge_keys)
1331
+ elif options.index_on_merge_keys:
1332
+ self.create_index(target_table, merge_keys)
1333
+
1334
+ self.ctx.info(
1335
+ "Target table created and initial data loaded",
1336
+ target_table=target_table,
1337
+ rows=row_count,
1338
+ )
1339
+ # Return as if merge completed (all inserts)
1340
+ return MergeResult(inserted=row_count, updated=0, deleted=0)
1341
+ else:
1342
+ raise ValueError(
1343
+ f"Target table '{target_table}' does not exist. "
1344
+ "SQL Server MERGE mode requires the target table to exist. "
1345
+ "Set auto_create_table=true or use mode='overwrite' for initial load."
1346
+ )
1347
+
1348
+ if options.validations:
1349
+ validation_result = self.validate_keys_spark(df, merge_keys, options.validations)
1350
+ if not validation_result.is_valid:
1351
+ error_msg = "; ".join(validation_result.errors)
1352
+ if options.validations.fail_on_validation_error:
1353
+ raise ValueError(f"Merge key validation failed: {error_msg}")
1354
+ else:
1355
+ self.ctx.warning(f"Merge key validation warnings: {error_msg}")
1356
+
1357
+ staging_table = self.get_staging_table_name(target_table, options.staging_schema)
1358
+
1359
+ # Auto-create staging schema if needed
1360
+ if options.auto_create_schema:
1361
+ if not self.check_schema_exists(options.staging_schema):
1362
+ self.create_schema(options.staging_schema)
1363
+
1364
+ self.ctx.info(
1365
+ "Starting SQL Server MERGE",
1366
+ target_table=target_table,
1367
+ staging_table=staging_table,
1368
+ merge_keys=merge_keys,
1369
+ incremental=options.incremental,
1370
+ )
1371
+
1372
+ self.truncate_staging(staging_table)
1373
+
1374
+ columns = list(df.columns)
1375
+ df_to_write = df
1376
+
1377
+ if options.audit_cols:
1378
+ if options.audit_cols.created_col and options.audit_cols.created_col not in columns:
1379
+ columns.append(options.audit_cols.created_col)
1380
+ if options.audit_cols.updated_col and options.audit_cols.updated_col not in columns:
1381
+ columns.append(options.audit_cols.updated_col)
1382
+
1383
+ # Incremental merge: filter to only changed rows before writing to staging
1384
+ if options.incremental:
1385
+ hash_column = self.get_hash_column_name(df.columns, options.hash_column)
1386
+
1387
+ if hash_column is None and options.change_detection_columns:
1388
+ # Compute hash from specified columns
1389
+ hash_column = "_computed_hash"
1390
+ df_to_write = self.compute_hash_spark(
1391
+ df, options.change_detection_columns, hash_column
1392
+ )
1393
+ columns.append(hash_column)
1394
+ elif hash_column is None:
1395
+ # Compute hash from all non-key columns
1396
+ non_key_cols = [c for c in df.columns if c not in merge_keys]
1397
+ if non_key_cols:
1398
+ hash_column = "_computed_hash"
1399
+ df_to_write = self.compute_hash_spark(df, non_key_cols, hash_column)
1400
+ columns.append(hash_column)
1401
+
1402
+ if hash_column:
1403
+ # Read target hashes and filter source
1404
+ target_hashes = self.read_target_hashes(target_table, merge_keys, hash_column)
1405
+ original_count = df_to_write.count()
1406
+ df_to_write = self.filter_changed_rows_spark(
1407
+ df_to_write, target_hashes, merge_keys, hash_column
1408
+ )
1409
+ filtered_count = df_to_write.count()
1410
+ self.ctx.info(
1411
+ "Incremental filter applied",
1412
+ original_rows=original_count,
1413
+ changed_rows=filtered_count,
1414
+ skipped_rows=original_count - filtered_count,
1415
+ )
1416
+
1417
+ if filtered_count == 0:
1418
+ self.ctx.info("No changed rows detected, skipping merge")
1419
+ return MergeResult(inserted=0, updated=0, deleted=0)
1420
+
1421
+ staging_jdbc_options = {**jdbc_options, "dbtable": staging_table}
1422
+ df_to_write.write.format("jdbc").options(**staging_jdbc_options).mode("overwrite").save()
1423
+
1424
+ self.ctx.debug("Staging write completed", staging_table=staging_table)
1425
+
1426
+ # Handle schema evolution before MERGE - add any new columns to target table
1427
+ if options.schema_evolution and options.schema_evolution.add_columns:
1428
+ existing_cols = self.get_table_columns(target_table)
1429
+ new_cols = [c for c in columns if c not in existing_cols]
1430
+ if new_cols:
1431
+ new_cols_with_types = {}
1432
+ staging_cols = self.get_table_columns(staging_table)
1433
+ for col in new_cols:
1434
+                    # Hash columns hold short hash strings (e.g. a 32-char MD5 hex digest); NVARCHAR(256) is ample
1435
+ if col in ("_computed_hash", "_hash", "_hash_diff"):
1436
+ new_cols_with_types[col] = "NVARCHAR(256)"
1437
+ elif col in staging_cols:
1438
+ new_cols_with_types[col] = staging_cols[col]
1439
+ else:
1440
+ new_cols_with_types[col] = "NVARCHAR(MAX)"
1441
+ self.ctx.info(
1442
+ "Adding new columns to target table via schema evolution",
1443
+ target_table=target_table,
1444
+ new_columns=list(new_cols_with_types.keys()),
1445
+ )
1446
+ self.add_columns(target_table, new_cols_with_types)
1447
+
1448
+ result = self.execute_merge(
1449
+ target_table=target_table,
1450
+ staging_table=staging_table,
1451
+ merge_keys=merge_keys,
1452
+ columns=columns,
1453
+ options=options,
1454
+ )
1455
+
1456
+ return result
1457
+
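    # Editor's note (not part of the package): the Spark merge() flow above, in
    # order: optionally create the schema and (on first load) the target table
    # via a JDBC overwrite, validate the merge keys, truncate the staging
    # table, optionally filter to changed rows using the hash helpers, write
    # the remaining rows to staging over JDBC, apply schema evolution to the
    # target, then run execute_merge and return the MERGE counts.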
1458
+ def merge_pandas(
1459
+ self,
1460
+ df: Any,
1461
+ target_table: str,
1462
+ merge_keys: List[str],
1463
+ options: Optional[SqlServerMergeOptions] = None,
1464
+ ) -> MergeResult:
1465
+ """
1466
+ Execute full merge operation for Pandas DataFrame.
1467
+
1468
+ Args:
1469
+ df: Pandas DataFrame to merge
1470
+ target_table: Target table name (e.g., 'oee.oee_fact')
1471
+ merge_keys: Key columns for ON clause
1472
+ options: Merge options
1473
+
1474
+ Returns:
1475
+ MergeResult with counts
1476
+ """
1477
+ options = options or SqlServerMergeOptions()
1478
+
1479
+ schema, _ = self.parse_table_name(target_table)
1480
+ if options.auto_create_schema:
1481
+ self.create_schema(schema)
1482
+
1483
+ table_exists = self.check_table_exists(target_table)
1484
+ if not table_exists:
1485
+ if options.auto_create_table:
1486
+ self.create_table_from_pandas(df, target_table, audit_cols=options.audit_cols)
1487
+ if options.primary_key_on_merge_keys or options.index_on_merge_keys:
1488
+ # Fix MAX columns in merge keys - SQL Server can't index MAX types
1489
+ self._fix_max_columns_for_indexing(target_table, merge_keys)
1490
+ if options.primary_key_on_merge_keys:
1491
+ self.create_primary_key(target_table, merge_keys)
1492
+ elif options.index_on_merge_keys:
1493
+ self.create_index(target_table, merge_keys)
1494
+ else:
1495
+ raise ValueError(
1496
+ f"Target table '{target_table}' does not exist. "
1497
+ "SQL Server MERGE mode requires the target table to exist. "
1498
+ "Set auto_create_table=true or use mode='overwrite' for initial load."
1499
+ )
1500
+
1501
+ if options.validations:
1502
+ validation_result = self.validate_keys_pandas(df, merge_keys, options.validations)
1503
+ if not validation_result.is_valid:
1504
+ error_msg = "; ".join(validation_result.errors)
1505
+ if options.validations.fail_on_validation_error:
1506
+ raise ValueError(f"Merge key validation failed: {error_msg}")
1507
+ else:
1508
+ self.ctx.warning(f"Merge key validation warnings: {error_msg}")
1509
+
1510
+ staging_table = self.get_staging_table_name(target_table, options.staging_schema)
1511
+
1512
+ self.ctx.info(
1513
+ "Starting SQL Server MERGE (Pandas)",
1514
+ target_table=target_table,
1515
+ staging_table=staging_table,
1516
+ merge_keys=merge_keys,
1517
+ incremental=options.incremental,
1518
+ )
1519
+
1520
+ columns = list(df.columns)
1521
+ df_to_write = df
1522
+
1523
+ if options.audit_cols:
1524
+ if options.audit_cols.created_col and options.audit_cols.created_col not in columns:
1525
+ columns.append(options.audit_cols.created_col)
1526
+ if options.audit_cols.updated_col and options.audit_cols.updated_col not in columns:
1527
+ columns.append(options.audit_cols.updated_col)
1528
+
1529
+ # Incremental merge: filter to only changed rows before writing to staging
1530
+ if options.incremental and table_exists:
1531
+ hash_column = self.get_hash_column_name(list(df.columns), options.hash_column)
1532
+
1533
+ if hash_column is None and options.change_detection_columns:
1534
+ hash_column = "_computed_hash"
1535
+ df_to_write = self.compute_hash_pandas(
1536
+ df, options.change_detection_columns, hash_column
1537
+ )
1538
+ columns.append(hash_column)
1539
+ elif hash_column is None:
1540
+ non_key_cols = [c for c in df.columns if c not in merge_keys]
1541
+ if non_key_cols:
1542
+ hash_column = "_computed_hash"
1543
+ df_to_write = self.compute_hash_pandas(df, list(non_key_cols), hash_column)
1544
+ columns.append(hash_column)
1545
+
1546
+ if hash_column:
1547
+ target_hashes = self.read_target_hashes(target_table, merge_keys, hash_column)
1548
+ original_count = len(df_to_write)
1549
+ df_to_write = self.filter_changed_rows_pandas(
1550
+ df_to_write, target_hashes, merge_keys, hash_column
1551
+ )
1552
+ filtered_count = len(df_to_write)
1553
+ self.ctx.info(
1554
+ "Incremental filter applied (Pandas)",
1555
+ original_rows=original_count,
1556
+ changed_rows=filtered_count,
1557
+ skipped_rows=original_count - filtered_count,
1558
+ )
1559
+
1560
+ if filtered_count == 0:
1561
+ self.ctx.info("No changed rows detected, skipping merge")
1562
+ return MergeResult(inserted=0, updated=0, deleted=0)
1563
+
1564
+ schema, table_name = staging_table.strip("[]").split("].[")
1565
+ schema = schema.strip("[")
1566
+ table_name = table_name.strip("]")
1567
+
1568
+ self.connection.write_table(
1569
+ df=df_to_write,
1570
+ table_name=table_name,
1571
+ schema=schema,
1572
+ if_exists="replace",
1573
+ )
1574
+
1575
+ self.ctx.debug("Staging write completed (Pandas)", staging_table=staging_table)
1576
+
1577
+ # Handle schema evolution before MERGE - add any new columns to target table
1578
+ if options.schema_evolution and options.schema_evolution.add_columns:
1579
+ existing_cols = self.get_table_columns(target_table)
1580
+ new_cols = [c for c in columns if c not in existing_cols]
1581
+ if new_cols:
1582
+ new_cols_with_types = {}
1583
+ staging_cols = self.get_table_columns(staging_table)
1584
+ for col in new_cols:
1585
+ # Hash columns hold a SHA-256 hex digest (64 chars); NVARCHAR(256) leaves headroom
1586
+ if col in ("_computed_hash", "_hash", "_hash_diff"):
1587
+ new_cols_with_types[col] = "NVARCHAR(256)"
1588
+ elif col in staging_cols:
1589
+ new_cols_with_types[col] = staging_cols[col]
1590
+ else:
1591
+ new_cols_with_types[col] = "NVARCHAR(MAX)"
1592
+ self.ctx.info(
1593
+ "Adding new columns to target table via schema evolution",
1594
+ target_table=target_table,
1595
+ new_columns=list(new_cols_with_types.keys()),
1596
+ )
1597
+ self.add_columns(target_table, new_cols_with_types)
1598
+
1599
+ result = self.execute_merge(
1600
+ target_table=target_table,
1601
+ staging_table=staging_table,
1602
+ merge_keys=merge_keys,
1603
+ columns=columns,
1604
+ options=options,
1605
+ )
1606
+
1607
+ return result
1608
+
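A hedged usage sketch for merge_pandas: it assumes `writer` is an already-constructed instance of this class with a live SQL Server connection, and that SqlServerMergeOptions accepts the attributes referenced above as constructor keywords (the exact signature may differ).

import pandas as pd

df = pd.DataFrame(
    {"plant_id": [1, 2], "date_key": [20240101, 20240101], "oee": [0.81, 0.77]}
)
options = SqlServerMergeOptions(
    incremental=True,                # hash-compare against the target and skip unchanged rows
    auto_create_table=True,          # create the target table on first run
    primary_key_on_merge_keys=True,  # add a PK on the merge keys after creation
)
result = writer.merge_pandas(
    df,
    target_table="oee.oee_fact",
    merge_keys=["plant_id", "date_key"],
    options=options,
)
print(result.inserted, result.updated, result.deleted)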
1609
+ def overwrite_spark(
1610
+ self,
1611
+ df: Any,
1612
+ target_table: str,
1613
+ options: Optional[SqlServerOverwriteOptions] = None,
1614
+ jdbc_options: Optional[Dict[str, Any]] = None,
1615
+ ) -> OverwriteResult:
1616
+ """
1617
+ Execute enhanced overwrite operation for Spark DataFrame.
1618
+
1619
+ Args:
1620
+ df: Spark DataFrame to write
1621
+ target_table: Target table name
1622
+ options: Overwrite options
1623
+ jdbc_options: JDBC connection options
1624
+
1625
+ Returns:
1626
+ OverwriteResult with row count
1627
+ """
1628
+ options = options or SqlServerOverwriteOptions()
1629
+ jdbc_options = jdbc_options or {}
1630
+ strategy = options.strategy
1631
+
1632
+ self.ctx.info(
1633
+ "Starting SQL Server overwrite",
1634
+ target_table=target_table,
1635
+ strategy=strategy.value,
1636
+ )
1637
+
1638
+ table_exists = self.check_table_exists(target_table)
1639
+
1640
+ if strategy == SqlServerOverwriteStrategy.DROP_CREATE:
1641
+ if table_exists:
1642
+ self.drop_table(target_table)
1643
+
1644
+ jdbc_options_with_table = {**jdbc_options, "dbtable": target_table}
1645
+ df.write.format("jdbc").options(**jdbc_options_with_table).mode("overwrite").save()
1646
+
1647
+ elif strategy == SqlServerOverwriteStrategy.TRUNCATE_INSERT:
1648
+ if table_exists:
1649
+ self.truncate_table(target_table)
1650
+ jdbc_options_with_table = {**jdbc_options, "dbtable": target_table}
1651
+ df.write.format("jdbc").options(**jdbc_options_with_table).mode("append").save()
1652
+ else:
1653
+ jdbc_options_with_table = {**jdbc_options, "dbtable": target_table}
1654
+ df.write.format("jdbc").options(**jdbc_options_with_table).mode("overwrite").save()
1655
+
1656
+ elif strategy == SqlServerOverwriteStrategy.DELETE_INSERT:
1657
+ if table_exists:
1658
+ self.delete_from_table(target_table)
1659
+ jdbc_options_with_table = {**jdbc_options, "dbtable": target_table}
1660
+ df.write.format("jdbc").options(**jdbc_options_with_table).mode("append").save()
1661
+ else:
1662
+ jdbc_options_with_table = {**jdbc_options, "dbtable": target_table}
1663
+ df.write.format("jdbc").options(**jdbc_options_with_table).mode("overwrite").save()
1664
+
1665
+ row_count = df.count()
1666
+
1667
+ self.ctx.info(
1668
+ "Overwrite completed",
1669
+ target_table=target_table,
1670
+ strategy=strategy.value,
1671
+ rows_written=row_count,
1672
+ )
1673
+
1674
+ return OverwriteResult(rows_written=row_count, strategy=strategy.value)
1675
+
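A hedged usage sketch for overwrite_spark: it assumes `writer` is an instance of this class, `spark_df` is an existing Spark DataFrame, and that the jdbc_options dict carries the url/credentials expected by Spark's built-in JDBC writer; the placeholder values are illustrative.

options = SqlServerOverwriteOptions(strategy=SqlServerOverwriteStrategy.TRUNCATE_INSERT)
result = writer.overwrite_spark(
    spark_df,                       # an existing Spark DataFrame
    target_table="oee.oee_daily",
    options=options,
    jdbc_options={
        "url": "jdbc:sqlserver://<host>:1433;databaseName=<db>",  # placeholder
        "user": "<user>",
        "password": "<password>",
    },
)
print(result.rows_written, result.strategy)

As a general SQL Server trade-off: TRUNCATE_INSERT keeps the existing table definition, indexes, and grants; DROP_CREATE allows the schema to change on reload; DELETE_INSERT is fully logged and plays better with foreign-key constraints.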
1676
+ def overwrite_pandas(
1677
+ self,
1678
+ df: Any,
1679
+ target_table: str,
1680
+ options: Optional[SqlServerOverwriteOptions] = None,
1681
+ ) -> OverwriteResult:
1682
+ """
1683
+ Execute enhanced overwrite operation for Pandas DataFrame.
1684
+
1685
+ Args:
1686
+ df: Pandas DataFrame to write
1687
+ target_table: Target table name
1688
+ options: Overwrite options
1689
+
1690
+ Returns:
1691
+ OverwriteResult with row count
1692
+ """
1693
+ options = options or SqlServerOverwriteOptions()
1694
+ strategy = options.strategy
1695
+
1696
+ self.ctx.info(
1697
+ "Starting SQL Server overwrite (Pandas)",
1698
+ target_table=target_table,
1699
+ strategy=strategy.value,
1700
+ )
1701
+
1702
+ table_exists = self.check_table_exists(target_table)
1703
+ schema, table_name = self.parse_table_name(target_table)
1704
+
1705
+ if strategy == SqlServerOverwriteStrategy.DROP_CREATE:
1706
+ if table_exists:
1707
+ self.drop_table(target_table)
1708
+ self.connection.write_table(
1709
+ df=df,
1710
+ table_name=table_name,
1711
+ schema=schema,
1712
+ if_exists="replace",
1713
+ )
1714
+
1715
+ elif strategy == SqlServerOverwriteStrategy.TRUNCATE_INSERT:
1716
+ if table_exists:
1717
+ self.truncate_table(target_table)
1718
+ self.connection.write_table(
1719
+ df=df,
1720
+ table_name=table_name,
1721
+ schema=schema,
1722
+ if_exists="append",
1723
+ )
1724
+ else:
1725
+ self.connection.write_table(
1726
+ df=df,
1727
+ table_name=table_name,
1728
+ schema=schema,
1729
+ if_exists="replace",
1730
+ )
1731
+
1732
+ elif strategy == SqlServerOverwriteStrategy.DELETE_INSERT:
1733
+ if table_exists:
1734
+ self.delete_from_table(target_table)
1735
+ self.connection.write_table(
1736
+ df=df,
1737
+ table_name=table_name,
1738
+ schema=schema,
1739
+ if_exists="append",
1740
+ )
1741
+ else:
1742
+ self.connection.write_table(
1743
+ df=df,
1744
+ table_name=table_name,
1745
+ schema=schema,
1746
+ if_exists="replace",
1747
+ )
1748
+
1749
+ row_count = len(df)
1750
+
1751
+ self.ctx.info(
1752
+ "Overwrite completed (Pandas)",
1753
+ target_table=target_table,
1754
+ strategy=strategy.value,
1755
+ rows_written=row_count,
1756
+ )
1757
+
1758
+ return OverwriteResult(rows_written=row_count, strategy=strategy.value)
1759
+
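For reference, a generic standalone equivalent of the DELETE_INSERT path above using pandas plus SQLAlchemy directly; the module itself routes writes through self.connection.write_table, so this only illustrates the pattern, and the connection string is a placeholder.

import pandas as pd
import sqlalchemy as sa

engine = sa.create_engine("mssql+pyodbc://user:pass@my_dsn")  # placeholder connection string
df = pd.DataFrame({"plant_id": [1, 2], "oee": [0.81, 0.77]})

with engine.begin() as conn:  # one transaction: the DELETE and the INSERTs commit together
    conn.execute(sa.text("DELETE FROM oee.oee_daily"))
    df.to_sql("oee_daily", conn, schema="oee", if_exists="append", index=False)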
1760
+ def merge_polars(
1761
+ self,
1762
+ df: Any,
1763
+ target_table: str,
1764
+ merge_keys: List[str],
1765
+ options: Optional[SqlServerMergeOptions] = None,
1766
+ ) -> MergeResult:
1767
+ """
1768
+ Execute full merge operation for Polars DataFrame (Phase 4).
1769
+
1770
+ Args:
1771
+ df: Polars DataFrame or LazyFrame to merge
1772
+ target_table: Target table name (e.g., 'oee.oee_fact')
1773
+ merge_keys: Key columns for ON clause
1774
+ options: Merge options
1775
+
1776
+ Returns:
1777
+ MergeResult with counts
1778
+ """
1779
+ try:
1780
+ import polars as pl
1781
+ except ImportError:
1782
+ raise ImportError("Polars is not installed. Install it with 'pip install polars'.") from None
1783
+
1784
+ options = options or SqlServerMergeOptions()
1785
+
1786
+ if isinstance(df, pl.LazyFrame):
1787
+ df = df.collect()
1788
+
1789
+ schema, _ = self.parse_table_name(target_table)
1790
+ if options.auto_create_schema:
1791
+ self.create_schema(schema)
1792
+
1793
+ table_exists = self.check_table_exists(target_table)
1794
+ if not table_exists:
1795
+ if options.auto_create_table:
1796
+ self.create_table_from_polars(df, target_table, audit_cols=options.audit_cols)
1797
+ if options.primary_key_on_merge_keys or options.index_on_merge_keys:
1798
+ # Fix MAX columns in merge keys - SQL Server can't index MAX types
1799
+ self._fix_max_columns_for_indexing(target_table, merge_keys)
1800
+ if options.primary_key_on_merge_keys:
1801
+ self.create_primary_key(target_table, merge_keys)
1802
+ elif options.index_on_merge_keys:
1803
+ self.create_index(target_table, merge_keys)
1804
+ else:
1805
+ raise ValueError(
1806
+ f"Target table '{target_table}' does not exist. "
1807
+ "SQL Server MERGE mode requires the target table to exist. "
1808
+ "Set auto_create_table=true or use mode='overwrite' for initial load."
1809
+ )
1810
+
1811
+ if options.schema_evolution and table_exists:
1812
+ columns = self.handle_schema_evolution_polars(
1813
+ df, target_table, options.schema_evolution
1814
+ )
1815
+ else:
1816
+ columns = list(df.columns)
1817
+
1818
+ if options.audit_cols:
1819
+ if options.audit_cols.created_col and options.audit_cols.created_col not in columns:
1820
+ columns.append(options.audit_cols.created_col)
1821
+ if options.audit_cols.updated_col and options.audit_cols.updated_col not in columns:
1822
+ columns.append(options.audit_cols.updated_col)
1823
+
1824
+ if options.validations:
1825
+ validation_result = self.validate_keys_polars(df, merge_keys, options.validations)
1826
+ if not validation_result.is_valid:
1827
+ error_msg = "; ".join(validation_result.errors)
1828
+ if options.validations.fail_on_validation_error:
1829
+ raise ValueError(f"Merge key validation failed: {error_msg}")
1830
+ else:
1831
+ self.ctx.warning(f"Merge key validation warnings: {error_msg}")
1832
+
1833
+ staging_table = self.get_staging_table_name(target_table, options.staging_schema)
1834
+ staging_schema, staging_table_name = staging_table.strip("[]").split("].[")
1835
+ staging_schema = staging_schema.strip("[")
1836
+ staging_table_name = staging_table_name.strip("]")
1837
+
1838
+ if options.auto_create_schema:
1839
+ self.create_schema(staging_schema)
1840
+
1841
+ self.ctx.info(
1842
+ "Starting SQL Server MERGE (Polars)",
1843
+ target_table=target_table,
1844
+ staging_table=staging_table,
1845
+ merge_keys=merge_keys,
1846
+ incremental=options.incremental,
1847
+ )
1848
+
1849
+ df_to_write = df
1850
+
1851
+ # Incremental merge: filter to only changed rows before writing to staging
1852
+ if options.incremental and table_exists:
1853
+ hash_column = self.get_hash_column_name(df.columns, options.hash_column)
1854
+
1855
+ if hash_column is None and options.change_detection_columns:
1856
+ hash_column = "_computed_hash"
1857
+ df_to_write = self.compute_hash_polars(
1858
+ df, options.change_detection_columns, hash_column
1859
+ )
1860
+ columns.append(hash_column)
1861
+ elif hash_column is None:
1862
+ non_key_cols = [c for c in df.columns if c not in merge_keys]
1863
+ if non_key_cols:
1864
+ hash_column = "_computed_hash"
1865
+ df_to_write = self.compute_hash_polars(df, non_key_cols, hash_column)
1866
+ columns.append(hash_column)
1867
+
1868
+ if hash_column:
1869
+ target_hashes = self.read_target_hashes(target_table, merge_keys, hash_column)
1870
+ original_count = len(df_to_write)
1871
+ df_to_write = self.filter_changed_rows_polars(
1872
+ df_to_write, target_hashes, merge_keys, hash_column
1873
+ )
1874
+ filtered_count = len(df_to_write)
1875
+ self.ctx.info(
1876
+ "Incremental filter applied (Polars)",
1877
+ original_rows=original_count,
1878
+ changed_rows=filtered_count,
1879
+ skipped_rows=original_count - filtered_count,
1880
+ )
1881
+
1882
+ if filtered_count == 0:
1883
+ self.ctx.info("No changed rows detected, skipping merge")
1884
+ return MergeResult(inserted=0, updated=0, deleted=0)
1885
+
1886
+ df_pandas = df_to_write.to_pandas()
1887
+
1888
+ batch_size = options.batch_size
1889
+ if batch_size and len(df_pandas) > batch_size:
1890
+ for i in range(0, len(df_pandas), batch_size):
1891
+ chunk = df_pandas.iloc[i : i + batch_size]
1892
+ if_exists = "replace" if i == 0 else "append"
1893
+ self.connection.write_table(
1894
+ df=chunk,
1895
+ table_name=staging_table_name,
1896
+ schema=staging_schema,
1897
+ if_exists=if_exists,
1898
+ )
1899
+ self.ctx.debug(f"Wrote batch {i // batch_size + 1}", rows=len(chunk))
1900
+ else:
1901
+ self.connection.write_table(
1902
+ df=df_pandas,
1903
+ table_name=staging_table_name,
1904
+ schema=staging_schema,
1905
+ if_exists="replace",
1906
+ )
1907
+
1908
+ self.ctx.debug("Staging write completed (Polars)", staging_table=staging_table)
1909
+
1910
+ # Handle schema evolution before MERGE - add any new columns to target table
1911
+ if options.schema_evolution and options.schema_evolution.add_columns:
1912
+ existing_cols = self.get_table_columns(target_table)
1913
+ new_cols = [c for c in columns if c not in existing_cols]
1914
+ if new_cols:
1915
+ new_cols_with_types = {}
1916
+ staging_cols = self.get_table_columns(staging_table)
1917
+ for col in new_cols:
1918
+ # Hash columns hold a SHA-256 hex digest (64 chars); NVARCHAR(256) leaves headroom
1919
+ if col in ("_computed_hash", "_hash", "_hash_diff"):
1920
+ new_cols_with_types[col] = "NVARCHAR(256)"
1921
+ elif col in staging_cols:
1922
+ new_cols_with_types[col] = staging_cols[col]
1923
+ else:
1924
+ new_cols_with_types[col] = "NVARCHAR(MAX)"
1925
+ self.ctx.info(
1926
+ "Adding new columns to target table via schema evolution",
1927
+ target_table=target_table,
1928
+ new_columns=list(new_cols_with_types.keys()),
1929
+ )
1930
+ self.add_columns(target_table, new_cols_with_types)
1931
+
1932
+ result = self.execute_merge(
1933
+ target_table=target_table,
1934
+ staging_table=staging_table,
1935
+ merge_keys=merge_keys,
1936
+ columns=columns,
1937
+ options=options,
1938
+ )
1939
+
1940
+ return result
1941
+
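The incremental path above hashes the tracked columns and drops rows whose hash already matches the target. A minimal sketch of that idea in plain Polars follows; the module's compute_hash_polars may use different separators or null handling, so treat this as an illustration only.

import hashlib
import polars as pl

def add_row_hash(df: pl.DataFrame, cols: list, hash_col: str = "_computed_hash") -> pl.DataFrame:
    """Concatenate the tracked columns into one string and take its SHA-256 hex digest."""
    concat = pl.concat_str(
        [pl.col(c).cast(pl.Utf8).fill_null("") for c in cols], separator="|"
    )
    return df.with_columns(
        concat.map_elements(
            lambda s: hashlib.sha256(s.encode("utf-8")).hexdigest(),
            return_dtype=pl.Utf8,
        ).alias(hash_col)
    )

df = pl.DataFrame({"plant_id": [1, 2], "oee": [0.81, 0.77]})
hashed = add_row_hash(df, ["oee"])
# Rows whose hash equals the value already stored on the target are unchanged and
# never reach the staging table.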
1942
+ def overwrite_polars(
1943
+ self,
1944
+ df: Any,
1945
+ target_table: str,
1946
+ options: Optional[SqlServerOverwriteOptions] = None,
1947
+ ) -> OverwriteResult:
1948
+ """
1949
+ Execute enhanced overwrite operation for Polars DataFrame (Phase 4).
1950
+
1951
+ Args:
1952
+ df: Polars DataFrame or LazyFrame to write
1953
+ target_table: Target table name
1954
+ options: Overwrite options
1955
+
1956
+ Returns:
1957
+ OverwriteResult with row count
1958
+ """
1959
+ try:
1960
+ import polars as pl
1961
+ except ImportError:
1962
+ raise ImportError("Polars is not installed. Install it with 'pip install polars'.") from None
1963
+
1964
+ options = options or SqlServerOverwriteOptions()
1965
+ strategy = options.strategy
1966
+
1967
+ if isinstance(df, pl.LazyFrame):
1968
+ df = df.collect()
1969
+
1970
+ schema, table_name = self.parse_table_name(target_table)
1971
+ if options.auto_create_schema:
1972
+ self.create_schema(schema)
1973
+
1974
+ self.ctx.info(
1975
+ "Starting SQL Server overwrite (Polars)",
1976
+ target_table=target_table,
1977
+ strategy=strategy.value,
1978
+ )
1979
+
1980
+ table_exists = self.check_table_exists(target_table)
1981
+
1982
+ if options.auto_create_table and not table_exists:
1983
+ self.create_table_from_polars(df, target_table)
1984
+ table_exists = True
1985
+
1986
+ if options.schema_evolution and table_exists:
1987
+ columns_to_write = self.handle_schema_evolution_polars(
1988
+ df, target_table, options.schema_evolution
1989
+ )
1990
+ df_to_write = df.select(columns_to_write)
1991
+ else:
1992
+ df_to_write = df
1993
+
1994
+ df_pandas = df_to_write.to_pandas()
1995
+
1996
+ batch_size = options.batch_size
1997
+ if strategy == SqlServerOverwriteStrategy.DROP_CREATE:
1998
+ if table_exists:
1999
+ self.drop_table(target_table)
2000
+ if batch_size and len(df_pandas) > batch_size:
2001
+ for i in range(0, len(df_pandas), batch_size):
2002
+ chunk = df_pandas.iloc[i : i + batch_size]
2003
+ if_exists = "replace" if i == 0 else "append"
2004
+ self.connection.write_table(
2005
+ df=chunk,
2006
+ table_name=table_name,
2007
+ schema=schema,
2008
+ if_exists=if_exists,
2009
+ )
2010
+ else:
2011
+ self.connection.write_table(
2012
+ df=df_pandas,
2013
+ table_name=table_name,
2014
+ schema=schema,
2015
+ if_exists="replace",
2016
+ )
2017
+
2018
+ elif strategy == SqlServerOverwriteStrategy.TRUNCATE_INSERT:
2019
+ if table_exists:
2020
+ self.truncate_table(target_table)
2021
+ if batch_size and len(df_pandas) > batch_size:
2022
+ for i in range(0, len(df_pandas), batch_size):
2023
+ chunk = df_pandas.iloc[i : i + batch_size]
2024
+ self.connection.write_table(
2025
+ df=chunk,
2026
+ table_name=table_name,
2027
+ schema=schema,
2028
+ if_exists="append",
2029
+ )
2030
+ else:
2031
+ self.connection.write_table(
2032
+ df=df_pandas,
2033
+ table_name=table_name,
2034
+ schema=schema,
2035
+ if_exists="append",
2036
+ )
2037
+ else:
2038
+ self.connection.write_table(
2039
+ df=df_pandas,
2040
+ table_name=table_name,
2041
+ schema=schema,
2042
+ if_exists="replace",
2043
+ )
2044
+
2045
+ elif strategy == SqlServerOverwriteStrategy.DELETE_INSERT:
2046
+ if table_exists:
2047
+ self.delete_from_table(target_table)
2048
+ if batch_size and len(df_pandas) > batch_size:
2049
+ for i in range(0, len(df_pandas), batch_size):
2050
+ chunk = df_pandas.iloc[i : i + batch_size]
2051
+ self.connection.write_table(
2052
+ df=chunk,
2053
+ table_name=table_name,
2054
+ schema=schema,
2055
+ if_exists="append",
2056
+ )
2057
+ else:
2058
+ self.connection.write_table(
2059
+ df=df_pandas,
2060
+ table_name=table_name,
2061
+ schema=schema,
2062
+ if_exists="append",
2063
+ )
2064
+ else:
2065
+ self.connection.write_table(
2066
+ df=df_pandas,
2067
+ table_name=table_name,
2068
+ schema=schema,
2069
+ if_exists="replace",
2070
+ )
2071
+
2072
+ row_count = len(df)
2073
+
2074
+ self.ctx.info(
2075
+ "Overwrite completed (Polars)",
2076
+ target_table=target_table,
2077
+ strategy=strategy.value,
2078
+ rows_written=row_count,
2079
+ )
2080
+
2081
+ return OverwriteResult(rows_written=row_count, strategy=strategy.value)
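A hedged usage sketch for overwrite_polars with batching: it assumes `writer` is an instance of this class and that SqlServerOverwriteOptions accepts the attributes referenced above as constructor keywords (the exact signature may differ). LazyFrames are accepted and collected internally; batch_size chunks the intermediate pandas frame so very large loads are not sent in a single round trip.

import polars as pl

lf = pl.DataFrame({"plant_id": list(range(100_000)), "oee": [0.8] * 100_000}).lazy()
options = SqlServerOverwriteOptions(
    strategy=SqlServerOverwriteStrategy.DROP_CREATE,
    auto_create_schema=True,
    batch_size=10_000,   # write the staging data in 10k-row chunks
)
result = writer.overwrite_polars(lf, target_table="oee.oee_snapshot", options=options)
print(result.rows_written, result.strategy)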