odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/transformers/delete_detection.py (new file)
@@ -0,0 +1,610 @@
"""
Delete Detection Transformer for CDC-like behavior.

Detects records that existed in previous extractions but no longer exist,
enabling CDC-like behavior for sources without native Change Data Capture.
"""

import logging
from typing import Any, Dict, Optional

from odibi.config import (
    DeleteDetectionConfig,
    DeleteDetectionMode,
    FirstRunBehavior,
    ThresholdBreachAction,
)
from odibi.context import EngineContext
from odibi.enums import EngineType
from odibi.registry import transform

logger = logging.getLogger(__name__)


class DeleteThresholdExceeded(Exception):
    """Raised when delete percentage exceeds configured threshold."""

    pass


@transform("detect_deletes", category="transformer", param_model=DeleteDetectionConfig)
def detect_deletes(
    context: EngineContext, config: Optional[DeleteDetectionConfig] = None, **params
) -> EngineContext:
    """
    Detect deleted records based on the configured mode.

    Returns:
        - soft_delete_col set: adds a boolean column (True = deleted)
        - soft_delete_col = None: removes deleted rows (hard delete)
    """
    if config is None:
        config = DeleteDetectionConfig(**params)

    if config.mode == DeleteDetectionMode.NONE:
        return context

    if config.mode == DeleteDetectionMode.SNAPSHOT_DIFF:
        return _detect_deletes_snapshot_diff(context, config)

    if config.mode == DeleteDetectionMode.SQL_COMPARE:
        return _detect_deletes_sql_compare(context, config)

    raise ValueError(
        f"Unknown delete detection mode: '{config.mode}'. "
        f"Supported modes: 'none', 'snapshot_diff', 'sql_compare'. "
        f"Check your 'mode' configuration."
    )
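For orientation, a minimal call sketch of the entry point above. The key, connection, and threshold values are invented, the field names are taken only from how this module reads the config, and the full set of required DeleteDetectionConfig fields is defined in odibi/config.py rather than shown here:

from odibi.config import DeleteDetectionConfig, DeleteDetectionMode, ThresholdBreachAction

# Hypothetical config: flag rows whose keys vanished from the live source table.
config = DeleteDetectionConfig(
    mode=DeleteDetectionMode.SQL_COMPARE,             # 'none' | 'snapshot_diff' | 'sql_compare'
    keys=["order_id"],                                # illustrative business key
    source_connection="erp_sql",                      # must exist in engine.connections
    source_table="dbo.Orders",                        # used to build SELECT DISTINCT <keys> FROM ...
    soft_delete_col="_is_deleted",                    # set to None for hard deletes
    max_delete_percent=20.0,                          # e.g. 250 deletes over 1000 rows = 25% -> breach
    on_threshold_breach=ThresholdBreachAction.ERROR,
)

# The registry normally invokes the transform with the pipeline's EngineContext;
# calling it directly would look like:
#     flagged_context = detect_deletes(context, config=config)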
def _detect_deletes_snapshot_diff(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """
    Compare current Delta version to previous version.
    Keys in previous but not in current = deleted.
    """
    if context.engine_type == EngineType.SPARK:
        return _snapshot_diff_spark(context, config)
    else:
        return _snapshot_diff_pandas(context, config)


def _snapshot_diff_spark(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """Spark implementation of snapshot_diff using Delta time travel."""
    from delta.tables import DeltaTable

    keys = config.keys
    spark = context.spark

    # Priority: explicit connection+path from config, then fallback to context inference
    table_path = None
    if config.connection and config.path:
        conn = _get_connection(context, config.connection)
        if conn and hasattr(conn, "get_path"):
            table_path = conn.get_path(config.path)
        else:
            logger.warning(
                f"detect_deletes: Connection '{config.connection}' not found or doesn't support get_path."
            )

    if not table_path:
        table_path = _get_target_path(context)

    if not table_path:
        logger.warning(
            "detect_deletes: Could not determine target table path. Skipping. "
            "Provide 'connection' and 'path' params, or ensure the node has a 'write' block."
        )
        return context

    if not DeltaTable.isDeltaTable(spark, table_path):
        logger.info("detect_deletes: Target is not a Delta table. Skipping snapshot_diff.")
        return context

    dt = DeltaTable.forPath(spark, table_path)
    current_version = dt.history(1).collect()[0]["version"]

    if current_version == 0:
        if config.on_first_run == FirstRunBehavior.ERROR:
            raise ValueError("detect_deletes: No previous version exists for snapshot_diff.")
        logger.info("detect_deletes: First run detected (version 0). Skipping delete detection.")
        return _ensure_delete_column(context, config)

    prev_version = current_version - 1

    # Validate keys exist in current DataFrame
    curr_columns = [c.lower() for c in context.df.columns]
    missing_curr_keys = [k for k in keys if k.lower() not in curr_columns]
    if missing_curr_keys:
        logger.warning(
            f"detect_deletes: Keys {missing_curr_keys} not found in current DataFrame. "
            f"Available columns: {context.df.columns}. Skipping delete detection."
        )
        return _ensure_delete_column(context, config)

    # Load previous version and validate schema
    prev_df = spark.read.format("delta").option("versionAsOf", prev_version).load(table_path)
    prev_columns = [c.lower() for c in prev_df.columns]
    missing_prev_keys = [k for k in keys if k.lower() not in prev_columns]
    if missing_prev_keys:
        logger.warning(
            f"detect_deletes: Keys {missing_prev_keys} not found in previous version (v{prev_version}). "
            f"Schema may have changed. Skipping delete detection."
        )
        return _ensure_delete_column(context, config)

    curr_keys = context.df.select(keys).distinct()
    prev_keys = prev_df.select(keys).distinct()

    deleted_keys = prev_keys.exceptAll(curr_keys)

    return _apply_deletes(context, deleted_keys, config, prev_df=prev_df)


def _snapshot_diff_pandas(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """Pandas implementation of snapshot_diff using the deltalake library."""
    try:
        from deltalake import DeltaTable
    except ImportError:
        raise ImportError(
            "detect_deletes snapshot_diff mode requires 'deltalake' package. "
            "Install with: pip install deltalake"
        )

    keys = config.keys

    # Priority: explicit connection+path from config, then fallback to context inference
    table_path = None
    if config.connection and config.path:
        conn = _get_connection(context, config.connection)
        if conn and hasattr(conn, "get_path"):
            table_path = conn.get_path(config.path)
        else:
            logger.warning(
                f"detect_deletes: Connection '{config.connection}' not found or doesn't support get_path."
            )

    if not table_path:
        table_path = _get_target_path(context)

    if not table_path:
        logger.warning(
            "detect_deletes: Could not determine target table path. Skipping. "
            "Provide 'connection' and 'path' params, or ensure the node has a 'write' block."
        )
        return context

    try:
        dt = DeltaTable(table_path)
    except Exception as e:
        logger.info(f"detect_deletes: Target is not a Delta table ({e}). Skipping.")
        return context

    current_version = dt.version()

    if current_version == 0:
        if config.on_first_run == FirstRunBehavior.ERROR:
            raise ValueError("detect_deletes: No previous version exists for snapshot_diff.")
        logger.info("detect_deletes: First run detected (version 0). Skipping delete detection.")
        return _ensure_delete_column(context, config)

    prev_version = current_version - 1

    # Validate keys exist in current DataFrame
    curr_columns = [c.lower() for c in context.df.columns]
    missing_curr_keys = [k for k in keys if k.lower() not in curr_columns]
    if missing_curr_keys:
        logger.warning(
            f"detect_deletes: Keys {missing_curr_keys} not found in current DataFrame. "
            f"Available columns: {list(context.df.columns)}. Skipping delete detection."
        )
        return _ensure_delete_column(context, config)

    # Load previous version and validate schema
    prev_df = DeltaTable(table_path, version=prev_version).to_pandas()
    prev_columns = [c.lower() for c in prev_df.columns]
    missing_prev_keys = [k for k in keys if k.lower() not in prev_columns]
    if missing_prev_keys:
        logger.warning(
            f"detect_deletes: Keys {missing_prev_keys} not found in previous version (v{prev_version}). "
            f"Schema may have changed. Skipping delete detection."
        )
        return _ensure_delete_column(context, config)

    curr_keys = context.df[keys].drop_duplicates()
    prev_keys = prev_df[keys].drop_duplicates()

    merged = prev_keys.merge(curr_keys, on=keys, how="left", indicator=True)
    deleted_keys = merged[merged["_merge"] == "left_only"][keys].copy()

    return _apply_deletes(context, deleted_keys, config, prev_df=prev_df)


def _detect_deletes_sql_compare(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """
    Compare Silver keys against live source.
    Keys in Silver but not in source = deleted.
    """
    if context.engine_type == EngineType.SPARK:
        return _sql_compare_spark(context, config)
    else:
        return _sql_compare_pandas(context, config)


def _sql_compare_spark(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """Spark implementation of sql_compare using JDBC."""
    keys = config.keys
    spark = context.spark

    conn = _get_connection(context, config.source_connection)
    if conn is None:
        raise ValueError(
            f"detect_deletes: Connection '{config.source_connection}' not found in engine connections. "
            f"Available connections: {list(context.engine.connections.keys()) if hasattr(context, 'engine') and hasattr(context.engine, 'connections') else 'None'}. "
            f"Define the connection in your project config or check the connection name."
        )

    source_keys_query = _build_source_keys_query(config)

    jdbc_url = _get_jdbc_url(conn)
    jdbc_props = _get_jdbc_properties(conn)

    source_keys = (
        spark.read.format("jdbc")
        .option("url", jdbc_url)
        .option("query", source_keys_query)
        .options(**jdbc_props)
        .load()
    )

    silver_keys = context.df.select(keys).distinct()
    deleted_keys = silver_keys.exceptAll(source_keys)

    return _apply_deletes(context, deleted_keys, config)


def _sql_compare_pandas(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """Pandas implementation of sql_compare using SQLAlchemy."""
    import pandas as pd

    keys = config.keys

    conn = _get_connection(context, config.source_connection)
    if conn is None:
        raise ValueError(
            f"detect_deletes: Connection '{config.source_connection}' not found in engine connections. "
            f"Available connections: {list(context.engine.connections.keys()) if hasattr(context, 'engine') and hasattr(context.engine, 'connections') else 'None'}. "
            f"Define the connection in your project config or check the connection name."
        )

    source_keys_query = _build_source_keys_query(config)

    engine = _get_sqlalchemy_engine(conn)
    source_keys = pd.read_sql(source_keys_query, engine)

    silver_keys = context.df[keys].drop_duplicates()

    merged = silver_keys.merge(source_keys, on=keys, how="left", indicator=True)
    deleted_keys = merged[merged["_merge"] == "left_only"][keys].copy()

    return _apply_deletes(context, deleted_keys, config)


def _apply_deletes(
    context: EngineContext,
    deleted_keys: Any,
    config: DeleteDetectionConfig,
    prev_df: Any = None,
) -> EngineContext:
    """Apply soft or hard delete based on config."""
    deleted_count = _get_row_count(deleted_keys, context.engine_type)
    total_count = _get_row_count(context.df, context.engine_type)

    if deleted_count == 0:
        logger.info("detect_deletes: No deleted records found.")
        return _ensure_delete_column(context, config)

    delete_percent = (deleted_count / total_count * 100) if total_count > 0 else 0

    if config.max_delete_percent is not None:
        if delete_percent > config.max_delete_percent:
            if config.on_threshold_breach == ThresholdBreachAction.ERROR:
                raise DeleteThresholdExceeded(
                    f"detect_deletes: {delete_percent:.1f}% of rows flagged for deletion "
                    f"exceeds threshold of {config.max_delete_percent}%"
                )
            elif config.on_threshold_breach == ThresholdBreachAction.WARN:
                logger.warning(
                    f"detect_deletes: {delete_percent:.1f}% of rows flagged for deletion "
                    f"(threshold: {config.max_delete_percent}%)"
                )
            elif config.on_threshold_breach == ThresholdBreachAction.SKIP:
                logger.info(
                    f"detect_deletes: Delete threshold exceeded ({delete_percent:.1f}%). "
                    "Skipping delete detection."
                )
                return _ensure_delete_column(context, config)

    logger.info(
        f"detect_deletes: Found {deleted_count} deleted records "
        f"({delete_percent:.1f}% of {total_count} rows)"
    )

    if config.soft_delete_col:
        return _apply_soft_delete(context, deleted_keys, config, prev_df=prev_df)
    else:
        return _apply_hard_delete(context, deleted_keys, config)
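To make the threshold guard above concrete, a quick worked example of the percentage check (the numbers are invented):

# Hypothetical run: 80 keys disappeared out of 1,000 rows currently in context.df.
deleted_count, total_count = 80, 1_000
delete_percent = deleted_count / total_count * 100   # 8.0

# With max_delete_percent=5 the guard trips: ERROR raises DeleteThresholdExceeded,
# WARN only logs and still applies the deletes, and SKIP returns the data unchanged
# apart from ensuring the soft-delete column exists.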
def _apply_soft_delete(
    context: EngineContext,
    deleted_keys: Any,
    config: DeleteDetectionConfig,
    prev_df: Any = None,
) -> EngineContext:
    """
    Add soft delete flag column and optionally UNION deleted rows from target.

    For snapshot_diff mode with merge delete_condition, deleted rows must BE IN
    the source DataFrame with _is_deleted=true. This function:
    1. Flags existing source rows based on whether their keys are in deleted_keys
    2. If prev_df is provided (snapshot_diff), fetches deleted rows from target
       and adds them with _is_deleted=true
    3. Returns the result (union if prev_df provided, otherwise just flagged source)

    For sql_compare mode (no prev_df), deleted keys are already in context.df,
    so we just flag them.
    """
    keys = config.keys
    soft_delete_col = config.soft_delete_col

    if context.engine_type == EngineType.SPARK:
        from pyspark.sql.functions import col, lit, when

        if prev_df is not None:
            # snapshot_diff mode: deleted rows are NOT in source, need to union them
            # Mark existing source rows as not deleted
            source_with_flag = context.df.withColumn(soft_delete_col, lit(False))

            # Get full deleted rows from target, mark as deleted
            deleted_rows = prev_df.join(deleted_keys, on=keys, how="inner")

            # Align schema: select only columns that exist in source
            source_cols = source_with_flag.columns
            deleted_cols_to_select = []
            for col_name in source_cols:
                if col_name == soft_delete_col:
                    deleted_cols_to_select.append(lit(True).alias(soft_delete_col))
                elif col_name in deleted_rows.columns:
                    deleted_cols_to_select.append(deleted_rows[col_name])
                else:
                    deleted_cols_to_select.append(lit(None).alias(col_name))

            deleted_rows_aligned = deleted_rows.select(deleted_cols_to_select)

            # Union source rows with deleted rows
            result = source_with_flag.unionByName(deleted_rows_aligned, allowMissingColumns=True)
        else:
            # sql_compare mode: deleted rows ARE in source, just flag them
            deleted_keys_flagged = deleted_keys.withColumn("_del_flag", lit(True))

            result = context.df.join(deleted_keys_flagged, on=keys, how="left").withColumn(
                soft_delete_col,
                when(col("_del_flag").isNotNull(), True).otherwise(False),
            )
            result = result.drop("_del_flag")

    else:
        import pandas as pd

        df = context.df.copy()

        if prev_df is not None:
            # snapshot_diff mode: deleted rows are NOT in source, need to union them
            df[soft_delete_col] = False

            # Get full deleted rows from target
            deleted_rows = prev_df.merge(deleted_keys, on=keys, how="inner")
            deleted_rows[soft_delete_col] = True

            # Align columns to match source schema
            for col_name in df.columns:
                if col_name not in deleted_rows.columns:
                    deleted_rows[col_name] = None

            # Keep only columns that exist in source
            deleted_rows = deleted_rows[df.columns]

            # Union source with deleted rows
            result = pd.concat([df, deleted_rows], ignore_index=True)
        else:
            # sql_compare mode: deleted rows ARE in source, just flag them
            deleted_keys_df = deleted_keys.copy()
            deleted_keys_df["_del_flag"] = True

            df = df.merge(deleted_keys_df, on=keys, how="left")
            df[soft_delete_col] = df["_del_flag"].notna()
            df = df.drop(columns=["_del_flag"])
            result = df

    return context.with_df(result)
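To see the snapshot_diff branch above with concrete rows (invented data, pandas engine, standalone pandas rather than the odibi helpers), a key that exists in the previous Delta version but not in the current extract comes back in the output flagged as deleted:

import pandas as pd

# Current extract (the incoming DataFrame) no longer contains order 3.
current = pd.DataFrame({"order_id": [1, 2], "amount": [10.0, 20.0]})
# Previous Delta version still has it.
previous = pd.DataFrame({"order_id": [1, 2, 3], "amount": [10.0, 20.0, 30.0]})

# Keys present previously but missing now -> flagged for deletion.
deleted_keys = (
    previous[["order_id"]]
    .merge(current[["order_id"]], on=["order_id"], how="left", indicator=True)
    .query("_merge == 'left_only'")[["order_id"]]
)

current["_is_deleted"] = False
resurrected = previous.merge(deleted_keys, on=["order_id"], how="inner")
resurrected["_is_deleted"] = True
result = pd.concat([current, resurrected[current.columns]], ignore_index=True)
# result holds orders 1 and 2 with _is_deleted=False and order 3 with _is_deleted=True,
# which is what a downstream merge with a delete_condition needs to see.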
def _apply_hard_delete(
    context: EngineContext,
    deleted_keys: Any,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """Remove deleted rows."""
    keys = config.keys

    if context.engine_type == EngineType.SPARK:
        result = context.df.join(deleted_keys, on=keys, how="left_anti")
    else:
        df = context.df.copy()
        merged = df.merge(deleted_keys, on=keys, how="left", indicator=True)
        result = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])

    return context.with_df(result)


def _ensure_delete_column(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """Ensure soft delete column exists with False values when no deletes found."""
    if not config.soft_delete_col:
        return context

    soft_delete_col = config.soft_delete_col

    if context.engine_type == EngineType.SPARK:
        if soft_delete_col not in context.df.columns:
            from pyspark.sql.functions import lit

            result = context.df.withColumn(soft_delete_col, lit(False))
            return context.with_df(result)
    else:
        if soft_delete_col not in context.df.columns:
            df = context.df.copy()
            df[soft_delete_col] = False
            return context.with_df(df)

    return context


def _build_source_keys_query(config: DeleteDetectionConfig) -> str:
    """Build SQL query to get source keys."""
    if config.source_query:
        return config.source_query

    keys = config.keys
    key_cols = ", ".join(keys)
    return f"SELECT DISTINCT {key_cols} FROM {config.source_table}"
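For instance, with invented key and table names, the builder above produces a single DISTINCT key query; a duck-typed stand-in is used here to avoid guessing at DeleteDetectionConfig's required fields:

from types import SimpleNamespace

# Stand-in exposing only the attributes the builder reads: source_query, keys, source_table.
cfg = SimpleNamespace(source_query=None, keys=["order_id", "line_id"], source_table="dbo.OrderLines")
print(_build_source_keys_query(cfg))
# SELECT DISTINCT order_id, line_id FROM dbo.OrderLines
# A custom source_query (e.g. one that filters out archived rows) bypasses this builder entirely.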
def _get_row_count(df: Any, engine_type: EngineType) -> int:
    """Get row count from DataFrame."""
    if engine_type == EngineType.SPARK:
        return df.count()
    else:
        return len(df)


def _get_target_path(context: EngineContext) -> Optional[str]:
    """
    Get target table path from context.
    This is used for snapshot_diff to access Delta time travel.

    Priority:
    1. _current_write_path (from node's write block)
    2. _current_input_path (from node's inputs - for cross-pipeline references)
    3. current_table_path (legacy)
    """
    if hasattr(context, "engine") and context.engine:
        engine = context.engine
        if hasattr(engine, "_current_write_path") and engine._current_write_path:
            return engine._current_write_path
        if hasattr(engine, "_current_input_path") and engine._current_input_path:
            return engine._current_input_path
        if hasattr(engine, "current_table_path"):
            return engine.current_table_path

    if hasattr(context, "context"):
        inner_ctx = context.context
        if hasattr(inner_ctx, "_current_table_path"):
            return inner_ctx._current_table_path

    return None


def _get_connection(context: EngineContext, connection_name: str) -> Optional[Any]:
    """Get connection from context's engine."""
    if hasattr(context, "engine") and context.engine:
        if hasattr(context.engine, "connections"):
            return context.engine.connections.get(connection_name)
    return None


def _get_jdbc_url(conn: Any) -> str:
    """Extract JDBC URL from connection object."""
    if hasattr(conn, "jdbc_url"):
        return conn.jdbc_url
    if hasattr(conn, "get_jdbc_url"):
        return conn.get_jdbc_url()
    if hasattr(conn, "url"):
        return conn.url
    if hasattr(conn, "get_spark_options"):
        opts = conn.get_spark_options()
        if isinstance(opts, dict) and "url" in opts:
            return opts["url"]

    raise ValueError(
        f"Cannot determine JDBC URL from connection type '{type(conn).__name__}'. "
        f"Expected one of these attributes: 'jdbc_url', 'get_jdbc_url()', 'url', or 'get_spark_options()'. "
        f"Available attributes: {[a for a in dir(conn) if not a.startswith('_')]}. "
        f"Ensure your connection class implements JDBC URL access."
    )


def _get_jdbc_properties(conn: Any) -> Dict[str, str]:
    """Extract JDBC properties from connection object."""
    props = {}

    if hasattr(conn, "get_spark_options"):
        opts = conn.get_spark_options()
        if isinstance(opts, dict):
            if "user" in opts:
                props["user"] = opts["user"]
            if "password" in opts:
                props["password"] = opts["password"]
            if "driver" in opts:
                props["driver"] = opts["driver"]
            return props

    if hasattr(conn, "user"):
        props["user"] = conn.user
    if hasattr(conn, "password"):
        props["password"] = conn.password
    if hasattr(conn, "jdbc_driver"):
        props["driver"] = conn.jdbc_driver
    if hasattr(conn, "jdbc_properties"):
        props.update(conn.jdbc_properties)

    return props


def _get_sqlalchemy_engine(conn: Any) -> Any:
    """Get SQLAlchemy engine from connection object."""
    if hasattr(conn, "engine"):
        return conn.engine
    if hasattr(conn, "get_engine"):
        return conn.get_engine()
    if hasattr(conn, "connection_string"):
        from sqlalchemy import create_engine

        return create_engine(conn.connection_string)

    raise ValueError(
        f"Cannot create SQLAlchemy engine from connection type '{type(conn).__name__}'. "
        f"Expected one of these attributes: 'engine', 'get_engine()', or 'connection_string'. "
        f"Available attributes: {[a for a in dir(conn) if not a.startswith('_')]}. "
        f"Ensure your connection class provides SQLAlchemy engine access."
    )
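The helpers above only duck-type the connection object, so anything exposing the expected attributes will work. A rough sketch of what would satisfy the pandas/SQLAlchemy path (the class name, URL, and registry dict are invented; the real connection classes live under odibi/connections/ and are normally wired up by the engine, not by hand):

from dataclasses import dataclass

@dataclass
class MinimalSqlConnection:
    """Hypothetical connection: just enough surface for _get_connection and _get_sqlalchemy_engine."""
    connection_string: str  # e.g. "mssql+pyodbc://user:pass@host/db?driver=..." or "sqlite:///demo.db"

    def get_path(self, relative: str) -> str:
        # Only consulted when 'connection' + 'path' are set for a snapshot_diff target.
        return relative

# _get_connection simply does engine.connections.get(name), so the object must be
# registered under the name used in source_connection; shown here as a plain dict.
connections = {"erp_sql": MinimalSqlConnection("sqlite:///demo.db")}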