odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/transformers/delete_detection.py
@@ -0,0 +1,610 @@
"""
Delete Detection Transformer for CDC-like behavior.

Detects records that existed in previous extractions but no longer exist,
enabling CDC-like behavior for sources without native Change Data Capture.
"""

import logging
from typing import Any, Dict, Optional

from odibi.config import (
    DeleteDetectionConfig,
    DeleteDetectionMode,
    FirstRunBehavior,
    ThresholdBreachAction,
)
from odibi.context import EngineContext
from odibi.enums import EngineType
from odibi.registry import transform

logger = logging.getLogger(__name__)


class DeleteThresholdExceeded(Exception):
    """Raised when delete percentage exceeds configured threshold."""

    pass


@transform("detect_deletes", category="transformer", param_model=DeleteDetectionConfig)
def detect_deletes(
    context: EngineContext, config: DeleteDetectionConfig = None, **params
) -> EngineContext:
    """
    Detects deleted records based on configured mode.

    Returns:
        - soft_delete_col set: Adds boolean column (True = deleted)
        - soft_delete_col = None: Removes deleted rows (hard delete)
    """
    if config is None:
        config = DeleteDetectionConfig(**params)

    if config.mode == DeleteDetectionMode.NONE:
        return context

    if config.mode == DeleteDetectionMode.SNAPSHOT_DIFF:
        return _detect_deletes_snapshot_diff(context, config)

    if config.mode == DeleteDetectionMode.SQL_COMPARE:
        return _detect_deletes_sql_compare(context, config)

    raise ValueError(
        f"Unknown delete detection mode: '{config.mode}'. "
        f"Supported modes: 'none', 'snapshot_diff', 'sql_compare'. "
        f"Check your 'mode' configuration."
    )
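# Illustrative usage sketch (not part of the published module; assumes the
# DeleteDetectionConfig fields referenced below accept these values):
#
#     config = DeleteDetectionConfig(
#         mode=DeleteDetectionMode.SNAPSHOT_DIFF,
#         keys=["order_id"],              # hypothetical business key
#         soft_delete_col="_is_deleted",  # flag deletions instead of dropping rows
#     )
#     context = detect_deletes(context, config)
#
# With soft_delete_col=None, the same call removes the deleted rows (hard delete),
# as documented in the dispatcher above.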


def _detect_deletes_snapshot_diff(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """
    Compare current Delta version to previous version.
    Keys in previous but not in current = deleted.
    """
    if context.engine_type == EngineType.SPARK:
        return _snapshot_diff_spark(context, config)
    else:
        return _snapshot_diff_pandas(context, config)


def _snapshot_diff_spark(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """Spark implementation of snapshot_diff using Delta time travel."""
    from delta.tables import DeltaTable

    keys = config.keys
    spark = context.spark

    # Priority: explicit connection+path from config, then fallback to context inference
    table_path = None
    if config.connection and config.path:
        conn = _get_connection(context, config.connection)
        if conn and hasattr(conn, "get_path"):
            table_path = conn.get_path(config.path)
        else:
            logger.warning(
                f"detect_deletes: Connection '{config.connection}' not found or doesn't support get_path."
            )

    if not table_path:
        table_path = _get_target_path(context)

    if not table_path:
        logger.warning(
            "detect_deletes: Could not determine target table path. Skipping. "
            "Provide 'connection' and 'path' params, or ensure the node has a 'write' block."
        )
        return context

    if not DeltaTable.isDeltaTable(spark, table_path):
        logger.info("detect_deletes: Target is not a Delta table. Skipping snapshot_diff.")
        return context

    dt = DeltaTable.forPath(spark, table_path)
    current_version = dt.history(1).collect()[0]["version"]

    if current_version == 0:
        if config.on_first_run == FirstRunBehavior.ERROR:
            raise ValueError("detect_deletes: No previous version exists for snapshot_diff.")
        logger.info("detect_deletes: First run detected (version 0). Skipping delete detection.")
        return _ensure_delete_column(context, config)

    prev_version = current_version - 1

    # Validate keys exist in current DataFrame
    curr_columns = [c.lower() for c in context.df.columns]
    missing_curr_keys = [k for k in keys if k.lower() not in curr_columns]
    if missing_curr_keys:
        logger.warning(
            f"detect_deletes: Keys {missing_curr_keys} not found in current DataFrame. "
            f"Available columns: {context.df.columns}. Skipping delete detection."
        )
        return _ensure_delete_column(context, config)

    # Load previous version and validate schema
    prev_df = spark.read.format("delta").option("versionAsOf", prev_version).load(table_path)
    prev_columns = [c.lower() for c in prev_df.columns]
    missing_prev_keys = [k for k in keys if k.lower() not in prev_columns]
    if missing_prev_keys:
        logger.warning(
            f"detect_deletes: Keys {missing_prev_keys} not found in previous version (v{prev_version}). "
            f"Schema may have changed. Skipping delete detection."
        )
        return _ensure_delete_column(context, config)

    curr_keys = context.df.select(keys).distinct()
    prev_keys = prev_df.select(keys).distinct()

    deleted_keys = prev_keys.exceptAll(curr_keys)

    return _apply_deletes(context, deleted_keys, config, prev_df=prev_df)


def _snapshot_diff_pandas(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """Pandas implementation of snapshot_diff using deltalake library."""
    try:
        from deltalake import DeltaTable
    except ImportError:
        raise ImportError(
            "detect_deletes snapshot_diff mode requires 'deltalake' package. "
            "Install with: pip install deltalake"
        )

    keys = config.keys

    # Priority: explicit connection+path from config, then fallback to context inference
    table_path = None
    if config.connection and config.path:
        conn = _get_connection(context, config.connection)
        if conn and hasattr(conn, "get_path"):
            table_path = conn.get_path(config.path)
        else:
            logger.warning(
                f"detect_deletes: Connection '{config.connection}' not found or doesn't support get_path."
            )

    if not table_path:
        table_path = _get_target_path(context)

    if not table_path:
        logger.warning(
            "detect_deletes: Could not determine target table path. Skipping. "
            "Provide 'connection' and 'path' params, or ensure the node has a 'write' block."
        )
        return context

    try:
        dt = DeltaTable(table_path)
    except Exception as e:
        logger.info(f"detect_deletes: Target is not a Delta table ({e}). Skipping.")
        return context

    current_version = dt.version()

    if current_version == 0:
        if config.on_first_run == FirstRunBehavior.ERROR:
            raise ValueError("detect_deletes: No previous version exists for snapshot_diff.")
        logger.info("detect_deletes: First run detected (version 0). Skipping delete detection.")
        return _ensure_delete_column(context, config)

    prev_version = current_version - 1

    # Validate keys exist in current DataFrame
    curr_columns = [c.lower() for c in context.df.columns]
    missing_curr_keys = [k for k in keys if k.lower() not in curr_columns]
    if missing_curr_keys:
        logger.warning(
            f"detect_deletes: Keys {missing_curr_keys} not found in current DataFrame. "
            f"Available columns: {list(context.df.columns)}. Skipping delete detection."
        )
        return _ensure_delete_column(context, config)

    # Load previous version and validate schema
    prev_df = DeltaTable(table_path, version=prev_version).to_pandas()
    prev_columns = [c.lower() for c in prev_df.columns]
    missing_prev_keys = [k for k in keys if k.lower() not in prev_columns]
    if missing_prev_keys:
        logger.warning(
            f"detect_deletes: Keys {missing_prev_keys} not found in previous version (v{prev_version}). "
            f"Schema may have changed. Skipping delete detection."
        )
        return _ensure_delete_column(context, config)

    curr_keys = context.df[keys].drop_duplicates()
    prev_keys = prev_df[keys].drop_duplicates()

    merged = prev_keys.merge(curr_keys, on=keys, how="left", indicator=True)
    deleted_keys = merged[merged["_merge"] == "left_only"][keys].copy()

    return _apply_deletes(context, deleted_keys, config, prev_df=prev_df)


def _detect_deletes_sql_compare(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """
    Compare Silver keys against live source.
    Keys in Silver but not in source = deleted.
    """
    if context.engine_type == EngineType.SPARK:
        return _sql_compare_spark(context, config)
    else:
        return _sql_compare_pandas(context, config)


def _sql_compare_spark(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """Spark implementation of sql_compare using JDBC."""
    keys = config.keys
    spark = context.spark

    conn = _get_connection(context, config.source_connection)
    if conn is None:
        raise ValueError(
            f"detect_deletes: Connection '{config.source_connection}' not found in engine connections. "
            f"Available connections: {list(context.engine.connections.keys()) if hasattr(context, 'engine') and hasattr(context.engine, 'connections') else 'None'}. "
            f"Define the connection in your project config or check the connection name."
        )

    source_keys_query = _build_source_keys_query(config)

    jdbc_url = _get_jdbc_url(conn)
    jdbc_props = _get_jdbc_properties(conn)

    source_keys = (
        spark.read.format("jdbc")
        .option("url", jdbc_url)
        .option("query", source_keys_query)
        .options(**jdbc_props)
        .load()
    )

    silver_keys = context.df.select(keys).distinct()
    deleted_keys = silver_keys.exceptAll(source_keys)

    return _apply_deletes(context, deleted_keys, config)


def _sql_compare_pandas(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """Pandas implementation of sql_compare using SQLAlchemy."""
    import pandas as pd

    keys = config.keys

    conn = _get_connection(context, config.source_connection)
    if conn is None:
        raise ValueError(
            f"detect_deletes: Connection '{config.source_connection}' not found in engine connections. "
            f"Available connections: {list(context.engine.connections.keys()) if hasattr(context, 'engine') and hasattr(context.engine, 'connections') else 'None'}. "
            f"Define the connection in your project config or check the connection name."
        )

    source_keys_query = _build_source_keys_query(config)

    engine = _get_sqlalchemy_engine(conn)
    source_keys = pd.read_sql(source_keys_query, engine)

    silver_keys = context.df[keys].drop_duplicates()

    merged = silver_keys.merge(source_keys, on=keys, how="left", indicator=True)
    deleted_keys = merged[merged["_merge"] == "left_only"][keys].copy()

    return _apply_deletes(context, deleted_keys, config)


def _apply_deletes(
    context: EngineContext,
    deleted_keys: Any,
    config: DeleteDetectionConfig,
    prev_df: Any = None,
) -> EngineContext:
    """Apply soft or hard delete based on config."""
    deleted_count = _get_row_count(deleted_keys, context.engine_type)
    total_count = _get_row_count(context.df, context.engine_type)

    if deleted_count == 0:
        logger.info("detect_deletes: No deleted records found.")
        return _ensure_delete_column(context, config)

    delete_percent = (deleted_count / total_count * 100) if total_count > 0 else 0

    if config.max_delete_percent is not None:
        if delete_percent > config.max_delete_percent:
            if config.on_threshold_breach == ThresholdBreachAction.ERROR:
                raise DeleteThresholdExceeded(
                    f"detect_deletes: {delete_percent:.1f}% of rows flagged for deletion "
                    f"exceeds threshold of {config.max_delete_percent}%"
                )
            elif config.on_threshold_breach == ThresholdBreachAction.WARN:
                logger.warning(
                    f"detect_deletes: {delete_percent:.1f}% of rows flagged for deletion "
                    f"(threshold: {config.max_delete_percent}%)"
                )
            elif config.on_threshold_breach == ThresholdBreachAction.SKIP:
                logger.info(
                    f"detect_deletes: Delete threshold exceeded ({delete_percent:.1f}%). "
                    "Skipping delete detection."
                )
                return _ensure_delete_column(context, config)

    logger.info(
        f"detect_deletes: Found {deleted_count} deleted records "
        f"({delete_percent:.1f}% of {total_count} rows)"
    )

    if config.soft_delete_col:
        return _apply_soft_delete(context, deleted_keys, config, prev_df=prev_df)
    else:
        return _apply_hard_delete(context, deleted_keys, config)


def _apply_soft_delete(
    context: EngineContext,
    deleted_keys: Any,
    config: DeleteDetectionConfig,
    prev_df: Any = None,
) -> EngineContext:
    """
    Add soft delete flag column and optionally UNION deleted rows from target.

    For snapshot_diff mode with merge delete_condition, deleted rows must BE IN
    the source DataFrame with _is_deleted=true. This function:
    1. Flags existing source rows based on whether their keys are in deleted_keys
    2. If prev_df is provided (snapshot_diff), fetches deleted rows from target
       and adds them with _is_deleted=true
    3. Returns the result (union if prev_df provided, otherwise just flagged source)

    For sql_compare mode (no prev_df), deleted keys are already in context.df,
    so we just flag them.
    """
    keys = config.keys
    soft_delete_col = config.soft_delete_col

    if context.engine_type == EngineType.SPARK:
        from pyspark.sql.functions import col, lit, when

        if prev_df is not None:
            # snapshot_diff mode: deleted rows are NOT in source, need to union them
            # Mark existing source rows as not deleted
            source_with_flag = context.df.withColumn(soft_delete_col, lit(False))

            # Get full deleted rows from target, mark as deleted
            deleted_rows = prev_df.join(deleted_keys, on=keys, how="inner")

            # Align schema: select only columns that exist in source
            source_cols = source_with_flag.columns
            deleted_cols_to_select = []
            for col_name in source_cols:
                if col_name == soft_delete_col:
                    deleted_cols_to_select.append(lit(True).alias(soft_delete_col))
                elif col_name in deleted_rows.columns:
                    deleted_cols_to_select.append(deleted_rows[col_name])
                else:
                    deleted_cols_to_select.append(lit(None).alias(col_name))

            deleted_rows_aligned = deleted_rows.select(deleted_cols_to_select)

            # Union source rows with deleted rows
            result = source_with_flag.unionByName(deleted_rows_aligned, allowMissingColumns=True)
        else:
            # sql_compare mode: deleted rows ARE in source, just flag them
            deleted_keys_flagged = deleted_keys.withColumn("_del_flag", lit(True))

            result = context.df.join(deleted_keys_flagged, on=keys, how="left").withColumn(
                soft_delete_col,
                when(col("_del_flag").isNotNull(), True).otherwise(False),
            )
            result = result.drop("_del_flag")

    else:
        import pandas as pd

        df = context.df.copy()

        if prev_df is not None:
            # snapshot_diff mode: deleted rows are NOT in source, need to union them
            df[soft_delete_col] = False

            # Get full deleted rows from target
            deleted_rows = prev_df.merge(deleted_keys, on=keys, how="inner")
            deleted_rows[soft_delete_col] = True

            # Align columns to match source schema
            for col_name in df.columns:
                if col_name not in deleted_rows.columns:
                    deleted_rows[col_name] = None

            # Keep only columns that exist in source
            deleted_rows = deleted_rows[df.columns]

            # Union source with deleted rows
            result = pd.concat([df, deleted_rows], ignore_index=True)
        else:
            # sql_compare mode: deleted rows ARE in source, just flag them
            deleted_keys_df = deleted_keys.copy()
            deleted_keys_df["_del_flag"] = True

            df = df.merge(deleted_keys_df, on=keys, how="left")
            df[soft_delete_col] = df["_del_flag"].notna()
            df = df.drop(columns=["_del_flag"])
            result = df

    return context.with_df(result)


def _apply_hard_delete(
    context: EngineContext,
    deleted_keys: Any,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """Remove deleted rows."""
    keys = config.keys

    if context.engine_type == EngineType.SPARK:
        result = context.df.join(deleted_keys, on=keys, how="left_anti")
    else:
        df = context.df.copy()
        merged = df.merge(deleted_keys, on=keys, how="left", indicator=True)
        result = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])

    return context.with_df(result)


def _ensure_delete_column(
    context: EngineContext,
    config: DeleteDetectionConfig,
) -> EngineContext:
    """Ensure soft delete column exists with False values when no deletes found."""
    if not config.soft_delete_col:
        return context

    soft_delete_col = config.soft_delete_col

    if context.engine_type == EngineType.SPARK:
        if soft_delete_col not in context.df.columns:
            from pyspark.sql.functions import lit

            result = context.df.withColumn(soft_delete_col, lit(False))
            return context.with_df(result)
    else:
        if soft_delete_col not in context.df.columns:
            df = context.df.copy()
            df[soft_delete_col] = False
            return context.with_df(df)

    return context


def _build_source_keys_query(config: DeleteDetectionConfig) -> str:
    """Build SQL query to get source keys."""
    if config.source_query:
        return config.source_query

    keys = config.keys
    key_cols = ", ".join(keys)
    return f"SELECT DISTINCT {key_cols} FROM {config.source_table}"


def _get_row_count(df: Any, engine_type: EngineType) -> int:
    """Get row count from DataFrame."""
    if engine_type == EngineType.SPARK:
        return df.count()
    else:
        return len(df)


def _get_target_path(context: EngineContext) -> Optional[str]:
    """
    Get target table path from context.
    This is used for snapshot_diff to access Delta time travel.

    Priority:
    1. _current_write_path (from node's write block)
    2. _current_input_path (from node's inputs - for cross-pipeline references)
    3. current_table_path (legacy)
    """
    if hasattr(context, "engine") and context.engine:
        engine = context.engine
        if hasattr(engine, "_current_write_path") and engine._current_write_path:
            return engine._current_write_path
        if hasattr(engine, "_current_input_path") and engine._current_input_path:
            return engine._current_input_path
        if hasattr(engine, "current_table_path"):
            return engine.current_table_path

    if hasattr(context, "context"):
        inner_ctx = context.context
        if hasattr(inner_ctx, "_current_table_path"):
            return inner_ctx._current_table_path

    return None


def _get_connection(context: EngineContext, connection_name: str) -> Optional[Any]:
    """Get connection from context's engine."""
    if hasattr(context, "engine") and context.engine:
        if hasattr(context.engine, "connections"):
            return context.engine.connections.get(connection_name)
    return None


def _get_jdbc_url(conn: Any) -> str:
    """Extract JDBC URL from connection object."""
    if hasattr(conn, "jdbc_url"):
        return conn.jdbc_url
    if hasattr(conn, "get_jdbc_url"):
        return conn.get_jdbc_url()
    if hasattr(conn, "url"):
        return conn.url
    if hasattr(conn, "get_spark_options"):
        opts = conn.get_spark_options()
        if isinstance(opts, dict) and "url" in opts:
            return opts["url"]

    raise ValueError(
        f"Cannot determine JDBC URL from connection type '{type(conn).__name__}'. "
        f"Expected one of these attributes: 'jdbc_url', 'get_jdbc_url()', 'url', or 'get_spark_options()'. "
        f"Available attributes: {[a for a in dir(conn) if not a.startswith('_')]}. "
        f"Ensure your connection class implements JDBC URL access."
    )


def _get_jdbc_properties(conn: Any) -> Dict[str, str]:
    """Extract JDBC properties from connection object."""
    props = {}

    if hasattr(conn, "get_spark_options"):
        opts = conn.get_spark_options()
        if isinstance(opts, dict):
            if "user" in opts:
                props["user"] = opts["user"]
            if "password" in opts:
                props["password"] = opts["password"]
            if "driver" in opts:
                props["driver"] = opts["driver"]
            return props

    if hasattr(conn, "user"):
        props["user"] = conn.user
    if hasattr(conn, "password"):
        props["password"] = conn.password
    if hasattr(conn, "jdbc_driver"):
        props["driver"] = conn.jdbc_driver
    if hasattr(conn, "jdbc_properties"):
        props.update(conn.jdbc_properties)

    return props


def _get_sqlalchemy_engine(conn: Any) -> Any:
    """Get SQLAlchemy engine from connection object."""
    if hasattr(conn, "engine"):
        return conn.engine
    if hasattr(conn, "get_engine"):
        return conn.get_engine()
    if hasattr(conn, "connection_string"):
        from sqlalchemy import create_engine

        return create_engine(conn.connection_string)

    raise ValueError(
        f"Cannot create SQLAlchemy engine from connection type '{type(conn).__name__}'. "
        f"Expected one of these attributes: 'engine', 'get_engine()', or 'connection_string'. "
        f"Available attributes: {[a for a in dir(conn) if not a.startswith('_')]}. "
        f"Ensure your connection class provides SQLAlchemy engine access."
    )
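As a rough end-to-end sketch of the sql_compare path and the threshold guard above (the call pattern and all literal values are assumptions for illustration; only the imported names and config fields come from the module itself):

    from odibi.config import (
        DeleteDetectionConfig,
        DeleteDetectionMode,
        ThresholdBreachAction,
    )
    from odibi.transformers.delete_detection import DeleteThresholdExceeded, detect_deletes

    config = DeleteDetectionConfig(
        mode=DeleteDetectionMode.SQL_COMPARE,
        keys=["customer_id"],            # hypothetical key column
        source_connection="erp_sql",     # hypothetical connection name
        source_table="dbo.Customers",    # hypothetical live source table
        soft_delete_col=None,            # hard delete: drop keys missing from the source
        max_delete_percent=20,
        on_threshold_breach=ThresholdBreachAction.ERROR,
    )

    try:
        context = detect_deletes(context, config)  # context: an existing EngineContext
    except DeleteThresholdExceeded:
        # More than 20% of the current keys vanished from the source;
        # investigate before letting the deletes through.
        raise

When more than max_delete_percent of the current rows are flagged, the transformer raises DeleteThresholdExceeded (ERROR), logs a warning and proceeds (WARN), or skips delete detection for the run (SKIP).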