odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/transformers/scd.py
@@ -0,0 +1,579 @@
import os
import time
from typing import Any, List, Optional

from pydantic import BaseModel, Field, model_validator

from odibi.context import EngineContext
from odibi.enums import EngineType
from odibi.utils.logging_context import get_logging_context


class SCD2Params(BaseModel):
    """
    Parameters for SCD Type 2 (Slowly Changing Dimensions) transformer.

    ### 🕰️ The "Time Machine" Pattern

    **Business Problem:**
    "I need to know what the customer's address was *last month*, not just where they live now."

    **The Solution:**
    SCD Type 2 tracks the full history of changes. Each record has an "effective window" (start/end dates) and a flag indicating if it is the current version.

    **Recipe 1: Using table name**
    ```yaml
    transformer: "scd2"
    params:
      target: "silver.dim_customers"  # Registered table name
      keys: ["customer_id"]
      track_cols: ["address", "tier"]
      effective_time_col: "txn_date"
    ```

    **Recipe 2: Using connection + path (ADLS)**
    ```yaml
    transformer: "scd2"
    params:
      connection: adls_prod           # Connection name
      path: OEE/silver/dim_customers  # Relative path
      keys: ["customer_id"]
      track_cols: ["address", "tier"]
      effective_time_col: "txn_date"
    ```

    **How it works:**
    1. **Match**: Finds existing records using `keys`.
    2. **Compare**: Checks `track_cols` to see if data changed.
    3. **Close**: If changed, updates the old record's `end_time_col` to the new `effective_time_col`.
    4. **Insert**: Adds a new record with `effective_time_col` as start and open-ended end date.

    **Note:** SCD2 returns a DataFrame containing the full history. You must use a `write:` block
    to persist the result (typically with `mode: overwrite` to the same location as `target`).
    """

    target: Optional[str] = Field(
        None,
        description="Target table name or full path (use this OR connection+path)",
    )
    connection: Optional[str] = Field(
        None,
        description="Connection name to resolve path (use with 'path' param)",
    )
    path: Optional[str] = Field(
        None,
        description="Relative path within connection (e.g., 'OEE/silver/dim_customers')",
    )
    keys: List[str] = Field(..., description="Natural keys to identify unique entities")
    track_cols: List[str] = Field(..., description="Columns to monitor for changes")
    effective_time_col: str = Field(
        ...,
        description="Source column indicating when the change occurred.",
    )
    end_time_col: str = Field(default="valid_to", description="Name of the end timestamp column")
    current_flag_col: str = Field(
        default="is_current", description="Name of the current record flag column"
    )
    delete_col: Optional[str] = Field(
        default=None, description="Column indicating soft deletion (boolean)"
    )

    @model_validator(mode="after")
    def check_target_or_connection(self):
        """Ensure either target or connection+path is provided."""
        if not self.target and not (self.connection and self.path):
            raise ValueError("SCD2: provide either 'target' OR both 'connection' and 'path'.")
        if self.target and (self.connection or self.path):
            raise ValueError("SCD2: use 'target' OR 'connection'+'path', not both.")
        return self


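# --- Editor's illustration (not part of the packaged file): a minimal worked
# example of the Match/Compare/Close/Insert steps described in the docstring,
# assuming the default valid_to / is_current column names. An address change
# effective 2024-03-01 turns this single current row:
#
#   customer_id | address  | valid_to   | is_current
#   1           | "old st" | NULL       | True
#
# into two rows of history (old version closed, new version opened):
#
#   customer_id | address  | valid_to   | is_current
#   1           | "old st" | 2024-03-01 | False
#   1           | "new st" | NULL       | True

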
def scd2(context: EngineContext, params: SCD2Params, current: Any = None) -> EngineContext:
    """
    Implements SCD Type 2 Logic.

    Returns the FULL history dataset (to be written via Overwrite).
    """
    ctx = get_logging_context()
    start_time = time.time()

    # Resolve target path from connection if provided
    target = params.target

    if params.connection and params.path:
        # Resolve path via connection
        connection = None
        if hasattr(context, "engine") and hasattr(context.engine, "connections"):
            connections = context.engine.connections
            if connections and params.connection in connections:
                connection = connections[params.connection]

        if connection is None:
            raise ValueError(
                f"SCD2: connection '{params.connection}' not found. "
                "Ensure the connection is defined in your project config."
            )

        if hasattr(connection, "get_path"):
            target = connection.get_path(params.path)
            ctx.debug(
                "Resolved SCD2 target path via connection",
                connection=params.connection,
                relative_path=params.path,
                resolved_path=target,
            )
        else:
            raise ValueError(
                f"SCD2: connection '{params.connection}' (type: {type(connection).__name__}) "
                f"does not support path resolution. Expected a connection with 'get_path' method. "
                f"Connection type must be 'local', 'adls', or similar file-based connection."
            )

    ctx.debug(
        "SCD2 starting",
        target=target,
        keys=params.keys,
        track_cols=params.track_cols,
    )

    source_df = context.df if current is None else current

    rows_before = None
    try:
        rows_before = source_df.shape[0] if hasattr(source_df, "shape") else None
        if rows_before is None and hasattr(source_df, "count"):
            rows_before = source_df.count()
    except Exception as e:
        ctx.debug(f"Could not get row count: {type(e).__name__}")

    ctx.debug(
        "SCD2 source loaded",
        source_rows=rows_before,
    )

    # Create a modified params with resolved target for internal functions
    resolved_params = params.model_copy(update={"target": target})

    if context.engine_type == EngineType.SPARK:
        result = _scd2_spark(context, source_df, resolved_params)
    elif context.engine_type == EngineType.PANDAS:
        result = _scd2_pandas(context, source_df, resolved_params)
    else:
        ctx.error("SCD2 failed: unsupported engine", engine_type=str(context.engine_type))
        raise ValueError(
            f"SCD2 transformer does not support engine type '{context.engine_type}'. "
            f"Supported engines: SPARK, PANDAS. "
            f"Check your engine configuration or use a different transformer."
        )

    rows_after = None
    try:
        rows_after = result.df.shape[0] if hasattr(result.df, "shape") else None
        if rows_after is None and hasattr(result.df, "count"):
            rows_after = result.df.count()
    except Exception as e:
        ctx.debug(f"Could not get row count: {type(e).__name__}")

    elapsed_ms = (time.time() - start_time) * 1000
    ctx.debug(
        "SCD2 completed",
        target=target,
        source_rows=rows_before,
        result_rows=rows_after,
        elapsed_ms=round(elapsed_ms, 2),
    )

    return result


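# --- Editor's sketch (not part of the packaged file): roughly how the YAML
# recipe from the SCD2Params docstring maps onto this function. The pipeline
# normally supplies the EngineContext, so `context` is assumed to exist here.
#
#   params = SCD2Params(
#       target="silver.dim_customers",
#       keys=["customer_id"],
#       track_cols=["address", "tier"],
#       effective_time_col="txn_date",
#   )
#   history_ctx = scd2(context, params)
#   # history_ctx.df is the full history; persist it via a write: block,
#   # typically with mode: overwrite back to the same target.

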
def _scd2_spark(context: EngineContext, source_df, params: SCD2Params) -> EngineContext:
    from pyspark.sql import functions as F

    spark = context.spark

    # 1. Check if target exists
    target_df = None
    try:
        # Try reading as table first
        target_df = spark.table(params.target)
    except Exception:
        try:
            # Try reading as Delta path
            target_df = spark.read.format("delta").load(params.target)
        except Exception:
            # Target doesn't exist yet - First Run
            pass

    # Define Columns
    eff_col = params.effective_time_col
    end_col = params.end_time_col
    flag_col = params.current_flag_col

    # Validate effective_time_col exists in source
    source_cols = source_df.columns
    if eff_col not in source_cols:
        raise ValueError(
            f"SCD2: effective_time_col '{eff_col}' not found in source DataFrame. "
            f"Available columns: {source_cols}"
        )

    # Prepare Source: Add SCD metadata columns
    # New records start as Current
    new_records = source_df.withColumn(end_col, F.lit(None).cast("timestamp")).withColumn(
        flag_col, F.lit(True)
    )

    if target_df is None:
        # First Run: Return Source prepared
        # Drop effective_time_col as it's only used for SCD logic, not stored in target
        if eff_col in new_records.columns:
            new_records = new_records.drop(eff_col)
        return context.with_df(new_records)

    # 2. Logic: Compare Source vs Target (Current Records Only)
    # We only compare against currently open records in target
    # Handle optional filtering if flag col doesn't exist in target yet (migration?)
    if flag_col in target_df.columns:
        current_target = target_df.filter(F.col(flag_col) == F.lit(True))
    else:
        current_target = target_df

    # Rename target cols to avoid collision in join
    t_prefix = "__target_"
    renamed_target = current_target
    for c in current_target.columns:
        renamed_target = renamed_target.withColumnRenamed(c, f"{t_prefix}{c}")

    # Preserve effective_time_col with a unique name before join to avoid resolution issues
    # This ensures we can always reference it regardless of target schema
    # Use source_df[col] syntax to bind column reference directly to this DataFrame
    # (F.col() can get confused during lazy evaluation with complex join plans)
    eff_col_preserved = "__src_eff_time"
    source_with_eff = source_df.withColumn(eff_col_preserved, source_df[eff_col])

    # Alias source_df to ensure column references are unambiguous after join
    # Use backticks to handle column names with spaces or special characters
    source_aliased = source_with_eff.alias("__source")
    join_cond = [F.col(f"`__source`.`{k}`") == F.col(f"`{t_prefix}{k}`") for k in params.keys]

    joined = source_aliased.join(renamed_target, join_cond, "left")

    # Determine Status: Changed if track columns differ
    # Use explicit __source alias for source columns to avoid ambiguity
    # Use backticks to handle column names with spaces or special characters
    change_conds = []
    for col in params.track_cols:
        s_col = F.col(f"`__source`.`{col}`")
        t_col = F.col(f"`{t_prefix}{col}`")
        # Null-safe equality check: NOT (source <=> target)
        # Use ~ operator instead of F.not_() which doesn't exist in PySpark
        change_conds.append(~s_col.eqNullSafe(t_col))

    if change_conds:
        from functools import reduce

        is_changed = reduce(lambda a, b: a | b, change_conds)
    else:
        is_changed = F.lit(False)

    # A) Rows to Insert (New Keys OR Changed Keys)
    # Filter: TargetKey IS NULL OR is_changed
    # Select source columns using the __source alias with backticks for special chars
    rows_to_insert = joined.filter(
        F.col(f"`{t_prefix}{params.keys[0]}`").isNull() | is_changed
    ).select([F.col(f"`__source`.`{c}`").alias(c) for c in source_df.columns])

    # Add metadata to inserts (Start=eff_col, End=Null, Current=True)
    rows_to_insert = rows_to_insert.withColumn(end_col, F.lit(None).cast("timestamp")).withColumn(
        flag_col, F.lit(True)
    )

    # Drop the effective_time_col (txn_date) from inserts since it's not part of target schema
    # Target schema = source columns (minus eff_col) + end_col + flag_col
    if eff_col in rows_to_insert.columns:
        rows_to_insert = rows_to_insert.drop(eff_col)

    # B) Close Old Records
    # We need to update target_df.
    # Strategy:
    # 1. Identify keys that CHANGED (from joined result)
    #    Also carry over the NEW effective date from source to use as END date
    #    Use backticks to handle column names with spaces or special characters
    changed_keys_with_date = joined.filter(is_changed).select(
        *[F.col(f"`__source`.`{k}`").alias(k) for k in params.keys],
        F.col(f"`__source`.`{eff_col_preserved}`").alias("__new_end_date"),
    )

    # 2. Join Target with Changed Keys to apply updates
    #    We rejoin target_df with changed_keys_with_date
    #    Update logic: If match found AND is_current, set end_date = __new_end_date, flag = False

    target_updated = target_df.alias("tgt").join(
        changed_keys_with_date.alias("chg"), on=params.keys, how="left"
    )

    # Apply conditional logic
    # If chg.__new_end_date IS NOT NULL AND tgt.is_current == True:
    #     end_col = chg.__new_end_date
    #     flag_col = False
    # Else:
    #     Keep original

    # Use backticks for column references to handle special characters
    final_target = target_updated.select(
        *[
            (
                F.when(
                    (F.col("`__new_end_date`").isNotNull())
                    & (F.col(f"`tgt`.`{flag_col}`") == F.lit(True)),
                    F.col("`__new_end_date`"),
                )
                .otherwise(F.col(f"`tgt`.`{end_col}`"))
                .alias(end_col)
                if c == end_col
                else (
                    F.when(
                        (F.col("`__new_end_date`").isNotNull())
                        & (F.col(f"`tgt`.`{flag_col}`") == F.lit(True)),
                        F.lit(False),
                    )
                    .otherwise(F.col(f"`tgt`.`{c}`"))
                    .alias(c)
                    if c == flag_col
                    else F.col(f"`tgt`.`{c}`")
                )
            )
            for c in target_df.columns
        ]
    )

    # 3. Union: Updated History + New Inserts
    # Drop effective_time_col from final_target if it exists (legacy data migration)
    # This ensures schema consistency with rows_to_insert which also drops eff_col
    if eff_col in final_target.columns:
        final_target = final_target.drop(eff_col)

    # UnionByName handles column order differences
    final_df = final_target.unionByName(rows_to_insert)

    return context.with_df(final_df)


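# --- Editor's note (not part of the packaged file): the Spark path above flags
# a row as changed when ANY tracked column differs under null-safe equality,
# i.e. ~source.eqNullSafe(target), and the per-column conditions are OR-ed
# together with functools.reduce:
#
#   source | target | eqNullSafe | changed (~)
#   "a"    | "a"    | True       | False
#   "a"    | "b"    | False      | True
#   NULL   | NULL   | True       | False   (both missing -> not a change)
#   "a"    | NULL   | False      | True

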
def _scd2_pandas(context: EngineContext, source_df, params: SCD2Params) -> EngineContext:
    import logging

    import pandas as pd

    logger = logging.getLogger(__name__)

    # Try using DuckDB
    try:
        import duckdb

        HAS_DUCKDB = True
    except ImportError:
        HAS_DUCKDB = False

    # 1. Load Target
    path = params.target

    # Resolve path if context has engine (EngineContext)
    if hasattr(context, "engine") and context.engine:
        # Try to resolve 'connection.path'
        if "." in path:
            parts = path.split(".", 1)
            conn_name = parts[0]
            rel_path = parts[1]
            if conn_name in context.engine.connections:
                try:
                    path = context.engine.connections[conn_name].get_path(rel_path)
                except Exception as e:
                    get_logging_context().debug(
                        f"Could not resolve connection path: {type(e).__name__}"
                    )

    # Define Cols
    keys = params.keys
    eff_col = params.effective_time_col
    end_col = params.end_time_col
    flag_col = params.current_flag_col
    track = params.track_cols

    # --- DUCKDB IMPLEMENTATION ---
    if HAS_DUCKDB and str(path).endswith(".parquet") and os.path.exists(path):
        try:
            con = duckdb.connect(database=":memory:")
            con.register("source_df", source_df)

            # Helper to build condition string
            # DuckDB supports IS DISTINCT FROM
            change_cond_parts = []
            for col in track:
                change_cond_parts.append(f"s.{col} IS DISTINCT FROM t.{col}")
            change_cond = " OR ".join(change_cond_parts)

            join_cond = " AND ".join([f"s.{k} = t.{k}" for k in keys])

            src_cols = [c for c in source_df.columns if c not in [end_col, flag_col]]
            cols_select = ", ".join([f"s.{c}" for c in src_cols])

            sql_new_inserts = f"""
                SELECT {cols_select}, NULL::TIMESTAMP as {end_col}, True as {flag_col}
                FROM source_df s
                LEFT JOIN (SELECT * FROM read_parquet('{path}') WHERE {flag_col} = True) t
                ON {join_cond}
                WHERE t.{keys[0]} IS NULL
            """

            sql_changed_inserts = f"""
                SELECT {cols_select}, NULL::TIMESTAMP as {end_col}, True as {flag_col}
                FROM source_df s
                JOIN (SELECT * FROM read_parquet('{path}') WHERE {flag_col} = True) t
                ON {join_cond}
                WHERE ({change_cond})
            """

            sql_closed_records = f"""
                SELECT
                    t.* EXCLUDE ({end_col}, {flag_col}),
                    s.{eff_col}::TIMESTAMP as {end_col},
                    False as {flag_col}
                FROM read_parquet('{path}') t
                JOIN source_df s ON {join_cond}
                WHERE t.{flag_col} = True AND ({change_cond})
            """

            sql_unchanged = f"""
                SELECT * FROM read_parquet('{path}') t
                WHERE NOT (
                    t.{flag_col} = True AND EXISTS (
                        SELECT 1 FROM source_df s
                        WHERE {join_cond} AND ({change_cond})
                    )
                )
            """

            final_query = f"""
                {sql_new_inserts}
                UNION ALL
                {sql_changed_inserts}
                UNION ALL
                {sql_closed_records}
                UNION ALL
                {sql_unchanged}
            """

            temp_path = str(path) + ".tmp.parquet"
            con.execute(f"COPY ({final_query}) TO '{temp_path}' (FORMAT PARQUET)")
            con.close()

            if os.path.exists(temp_path):
                if os.path.exists(path):
                    os.remove(path)
                os.rename(temp_path, path)

            return context.with_df(source_df)

        except Exception as e:
            logger.warning(f"DuckDB SCD2 failed, falling back to Pandas: {e}")
            pass

    # --- PANDAS FALLBACK ---
    target_df = pd.DataFrame()

    # Try loading if exists
    if os.path.exists(path):
        try:
            # Naive format detection or try/except
            if str(path).endswith(".parquet") or os.path.isdir(path):  # Parquet often directory
                target_df = pd.read_parquet(path)
            elif str(path).endswith(".csv"):
                target_df = pd.read_csv(path)
        except Exception as e:
            get_logging_context().debug(f"Could not read target file: {type(e).__name__}")

    # Prepare Source
    source_df = source_df.copy()
    source_df[end_col] = None
    source_df[flag_col] = True

    if target_df.empty:
        return context.with_df(source_df)

    # Ensure types match for merge
    # (Skipping complex type alignment for brevity, relying on Pandas)

    # 2. Logic
    # Identify Current Records in Target
    if flag_col in target_df.columns:
        # Filter for current
        current_target = target_df[target_df[flag_col] == True].copy()  # noqa: E712
    else:
        current_target = target_df.copy()

    # Merge Source and Current Target to detect changes
    merged = pd.merge(
        source_df, current_target, on=keys, how="left", suffixes=("", "_tgt"), indicator=True
    )

    # A) New Records (Left Only) -> Insert as is
    new_inserts = merged[merged["_merge"] == "left_only"][source_df.columns].copy()

    # B) Potential Updates (Both)
    updates = merged[merged["_merge"] == "both"].copy()

    # Detect Changes
    def has_changed(row):
        for col in track:
            s = row.get(col)
            t = row.get(col + "_tgt")
            # Handle NaNs
            if pd.isna(s) and pd.isna(t):
                continue
            if s != t:
                return True
        return False

    updates["_changed"] = updates.apply(has_changed, axis=1)

    changed_records = updates[updates["_changed"] == True].copy()  # noqa: E712

    # Inserts for changed records (New Version)
    changed_inserts = changed_records[source_df.columns].copy()

    all_inserts = pd.concat([new_inserts, changed_inserts], ignore_index=True)

    # C) Close Old Records
    # We need to update rows in TARGET_DF
    # Update: end_date = source.eff_date, current = False

    final_target = target_df.copy()

    if not changed_records.empty:
        # Create a lookup for closing dates: Key -> New Effective Date
        # We use set_index on keys to facilitate mapping
        # Note: This assumes keys are unique in current_target (valid for SCD2)

        # Prepare DataFrame of keys to close + new end date
        keys_to_close = changed_records[keys + [eff_col]].rename(columns={eff_col: "__new_end"})

        # Merge original target with closing info
        # We use left merge to preserve all target rows
        final_target = final_target.merge(keys_to_close, on=keys, how="left")

        # Identify rows to update:
        # 1. Match found (__new_end is not null)
        # 2. Is currently active
        mask = (final_target["__new_end"].notna()) & (final_target[flag_col] == True)  # noqa: E712

        # Apply updates
        final_target.loc[mask, end_col] = final_target.loc[mask, "__new_end"]
        final_target.loc[mask, flag_col] = False

        # Cleanup
        final_target = final_target.drop(columns=["__new_end"])

    # 3. Combine
    result = pd.concat([final_target, all_inserts], ignore_index=True)

    return context.with_df(result)
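
# --- Editor's sketch (not part of the packaged file): a self-contained demo of
# the change detection used in the pandas fallback above -- left-merge source
# onto the current target rows with indicator=True, then compare the tracked
# columns (treating NaN vs NaN as "no change").
#
#   import pandas as pd
#
#   source = pd.DataFrame({"customer_id": [1, 2], "address": ["new st", "elm st"]})
#   current = pd.DataFrame({"customer_id": [1], "address": ["old st"]})
#   merged = pd.merge(source, current, on=["customer_id"], how="left",
#                     suffixes=("", "_tgt"), indicator=True)
#   new_inserts = merged[merged["_merge"] == "left_only"]   # customer 2 -> insert
#   changed = merged[(merged["_merge"] == "both")
#                    & (merged["address"] != merged["address_tgt"])]  # customer 1 -> close + insert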