odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/transformers/scd.py
@@ -0,0 +1,579 @@
+ import os
+ import time
+ from typing import Any, List, Optional
+
+ from pydantic import BaseModel, Field, model_validator
+
+ from odibi.context import EngineContext
+ from odibi.enums import EngineType
+ from odibi.utils.logging_context import get_logging_context
+
+
+ class SCD2Params(BaseModel):
+     """
+     Parameters for SCD Type 2 (Slowly Changing Dimensions) transformer.
+
+     ### 🕰️ The "Time Machine" Pattern
+
+     **Business Problem:**
+     "I need to know what the customer's address was *last month*, not just where they live now."
+
+     **The Solution:**
+     SCD Type 2 tracks the full history of changes. Each record has an "effective window" (start/end dates) and a flag indicating if it is the current version.
+
+     **Recipe 1: Using table name**
+     ```yaml
+     transformer: "scd2"
+     params:
+       target: "silver.dim_customers" # Registered table name
+       keys: ["customer_id"]
+       track_cols: ["address", "tier"]
+       effective_time_col: "txn_date"
+     ```
+
+     **Recipe 2: Using connection + path (ADLS)**
+     ```yaml
+     transformer: "scd2"
+     params:
+       connection: adls_prod # Connection name
+       path: OEE/silver/dim_customers # Relative path
+       keys: ["customer_id"]
+       track_cols: ["address", "tier"]
+       effective_time_col: "txn_date"
+     ```
+
+     **How it works:**
+     1. **Match**: Finds existing records using `keys`.
+     2. **Compare**: Checks `track_cols` to see if data changed.
+     3. **Close**: If changed, updates the old record's `end_time_col` to the new `effective_time_col`.
+     4. **Insert**: Adds a new record with `effective_time_col` as start and open-ended end date.
+
+     **Note:** SCD2 returns a DataFrame containing the full history. You must use a `write:` block
+     to persist the result (typically with `mode: overwrite` to the same location as `target`).
+     """
+
+     target: Optional[str] = Field(
+         None,
+         description="Target table name or full path (use this OR connection+path)",
+     )
+     connection: Optional[str] = Field(
+         None,
+         description="Connection name to resolve path (use with 'path' param)",
+     )
+     path: Optional[str] = Field(
+         None,
+         description="Relative path within connection (e.g., 'OEE/silver/dim_customers')",
+     )
+     keys: List[str] = Field(..., description="Natural keys to identify unique entities")
+     track_cols: List[str] = Field(..., description="Columns to monitor for changes")
+     effective_time_col: str = Field(
+         ...,
+         description="Source column indicating when the change occurred.",
+     )
+     end_time_col: str = Field(default="valid_to", description="Name of the end timestamp column")
+     current_flag_col: str = Field(
+         default="is_current", description="Name of the current record flag column"
+     )
+     delete_col: Optional[str] = Field(
+         default=None, description="Column indicating soft deletion (boolean)"
+     )
+
+     @model_validator(mode="after")
+     def check_target_or_connection(self):
+         """Ensure either target or connection+path is provided."""
+         if not self.target and not (self.connection and self.path):
+             raise ValueError("SCD2: provide either 'target' OR both 'connection' and 'path'.")
+         if self.target and (self.connection or self.path):
+             raise ValueError("SCD2: use 'target' OR 'connection'+'path', not both.")
+         return self
+
+
+ def scd2(context: EngineContext, params: SCD2Params, current: Any = None) -> EngineContext:
+     """
+     Implements SCD Type 2 Logic.
+
+     Returns the FULL history dataset (to be written via Overwrite).
+     """
+     ctx = get_logging_context()
+     start_time = time.time()
+
+     # Resolve target path from connection if provided
+     target = params.target
+
+     if params.connection and params.path:
+         # Resolve path via connection
+         connection = None
+         if hasattr(context, "engine") and hasattr(context.engine, "connections"):
+             connections = context.engine.connections
+             if connections and params.connection in connections:
+                 connection = connections[params.connection]
+
+         if connection is None:
+             raise ValueError(
+                 f"SCD2: connection '{params.connection}' not found. "
+                 "Ensure the connection is defined in your project config."
+             )
+
+         if hasattr(connection, "get_path"):
+             target = connection.get_path(params.path)
+             ctx.debug(
+                 "Resolved SCD2 target path via connection",
+                 connection=params.connection,
+                 relative_path=params.path,
+                 resolved_path=target,
+             )
+         else:
+             raise ValueError(
+                 f"SCD2: connection '{params.connection}' (type: {type(connection).__name__}) "
+                 f"does not support path resolution. Expected a connection with 'get_path' method. "
+                 f"Connection type must be 'local', 'adls', or similar file-based connection."
+             )
+
+     ctx.debug(
+         "SCD2 starting",
+         target=target,
+         keys=params.keys,
+         track_cols=params.track_cols,
+     )
+
+     source_df = context.df if current is None else current
+
+     rows_before = None
+     try:
+         rows_before = source_df.shape[0] if hasattr(source_df, "shape") else None
+         if rows_before is None and hasattr(source_df, "count"):
+             rows_before = source_df.count()
+     except Exception as e:
+         ctx.debug(f"Could not get row count: {type(e).__name__}")
+
+     ctx.debug(
+         "SCD2 source loaded",
+         source_rows=rows_before,
+     )
+
+     # Create a modified params with resolved target for internal functions
+     resolved_params = params.model_copy(update={"target": target})
+
+     if context.engine_type == EngineType.SPARK:
+         result = _scd2_spark(context, source_df, resolved_params)
+     elif context.engine_type == EngineType.PANDAS:
+         result = _scd2_pandas(context, source_df, resolved_params)
+     else:
+         ctx.error("SCD2 failed: unsupported engine", engine_type=str(context.engine_type))
+         raise ValueError(
+             f"SCD2 transformer does not support engine type '{context.engine_type}'. "
+             f"Supported engines: SPARK, PANDAS. "
+             f"Check your engine configuration or use a different transformer."
+         )
+
+     rows_after = None
+     try:
+         rows_after = result.df.shape[0] if hasattr(result.df, "shape") else None
+         if rows_after is None and hasattr(result.df, "count"):
+             rows_after = result.df.count()
+     except Exception as e:
+         ctx.debug(f"Could not get row count: {type(e).__name__}")
+
+     elapsed_ms = (time.time() - start_time) * 1000
+     ctx.debug(
+         "SCD2 completed",
+         target=target,
+         source_rows=rows_before,
+         result_rows=rows_after,
+         elapsed_ms=round(elapsed_ms, 2),
+     )
+
+     return result
+
+
+ def _scd2_spark(context: EngineContext, source_df, params: SCD2Params) -> EngineContext:
+     from pyspark.sql import functions as F
+
+     spark = context.spark
+
+     # 1. Check if target exists
+     target_df = None
+     try:
+         # Try reading as table first
+         target_df = spark.table(params.target)
+     except Exception:
+         try:
+             # Try reading as Delta path
+             target_df = spark.read.format("delta").load(params.target)
+         except Exception:
+             # Target doesn't exist yet - First Run
+             pass
+
+     # Define Columns
+     eff_col = params.effective_time_col
+     end_col = params.end_time_col
+     flag_col = params.current_flag_col
+
+     # Validate effective_time_col exists in source
+     source_cols = source_df.columns
+     if eff_col not in source_cols:
+         raise ValueError(
+             f"SCD2: effective_time_col '{eff_col}' not found in source DataFrame. "
+             f"Available columns: {source_cols}"
+         )
+
+     # Prepare Source: Add SCD metadata columns
+     # New records start as Current
+     new_records = source_df.withColumn(end_col, F.lit(None).cast("timestamp")).withColumn(
+         flag_col, F.lit(True)
+     )
+
+     if target_df is None:
+         # First Run: Return Source prepared
+         # Drop effective_time_col as it's only used for SCD logic, not stored in target
+         if eff_col in new_records.columns:
+             new_records = new_records.drop(eff_col)
+         return context.with_df(new_records)
+
+     # 2. Logic: Compare Source vs Target (Current Records Only)
+     # We only compare against currently open records in target
+     # Handle optional filtering if flag col doesn't exist in target yet (migration?)
+     if flag_col in target_df.columns:
+         current_target = target_df.filter(F.col(flag_col) == F.lit(True))
+     else:
+         current_target = target_df
+
+     # Rename target cols to avoid collision in join
+     t_prefix = "__target_"
+     renamed_target = current_target
+     for c in current_target.columns:
+         renamed_target = renamed_target.withColumnRenamed(c, f"{t_prefix}{c}")
+
+     # Preserve effective_time_col with a unique name before join to avoid resolution issues
+     # This ensures we can always reference it regardless of target schema
+     # Use source_df[col] syntax to bind column reference directly to this DataFrame
+     # (F.col() can get confused during lazy evaluation with complex join plans)
+     eff_col_preserved = "__src_eff_time"
+     source_with_eff = source_df.withColumn(eff_col_preserved, source_df[eff_col])
+
+     # Alias source_df to ensure column references are unambiguous after join
+     # Use backticks to handle column names with spaces or special characters
+     source_aliased = source_with_eff.alias("__source")
+     join_cond = [F.col(f"`__source`.`{k}`") == F.col(f"`{t_prefix}{k}`") for k in params.keys]
+
+     joined = source_aliased.join(renamed_target, join_cond, "left")
+
+     # Determine Status: Changed if track columns differ
+     # Use explicit __source alias for source columns to avoid ambiguity
+     # Use backticks to handle column names with spaces or special characters
+     change_conds = []
+     for col in params.track_cols:
+         s_col = F.col(f"`__source`.`{col}`")
+         t_col = F.col(f"`{t_prefix}{col}`")
+         # Null-safe equality check: NOT (source <=> target)
+         # Use ~ operator instead of F.not_() which doesn't exist in PySpark
+         change_conds.append(~s_col.eqNullSafe(t_col))
+
+     if change_conds:
+         from functools import reduce
+
+         is_changed = reduce(lambda a, b: a | b, change_conds)
+     else:
+         is_changed = F.lit(False)
+
+     # A) Rows to Insert (New Keys OR Changed Keys)
+     # Filter: TargetKey IS NULL OR is_changed
+     # Select source columns using the __source alias with backticks for special chars
+     rows_to_insert = joined.filter(
+         F.col(f"`{t_prefix}{params.keys[0]}`").isNull() | is_changed
+     ).select([F.col(f"`__source`.`{c}`").alias(c) for c in source_df.columns])
+
+     # Add metadata to inserts (Start=eff_col, End=Null, Current=True)
+     rows_to_insert = rows_to_insert.withColumn(end_col, F.lit(None).cast("timestamp")).withColumn(
+         flag_col, F.lit(True)
+     )
+
+     # Drop the effective_time_col (txn_date) from inserts since it's not part of target schema
+     # Target schema = source columns (minus eff_col) + end_col + flag_col
+     if eff_col in rows_to_insert.columns:
+         rows_to_insert = rows_to_insert.drop(eff_col)
+
+     # B) Close Old Records
+     # We need to update target_df.
+     # Strategy:
+     # 1. Identify keys that CHANGED (from joined result)
+     # Also carry over the NEW effective date from source to use as END date
+     # Use backticks to handle column names with spaces or special characters
+     changed_keys_with_date = joined.filter(is_changed).select(
+         *[F.col(f"`__source`.`{k}`").alias(k) for k in params.keys],
+         F.col(f"`__source`.`{eff_col_preserved}`").alias("__new_end_date"),
+     )
+
+     # 2. Join Target with Changed Keys to apply updates
+     # We rejoin target_df with changed_keys_with_date
+     # Update logic: If match found AND is_current, set end_date = __new_end_date, flag = False
+
+     target_updated = target_df.alias("tgt").join(
+         changed_keys_with_date.alias("chg"), on=params.keys, how="left"
+     )
+
+     # Apply conditional logic
+     # If chg.__new_end_date IS NOT NULL AND tgt.is_current == True:
+     #     end_col = chg.__new_end_date
+     #     flag_col = False
+     # Else:
+     #     Keep original
+
+     # Use backticks for column references to handle special characters
+     final_target = target_updated.select(
+         *[
+             (
+                 F.when(
+                     (F.col("`__new_end_date`").isNotNull())
+                     & (F.col(f"`tgt`.`{flag_col}`") == F.lit(True)),
+                     F.col("`__new_end_date`"),
+                 )
+                 .otherwise(F.col(f"`tgt`.`{end_col}`"))
+                 .alias(end_col)
+                 if c == end_col
+                 else (
+                     F.when(
+                         (F.col("`__new_end_date`").isNotNull())
+                         & (F.col(f"`tgt`.`{flag_col}`") == F.lit(True)),
+                         F.lit(False),
+                     )
+                     .otherwise(F.col(f"`tgt`.`{c}`"))
+                     .alias(c)
+                     if c == flag_col
+                     else F.col(f"`tgt`.`{c}`")
+                 )
+             )
+             for c in target_df.columns
+         ]
+     )
+
+     # 3. Union: Updated History + New Inserts
+     # Drop effective_time_col from final_target if it exists (legacy data migration)
+     # This ensures schema consistency with rows_to_insert which also drops eff_col
+     if eff_col in final_target.columns:
+         final_target = final_target.drop(eff_col)
+
+     # UnionByName handles column order differences
+     final_df = final_target.unionByName(rows_to_insert)
+
+     return context.with_df(final_df)
+
+
+ def _scd2_pandas(context: EngineContext, source_df, params: SCD2Params) -> EngineContext:
+     import logging
+
+     import pandas as pd
+
+     logger = logging.getLogger(__name__)
+
+     # Try using DuckDB
+     try:
+         import duckdb
+
+         HAS_DUCKDB = True
+     except ImportError:
+         HAS_DUCKDB = False
+
+     # 1. Load Target
+     path = params.target
+
+     # Resolve path if context has engine (EngineContext)
+     if hasattr(context, "engine") and context.engine:
+         # Try to resolve 'connection.path'
+         if "." in path:
+             parts = path.split(".", 1)
+             conn_name = parts[0]
+             rel_path = parts[1]
+             if conn_name in context.engine.connections:
+                 try:
+                     path = context.engine.connections[conn_name].get_path(rel_path)
+                 except Exception as e:
+                     get_logging_context().debug(
+                         f"Could not resolve connection path: {type(e).__name__}"
+                     )
+
+     # Define Cols
+     keys = params.keys
+     eff_col = params.effective_time_col
+     end_col = params.end_time_col
+     flag_col = params.current_flag_col
+     track = params.track_cols
+
+     # --- DUCKDB IMPLEMENTATION ---
+     if HAS_DUCKDB and str(path).endswith(".parquet") and os.path.exists(path):
+         try:
+             con = duckdb.connect(database=":memory:")
+             con.register("source_df", source_df)
+
+             # Helper to build condition string
+             # DuckDB supports IS DISTINCT FROM
+             change_cond_parts = []
+             for col in track:
+                 change_cond_parts.append(f"s.{col} IS DISTINCT FROM t.{col}")
+             change_cond = " OR ".join(change_cond_parts)
+
+             join_cond = " AND ".join([f"s.{k} = t.{k}" for k in keys])
+
+             src_cols = [c for c in source_df.columns if c not in [end_col, flag_col]]
+             cols_select = ", ".join([f"s.{c}" for c in src_cols])
+
+             sql_new_inserts = f"""
+                 SELECT {cols_select}, NULL::TIMESTAMP as {end_col}, True as {flag_col}
+                 FROM source_df s
+                 LEFT JOIN (SELECT * FROM read_parquet('{path}') WHERE {flag_col} = True) t
+                 ON {join_cond}
+                 WHERE t.{keys[0]} IS NULL
+             """
+
+             sql_changed_inserts = f"""
+                 SELECT {cols_select}, NULL::TIMESTAMP as {end_col}, True as {flag_col}
+                 FROM source_df s
+                 JOIN (SELECT * FROM read_parquet('{path}') WHERE {flag_col} = True) t
+                 ON {join_cond}
+                 WHERE ({change_cond})
+             """
+
+             sql_closed_records = f"""
+                 SELECT
+                     t.* EXCLUDE ({end_col}, {flag_col}),
+                     s.{eff_col}::TIMESTAMP as {end_col},
+                     False as {flag_col}
+                 FROM read_parquet('{path}') t
+                 JOIN source_df s ON {join_cond}
+                 WHERE t.{flag_col} = True AND ({change_cond})
+             """
+
+             sql_unchanged = f"""
+                 SELECT * FROM read_parquet('{path}') t
+                 WHERE NOT (
+                     t.{flag_col} = True AND EXISTS (
+                         SELECT 1 FROM source_df s
+                         WHERE {join_cond} AND ({change_cond})
+                     )
+                 )
+             """
+
+             final_query = f"""
+                 {sql_new_inserts}
+                 UNION ALL
+                 {sql_changed_inserts}
+                 UNION ALL
+                 {sql_closed_records}
+                 UNION ALL
+                 {sql_unchanged}
+             """
+
+             temp_path = str(path) + ".tmp.parquet"
+             con.execute(f"COPY ({final_query}) TO '{temp_path}' (FORMAT PARQUET)")
+             con.close()
+
+             if os.path.exists(temp_path):
+                 if os.path.exists(path):
+                     os.remove(path)
+                 os.rename(temp_path, path)
+
+             return context.with_df(source_df)
+
+         except Exception as e:
+             logger.warning(f"DuckDB SCD2 failed, falling back to Pandas: {e}")
+             pass
+
+     # --- PANDAS FALLBACK ---
+     target_df = pd.DataFrame()
+
+     # Try loading if exists
+     if os.path.exists(path):
+         try:
+             # Naive format detection or try/except
+             if str(path).endswith(".parquet") or os.path.isdir(path):  # Parquet often directory
+                 target_df = pd.read_parquet(path)
+             elif str(path).endswith(".csv"):
+                 target_df = pd.read_csv(path)
+         except Exception as e:
+             get_logging_context().debug(f"Could not read target file: {type(e).__name__}")
+
+     # Prepare Source
+     source_df = source_df.copy()
+     source_df[end_col] = None
+     source_df[flag_col] = True
+
+     if target_df.empty:
+         return context.with_df(source_df)
+
+     # Ensure types match for merge
+     # (Skipping complex type alignment for brevity, relying on Pandas)
+
+     # 2. Logic
+     # Identify Current Records in Target
+     if flag_col in target_df.columns:
+         # Filter for current
+         current_target = target_df[target_df[flag_col] == True].copy()  # noqa: E712
+     else:
+         current_target = target_df.copy()
+
+     # Merge Source and Current Target to detect changes
+     merged = pd.merge(
+         source_df, current_target, on=keys, how="left", suffixes=("", "_tgt"), indicator=True
+     )
+
+     # A) New Records (Left Only) -> Insert as is
+     new_inserts = merged[merged["_merge"] == "left_only"][source_df.columns].copy()
+
+     # B) Potential Updates (Both)
+     updates = merged[merged["_merge"] == "both"].copy()
+
+     # Detect Changes
+     def has_changed(row):
+         for col in track:
+             s = row.get(col)
+             t = row.get(col + "_tgt")
+             # Handle NaNs
+             if pd.isna(s) and pd.isna(t):
+                 continue
+             if s != t:
+                 return True
+         return False
+
+     updates["_changed"] = updates.apply(has_changed, axis=1)
+
+     changed_records = updates[updates["_changed"] == True].copy()  # noqa: E712
+
+     # Inserts for changed records (New Version)
+     changed_inserts = changed_records[source_df.columns].copy()
+
+     all_inserts = pd.concat([new_inserts, changed_inserts], ignore_index=True)
+
+     # C) Close Old Records
+     # We need to update rows in TARGET_DF
+     # Update: end_date = source.eff_date, current = False
+
+     final_target = target_df.copy()
+
+     if not changed_records.empty:
+         # Create a lookup for closing dates: Key -> New Effective Date
+         # We use set_index on keys to facilitate mapping
+         # Note: This assumes keys are unique in current_target (valid for SCD2)
+
+         # Prepare DataFrame of keys to close + new end date
+         keys_to_close = changed_records[keys + [eff_col]].rename(columns={eff_col: "__new_end"})
+
+         # Merge original target with closing info
+         # We use left merge to preserve all target rows
+         final_target = final_target.merge(keys_to_close, on=keys, how="left")
+
+         # Identify rows to update:
+         # 1. Match found (__new_end is not null)
+         # 2. Is currently active
+         mask = (final_target["__new_end"].notna()) & (final_target[flag_col] == True)  # noqa: E712
+
+         # Apply updates
+         final_target.loc[mask, end_col] = final_target.loc[mask, "__new_end"]
+         final_target.loc[mask, flag_col] = False
+
+         # Cleanup
+         final_target = final_target.drop(columns=["__new_end"])
+
+     # 3. Combine
+     result = pd.concat([final_target, all_inserts], ignore_index=True)
+
+     return context.with_df(result)
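
The SCD2 behaviour described in the `SCD2Params` docstring (match on `keys`, compare `track_cols`, close the old row, insert the new current row) can be illustrated without odibi itself. The sketch below is an illustration only, not odibi's API: it mirrors what the pandas fallback produces for a single changed key, using the default column names from `SCD2Params` (`valid_to`, `is_current`) and the docstring recipe's `customer_id`/`address`/`txn_date`; the sample data is made up.

```python
import pandas as pd

# Existing dimension: one current row per customer (default SCD2 columns).
target = pd.DataFrame(
    {
        "customer_id": [1],
        "address": ["12 Oak St"],
        "valid_to": [pd.NaT],
        "is_current": [True],
    }
)

# Incoming batch: customer 1 moved on 2024-03-01 (hypothetical values).
source = pd.DataFrame(
    {
        "customer_id": [1],
        "address": ["99 Elm Ave"],
        "txn_date": [pd.Timestamp("2024-03-01")],
    }
)

# Match on the key and compare the tracked column against current target rows.
merged = source.merge(
    target[target["is_current"]], on="customer_id", suffixes=("", "_tgt")
)
changed = merged[merged["address"] != merged["address_tgt"]]

# Close: the old row's valid_to becomes the new effective date, is_current -> False.
# (Simplified to a single changed key, hence the .iloc[0].)
history = target.copy()
mask = history["customer_id"].isin(changed["customer_id"]) & history["is_current"]
history.loc[mask, "valid_to"] = changed["txn_date"].iloc[0]
history.loc[mask, "is_current"] = False

# Insert: the new version becomes the open-ended current row.
inserts = changed[["customer_id", "address"]].assign(valid_to=pd.NaT, is_current=True)

# Full history: one closed row and one current row for customer 1.
print(pd.concat([history, inserts], ignore_index=True))
```

As the docstring notes, the transformer returns this full-history DataFrame; in a pipeline it would then be persisted by a `write:` block, typically with `mode: overwrite` to the same location as `target`.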