odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/patterns/fact.py ADDED
@@ -0,0 +1,748 @@
+ import time
+ from datetime import datetime
+ from typing import Any, Dict, List, Optional
+
+ from odibi.context import EngineContext
+ from odibi.enums import EngineType
+ from odibi.patterns.base import Pattern
+ from odibi.utils.logging_context import get_logging_context
+
+
+ class FactPattern(Pattern):
+     """
+     Enhanced Fact Pattern: Builds fact tables with automatic SK lookups.
+
+     Features:
+     - Automatic surrogate key lookups from dimension tables
+     - Orphan handling (unknown member, reject, or quarantine)
+     - Grain validation (detect duplicates at PK level)
+     - Audit columns (load_timestamp, source_system)
+     - Deduplication support
+     - Measure calculations and renaming
+
+     Basic Params (backward compatible):
+         deduplicate (bool): If true, removes duplicates before insert.
+         keys (list): Keys for deduplication.
+
+     Enhanced Params:
+         grain (list): Columns that define uniqueness (validates no duplicates)
+         dimensions (list): Dimension lookup configurations
+             - source_column: Column in source data
+             - dimension_table: Name of dimension in context
+             - dimension_key: Natural key column in dimension
+             - surrogate_key: Surrogate key to retrieve
+             - scd2 (bool): If true, filter is_current=true
+         orphan_handling (str): "unknown" | "reject" | "quarantine"
+         quarantine (dict): Quarantine configuration (required if orphan_handling=quarantine)
+             - connection: Connection name for quarantine writes
+             - path: Path for quarantine data (or use 'table')
+             - table: Table name for quarantine (or use 'path')
+             - add_columns (dict): Metadata columns to add
+                 - _rejection_reason (bool): Add rejection reason column
+                 - _rejected_at (bool): Add rejection timestamp column
+                 - _source_dimension (bool): Add source dimension name column
+         measures (list): Measure definitions (passthrough, rename, or calculated)
+         audit (dict): Audit column configuration
+             - load_timestamp (bool)
+             - source_system (str)
+
+     Example Config:
+         pattern:
+           type: fact
+           params:
+             grain: [order_id]
+             dimensions:
+               - source_column: customer_id
+                 dimension_table: dim_customer
+                 dimension_key: customer_id
+                 surrogate_key: customer_sk
+                 scd2: true
+             orphan_handling: unknown
+             measures:
+               - quantity
+               - total_amount: "quantity * price"
+             audit:
+               load_timestamp: true
+               source_system: "pos"
+
+     Example with Quarantine:
+         pattern:
+           type: fact
+           params:
+             dimensions:
+               - source_column: customer_id
+                 dimension_table: dim_customer
+                 dimension_key: customer_id
+                 surrogate_key: customer_sk
+             orphan_handling: quarantine
+             quarantine:
+               connection: silver
+               path: fact_orders_orphans
+               add_columns:
+                 _rejection_reason: true
+                 _rejected_at: true
+                 _source_dimension: true
+     """
+
+     def validate(self) -> None:
+         ctx = get_logging_context()
+         deduplicate = self.params.get("deduplicate")
+         keys = self.params.get("keys")
+         grain = self.params.get("grain")
+         dimensions = self.params.get("dimensions", [])
+         orphan_handling = self.params.get("orphan_handling", "unknown")
+
+         ctx.debug(
+             "FactPattern validation starting",
+             pattern="FactPattern",
+             deduplicate=deduplicate,
+             keys=keys,
+             grain=grain,
+             dimensions_count=len(dimensions),
+         )
+
+         if deduplicate and not keys:
+             ctx.error(
+                 "FactPattern validation failed: 'keys' required when 'deduplicate' is True",
+                 pattern="FactPattern",
+             )
+             raise ValueError(
+                 "FactPattern: 'keys' required when 'deduplicate' is True. "
+                 "Keys define which columns uniquely identify a fact row for deduplication. "
+                 "Provide keys=['col1', 'col2'] to specify the deduplication columns."
+             )
+
+         if orphan_handling not in ("unknown", "reject", "quarantine"):
+             ctx.error(
+                 f"FactPattern validation failed: invalid orphan_handling '{orphan_handling}'",
+                 pattern="FactPattern",
+             )
+             raise ValueError(
+                 f"FactPattern: 'orphan_handling' must be 'unknown', 'reject', or 'quarantine'. "
+                 f"Got: {orphan_handling}"
+             )
+
+         if orphan_handling == "quarantine":
+             quarantine_config = self.params.get("quarantine")
+             if not quarantine_config:
+                 ctx.error(
+                     "FactPattern validation failed: 'quarantine' config required "
+                     "when orphan_handling='quarantine'",
+                     pattern="FactPattern",
+                 )
+                 raise ValueError(
+                     "FactPattern: 'quarantine' configuration is required when "
+                     "orphan_handling='quarantine'."
+                 )
+             if not quarantine_config.get("connection"):
+                 ctx.error(
+                     "FactPattern validation failed: quarantine.connection is required",
+                     pattern="FactPattern",
+                 )
+                 raise ValueError(
+                     "FactPattern: 'quarantine.connection' is required. "
+                     "The connection specifies where to write quarantined orphan records "
+                     "(e.g., a Spark session or database connection). "
+                     "Add 'connection' to your quarantine config."
+                 )
+             if not quarantine_config.get("path") and not quarantine_config.get("table"):
+                 ctx.error(
+                     "FactPattern validation failed: quarantine requires 'path' or 'table'",
+                     pattern="FactPattern",
+                 )
+                 raise ValueError(
+                     f"FactPattern: 'quarantine' requires either 'path' or 'table'. "
+                     f"Got config: {quarantine_config}. "
+                     "Add 'path' for file storage or 'table' for database storage."
+                 )
+
+         for i, dim in enumerate(dimensions):
+             required_keys = ["source_column", "dimension_table", "dimension_key", "surrogate_key"]
+             for key in required_keys:
+                 if key not in dim:
+                     ctx.error(
+                         f"FactPattern validation failed: dimension[{i}] missing '{key}'",
+                         pattern="FactPattern",
+                     )
+                     raise ValueError(
+                         f"FactPattern: dimension[{i}] missing required key '{key}'. "
+                         f"Required keys: {required_keys}. "
+                         f"Got: {dim}. "
+                         f"Ensure all required keys are provided in the dimension config."
+                     )
+
+         ctx.debug(
+             "FactPattern validation passed",
+             pattern="FactPattern",
+         )
+
+     def execute(self, context: EngineContext) -> Any:
+         ctx = get_logging_context()
+         start_time = time.time()
+
+         deduplicate = self.params.get("deduplicate")
+         keys = self.params.get("keys")
+         grain = self.params.get("grain")
+         dimensions = self.params.get("dimensions", [])
+         orphan_handling = self.params.get("orphan_handling", "unknown")
+         quarantine_config = self.params.get("quarantine", {})
+         measures = self.params.get("measures", [])
+         audit_config = self.params.get("audit", {})
+
+         ctx.debug(
+             "FactPattern starting",
+             pattern="FactPattern",
+             deduplicate=deduplicate,
+             keys=keys,
+             grain=grain,
+             dimensions_count=len(dimensions),
+             orphan_handling=orphan_handling,
+         )
+
+         df = context.df
+         source_count = self._get_row_count(df, context.engine_type)
+         ctx.debug("Fact source loaded", pattern="FactPattern", source_rows=source_count)
+
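+         # Processing order: dedup -> dimension surrogate-key lookups (with orphan
+         # handling and optional quarantine) -> measures -> grain check -> audit columns.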
+         try:
+             if deduplicate and keys:
+                 df = self._deduplicate(context, df, keys)
+                 ctx.debug(
+                     "Fact deduplication complete",
+                     pattern="FactPattern",
+                     rows_after=self._get_row_count(df, context.engine_type),
+                 )
+
+             if dimensions:
+                 df, orphan_count, quarantined_df = self._lookup_dimensions(
+                     context, df, dimensions, orphan_handling, quarantine_config
+                 )
+                 ctx.debug(
+                     "Fact dimension lookups complete",
+                     pattern="FactPattern",
+                     orphan_count=orphan_count,
+                 )
+
+                 if orphan_handling == "quarantine" and quarantined_df is not None:
+                     self._write_quarantine(context, quarantined_df, quarantine_config)
+                     ctx.info(
+                         f"Quarantined {orphan_count} orphan records",
+                         pattern="FactPattern",
+                         quarantine_path=quarantine_config.get("path")
+                         or quarantine_config.get("table"),
+                     )
+
+             if measures:
+                 df = self._apply_measures(context, df, measures)
+
+             if grain:
+                 self._validate_grain(context, df, grain)
+
+             df = self._add_audit_columns(context, df, audit_config)
+
+             result_count = self._get_row_count(df, context.engine_type)
+             elapsed_ms = (time.time() - start_time) * 1000
+
+             ctx.info(
+                 "FactPattern completed",
+                 pattern="FactPattern",
+                 elapsed_ms=round(elapsed_ms, 2),
+                 source_rows=source_count,
+                 result_rows=result_count,
+             )
+
+             return df
+
+         except Exception as e:
+             elapsed_ms = (time.time() - start_time) * 1000
+             ctx.error(
+                 f"FactPattern failed: {e}",
+                 pattern="FactPattern",
+                 error_type=type(e).__name__,
+                 elapsed_ms=round(elapsed_ms, 2),
+             )
+             raise
+
+     def _get_row_count(self, df, engine_type) -> Optional[int]:
+         try:
+             if engine_type == EngineType.SPARK:
+                 return df.count()
+             else:
+                 return len(df)
+         except Exception:
+             return None
+
+     def _deduplicate(self, context: EngineContext, df, keys: List[str]):
+         """Remove duplicates based on keys."""
+         if context.engine_type == EngineType.SPARK:
+             return df.dropDuplicates(keys)
+         else:
+             return df.drop_duplicates(subset=keys)
+
+     def _lookup_dimensions(
+         self,
+         context: EngineContext,
+         df,
+         dimensions: List[Dict],
+         orphan_handling: str,
+         quarantine_config: Dict,
+     ):
+         """
+         Perform surrogate key lookups from dimension tables.
+
+         Returns:
+             Tuple of (result_df, orphan_count, quarantined_df)
+         """
+         total_orphans = 0
+         all_quarantined = []
+
+         for dim_config in dimensions:
+             source_col = dim_config["source_column"]
+             dim_table = dim_config["dimension_table"]
+             dim_key = dim_config["dimension_key"]
+             sk_col = dim_config["surrogate_key"]
+             is_scd2 = dim_config.get("scd2", False)
+
+             dim_df = self._get_dimension_df(context, dim_table, is_scd2)
+             if dim_df is None:
+                 raise ValueError(
+                     f"FactPattern: Dimension table '{dim_table}' not found in context."
+                 )
+
+             df, orphan_count, quarantined = self._join_dimension(
+                 context,
+                 df,
+                 dim_df,
+                 source_col,
+                 dim_key,
+                 sk_col,
+                 orphan_handling,
+                 dim_table,
+                 quarantine_config,
+             )
+             total_orphans += orphan_count
+             if quarantined is not None:
+                 all_quarantined.append(quarantined)
+
+         quarantined_df = None
+         if all_quarantined:
+             quarantined_df = self._union_dataframes(context, all_quarantined)
+
+         return df, total_orphans, quarantined_df
+
+     def _union_dataframes(self, context: EngineContext, dfs: List):
+         """Union multiple DataFrames together."""
+         if not dfs:
+             return None
+         if context.engine_type == EngineType.SPARK:
+             result = dfs[0]
+             for df in dfs[1:]:
+                 result = result.unionByName(df, allowMissingColumns=True)
+             return result
+         else:
+             import pandas as pd
+
+             return pd.concat(dfs, ignore_index=True)
+
+     def _get_dimension_df(self, context: EngineContext, dim_table: str, is_scd2: bool):
+         """Get dimension DataFrame from context, optionally filtering for current records."""
+         try:
+             dim_df = context.get(dim_table)
+         except KeyError:
+             return None
+
+         if is_scd2:
+             is_current_col = "is_current"
+             if context.engine_type == EngineType.SPARK:
+                 from pyspark.sql import functions as F
+
+                 if is_current_col in dim_df.columns:
+                     dim_df = dim_df.filter(F.col(is_current_col) == True)  # noqa: E712
+             else:
+                 if is_current_col in dim_df.columns:
+                     dim_df = dim_df[dim_df[is_current_col] == True].copy()  # noqa: E712
+
+         return dim_df
+
+     def _join_dimension(
+         self,
+         context: EngineContext,
+         fact_df,
+         dim_df,
+         source_col: str,
+         dim_key: str,
+         sk_col: str,
+         orphan_handling: str,
+         dim_table: str,
+         quarantine_config: Dict,
+     ):
+         """
+         Join fact to dimension and retrieve surrogate key.
+
+         Returns:
+             Tuple of (result_df, orphan_count, quarantined_df)
+         """
+         if context.engine_type == EngineType.SPARK:
+             return self._join_dimension_spark(
+                 context,
+                 fact_df,
+                 dim_df,
+                 source_col,
+                 dim_key,
+                 sk_col,
+                 orphan_handling,
+                 dim_table,
+                 quarantine_config,
+             )
+         else:
+             return self._join_dimension_pandas(
+                 fact_df,
+                 dim_df,
+                 source_col,
+                 dim_key,
+                 sk_col,
+                 orphan_handling,
+                 dim_table,
+                 quarantine_config,
+             )
+
+     def _join_dimension_spark(
+         self,
+         context: EngineContext,
+         fact_df,
+         dim_df,
+         source_col: str,
+         dim_key: str,
+         sk_col: str,
+         orphan_handling: str,
+         dim_table: str,
+         quarantine_config: Dict,
+     ):
+         from pyspark.sql import functions as F
+
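+         # Keep only the natural key and surrogate key; alias the natural key so it
+         # cannot collide with a fact column of the same name.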
+         dim_subset = dim_df.select(
+             F.col(dim_key).alias(f"_dim_{dim_key}"),
+             F.col(sk_col).alias(sk_col),
+         )
+
+         joined = fact_df.join(
+             dim_subset,
+             fact_df[source_col] == dim_subset[f"_dim_{dim_key}"],
+             "left",
+         )
+
+         orphan_mask = F.col(sk_col).isNull()
+         orphan_count = joined.filter(orphan_mask).count()
+         quarantined_df = None
+
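+         # A null surrogate key after the left join marks an orphan fact row; handle it
+         # per the configured mode: reject the load, map to the unknown member (SK 0),
+         # or split the rows out for quarantine.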
+         if orphan_handling == "reject" and orphan_count > 0:
+             raise ValueError(
+                 f"FactPattern: {orphan_count} orphan records found for dimension "
+                 f"lookup on '{source_col}'. Orphan handling is set to 'reject'."
+             )
+
+         if orphan_handling == "unknown":
+             joined = joined.withColumn(sk_col, F.coalesce(F.col(sk_col), F.lit(0)))
+
+         if orphan_handling == "quarantine" and orphan_count > 0:
+             orphan_rows = joined.filter(orphan_mask).drop(f"_dim_{dim_key}")
+             orphan_rows = self._add_quarantine_metadata_spark(
+                 orphan_rows, dim_table, source_col, quarantine_config
+             )
+             quarantined_df = orphan_rows
+             joined = joined.filter(~orphan_mask)
+
+         result = joined.drop(f"_dim_{dim_key}")
+
+         return result, orphan_count, quarantined_df
+
+     def _join_dimension_pandas(
+         self,
+         fact_df,
+         dim_df,
+         source_col: str,
+         dim_key: str,
+         sk_col: str,
+         orphan_handling: str,
+         dim_table: str,
+         quarantine_config: Dict,
+     ):
+         import pandas as pd
+
+         dim_subset = dim_df[[dim_key, sk_col]].copy()
+         dim_subset = dim_subset.rename(columns={dim_key: f"_dim_{dim_key}"})
+
+         merged = pd.merge(
+             fact_df,
+             dim_subset,
+             left_on=source_col,
+             right_on=f"_dim_{dim_key}",
+             how="left",
+         )
+
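+         # Mirror of the Spark path: a missing surrogate key after the left merge marks an orphan row.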
+         orphan_mask = merged[sk_col].isna()
+         orphan_count = orphan_mask.sum()
+         quarantined_df = None
+
+         if orphan_handling == "reject" and orphan_count > 0:
+             raise ValueError(
+                 f"FactPattern: {orphan_count} orphan records found for dimension "
+                 f"lookup on '{source_col}'. Orphan handling is set to 'reject'."
+             )
+
+         if orphan_handling == "unknown":
+             merged[sk_col] = merged[sk_col].fillna(0).infer_objects(copy=False).astype(int)
+
+         if orphan_handling == "quarantine" and orphan_count > 0:
+             orphan_rows = merged[orphan_mask].drop(columns=[f"_dim_{dim_key}"]).copy()
+             orphan_rows = self._add_quarantine_metadata_pandas(
+                 orphan_rows, dim_table, source_col, quarantine_config
+             )
+             quarantined_df = orphan_rows
+             merged = merged[~orphan_mask].copy()
+
+         result = merged.drop(columns=[f"_dim_{dim_key}"])
+
+         return result, int(orphan_count), quarantined_df
+
+     def _apply_measures(self, context: EngineContext, df, measures: List):
+         """
+         Apply measure transformations.
+
+         Measures can be:
+         - String: passthrough column name
+         - Dict with single key-value: rename or calculate
+             - {"new_name": "old_name"} -> rename
+             - {"new_name": "expr"} -> calculate (if expr contains operators)
+         """
+         for measure in measures:
+             if isinstance(measure, str):
+                 continue
+             elif isinstance(measure, dict):
+                 for new_name, expr in measure.items():
+                     if self._is_expression(expr):
+                         df = self._add_calculated_measure(context, df, new_name, expr)
+                     else:
+                         df = self._rename_column(context, df, expr, new_name)
+
+         return df
+
+     def _is_expression(self, expr: str) -> bool:
+         """Check if string is a calculation expression."""
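+         # Heuristic: any arithmetic operator or parenthesis means "expression";
+         # otherwise the value is treated as a plain column rename.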
+         operators = ["+", "-", "*", "/", "(", ")"]
+         return any(op in expr for op in operators)
+
+     def _add_calculated_measure(self, context: EngineContext, df, name: str, expr: str):
+         """Add a calculated measure column."""
+         if context.engine_type == EngineType.SPARK:
+             from pyspark.sql import functions as F
+
+             return df.withColumn(name, F.expr(expr))
+         else:
+             df = df.copy()
+             df[name] = df.eval(expr)
+             return df
+
+     def _rename_column(self, context: EngineContext, df, old_name: str, new_name: str):
+         """Rename a column."""
+         if context.engine_type == EngineType.SPARK:
+             return df.withColumnRenamed(old_name, new_name)
+         else:
+             return df.rename(columns={old_name: new_name})
+
+     def _validate_grain(self, context: EngineContext, df, grain: List[str]):
+         """
+         Validate that no duplicate rows exist at the grain level.
+
+         Raises ValueError if duplicates are found.
+         """
+         ctx = get_logging_context()
+
+         if context.engine_type == EngineType.SPARK:
+             total_count = df.count()
+             distinct_count = df.select(grain).distinct().count()
+         else:
+             total_count = len(df)
+             distinct_count = len(df.drop_duplicates(subset=grain))
+
+         if total_count != distinct_count:
+             duplicate_count = total_count - distinct_count
+             ctx.error(
+                 f"FactPattern grain validation failed: {duplicate_count} duplicate rows",
+                 pattern="FactPattern",
+                 grain=grain,
+                 total_rows=total_count,
+                 distinct_rows=distinct_count,
+             )
+             raise ValueError(
+                 f"FactPattern: Grain validation failed. Found {duplicate_count} duplicate "
+                 f"rows at grain level {grain}. Total rows: {total_count}, "
+                 f"Distinct rows: {distinct_count}."
+             )
+
+         ctx.debug(
+             "FactPattern grain validation passed",
+             pattern="FactPattern",
+             grain=grain,
+             total_rows=total_count,
+         )
+
+     def _add_audit_columns(self, context: EngineContext, df, audit_config: Dict):
+         """Add audit columns (load_timestamp, source_system)."""
+         load_timestamp = audit_config.get("load_timestamp", False)
+         source_system = audit_config.get("source_system")
+
+         if context.engine_type == EngineType.SPARK:
+             from pyspark.sql import functions as F
+
+             if load_timestamp:
+                 df = df.withColumn("load_timestamp", F.current_timestamp())
+             if source_system:
+                 df = df.withColumn("source_system", F.lit(source_system))
+         else:
+             if load_timestamp or source_system:
+                 df = df.copy()
+             if load_timestamp:
+                 df["load_timestamp"] = datetime.now()
+             if source_system:
+                 df["source_system"] = source_system
+
+         return df
+
+     def _add_quarantine_metadata_spark(
+         self,
+         df,
+         dim_table: str,
+         source_col: str,
+         quarantine_config: Dict,
+     ):
+         """Add metadata columns to quarantined Spark DataFrame."""
+         from pyspark.sql import functions as F
+
+         add_columns = quarantine_config.get("add_columns", {})
+
+         if add_columns.get("_rejection_reason", False):
+             reason = f"Orphan record: no match in dimension '{dim_table}' on column '{source_col}'"
+             df = df.withColumn("_rejection_reason", F.lit(reason))
+
+         if add_columns.get("_rejected_at", False):
+             df = df.withColumn("_rejected_at", F.current_timestamp())
+
+         if add_columns.get("_source_dimension", False):
+             df = df.withColumn("_source_dimension", F.lit(dim_table))
+
+         return df
+
+     def _add_quarantine_metadata_pandas(
+         self,
+         df,
+         dim_table: str,
+         source_col: str,
+         quarantine_config: Dict,
+     ):
+         """Add metadata columns to quarantined Pandas DataFrame."""
+         add_columns = quarantine_config.get("add_columns", {})
+
+         if add_columns.get("_rejection_reason", False):
+             reason = f"Orphan record: no match in dimension '{dim_table}' on column '{source_col}'"
+             df["_rejection_reason"] = reason
+
+         if add_columns.get("_rejected_at", False):
+             df["_rejected_at"] = datetime.now()
+
+         if add_columns.get("_source_dimension", False):
+             df["_source_dimension"] = dim_table
+
+         return df
+
+     def _write_quarantine(
+         self,
+         context: EngineContext,
+         quarantined_df,
+         quarantine_config: Dict,
+     ):
+         """Write quarantined records to the configured destination."""
+         ctx = get_logging_context()
+         connection = quarantine_config.get("connection")
+         path = quarantine_config.get("path")
+         table = quarantine_config.get("table")
+
+         if context.engine_type == EngineType.SPARK:
+             self._write_quarantine_spark(context, quarantined_df, connection, path, table)
+         else:
+             self._write_quarantine_pandas(context, quarantined_df, connection, path, table)
+
+         ctx.debug(
+             "Quarantine data written",
+             pattern="FactPattern",
+             connection=connection,
+             destination=path or table,
+         )
+
+     def _write_quarantine_spark(
+         self,
+         context: EngineContext,
+         df,
+         connection: str,
+         path: Optional[str],
+         table: Optional[str],
+     ):
+         """Write quarantine data using Spark."""
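+         # Append as a Delta table when 'table' is configured; otherwise resolve the path
+         # through the named connection (falling back to the raw path) and append there.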
+         if table:
+             full_table = f"{connection}.{table}" if connection else table
+             df.write.format("delta").mode("append").saveAsTable(full_table)
+         elif path:
+             full_path = path
+             if hasattr(context, "engine") and context.engine:
+                 if connection in getattr(context.engine, "connections", {}):
+                     try:
+                         full_path = context.engine.connections[connection].get_path(path)
+                     except Exception:
+                         pass
+             df.write.format("delta").mode("append").save(full_path)
+
+     def _write_quarantine_pandas(
+         self,
+         context: EngineContext,
+         df,
+         connection: str,
+         path: Optional[str],
+         table: Optional[str],
+     ):
+         """Write quarantine data using Pandas."""
+         import os
+
+         destination = path or table
+         full_path = destination
+
+         if hasattr(context, "engine") and context.engine:
+             if connection in getattr(context.engine, "connections", {}):
+                 try:
+                     full_path = context.engine.connections[connection].get_path(destination)
+                 except Exception:
+                     pass
+
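+         # Append semantics by extension: CSV appends rows in place; JSON and Parquet
+         # (the default) are read back, concatenated, and rewritten.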
+         path_lower = str(full_path).lower()
+
+         if path_lower.endswith(".csv"):
+             if os.path.exists(full_path):
+                 df.to_csv(full_path, mode="a", header=False, index=False)
+             else:
+                 df.to_csv(full_path, index=False)
+         elif path_lower.endswith(".json"):
+             if os.path.exists(full_path):
+                 import pandas as pd
+
+                 existing = pd.read_json(full_path)
+                 combined = pd.concat([existing, df], ignore_index=True)
+                 combined.to_json(full_path, orient="records")
+             else:
+                 df.to_json(full_path, orient="records")
+         else:
+             if os.path.exists(full_path):
+                 import pandas as pd
+
+                 existing = pd.read_parquet(full_path)
+                 combined = pd.concat([existing, df], ignore_index=True)
+                 combined.to_parquet(full_path, index=False)
+             else:
+                 df.to_parquet(full_path, index=False)
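
The heart of the file is the dimension lookup: left-join each fact row to the dimension on its natural key, pull in the surrogate key, then handle orphans by defaulting to the unknown member (surrogate key 0), rejecting the load, or quarantining the rows. Below is a minimal, self-contained pandas sketch of that lookup in "unknown" mode; it mirrors what _join_dimension_pandas does but does not use the odibi API, and the column names are illustrative only.

import pandas as pd

# Fact rows keyed by the natural customer_id; customer 99 has no dimension match (an orphan).
fact = pd.DataFrame({"order_id": [1, 2, 3], "customer_id": [10, 20, 99], "quantity": [5, 2, 7]})
dim_customer = pd.DataFrame({"customer_id": [10, 20], "customer_sk": [101, 102]})

# Left-join on the natural key to pull in the surrogate key, renaming the
# dimension key so it cannot collide with the fact column of the same name.
merged = fact.merge(
    dim_customer.rename(columns={"customer_id": "_dim_customer_id"}),
    left_on="customer_id",
    right_on="_dim_customer_id",
    how="left",
).drop(columns=["_dim_customer_id"])

# "unknown" orphan handling: rows with no dimension match get the unknown-member key 0.
orphans = merged["customer_sk"].isna()
merged["customer_sk"] = merged["customer_sk"].fillna(0).astype(int)
print(f"{int(orphans.sum())} orphan row(s)")  # 1 orphan row(s)
print(merged[["order_id", "customer_sk", "quantity"]])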