odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/patterns/fact.py
ADDED
@@ -0,0 +1,748 @@

import time
from datetime import datetime
from typing import Any, Dict, List, Optional

from odibi.context import EngineContext
from odibi.enums import EngineType
from odibi.patterns.base import Pattern
from odibi.utils.logging_context import get_logging_context


class FactPattern(Pattern):
    """
    Enhanced Fact Pattern: Builds fact tables with automatic SK lookups.

    Features:
    - Automatic surrogate key lookups from dimension tables
    - Orphan handling (unknown member, reject, or quarantine)
    - Grain validation (detect duplicates at PK level)
    - Audit columns (load_timestamp, source_system)
    - Deduplication support
    - Measure calculations and renaming

    Basic Params (backward compatible):
        deduplicate (bool): If true, removes duplicates before insert.
        keys (list): Keys for deduplication.

    Enhanced Params:
        grain (list): Columns that define uniqueness (validates no duplicates)
        dimensions (list): Dimension lookup configurations
            - source_column: Column in source data
            - dimension_table: Name of dimension in context
            - dimension_key: Natural key column in dimension
            - surrogate_key: Surrogate key to retrieve
            - scd2 (bool): If true, filter is_current=true
        orphan_handling (str): "unknown" | "reject" | "quarantine"
        quarantine (dict): Quarantine configuration (required if orphan_handling=quarantine)
            - connection: Connection name for quarantine writes
            - path: Path for quarantine data (or use 'table')
            - table: Table name for quarantine (or use 'path')
            - add_columns (dict): Metadata columns to add
                - _rejection_reason (bool): Add rejection reason column
                - _rejected_at (bool): Add rejection timestamp column
                - _source_dimension (bool): Add source dimension name column
        measures (list): Measure definitions (passthrough, rename, or calculated)
        audit (dict): Audit column configuration
            - load_timestamp (bool)
            - source_system (str)

    Example Config:
        pattern:
          type: fact
          params:
            grain: [order_id]
            dimensions:
              - source_column: customer_id
                dimension_table: dim_customer
                dimension_key: customer_id
                surrogate_key: customer_sk
                scd2: true
            orphan_handling: unknown
            measures:
              - quantity
              - total_amount: "quantity * price"
            audit:
              load_timestamp: true
              source_system: "pos"

    Example with Quarantine:
        pattern:
          type: fact
          params:
            dimensions:
              - source_column: customer_id
                dimension_table: dim_customer
                dimension_key: customer_id
                surrogate_key: customer_sk
            orphan_handling: quarantine
            quarantine:
              connection: silver
              path: fact_orders_orphans
              add_columns:
                _rejection_reason: true
                _rejected_at: true
                _source_dimension: true
    """

    def validate(self) -> None:
        ctx = get_logging_context()
        deduplicate = self.params.get("deduplicate")
        keys = self.params.get("keys")
        grain = self.params.get("grain")
        dimensions = self.params.get("dimensions", [])
        orphan_handling = self.params.get("orphan_handling", "unknown")

        ctx.debug(
            "FactPattern validation starting",
            pattern="FactPattern",
            deduplicate=deduplicate,
            keys=keys,
            grain=grain,
            dimensions_count=len(dimensions),
        )

        if deduplicate and not keys:
            ctx.error(
                "FactPattern validation failed: 'keys' required when 'deduplicate' is True",
                pattern="FactPattern",
            )
            raise ValueError(
                "FactPattern: 'keys' required when 'deduplicate' is True. "
                "Keys define which columns uniquely identify a fact row for deduplication. "
                "Provide keys=['col1', 'col2'] to specify the deduplication columns."
            )

        if orphan_handling not in ("unknown", "reject", "quarantine"):
            ctx.error(
                f"FactPattern validation failed: invalid orphan_handling '{orphan_handling}'",
                pattern="FactPattern",
            )
            raise ValueError(
                f"FactPattern: 'orphan_handling' must be 'unknown', 'reject', or 'quarantine'. "
                f"Got: {orphan_handling}"
            )

        if orphan_handling == "quarantine":
            quarantine_config = self.params.get("quarantine")
            if not quarantine_config:
                ctx.error(
                    "FactPattern validation failed: 'quarantine' config required "
                    "when orphan_handling='quarantine'",
                    pattern="FactPattern",
                )
                raise ValueError(
                    "FactPattern: 'quarantine' configuration is required when "
                    "orphan_handling='quarantine'."
                )
            if not quarantine_config.get("connection"):
                ctx.error(
                    "FactPattern validation failed: quarantine.connection is required",
                    pattern="FactPattern",
                )
                raise ValueError(
                    "FactPattern: 'quarantine.connection' is required. "
                    "The connection specifies where to write quarantined orphan records "
                    "(e.g., a Spark session or database connection). "
                    "Add 'connection' to your quarantine config."
                )
            if not quarantine_config.get("path") and not quarantine_config.get("table"):
                ctx.error(
                    "FactPattern validation failed: quarantine requires 'path' or 'table'",
                    pattern="FactPattern",
                )
                raise ValueError(
                    f"FactPattern: 'quarantine' requires either 'path' or 'table'. "
                    f"Got config: {quarantine_config}. "
                    "Add 'path' for file storage or 'table' for database storage."
                )

        for i, dim in enumerate(dimensions):
            required_keys = ["source_column", "dimension_table", "dimension_key", "surrogate_key"]
            for key in required_keys:
                if key not in dim:
                    ctx.error(
                        f"FactPattern validation failed: dimension[{i}] missing '{key}'",
                        pattern="FactPattern",
                    )
                    raise ValueError(
                        f"FactPattern: dimension[{i}] missing required key '{key}'. "
                        f"Required keys: {required_keys}. "
                        f"Got: {dim}. "
                        f"Ensure all required keys are provided in the dimension config."
                    )

        ctx.debug(
            "FactPattern validation passed",
            pattern="FactPattern",
        )

    def execute(self, context: EngineContext) -> Any:
        ctx = get_logging_context()
        start_time = time.time()

        deduplicate = self.params.get("deduplicate")
        keys = self.params.get("keys")
        grain = self.params.get("grain")
        dimensions = self.params.get("dimensions", [])
        orphan_handling = self.params.get("orphan_handling", "unknown")
        quarantine_config = self.params.get("quarantine", {})
        measures = self.params.get("measures", [])
        audit_config = self.params.get("audit", {})

        ctx.debug(
            "FactPattern starting",
            pattern="FactPattern",
            deduplicate=deduplicate,
            keys=keys,
            grain=grain,
            dimensions_count=len(dimensions),
            orphan_handling=orphan_handling,
        )

        df = context.df
        source_count = self._get_row_count(df, context.engine_type)
        ctx.debug("Fact source loaded", pattern="FactPattern", source_rows=source_count)

        try:
            if deduplicate and keys:
                df = self._deduplicate(context, df, keys)
                ctx.debug(
                    "Fact deduplication complete",
                    pattern="FactPattern",
                    rows_after=self._get_row_count(df, context.engine_type),
                )

            if dimensions:
                df, orphan_count, quarantined_df = self._lookup_dimensions(
                    context, df, dimensions, orphan_handling, quarantine_config
                )
                ctx.debug(
                    "Fact dimension lookups complete",
                    pattern="FactPattern",
                    orphan_count=orphan_count,
                )

                if orphan_handling == "quarantine" and quarantined_df is not None:
                    self._write_quarantine(context, quarantined_df, quarantine_config)
                    ctx.info(
                        f"Quarantined {orphan_count} orphan records",
                        pattern="FactPattern",
                        quarantine_path=quarantine_config.get("path")
                        or quarantine_config.get("table"),
                    )

            if measures:
                df = self._apply_measures(context, df, measures)

            if grain:
                self._validate_grain(context, df, grain)

            df = self._add_audit_columns(context, df, audit_config)

            result_count = self._get_row_count(df, context.engine_type)
            elapsed_ms = (time.time() - start_time) * 1000

            ctx.info(
                "FactPattern completed",
                pattern="FactPattern",
                elapsed_ms=round(elapsed_ms, 2),
                source_rows=source_count,
                result_rows=result_count,
            )

            return df

        except Exception as e:
            elapsed_ms = (time.time() - start_time) * 1000
            ctx.error(
                f"FactPattern failed: {e}",
                pattern="FactPattern",
                error_type=type(e).__name__,
                elapsed_ms=round(elapsed_ms, 2),
            )
            raise

    def _get_row_count(self, df, engine_type) -> Optional[int]:
        try:
            if engine_type == EngineType.SPARK:
                return df.count()
            else:
                return len(df)
        except Exception:
            return None

    def _deduplicate(self, context: EngineContext, df, keys: List[str]):
        """Remove duplicates based on keys."""
        if context.engine_type == EngineType.SPARK:
            return df.dropDuplicates(keys)
        else:
            return df.drop_duplicates(subset=keys)

    def _lookup_dimensions(
        self,
        context: EngineContext,
        df,
        dimensions: List[Dict],
        orphan_handling: str,
        quarantine_config: Dict,
    ):
        """
        Perform surrogate key lookups from dimension tables.

        Returns:
            Tuple of (result_df, orphan_count, quarantined_df)
        """
        total_orphans = 0
        all_quarantined = []

        for dim_config in dimensions:
            source_col = dim_config["source_column"]
            dim_table = dim_config["dimension_table"]
            dim_key = dim_config["dimension_key"]
            sk_col = dim_config["surrogate_key"]
            is_scd2 = dim_config.get("scd2", False)

            dim_df = self._get_dimension_df(context, dim_table, is_scd2)
            if dim_df is None:
                raise ValueError(
                    f"FactPattern: Dimension table '{dim_table}' not found in context."
                )

            df, orphan_count, quarantined = self._join_dimension(
                context,
                df,
                dim_df,
                source_col,
                dim_key,
                sk_col,
                orphan_handling,
                dim_table,
                quarantine_config,
            )
            total_orphans += orphan_count
            if quarantined is not None:
                all_quarantined.append(quarantined)

        quarantined_df = None
        if all_quarantined:
            quarantined_df = self._union_dataframes(context, all_quarantined)

        return df, total_orphans, quarantined_df

    def _union_dataframes(self, context: EngineContext, dfs: List):
        """Union multiple DataFrames together."""
        if not dfs:
            return None
        if context.engine_type == EngineType.SPARK:
            result = dfs[0]
            for df in dfs[1:]:
                result = result.unionByName(df, allowMissingColumns=True)
            return result
        else:
            import pandas as pd

            return pd.concat(dfs, ignore_index=True)

    def _get_dimension_df(self, context: EngineContext, dim_table: str, is_scd2: bool):
        """Get dimension DataFrame from context, optionally filtering for current records."""
        try:
            dim_df = context.get(dim_table)
        except KeyError:
            return None

        if is_scd2:
            is_current_col = "is_current"
            if context.engine_type == EngineType.SPARK:
                from pyspark.sql import functions as F

                if is_current_col in dim_df.columns:
                    dim_df = dim_df.filter(F.col(is_current_col) == True)  # noqa: E712
            else:
                if is_current_col in dim_df.columns:
                    dim_df = dim_df[dim_df[is_current_col] == True].copy()  # noqa: E712

        return dim_df

    def _join_dimension(
        self,
        context: EngineContext,
        fact_df,
        dim_df,
        source_col: str,
        dim_key: str,
        sk_col: str,
        orphan_handling: str,
        dim_table: str,
        quarantine_config: Dict,
    ):
        """
        Join fact to dimension and retrieve surrogate key.

        Returns:
            Tuple of (result_df, orphan_count, quarantined_df)
        """
        if context.engine_type == EngineType.SPARK:
            return self._join_dimension_spark(
                context,
                fact_df,
                dim_df,
                source_col,
                dim_key,
                sk_col,
                orphan_handling,
                dim_table,
                quarantine_config,
            )
        else:
            return self._join_dimension_pandas(
                fact_df,
                dim_df,
                source_col,
                dim_key,
                sk_col,
                orphan_handling,
                dim_table,
                quarantine_config,
            )

    def _join_dimension_spark(
        self,
        context: EngineContext,
        fact_df,
        dim_df,
        source_col: str,
        dim_key: str,
        sk_col: str,
        orphan_handling: str,
        dim_table: str,
        quarantine_config: Dict,
    ):
        from pyspark.sql import functions as F

        dim_subset = dim_df.select(
            F.col(dim_key).alias(f"_dim_{dim_key}"),
            F.col(sk_col).alias(sk_col),
        )

        joined = fact_df.join(
            dim_subset,
            fact_df[source_col] == dim_subset[f"_dim_{dim_key}"],
            "left",
        )

        orphan_mask = F.col(sk_col).isNull()
        orphan_count = joined.filter(orphan_mask).count()
        quarantined_df = None

        if orphan_handling == "reject" and orphan_count > 0:
            raise ValueError(
                f"FactPattern: {orphan_count} orphan records found for dimension "
                f"lookup on '{source_col}'. Orphan handling is set to 'reject'."
            )

        if orphan_handling == "unknown":
            joined = joined.withColumn(sk_col, F.coalesce(F.col(sk_col), F.lit(0)))

        if orphan_handling == "quarantine" and orphan_count > 0:
            orphan_rows = joined.filter(orphan_mask).drop(f"_dim_{dim_key}")
            orphan_rows = self._add_quarantine_metadata_spark(
                orphan_rows, dim_table, source_col, quarantine_config
            )
            quarantined_df = orphan_rows
            joined = joined.filter(~orphan_mask)

        result = joined.drop(f"_dim_{dim_key}")

        return result, orphan_count, quarantined_df

    def _join_dimension_pandas(
        self,
        fact_df,
        dim_df,
        source_col: str,
        dim_key: str,
        sk_col: str,
        orphan_handling: str,
        dim_table: str,
        quarantine_config: Dict,
    ):
        import pandas as pd

        dim_subset = dim_df[[dim_key, sk_col]].copy()
        dim_subset = dim_subset.rename(columns={dim_key: f"_dim_{dim_key}"})

        merged = pd.merge(
            fact_df,
            dim_subset,
            left_on=source_col,
            right_on=f"_dim_{dim_key}",
            how="left",
        )

        orphan_mask = merged[sk_col].isna()
        orphan_count = orphan_mask.sum()
        quarantined_df = None

        if orphan_handling == "reject" and orphan_count > 0:
            raise ValueError(
                f"FactPattern: {orphan_count} orphan records found for dimension "
                f"lookup on '{source_col}'. Orphan handling is set to 'reject'."
            )

        if orphan_handling == "unknown":
            merged[sk_col] = merged[sk_col].fillna(0).infer_objects(copy=False).astype(int)

        if orphan_handling == "quarantine" and orphan_count > 0:
            orphan_rows = merged[orphan_mask].drop(columns=[f"_dim_{dim_key}"]).copy()
            orphan_rows = self._add_quarantine_metadata_pandas(
                orphan_rows, dim_table, source_col, quarantine_config
            )
            quarantined_df = orphan_rows
            merged = merged[~orphan_mask].copy()

        result = merged.drop(columns=[f"_dim_{dim_key}"])

        return result, int(orphan_count), quarantined_df

    def _apply_measures(self, context: EngineContext, df, measures: List):
        """
        Apply measure transformations.

        Measures can be:
        - String: passthrough column name
        - Dict with single key-value: rename or calculate
          - {"new_name": "old_name"} -> rename
          - {"new_name": "expr"} -> calculate (if expr contains operators)
        """
        for measure in measures:
            if isinstance(measure, str):
                continue
            elif isinstance(measure, dict):
                for new_name, expr in measure.items():
                    if self._is_expression(expr):
                        df = self._add_calculated_measure(context, df, new_name, expr)
                    else:
                        df = self._rename_column(context, df, expr, new_name)

        return df

    def _is_expression(self, expr: str) -> bool:
        """Check if string is a calculation expression."""
        operators = ["+", "-", "*", "/", "(", ")"]
        return any(op in expr for op in operators)

    def _add_calculated_measure(self, context: EngineContext, df, name: str, expr: str):
        """Add a calculated measure column."""
        if context.engine_type == EngineType.SPARK:
            from pyspark.sql import functions as F

            return df.withColumn(name, F.expr(expr))
        else:
            df = df.copy()
            df[name] = df.eval(expr)
            return df

    def _rename_column(self, context: EngineContext, df, old_name: str, new_name: str):
        """Rename a column."""
        if context.engine_type == EngineType.SPARK:
            return df.withColumnRenamed(old_name, new_name)
        else:
            return df.rename(columns={old_name: new_name})

    def _validate_grain(self, context: EngineContext, df, grain: List[str]):
        """
        Validate that no duplicate rows exist at the grain level.

        Raises ValueError if duplicates are found.
        """
        ctx = get_logging_context()

        if context.engine_type == EngineType.SPARK:
            total_count = df.count()
            distinct_count = df.select(grain).distinct().count()
        else:
            total_count = len(df)
            distinct_count = len(df.drop_duplicates(subset=grain))

        if total_count != distinct_count:
            duplicate_count = total_count - distinct_count
            ctx.error(
                f"FactPattern grain validation failed: {duplicate_count} duplicate rows",
                pattern="FactPattern",
                grain=grain,
                total_rows=total_count,
                distinct_rows=distinct_count,
            )
            raise ValueError(
                f"FactPattern: Grain validation failed. Found {duplicate_count} duplicate "
                f"rows at grain level {grain}. Total rows: {total_count}, "
                f"Distinct rows: {distinct_count}."
            )

        ctx.debug(
            "FactPattern grain validation passed",
            pattern="FactPattern",
            grain=grain,
            total_rows=total_count,
        )

    def _add_audit_columns(self, context: EngineContext, df, audit_config: Dict):
        """Add audit columns (load_timestamp, source_system)."""
        load_timestamp = audit_config.get("load_timestamp", False)
        source_system = audit_config.get("source_system")

        if context.engine_type == EngineType.SPARK:
            from pyspark.sql import functions as F

            if load_timestamp:
                df = df.withColumn("load_timestamp", F.current_timestamp())
            if source_system:
                df = df.withColumn("source_system", F.lit(source_system))
        else:
            if load_timestamp or source_system:
                df = df.copy()
            if load_timestamp:
                df["load_timestamp"] = datetime.now()
            if source_system:
                df["source_system"] = source_system

        return df

    def _add_quarantine_metadata_spark(
        self,
        df,
        dim_table: str,
        source_col: str,
        quarantine_config: Dict,
    ):
        """Add metadata columns to quarantined Spark DataFrame."""
        from pyspark.sql import functions as F

        add_columns = quarantine_config.get("add_columns", {})

        if add_columns.get("_rejection_reason", False):
            reason = f"Orphan record: no match in dimension '{dim_table}' on column '{source_col}'"
            df = df.withColumn("_rejection_reason", F.lit(reason))

        if add_columns.get("_rejected_at", False):
            df = df.withColumn("_rejected_at", F.current_timestamp())

        if add_columns.get("_source_dimension", False):
            df = df.withColumn("_source_dimension", F.lit(dim_table))

        return df

    def _add_quarantine_metadata_pandas(
        self,
        df,
        dim_table: str,
        source_col: str,
        quarantine_config: Dict,
    ):
        """Add metadata columns to quarantined Pandas DataFrame."""
        add_columns = quarantine_config.get("add_columns", {})

        if add_columns.get("_rejection_reason", False):
            reason = f"Orphan record: no match in dimension '{dim_table}' on column '{source_col}'"
            df["_rejection_reason"] = reason

        if add_columns.get("_rejected_at", False):
            df["_rejected_at"] = datetime.now()

        if add_columns.get("_source_dimension", False):
            df["_source_dimension"] = dim_table

        return df

    def _write_quarantine(
        self,
        context: EngineContext,
        quarantined_df,
        quarantine_config: Dict,
    ):
        """Write quarantined records to the configured destination."""
        ctx = get_logging_context()
        connection = quarantine_config.get("connection")
        path = quarantine_config.get("path")
        table = quarantine_config.get("table")

        if context.engine_type == EngineType.SPARK:
            self._write_quarantine_spark(context, quarantined_df, connection, path, table)
        else:
            self._write_quarantine_pandas(context, quarantined_df, connection, path, table)

        ctx.debug(
            "Quarantine data written",
            pattern="FactPattern",
            connection=connection,
            destination=path or table,
        )

    def _write_quarantine_spark(
        self,
        context: EngineContext,
        df,
        connection: str,
        path: Optional[str],
        table: Optional[str],
    ):
        """Write quarantine data using Spark."""
        if table:
            full_table = f"{connection}.{table}" if connection else table
            df.write.format("delta").mode("append").saveAsTable(full_table)
        elif path:
            full_path = path
            if hasattr(context, "engine") and context.engine:
                if connection in getattr(context.engine, "connections", {}):
                    try:
                        full_path = context.engine.connections[connection].get_path(path)
                    except Exception:
                        pass
            df.write.format("delta").mode("append").save(full_path)

    def _write_quarantine_pandas(
        self,
        context: EngineContext,
        df,
        connection: str,
        path: Optional[str],
        table: Optional[str],
    ):
        """Write quarantine data using Pandas."""
        import os

        destination = path or table
        full_path = destination

        if hasattr(context, "engine") and context.engine:
            if connection in getattr(context.engine, "connections", {}):
                try:
                    full_path = context.engine.connections[connection].get_path(destination)
                except Exception:
                    pass

        path_lower = str(full_path).lower()

        if path_lower.endswith(".csv"):
            if os.path.exists(full_path):
                df.to_csv(full_path, mode="a", header=False, index=False)
            else:
                df.to_csv(full_path, index=False)
        elif path_lower.endswith(".json"):
            if os.path.exists(full_path):
                import pandas as pd

                existing = pd.read_json(full_path)
                combined = pd.concat([existing, df], ignore_index=True)
                combined.to_json(full_path, orient="records")
            else:
                df.to_json(full_path, orient="records")
        else:
            if os.path.exists(full_path):
                import pandas as pd

                existing = pd.read_parquet(full_path)
                combined = pd.concat([existing, df], ignore_index=True)
                combined.to_parquet(full_path, index=False)
            else:
                df.to_parquet(full_path, index=False)