odibi-2.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/patterns/aggregation.py
ADDED
@@ -0,0 +1,599 @@
import time
from datetime import datetime
from typing import Any, Dict, List, Optional

from odibi.context import EngineContext
from odibi.enums import EngineType
from odibi.patterns.base import Pattern
from odibi.utils.logging_context import get_logging_context


class AggregationPattern(Pattern):
    """
    Aggregation Pattern: Declarative aggregation with time-grain rollups.

    Features:
    - Declare grain (GROUP BY columns)
    - Declare measures with aggregation functions
    - Incremental aggregation (merge new data with existing)
    - Time rollups (generate multiple grain levels)
    - Audit columns

    Configuration Options (via params dict):
    - **grain** (list): Columns to GROUP BY (defines uniqueness)
    - **measures** (list): Measure definitions with name and aggregation expr
        - name: Output column name
        - expr: SQL aggregation expression (e.g., "SUM(amount)")
    - **incremental** (dict): Incremental merge configuration (optional)
        - timestamp_column: Column to identify new data
        - merge_strategy: "replace", "sum", "min", or "max"
    - **having** (str): Optional HAVING clause for filtering aggregates
    - **audit** (dict): Audit column configuration

    Example Config:
        pattern:
          type: aggregation
          params:
            grain: [date_sk, product_sk]
            measures:
              - name: total_revenue
                expr: "SUM(total_amount)"
              - name: order_count
                expr: "COUNT(*)"
              - name: avg_order_value
                expr: "AVG(total_amount)"
            having: "COUNT(*) > 0"
            audit:
              load_timestamp: true
    """

    def validate(self) -> None:
        ctx = get_logging_context()
        grain = self.params.get("grain")
        measures = self.params.get("measures", [])

        ctx.debug(
            "AggregationPattern validation starting",
            pattern="AggregationPattern",
            grain=grain,
            measures_count=len(measures),
        )

        if not grain:
            ctx.error(
                "AggregationPattern validation failed: 'grain' is required",
                pattern="AggregationPattern",
            )
            raise ValueError(
                "AggregationPattern: 'grain' parameter is required. "
                "Grain defines the grouping columns for aggregation (e.g., ['date', 'region']). "
                "Provide a list of column names to group by."
            )

        if not measures:
            ctx.error(
                "AggregationPattern validation failed: 'measures' is required",
                pattern="AggregationPattern",
            )
            raise ValueError(
                "AggregationPattern: 'measures' parameter is required. "
                "Measures define the aggregations to compute (e.g., [{'name': 'total_sales', 'expr': 'sum(amount)'}]). "
                "Provide a list of dicts, each with 'name' and 'expr' keys."
            )

        for i, measure in enumerate(measures):
            if not isinstance(measure, dict):
                ctx.error(
                    f"AggregationPattern validation failed: measure[{i}] must be a dict",
                    pattern="AggregationPattern",
                )
                raise ValueError(
                    f"AggregationPattern: measure[{i}] must be a dict with 'name' and 'expr'. "
                    f"Got {type(measure).__name__}: {measure!r}. "
                    "Example: {'name': 'total_sales', 'expr': 'sum(amount)'}"
                )
            if "name" not in measure:
                ctx.error(
                    f"AggregationPattern validation failed: measure[{i}] missing 'name'",
                    pattern="AggregationPattern",
                )
                raise ValueError(
                    f"AggregationPattern: measure[{i}] missing 'name'. "
                    f"Got: {measure!r}. Add a 'name' key for the output column name."
                )
            if "expr" not in measure:
                ctx.error(
                    f"AggregationPattern validation failed: measure[{i}] missing 'expr'",
                    pattern="AggregationPattern",
                )
                raise ValueError(
                    f"AggregationPattern: measure[{i}] missing 'expr'. "
                    f"Got: {measure!r}. Add an 'expr' key with the aggregation expression (e.g., 'sum(amount)')."
                )

        incremental = self.params.get("incremental")
        if incremental:
            if "timestamp_column" not in incremental:
                ctx.error(
                    "AggregationPattern validation failed: incremental missing 'timestamp_column'",
                    pattern="AggregationPattern",
                )
                raise ValueError(
                    "AggregationPattern: incremental config requires 'timestamp_column'. "
                    f"Got: {incremental!r}. "
                    "Add 'timestamp_column' to specify which column tracks record timestamps."
                )
            merge_strategy = incremental.get("merge_strategy", "replace")
            if merge_strategy not in ("replace", "sum", "min", "max"):
                ctx.error(
                    f"AggregationPattern validation failed: invalid merge_strategy '{merge_strategy}'",
                    pattern="AggregationPattern",
                )
                raise ValueError(
                    f"AggregationPattern: 'merge_strategy' must be 'replace', 'sum', 'min', or 'max'. "
                    f"Got: {merge_strategy}"
                )

        ctx.debug(
            "AggregationPattern validation passed",
            pattern="AggregationPattern",
        )

    def execute(self, context: EngineContext) -> Any:
        ctx = get_logging_context()
        start_time = time.time()

        grain = self.params.get("grain")
        measures = self.params.get("measures", [])
        having = self.params.get("having")
        incremental = self.params.get("incremental")
        audit_config = self.params.get("audit", {})
        target = self.params.get("target")

        ctx.debug(
            "AggregationPattern starting",
            pattern="AggregationPattern",
            grain=grain,
            measures_count=len(measures),
            incremental=incremental is not None,
        )

        df = context.df
        source_count = self._get_row_count(df, context.engine_type)
        ctx.debug(
            "Aggregation source loaded",
            pattern="AggregationPattern",
            source_rows=source_count,
        )

        try:
            result_df = self._aggregate(context, df, grain, measures, having)

            if incremental and target:
                result_df = self._apply_incremental(
                    context, result_df, grain, measures, incremental, target
                )

            result_df = self._add_audit_columns(context, result_df, audit_config)

            result_count = self._get_row_count(result_df, context.engine_type)
            elapsed_ms = (time.time() - start_time) * 1000

            ctx.info(
                "AggregationPattern completed",
                pattern="AggregationPattern",
                elapsed_ms=round(elapsed_ms, 2),
                source_rows=source_count,
                result_rows=result_count,
                grain=grain,
            )

            return result_df

        except Exception as e:
            elapsed_ms = (time.time() - start_time) * 1000
            ctx.error(
                f"AggregationPattern failed: {e}",
                pattern="AggregationPattern",
                error_type=type(e).__name__,
                elapsed_ms=round(elapsed_ms, 2),
            )
            raise

    def _get_row_count(self, df, engine_type) -> Optional[int]:
        try:
            if engine_type == EngineType.SPARK:
                return df.count()
            else:
                return len(df)
        except Exception:
            return None

    def _aggregate(
        self,
        context: EngineContext,
        df,
        grain: List[str],
        measures: List[Dict],
        having: Optional[str],
    ):
        """Perform the aggregation using SQL."""
        if context.engine_type == EngineType.SPARK:
            return self._aggregate_spark(context, df, grain, measures, having)
        else:
            return self._aggregate_pandas(context, df, grain, measures, having)

    def _aggregate_spark(
        self,
        context: EngineContext,
        df,
        grain: List[str],
        measures: List[Dict],
        having: Optional[str],
    ):
        """Aggregate using Spark SQL."""
        from pyspark.sql import functions as F

        grain_cols = [F.col(c) for c in grain]

        agg_exprs = []
        for measure in measures:
            name = measure["name"]
            expr = measure["expr"]
            agg_exprs.append(F.expr(expr).alias(name))

        result = df.groupBy(*grain_cols).agg(*agg_exprs)

        if having:
            result = result.filter(F.expr(having))

        return result

    def _aggregate_pandas(
        self,
        context: EngineContext,
        df,
        grain: List[str],
        measures: List[Dict],
        having: Optional[str],
    ):
        """Aggregate using DuckDB SQL via context.sql()."""
        grain_str = ", ".join(grain)

        measure_exprs = []
        for measure in measures:
            name = measure["name"]
            expr = measure["expr"]
            measure_exprs.append(f"{expr} AS {name}")
        measures_str = ", ".join(measure_exprs)

        sql = f"SELECT {grain_str}, {measures_str} FROM df GROUP BY {grain_str}"

        if having:
            sql += f" HAVING {having}"

        temp_context = context.with_df(df)
        result_context = temp_context.sql(sql)
        return result_context.df

    def _apply_incremental(
        self,
        context: EngineContext,
        new_agg_df,
        grain: List[str],
        measures: List[Dict],
        incremental: Dict,
        target: str,
    ):
        """Apply incremental merge with existing aggregations."""
        merge_strategy = incremental.get("merge_strategy", "replace")

        existing_df = self._load_existing_target(context, target)
        if existing_df is None:
            return new_agg_df

        if merge_strategy == "replace":
            return self._merge_replace(context, existing_df, new_agg_df, grain)
        elif merge_strategy == "sum":
            return self._merge_sum(context, existing_df, new_agg_df, grain, measures)
        elif merge_strategy == "min":
            return self._merge_min(context, existing_df, new_agg_df, grain, measures)
        else:  # max
            return self._merge_max(context, existing_df, new_agg_df, grain, measures)

    def _load_existing_target(self, context: EngineContext, target: str):
        """Load existing target table if it exists."""
        if context.engine_type == EngineType.SPARK:
            return self._load_existing_spark(context, target)
        else:
            return self._load_existing_pandas(context, target)

    def _load_existing_spark(self, context: EngineContext, target: str):
        spark = context.spark
        try:
            return spark.table(target)
        except Exception:
            try:
                return spark.read.format("delta").load(target)
            except Exception:
                return None

    def _load_existing_pandas(self, context: EngineContext, target: str):
        import os

        import pandas as pd

        path = target
        if hasattr(context, "engine") and context.engine:
            if "." in path:
                parts = path.split(".", 1)
                conn_name = parts[0]
                rel_path = parts[1]
                if conn_name in context.engine.connections:
                    try:
                        path = context.engine.connections[conn_name].get_path(rel_path)
                    except Exception:
                        pass

        if not os.path.exists(path):
            return None

        try:
            if str(path).endswith(".parquet") or os.path.isdir(path):
                return pd.read_parquet(path)
            elif str(path).endswith(".csv"):
                return pd.read_csv(path)
        except Exception:
            return None

        return None

    def _merge_replace(self, context: EngineContext, existing_df, new_df, grain: List[str]):
        """
        Replace strategy: New aggregates overwrite existing for matching grain keys.
        """
        if context.engine_type == EngineType.SPARK:
            new_keys = new_df.select(grain).distinct()

            unchanged = existing_df.join(new_keys, on=grain, how="left_anti")

            return unchanged.unionByName(new_df, allowMissingColumns=True)
        else:
            import pandas as pd

            new_keys = new_df[grain].drop_duplicates()

            merged = pd.merge(existing_df, new_keys, on=grain, how="left", indicator=True)
            unchanged = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])

            return pd.concat([unchanged, new_df], ignore_index=True)

    def _merge_sum(
        self,
        context: EngineContext,
        existing_df,
        new_df,
        grain: List[str],
        measures: List[Dict],
    ):
        """
        Sum strategy: Add new measure values to existing for matching grain keys.
        """
        measure_names = [m["name"] for m in measures]

        if context.engine_type == EngineType.SPARK:
            from pyspark.sql import functions as F

            joined = existing_df.alias("e").join(new_df.alias("n"), on=grain, how="full_outer")

            select_cols = []
            for col in grain:
                select_cols.append(F.coalesce(F.col(f"e.{col}"), F.col(f"n.{col}")).alias(col))

            for name in measure_names:
                select_cols.append(
                    (
                        F.coalesce(F.col(f"e.{name}"), F.lit(0))
                        + F.coalesce(F.col(f"n.{name}"), F.lit(0))
                    ).alias(name)
                )

            other_cols = [
                c for c in existing_df.columns if c not in grain and c not in measure_names
            ]
            for col in other_cols:
                select_cols.append(F.coalesce(F.col(f"e.{col}"), F.col(f"n.{col}")).alias(col))

            return joined.select(select_cols)
        else:
            import pandas as pd

            merged = pd.merge(existing_df, new_df, on=grain, how="outer", suffixes=("_e", "_n"))

            result = merged[grain].copy()

            for name in measure_names:
                e_col = f"{name}_e" if f"{name}_e" in merged.columns else name
                n_col = f"{name}_n" if f"{name}_n" in merged.columns else name

                if e_col in merged.columns and n_col in merged.columns:
                    result[name] = merged[e_col].fillna(0).infer_objects(copy=False) + merged[
                        n_col
                    ].fillna(0).infer_objects(copy=False)
                elif e_col in merged.columns:
                    result[name] = merged[e_col].fillna(0).infer_objects(copy=False)
                elif n_col in merged.columns:
                    result[name] = merged[n_col].fillna(0).infer_objects(copy=False)
                else:
                    result[name] = 0

            other_cols = [
                c for c in existing_df.columns if c not in grain and c not in measure_names
            ]
            for col in other_cols:
                e_col = f"{col}_e" if f"{col}_e" in merged.columns else col
                n_col = f"{col}_n" if f"{col}_n" in merged.columns else col
                if e_col in merged.columns:
                    result[col] = merged[e_col]
                elif n_col in merged.columns:
                    result[col] = merged[n_col]

            return result

    def _merge_min(
        self,
        context: EngineContext,
        existing_df,
        new_df,
        grain: List[str],
        measures: List[Dict],
    ):
        """
        Min strategy: Keep the minimum value for each measure across existing and new.
        """
        measure_names = [m["name"] for m in measures]

        if context.engine_type == EngineType.SPARK:
            from pyspark.sql import functions as F

            joined = existing_df.alias("e").join(new_df.alias("n"), on=grain, how="full_outer")

            select_cols = []
            for col in grain:
                select_cols.append(F.coalesce(F.col(f"e.{col}"), F.col(f"n.{col}")).alias(col))

            for name in measure_names:
                select_cols.append(
                    F.least(
                        F.coalesce(F.col(f"e.{name}"), F.col(f"n.{name}")),
                        F.coalesce(F.col(f"n.{name}"), F.col(f"e.{name}")),
                    ).alias(name)
                )

            other_cols = [
                c for c in existing_df.columns if c not in grain and c not in measure_names
            ]
            for col in other_cols:
                select_cols.append(F.coalesce(F.col(f"e.{col}"), F.col(f"n.{col}")).alias(col))

            return joined.select(select_cols)
        else:
            import pandas as pd

            merged = pd.merge(existing_df, new_df, on=grain, how="outer", suffixes=("_e", "_n"))

            result = merged[grain].copy()

            for name in measure_names:
                e_col = f"{name}_e" if f"{name}_e" in merged.columns else name
                n_col = f"{name}_n" if f"{name}_n" in merged.columns else name

                if e_col in merged.columns and n_col in merged.columns:
                    result[name] = merged[[e_col, n_col]].min(axis=1)
                elif e_col in merged.columns:
                    result[name] = merged[e_col]
                elif n_col in merged.columns:
                    result[name] = merged[n_col]

            other_cols = [
                c for c in existing_df.columns if c not in grain and c not in measure_names
            ]
            for col in other_cols:
                e_col = f"{col}_e" if f"{col}_e" in merged.columns else col
                n_col = f"{col}_n" if f"{col}_n" in merged.columns else col
                if e_col in merged.columns:
                    result[col] = merged[e_col]
                elif n_col in merged.columns:
                    result[col] = merged[n_col]

            return result

    def _merge_max(
        self,
        context: EngineContext,
        existing_df,
        new_df,
        grain: List[str],
        measures: List[Dict],
    ):
        """
        Max strategy: Keep the maximum value for each measure across existing and new.
        """
        measure_names = [m["name"] for m in measures]

        if context.engine_type == EngineType.SPARK:
            from pyspark.sql import functions as F

            joined = existing_df.alias("e").join(new_df.alias("n"), on=grain, how="full_outer")

            select_cols = []
            for col in grain:
                select_cols.append(F.coalesce(F.col(f"e.{col}"), F.col(f"n.{col}")).alias(col))

            for name in measure_names:
                select_cols.append(
                    F.greatest(
                        F.coalesce(F.col(f"e.{name}"), F.col(f"n.{name}")),
                        F.coalesce(F.col(f"n.{name}"), F.col(f"e.{name}")),
                    ).alias(name)
                )

            other_cols = [
                c for c in existing_df.columns if c not in grain and c not in measure_names
            ]
            for col in other_cols:
                select_cols.append(F.coalesce(F.col(f"e.{col}"), F.col(f"n.{col}")).alias(col))

            return joined.select(select_cols)
        else:
            import pandas as pd

            merged = pd.merge(existing_df, new_df, on=grain, how="outer", suffixes=("_e", "_n"))

            result = merged[grain].copy()

            for name in measure_names:
                e_col = f"{name}_e" if f"{name}_e" in merged.columns else name
                n_col = f"{name}_n" if f"{name}_n" in merged.columns else name

                if e_col in merged.columns and n_col in merged.columns:
                    result[name] = merged[[e_col, n_col]].max(axis=1)
                elif e_col in merged.columns:
                    result[name] = merged[e_col]
                elif n_col in merged.columns:
                    result[name] = merged[n_col]

            other_cols = [
                c for c in existing_df.columns if c not in grain and c not in measure_names
            ]
            for col in other_cols:
                e_col = f"{col}_e" if f"{col}_e" in merged.columns else col
                n_col = f"{col}_n" if f"{col}_n" in merged.columns else col
                if e_col in merged.columns:
                    result[col] = merged[e_col]
                elif n_col in merged.columns:
                    result[col] = merged[n_col]

            return result

    def _add_audit_columns(self, context: EngineContext, df, audit_config: Dict):
        """Add audit columns (load_timestamp, source_system)."""
        load_timestamp = audit_config.get("load_timestamp", False)
        source_system = audit_config.get("source_system")

        if context.engine_type == EngineType.SPARK:
            from pyspark.sql import functions as F

            if load_timestamp:
                df = df.withColumn("load_timestamp", F.current_timestamp())
            if source_system:
                df = df.withColumn("source_system", F.lit(source_system))
        else:
            if load_timestamp or source_system:
                df = df.copy()
                if load_timestamp:
                    df["load_timestamp"] = datetime.now()
                if source_system:
                    df["source_system"] = source_system

        return df
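
To make the incremental merge concrete, here is a minimal sketch of what the "sum" strategy above computes for a single measure, using plain pandas. The frames, the `region` grain column, and the `total_sales` measure are illustrative, not part of the package:

import pandas as pd

# Existing aggregate (already in the target) and a freshly computed aggregate.
existing = pd.DataFrame({"region": ["east", "west"], "total_sales": [100, 50]})
new = pd.DataFrame({"region": ["west", "north"], "total_sales": [25, 10]})

# Same shape as _merge_sum's pandas branch: outer-join on the grain,
# then add the measure from both sides, treating a missing side as 0.
merged = pd.merge(existing, new, on=["region"], how="outer", suffixes=("_e", "_n"))
merged["total_sales"] = merged["total_sales_e"].fillna(0) + merged["total_sales_n"].fillna(0)

print(merged[["region", "total_sales"]])  # east 100, west 75, north 10
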
odibi/patterns/base.py
ADDED
@@ -0,0 +1,94 @@
import time
from abc import ABC, abstractmethod
from typing import Any

from odibi.config import NodeConfig
from odibi.context import EngineContext
from odibi.engine.base import Engine
from odibi.utils.logging_context import get_logging_context


class Pattern(ABC):
    """Base class for Execution Patterns."""

    def __init__(self, engine: Engine, config: NodeConfig):
        self.engine = engine
        self.config = config
        self.params = config.params

    @abstractmethod
    def execute(self, context: EngineContext) -> Any:
        """
        Execute the pattern logic.

        Args:
            context: EngineContext containing current DataFrame and helpers.

        Returns:
            The transformed DataFrame.
        """
        pass

    def validate(self) -> None:
        """
        Validate pattern configuration.
        Raises ValueError if invalid.
        """
        ctx = get_logging_context()
        pattern_name = self.__class__.__name__
        ctx.debug(
            f"{pattern_name} validation starting",
            pattern=pattern_name,
            params=self.params,
        )
        ctx.debug(f"{pattern_name} validation passed", pattern=pattern_name)

    def _log_execution_start(self, **kwargs) -> float:
        """
        Log pattern execution start. Returns start time for elapsed calculation.

        Args:
            **kwargs: Additional key-value pairs to log.

        Returns:
            Start time in seconds.
        """
        ctx = get_logging_context()
        pattern_name = self.__class__.__name__
        ctx.debug(f"{pattern_name} execution starting", pattern=pattern_name, **kwargs)
        return time.time()

    def _log_execution_complete(self, start_time: float, **kwargs) -> None:
        """
        Log pattern execution completion with elapsed time.

        Args:
            start_time: Start time from _log_execution_start.
            **kwargs: Additional key-value pairs to log (e.g., row counts).
        """
        ctx = get_logging_context()
        pattern_name = self.__class__.__name__
        elapsed_ms = (time.time() - start_time) * 1000
        ctx.info(
            f"{pattern_name} execution completed",
            pattern=pattern_name,
            elapsed_ms=round(elapsed_ms, 2),
            **kwargs,
        )

    def _log_error(self, error: Exception, **kwargs) -> None:
        """
        Log error context before raising exceptions.

        Args:
            error: The exception that occurred.
            **kwargs: Additional context to log.
        """
        ctx = get_logging_context()
        pattern_name = self.__class__.__name__
        ctx.error(
            f"{pattern_name} execution failed: {error}",
            pattern=pattern_name,
            error_type=type(error).__name__,
            **kwargs,
        )
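
The `_log_*` helpers on `Pattern` exist so concrete patterns can emit consistent structured logs without repeating the timing boilerplate that `AggregationPattern.execute` writes out by hand. A minimal sketch of a custom subclass wired through those helpers; the class name and the pass-through behavior are illustrative only, not something shipped in the package:

from typing import Any

from odibi.context import EngineContext
from odibi.patterns.base import Pattern


class PassthroughPattern(Pattern):
    """Illustrative pattern: returns the incoming DataFrame unchanged."""

    def execute(self, context: EngineContext) -> Any:
        start_time = self._log_execution_start()
        try:
            df = context.df  # a real pattern would reshape the DataFrame here
            self._log_execution_complete(start_time)
            return df
        except Exception as e:
            self._log_error(e)
            raise
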