odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/patterns/aggregation.py ADDED
@@ -0,0 +1,599 @@
+ import time
+ from datetime import datetime
+ from typing import Any, Dict, List, Optional
+
+ from odibi.context import EngineContext
+ from odibi.enums import EngineType
+ from odibi.patterns.base import Pattern
+ from odibi.utils.logging_context import get_logging_context
+
+
+ class AggregationPattern(Pattern):
+     """
+     Aggregation Pattern: Declarative aggregation with time-grain rollups.
+
+     Features:
+     - Declare grain (GROUP BY columns)
+     - Declare measures with aggregation functions
+     - Incremental aggregation (merge new data with existing)
+     - Time rollups (generate multiple grain levels)
+     - Audit columns
+
+     Configuration Options (via params dict):
+     - **grain** (list): Columns to GROUP BY (defines uniqueness)
+     - **measures** (list): Measure definitions with name and aggregation expr
+         - name: Output column name
+         - expr: SQL aggregation expression (e.g., "SUM(amount)")
+     - **incremental** (dict): Incremental merge configuration (optional)
+         - timestamp_column: Column to identify new data
+         - merge_strategy: "replace", "sum", "min", or "max"
+     - **having** (str): Optional HAVING clause for filtering aggregates
+     - **audit** (dict): Audit column configuration
+
+     Example Config:
+         pattern:
+           type: aggregation
+           params:
+             grain: [date_sk, product_sk]
+             measures:
+               - name: total_revenue
+                 expr: "SUM(total_amount)"
+               - name: order_count
+                 expr: "COUNT(*)"
+               - name: avg_order_value
+                 expr: "AVG(total_amount)"
+             having: "COUNT(*) > 0"
+             audit:
+               load_timestamp: true
+     """
+
+     def validate(self) -> None:
+         ctx = get_logging_context()
+         grain = self.params.get("grain")
+         measures = self.params.get("measures", [])
+
+         ctx.debug(
+             "AggregationPattern validation starting",
+             pattern="AggregationPattern",
+             grain=grain,
+             measures_count=len(measures),
+         )
+
+         if not grain:
+             ctx.error(
+                 "AggregationPattern validation failed: 'grain' is required",
+                 pattern="AggregationPattern",
+             )
+             raise ValueError(
+                 "AggregationPattern: 'grain' parameter is required. "
+                 "Grain defines the grouping columns for aggregation (e.g., ['date', 'region']). "
+                 "Provide a list of column names to group by."
+             )
+
+         if not measures:
+             ctx.error(
+                 "AggregationPattern validation failed: 'measures' is required",
+                 pattern="AggregationPattern",
+             )
+             raise ValueError(
+                 "AggregationPattern: 'measures' parameter is required. "
+                 "Measures define the aggregations to compute (e.g., [{'name': 'total_sales', 'expr': 'sum(amount)'}]). "
+                 "Provide a list of dicts, each with 'name' and 'expr' keys."
+             )
+
+         for i, measure in enumerate(measures):
+             if not isinstance(measure, dict):
+                 ctx.error(
+                     f"AggregationPattern validation failed: measure[{i}] must be a dict",
+                     pattern="AggregationPattern",
+                 )
+                 raise ValueError(
+                     f"AggregationPattern: measure[{i}] must be a dict with 'name' and 'expr'. "
+                     f"Got {type(measure).__name__}: {measure!r}. "
+                     "Example: {'name': 'total_sales', 'expr': 'sum(amount)'}"
+                 )
+             if "name" not in measure:
+                 ctx.error(
+                     f"AggregationPattern validation failed: measure[{i}] missing 'name'",
+                     pattern="AggregationPattern",
+                 )
+                 raise ValueError(
+                     f"AggregationPattern: measure[{i}] missing 'name'. "
+                     f"Got: {measure!r}. Add a 'name' key for the output column name."
+                 )
+             if "expr" not in measure:
+                 ctx.error(
+                     f"AggregationPattern validation failed: measure[{i}] missing 'expr'",
+                     pattern="AggregationPattern",
+                 )
+                 raise ValueError(
+                     f"AggregationPattern: measure[{i}] missing 'expr'. "
+                     f"Got: {measure!r}. Add an 'expr' key with the aggregation expression (e.g., 'sum(amount)')."
+                 )
+
+         incremental = self.params.get("incremental")
+         if incremental:
+             if "timestamp_column" not in incremental:
+                 ctx.error(
+                     "AggregationPattern validation failed: incremental missing 'timestamp_column'",
+                     pattern="AggregationPattern",
+                 )
+                 raise ValueError(
+                     "AggregationPattern: incremental config requires 'timestamp_column'. "
+                     f"Got: {incremental!r}. "
+                     "Add 'timestamp_column' to specify which column tracks record timestamps."
+                 )
+             merge_strategy = incremental.get("merge_strategy", "replace")
+             if merge_strategy not in ("replace", "sum", "min", "max"):
+                 ctx.error(
+                     f"AggregationPattern validation failed: invalid merge_strategy '{merge_strategy}'",
+                     pattern="AggregationPattern",
+                 )
+                 raise ValueError(
+                     f"AggregationPattern: 'merge_strategy' must be 'replace', 'sum', 'min', or 'max'. "
+                     f"Got: {merge_strategy}"
+                 )
+
+         ctx.debug(
+             "AggregationPattern validation passed",
+             pattern="AggregationPattern",
+         )
+
+     def execute(self, context: EngineContext) -> Any:
+         ctx = get_logging_context()
+         start_time = time.time()
+
+         grain = self.params.get("grain")
+         measures = self.params.get("measures", [])
+         having = self.params.get("having")
+         incremental = self.params.get("incremental")
+         audit_config = self.params.get("audit", {})
+         target = self.params.get("target")
+
+         ctx.debug(
+             "AggregationPattern starting",
+             pattern="AggregationPattern",
+             grain=grain,
+             measures_count=len(measures),
+             incremental=incremental is not None,
+         )
+
+         df = context.df
+         source_count = self._get_row_count(df, context.engine_type)
+         ctx.debug(
+             "Aggregation source loaded",
+             pattern="AggregationPattern",
+             source_rows=source_count,
+         )
+
+         try:
+             result_df = self._aggregate(context, df, grain, measures, having)
+
+             if incremental and target:
+                 result_df = self._apply_incremental(
+                     context, result_df, grain, measures, incremental, target
+                 )
+
+             result_df = self._add_audit_columns(context, result_df, audit_config)
+
+             result_count = self._get_row_count(result_df, context.engine_type)
+             elapsed_ms = (time.time() - start_time) * 1000
+
+             ctx.info(
+                 "AggregationPattern completed",
+                 pattern="AggregationPattern",
+                 elapsed_ms=round(elapsed_ms, 2),
+                 source_rows=source_count,
+                 result_rows=result_count,
+                 grain=grain,
+             )
+
+             return result_df
+
+         except Exception as e:
+             elapsed_ms = (time.time() - start_time) * 1000
+             ctx.error(
+                 f"AggregationPattern failed: {e}",
+                 pattern="AggregationPattern",
+                 error_type=type(e).__name__,
+                 elapsed_ms=round(elapsed_ms, 2),
+             )
+             raise
+
+     def _get_row_count(self, df, engine_type) -> Optional[int]:
+         try:
+             if engine_type == EngineType.SPARK:
+                 return df.count()
+             else:
+                 return len(df)
+         except Exception:
+             return None
+
+     def _aggregate(
+         self,
+         context: EngineContext,
+         df,
+         grain: List[str],
+         measures: List[Dict],
+         having: Optional[str],
+     ):
+         """Perform the aggregation using SQL."""
+         if context.engine_type == EngineType.SPARK:
+             return self._aggregate_spark(context, df, grain, measures, having)
+         else:
+             return self._aggregate_pandas(context, df, grain, measures, having)
+
+     def _aggregate_spark(
+         self,
+         context: EngineContext,
+         df,
+         grain: List[str],
+         measures: List[Dict],
+         having: Optional[str],
+     ):
+         """Aggregate using Spark SQL."""
+         from pyspark.sql import functions as F
+
+         grain_cols = [F.col(c) for c in grain]
+
+         agg_exprs = []
+         for measure in measures:
+             name = measure["name"]
+             expr = measure["expr"]
+             agg_exprs.append(F.expr(expr).alias(name))
+
+         result = df.groupBy(*grain_cols).agg(*agg_exprs)
+
+         if having:
+             result = result.filter(F.expr(having))
+
+         return result
+
+     def _aggregate_pandas(
+         self,
+         context: EngineContext,
+         df,
+         grain: List[str],
+         measures: List[Dict],
+         having: Optional[str],
+     ):
+         """Aggregate using DuckDB SQL via context.sql()."""
+         grain_str = ", ".join(grain)
+
+         measure_exprs = []
+         for measure in measures:
+             name = measure["name"]
+             expr = measure["expr"]
+             measure_exprs.append(f"{expr} AS {name}")
+         measures_str = ", ".join(measure_exprs)
+
+         sql = f"SELECT {grain_str}, {measures_str} FROM df GROUP BY {grain_str}"
+
+         if having:
+             sql += f" HAVING {having}"
+
+         temp_context = context.with_df(df)
+         result_context = temp_context.sql(sql)
+         return result_context.df
+
+     def _apply_incremental(
+         self,
+         context: EngineContext,
+         new_agg_df,
+         grain: List[str],
+         measures: List[Dict],
+         incremental: Dict,
+         target: str,
+     ):
+         """Apply incremental merge with existing aggregations."""
+         merge_strategy = incremental.get("merge_strategy", "replace")
+
+         existing_df = self._load_existing_target(context, target)
+         if existing_df is None:
+             return new_agg_df
+
+         if merge_strategy == "replace":
+             return self._merge_replace(context, existing_df, new_agg_df, grain)
+         elif merge_strategy == "sum":
+             return self._merge_sum(context, existing_df, new_agg_df, grain, measures)
+         elif merge_strategy == "min":
+             return self._merge_min(context, existing_df, new_agg_df, grain, measures)
+         else:  # max
+             return self._merge_max(context, existing_df, new_agg_df, grain, measures)
+
+     def _load_existing_target(self, context: EngineContext, target: str):
+         """Load existing target table if it exists."""
+         if context.engine_type == EngineType.SPARK:
+             return self._load_existing_spark(context, target)
+         else:
+             return self._load_existing_pandas(context, target)
+
+     def _load_existing_spark(self, context: EngineContext, target: str):
+         spark = context.spark
+         try:
+             return spark.table(target)
+         except Exception:
+             try:
+                 return spark.read.format("delta").load(target)
+             except Exception:
+                 return None
+
+     def _load_existing_pandas(self, context: EngineContext, target: str):
+         import os
+
+         import pandas as pd
+
+         path = target
+         if hasattr(context, "engine") and context.engine:
+             if "." in path:
+                 parts = path.split(".", 1)
+                 conn_name = parts[0]
+                 rel_path = parts[1]
+                 if conn_name in context.engine.connections:
+                     try:
+                         path = context.engine.connections[conn_name].get_path(rel_path)
+                     except Exception:
+                         pass
+
+         if not os.path.exists(path):
+             return None
+
+         try:
+             if str(path).endswith(".parquet") or os.path.isdir(path):
+                 return pd.read_parquet(path)
+             elif str(path).endswith(".csv"):
+                 return pd.read_csv(path)
+         except Exception:
+             return None
+
+         return None
+
+     def _merge_replace(self, context: EngineContext, existing_df, new_df, grain: List[str]):
+         """
+         Replace strategy: New aggregates overwrite existing for matching grain keys.
+         """
+         if context.engine_type == EngineType.SPARK:
+             new_keys = new_df.select(grain).distinct()
+
+             unchanged = existing_df.join(new_keys, on=grain, how="left_anti")
+
+             return unchanged.unionByName(new_df, allowMissingColumns=True)
+         else:
+             import pandas as pd
+
+             new_keys = new_df[grain].drop_duplicates()
+
+             merged = pd.merge(existing_df, new_keys, on=grain, how="left", indicator=True)
+             unchanged = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])
+
+             return pd.concat([unchanged, new_df], ignore_index=True)
+
+     def _merge_sum(
+         self,
+         context: EngineContext,
+         existing_df,
+         new_df,
+         grain: List[str],
+         measures: List[Dict],
+     ):
+         """
+         Sum strategy: Add new measure values to existing for matching grain keys.
+         """
+         measure_names = [m["name"] for m in measures]
+
+         if context.engine_type == EngineType.SPARK:
+             from pyspark.sql import functions as F
+
+             joined = existing_df.alias("e").join(new_df.alias("n"), on=grain, how="full_outer")
+
+             select_cols = []
+             for col in grain:
+                 select_cols.append(F.coalesce(F.col(f"e.{col}"), F.col(f"n.{col}")).alias(col))
+
+             for name in measure_names:
+                 select_cols.append(
+                     (
+                         F.coalesce(F.col(f"e.{name}"), F.lit(0))
+                         + F.coalesce(F.col(f"n.{name}"), F.lit(0))
+                     ).alias(name)
+                 )
+
+             other_cols = [
+                 c for c in existing_df.columns if c not in grain and c not in measure_names
+             ]
+             for col in other_cols:
+                 select_cols.append(F.coalesce(F.col(f"e.{col}"), F.col(f"n.{col}")).alias(col))
+
+             return joined.select(select_cols)
+         else:
+             import pandas as pd
+
+             merged = pd.merge(existing_df, new_df, on=grain, how="outer", suffixes=("_e", "_n"))
+
+             result = merged[grain].copy()
+
+             for name in measure_names:
+                 e_col = f"{name}_e" if f"{name}_e" in merged.columns else name
+                 n_col = f"{name}_n" if f"{name}_n" in merged.columns else name
+
+                 if e_col in merged.columns and n_col in merged.columns:
+                     result[name] = merged[e_col].fillna(0).infer_objects(copy=False) + merged[
+                         n_col
+                     ].fillna(0).infer_objects(copy=False)
+                 elif e_col in merged.columns:
+                     result[name] = merged[e_col].fillna(0).infer_objects(copy=False)
+                 elif n_col in merged.columns:
+                     result[name] = merged[n_col].fillna(0).infer_objects(copy=False)
+                 else:
+                     result[name] = 0
+
+             other_cols = [
+                 c for c in existing_df.columns if c not in grain and c not in measure_names
+             ]
+             for col in other_cols:
+                 e_col = f"{col}_e" if f"{col}_e" in merged.columns else col
+                 n_col = f"{col}_n" if f"{col}_n" in merged.columns else col
+                 if e_col in merged.columns:
+                     result[col] = merged[e_col]
+                 elif n_col in merged.columns:
+                     result[col] = merged[n_col]
+
+             return result
+
+     def _merge_min(
+         self,
+         context: EngineContext,
+         existing_df,
+         new_df,
+         grain: List[str],
+         measures: List[Dict],
+     ):
+         """
+         Min strategy: Keep the minimum value for each measure across existing and new.
+         """
+         measure_names = [m["name"] for m in measures]
+
+         if context.engine_type == EngineType.SPARK:
+             from pyspark.sql import functions as F
+
+             joined = existing_df.alias("e").join(new_df.alias("n"), on=grain, how="full_outer")
+
+             select_cols = []
+             for col in grain:
+                 select_cols.append(F.coalesce(F.col(f"e.{col}"), F.col(f"n.{col}")).alias(col))
+
+             for name in measure_names:
+                 select_cols.append(
+                     F.least(
+                         F.coalesce(F.col(f"e.{name}"), F.col(f"n.{name}")),
+                         F.coalesce(F.col(f"n.{name}"), F.col(f"e.{name}")),
+                     ).alias(name)
+                 )
+
+             other_cols = [
+                 c for c in existing_df.columns if c not in grain and c not in measure_names
+             ]
+             for col in other_cols:
+                 select_cols.append(F.coalesce(F.col(f"e.{col}"), F.col(f"n.{col}")).alias(col))
+
+             return joined.select(select_cols)
+         else:
+             import pandas as pd
+
+             merged = pd.merge(existing_df, new_df, on=grain, how="outer", suffixes=("_e", "_n"))
+
+             result = merged[grain].copy()
+
+             for name in measure_names:
+                 e_col = f"{name}_e" if f"{name}_e" in merged.columns else name
+                 n_col = f"{name}_n" if f"{name}_n" in merged.columns else name
+
+                 if e_col in merged.columns and n_col in merged.columns:
+                     result[name] = merged[[e_col, n_col]].min(axis=1)
+                 elif e_col in merged.columns:
+                     result[name] = merged[e_col]
+                 elif n_col in merged.columns:
+                     result[name] = merged[n_col]
+
+             other_cols = [
+                 c for c in existing_df.columns if c not in grain and c not in measure_names
+             ]
+             for col in other_cols:
+                 e_col = f"{col}_e" if f"{col}_e" in merged.columns else col
+                 n_col = f"{col}_n" if f"{col}_n" in merged.columns else col
+                 if e_col in merged.columns:
+                     result[col] = merged[e_col]
+                 elif n_col in merged.columns:
+                     result[col] = merged[n_col]
+
+             return result
+
+     def _merge_max(
+         self,
+         context: EngineContext,
+         existing_df,
+         new_df,
+         grain: List[str],
+         measures: List[Dict],
+     ):
+         """
+         Max strategy: Keep the maximum value for each measure across existing and new.
+         """
+         measure_names = [m["name"] for m in measures]
+
+         if context.engine_type == EngineType.SPARK:
+             from pyspark.sql import functions as F
+
+             joined = existing_df.alias("e").join(new_df.alias("n"), on=grain, how="full_outer")
+
+             select_cols = []
+             for col in grain:
+                 select_cols.append(F.coalesce(F.col(f"e.{col}"), F.col(f"n.{col}")).alias(col))
+
+             for name in measure_names:
+                 select_cols.append(
+                     F.greatest(
+                         F.coalesce(F.col(f"e.{name}"), F.col(f"n.{name}")),
+                         F.coalesce(F.col(f"n.{name}"), F.col(f"e.{name}")),
+                     ).alias(name)
+                 )
+
+             other_cols = [
+                 c for c in existing_df.columns if c not in grain and c not in measure_names
+             ]
+             for col in other_cols:
+                 select_cols.append(F.coalesce(F.col(f"e.{col}"), F.col(f"n.{col}")).alias(col))
+
+             return joined.select(select_cols)
+         else:
+             import pandas as pd
+
+             merged = pd.merge(existing_df, new_df, on=grain, how="outer", suffixes=("_e", "_n"))
+
+             result = merged[grain].copy()
+
+             for name in measure_names:
+                 e_col = f"{name}_e" if f"{name}_e" in merged.columns else name
+                 n_col = f"{name}_n" if f"{name}_n" in merged.columns else name
+
+                 if e_col in merged.columns and n_col in merged.columns:
+                     result[name] = merged[[e_col, n_col]].max(axis=1)
+                 elif e_col in merged.columns:
+                     result[name] = merged[e_col]
+                 elif n_col in merged.columns:
+                     result[name] = merged[n_col]
+
+             other_cols = [
+                 c for c in existing_df.columns if c not in grain and c not in measure_names
+             ]
+             for col in other_cols:
+                 e_col = f"{col}_e" if f"{col}_e" in merged.columns else col
+                 n_col = f"{col}_n" if f"{col}_n" in merged.columns else col
+                 if e_col in merged.columns:
+                     result[col] = merged[e_col]
+                 elif n_col in merged.columns:
+                     result[col] = merged[n_col]
+
+             return result
+
+     def _add_audit_columns(self, context: EngineContext, df, audit_config: Dict):
+         """Add audit columns (load_timestamp, source_system)."""
+         load_timestamp = audit_config.get("load_timestamp", False)
+         source_system = audit_config.get("source_system")
+
+         if context.engine_type == EngineType.SPARK:
+             from pyspark.sql import functions as F
+
+             if load_timestamp:
+                 df = df.withColumn("load_timestamp", F.current_timestamp())
+             if source_system:
+                 df = df.withColumn("source_system", F.lit(source_system))
+         else:
+             if load_timestamp or source_system:
+                 df = df.copy()
+             if load_timestamp:
+                 df["load_timestamp"] = datetime.now()
+             if source_system:
+                 df["source_system"] = source_system
+
+         return df
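
Note on the pandas path above: _aggregate_pandas simply builds a GROUP BY statement from grain, measures, and having and hands it to context.sql(), which per its docstring runs DuckDB SQL. Below is a minimal standalone sketch of that same SQL construction, run directly through DuckDB against a toy DataFrame. The build_aggregation_sql helper, the sample columns, and the direct duckdb.query() call are illustrative assumptions, not odibi's API.

# Standalone sketch: reproduces the SQL string that _aggregate_pandas generates.
import duckdb
import pandas as pd

grain = ["date_sk", "product_sk"]
measures = [
    {"name": "total_revenue", "expr": "SUM(total_amount)"},
    {"name": "order_count", "expr": "COUNT(*)"},
]
having = "COUNT(*) > 0"


def build_aggregation_sql(grain, measures, having=None):
    # Same string-building steps as _aggregate_pandas: grain list becomes the
    # GROUP BY, each measure becomes "<expr> AS <name>", optional HAVING at the end.
    grain_str = ", ".join(grain)
    measures_str = ", ".join(f"{m['expr']} AS {m['name']}" for m in measures)
    sql = f"SELECT {grain_str}, {measures_str} FROM df GROUP BY {grain_str}"
    if having:
        sql += f" HAVING {having}"
    return sql


df = pd.DataFrame(
    {
        "date_sk": [20240101, 20240101, 20240102],
        "product_sk": [1, 1, 2],
        "total_amount": [10.0, 15.0, 7.5],
    }
)

sql = build_aggregation_sql(grain, measures, having)
# SELECT date_sk, product_sk, SUM(total_amount) AS total_revenue, COUNT(*) AS order_count
# FROM df GROUP BY date_sk, product_sk HAVING COUNT(*) > 0
print(duckdb.query(sql).to_df())  # DuckDB resolves "df" from the surrounding Python scope
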
odibi/patterns/base.py ADDED
@@ -0,0 +1,94 @@
+ import time
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ from odibi.config import NodeConfig
+ from odibi.context import EngineContext
+ from odibi.engine.base import Engine
+ from odibi.utils.logging_context import get_logging_context
+
+
+ class Pattern(ABC):
+     """Base class for Execution Patterns."""
+
+     def __init__(self, engine: Engine, config: NodeConfig):
+         self.engine = engine
+         self.config = config
+         self.params = config.params
+
+     @abstractmethod
+     def execute(self, context: EngineContext) -> Any:
+         """
+         Execute the pattern logic.
+
+         Args:
+             context: EngineContext containing current DataFrame and helpers.
+
+         Returns:
+             The transformed DataFrame.
+         """
+         pass
+
+     def validate(self) -> None:
+         """
+         Validate pattern configuration.
+         Raises ValueError if invalid.
+         """
+         ctx = get_logging_context()
+         pattern_name = self.__class__.__name__
+         ctx.debug(
+             f"{pattern_name} validation starting",
+             pattern=pattern_name,
+             params=self.params,
+         )
+         ctx.debug(f"{pattern_name} validation passed", pattern=pattern_name)
+
+     def _log_execution_start(self, **kwargs) -> float:
+         """
+         Log pattern execution start. Returns start time for elapsed calculation.
+
+         Args:
+             **kwargs: Additional key-value pairs to log.
+
+         Returns:
+             Start time in seconds.
+         """
+         ctx = get_logging_context()
+         pattern_name = self.__class__.__name__
+         ctx.debug(f"{pattern_name} execution starting", pattern=pattern_name, **kwargs)
+         return time.time()
+
+     def _log_execution_complete(self, start_time: float, **kwargs) -> None:
+         """
+         Log pattern execution completion with elapsed time.
+
+         Args:
+             start_time: Start time from _log_execution_start.
+             **kwargs: Additional key-value pairs to log (e.g., row counts).
+         """
+         ctx = get_logging_context()
+         pattern_name = self.__class__.__name__
+         elapsed_ms = (time.time() - start_time) * 1000
+         ctx.info(
+             f"{pattern_name} execution completed",
+             pattern=pattern_name,
+             elapsed_ms=round(elapsed_ms, 2),
+             **kwargs,
+         )
+
+     def _log_error(self, error: Exception, **kwargs) -> None:
+         """
+         Log error context before raising exceptions.
+
+         Args:
+             error: The exception that occurred.
+             **kwargs: Additional context to log.
+         """
+         ctx = get_logging_context()
+         pattern_name = self.__class__.__name__
+         ctx.error(
+             f"{pattern_name} execution failed: {error}",
+             pattern=pattern_name,
+             error_type=type(error).__name__,
+             **kwargs,
+         )
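
For orientation, here is a minimal sketch of how a concrete pattern might subclass Pattern and use the helpers above. The DropNullsPattern class and its "columns" parameter are hypothetical and only illustrate the validate()/execute() contract on the pandas path; they are not part of the odibi package.

# Hypothetical subclass sketch (not shipped with odibi).
from typing import Any

from odibi.context import EngineContext
from odibi.patterns.base import Pattern


class DropNullsPattern(Pattern):
    """Drops rows with nulls in the configured columns (illustrative only)."""

    def validate(self) -> None:
        # self.params comes from NodeConfig.params, assigned in Pattern.__init__.
        if not self.params.get("columns"):
            raise ValueError("DropNullsPattern: 'columns' parameter is required.")

    def execute(self, context: EngineContext) -> Any:
        columns = self.params["columns"]
        start_time = self._log_execution_start(columns=columns)
        try:
            # Assumes context.df is a pandas DataFrame (the non-Spark path).
            result = context.df.dropna(subset=columns)
            self._log_execution_complete(start_time, result_rows=len(result))
            return result
        except Exception as e:
            self._log_error(e, columns=columns)
            raise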