odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,392 @@
+"""
+Materialization Module
+======================
+
+Execute and persist materialized metric aggregations.
+
+Materialization pre-computes aggregated metrics at a specific grain
+and writes them to an output table for faster querying.
+"""
+
+import time
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+from odibi.context import EngineContext
+from odibi.enums import EngineType
+from odibi.semantics.metrics import (
+    MaterializationConfig,
+    SemanticLayerConfig,
+)
+from odibi.semantics.query import SemanticQuery
+from odibi.utils.logging_context import get_logging_context
+
+
+@dataclass
+class MaterializationResult:
+    """Result of a materialization execution."""
+
+    name: str
+    output: str
+    row_count: int
+    elapsed_ms: float
+    success: bool
+    error: Optional[str] = None
+
+
+class Materializer:
+    """
+    Execute materializations defined in the semantic layer config.
+
+    Usage:
+        config = SemanticLayerConfig(...)
+        materializer = Materializer(config)
+        result = materializer.execute("monthly_revenue_by_region", context)
+    """
+
+    def __init__(self, config: SemanticLayerConfig):
+        """
+        Initialize with semantic layer configuration.
+
+        Args:
+            config: SemanticLayerConfig with materializations
+        """
+        self.config = config
+        self._query = SemanticQuery(config)
+
+    def execute(
+        self,
+        name: str,
+        context: EngineContext,
+        write_callback: Optional[Any] = None,
+    ) -> MaterializationResult:
+        """
+        Execute a single materialization.
+
+        Args:
+            name: Name of the materialization to execute
+            context: EngineContext with source data
+            write_callback: Optional callback to write output
+                Function signature: (df, output_path) -> None
+
+        Returns:
+            MaterializationResult with execution details
+        """
+        ctx = get_logging_context()
+        start_time = time.time()
+
+        ctx.info("Starting materialization", name=name)
+
+        mat_config = self.config.get_materialization(name)
+        if mat_config is None:
+            available = [m.name for m in self.config.materializations]
+            raise ValueError(f"Materialization '{name}' not found. Available: {available}")
+
+        try:
+            query_string = self._build_query_string(mat_config)
+            ctx.debug("Built query for materialization", query=query_string)
+
+            result = self._query.execute(query_string, context)
+
+            if write_callback:
+                write_callback(result.df, mat_config.output)
+                ctx.debug("Wrote materialization output", output=mat_config.output)
+
+            elapsed_ms = (time.time() - start_time) * 1000
+
+            ctx.info(
+                "Materialization completed",
+                name=name,
+                output=mat_config.output,
+                rows=result.row_count,
+                elapsed_ms=round(elapsed_ms, 2),
+            )
+
+            return MaterializationResult(
+                name=name,
+                output=mat_config.output,
+                row_count=result.row_count,
+                elapsed_ms=elapsed_ms,
+                success=True,
+            )
+
+        except Exception as e:
+            elapsed_ms = (time.time() - start_time) * 1000
+            ctx.error(
+                f"Materialization failed: {e}",
+                name=name,
+                error_type=type(e).__name__,
+                elapsed_ms=round(elapsed_ms, 2),
+            )
+
+            return MaterializationResult(
+                name=name,
+                output=mat_config.output if mat_config else "",
+                row_count=0,
+                elapsed_ms=elapsed_ms,
+                success=False,
+                error=str(e),
+            )
+
+    def execute_all(
+        self,
+        context: EngineContext,
+        write_callback: Optional[Any] = None,
+    ) -> List[MaterializationResult]:
+        """
+        Execute all configured materializations.
+
+        Args:
+            context: EngineContext with source data
+            write_callback: Optional callback to write output
+
+        Returns:
+            List of MaterializationResult for each materialization
+        """
+        ctx = get_logging_context()
+        results = []
+
+        for mat_config in self.config.materializations:
+            result = self.execute(mat_config.name, context, write_callback)
+            results.append(result)
+
+        success_count = sum(1 for r in results if r.success)
+        ctx.info(
+            "All materializations completed",
+            total=len(results),
+            success=success_count,
+            failed=len(results) - success_count,
+        )
+
+        return results
+
+    def _build_query_string(self, mat_config: MaterializationConfig) -> str:
+        """
+        Build a semantic query string from materialization config.
+
+        Args:
+            mat_config: MaterializationConfig
+
+        Returns:
+            Query string like "metric1, metric2 BY dim1, dim2"
+        """
+        metrics_part = ", ".join(mat_config.metrics)
+        dims_part = ", ".join(mat_config.dimensions)
+
+        return f"{metrics_part} BY {dims_part}"
+
+    def get_schedule(self, name: str) -> Optional[str]:
+        """
+        Get the schedule for a materialization.
+
+        Args:
+            name: Materialization name
+
+        Returns:
+            Cron schedule string or None
+        """
+        mat_config = self.config.get_materialization(name)
+        return mat_config.schedule if mat_config else None
+
+    def list_materializations(self) -> List[Dict[str, Any]]:
+        """
+        List all configured materializations.
+
+        Returns:
+            List of materialization info dicts
+        """
+        return [
+            {
+                "name": mat.name,
+                "metrics": mat.metrics,
+                "dimensions": mat.dimensions,
+                "output": mat.output,
+                "schedule": mat.schedule,
+            }
+            for mat in self.config.materializations
+        ]
+
+
+class IncrementalMaterializer:
+    """
+    Execute incremental materializations with merge strategy.
+
+    Supports:
+    - Replace: Replace rows matching the grain
+    - Sum: Add values to existing aggregates
+    """
+
+    def __init__(self, config: SemanticLayerConfig):
+        """
+        Initialize with semantic layer configuration.
+
+        Args:
+            config: SemanticLayerConfig with materializations
+        """
+        self.config = config
+        self._base_materializer = Materializer(config)
+
+    def execute_incremental(
+        self,
+        name: str,
+        context: EngineContext,
+        existing_df: Any,
+        timestamp_column: str,
+        since_timestamp: Any = None,
+        merge_strategy: str = "replace",
+    ) -> MaterializationResult:
+        """
+        Execute an incremental materialization.
+
+        Args:
+            name: Materialization name
+            context: EngineContext with source data
+            existing_df: Existing materialized data
+            timestamp_column: Column for incremental filtering
+            since_timestamp: Only process data after this timestamp
+            merge_strategy: "replace" or "sum"
+
+        Returns:
+            MaterializationResult with merged data
+        """
+        ctx = get_logging_context()
+        start_time = time.time()
+
+        mat_config = self.config.get_materialization(name)
+        if mat_config is None:
+            raise ValueError(f"Materialization '{name}' not found")
+
+        try:
+            source_df = context.df
+
+            if since_timestamp is not None:
+                if context.engine_type == EngineType.SPARK:
+                    from pyspark.sql import functions as F
+
+                    source_df = source_df.filter(F.col(timestamp_column) > since_timestamp)
+                else:
+                    source_df = source_df[source_df[timestamp_column] > since_timestamp]
+
+            incremental_context = context.with_df(source_df)
+
+            query_string = self._base_materializer._build_query_string(mat_config)
+            query = SemanticQuery(self.config)
+            new_result = query.execute(query_string, incremental_context)
+
+            merged_df = self._merge_results(
+                context,
+                existing_df,
+                new_result.df,
+                mat_config.dimensions,
+                mat_config.metrics,
+                merge_strategy,
+            )
+
+            if context.engine_type == EngineType.SPARK:
+                row_count = merged_df.count()
+            else:
+                row_count = len(merged_df)
+
+            elapsed_ms = (time.time() - start_time) * 1000
+
+            ctx.info(
+                "Incremental materialization completed",
+                name=name,
+                strategy=merge_strategy,
+                rows=row_count,
+                elapsed_ms=round(elapsed_ms, 2),
+            )
+
+            return MaterializationResult(
+                name=name,
+                output=mat_config.output,
+                row_count=row_count,
+                elapsed_ms=elapsed_ms,
+                success=True,
+            )
+
+        except Exception as e:
+            elapsed_ms = (time.time() - start_time) * 1000
+            ctx.error(
+                f"Incremental materialization failed: {e}",
+                name=name,
+                error_type=type(e).__name__,
+            )
+
+            return MaterializationResult(
+                name=name,
+                output=mat_config.output if mat_config else "",
+                row_count=0,
+                elapsed_ms=elapsed_ms,
+                success=False,
+                error=str(e),
+            )
+
+    def _merge_results(
+        self,
+        context: EngineContext,
+        existing_df: Any,
+        new_df: Any,
+        dimensions: List[str],
+        metrics: List[str],
+        strategy: str,
+    ) -> Any:
+        """Merge new results with existing data."""
+        if context.engine_type == EngineType.SPARK:
+            return self._merge_spark(existing_df, new_df, dimensions, metrics, strategy)
+        else:
+            return self._merge_pandas(existing_df, new_df, dimensions, metrics, strategy)
+
+    def _merge_spark(
+        self,
+        existing_df: Any,
+        new_df: Any,
+        dimensions: List[str],
+        metrics: List[str],
+        strategy: str,
+    ) -> Any:
+        """Merge using Spark."""
+        from pyspark.sql import functions as F
+
+        if strategy == "replace":
+            join_cond = [existing_df[d] == new_df[d] for d in dimensions]
+            unchanged = existing_df.join(new_df, join_cond, "left_anti")
+            return unchanged.unionByName(new_df, allowMissingColumns=True)
+
+        elif strategy == "sum":
+            combined = existing_df.unionByName(new_df, allowMissingColumns=True)
+            agg_exprs = [F.sum(F.col(m)).alias(m) for m in metrics]
+            return combined.groupBy(dimensions).agg(*agg_exprs)
+
+        else:
+            raise ValueError(f"Unknown merge strategy: {strategy}")
+
+    def _merge_pandas(
+        self,
+        existing_df: Any,
+        new_df: Any,
+        dimensions: List[str],
+        metrics: List[str],
+        strategy: str,
+    ) -> Any:
+        """Merge using Pandas."""
+        import pandas as pd
+
+        if strategy == "replace":
+            merged = pd.merge(
+                existing_df,
+                new_df[dimensions],
+                on=dimensions,
+                how="left",
+                indicator=True,
+            )
+            unchanged = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])
+            unchanged = unchanged[existing_df.columns]
+            return pd.concat([unchanged, new_df], ignore_index=True)
+
+        elif strategy == "sum":
+            combined = pd.concat([existing_df, new_df], ignore_index=True)
+            return combined.groupby(dimensions, as_index=False)[metrics].sum()
+
+        else:
+            raise ValueError(f"Unknown merge strategy: {strategy}")
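To make the merge semantics in this diff easier to evaluate, the sketch below replays the same pandas operations that IncrementalMaterializer._merge_pandas performs, on toy data. It does not import odibi, and the column names and values are invented purely for illustration.

# Standalone illustration of the "replace" and "sum" strategies shown above.
import pandas as pd

dimensions = ["region", "month"]
metrics = ["revenue"]

existing = pd.DataFrame(
    {"region": ["NA", "EU"], "month": ["2024-01", "2024-01"], "revenue": [100.0, 80.0]}
)
new = pd.DataFrame(
    {"region": ["EU", "APAC"], "month": ["2024-01", "2024-01"], "revenue": [90.0, 40.0]}
)

# "replace": keep existing rows whose grain is absent from the new slice,
# then append the new rows (EU is replaced, NA kept, APAC added).
merged = pd.merge(existing, new[dimensions], on=dimensions, how="left", indicator=True)
unchanged = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])
unchanged = unchanged[existing.columns]
replaced = pd.concat([unchanged, new], ignore_index=True)

# "sum": stack both frames and re-aggregate, adding metric values per grain
# (EU becomes 80 + 90 = 170).
combined = pd.concat([existing, new], ignore_index=True)
summed = combined.groupby(dimensions, as_index=False)[metrics].sum()

print(replaced)
print(summed)

As implemented in the module, "replace" lets the latest re-aggregation of a grain win outright, while "sum" adds the new slice onto the existing totals and therefore assumes the rows selected by since_timestamp have not already been counted in existing_df.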