odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/semantics/materialize.py
@@ -0,0 +1,392 @@
+"""
+Materialization Module
+======================
+
+Execute and persist materialized metric aggregations.
+
+Materialization pre-computes aggregated metrics at a specific grain
+and writes them to an output table for faster querying.
+"""
+
+import time
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+from odibi.context import EngineContext
+from odibi.enums import EngineType
+from odibi.semantics.metrics import (
+    MaterializationConfig,
+    SemanticLayerConfig,
+)
+from odibi.semantics.query import SemanticQuery
+from odibi.utils.logging_context import get_logging_context
+
+
+@dataclass
+class MaterializationResult:
+    """Result of a materialization execution."""
+
+    name: str
+    output: str
+    row_count: int
+    elapsed_ms: float
+    success: bool
+    error: Optional[str] = None
+
+
+class Materializer:
+    """
+    Execute materializations defined in the semantic layer config.
+
+    Usage:
+        config = SemanticLayerConfig(...)
+        materializer = Materializer(config)
+        result = materializer.execute("monthly_revenue_by_region", context)
+    """
+
+    def __init__(self, config: SemanticLayerConfig):
+        """
+        Initialize with semantic layer configuration.
+
+        Args:
+            config: SemanticLayerConfig with materializations
+        """
+        self.config = config
+        self._query = SemanticQuery(config)
+
+    def execute(
+        self,
+        name: str,
+        context: EngineContext,
+        write_callback: Optional[Any] = None,
+    ) -> MaterializationResult:
+        """
+        Execute a single materialization.
+
+        Args:
+            name: Name of the materialization to execute
+            context: EngineContext with source data
+            write_callback: Optional callback to write output
+                Function signature: (df, output_path) -> None
+
+        Returns:
+            MaterializationResult with execution details
+        """
+        ctx = get_logging_context()
+        start_time = time.time()
+
+        ctx.info("Starting materialization", name=name)
+
+        mat_config = self.config.get_materialization(name)
+        if mat_config is None:
+            available = [m.name for m in self.config.materializations]
+            raise ValueError(f"Materialization '{name}' not found. Available: {available}")
+
+        try:
+            query_string = self._build_query_string(mat_config)
+            ctx.debug("Built query for materialization", query=query_string)
+
+            result = self._query.execute(query_string, context)
+
+            if write_callback:
+                write_callback(result.df, mat_config.output)
+                ctx.debug("Wrote materialization output", output=mat_config.output)
+
+            elapsed_ms = (time.time() - start_time) * 1000
+
+            ctx.info(
+                "Materialization completed",
+                name=name,
+                output=mat_config.output,
+                rows=result.row_count,
+                elapsed_ms=round(elapsed_ms, 2),
+            )
+
+            return MaterializationResult(
+                name=name,
+                output=mat_config.output,
+                row_count=result.row_count,
+                elapsed_ms=elapsed_ms,
+                success=True,
+            )
+
+        except Exception as e:
+            elapsed_ms = (time.time() - start_time) * 1000
+            ctx.error(
+                f"Materialization failed: {e}",
+                name=name,
+                error_type=type(e).__name__,
+                elapsed_ms=round(elapsed_ms, 2),
+            )
+
+            return MaterializationResult(
+                name=name,
+                output=mat_config.output if mat_config else "",
+                row_count=0,
+                elapsed_ms=elapsed_ms,
+                success=False,
+                error=str(e),
+            )
+
+    def execute_all(
+        self,
+        context: EngineContext,
+        write_callback: Optional[Any] = None,
+    ) -> List[MaterializationResult]:
+        """
+        Execute all configured materializations.
+
+        Args:
+            context: EngineContext with source data
+            write_callback: Optional callback to write output
+
+        Returns:
+            List of MaterializationResult for each materialization
+        """
+        ctx = get_logging_context()
+        results = []
+
+        for mat_config in self.config.materializations:
+            result = self.execute(mat_config.name, context, write_callback)
+            results.append(result)
+
+        success_count = sum(1 for r in results if r.success)
+        ctx.info(
+            "All materializations completed",
+            total=len(results),
+            success=success_count,
+            failed=len(results) - success_count,
+        )
+
+        return results
+
+    def _build_query_string(self, mat_config: MaterializationConfig) -> str:
+        """
+        Build a semantic query string from materialization config.
+
+        Args:
+            mat_config: MaterializationConfig
+
+        Returns:
+            Query string like "metric1, metric2 BY dim1, dim2"
+        """
+        metrics_part = ", ".join(mat_config.metrics)
+        dims_part = ", ".join(mat_config.dimensions)
+
+        return f"{metrics_part} BY {dims_part}"
+
+    def get_schedule(self, name: str) -> Optional[str]:
+        """
+        Get the schedule for a materialization.
+
+        Args:
+            name: Materialization name
+
+        Returns:
+            Cron schedule string or None
+        """
+        mat_config = self.config.get_materialization(name)
+        return mat_config.schedule if mat_config else None
+
+    def list_materializations(self) -> List[Dict[str, Any]]:
+        """
+        List all configured materializations.
+
+        Returns:
+            List of materialization info dicts
+        """
+        return [
+            {
+                "name": mat.name,
+                "metrics": mat.metrics,
+                "dimensions": mat.dimensions,
+                "output": mat.output,
+                "schedule": mat.schedule,
+            }
+            for mat in self.config.materializations
+        ]
+
+
+class IncrementalMaterializer:
+    """
+    Execute incremental materializations with merge strategy.
+
+    Supports:
+    - Replace: Replace rows matching the grain
+    - Sum: Add values to existing aggregates
+    """
+
+    def __init__(self, config: SemanticLayerConfig):
+        """
+        Initialize with semantic layer configuration.
+
+        Args:
+            config: SemanticLayerConfig with materializations
+        """
+        self.config = config
+        self._base_materializer = Materializer(config)
+
+    def execute_incremental(
+        self,
+        name: str,
+        context: EngineContext,
+        existing_df: Any,
+        timestamp_column: str,
+        since_timestamp: Any = None,
+        merge_strategy: str = "replace",
+    ) -> MaterializationResult:
+        """
+        Execute an incremental materialization.
+
+        Args:
+            name: Materialization name
+            context: EngineContext with source data
+            existing_df: Existing materialized data
+            timestamp_column: Column for incremental filtering
+            since_timestamp: Only process data after this timestamp
+            merge_strategy: "replace" or "sum"
+
+        Returns:
+            MaterializationResult with merged data
+        """
+        ctx = get_logging_context()
+        start_time = time.time()
+
+        mat_config = self.config.get_materialization(name)
+        if mat_config is None:
+            raise ValueError(f"Materialization '{name}' not found")
+
+        try:
+            source_df = context.df
+
+            if since_timestamp is not None:
+                if context.engine_type == EngineType.SPARK:
+                    from pyspark.sql import functions as F
+
+                    source_df = source_df.filter(F.col(timestamp_column) > since_timestamp)
+                else:
+                    source_df = source_df[source_df[timestamp_column] > since_timestamp]
+
+            incremental_context = context.with_df(source_df)
+
+            query_string = self._base_materializer._build_query_string(mat_config)
+            query = SemanticQuery(self.config)
+            new_result = query.execute(query_string, incremental_context)
+
+            merged_df = self._merge_results(
+                context,
+                existing_df,
+                new_result.df,
+                mat_config.dimensions,
+                mat_config.metrics,
+                merge_strategy,
+            )
+
+            if context.engine_type == EngineType.SPARK:
+                row_count = merged_df.count()
+            else:
+                row_count = len(merged_df)
+
+            elapsed_ms = (time.time() - start_time) * 1000
+
+            ctx.info(
+                "Incremental materialization completed",
+                name=name,
+                strategy=merge_strategy,
+                rows=row_count,
+                elapsed_ms=round(elapsed_ms, 2),
+            )
+
+            return MaterializationResult(
+                name=name,
+                output=mat_config.output,
+                row_count=row_count,
+                elapsed_ms=elapsed_ms,
+                success=True,
+            )
+
+        except Exception as e:
+            elapsed_ms = (time.time() - start_time) * 1000
+            ctx.error(
+                f"Incremental materialization failed: {e}",
+                name=name,
+                error_type=type(e).__name__,
+            )
+
+            return MaterializationResult(
+                name=name,
+                output=mat_config.output if mat_config else "",
+                row_count=0,
+                elapsed_ms=elapsed_ms,
+                success=False,
+                error=str(e),
+            )
+
+    def _merge_results(
+        self,
+        context: EngineContext,
+        existing_df: Any,
+        new_df: Any,
+        dimensions: List[str],
+        metrics: List[str],
+        strategy: str,
+    ) -> Any:
+        """Merge new results with existing data."""
+        if context.engine_type == EngineType.SPARK:
+            return self._merge_spark(existing_df, new_df, dimensions, metrics, strategy)
+        else:
+            return self._merge_pandas(existing_df, new_df, dimensions, metrics, strategy)
+
+    def _merge_spark(
+        self,
+        existing_df: Any,
+        new_df: Any,
+        dimensions: List[str],
+        metrics: List[str],
+        strategy: str,
+    ) -> Any:
+        """Merge using Spark."""
+        from pyspark.sql import functions as F
+
+        if strategy == "replace":
+            join_cond = [existing_df[d] == new_df[d] for d in dimensions]
+            unchanged = existing_df.join(new_df, join_cond, "left_anti")
+            return unchanged.unionByName(new_df, allowMissingColumns=True)
+
+        elif strategy == "sum":
+            combined = existing_df.unionByName(new_df, allowMissingColumns=True)
+            agg_exprs = [F.sum(F.col(m)).alias(m) for m in metrics]
+            return combined.groupBy(dimensions).agg(*agg_exprs)
+
+        else:
+            raise ValueError(f"Unknown merge strategy: {strategy}")
+
+    def _merge_pandas(
+        self,
+        existing_df: Any,
+        new_df: Any,
+        dimensions: List[str],
+        metrics: List[str],
+        strategy: str,
+    ) -> Any:
+        """Merge using Pandas."""
+        import pandas as pd
+
+        if strategy == "replace":
+            merged = pd.merge(
+                existing_df,
+                new_df[dimensions],
+                on=dimensions,
+                how="left",
+                indicator=True,
+            )
+            unchanged = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])
+            unchanged = unchanged[existing_df.columns]
+            return pd.concat([unchanged, new_df], ignore_index=True)
+
+        elif strategy == "sum":
+            combined = pd.concat([existing_df, new_df], ignore_index=True)
+            return combined.groupby(dimensions, as_index=False)[metrics].sum()
+
+        else:
+            raise ValueError(f"Unknown merge strategy: {strategy}")
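
For orientation when reading the diff above, here is a minimal usage sketch of the two classes it adds. It is a sketch under stated assumptions, not confirmed API: the MaterializationConfig field names (name, metrics, dimensions, output, schedule) are inferred from list_materializations, the SemanticLayerConfig constructor call is assumed to accept a materializations list, `context` is assumed to be an EngineContext already holding the source data, `previous_output_df` stands in for an earlier materialized result, and all metric, dimension, and path names are invented for illustration.

    from odibi.semantics.materialize import IncrementalMaterializer, Materializer
    from odibi.semantics.metrics import MaterializationConfig, SemanticLayerConfig

    # Hypothetical config; field names are assumptions inferred from the diff.
    config = SemanticLayerConfig(
        materializations=[
            MaterializationConfig(
                name="monthly_revenue_by_region",
                metrics=["revenue", "order_count"],
                dimensions=["region", "month"],
                output="gold.monthly_revenue_by_region",
                schedule="0 2 * * *",
            )
        ],
    )

    def write_output(df, output_path):
        # Matches the documented callback signature: (df, output_path) -> None.
        # Shown for the pandas engine; a Spark engine would write differently.
        df.to_parquet(f"{output_path}.parquet", index=False)

    materializer = Materializer(config)
    # Internally builds and runs the semantic query
    # "revenue, order_count BY region, month", then invokes the callback.
    result = materializer.execute("monthly_revenue_by_region", context, write_output)

    # Incremental refresh: aggregate only rows newer than a watermark, then
    # merge them into the previously materialized table. "replace" drops and
    # re-inserts rows matching the grain; "sum" adds metric values onto
    # existing aggregates.
    incremental = IncrementalMaterializer(config)
    inc_result = incremental.execute_incremental(
        "monthly_revenue_by_region",
        context,
        existing_df=previous_output_df,
        timestamp_column="order_date",
        since_timestamp="2025-01-01",
        merge_strategy="replace",
    )

Note that the "sum" strategy only makes sense for additive metrics (sums, counts); non-additive aggregates such as averages or distinct counts cannot be merged this way and would need a full rebuild via Materializer.execute.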