odibi-2.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/semantics/query.py
ADDED
@@ -0,0 +1,743 @@
"""
Semantic Query Module
=====================

Parse and execute semantic queries in the format:
    "metric1, metric2 BY dimension1, dimension2"

Example:
    "revenue, order_count BY region, month"

This generates SQL-like aggregation queries from semantic definitions.
"""

import re
import time
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple

from odibi.context import EngineContext
from odibi.enums import EngineType
from odibi.semantics.metrics import (
    DimensionDefinition,
    MetricDefinition,
    MetricType,
    SemanticLayerConfig,
)
from odibi.utils.logging_context import get_logging_context


@dataclass
class ParsedQuery:
    """Result of parsing a semantic query string."""

    metrics: List[str] = field(default_factory=list)
    dimensions: List[str] = field(default_factory=list)
    filters: List[str] = field(default_factory=list)
    raw_query: str = ""


@dataclass
class QueryResult:
    """Result of executing a semantic query."""

    df: Any
    metrics: List[str]
    dimensions: List[str]
    row_count: int
    elapsed_ms: float
    sql_generated: Optional[str] = None


class SemanticQuery:
    """
    Execute semantic queries against a configured semantic layer.

    Usage:
        config = SemanticLayerConfig(...)
        query = SemanticQuery(config)
        result = query.execute("revenue BY region, month", context)
    """

    def __init__(self, config: SemanticLayerConfig):
        """
        Initialize with semantic layer configuration.

        Args:
            config: SemanticLayerConfig with metrics and dimensions
        """
        self.config = config
        self._metric_cache: Dict[str, MetricDefinition] = {}
        self._dimension_cache: Dict[str, DimensionDefinition] = {}

        for metric in config.metrics:
            self._metric_cache[metric.name] = metric

        for dim in config.dimensions:
            self._dimension_cache[dim.name] = dim

    def parse(self, query_string: str) -> ParsedQuery:
        """
        Parse a semantic query string.

        Format: "metric1, metric2 BY dimension1, dimension2 WHERE condition"

        Args:
            query_string: Query string to parse

        Returns:
            ParsedQuery with extracted metrics, dimensions, filters
        """
        ctx = get_logging_context()
        ctx.debug("Parsing semantic query", query=query_string)

        result = ParsedQuery(raw_query=query_string)
        query = query_string.strip()

        where_match = re.search(r"\s+WHERE\s+(.+)$", query, re.IGNORECASE)
        if where_match:
            result.filters = [where_match.group(1).strip()]
            query = query[: where_match.start()]

        by_match = re.search(r"\s+BY\s+(.+)$", query, re.IGNORECASE)
        if by_match:
            dim_part = by_match.group(1).strip()
            result.dimensions = [d.strip().lower() for d in dim_part.split(",")]
            query = query[: by_match.start()]

        metric_part = query.strip()
        if metric_part:
            result.metrics = [m.strip().lower() for m in metric_part.split(",")]

        ctx.debug(
            "Parsed semantic query",
            metrics=result.metrics,
            dimensions=result.dimensions,
            filters=result.filters,
        )

        return result
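    # Example (hypothetical names): parse() peels off the optional WHERE
    # clause, then the optional BY clause, and splits the remainder on commas,
    # lowercasing metric and dimension names:
    #
    #     parsed = query.parse("Revenue, Order_Count BY region WHERE year >= 2024")
    #     parsed.metrics     # ["revenue", "order_count"]
    #     parsed.dimensions  # ["region"]
    #     parsed.filters     # ["year >= 2024"]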
    def validate(self, parsed: ParsedQuery) -> List[str]:
        """
        Validate a parsed query against the semantic layer config.

        Args:
            parsed: ParsedQuery to validate

        Returns:
            List of validation errors (empty if valid)
        """
        errors = []

        for metric_name in parsed.metrics:
            if metric_name not in self._metric_cache:
                available = list(self._metric_cache.keys())
                errors.append(f"Unknown metric '{metric_name}'. Available: {available}")

        for dim_name in parsed.dimensions:
            if dim_name not in self._dimension_cache:
                available = list(self._dimension_cache.keys())
                errors.append(f"Unknown dimension '{dim_name}'. Available: {available}")

        if not parsed.metrics:
            errors.append("At least one metric is required")

        return errors

    def generate_sql(self, parsed: ParsedQuery) -> Tuple[str, str]:
        """
        Generate SQL from a parsed query.

        Args:
            parsed: ParsedQuery with metrics and dimensions

        Returns:
            Tuple of (SQL query, source table name)
        """
        if not parsed.metrics:
            raise ValueError("At least one metric is required")

        metric_defs = [self._metric_cache[m] for m in parsed.metrics]

        all_component_metrics = set()
        for metric_def in metric_defs:
            if metric_def.type == MetricType.DERIVED and metric_def.components:
                for comp_name in metric_def.components:
                    comp_metric = self._metric_cache.get(comp_name.lower())
                    if comp_metric:
                        all_component_metrics.add(comp_name.lower())

        sources = set()
        for m in metric_defs:
            if m.source:
                sources.add(m.source)
        for comp_name in all_component_metrics:
            comp_metric = self._metric_cache.get(comp_name)
            if comp_metric and comp_metric.source:
                sources.add(comp_metric.source)

        if not sources:
            raise ValueError("No source table found for metrics")

        source_table = list(sources)[0]

        select_parts = []

        for dim_name in parsed.dimensions:
            dim_def = self._dimension_cache.get(dim_name)
            if dim_def:
                col = dim_def.get_column()
                select_parts.append(col)
            else:
                select_parts.append(dim_name)

        for comp_name in all_component_metrics:
            comp_metric = self._metric_cache.get(comp_name)
            if comp_metric and comp_metric.expr:
                select_parts.append(f"{comp_metric.expr} AS {comp_name}")

        for metric_def in metric_defs:
            if metric_def.type == MetricType.DERIVED:
                formula_sql = self._build_derived_formula_sql(metric_def)
                select_parts.append(f"{formula_sql} AS {metric_def.name}")
            elif metric_def.name not in all_component_metrics:
                select_parts.append(f"{metric_def.expr} AS {metric_def.name}")

        select_clause = ", ".join(select_parts) if select_parts else "*"

        all_filters = []
        for metric_def in metric_defs:
            all_filters.extend(metric_def.filters)
        all_filters.extend(parsed.filters)

        where_clause = ""
        if all_filters:
            where_clause = " WHERE " + " AND ".join(f"({f})" for f in all_filters)

        group_by_clause = ""
        if parsed.dimensions:
            group_cols = []
            for dim_name in parsed.dimensions:
                dim_def = self._dimension_cache.get(dim_name)
                if dim_def:
                    group_cols.append(dim_def.get_column())
                else:
                    group_cols.append(dim_name)
            group_by_clause = " GROUP BY " + ", ".join(group_cols)

        sql = f"SELECT {select_clause} FROM {source_table}{where_clause}{group_by_clause}"

        return sql, source_table
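    # Example (hypothetical definitions): assuming a simple metric
    # revenue = SUM(amount) with source "orders" and a dimension region whose
    # get_column() returns "region", the query "revenue BY region" becomes:
    #
    #     SELECT region, SUM(amount) AS revenue FROM orders GROUP BY region
    #
    # Metric-level filters and the parsed WHERE clause are ANDed into a
    # single WHERE clause.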
    def _build_derived_formula_sql(self, metric_def: MetricDefinition) -> str:
        """
        Build SQL for a derived metric formula.

        Replaces component names with their aggregation expressions
        and wraps divisors with NULLIF to prevent division by zero.

        Args:
            metric_def: The derived metric definition

        Returns:
            SQL expression string
        """
        if not metric_def.formula or not metric_def.components:
            raise ValueError(f"Derived metric '{metric_def.name}' missing formula or components")

        formula = metric_def.formula

        component_exprs = {}
        for comp_name in metric_def.components:
            comp_metric = self._metric_cache.get(comp_name.lower())
            if comp_metric and comp_metric.expr:
                component_exprs[comp_name.lower()] = comp_metric.expr

        sorted_names = sorted(component_exprs.keys(), key=len, reverse=True)
        result = formula
        for name in sorted_names:
            result = result.replace(name, component_exprs[name])

        result = self._wrap_divisors_with_nullif(result)

        return result
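    # Example (hypothetical definitions): a derived metric margin with
    # formula "profit / revenue" and components profit = SUM(profit_amount),
    # revenue = SUM(amount) is expanded longest-name-first (so a component
    # name that is a prefix of another cannot clobber it):
    #
    #     "profit / revenue"
    #     -> "SUM(profit_amount) / SUM(amount)"
    #     -> "SUM(profit_amount) / NULLIF(SUM(amount), 0)"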
    def _wrap_divisors_with_nullif(self, expr: str) -> str:
        """
        Wrap division operands with NULLIF to prevent division by zero.

        Handles patterns like:
            - expr / SUM(x) -> expr / NULLIF(SUM(x), 0)
            - (a - b) / SUM(x) -> (a - b) / NULLIF(SUM(x), 0)

        Args:
            expr: SQL expression string

        Returns:
            Expression with NULLIF wrapping divisors
        """
        pattern = r"/\s*(\([^)]+\)|SUM\([^)]+\)|COUNT\([^)]+\)|AVG\([^)]+\)|[A-Za-z_][A-Za-z0-9_]*)"
        matches = list(re.finditer(pattern, expr, re.IGNORECASE))

        for match in reversed(matches):
            divisor = match.group(1)
            if not divisor.upper().startswith("NULLIF"):
                start, end = match.span()
                new_text = f"/ NULLIF({divisor}, 0)"
                expr = expr[:start] + new_text + expr[end:]

        return expr
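    # Examples of the rewriting (matches are applied right-to-left so earlier
    # spans stay valid; divisors already wrapped in NULLIF are skipped, which
    # makes the transformation idempotent):
    #
    #     "SUM(a) / COUNT(*)"  -> "SUM(a) / NULLIF(COUNT(*), 0)"
    #     "x / y / z"          -> "x / NULLIF(y, 0) / NULLIF(z, 0)"
    #     "a / NULLIF(b, 0)"   -> unchanged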
    def execute(
        self,
        query_string: str,
        context: EngineContext,
        source_df: Optional[Any] = None,
    ) -> QueryResult:
        """
        Execute a semantic query.

        Args:
            query_string: Semantic query string (e.g., "revenue BY region")
            context: EngineContext for execution
            source_df: Optional source DataFrame (overrides context lookup)

        Returns:
            QueryResult with DataFrame and metadata
        """
        ctx = get_logging_context()
        start_time = time.time()

        ctx.info("Executing semantic query", query=query_string)

        parsed = self.parse(query_string)
        errors = self.validate(parsed)
        if errors:
            raise ValueError(f"Invalid semantic query: {'; '.join(errors)}")

        sql, source_table = self.generate_sql(parsed)
        ctx.debug("Generated SQL", sql=sql, source=source_table)

        if source_df is None:
            try:
                source_df = context.get(source_table)
            except KeyError:
                raise ValueError(f"Source table '{source_table}' not found in context")

        result_df = self._execute_query(context, source_df, parsed)

        if context.engine_type == EngineType.SPARK:
            row_count = result_df.count()
        else:
            row_count = len(result_df)

        elapsed_ms = (time.time() - start_time) * 1000

        ctx.info(
            "Semantic query completed",
            query=query_string,
            rows=row_count,
            elapsed_ms=round(elapsed_ms, 2),
        )

        return QueryResult(
            df=result_df,
            metrics=parsed.metrics,
            dimensions=parsed.dimensions,
            row_count=row_count,
            elapsed_ms=elapsed_ms,
            sql_generated=sql,
        )
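    # Round-trip sketch (hypothetical names): with an EngineContext holding an
    # "orders" table and the definitions from the examples above,
    #
    #     result = SemanticQuery(config).execute("revenue BY region", context)
    #     result.df             # aggregated DataFrame (engine-native type)
    #     result.sql_generated  # "SELECT region, SUM(amount) AS revenue FROM orders GROUP BY region"
    #     result.row_count      # number of result groups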
    def _execute_query(
        self,
        context: EngineContext,
        source_df: Any,
        parsed: ParsedQuery,
    ) -> Any:
        """Execute the query using the appropriate engine."""
        if context.engine_type == EngineType.SPARK:
            return self._execute_spark(context, source_df, parsed)
        elif context.engine_type == EngineType.POLARS:
            return self._execute_polars(source_df, parsed)
        else:
            return self._execute_pandas(source_df, parsed)

    def _execute_spark(
        self,
        context: EngineContext,
        source_df: Any,
        parsed: ParsedQuery,
    ) -> Any:
        """Execute query using Spark."""
        from pyspark.sql import functions as F

        df = source_df

        all_filters = []
        for metric_name in parsed.metrics:
            metric_def = self._metric_cache.get(metric_name)
            if metric_def:
                all_filters.extend(metric_def.filters)
        all_filters.extend(parsed.filters)

        for filter_expr in all_filters:
            df = df.filter(filter_expr)

        group_cols = []
        for dim_name in parsed.dimensions:
            dim_def = self._dimension_cache.get(dim_name)
            if dim_def:
                group_cols.append(F.col(dim_def.get_column()))
            else:
                group_cols.append(F.col(dim_name))

        component_metrics = set()
        derived_metrics = []
        simple_metrics = []

        for metric_name in parsed.metrics:
            metric_def = self._metric_cache.get(metric_name)
            if metric_def:
                if metric_def.type == MetricType.DERIVED:
                    derived_metrics.append(metric_def)
                    if metric_def.components:
                        for comp in metric_def.components:
                            component_metrics.add(comp.lower())
                else:
                    simple_metrics.append(metric_def)

        agg_exprs = []
        for comp_name in component_metrics:
            comp_def = self._metric_cache.get(comp_name)
            if comp_def and comp_def.expr:
                agg_exprs.append(F.expr(comp_def.expr).alias(comp_name))

        for metric_def in simple_metrics:
            if metric_def.name not in component_metrics and metric_def.expr:
                agg_exprs.append(F.expr(metric_def.expr).alias(metric_def.name))

        if group_cols:
            result = df.groupBy(group_cols).agg(*agg_exprs)
        else:
            result = df.agg(*agg_exprs)

        for derived in derived_metrics:
            formula_expr = self._build_pandas_derived_formula(derived)
            result = result.withColumn(derived.name, F.expr(formula_expr))

        return result
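    # Sketch of the Spark flow (hypothetical names): component metrics are
    # aggregated first, then each derived metric is added as a
    # post-aggregation column, e.g. for margin = profit / revenue:
    #
    #     df.groupBy([F.col("region")])
    #       .agg(F.expr("SUM(profit_amount)").alias("profit"),
    #            F.expr("SUM(amount)").alias("revenue"))
    #       .withColumn("margin", F.expr("profit / NULLIF(revenue, 0)"))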
    def _execute_polars(self, source_df: Any, parsed: ParsedQuery) -> Any:
        """Execute query using Polars."""
        import polars as pl

        df = source_df
        if isinstance(df, pl.LazyFrame):
            df = df.collect()

        all_filters = []
        for metric_name in parsed.metrics:
            metric_def = self._metric_cache.get(metric_name)
            if metric_def:
                all_filters.extend(metric_def.filters)
        all_filters.extend(parsed.filters)

        for filter_expr in all_filters:
            df = df.filter(pl.sql_expr(filter_expr))

        group_cols = []
        for dim_name in parsed.dimensions:
            dim_def = self._dimension_cache.get(dim_name)
            if dim_def:
                group_cols.append(dim_def.get_column())
            else:
                group_cols.append(dim_name)

        component_metrics = set()
        derived_metrics = []
        simple_metrics = []

        for metric_name in parsed.metrics:
            metric_def = self._metric_cache.get(metric_name)
            if metric_def:
                if metric_def.type == MetricType.DERIVED:
                    derived_metrics.append(metric_def)
                    if metric_def.components:
                        for comp in metric_def.components:
                            component_metrics.add(comp.lower())
                else:
                    simple_metrics.append(metric_def)

        agg_exprs = []
        for comp_name in component_metrics:
            comp_def = self._metric_cache.get(comp_name)
            if comp_def and comp_def.expr:
                col, func = self._parse_pandas_agg(comp_def.expr)
                agg_exprs.append(self._polars_agg_expr(col, func, comp_name))

        for metric_def in simple_metrics:
            if metric_def.name not in component_metrics and metric_def.expr:
                col, func = self._parse_pandas_agg(metric_def.expr)
                agg_exprs.append(self._polars_agg_expr(col, func, metric_def.name))

        if group_cols:
            result = df.group_by(group_cols).agg(agg_exprs)
        else:
            result = df.select(agg_exprs)

        for derived in derived_metrics:
            result = self._apply_polars_derived_formula(result, derived)

        return result

    def _polars_agg_expr(self, col: str, func: str, alias: str) -> Any:
        """Build a Polars aggregation expression."""
        import polars as pl

        if col == "*":
            return pl.len().alias(alias)

        if func == "sum":
            return pl.col(col).sum().alias(alias)
        elif func == "mean":
            return pl.col(col).mean().alias(alias)
        elif func == "count":
            return pl.col(col).count().alias(alias)
        elif func == "min":
            return pl.col(col).min().alias(alias)
        elif func == "max":
            return pl.col(col).max().alias(alias)
        else:
            return pl.col(col).sum().alias(alias)

    def _apply_polars_derived_formula(self, df: Any, metric_def: MetricDefinition) -> Any:
        """
        Apply a derived metric formula to a Polars DataFrame.

        Args:
            df: DataFrame with component metrics already calculated
            metric_def: The derived metric definition

        Returns:
            DataFrame with the derived metric column added
        """
        import polars as pl

        if not metric_def.formula or not metric_def.components:
            raise ValueError(f"Derived metric '{metric_def.name}' missing formula or components")

        formula = metric_def.formula

        expr_parts = {}
        for comp_name in metric_def.components:
            comp_lower = comp_name.lower()
            if comp_lower in df.columns:
                expr_parts[comp_lower] = pl.col(comp_lower)

        try:
            result_expr = eval(formula, {"__builtins__": {}}, expr_parts)
            df = df.with_columns(result_expr.alias(metric_def.name))
        except ZeroDivisionError:
            df = df.with_columns(pl.lit(float("nan")).alias(metric_def.name))

        return df

    def _execute_pandas(self, source_df: Any, parsed: ParsedQuery) -> Any:
        """Execute query using Pandas."""

        df = source_df.copy()

        all_filters = []
        for metric_name in parsed.metrics:
            metric_def = self._metric_cache.get(metric_name)
            if metric_def:
                all_filters.extend(metric_def.filters)
        all_filters.extend(parsed.filters)

        for filter_expr in all_filters:
            df = df.query(filter_expr)

        group_cols = []
        for dim_name in parsed.dimensions:
            dim_def = self._dimension_cache.get(dim_name)
            if dim_def:
                group_cols.append(dim_def.get_column())
            else:
                group_cols.append(dim_name)

        component_metrics = set()
        derived_metrics = []
        simple_metric_names = []

        for metric_name in parsed.metrics:
            metric_def = self._metric_cache.get(metric_name)
            if metric_def:
                if metric_def.type == MetricType.DERIVED:
                    derived_metrics.append(metric_def)
                    if metric_def.components:
                        for comp in metric_def.components:
                            component_metrics.add(comp.lower())
                else:
                    simple_metric_names.append(metric_name)

        all_metrics_to_agg = list(component_metrics)
        for name in simple_metric_names:
            if name not in component_metrics:
                all_metrics_to_agg.append(name)

        if group_cols:
            result = self._pandas_groupby_agg(df, group_cols, all_metrics_to_agg)
        else:
            result = self._pandas_agg_all(df, all_metrics_to_agg)

        for derived in derived_metrics:
            result = self._apply_pandas_derived_formula(result, derived)

        return result

    def _apply_pandas_derived_formula(self, df: Any, metric_def: MetricDefinition) -> Any:
        """
        Apply a derived metric formula to a pandas DataFrame.

        Args:
            df: DataFrame with component metrics already calculated
            metric_def: The derived metric definition

        Returns:
            DataFrame with the derived metric column added
        """
        if not metric_def.formula or not metric_def.components:
            raise ValueError(f"Derived metric '{metric_def.name}' missing formula or components")

        formula = metric_def.formula

        local_vars = {}
        for comp_name in metric_def.components:
            comp_lower = comp_name.lower()
            if comp_lower in df.columns:
                local_vars[comp_lower] = df[comp_lower]

        try:
            df[metric_def.name] = eval(formula, {"__builtins__": {}}, local_vars)
        except ZeroDivisionError:
            df[metric_def.name] = float("nan")

        return df

    def _build_pandas_derived_formula(self, metric_def: MetricDefinition) -> str:
        """
        Build a formula expression for pandas/spark using column names.

        Wraps divisors with null protection for Spark SQL.

        Args:
            metric_def: The derived metric definition

        Returns:
            Formula expression string using column names with NULLIF protection
        """
        if not metric_def.formula:
            raise ValueError(f"Derived metric '{metric_def.name}' missing formula")

        formula = metric_def.formula
        result = self._wrap_divisors_with_nullif(formula)

        return result

    def _parse_pandas_agg(self, expr: str) -> Tuple[str, str]:
        """
        Parse a SQL aggregation expression to (column, function).

        Example: "SUM(total_amount)" -> ("total_amount", "sum")
        """
        expr_stripped = expr.strip()

        match = re.match(r"(\w+)\(([^)]+)\)", expr_stripped, re.IGNORECASE)
        if match:
            func = match.group(1).upper()
            col = match.group(2).strip()

            func_map = {
                "SUM": "sum",
                "AVG": "mean",
                "COUNT": "count",
                "MIN": "min",
                "MAX": "max",
            }

            return (col, func_map.get(func, func.lower()))

        return (expr_stripped, "first")

    def _pandas_groupby_agg(self, df: Any, group_cols: List[str], metric_names: List[str]) -> Any:
        """Execute pandas groupby aggregation."""
        import pandas as pd

        agg_operations = {}

        for metric_name in metric_names:
            metric_def = self._metric_cache.get(metric_name)
            if metric_def:
                col, func = self._parse_pandas_agg(metric_def.expr)

                if col == "*":
                    first_col = df.columns[0]
                    if metric_name not in agg_operations:
                        agg_operations[metric_name] = (first_col, "count")
                else:
                    agg_operations[metric_name] = (col, func)

        if not agg_operations:
            return df.groupby(group_cols, as_index=False).size()

        grouped = df.groupby(group_cols, as_index=False)

        result_frames = []
        for metric_name, (col, func) in agg_operations.items():
            if func == "count" and col == df.columns[0]:
                agg_result = grouped.size().rename(columns={"size": metric_name})
            else:
                agg_result = grouped.agg(**{metric_name: (col, func)})
            result_frames.append(agg_result)

        if len(result_frames) == 1:
            return result_frames[0]

        result = result_frames[0]
        for frame in result_frames[1:]:
            new_cols = [c for c in frame.columns if c not in group_cols]
            result = pd.merge(result, frame[group_cols + new_cols], on=group_cols)

        return result
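    # Sketch (hypothetical names): for metrics revenue = SUM(amount) and
    # order_count = COUNT(*) grouped by ["region"], one frame per metric is
    # produced and the frames are merged on the group keys:
    #
    #     grouped.agg(revenue=("amount", "sum"))
    #     grouped.size().rename(columns={"size": "order_count"})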
    def _pandas_agg_all(self, df: Any, metric_names: List[str]) -> Any:
        """Execute pandas aggregation without grouping."""
        import pandas as pd

        results = {}

        for metric_name in metric_names:
            metric_def = self._metric_cache.get(metric_name)
            if metric_def:
                col, func = self._parse_pandas_agg(metric_def.expr)

                if col == "*":
                    results[metric_name] = len(df)
                elif func == "sum":
                    results[metric_name] = df[col].sum()
                elif func == "mean":
                    results[metric_name] = df[col].mean()
                elif func == "count":
                    results[metric_name] = df[col].count()
                elif func == "min":
                    results[metric_name] = df[col].min()
                elif func == "max":
                    results[metric_name] = df[col].max()
                else:
                    results[metric_name] = df[col].agg(func)

        return pd.DataFrame([results])
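The parsing and validation layer can be exercised without any engine or data. A minimal sketch, assuming only what the source above shows (construction iterates config.metrics and config.dimensions and reads each entry's .name, so SimpleNamespace stand-ins suffice; a real setup would pass a SemanticLayerConfig):

from types import SimpleNamespace

from odibi.semantics.query import SemanticQuery

# Stand-in config objects: only .metrics, .dimensions, and .name are read
# during construction; parse() and validate() touch nothing else.
config = SimpleNamespace(
    metrics=[SimpleNamespace(name="revenue")],
    dimensions=[SimpleNamespace(name="region")],
)
query = SemanticQuery(config)

parsed = query.parse("revenue BY region WHERE year >= 2024")
print(parsed.metrics)          # ['revenue']
print(parsed.dimensions)       # ['region']
print(parsed.filters)          # ['year >= 2024']
print(query.validate(parsed))  # [] -> valid against this config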