odibi-2.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/semantics/query.py (new file)
@@ -0,0 +1,743 @@
+ """
+ Semantic Query Module
+ =====================
+
+ Parse and execute semantic queries in the format:
+     "metric1, metric2 BY dimension1, dimension2"
+
+ Example:
+     "revenue, order_count BY region, month"
+
+ This generates SQL-like aggregation queries from semantic definitions.
+ """
+
+ import re
+ import time
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from odibi.context import EngineContext
+ from odibi.enums import EngineType
+ from odibi.semantics.metrics import (
+     DimensionDefinition,
+     MetricDefinition,
+     MetricType,
+     SemanticLayerConfig,
+ )
+ from odibi.utils.logging_context import get_logging_context
+
+
+ @dataclass
+ class ParsedQuery:
+     """Result of parsing a semantic query string."""
+
+     metrics: List[str] = field(default_factory=list)
+     dimensions: List[str] = field(default_factory=list)
+     filters: List[str] = field(default_factory=list)
+     raw_query: str = ""
+
+
+ @dataclass
+ class QueryResult:
+     """Result of executing a semantic query."""
+
+     df: Any
+     metrics: List[str]
+     dimensions: List[str]
+     row_count: int
+     elapsed_ms: float
+     sql_generated: Optional[str] = None
+
+
+ class SemanticQuery:
+     """
+     Execute semantic queries against a configured semantic layer.
+
+     Usage:
+         config = SemanticLayerConfig(...)
+         query = SemanticQuery(config)
+         result = query.execute("revenue BY region, month", context)
+     """
+
+     def __init__(self, config: SemanticLayerConfig):
+         """
+         Initialize with semantic layer configuration.
+
+         Args:
+             config: SemanticLayerConfig with metrics and dimensions
+         """
+         self.config = config
+         self._metric_cache: Dict[str, MetricDefinition] = {}
+         self._dimension_cache: Dict[str, DimensionDefinition] = {}
+
+         for metric in config.metrics:
+             self._metric_cache[metric.name] = metric
+
+         for dim in config.dimensions:
+             self._dimension_cache[dim.name] = dim
+
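Both caches are keyed by each definition's `name` exactly as configured, while `parse` (below) lowercases every token it extracts, so metric and dimension names are effectively expected in lowercase. A minimal configuration sketch follows; the keyword-argument constructors are an assumption, and only the attribute names this module actually reads (`name`, `expr`, `source`, `type`, `filters`, `formula`, `components`) come from the code.

```python
# Hypothetical construction: constructor signatures are assumed;
# attribute names are the ones SemanticQuery reads.
from odibi.semantics.metrics import (
    DimensionDefinition,
    MetricDefinition,
    MetricType,
    SemanticLayerConfig,
)
from odibi.semantics.query import SemanticQuery

config = SemanticLayerConfig(
    metrics=[
        MetricDefinition(name="revenue", expr="SUM(amount)", source="orders"),
        MetricDefinition(name="cost", expr="SUM(cost_amount)", source="orders"),
        MetricDefinition(
            name="margin",
            type=MetricType.DERIVED,
            formula="(revenue - cost) / revenue",
            components=["revenue", "cost"],
        ),
    ],
    dimensions=[DimensionDefinition(name="region")],
)
query = SemanticQuery(config)
```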
+     def parse(self, query_string: str) -> ParsedQuery:
+         """
+         Parse a semantic query string.
+
+         Format: "metric1, metric2 BY dimension1, dimension2 WHERE condition"
+
+         Args:
+             query_string: Query string to parse
+
+         Returns:
+             ParsedQuery with extracted metrics, dimensions, filters
+         """
+         ctx = get_logging_context()
+         ctx.debug("Parsing semantic query", query=query_string)
+
+         result = ParsedQuery(raw_query=query_string)
+         query = query_string.strip()
+
+         where_match = re.search(r"\s+WHERE\s+(.+)$", query, re.IGNORECASE)
+         if where_match:
+             result.filters = [where_match.group(1).strip()]
+             query = query[: where_match.start()]
+
+         by_match = re.search(r"\s+BY\s+(.+)$", query, re.IGNORECASE)
+         if by_match:
+             dim_part = by_match.group(1).strip()
+             result.dimensions = [d.strip().lower() for d in dim_part.split(",")]
+             query = query[: by_match.start()]
+
+         metric_part = query.strip()
+         if metric_part:
+             result.metrics = [m.strip().lower() for m in metric_part.split(",")]
+
+         ctx.debug(
+             "Parsed semantic query",
+             metrics=result.metrics,
+             dimensions=result.dimensions,
+             filters=result.filters,
+         )
+
+         return result
+
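Continuing the sketch above, a full query string decomposes as follows: a single trailing `WHERE` clause is captured whole as one filter, the `BY` clause yields the dimension list, and whatever remains is the comma-separated metric list.

```python
parsed = query.parse("revenue, margin BY region WHERE region != 'TEST'")
assert parsed.metrics == ["revenue", "margin"]
assert parsed.dimensions == ["region"]
assert parsed.filters == ["region != 'TEST'"]
```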
+     def validate(self, parsed: ParsedQuery) -> List[str]:
+         """
+         Validate a parsed query against the semantic layer config.
+
+         Args:
+             parsed: ParsedQuery to validate
+
+         Returns:
+             List of validation errors (empty if valid)
+         """
+         errors = []
+
+         for metric_name in parsed.metrics:
+             if metric_name not in self._metric_cache:
+                 available = list(self._metric_cache.keys())
+                 errors.append(f"Unknown metric '{metric_name}'. Available: {available}")
+
+         for dim_name in parsed.dimensions:
+             if dim_name not in self._dimension_cache:
+                 available = list(self._dimension_cache.keys())
+                 errors.append(f"Unknown dimension '{dim_name}'. Available: {available}")
+
+         if not parsed.metrics:
+             errors.append("At least one metric is required")
+
+         return errors
+
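Validation only checks names against the two caches; it does not inspect filter expressions. Continuing the sketch:

```python
errors = query.validate(query.parse("revnue BY region"))
# ["Unknown metric 'revnue'. Available: ['revenue', 'cost', 'margin']"]
assert not query.validate(query.parse("revenue BY region"))
```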
+     def generate_sql(self, parsed: ParsedQuery) -> Tuple[str, str]:
+         """
+         Generate SQL from a parsed query.
+
+         Args:
+             parsed: ParsedQuery with metrics and dimensions
+
+         Returns:
+             Tuple of (SQL query, source table name)
+         """
+         if not parsed.metrics:
+             raise ValueError("At least one metric is required")
+
+         metric_defs = [self._metric_cache[m] for m in parsed.metrics]
+
+         all_component_metrics = set()
+         for metric_def in metric_defs:
+             if metric_def.type == MetricType.DERIVED and metric_def.components:
+                 for comp_name in metric_def.components:
+                     comp_metric = self._metric_cache.get(comp_name.lower())
+                     if comp_metric:
+                         all_component_metrics.add(comp_name.lower())
+
+         sources = set()
+         for m in metric_defs:
+             if m.source:
+                 sources.add(m.source)
+         for comp_name in all_component_metrics:
+             comp_metric = self._metric_cache.get(comp_name)
+             if comp_metric and comp_metric.source:
+                 sources.add(comp_metric.source)
+
+         if not sources:
+             raise ValueError("No source table found for metrics")
+
+         source_table = list(sources)[0]
+
+         select_parts = []
+
+         for dim_name in parsed.dimensions:
+             dim_def = self._dimension_cache.get(dim_name)
+             if dim_def:
+                 col = dim_def.get_column()
+                 select_parts.append(col)
+             else:
+                 select_parts.append(dim_name)
+
+         for comp_name in all_component_metrics:
+             comp_metric = self._metric_cache.get(comp_name)
+             if comp_metric and comp_metric.expr:
+                 select_parts.append(f"{comp_metric.expr} AS {comp_name}")
+
+         for metric_def in metric_defs:
+             if metric_def.type == MetricType.DERIVED:
+                 formula_sql = self._build_derived_formula_sql(metric_def)
+                 select_parts.append(f"{formula_sql} AS {metric_def.name}")
+             elif metric_def.name not in all_component_metrics:
+                 select_parts.append(f"{metric_def.expr} AS {metric_def.name}")
+
+         select_clause = ", ".join(select_parts) if select_parts else "*"
+
+         all_filters = []
+         for metric_def in metric_defs:
+             all_filters.extend(metric_def.filters)
+         all_filters.extend(parsed.filters)
+
+         where_clause = ""
+         if all_filters:
+             where_clause = " WHERE " + " AND ".join(f"({f})" for f in all_filters)
+
+         group_by_clause = ""
+         if parsed.dimensions:
+             group_cols = []
+             for dim_name in parsed.dimensions:
+                 dim_def = self._dimension_cache.get(dim_name)
+                 if dim_def:
+                     group_cols.append(dim_def.get_column())
+                 else:
+                     group_cols.append(dim_name)
+             group_by_clause = " GROUP BY " + ", ".join(group_cols)
+
+         sql = f"SELECT {select_clause} FROM {source_table}{where_clause}{group_by_clause}"
+
+         return sql, source_table
+
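For the sketch config, a simple metric renders directly. This assumes `DimensionDefinition.get_column()` falls back to the dimension name and that `filters` defaults to an empty list:

```python
sql, table = query.generate_sql(query.parse("revenue BY region"))
assert table == "orders"
assert sql == "SELECT region, SUM(amount) AS revenue FROM orders GROUP BY region"
```

For `"margin BY region"` the component aggregates (`revenue`, `cost`) are selected alongside the inlined formula, though their order in the SELECT list is not deterministic because components are collected into a `set`.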
+     def _build_derived_formula_sql(self, metric_def: MetricDefinition) -> str:
+         """
+         Build SQL for a derived metric formula.
+
+         Replaces component names with their aggregation expressions
+         and wraps divisors with NULLIF to prevent division by zero.
+
+         Args:
+             metric_def: The derived metric definition
+
+         Returns:
+             SQL expression string
+         """
+         if not metric_def.formula or not metric_def.components:
+             raise ValueError(f"Derived metric '{metric_def.name}' missing formula or components")
+
+         formula = metric_def.formula
+
+         component_exprs = {}
+         for comp_name in metric_def.components:
+             comp_metric = self._metric_cache.get(comp_name.lower())
+             if comp_metric and comp_metric.expr:
+                 component_exprs[comp_name.lower()] = comp_metric.expr
+
+         sorted_names = sorted(component_exprs.keys(), key=len, reverse=True)
+         result = formula
+         for name in sorted_names:
+             result = result.replace(name, component_exprs[name])
+
+         result = self._wrap_divisors_with_nullif(result)
+
+         return result
+
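Substitution runs longest-name-first so a shorter component name cannot clobber a longer one that contains it; note it is plain `str.replace`, so a component name occurring inside another identifier in the formula would still be rewritten. For the sketch's `margin` metric:

```python
query._build_derived_formula_sql(query._metric_cache["margin"])
# "(SUM(amount) - SUM(cost_amount)) / NULLIF(SUM(amount), 0)"
```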
+     def _wrap_divisors_with_nullif(self, expr: str) -> str:
+         """
+         Wrap division operands with NULLIF to prevent division by zero.
+
+         Handles patterns like:
+             - expr / SUM(x) -> expr / NULLIF(SUM(x), 0)
+             - (a - b) / SUM(x) -> (a - b) / NULLIF(SUM(x), 0)
+
+         Args:
+             expr: SQL expression string
+
+         Returns:
+             Expression with NULLIF wrapping divisors
+         """
+         pattern = r"/\s*(\([^)]+\)|SUM\([^)]+\)|COUNT\([^)]+\)|AVG\([^)]+\)|[A-Za-z_][A-Za-z0-9_]*)"
+         matches = list(re.finditer(pattern, expr, re.IGNORECASE))
+
+         for match in reversed(matches):
+             divisor = match.group(1)
+             if not divisor.upper().startswith("NULLIF"):
+                 start, end = match.span()
+                 new_text = f"/ NULLIF({divisor}, 0)"
+                 expr = expr[:start] + new_text + expr[end:]
+
+         return expr
+
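The regex matches a parenthesized group, a `SUM`/`COUNT`/`AVG` call, or a bare identifier after each `/`, and skips divisors that are already wrapped:

```python
query._wrap_divisors_with_nullif("SUM(profit) / SUM(revenue)")
# "SUM(profit) / NULLIF(SUM(revenue), 0)"
query._wrap_divisors_with_nullif("a / NULLIF(b, 0)")
# unchanged: the divisor already starts with NULLIF
```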
+     def execute(
+         self,
+         query_string: str,
+         context: EngineContext,
+         source_df: Optional[Any] = None,
+     ) -> QueryResult:
+         """
+         Execute a semantic query.
+
+         Args:
+             query_string: Semantic query string (e.g., "revenue BY region")
+             context: EngineContext for execution
+             source_df: Optional source DataFrame (overrides context lookup)
+
+         Returns:
+             QueryResult with DataFrame and metadata
+         """
+         ctx = get_logging_context()
+         start_time = time.time()
+
+         ctx.info("Executing semantic query", query=query_string)
+
+         parsed = self.parse(query_string)
+         errors = self.validate(parsed)
+         if errors:
+             raise ValueError(f"Invalid semantic query: {'; '.join(errors)}")
+
+         sql, source_table = self.generate_sql(parsed)
+         ctx.debug("Generated SQL", sql=sql, source=source_table)
+
+         if source_df is None:
+             try:
+                 source_df = context.get(source_table)
+             except KeyError:
+                 raise ValueError(f"Source table '{source_table}' not found in context")
+
+         result_df = self._execute_query(context, source_df, parsed)
+
+         if context.engine_type == EngineType.SPARK:
+             row_count = result_df.count()
+         else:
+             row_count = len(result_df)
+
+         elapsed_ms = (time.time() - start_time) * 1000
+
+         ctx.info(
+             "Semantic query completed",
+             query=query_string,
+             rows=row_count,
+             elapsed_ms=round(elapsed_ms, 2),
+         )
+
+         return QueryResult(
+             df=result_df,
+             metrics=parsed.metrics,
+             dimensions=parsed.dimensions,
+             row_count=row_count,
+             elapsed_ms=elapsed_ms,
+             sql_generated=sql,
+         )
+
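End to end with the pandas engine, passing `source_df` directly so no context lookup is needed. Constructing an `EngineContext` is project-specific and not shown; `context` is assumed here to report `EngineType.PANDAS`:

```python
import pandas as pd

df = pd.DataFrame({
    "region": ["EMEA", "EMEA", "APAC"],
    "amount": [100.0, 50.0, 70.0],
    "cost_amount": [60.0, 30.0, 50.0],
})

result = query.execute("margin BY region", context, source_df=df)
# result.df has one row per region with revenue, cost, and margin columns
# result.sql_generated holds the SQL string from generate_sql()
# result.row_count == 2
```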
+     def _execute_query(
+         self,
+         context: EngineContext,
+         source_df: Any,
+         parsed: ParsedQuery,
+     ) -> Any:
+         """Execute the query using the appropriate engine."""
+         if context.engine_type == EngineType.SPARK:
+             return self._execute_spark(context, source_df, parsed)
+         elif context.engine_type == EngineType.POLARS:
+             return self._execute_polars(source_df, parsed)
+         else:
+             return self._execute_pandas(source_df, parsed)
+
+     def _execute_spark(
+         self,
+         context: EngineContext,
+         source_df: Any,
+         parsed: ParsedQuery,
+     ) -> Any:
+         """Execute query using Spark."""
+         from pyspark.sql import functions as F
+
+         df = source_df
+
+         all_filters = []
+         for metric_name in parsed.metrics:
+             metric_def = self._metric_cache.get(metric_name)
+             if metric_def:
+                 all_filters.extend(metric_def.filters)
+         all_filters.extend(parsed.filters)
+
+         for filter_expr in all_filters:
+             df = df.filter(filter_expr)
+
+         group_cols = []
+         for dim_name in parsed.dimensions:
+             dim_def = self._dimension_cache.get(dim_name)
+             if dim_def:
+                 group_cols.append(F.col(dim_def.get_column()))
+             else:
+                 group_cols.append(F.col(dim_name))
+
+         component_metrics = set()
+         derived_metrics = []
+         simple_metrics = []
+
+         for metric_name in parsed.metrics:
+             metric_def = self._metric_cache.get(metric_name)
+             if metric_def:
+                 if metric_def.type == MetricType.DERIVED:
+                     derived_metrics.append(metric_def)
+                     if metric_def.components:
+                         for comp in metric_def.components:
+                             component_metrics.add(comp.lower())
+                 else:
+                     simple_metrics.append(metric_def)
+
+         agg_exprs = []
+         for comp_name in component_metrics:
+             comp_def = self._metric_cache.get(comp_name)
+             if comp_def and comp_def.expr:
+                 agg_exprs.append(F.expr(comp_def.expr).alias(comp_name))
+
+         for metric_def in simple_metrics:
+             if metric_def.name not in component_metrics and metric_def.expr:
+                 agg_exprs.append(F.expr(metric_def.expr).alias(metric_def.name))
+
+         if group_cols:
+             result = df.groupBy(group_cols).agg(*agg_exprs)
+         else:
+             result = df.agg(*agg_exprs)
+
+         for derived in derived_metrics:
+             formula_expr = self._build_pandas_derived_formula(derived)
+             result = result.withColumn(derived.name, F.expr(formula_expr))
+
+         return result
+
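For the same `"margin BY region"` query, the Spark path builds roughly the following plan (a sketch, with `sdf` an assumed Spark DataFrame): component aggregates first, then the derived column from the NULLIF-protected formula.

```python
from pyspark.sql import functions as F

agg = sdf.groupBy(F.col("region")).agg(
    F.expr("SUM(amount)").alias("revenue"),
    F.expr("SUM(cost_amount)").alias("cost"),
)
out = agg.withColumn("margin", F.expr("(revenue - cost) / NULLIF(revenue, 0)"))
```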
+     def _execute_polars(self, source_df: Any, parsed: ParsedQuery) -> Any:
+         """Execute query using Polars."""
+         import polars as pl
+
+         df = source_df
+         if isinstance(df, pl.LazyFrame):
+             df = df.collect()
+
+         all_filters = []
+         for metric_name in parsed.metrics:
+             metric_def = self._metric_cache.get(metric_name)
+             if metric_def:
+                 all_filters.extend(metric_def.filters)
+         all_filters.extend(parsed.filters)
+
+         for filter_expr in all_filters:
+             df = df.filter(pl.sql_expr(filter_expr))
+
+         group_cols = []
+         for dim_name in parsed.dimensions:
+             dim_def = self._dimension_cache.get(dim_name)
+             if dim_def:
+                 group_cols.append(dim_def.get_column())
+             else:
+                 group_cols.append(dim_name)
+
+         component_metrics = set()
+         derived_metrics = []
+         simple_metrics = []
+
+         for metric_name in parsed.metrics:
+             metric_def = self._metric_cache.get(metric_name)
+             if metric_def:
+                 if metric_def.type == MetricType.DERIVED:
+                     derived_metrics.append(metric_def)
+                     if metric_def.components:
+                         for comp in metric_def.components:
+                             component_metrics.add(comp.lower())
+                 else:
+                     simple_metrics.append(metric_def)
+
+         agg_exprs = []
+         for comp_name in component_metrics:
+             comp_def = self._metric_cache.get(comp_name)
+             if comp_def and comp_def.expr:
+                 col, func = self._parse_pandas_agg(comp_def.expr)
+                 agg_exprs.append(self._polars_agg_expr(col, func, comp_name))
+
+         for metric_def in simple_metrics:
+             if metric_def.name not in component_metrics and metric_def.expr:
+                 col, func = self._parse_pandas_agg(metric_def.expr)
+                 agg_exprs.append(self._polars_agg_expr(col, func, metric_def.name))
+
+         if group_cols:
+             result = df.group_by(group_cols).agg(agg_exprs)
+         else:
+             result = df.select(agg_exprs)
+
+         for derived in derived_metrics:
+             result = self._apply_polars_derived_formula(result, derived)
+
+         return result
+
+     def _polars_agg_expr(self, col: str, func: str, alias: str) -> Any:
+         """Build a Polars aggregation expression."""
+         import polars as pl
+
+         if col == "*":
+             return pl.len().alias(alias)
+
+         if func == "sum":
+             return pl.col(col).sum().alias(alias)
+         elif func == "mean":
+             return pl.col(col).mean().alias(alias)
+         elif func == "count":
+             return pl.col(col).count().alias(alias)
+         elif func == "min":
+             return pl.col(col).min().alias(alias)
+         elif func == "max":
+             return pl.col(col).max().alias(alias)
+         else:
+             return pl.col(col).sum().alias(alias)
+
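Note that `*` maps to `pl.len()` regardless of `func`, and unrecognized functions fall back to `sum()`:

```python
query._polars_agg_expr("amount", "sum", "revenue")
# == pl.col("amount").sum().alias("revenue")
query._polars_agg_expr("*", "count", "order_count")
# == pl.len().alias("order_count")
```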
+     def _apply_polars_derived_formula(self, df: Any, metric_def: MetricDefinition) -> Any:
+         """
+         Apply a derived metric formula to a Polars DataFrame.
+
+         Args:
+             df: DataFrame with component metrics already calculated
+             metric_def: The derived metric definition
+
+         Returns:
+             DataFrame with the derived metric column added
+         """
+         import polars as pl
+
+         if not metric_def.formula or not metric_def.components:
+             raise ValueError(f"Derived metric '{metric_def.name}' missing formula or components")
+
+         formula = metric_def.formula
+
+         expr_parts = {}
+         for comp_name in metric_def.components:
+             comp_lower = comp_name.lower()
+             if comp_lower in df.columns:
+                 expr_parts[comp_lower] = pl.col(comp_lower)
+
+         try:
+             result_expr = eval(formula, {"__builtins__": {}}, expr_parts)
+             df = df.with_columns(result_expr.alias(metric_def.name))
+         except ZeroDivisionError:
+             df = df.with_columns(pl.lit(float("nan")).alias(metric_def.name))
+
+         return df
+
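The formula is `eval`-ed with each component name bound to `pl.col(...)`, so the arithmetic composes into a single Polars expression. Continuing the sketch with an already-aggregated frame:

```python
import polars as pl

agg = pl.DataFrame({"region": ["EMEA"], "revenue": [150.0], "cost": [90.0]})
out = query._apply_polars_derived_formula(agg, query._metric_cache["margin"])
# adds margin == (150.0 - 90.0) / 150.0 == 0.4
```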
+     def _execute_pandas(self, source_df: Any, parsed: ParsedQuery) -> Any:
+         """Execute query using Pandas."""
+
+         df = source_df.copy()
+
+         all_filters = []
+         for metric_name in parsed.metrics:
+             metric_def = self._metric_cache.get(metric_name)
+             if metric_def:
+                 all_filters.extend(metric_def.filters)
+         all_filters.extend(parsed.filters)
+
+         for filter_expr in all_filters:
+             df = df.query(filter_expr)
+
+         group_cols = []
+         for dim_name in parsed.dimensions:
+             dim_def = self._dimension_cache.get(dim_name)
+             if dim_def:
+                 group_cols.append(dim_def.get_column())
+             else:
+                 group_cols.append(dim_name)
+
+         component_metrics = set()
+         derived_metrics = []
+         simple_metric_names = []
+
+         for metric_name in parsed.metrics:
+             metric_def = self._metric_cache.get(metric_name)
+             if metric_def:
+                 if metric_def.type == MetricType.DERIVED:
+                     derived_metrics.append(metric_def)
+                     if metric_def.components:
+                         for comp in metric_def.components:
+                             component_metrics.add(comp.lower())
+                 else:
+                     simple_metric_names.append(metric_name)
+
+         all_metrics_to_agg = list(component_metrics)
+         for name in simple_metric_names:
+             if name not in component_metrics:
+                 all_metrics_to_agg.append(name)
+
+         if group_cols:
+             result = self._pandas_groupby_agg(df, group_cols, all_metrics_to_agg)
+         else:
+             result = self._pandas_agg_all(df, all_metrics_to_agg)
+
+         for derived in derived_metrics:
+             result = self._apply_pandas_derived_formula(result, derived)
+
+         return result
+
+     def _apply_pandas_derived_formula(self, df: Any, metric_def: MetricDefinition) -> Any:
+         """
+         Apply a derived metric formula to a pandas DataFrame.
+
+         Args:
+             df: DataFrame with component metrics already calculated
+             metric_def: The derived metric definition
+
+         Returns:
+             DataFrame with the derived metric column added
+         """
+         if not metric_def.formula or not metric_def.components:
+             raise ValueError(f"Derived metric '{metric_def.name}' missing formula or components")
+
+         formula = metric_def.formula
+
+         local_vars = {}
+         for comp_name in metric_def.components:
+             comp_lower = comp_name.lower()
+             if comp_lower in df.columns:
+                 local_vars[comp_lower] = df[comp_lower]
+
+         try:
+             df[metric_def.name] = eval(formula, {"__builtins__": {}}, local_vars)
+         except ZeroDivisionError:
+             df[metric_def.name] = float("nan")
+
+         return df
+
+     def _build_pandas_derived_formula(self, metric_def: MetricDefinition) -> str:
+         """
+         Build a formula expression for pandas/spark using column names.
+
+         Wraps divisors with null protection for Spark SQL.
+
+         Args:
+             metric_def: The derived metric definition
+
+         Returns:
+             Formula expression string using column names with NULLIF protection
+         """
+         if not metric_def.formula:
+             raise ValueError(f"Derived metric '{metric_def.name}' missing formula")
+
+         formula = metric_def.formula
+         result = self._wrap_divisors_with_nullif(formula)
+
+         return result
+
+     def _parse_pandas_agg(self, expr: str) -> Tuple[str, str]:
+         """
+         Parse a SQL aggregation expression to (column, function).
+
+         Example: "SUM(total_amount)" -> ("total_amount", "sum")
+         """
+         expr_stripped = expr.strip()
+
+         match = re.match(r"(\w+)\(([^)]+)\)", expr_stripped, re.IGNORECASE)
+         if match:
+             func = match.group(1).upper()
+             col = match.group(2).strip()
+
+             func_map = {
+                 "SUM": "sum",
+                 "AVG": "mean",
+                 "COUNT": "count",
+                 "MIN": "min",
+                 "MAX": "max",
+             }
+
+             return (col, func_map.get(func, func.lower()))
+
+         return (expr_stripped, "first")
+
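Despite the name, this parser also feeds the Polars path. Expressions that do not look like `FUNC(col)` fall back to `(expr, "first")`:

```python
query._parse_pandas_agg("SUM(total_amount)")  # ("total_amount", "sum")
query._parse_pandas_agg("AVG(price)")         # ("price", "mean")
query._parse_pandas_agg("COUNT(*)")           # ("*", "count")
query._parse_pandas_agg("total_amount")       # ("total_amount", "first")
```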
+     def _pandas_groupby_agg(self, df: Any, group_cols: List[str], metric_names: List[str]) -> Any:
+         """Execute pandas groupby aggregation."""
+         import pandas as pd
+
+         agg_operations = {}
+
+         for metric_name in metric_names:
+             metric_def = self._metric_cache.get(metric_name)
+             if metric_def:
+                 col, func = self._parse_pandas_agg(metric_def.expr)
+
+                 if col == "*":
+                     first_col = df.columns[0]
+                     if metric_name not in agg_operations:
+                         agg_operations[metric_name] = (first_col, "count")
+                 else:
+                     agg_operations[metric_name] = (col, func)
+
+         if not agg_operations:
+             return df.groupby(group_cols, as_index=False).size()
+
+         grouped = df.groupby(group_cols, as_index=False)
+
+         result_frames = []
+         for metric_name, (col, func) in agg_operations.items():
+             if func == "count" and col == df.columns[0]:
+                 agg_result = grouped.size().rename(columns={"size": metric_name})
+             else:
+                 agg_result = grouped.agg(**{metric_name: (col, func)})
+             result_frames.append(agg_result)
+
+         if len(result_frames) == 1:
+             return result_frames[0]
+
+         result = result_frames[0]
+         for frame in result_frames[1:]:
+             new_cols = [c for c in frame.columns if c not in group_cols]
+             result = pd.merge(result, frame[group_cols + new_cols], on=group_cols)
+
+         return result
+
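Each metric becomes a pandas named aggregation; multiple metrics are computed as separate frames and merged back on the group columns. For a single metric the call reduces to roughly:

```python
df.groupby(["region"], as_index=False).agg(revenue=("amount", "sum"))
```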
+     def _pandas_agg_all(self, df: Any, metric_names: List[str]) -> Any:
+         """Execute pandas aggregation without grouping."""
+         import pandas as pd
+
+         results = {}
+
+         for metric_name in metric_names:
+             metric_def = self._metric_cache.get(metric_name)
+             if metric_def:
+                 col, func = self._parse_pandas_agg(metric_def.expr)
+
+                 if col == "*":
+                     results[metric_name] = len(df)
+                 elif func == "sum":
+                     results[metric_name] = df[col].sum()
+                 elif func == "mean":
+                     results[metric_name] = df[col].mean()
+                 elif func == "count":
+                     results[metric_name] = df[col].count()
+                 elif func == "min":
+                     results[metric_name] = df[col].min()
+                 elif func == "max":
+                     results[metric_name] = df[col].max()
+                 else:
+                     results[metric_name] = df[col].agg(func)
+
+         return pd.DataFrame([results])
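Without a `BY` clause the pandas path reduces the whole frame to a single-row DataFrame. Continuing the sketch from `execute` above:

```python
result = query.execute("revenue", context, source_df=df)
# result.df is a one-row DataFrame: revenue == 220.0
# result.row_count == 1
```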