aetherdialect 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aetherdialect-0.1.0.dist-info/METADATA +197 -0
- aetherdialect-0.1.0.dist-info/RECORD +34 -0
- aetherdialect-0.1.0.dist-info/WHEEL +5 -0
- aetherdialect-0.1.0.dist-info/licenses/LICENSE +7 -0
- aetherdialect-0.1.0.dist-info/top_level.txt +1 -0
- text2sql/__init__.py +7 -0
- text2sql/config.py +1063 -0
- text2sql/contracts_base.py +952 -0
- text2sql/contracts_core.py +1890 -0
- text2sql/core_utils.py +834 -0
- text2sql/dialect.py +1134 -0
- text2sql/expansion_ops.py +1218 -0
- text2sql/expansion_rules.py +496 -0
- text2sql/intent_expr.py +1759 -0
- text2sql/intent_process.py +2133 -0
- text2sql/intent_repair.py +1733 -0
- text2sql/intent_resolve.py +1292 -0
- text2sql/live_testing.py +1117 -0
- text2sql/main_execution.py +799 -0
- text2sql/pipeline.py +1662 -0
- text2sql/qsim_ops.py +1286 -0
- text2sql/qsim_sample.py +609 -0
- text2sql/qsim_struct.py +569 -0
- text2sql/schema.py +973 -0
- text2sql/schema_profiling.py +2075 -0
- text2sql/simulator.py +970 -0
- text2sql/sql_gen.py +1537 -0
- text2sql/templates.py +1037 -0
- text2sql/text2sql.py +726 -0
- text2sql/utils.py +973 -0
- text2sql/validation_agg.py +1033 -0
- text2sql/validation_execute.py +1092 -0
- text2sql/validation_schema.py +1847 -0
- text2sql/validation_semantic.py +2122 -0
|
@@ -0,0 +1,2122 @@
|
|
|
1
|
+
"""Semantic and intent-level validation for SQL generation.
|
|
2
|
+
|
|
3
|
+
Validates grain consistency (scalar/grouped/row_level vs aggregation and GROUP BY), semantic contradictions (for example MAX and MIN on the same column or scalar grain with ``expected_rows`` many), and expression-vs-expression filter/HAVING type compatibility. Validates filter/HAVING placement (no aggregation in WHERE, aggregation required in HAVING), nested and mixed aggregation rules, ORDER BY aggregation context, and SELECT/GROUP BY membership. Validates CTE grain consistency and dependency grains, orchestrates full intent validation, and provides auto-repair for misplaced filter/HAVING conditions.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
from .config import (
|
|
11
|
+
AGG_KEYWORDS_RE,
|
|
12
|
+
AGG_QUANTITY_RE,
|
|
13
|
+
COMPATIBLE_TYPE_PAIRS,
|
|
14
|
+
COUNT_THRESHOLD_TABLE_RE,
|
|
15
|
+
NUMERIC_ONLY_AGGREGATIONS,
|
|
16
|
+
NUMERIC_RESULT_AGGS,
|
|
17
|
+
NUMERIC_RESULT_OPS,
|
|
18
|
+
NUMERIC_RESULT_SCALARS,
|
|
19
|
+
SCALAR_FUNCTIONS_NUMERIC,
|
|
20
|
+
VALID_AGG_FUNCS,
|
|
21
|
+
)
|
|
22
|
+
from .contracts_base import (
|
|
23
|
+
CteOutputColumnMeta,
|
|
24
|
+
IntentIssue,
|
|
25
|
+
SchemaGraph,
|
|
26
|
+
)
|
|
27
|
+
from .contracts_core import (
|
|
28
|
+
FilterParam,
|
|
29
|
+
HavingParam,
|
|
30
|
+
MulGroup,
|
|
31
|
+
NormalizedExpr,
|
|
32
|
+
OrderByCol,
|
|
33
|
+
RuntimeCteStep,
|
|
34
|
+
SelectCol,
|
|
35
|
+
)
|
|
36
|
+
from .core_utils import debug
|
|
37
|
+
from .validation_agg import (
|
|
38
|
+
expr_has_arithmetic,
|
|
39
|
+
expr_result_is_numeric,
|
|
40
|
+
strip_function_wrappers,
|
|
41
|
+
term_result_is_numeric,
|
|
42
|
+
)
|
|
43
|
+
from .validation_schema import (
|
|
44
|
+
extract_agg_col,
|
|
45
|
+
get_col_meta,
|
|
46
|
+
get_col_type,
|
|
47
|
+
is_col_arithmetic_role,
|
|
48
|
+
is_col_numeric,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def validate_grain_consistency(
    grain: str,
    select_cols: list[SelectCol],
    group_by_cols: list[NormalizedExpr],
    having_param: list[HavingParam],
    context: str = "main",
) -> list[IntentIssue]:
    """Check that the declared grain agrees with GROUP BY, aggregation, and HAVING usage.

    An unrecognised grain produces a single ``grain_validity`` issue and no further checks are run. Otherwise up to four ``grain_consistency`` checks are applied, in this order: grouped grain without GROUP BY, GROUP BY with a scalar/row_level grain, aggregated SELECT columns with a row_level grain, and HAVING with a row_level grain.

    Args:
        grain: Declared query grain (``"scalar"``, ``"grouped"``, or ``"row_level"``).
        select_cols: SELECT columns; only their ``is_aggregated`` flag and ``expr.primary_term`` are read.
        group_by_cols: GROUP BY expressions; their ``primary_column`` is reported in issue context.
        having_param: HAVING conditions; only their presence and count are used.
        context: Query context label embedded in issue ids and messages.

    Returns:
        List of ``IntentIssue`` instances, empty when the grain is consistent.
    """
    debug(
        f"[validation_semantic.validate_grain_consistency] grain={grain}, group_by={len(group_by_cols)}, having={len(having_param)}"
    )
    if grain not in {"scalar", "grouped", "row_level"}:
        # Nothing else can be judged against an unknown grain, so report and stop.
        return [
            IntentIssue(
                issue_id=f"invalid_grain_{grain}",
                category="grain_validity",
                severity="error",
                message=f"Invalid grain value '{grain}'. Allowed: scalar, grouped, row_level",
                context={"grain": grain, "location": context},
            )
        ]

    found: list[IntentIssue] = []
    any_aggregated = any(col.is_aggregated for col in select_cols)

    if grain == "grouped":
        if not group_by_cols:
            found.append(
                IntentIssue(
                    issue_id=f"grouped_without_group_by_{context}",
                    category="grain_consistency",
                    severity="error",
                    message=f"Grouped grain without GROUP BY columns in {context}",
                    context={"grain": grain, "location": context},
                )
            )
            debug("[validation_semantic.validate_grain_consistency] grouped grain without group_by")
    elif group_by_cols:
        # Past the validity check, a non-grouped grain is scalar or row_level.
        found.append(
            IntentIssue(
                issue_id=f"group_by_with_{grain}_{context}",
                category="grain_consistency",
                severity="error",
                message=f"GROUP BY columns present but grain={grain} in {context}",
                context={
                    "grain": grain,
                    "group_by": [gb.primary_column for gb in group_by_cols],
                    "location": context,
                },
            )
        )
        debug(f"[validation_semantic.validate_grain_consistency] group_by present but grain={grain}")

    if grain == "row_level":
        if any_aggregated:
            agg_terms = [col.expr.primary_term for col in select_cols if col.is_aggregated]
            found.append(
                IntentIssue(
                    issue_id=f"agg_with_row_level_{context}_{','.join(agg_terms)}",
                    category="grain_consistency",
                    severity="error",
                    message=f"Aggregation functions {agg_terms} with row_level grain in {context}",
                    context={"agg_funcs": agg_terms, "grain": grain, "location": context},
                )
            )
            debug("[validation_semantic.validate_grain_consistency] agg funcs with row_level grain")
        if having_param:
            # HAVING is accepted only for grouped and scalar grains.
            found.append(
                IntentIssue(
                    issue_id=f"having_without_agg_{grain}_{context}",
                    category="grain_consistency",
                    severity="error",
                    message=f"HAVING conditions without aggregation: grain={grain} but HAVING present in {context}",
                    context={
                        "grain": grain,
                        "having_count": len(having_param),
                        "location": context,
                    },
                )
            )
            debug(f"[validation_semantic.validate_grain_consistency] HAVING without aggregation: grain={grain}")

    debug(f"[validation_semantic.validate_grain_consistency] {len(found)} issues in {context}")
    return found
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def validate_grouped_requires_aggregation(
    grain: str,
    select_cols: list[SelectCol],
    group_by_cols: list[NormalizedExpr],
    context: str = "main",
) -> list[IntentIssue]:
    """Flag a grouped query whose SELECT list contains no aggregated column.

    The check applies only when ``grain`` is ``"grouped"`` and GROUP BY columns are present; a grouped grain with no GROUP BY columns is not reported here.

    Args:
        grain: Declared query grain.
        select_cols: SELECT columns; only their ``is_aggregated`` flag is read.
        group_by_cols: GROUP BY expressions; their ``primary_column`` is reported in issue context.
        context: Query context label embedded in the issue id and message.

    Returns:
        A one-element list with a ``grain_consistency`` error when the grouped query has no aggregation, otherwise an empty list.
    """
    check_applies = grain == "grouped" and bool(group_by_cols)
    if not check_applies or any(col.is_aggregated for col in select_cols):
        return []

    problem = IntentIssue(
        issue_id=f"grouped_without_aggregation_{context}",
        category="grain_consistency",
        severity="error",
        message=f"Grouped grain with GROUP BY but no aggregation in SELECT in {context}. Use row_level with DISTINCT instead.",
        context={
            "grain": grain,
            "group_by": [gb.primary_column for gb in group_by_cols],
            "location": context,
        },
    )
    debug(f"[validation_semantic.validate_grouped_requires_aggregation] grouped without aggregation in {context}")
    return [problem]
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def validate_semantic_contradictions(
    select_cols: list[SelectCol],
    natural_language: str,
    grain: str,
    expected_rows: str,
    context: str = "main",
) -> list[IntentIssue]:
    """Look for operations or phrasings in the intent that contradict each other.

    Three checks run in order: opposing aggregation functions among the aggregated SELECT columns (error), a scalar grain combined with ``expected_rows`` of ``"few"`` or ``"many"`` (error), and pairs of opposing phrases in the natural-language question (warning).

    Args:
        select_cols: SELECT columns; aggregated ones are passed through ``extract_agg_col`` to obtain the function name.
        natural_language: Original question; matched case-insensitively, and treated as empty when falsy.
        grain: Declared query grain.
        expected_rows: Expected row count hint from the intent.
        context: Query context label stored under ``"location"`` in issue context.

    Returns:
        List of ``IntentIssue`` instances describing the contradictions found.
    """
    found: list[IntentIssue] = []
    debug("[validation_semantic.validate_semantic_contradictions] checking for contradictions")

    # Collect the function names of aggregated SELECT columns, ignoring terms with no function.
    used_funcs = set()
    for col in select_cols:
        if not col.is_aggregated:
            continue
        func_name = extract_agg_col(col.expr.primary_term)[0]
        if func_name is not None:
            used_funcs.add(func_name)

    opposing_ops = (
        ({"highest", "max"}, {"lowest", "min"}),
        ({"most", "maximum"}, {"least", "minimum"}),
        ({"first", "earliest"}, {"last", "latest"}),
    )
    for side_a, side_b in opposing_ops:
        hits_a = side_a & used_funcs
        hits_b = side_b & used_funcs
        if not (hits_a and hits_b):
            continue
        found.append(
            IntentIssue(
                issue_id=f"contradictory_ops_{','.join(sorted(hits_a))}_{','.join(sorted(hits_b))}",
                category="semantic_contradiction",
                severity="error",
                message=f"Intent contains contradictory operations: {hits_a} and {hits_b}",
                context={
                    "ops1": list(hits_a),
                    "ops2": list(hits_b),
                    "location": context,
                },
            )
        )
        debug(f"[validation_semantic.validate_semantic_contradictions] CONTRADICTION: {hits_a} vs {hits_b}")

    if grain == "scalar" and expected_rows in {"few", "many"}:
        found.append(
            IntentIssue(
                issue_id=f"grain_expected_mismatch_scalar_{expected_rows}",
                category="semantic_contradiction",
                severity="error",
                message=f"Intent expects a single value (scalar) but also expects multiple rows ({expected_rows})",
                context={
                    "grain": grain,
                    "expected_rows": expected_rows,
                    "location": context,
                },
            )
        )
        debug(
            f"[validation_semantic.validate_semantic_contradictions] CONTRADICTION: grain=scalar but expected_rows={expected_rows}"
        )

    question = natural_language.lower() if natural_language else ""
    opposing_phrases = (
        ("never", "total"),
        ("no records", "count"),
        ("zero", "greater than"),
        ("empty", "count"),
    )
    for first_phrase, second_phrase in opposing_phrases:
        # Plain substring test: a phrase also matches inside a longer word.
        if first_phrase not in question or second_phrase not in question:
            continue
        found.append(
            IntentIssue(
                issue_id=f"nl_contradiction_{first_phrase.replace(' ', '_')}_{second_phrase.replace(' ', '_')}",
                category="semantic_contradiction",
                severity="warning",
                message=f"Intent may contain contradiction: mentions '{first_phrase}' and '{second_phrase}'",
                context={
                    "pattern1": first_phrase,
                    "pattern2": second_phrase,
                    "location": context,
                },
            )
        )
        debug(
            f"[validation_semantic.validate_semantic_contradictions] POTENTIAL CONTRADICTION: '{first_phrase}' and '{second_phrase}'"
        )

    debug(f"[validation_semantic.validate_semantic_contradictions] {len(found)} issues")
    return found
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _validate_single_expr_types(
    expr: NormalizedExpr,
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]],
    location: str,
    context: str,
) -> list[IntentIssue]:
    """Report columns used in a numeric position of ``expr`` that are not numeric or not suited to arithmetic.

    Each group in ``expr.add_groups + expr.sub_groups`` is examined. A group is checked only when it sits in a numeric position: the expression has arithmetic, the group multiplies more than one term, divides by anything, carries a coefficient other than ``1.0``, or is wrapped (at group or expression level) in a numeric-only aggregation or numeric scalar function.

    Args:
        expr: The normalised expression to validate.
        schema: Schema graph passed to the column type and role helpers.
        cte_outputs: Map of CTE name to column output metadata.
        location: Location label embedded in issue ids and messages (for example ``"select_cols[0]"``).
        context: Query context label used in issue messages.

    Returns:
        List of ``IntentIssue`` instances: errors for non-numeric columns in multiply or divide terms, warnings for multiply-term columns whose role is not suited to arithmetic.
    """
    found: list[IntentIssue] = []
    arithmetic = expr_has_arithmetic(expr)
    # The expression-level wrapper is the same for every group, so evaluate it once.
    expr_wants_numeric = (expr.agg_func and expr.agg_func in NUMERIC_ONLY_AGGREGATIONS) or (
        expr.inner_scalar_func and expr.inner_scalar_func in SCALAR_FUNCTIONS_NUMERIC
    )

    for group in expr.add_groups + expr.sub_groups:
        group_wants_numeric = (group.agg_func and group.agg_func in NUMERIC_ONLY_AGGREGATIONS) or (
            group.inner_scalar_func and group.inner_scalar_func in SCALAR_FUNCTIONS_NUMERIC
        )
        in_numeric_position = (
            arithmetic
            or len(group.multiply) > 1
            or len(group.divide) > 0
            or group.coefficient != 1.0
            or group_wants_numeric
            or expr_wants_numeric
        )
        if not in_numeric_position:
            continue

        # The innermost wrapper decides whether column types matter: group scalar, group aggregate,
        # expression scalar, then expression aggregate. A wrapper with a numeric result that does
        # not demand numeric input makes the column checks unnecessary.
        skip_columns = False
        for owner in (group, expr):
            if owner.inner_scalar_func:
                skip_columns = (
                    owner.inner_scalar_func in NUMERIC_RESULT_SCALARS
                    and owner.inner_scalar_func not in SCALAR_FUNCTIONS_NUMERIC
                )
                break
            if owner.agg_func:
                skip_columns = owner.agg_func in NUMERIC_RESULT_AGGS and owner.agg_func not in NUMERIC_ONLY_AGGREGATIONS
                break
        if skip_columns:
            continue

        for factor in group.multiply:
            if term_result_is_numeric(factor):
                continue
            column = strip_function_wrappers(factor)
            # Only qualified column references (containing a dot) are checked.
            if not column or column == "*" or "." not in column:
                continue
            if is_col_numeric(column, schema, cte_outputs) is False:
                data_type = get_col_type(column, schema, cte_outputs) or "unknown"
                if arithmetic and data_type in ("date", "date_window"):
                    # Date columns are tolerated inside arithmetic; the role check is skipped as well.
                    continue
                found.append(
                    IntentIssue(
                        issue_id=f"expr_non_numeric_{location}_{column}",
                        category="expression_type",
                        severity="error",
                        message=f"Non-numeric column '{column}' (type={data_type}) in arithmetic at {location} in {context}",
                        context={
                            "column": column,
                            "data_type": data_type,
                            "location": location,
                        },
                    )
                )
            if is_col_arithmetic_role(column, schema, cte_outputs) is False:
                column_meta = get_col_meta(column, schema, cte_outputs)
                role = column_meta.role if column_meta else "unknown"
                found.append(
                    IntentIssue(
                        issue_id=f"expr_invalid_role_{location}_{column}",
                        category="expression_type",
                        severity="warning",
                        message=f"Column '{column}' (role={role}) not suited for arithmetic at {location} in {context}",
                        context={"column": column, "role": role, "location": location},
                    )
                )

        for divisor in group.divide:
            if term_result_is_numeric(divisor):
                continue
            column = strip_function_wrappers(divisor)
            if not column or column == "*" or "." not in column:
                continue
            # Divisors get the type check only: no date exemption and no role check.
            if is_col_numeric(column, schema, cte_outputs) is not False:
                continue
            data_type = get_col_type(column, schema, cte_outputs) or "unknown"
            found.append(
                IntentIssue(
                    issue_id=f"expr_non_numeric_div_{location}_{column}",
                    category="expression_type",
                    severity="error",
                    message=f"Non-numeric column '{column}' (type={data_type}) in divide at {location} in {context}",
                    context={
                        "column": column,
                        "data_type": data_type,
                        "location": location,
                    },
                )
            )
    return found
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def validate_expr_vs_expr_filters(
    filters_param: list[FilterParam],
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] | None = None,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate filters that compare one expression against another.

    Filters without a ``right_expr`` are ignored. For the rest, the primary columns of both sides are compared: identical columns are reported as a self-comparison (error) and checked no further; differing column types that are not listed in ``COMPATIBLE_TYPE_PAIRS`` in either order are reported as a type mismatch (error); and a primary key on either side is reported as a warning.

    Args:
        filters_param: Filter conditions to validate.
        schema: Schema graph for resolving column types and metadata.
        cte_outputs: Optional map of CTE name to column output metadata.
        context: Query context label stored under ``"location"`` in issue context.

    Returns:
        List of ``IntentIssue`` instances with category ``filter_semantic``.
    """
    if not filters_param:
        return []

    found: list[IntentIssue] = []
    cte_outputs = cte_outputs or {}
    debug("[validation_semantic.validate_expr_vs_expr_filters] checking expr-vs-expr type compatibility")

    for flt in filters_param:
        if not flt.right_expr:
            continue
        left_col = flt.left_expr.primary_column
        right_col = flt.right_expr.primary_column

        if left_col == right_col:
            found.append(
                IntentIssue(
                    issue_id=f"self_comparison_filter_{left_col}",
                    category="filter_semantic",
                    severity="error",
                    message=f"Self-comparison in filter: {left_col} compared to itself",
                    context={
                        "column": left_col,
                        "param_key": flt.param_key,
                        "location": context,
                    },
                )
            )
            debug(f"[validation_semantic.validate_expr_vs_expr_filters] self-comparison: {left_col}")
            continue

        left_type = get_col_type(left_col, schema, cte_outputs)
        right_type = get_col_type(right_col, schema, cte_outputs)
        both_known = left_type and right_type
        # Equal types are always accepted; unequal ones must be listed as compatible in either order.
        if (
            both_known
            and left_type != right_type
            and (left_type, right_type) not in COMPATIBLE_TYPE_PAIRS
            and (right_type, left_type) not in COMPATIBLE_TYPE_PAIRS
        ):
            found.append(
                IntentIssue(
                    issue_id=f"filter_type_mismatch_{left_col}_{right_col}",
                    category="filter_semantic",
                    severity="error",
                    message=f"Type mismatch in filter: {left_col} ({left_type}) vs {right_col} ({right_type})",
                    context={
                        "left_col": left_col,
                        "left_type": left_type,
                        "right_col": right_col,
                        "right_type": right_type,
                        "param_key": flt.param_key,
                        "location": context,
                    },
                )
            )
            debug(f"[validation_semantic.validate_expr_vs_expr_filters] type mismatch: {left_type} vs {right_type}")

        left_meta = get_col_meta(left_col, schema, cte_outputs)
        right_meta = get_col_meta(right_col, schema, cte_outputs)
        if left_meta and right_meta and (left_meta.is_primary_key or right_meta.is_primary_key):
            found.append(
                IntentIssue(
                    issue_id=f"pk_comparison_filter_{left_col}_{right_col}",
                    category="filter_semantic",
                    severity="warning",
                    message=f"Comparing primary key in filter: {left_col} vs {right_col}",
                    context={
                        "left_col": left_col,
                        "right_col": right_col,
                        "param_key": flt.param_key,
                        "location": context,
                    },
                )
            )
            debug(f"[validation_semantic.validate_expr_vs_expr_filters] PK comparison: {left_col}")

    debug(f"[validation_semantic.validate_expr_vs_expr_filters] {len(found)} issues in {context}")
    return found
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
def validate_agg_vs_agg_having(
    having_param: list[HavingParam],
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] | None = None,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate HAVING conditions that compare one aggregation against another.

    Conditions without a ``right_expr`` are ignored, as are sides for which ``extract_agg_col`` does not return a three-element result with a function name. The same function applied to the same target on both sides is reported as a self-comparison (error). Otherwise, unless a side targets ``*`` or both functions are ``sum``/``avg``/``count``, differing target column types that are not listed in ``COMPATIBLE_TYPE_PAIRS`` in either order are reported as a warning.

    Args:
        having_param: HAVING conditions to validate.
        schema: Schema graph for resolving column types.
        cte_outputs: Optional map of CTE name to column output metadata.
        context: Query context label stored under ``"location"`` in issue context.

    Returns:
        List of ``IntentIssue`` instances with category ``having_semantic``.
    """
    if not having_param:
        return []

    found: list[IntentIssue] = []
    cte_outputs = cte_outputs or {}
    always_numeric = {"sum", "avg", "count"}
    debug("[validation_semantic.validate_agg_vs_agg_having] checking agg-vs-agg type compatibility")

    for cond in having_param:
        if not cond.right_expr:
            continue
        left_term = cond.left_expr.primary_term
        right_term = cond.right_expr.primary_term
        left_parts = extract_agg_col(left_term)
        right_parts = extract_agg_col(right_term)
        if len(left_parts) != 3 or len(right_parts) != 3:
            continue
        left_func, left_target, _ = left_parts
        right_func, right_target, _ = right_parts
        if not left_func or not right_func:
            continue

        if left_target == right_target and left_func == right_func:
            found.append(
                IntentIssue(
                    issue_id=f"self_comparison_having_{left_term}",
                    category="having_semantic",
                    severity="error",
                    message=f"Self-comparison in HAVING: {left_term} compared to itself",
                    context={
                        "aggregation": left_term,
                        "param_key": cond.param_key,
                        "location": context,
                    },
                )
            )
            debug(f"[validation_semantic.validate_agg_vs_agg_having] self-comparison: {left_term}")
            continue

        # A "*" target has no column type to compare.
        if left_target == "*" or right_target == "*":
            continue
        left_type = get_col_type(left_target, schema, cte_outputs)
        right_type = get_col_type(right_target, schema, cte_outputs)
        if not (left_type and right_type):
            continue
        # These functions are exempt from the column-type comparison when used on both sides.
        if left_func in always_numeric and right_func in always_numeric:
            continue
        types_agree = (
            left_type == right_type
            or (left_type, right_type) in COMPATIBLE_TYPE_PAIRS
            or (right_type, left_type) in COMPATIBLE_TYPE_PAIRS
        )
        if types_agree:
            continue

        found.append(
            IntentIssue(
                issue_id=f"having_type_mismatch_{left_term}_{right_term}",
                category="having_semantic",
                severity="warning",
                message=f"Type mismatch in HAVING: {left_term} ({left_type}) vs {right_term} ({right_type})",
                context={
                    "left_agg": left_term,
                    "left_type": left_type,
                    "right_agg": right_term,
                    "right_type": right_type,
                    "param_key": cond.param_key,
                    "location": context,
                },
            )
        )
        debug(f"[validation_semantic.validate_agg_vs_agg_having] type mismatch: {left_type} vs {right_type}")

    debug(f"[validation_semantic.validate_agg_vs_agg_having] {len(found)} issues in {context}")
    return found
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
def validate_select_expr_types(
    select_cols: list[SelectCol],
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] | None = None,
    context: str = "main",
) -> list[IntentIssue]:
    """Run the single-expression type check over every SELECT column.

    Each column's ``expr`` is passed to ``_validate_single_expr_types`` with the location label ``select_cols[<index>]``.

    Args:
        select_cols: SELECT columns to validate; a falsy value is treated as an empty list.
        schema: Schema graph for resolving column types and roles.
        cte_outputs: Optional map of CTE name to column output metadata.
        context: Query context label for issue messages.

    Returns:
        All ``IntentIssue`` instances produced for the SELECT expressions, in column order.
    """
    outputs = cte_outputs or {}
    found: list[IntentIssue] = [
        issue
        for position, col in enumerate(select_cols or [])
        for issue in _validate_single_expr_types(col.expr, schema, outputs, f"select_cols[{position}]", context)
    ]
    if found:
        debug(f"[validation_semantic.validate_select_expr_types] {len(found)} issues in {context}")
    return found
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def validate_order_by_expr_types(
    order_by_cols: list[OrderByCol],
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] | None = None,
    context: str = "main",
) -> list[IntentIssue]:
    """Run the single-expression type check over every ORDER BY column.

    Each column's ``expr`` is passed to ``_validate_single_expr_types`` with the location label ``order_by_cols[<index>]``.

    Args:
        order_by_cols: ORDER BY columns to validate; a falsy value is treated as an empty list.
        schema: Schema graph for resolving column types and roles.
        cte_outputs: Optional map of CTE name to column output metadata.
        context: Query context label for issue messages.

    Returns:
        All ``IntentIssue`` instances produced for the ORDER BY expressions, in column order.
    """
    outputs = cte_outputs or {}
    found: list[IntentIssue] = [
        issue
        for position, col in enumerate(order_by_cols or [])
        for issue in _validate_single_expr_types(col.expr, schema, outputs, f"order_by_cols[{position}]", context)
    ]
    if found:
        debug(f"[validation_semantic.validate_order_by_expr_types] {len(found)} issues in {context}")
    return found
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
def validate_filter_expr_types(
    filters_param: list[FilterParam],
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] | None = None,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate the expression types and operator of each filter condition.

    For every filter, both sides are passed through ``_validate_single_expr_types``. When a right-hand expression exists and the numeric-ness of both sides is known but differs, a cross-type mismatch is reported. When either side contains arithmetic, the operator must be in ``NUMERIC_RESULT_OPS`` or be ``"is null"`` / ``"is not null"``.

    Args:
        filters_param: Filter conditions to validate; a falsy value is treated as an empty list.
        schema: Schema graph for resolving column types.
        cte_outputs: Optional map of CTE name to column output metadata.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances with category ``expression_type``, plus whatever the per-expression checks return.
    """
    found: list[IntentIssue] = []
    outputs = cte_outputs or {}

    for flt in filters_param or []:
        key = flt.param_key or "unknown"
        found.extend(_validate_single_expr_types(flt.left_expr, schema, outputs, f"filter_{key}_left", context))

        if flt.right_expr:
            found.extend(_validate_single_expr_types(flt.right_expr, schema, outputs, f"filter_{key}_right", context))
            left_numeric = expr_result_is_numeric(flt.left_expr, schema, outputs)
            right_numeric = expr_result_is_numeric(flt.right_expr, schema, outputs)
            # A side whose numeric-ness is None (undetermined) is never reported as a mismatch.
            undetermined = left_numeric is None or right_numeric is None
            if not undetermined and left_numeric != right_numeric:
                found.append(
                    IntentIssue(
                        issue_id=f"filter_cross_type_mismatch_{key}",
                        category="expression_type",
                        severity="error",
                        message=f"Filter '{key}' compares numeric expression to non-numeric expression in {context}",
                        context={
                            "param_key": key,
                            "left_numeric": left_numeric,
                            "right_numeric": right_numeric,
                            "location": context,
                        },
                    )
                )

        involves_arithmetic = expr_has_arithmetic(flt.left_expr)
        if flt.right_expr and expr_has_arithmetic(flt.right_expr):
            involves_arithmetic = True
        if not involves_arithmetic:
            continue
        if flt.op in NUMERIC_RESULT_OPS or flt.op in ("is null", "is not null"):
            continue

        allowed_ops = sorted(NUMERIC_RESULT_OPS)
        found.append(
            IntentIssue(
                issue_id=f"filter_op_on_arith_{key}_{flt.op}",
                category="expression_type",
                severity="error",
                message=f"Operator '{flt.op}' invalid on arithmetic expression in filter '{key}' in {context}. Expected: {allowed_ops}",
                context={
                    "param_key": key,
                    "operator": flt.op,
                    "valid_ops": allowed_ops,
                    "location": context,
                },
            )
        )

    if found:
        debug(f"[validation_semantic.validate_filter_expr_types] {len(found)} issues in {context}")
    return found
|
|
697
|
+
|
|
698
|
+
|
|
699
|
+
def validate_having_expr_types(
    having_param: list[HavingParam],
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] | None = None,
    context: str = "main",
) -> list[IntentIssue]:
    """Check HAVING operands for type problems and numeric/non-numeric comparisons.

    Each condition's left expression is passed to ``_validate_single_expr_types``.
    When a right expression is present it is checked the same way, and the two
    sides are then compared with ``expr_result_is_numeric``: an issue is raised
    only when both results are known (not ``None``) and they disagree.

    Args:
        having_param: HAVING conditions to validate; ``None`` or empty yields no issues.
        schema: Schema graph handed through to the type helpers.
        cte_outputs: Optional map of CTE name to column output metadata; a falsy
            value is replaced by an empty dict.
        context: Query context label embedded in issue messages.

    Returns:
        List of ``IntentIssue`` instances describing type mismatches in HAVING conditions.
    """
    cte_meta = cte_outputs or {}
    found: list[IntentIssue] = []
    for cond in having_param or []:
        key = cond.param_key or "unknown"
        found.extend(_validate_single_expr_types(cond.left_expr, schema, cte_meta, f"having_{key}_left", context))
        if not cond.right_expr:
            # Nothing to compare against (no right-hand expression).
            continue
        found.extend(_validate_single_expr_types(cond.right_expr, schema, cte_meta, f"having_{key}_right", context))
        lhs_numeric = expr_result_is_numeric(cond.left_expr, schema, cte_meta)
        rhs_numeric = expr_result_is_numeric(cond.right_expr, schema, cte_meta)
        both_known = lhs_numeric is not None and rhs_numeric is not None
        if both_known and lhs_numeric != rhs_numeric:
            details = {
                "param_key": key,
                "left_numeric": lhs_numeric,
                "right_numeric": rhs_numeric,
                "location": context,
            }
            found.append(
                IntentIssue(
                    issue_id=f"having_cross_type_mismatch_{key}",
                    category="expression_type",
                    severity="error",
                    message=f"Having '{key}' compares numeric expression to non-numeric expression in {context}",
                    context=details,
                )
            )
    if found:
        debug(f"[validation_semantic.validate_having_expr_types] {len(found)} issues in {context}")
    return found
|
|
747
|
+
|
|
748
|
+
|
|
749
|
+
def validate_arith_expression_semantics(
    filters_param: list[FilterParam],
    having_param: list[HavingParam],
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] | None = None,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate that arithmetic expressions in filters and HAVING use compatible operand types.

    Every operand for which ``expr_has_arithmetic`` is true is passed to
    ``_validate_single_expr_types``; operands without arithmetic are ignored.
    Filters are processed before HAVING conditions, and the left side of each
    condition before its right side, so issues come back in clause order.

    Args:
        filters_param: Filter conditions to inspect; ``None`` is treated as empty,
            matching the other validators in this module.
        having_param: HAVING conditions to inspect; ``None`` is treated as empty.
        schema: Schema graph for resolving column types.
        cte_outputs: Optional map of CTE name to column output metadata; a falsy
            value is replaced by an empty dict.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances describing arithmetic expression semantic violations.
    """
    issues: list[IntentIssue] = []
    cte_outputs = cte_outputs or {}
    debug("[validation_semantic.validate_arith_expression_semantics] checking arithmetic semantics")
    # The label prefix is the only difference between the two clause kinds.
    clauses = (("filter", filters_param or []), ("having", having_param or []))
    for prefix, params in clauses:
        for param in params:
            pk = param.param_key or "unknown"
            # The left operand is always inspected; the right one only when present.
            operands = [("left", param.left_expr)]
            if param.right_expr:
                operands.append(("right", param.right_expr))
            for side, operand in operands:
                if expr_has_arithmetic(operand):
                    issues.extend(
                        _validate_single_expr_types(
                            operand,
                            schema,
                            cte_outputs,
                            f"{prefix}_{pk}_{side}",
                            context,
                        )
                    )
    debug(f"[validation_semantic.validate_arith_expression_semantics] {len(issues)} issues in {context}")
    return issues
|
|
817
|
+
|
|
818
|
+
|
|
819
|
+
def _term_has_aggregation(term: str) -> bool:
    """Return whether a single multiply/divide term string contains an inline aggregation call.

    A call is recognised when one of the names in ``VALID_AGG_FUNCS`` is followed
    by an opening parenthesis (optional whitespace allowed) and is not the tail
    of a longer identifier or a dotted reference. This covers aggregates at the
    start of the term, after a space, and also nested inside wrappers or
    operators such as ``COALESCE(SUM(t.x), 0)`` or ``-SUM(t.x)``.

    Args:
        term: A raw SQL term string (for example a single element from ``MulGroup.multiply``).

    Returns:
        ``True`` if the term contains a call to one of the aggregation functions
        (matched case-insensitively); ``False`` otherwise, including for an empty term.
    """
    import re  # function-scope import: the module's import block is not visible from here

    if not term:
        return False
    names = "|".join(re.escape(str(func)) for func in VALID_AGG_FUNCS)
    if not names:
        # An empty alternation would match any "(" — there is nothing to look for.
        return False
    # The lookbehind rejects matches inside identifiers ("my_sum(") and dotted names ("t.count(").
    pattern = rf"(?<![A-Za-z0-9_.])(?:{names})\s*\("
    return re.search(pattern, term, flags=re.IGNORECASE) is not None
|
|
832
|
+
|
|
833
|
+
|
|
834
|
+
def validate_filter_no_aggregation(filters_param: list[FilterParam], context: str = "main") -> list[IntentIssue]:
    """Report WHERE conditions whose left or right expression carries an aggregation.

    Args:
        filters_param: Filter conditions to inspect; ``None`` or empty yields no issues.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances, one per offending side of a filter,
        each advising that the condition belongs in HAVING.
    """
    found: list[IntentIssue] = []
    debug("[validation_semantic.validate_filter_no_aggregation] checking filter aggregation ban")
    for cond in filters_param or []:
        key = cond.param_key or "unknown"
        # The right side may be absent; the short-circuit keeps it from being dereferenced.
        sides = (
            ("left", cond.left_expr.has_aggregation),
            ("right", cond.right_expr and cond.right_expr.has_aggregation),
        )
        for side, aggregated in sides:
            if not aggregated:
                continue
            found.append(
                IntentIssue(
                    issue_id=f"filter_has_aggregation_{key}_{side}",
                    category="filter_aggregation",
                    severity="error",
                    message=f"Filter '{key}' {side} expression contains aggregation in {context}; use HAVING instead of WHERE",
                    context={"param_key": key, "side": side, "location": context},
                )
            )
    if found:
        debug(f"[validation_semantic.validate_filter_no_aggregation] {len(found)} issues in {context}")
    return found
|
|
873
|
+
|
|
874
|
+
|
|
875
|
+
def validate_having_requires_aggregation(having_param: list[HavingParam], context: str = "main") -> list[IntentIssue]:
    """Report HAVING conditions in which neither side carries an aggregation.

    Args:
        having_param: HAVING conditions to inspect; ``None`` or empty yields no issues.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances, one per HAVING condition that has no
        aggregation on either side and therefore belongs in WHERE.
    """
    found: list[IntentIssue] = []
    debug("[validation_semantic.validate_having_requires_aggregation] checking having aggregation requirement")
    for cond in having_param or []:
        key = cond.param_key or "unknown"
        if cond.left_expr.has_aggregation:
            continue
        rhs = cond.right_expr
        if rhs is not None and rhs.has_aggregation:
            continue
        found.append(
            IntentIssue(
                issue_id=f"having_missing_aggregation_{key}",
                category="having_aggregation",
                severity="error",
                message=f"Having '{key}' has no aggregation in {context}; belongs in WHERE not HAVING",
                context={"param_key": key, "location": context},
            )
        )
    if found:
        debug(f"[validation_semantic.validate_having_requires_aggregation] {len(found)} issues in {context}")
    return found
|
|
907
|
+
|
|
908
|
+
|
|
909
|
+
def _check_nested_aggregation(expr: NormalizedExpr, location: str, context: str) -> list[IntentIssue]:
    """Look for aggregation wrapped inside another aggregation within one ``NormalizedExpr``.

    For every add/sub group three combinations are tested independently:
    expression-level aggregation over a group-level aggregation,
    expression-level aggregation over an inline aggregation term, and
    group-level aggregation over an inline aggregation term.

    Args:
        expr: The normalised expression to inspect.
        location: Human-readable location label used in issue ids and messages.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances describing nested aggregation patterns found.
    """

    def _nested(kind: str, message: str, details: dict) -> IntentIssue:
        # All three findings share category and severity; only id suffix, text and payload vary.
        return IntentIssue(
            issue_id=f"nested_agg_{kind}_{location}",
            category="nested_aggregation",
            severity="error",
            message=message,
            context=details,
        )

    found: list[IntentIssue] = []
    outer = expr.agg_func
    for group in expr.add_groups + expr.sub_groups:
        inline = any(_term_has_aggregation(term) for term in group.multiply + group.divide)
        inner = group.agg_func
        if outer and inner:
            found.append(
                _nested(
                    "expr_group",
                    f"Nested aggregation {outer.upper()}({inner.upper()}(...)) at {location} in {context}",
                    {"outer": outer, "inner": inner, "location": location},
                )
            )
        if outer and inline:
            found.append(
                _nested(
                    "expr_inline",
                    f"Nested aggregation: expr-level {outer.upper()} wraps inline aggregation at {location} in {context}",
                    {"outer": outer, "location": location},
                )
            )
        if inner and inline:
            found.append(
                _nested(
                    "group_inline",
                    f"Nested aggregation: group-level {inner.upper()} wraps inline aggregation at {location} in {context}",
                    {"outer": inner, "location": location},
                )
            )
    return found
|
|
962
|
+
|
|
963
|
+
|
|
964
|
+
def validate_no_nested_aggregation(
    select_cols: list[SelectCol],
    order_by_cols: list[OrderByCol],
    filters_param: list[FilterParam],
    having_param: list[HavingParam],
    context: str = "main",
) -> list[IntentIssue]:
    """Run the nested-aggregation check over every expression of a query.

    Expressions are visited in the order SELECT, ORDER BY, filters, HAVING;
    for filters and HAVING the left side is visited before the (optional) right side.

    Args:
        select_cols: SELECT column list to inspect.
        order_by_cols: ORDER BY column list to inspect.
        filters_param: Filter conditions to inspect.
        having_param: HAVING conditions to inspect.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances describing nested aggregation violations found.
    """

    def _targets():
        # Yield (expression, location label) pairs lazily, in clause order.
        for pos, col in enumerate(select_cols or []):
            yield col.expr, f"select_cols[{pos}]"
        for pos, col in enumerate(order_by_cols or []):
            yield col.expr, f"order_by_cols[{pos}]"
        for prefix, conditions in (("filter", filters_param), ("having", having_param)):
            for cond in conditions or []:
                key = cond.param_key or "unknown"
                yield cond.left_expr, f"{prefix}_{key}_left"
                if cond.right_expr:
                    yield cond.right_expr, f"{prefix}_{key}_right"

    found: list[IntentIssue] = []
    debug("[validation_semantic.validate_no_nested_aggregation] checking nested aggregation")
    for target, label in _targets():
        found.extend(_check_nested_aggregation(target, label, context))
    if found:
        debug(f"[validation_semantic.validate_no_nested_aggregation] {len(found)} issues in {context}")
    return found
|
|
1004
|
+
|
|
1005
|
+
|
|
1006
|
+
def _check_mixed_aggregation_in_group(group: MulGroup, location: str, context: str) -> list[IntentIssue]:
    """Detect a ``MulGroup`` that multiplies/divides aggregated terms with bare column terms.

    Groups that carry their own ``agg_func`` are skipped, as are groups with
    fewer than two terms. A non-aggregated term counts as a bare column when
    ``strip_function_wrappers`` returns a non-empty value other than ``"*"``
    that contains a dot.

    Args:
        group: The ``MulGroup`` to inspect.
        location: Human-readable location label used in the issue id and message.
        context: Query context label for issue messages.

    Returns:
        A list with one ``IntentIssue`` when both aggregated and bare column
        terms are present in the group; otherwise an empty list.
    """
    if group.agg_func:
        return []
    terms = [*group.multiply, *group.divide]
    if len(terms) < 2:
        return []
    aggregated: list[str] = []
    plain: list[str] = []
    for item in terms:
        if _term_has_aggregation(item):
            aggregated.append(item)
            continue
        stripped = strip_function_wrappers(item)
        if not stripped or stripped == "*" or "." not in stripped:
            continue
        plain.append(item)
    if not (aggregated and plain):
        return []
    return [
        IntentIssue(
            issue_id=f"mixed_agg_bare_{location}",
            category="mixed_aggregation",
            severity="error",
            message=f"MulGroup at {location} in {context} mixes aggregated terms ({', '.join(aggregated)}) with bare columns ({', '.join(plain)})",
            context={
                "agg_terms": aggregated,
                "bare_terms": plain,
                "location": location,
            },
        )
    ]
|
|
1051
|
+
|
|
1052
|
+
|
|
1053
|
+
def _check_mixed_aggregation_in_expr(expr: NormalizedExpr, location: str, context: str) -> list[IntentIssue]:
    """Check a ``NormalizedExpr`` for mixed aggregation inside and across its groups.

    Every add/sub group is first handed to ``_check_mixed_aggregation_in_group``.
    Then, if the expression has at least two groups and no expression-level
    ``agg_func``, groups are classified as aggregated (group-level ``agg_func``
    or an inline aggregation term) or bare (some term containing a dot); an
    issue is raised when both classes occur.

    Args:
        expr: The normalised expression to inspect.
        location: Human-readable location label used in issue ids and messages.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances describing mixed aggregation
        violations found within or across ``MulGroup`` entries.
    """
    found: list[IntentIssue] = []
    labelled = [(grp, f"{location}_add[{pos}]") for pos, grp in enumerate(expr.add_groups)]
    labelled += [(grp, f"{location}_sub[{pos}]") for pos, grp in enumerate(expr.sub_groups)]
    for grp, label in labelled:
        found.extend(_check_mixed_aggregation_in_group(grp, label, context))

    if len(labelled) < 2 or expr.agg_func:
        # The cross-group comparison needs two or more groups and no outer aggregation.
        return found

    with_agg: list[str] = []
    without_agg: list[str] = []
    for grp, _ in labelled:
        terms = grp.multiply + grp.divide
        aggregated = grp.agg_func or any(_term_has_aggregation(t) for t in terms)
        signature = grp.signature_key
        if aggregated:
            with_agg.append(signature)
        elif any("." in t and not _term_has_aggregation(t) for t in terms):
            without_agg.append(signature)
    if with_agg and without_agg:
        found.append(
            IntentIssue(
                issue_id=f"mixed_agg_across_groups_{location}",
                category="mixed_aggregation",
                severity="error",
                message=f"Expression at {location} in {context} mixes aggregated groups ({', '.join(with_agg)}) with bare column groups ({', '.join(without_agg)})",
                context={
                    "agg_groups": with_agg,
                    "bare_groups": without_agg,
                    "location": location,
                },
            )
        )
    return found
|
|
1101
|
+
|
|
1102
|
+
|
|
1103
|
+
def validate_mixed_aggregation_in_mulgroup(
    select_cols: list[SelectCol],
    order_by_cols: list[OrderByCol],
    filters_param: list[FilterParam],
    having_param: list[HavingParam],
    context: str = "main",
) -> list[IntentIssue]:
    """Run the mixed-aggregation check over every expression of a query.

    Expressions are visited in the order SELECT, ORDER BY, filters, HAVING;
    for filters and HAVING the left side is visited before the (optional) right side.

    Args:
        select_cols: SELECT column list to inspect.
        order_by_cols: ORDER BY column list to inspect.
        filters_param: Filter conditions to inspect.
        having_param: HAVING conditions to inspect.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances describing mixed aggregation
        violations across all inspected clause types.
    """

    def _targets():
        # Yield (expression, location label) pairs lazily, in clause order.
        for pos, col in enumerate(select_cols or []):
            yield col.expr, f"select_cols[{pos}]"
        for pos, col in enumerate(order_by_cols or []):
            yield col.expr, f"order_by_cols[{pos}]"
        for prefix, conditions in (("filter", filters_param), ("having", having_param)):
            for cond in conditions or []:
                key = cond.param_key or "unknown"
                yield cond.left_expr, f"{prefix}_{key}_left"
                if cond.right_expr:
                    yield cond.right_expr, f"{prefix}_{key}_right"

    found: list[IntentIssue] = []
    debug("[validation_semantic.validate_mixed_aggregation_in_mulgroup] checking mixed aggregation")
    for target, label in _targets():
        found.extend(_check_mixed_aggregation_in_expr(target, label, context))
    if found:
        debug(f"[validation_semantic.validate_mixed_aggregation_in_mulgroup] {len(found)} issues in {context}")
    return found
|
|
1143
|
+
|
|
1144
|
+
|
|
1145
|
+
def validate_order_by_aggregation_context(
    order_by_cols: list[OrderByCol], grain: str, context: str = "main"
) -> list[IntentIssue]:
    """Flag aggregated ORDER BY expressions when the query grain is ``"row_level"``.

    Any other grain returns an empty list without inspecting the columns.

    Args:
        order_by_cols: ORDER BY column list to inspect for aggregation usage.
        grain: Declared query grain; only ``"row_level"`` triggers the check.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances, one per ORDER BY entry whose
        expression has aggregation under a ``"row_level"`` grain.
    """
    debug(f"[validation_semantic.validate_order_by_aggregation_context] grain={grain}")
    if grain != "row_level":
        return []
    found: list[IntentIssue] = [
        IntentIssue(
            issue_id=f"order_by_agg_row_level_{pos}",
            category="order_by_aggregation",
            severity="error",
            message=f"Order-by[{pos}] contains aggregation but grain is row_level in {context}",
            context={"index": pos, "grain": grain, "location": context},
        )
        for pos, col in enumerate(order_by_cols or [])
        if col.expr.has_aggregation
    ]
    if found:
        debug(f"[validation_semantic.validate_order_by_aggregation_context] {len(found)} issues in {context}")
    return found
|
|
1178
|
+
|
|
1179
|
+
|
|
1180
|
+
def validate_select_group_by_membership(
    select_cols: list[SelectCol],
    group_by_cols: list[NormalizedExpr],
    grain: str,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate that every non-aggregated SELECT column appears in GROUP BY when grain is ``"grouped"``.

    Membership is decided by case-insensitive comparison of each expression's
    ``primary_column``. SELECT entries that are aggregated or have no primary
    column are skipped. GROUP BY expressions without a primary column are left
    out of the allowed set instead of raising.

    Args:
        select_cols: SELECT column list to inspect; ``None`` is treated as empty.
        group_by_cols: GROUP BY expression list providing the allowed column set;
            ``None`` is treated as empty.
        grain: Declared query grain; check only applies when grain is ``"grouped"``.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances for non-aggregated SELECT columns absent from the GROUP BY clause.
    """
    issues: list[IntentIssue] = []
    group_exprs = group_by_cols or []
    # len() is taken on the normalised list so a None argument cannot fail before the guard below.
    debug(f"[validation_semantic.validate_select_group_by_membership] grain={grain}, group_by={len(group_exprs)}")
    if grain != "grouped" or not group_exprs:
        return issues
    group_by_names = [g.primary_column for g in group_exprs]
    # Skip entries with no primary column; calling .lower() on them would raise.
    group_by_set = frozenset(name.lower() for name in group_by_names if name)
    for idx, sc in enumerate(select_cols or []):
        if sc.is_aggregated:
            continue
        col = sc.expr.primary_column
        if not col:
            continue
        if col.lower() not in group_by_set:
            issues.append(
                IntentIssue(
                    issue_id=f"select_not_in_group_by_{idx}_{col}",
                    category="group_by_membership",
                    severity="error",
                    message=f"Non-aggregated select column '{col}' at index {idx} not in GROUP BY in {context}",
                    context={
                        "column": col,
                        "index": idx,
                        "group_by_cols": group_by_names,
                        "location": context,
                    },
                )
            )
    if issues:
        debug(f"[validation_semantic.validate_select_group_by_membership] {len(issues)} issues in {context}")
    return issues
|
|
1228
|
+
|
|
1229
|
+
|
|
1230
|
+
def validate_cte_grain_consistency(cte: RuntimeCteStep, context: str) -> list[IntentIssue]:
    """Compare a CTE's declared grain with its GROUP BY, aggregation and HAVING usage.

    Five independent checks are applied, in this order: ``grouped`` grain with
    no GROUP BY; ``scalar``/``row_level`` grain with a GROUP BY; aggregated
    SELECT columns under ``row_level``; aggregation without GROUP BY when the
    grain is not ``scalar`` (warning); and HAVING conditions without any
    aggregated SELECT column.

    Args:
        cte: The CTE step to validate.
        context: Query context label; accepted for signature parity but not
            read by this function.

    Returns:
        List of ``IntentIssue`` instances describing grain inconsistencies within the CTE.
    """
    name = cte.cte_name
    declared = cte.grain
    grouping = cte.group_by_cols or []
    columns = cte.select_cols or []
    having = cte.having_param or []
    aggregated = any(col.is_aggregated for col in columns)
    found = []

    def _report(issue_id: str, category: str, severity: str, message: str, details: dict) -> None:
        # Thin wrapper so each check below reads as a single call.
        found.append(
            IntentIssue(
                issue_id=issue_id,
                category=category,
                severity=severity,
                message=message,
                context=details,
            )
        )

    if declared == "grouped" and not grouping:
        _report(
            f"cte_grouped_no_groupby_{name}",
            "cte_grain_consistency",
            "error",
            f"CTE '{name}' has grain=grouped but no group_by_cols",
            {"cte_name": name, "grain": declared},
        )
    if declared in {"scalar", "row_level"} and grouping:
        _report(
            f"cte_groupby_with_{declared}_{name}",
            "cte_grain_consistency",
            "error",
            f"CTE '{name}' has group_by_cols but grain={declared}",
            {"cte_name": name, "grain": declared, "group_by": grouping},
        )
    if aggregated and declared == "row_level":
        agg_terms = [col.expr.primary_term for col in columns if col.is_aggregated]
        _report(
            f"cte_agg_row_level_{name}",
            "cte_grain_consistency",
            "error",
            f"CTE '{name}' has aggregation with row_level grain",
            {"cte_name": name, "agg_funcs": agg_terms, "grain": declared},
        )
    if aggregated and not grouping and declared != "scalar":
        _report(
            f"cte_{name}_agg_no_groupby",
            "cte_grain_consistency",
            "warning",
            f"CTE '{name}' has aggregation but no group_by_cols (expected grain=scalar or group_by)",
            {"cte_name": name, "grain": declared},
        )
    if having and not aggregated:
        _report(
            f"cte_{name}_having_no_agg",
            "cte_aggregation",
            "error",
            f"CTE '{name}' has HAVING clause but no aggregation",
            {"cte_name": name, "having_count": len(having)},
        )
    debug(f"[validation_semantic.validate_cte_grain_consistency] {len(found)} issues for CTE '{name}'")
    return found
|
|
1309
|
+
|
|
1310
|
+
|
|
1311
|
+
def validate_cte_dependency_grains(cte_steps: list[RuntimeCteStep], main_grain: str) -> list[IntentIssue]:
    """Validate that CTE grains are compatible with their upstream dependencies and the main query grain.

    A dependency is any entry of a CTE's ``tables`` whose name matches another
    CTE in ``cte_steps``. Only the last CTE in the list is compared against the
    main query grain. All findings are warnings.

    Args:
        cte_steps: Ordered list of CTE steps in the query plan; ``None`` or an
            empty list produces no issues.
        main_grain: Declared grain of the main (outer) query.

    Returns:
        List of ``IntentIssue`` instances where a ``"row_level"`` CTE or the main
        query depends on an aggregated (``"grouped"`` or ``"scalar"``) CTE.
    """
    issues: list[IntentIssue] = []
    # Normalise once so logging, both loops and the final-CTE lookup all tolerate None.
    steps = cte_steps or []
    debug(
        f"[validation_semantic.validate_cte_dependency_grains] validating {len(steps)} CTEs against main grain '{main_grain}'"
    )
    cte_grains: dict[str, str] = {cte.cte_name: cte.grain for cte in steps}
    aggregated_grains = {"grouped", "scalar"}
    for cte in steps:
        cte_name = cte.cte_name
        cte_grain = cte.grain
        # A CTE with no table list has no dependencies to check.
        for table in cte.tables or []:
            if table not in cte_grains:
                continue
            dep_grain = cte_grains[table]
            if cte_grain == "row_level" and dep_grain in aggregated_grains:
                issues.append(
                    IntentIssue(
                        issue_id=f"cte_grain_incompatible_{cte_name}_{table}",
                        category="cte_grain_compatibility",
                        severity="warning",
                        message=f"CTE '{cte_name}' (row_level) depends on aggregated CTE '{table}' ({dep_grain})",
                        context={
                            "cte_name": cte_name,
                            "cte_grain": cte_grain,
                            "dep_cte": table,
                            "dep_grain": dep_grain,
                        },
                    )
                )
    final_cte = steps[-1] if steps else None
    if final_cte:
        final_grain = final_cte.grain
        if main_grain == "row_level" and final_grain in aggregated_grains:
            issues.append(
                IntentIssue(
                    issue_id=f"cte_main_grain_incompatible_{final_cte.cte_name}",
                    category="cte_grain_compatibility",
                    severity="warning",
                    message=f"Main query (row_level) uses aggregated CTE '{final_cte.cte_name}' ({final_grain})",
                    context={
                        "final_cte": final_cte.cte_name,
                        "final_grain": final_grain,
                        "main_grain": main_grain,
                    },
                )
            )
    debug(f"[validation_semantic.validate_cte_dependency_grains] {len(issues)} grain compatibility issues")
    return issues
|
|
1370
|
+
|
|
1371
|
+
|
|
1372
|
+
def _flip_comparison_op(op: str) -> str:
    """Return the operator to use after swapping the two sides of a comparison.

    The replacement is looked up in ``OP_FLIP`` from ``.config``; an operator
    with no entry there is returned unchanged.
    """
    from .config import OP_FLIP

    if op in OP_FLIP:
        return OP_FLIP[op]
    return op
|
|
1377
|
+
|
|
1378
|
+
|
|
1379
|
+
def auto_repair_filter_having(
    filters_param: list[FilterParam],
    having_param: list[HavingParam],
    cte_names: set[str] | None = None,
) -> tuple[list[FilterParam], list[HavingParam]]:
    """Re-home WHERE/HAVING conditions according to where aggregation sits.

    WHERE conditions whose left expression is aggregated are rebuilt as HAVING
    conditions. HAVING conditions are kept when their left expression is
    aggregated, mirrored (operands swapped, operator flipped) when only the
    right expression is aggregated, and rebuilt as WHERE conditions when
    neither side is aggregated. Promoted filters are emitted ahead of the
    conditions that were already in HAVING.

    Args:
        filters_param: Original list of WHERE filter conditions (``None`` is
            treated as empty).
        having_param: Original list of HAVING conditions (``None`` is treated
            as empty).
        cte_names: Set of CTE names for context when processing main query.
            Accepted for interface compatibility; not consulted here.

    Returns:
        A 2-tuple ``(repaired_filters, repaired_having)``.
    """
    where_out: list[FilterParam] = []
    having_out: list[HavingParam] = []

    for flt in filters_param or []:
        if not flt.left_expr.has_aggregation:
            # Plain row-level predicate: stays in WHERE untouched.
            where_out.append(flt)
            continue
        promoted = HavingParam(
            left_expr=flt.left_expr,
            op=flt.op,
            right_expr=flt.right_expr,
            value_type=flt.value_type,
            param_key=flt.param_key,
            raw_value=flt.raw_value,
        )
        having_out.append(promoted)
        debug(f"[validation_semantic.auto_repair_filter_having] filter->having: {flt.param_key}")

    for cond in having_param or []:
        if cond.left_expr.has_aggregation:
            # Already well-formed: aggregation on the left-hand side.
            having_out.append(cond)
            continue

        if cond.right_expr and cond.right_expr.has_aggregation:
            # Aggregation only on the right: swap the operands and mirror the
            # operator so the aggregate ends up on the left.
            mirrored = HavingParam(
                left_expr=cond.right_expr,
                op=_flip_comparison_op(cond.op),
                right_expr=cond.left_expr,
                value_type=cond.value_type,
                param_key=cond.param_key,
                raw_value=cond.raw_value,
            )
            having_out.append(mirrored)
            debug(f"[validation_semantic.auto_repair_filter_having] having flip (agg->left): {cond.param_key}")
            continue

        # No aggregation on either side: this is really a WHERE predicate.
        demoted = FilterParam(
            left_expr=cond.left_expr,
            op=cond.op,
            right_expr=cond.right_expr,
            value_type=cond.value_type,
            param_key=cond.param_key,
            raw_value=cond.raw_value,
        )
        where_out.append(demoted)
        debug(f"[validation_semantic.auto_repair_filter_having] having->filter: {cond.param_key}")

    return where_out, having_out
|
|
1446
|
+
|
|
1447
|
+
|
|
1448
|
+
def validate_question_aggregation_hint(
    natural_language: str,
    select_cols: list[SelectCol],
    having_param: list[HavingParam],
    grain: str,
    context: str = "main",
    filters_param: list[FilterParam] | None = None,
    schema: SchemaGraph | None = None,
) -> list[IntentIssue]:
    """Report quantity-comparison phrasing that the intent does not aggregate.

    The question is searched with ``AGG_QUANTITY_RE``. When it matches, and the
    intent has no aggregated SELECT column, no HAVING condition, and a grain
    other than ``"grouped"``/``"scalar"``, a single ``"warning"`` issue of
    category ``"aggregation_hint"`` is returned. The issue is suppressed when
    the last all-digit token of the matched phrase equals the raw value of a
    filter whose primary column ``is_col_numeric`` reports as numeric, since
    the phrase then reads as a plain row-level comparison.

    Args:
        natural_language: Original user question text.
        select_cols: SELECT column list to check for existing aggregation.
        having_param: HAVING condition list.
        grain: Declared query grain from the intent.
        context: Query context label for issue messages.
        filters_param: Filter list to cross-check matched numeric values.
        schema: Schema graph for column type lookups.

    Returns:
        List of ``IntentIssue`` instances (empty or a single warning).
    """
    if not natural_language:
        return []

    hit = AGG_QUANTITY_RE.search(natural_language)
    if not hit:
        return []

    already_aggregated = any(col.is_aggregated for col in select_cols)
    if already_aggregated or bool(having_param):
        return []
    if grain in ("grouped", "scalar"):
        return []

    phrase = hit.group()

    if filters_param and schema:
        digit_tokens = [piece for piece in phrase.split() if piece.isdigit()]
        if digit_tokens:
            # The last number in the phrase is taken as the threshold.
            threshold = int(digit_tokens[-1])
            for flt in filters_param:
                raw = flt.raw_value
                if raw is None:
                    continue
                try:
                    as_number = float(raw)
                except (TypeError, ValueError):
                    # Non-numeric filter values cannot explain the phrase.
                    continue
                if as_number != threshold:
                    continue
                column = flt.left_expr.primary_column
                if column and is_col_numeric(column, schema, {}) is True:
                    debug(
                        f"[validation_semantic.validate_question_aggregation_hint] "
                        f"suppressed: '{phrase}' matches numeric filter on {column}"
                    )
                    return []

    issue = IntentIssue(
        issue_id=f"missing_aggregation_hint_{context}",
        category="aggregation_hint",
        severity="warning",
        message=(
            f"Question contains quantity comparison '{phrase}' which typically "
            f"requires COUNT/SUM aggregation with GROUP BY and HAVING, but intent has "
            f"no aggregation and grain is '{grain}'."
        ),
        context={"matched_phrase": phrase, "location": context},
    )
    debug(
        f"[validation_semantic.validate_question_aggregation_hint] "
        f"detected aggregation hint '{phrase}' but no agg in intent"
    )
    return [issue]
|
|
1526
|
+
|
|
1527
|
+
|
|
1528
|
+
def validate_question_agg_keyword_coverage(
    natural_language: str,
    select_cols: list[SelectCol],
    having_param: list[HavingParam],
    context: str = "main",
) -> list[IntentIssue]:
    """Warn when an aggregation keyword in the question has no counterpart.

    The question is searched with ``AGG_KEYWORDS_RE``. If it matches while the
    intent carries neither an aggregated ``SelectCol`` nor any HAVING
    condition, one ``"warning"`` issue of category ``"agg_keyword_missing"``
    is returned.

    Args:
        natural_language: Original user question text.
        select_cols: SELECT column list to check for existing aggregation.
        having_param: HAVING condition list.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances (empty or a single warning).
    """
    issues: list[IntentIssue] = []
    if not natural_language or not AGG_KEYWORDS_RE.search(natural_language):
        return issues

    aggregated = False
    for col in select_cols:
        if col.is_aggregated:
            aggregated = True
            break
    if aggregated or having_param:
        return issues

    issues.append(
        IntentIssue(
            issue_id=f"agg_keyword_missing_{context}",
            category="agg_keyword_missing",
            severity="warning",
            message=(
                f"Question contains an aggregation keyword but the intent "
                f"has no aggregated column and no HAVING condition in "
                f"{context}. Add the appropriate aggregation function and "
                f"set grain to 'grouped'."
            ),
            context={"location": context},
        )
    )
    debug(
        "[validation_semantic.validate_question_agg_keyword_coverage] "
        "aggregation keyword in question but no agg in intent"
    )
    return issues
|
|
1562
|
+
|
|
1563
|
+
|
|
1564
|
+
# A standalone integer or decimal literal.
_NUMERIC_LITERAL_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
# A four-digit year beginning with 19 or 20.
_YEAR_IN_STRING_RE = re.compile(r"\b(19|20)\d{2}\b")
# Ranking phrasing followed by a count, e.g. "top 5".
_TOP_N_RE = re.compile(r"\b(?:top|first|bottom|last|least|most)\s+\d+\b", re.IGNORECASE)
# The words "distinct" or "unique".
_DISTINCT_RE = re.compile(r"\b(?:distinct|unique)\b", re.IGNORECASE)


def _numbers_covered_by_conditions(conditions) -> tuple[set[float], set[str]]:
    """Collect the numbers that a list of filter/HAVING conditions accounts for.

    Args:
        conditions: Iterable of objects exposing ``raw_value``; ``None`` is
            treated as empty.

    Returns:
        A 2-tuple ``(numeric_values, year_strings)``: the raw values that parse
        as floats, and the year-like substrings found inside string raw values.
    """
    numeric_values: set[float] = set()
    year_strings: set[str] = set()
    for cond in conditions or []:
        raw = cond.raw_value
        if raw is None:
            continue
        try:
            numeric_values.add(float(raw))
        except (TypeError, ValueError):
            # Best effort: a non-numeric raw value simply covers no number.
            pass
        if isinstance(raw, str):
            year_strings.update(m.group() for m in _YEAR_IN_STRING_RE.finditer(raw))
    return numeric_values, year_strings


def validate_question_numeric_coverage(
    natural_language: str,
    filters_param: list[FilterParam],
    having_param: list[HavingParam],
    limit: int | None,
    context: str = "main",
) -> list[IntentIssue]:
    """Flag numeric literals in the question missing from intent conditions.

    Every number found in the question must be accounted for by a filter raw
    value, a HAVING raw value, the limit, a year embedded in a string raw
    value, or top-N phrasing (``"top 5"``, ``"first 10"``, ...). Each number
    that is not accounted for produces one ``"warning"`` issue; a number that
    occurs several times in the question is reported only once, so issue ids
    stay unique.

    Args:
        natural_language: Original user question text.
        filters_param: WHERE filter conditions (``None`` is treated as empty).
        having_param: HAVING conditions (``None`` is treated as empty).
        limit: Row limit from the intent, if any.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances, one per distinct uncovered number.
    """
    issues: list[IntentIssue] = []
    if not natural_language:
        return issues

    all_numbers = _NUMERIC_LITERAL_RE.findall(natural_language)
    if not all_numbers:
        return issues

    # Numbers used in top-N phrasing are excluded from the coverage check.
    top_n_numbers: set[str] = set()
    for m in _TOP_N_RE.finditer(natural_language):
        top_n_numbers.update(tok for tok in m.group().split() if tok.isdigit())

    filter_values, filter_years = _numbers_covered_by_conditions(filters_param)
    having_values, having_years = _numbers_covered_by_conditions(having_param)
    intent_values = filter_values | having_values
    covered_number_strs = filter_years | having_years
    if limit is not None:
        intent_values.add(float(limit))

    reported: set[str] = set()
    for num_str in all_numbers:
        if num_str in reported:
            # Same literal seen earlier in the question; avoid duplicate ids.
            continue
        if num_str in top_n_numbers or num_str in covered_number_strs:
            continue
        try:
            val = float(num_str)
        except ValueError:
            continue
        if val in intent_values:
            continue
        reported.add(num_str)
        issues.append(
            IntentIssue(
                issue_id=f"missing_numeric_{num_str}_{context}",
                category="missing_numeric_filter",
                severity="warning",
                message=(
                    f"Question mentions number '{num_str}' which does not "
                    f"appear in any filter, having condition, or limit in "
                    f"{context}."
                ),
                context={"value": num_str, "location": context},
            )
        )
        debug(
            f"[validation_semantic.validate_question_numeric_coverage] "
            f"number '{num_str}' not found in intent conditions"
        )
    return issues
|
|
1647
|
+
|
|
1648
|
+
|
|
1649
|
+
def validate_question_distinct_hint(
    natural_language: str,
    select_cols: list[SelectCol],
    context: str = "main",
) -> list[IntentIssue]:
    """Warn when the question asks for distinct/unique rows but no SELECT expression says ``DISTINCT``.

    Args:
        natural_language: Original user question text.
        select_cols: SELECT columns; each ``expr`` is inspected as text
            (case-insensitively) for the substring ``DISTINCT``.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances (empty or a single warning).
    """
    if not natural_language or not _DISTINCT_RE.search(natural_language):
        return []

    for col in select_cols:
        expr = col.expr
        text = expr if isinstance(expr, str) else str(expr)
        if "DISTINCT" in text.upper():
            # At least one expression already de-duplicates.
            return []

    issue = IntentIssue(
        issue_id=f"missing_distinct_{context}",
        category="missing_distinct",
        severity="warning",
        message=(
            f"Question explicitly requests distinct/unique results "
            f"but no DISTINCT keyword found in any expression in "
            f"{context}."
        ),
        context={"location": context},
    )
    debug(
        "[validation_semantic.validate_question_distinct_hint] "
        "question has distinct/unique keyword but intent lacks DISTINCT"
    )
    return [issue]
|
|
1682
|
+
|
|
1683
|
+
|
|
1684
|
+
def _english_plural_forms(word: str) -> list[str]:
    """Return *word* followed by one plausible English plural of it.

    The plural is built from the lowercased word using three rules: a ``y``
    preceded by a non-vowel (word longer than two characters) becomes
    ``-ies`` (``category`` → ``categories``); an ending of ``s``, ``sh``,
    ``ch``, ``x`` or ``z`` takes ``-es`` (``class`` → ``classes``); anything
    else takes ``-s`` (``product`` → ``products``). The first element is
    always *word* exactly as passed in.
    """
    lowered = word.lower()
    consonant_y = (
        len(lowered) > 2
        and lowered.endswith("y")
        and lowered[-2] not in "aeiou"
    )
    if consonant_y:
        plural = lowered[:-1] + "ies"
    else:
        sibilant = lowered.endswith(("s", "sh", "ch", "x", "z"))
        plural = lowered + ("es" if sibilant else "s")
    return [word, plural]
|
|
1698
|
+
|
|
1699
|
+
|
|
1700
|
+
def _word_is_column_component(
    word: str,
    intent_tables: set[str],
    schema: SchemaGraph,
) -> bool:
    """Return ``True`` when *word* is a component of a column name on any intent table.

    Column names are split on underscores and compared case-insensitively, so
    ``product`` matches a ``product_category`` column. Table names are
    resolved against the schema first as given and then case-insensitively,
    because callers may pass lowercased table names while the schema keeps its
    own casing; without the fallback such tables would be skipped and the
    suppression would never trigger.

    Args:
        word: Candidate word (typically a table name found in the question).
        intent_tables: Names of tables already present in the intent.
        schema: ``SchemaGraph`` providing table and column metadata.

    Returns:
        ``True`` if any column of any resolvable intent table has *word* as
        one of its underscore-separated parts, otherwise ``False``.
    """
    needle = word.lower()
    # Lowercase name -> schema's own spelling, for the case-insensitive fallback.
    canonical_names = {name.lower(): name for name in schema.tables}
    for table_name in intent_tables:
        tbl_meta = schema.tables.get(table_name)
        if not tbl_meta:
            canonical = canonical_names.get(table_name.lower())
            tbl_meta = schema.tables.get(canonical) if canonical is not None else None
        if not tbl_meta:
            continue
        if any(needle in col_name.lower().split("_") for col_name in tbl_meta.columns):
            return True
    return False
|
|
1719
|
+
|
|
1720
|
+
|
|
1721
|
+
def validate_question_table_mentions(
    natural_language: str,
    intent_tables: list[str],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Flag schema tables mentioned in the question but absent from intent tables.

    For every schema table not already in the intent (compared
    case-insensitively), the lowercased question is searched for the table
    name or its plural form as a whole word. A hit is suppressed when the
    table name is also an underscore-separated component of a column on an
    intent table, since the question is then probably naming that column.
    Remaining hits each yield one ``"warning"`` issue.

    Args:
        natural_language: Original user question text.
        intent_tables: Tables already used by the intent (``None`` is treated
            as empty).
        schema: ``SchemaGraph`` providing table names and column metadata.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances, one per unlisted mentioned table.
    """
    found: list[IntentIssue] = []
    if not natural_language or not schema:
        return found

    present = {name.lower() for name in (intent_tables or [])}
    question = natural_language.lower()

    for table_name in schema.tables:
        key = table_name.lower()
        if key in present:
            continue

        # Whole-word match on the table name or its plural.
        alternation = "|".join(map(re.escape, _english_plural_forms(key)))
        if not re.search(rf"\b(?:{alternation})\b", question):
            continue

        if _word_is_column_component(key, present, schema):
            debug(
                f"[validation_semantic.validate_question_table_mentions] "
                f"suppressed '{table_name}' — word is a column component "
                f"on an intent table"
            )
            continue

        issue = IntentIssue(
            issue_id=f"missing_table_{table_name}_{context}",
            category="missing_scoping_table",
            severity="warning",
            message=(
                f"Question mentions table '{table_name}' which "
                f"exists in the schema but is not included in "
                f"intent tables in {context}."
            ),
            context={"table": table_name, "location": context},
        )
        found.append(issue)
        debug(
            f"[validation_semantic.validate_question_table_mentions] table '{table_name}' in question but not in intent"
        )
    return found
|
|
1768
|
+
|
|
1769
|
+
|
|
1770
|
+
def validate_no_pk_fk_filters(
    filters_param: list[FilterParam],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Flag filters that operate directly on primary-key or foreign-key columns.

    For each filter, the primary column of the left expression and, when a
    right expression exists, of the right expression are inspected, provided
    they are qualified as ``table.column`` and known to the schema. A
    primary-key column yields an ``"error"``. A foreign-key column yields an
    ``"error"`` when the filter compares against a literal (``raw_value`` set
    and no ``right_expr``) and a ``"warning"`` otherwise.

    Args:
        filters_param: WHERE filter conditions to inspect.
        schema: ``SchemaGraph`` providing key metadata per column.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances, one per offending column reference.
    """
    found: list[IntentIssue] = []
    for flt in filters_param:
        # The left side is always considered; the right side only when present.
        candidates = [flt.left_expr.primary_column]
        if flt.right_expr is not None:
            candidates.append(flt.right_expr.primary_column)
        qualified = [ref for ref in candidates if ref and "." in ref]

        for col_ref in qualified:
            table, col = col_ref.split(".", 1)
            tbl_meta = schema.tables.get(table)
            col_meta = tbl_meta.columns.get(col) if tbl_meta else None
            if not col_meta:
                continue

            if col_meta.is_primary_key:
                pk_issue = IntentIssue(
                    issue_id=f"pk_fk_filter_{col_ref}_{context}",
                    category="pk_fk_filter",
                    severity="error",
                    message=(
                        f"Filter on primary-key column '{col_ref}' in "
                        f"{context}. Use a descriptive column on the "
                        f"target table instead of filtering by "
                        f"identifier."
                    ),
                    context={
                        "column": col_ref,
                        "kind": "primary-key",
                        "location": context,
                    },
                )
                found.append(pk_issue)
                debug(f"[validation_semantic.validate_no_pk_fk_filters] primary-key filter on '{col_ref}'")
            elif col_meta.is_foreign_key:
                # Literal comparisons on an FK are errors; column-to-column
                # comparisons are only warned about.
                compares_literal = flt.raw_value is not None and flt.right_expr is None
                if compares_literal:
                    severity = "error"
                else:
                    severity = "warning"
                fk_issue = IntentIssue(
                    issue_id=f"pk_fk_filter_{col_ref}_{context}",
                    category="pk_fk_filter",
                    severity=severity,
                    message=(
                        f"Filter on foreign-key column '{col_ref}' in "
                        f"{context}. Consider joining the referenced "
                        f"table and filtering on its descriptive "
                        f"column instead."
                    ),
                    context={
                        "column": col_ref,
                        "kind": "foreign-key",
                        "location": context,
                    },
                )
                found.append(fk_issue)
                debug(
                    f"[validation_semantic.validate_no_pk_fk_filters] "
                    f"foreign-key filter on '{col_ref}' severity={severity}"
                )
    return found
|
|
1846
|
+
|
|
1847
|
+
|
|
1848
|
+
def validate_threshold_missing_having(
    natural_language: str,
    select_cols: list[SelectCol],
    having_param: list[HavingParam],
    grain: str,
    context: str = "main",
) -> list[IntentIssue]:
    """Detect threshold phrases where aggregation exists but HAVING is absent.

    An ``"error"`` issue is produced only when all of the following hold: the
    grain is ``"grouped"``, at least one SELECT column is aggregated, there is
    no HAVING condition, and the question matches ``AGG_QUANTITY_RE``.

    Args:
        natural_language: Original user question text.
        select_cols: SELECT column list to check for existing aggregation.
        having_param: HAVING condition list.
        grain: Declared query grain from the intent.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances (at most one).
    """
    if not natural_language or grain != "grouped":
        return []
    if not any(col.is_aggregated for col in select_cols):
        return []
    if having_param:
        return []

    hit = AGG_QUANTITY_RE.search(natural_language)
    if not hit:
        return []

    phrase = hit.group()
    issue = IntentIssue(
        issue_id=f"threshold_missing_having_{context}",
        category="threshold_missing_having",
        severity="error",
        message=(
            f"Question contains threshold phrase '{phrase}' and "
            f"intent has aggregation, but no HAVING condition is defined. "
            f"Add a HAVING clause for the threshold."
        ),
        context={"matched_phrase": phrase, "location": context},
    )
    debug(f"[validation_semantic.validate_threshold_missing_having] threshold phrase '{phrase}' without HAVING")
    return [issue]
|
|
1899
|
+
|
|
1900
|
+
|
|
1901
|
+
def validate_count_threshold_missing_having(
    natural_language: str,
    tables: list[str],
    having_param: list[HavingParam],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Flag count-threshold phrases that lack a HAVING clause.

    The question is searched with ``COUNT_THRESHOLD_TABLE_RE``; group 1 is
    taken as the count and group 2 as the entity word, which must resolve to a
    schema table. If the intent has no HAVING condition at all, one
    ``"error"`` issue is returned whose message suggests a
    ``COUNT(DISTINCT ...)`` condition, naming the foreign-key column on the
    intent tables that references the entity table when one is found. Any
    existing HAVING condition suppresses the issue; its contents are not
    inspected.

    Args:
        natural_language: Original user question text.
        tables: Table list from the intent.
        having_param: Current HAVING conditions.
        schema: ``SchemaGraph`` providing FK metadata.
        context: Query context label for issue messages.

    Returns:
        List of ``IntentIssue`` instances (at most one).
    """
    nothing: list[IntentIssue] = []
    if not natural_language:
        return nothing

    hit = COUNT_THRESHOLD_TABLE_RE.search(natural_language)
    if not hit:
        return nothing

    threshold_count, threshold_word = hit.group(1), hit.group(2)
    threshold_table = _resolve_word_to_table(threshold_word, schema)
    if not threshold_table or having_param:
        return nothing

    fk_col = _find_fk_column_for_target(threshold_table, tables, schema)
    if fk_col:
        hint = f"COUNT(DISTINCT {fk_col}) = {threshold_count}"
    else:
        # No FK column located: fall back to a descriptive placeholder.
        hint = f"COUNT(DISTINCT <fk_column_referencing_{threshold_table}>) = {threshold_count}"

    issue = IntentIssue(
        issue_id=f"count_threshold_missing_having_{context}",
        category="count_threshold_missing_having",
        severity="error",
        message=(
            f"Question implies a count threshold of "
            f"{threshold_count} for entity '{threshold_table}' "
            f"but no HAVING clause is defined. Add a HAVING "
            f"condition with {hint}."
        ),
        context={
            "threshold_count": threshold_count,
            "threshold_table": threshold_table,
            "fk_column": fk_col or "",
            "location": context,
        },
    )
    debug(
        f"[validation_semantic.validate_count_threshold_missing_having] "
        f"count threshold {threshold_count} for '{threshold_table}' "
        f"without HAVING"
    )
    return [issue]
|
|
1974
|
+
|
|
1975
|
+
|
|
1976
|
+
def _resolve_word_to_table(
    word: str,
    schema: SchemaGraph,
) -> str | None:
    """Resolve a natural-language word to a schema table name.

    Matching is case-insensitive. An exact match on a table name wins;
    otherwise the first table (in schema iteration order) for which *word*
    appears among ``_english_plural_forms`` of the table name is returned.

    Args:
        word: Candidate word from the question.
        schema: ``SchemaGraph`` providing table names.

    Returns:
        Canonical schema table name, or ``None``.
    """
    needle = word.lower()
    canonical_by_lower = {name.lower(): name for name in schema.tables}

    if needle in canonical_by_lower:
        return canonical_by_lower[needle]

    # Fall back to comparing against each table name's plural forms.
    return next(
        (
            canonical
            for lowered, canonical in canonical_by_lower.items()
            if needle in _english_plural_forms(lowered)
        ),
        None,
    )
|
|
2001
|
+
|
|
2002
|
+
|
|
2003
|
+
def _find_fk_column_for_target(
    target_table: str,
    candidate_tables: list[str],
    schema: SchemaGraph,
) -> str | None:
    """Find an FK column on *candidate_tables* referencing *target_table*.

    Candidate tables are scanned in order, and each table's columns in the
    order the schema stores them; the first foreign-key column whose
    ``fk_target[0]`` equals *target_table* is returned. Tables missing from
    the schema and foreign keys without a recorded target are skipped.

    Args:
        target_table: The table being referenced by the FK.
        candidate_tables: Tables to search for FK columns. ``None`` is
            treated as an empty list so callers holding an intent without
            tables get ``None`` back instead of a ``TypeError``.
        schema: ``SchemaGraph`` providing column metadata.

    Returns:
        Qualified ``table.column`` string, or ``None``.
    """
    for tbl in candidate_tables or []:
        tbl_meta = schema.tables.get(tbl)
        if not tbl_meta:
            continue
        for col_name, col_meta in tbl_meta.columns.items():
            if not col_meta.is_foreign_key or not col_meta.fk_target:
                continue
            # fk_target[0] is compared with the target table name as-is.
            if col_meta.fk_target[0] == target_table:
                return f"{tbl}.{col_name}"
    return None
|
|
2030
|
+
|
|
2031
|
+
|
|
2032
|
+
# "for each X", "per X" or "each X"; group 1 captures one or two words after
# the trigger phrase.
_FOR_EACH_RE = re.compile(r"\b(?:for\s+each|per|each)\s+(\w+(?:\s+\w+)?)\b", flags=re.IGNORECASE)

# Aggregation-flavoured words ("average", "total", "number of", ...), matched
# as whole words regardless of case.
_PER_RATE_PREFIX_RE = re.compile(
    r"\b(?:average|avg|sum|total|count|min|max|mean|number\s+of|amount)\b",
    flags=re.IGNORECASE,
)
|
|
2041
|
+
|
|
2042
|
+
|
|
2043
|
+
def validate_for_each_grouping(
    natural_language: str,
    group_by_cols: list[NormalizedExpr],
    schema: SchemaGraph,
    has_aggregation: bool,
    context: str = "main",
) -> list[IntentIssue]:
    """Detect ``"for each X"`` patterns missing a corresponding GROUP BY.

    Fires only when the intent already contains aggregation context (aggregated select columns or HAVING conditions); for row-level queries, ``"for each X"`` simply means per row and should not force a GROUP BY. When the question contains phrases like ``"for each entity"`` or ``"per entity"`` but no column from the matching table appears in ``group_by_cols``, an error is raised; table resolution is delegated to ``_resolve_word_to_table``.

    The phrase pattern captures up to two words after the keyword, so the captured text may be an adjective plus noun (``"active customer"``) or a noun plus an unrelated trailing word (``"customer show"``). The full phrase is tried first, then the last word, then the first word, so both shapes resolve. ``"per X"`` phrases preceded (within 40 characters) by an aggregation keyword are treated as rate expressions and skipped. At most one issue is reported per table.

    Args:

        natural_language: Original user question text.
        group_by_cols: GROUP BY column list from the intent; ``None`` is treated as empty.
        schema: ``SchemaGraph`` providing table metadata.
        has_aggregation: Whether the intent contains any aggregated select columns or HAVING conditions.
        context: Query context label for issue messages.

    Returns:

        List of ``IntentIssue`` instances with severity ``"error"``.
    """
    issues: list[IntentIssue] = []
    if not natural_language or not has_aggregation:
        return issues
    nl_lower = natural_language.lower()

    # Lower-cased names of every table that already contributes a
    # qualified ("table.column") GROUP BY expression.
    gb_tables: set[str] = set()
    for g in group_by_cols or []:
        col = getattr(g, "primary_column", "")
        if col and "." in col:
            gb_tables.add(col.split(".", 1)[0].lower())

    # Tables already reported, so a phrase repeated in the question does
    # not yield several issues sharing one issue_id.
    reported: set[str] = set()

    for match in _FOR_EACH_RE.finditer(nl_lower):
        phrase = match.group()
        noun = match.group(1).strip()
        words = noun.split()

        # The regex greedily grabs two words when available, so try the
        # whole phrase, then the head noun of an "adjective noun" pair,
        # then the first word for "noun <trailing word>" captures such as
        # "customer show" or "customer in".
        candidates = [noun]
        if len(words) > 1:
            candidates.extend((words[-1], words[0]))
        matched_table = None
        for candidate in candidates:
            matched_table = _resolve_word_to_table(candidate, schema)
            if matched_table:
                break
        if not matched_table:
            continue
        if matched_table.lower() in gb_tables:
            continue

        # "average price per item" describes a rate, not a grouping.
        preceding = nl_lower[: match.start()]
        if phrase.startswith("per") and _PER_RATE_PREFIX_RE.search(preceding[-40:]):
            debug(
                f"[validation_semantic.validate_for_each_grouping] "
                f"skipping '{phrase}' — preceded by aggregation "
                f"keyword (rate context)"
            )
            continue

        if matched_table in reported:
            continue
        reported.add(matched_table)

        issues.append(
            IntentIssue(
                issue_id=f"for_each_missing_group_by_{matched_table}_{context}",
                category="for_each_grouping",
                severity="error",
                message=(
                    f"Question says '{phrase}' implying GROUP BY "
                    f"on table '{matched_table}', but no column from "
                    f"that table appears in group_by_cols. Add the "
                    f"identifying column to group_by_cols and set "
                    f"grain to 'grouped'."
                ),
                context={
                    "matched_phrase": phrase,
                    "table": matched_table,
                    "location": context,
                },
            )
        )
        debug(
            f"[validation_semantic.validate_for_each_grouping] "
            f"'{phrase}' → table '{matched_table}' missing "
            f"from group_by"
        )
    return issues
|