aetherdialect 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aetherdialect-0.1.0.dist-info/METADATA +197 -0
- aetherdialect-0.1.0.dist-info/RECORD +34 -0
- aetherdialect-0.1.0.dist-info/WHEEL +5 -0
- aetherdialect-0.1.0.dist-info/licenses/LICENSE +7 -0
- aetherdialect-0.1.0.dist-info/top_level.txt +1 -0
- text2sql/__init__.py +7 -0
- text2sql/config.py +1063 -0
- text2sql/contracts_base.py +952 -0
- text2sql/contracts_core.py +1890 -0
- text2sql/core_utils.py +834 -0
- text2sql/dialect.py +1134 -0
- text2sql/expansion_ops.py +1218 -0
- text2sql/expansion_rules.py +496 -0
- text2sql/intent_expr.py +1759 -0
- text2sql/intent_process.py +2133 -0
- text2sql/intent_repair.py +1733 -0
- text2sql/intent_resolve.py +1292 -0
- text2sql/live_testing.py +1117 -0
- text2sql/main_execution.py +799 -0
- text2sql/pipeline.py +1662 -0
- text2sql/qsim_ops.py +1286 -0
- text2sql/qsim_sample.py +609 -0
- text2sql/qsim_struct.py +569 -0
- text2sql/schema.py +973 -0
- text2sql/schema_profiling.py +2075 -0
- text2sql/simulator.py +970 -0
- text2sql/sql_gen.py +1537 -0
- text2sql/templates.py +1037 -0
- text2sql/text2sql.py +726 -0
- text2sql/utils.py +973 -0
- text2sql/validation_agg.py +1033 -0
- text2sql/validation_execute.py +1092 -0
- text2sql/validation_schema.py +1847 -0
- text2sql/validation_semantic.py +2122 -0
|
@@ -0,0 +1,1033 @@
|
|
|
1
|
+
"""Aggregation and type validation for SELECT, ORDER BY, and HAVING clauses.
|
|
2
|
+
|
|
3
|
+
Validates aggregation functions (COUNT, SUM, AVG, MIN, MAX) per column role and semantics: SUM/AVG require numeric columns; MIN/MAX on FREE_TEXT columns trigger warnings. Validates scalar function type compatibility (UPPER on string, ABS on numeric, YEAR on date). Provides heuristic column-type checks, expression arithmetic detection, and helpers for numeric-result detection (strip_function_wrappers, term_result_is_numeric, expr_result_is_numeric). Also validates temporal column presence and PK/FK aggregation misuse.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
from .config import (
|
|
11
|
+
NUMERIC_RESULT_AGGS,
|
|
12
|
+
NUMERIC_RESULT_SCALARS,
|
|
13
|
+
SCALAR_FUNCTIONS_AGG_COMPATIBLE,
|
|
14
|
+
SCALAR_FUNCTIONS_NUMERIC,
|
|
15
|
+
SCALAR_FUNCTIONS_STRING,
|
|
16
|
+
SCALAR_FUNCTIONS_TEMPORAL,
|
|
17
|
+
VALID_AGG_FUNCS,
|
|
18
|
+
)
|
|
19
|
+
from .contracts_base import ColumnRole, CteOutputColumnMeta, IntentIssue, SchemaGraph
|
|
20
|
+
from .contracts_core import HavingParam, NormalizedExpr, OrderByCol, SelectCol
|
|
21
|
+
from .core_utils import debug
|
|
22
|
+
from .validation_schema import (
|
|
23
|
+
extract_agg_col,
|
|
24
|
+
extract_col_from_scalar_wrapper,
|
|
25
|
+
extract_functions_from_term,
|
|
26
|
+
get_col_type,
|
|
27
|
+
is_col_numeric,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def validate_having_agg_per_role(
    having_param: list[HavingParam],
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] | None = None,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate that HAVING aggregation functions are valid for each column's role.

    The aggregate in each parameter's ``left_expr.primary_term`` is split with ``extract_agg_col`` into a function and a ``table.column`` target. When the table part names a CTE, the function is checked against that CTE column's ``valid_aggregations``; otherwise it is checked against ``get_valid_aggregations()`` of the schema column. ``*`` targets, unqualified targets, unknown tables and unknown columns are skipped without an issue.

    The function name is lower-cased for the membership test, as the SELECT and ORDER BY per-role validators do; the issue ID, message and context keep the name as extracted.

    Args:

        having_param: List of HavingParam instances to validate.
        schema: The SchemaGraph.
        cte_outputs: Dict of CTE name -> output column metadata.
        context: Label used in issue IDs and messages.

    Returns:

        List of IntentIssue objects.
    """
    if not having_param:
        return []
    issues: list[IntentIssue] = []
    cte_outputs = cte_outputs or {}
    for hp in having_param:
        agg_expr = hp.left_expr.primary_term
        if not agg_expr:
            continue
        result = extract_agg_col(agg_expr)
        # Only a (function, target, extra) triple can be interpreted here.
        if len(result) != 3:
            continue
        func, actual_target, _ = result
        if not func or not actual_target or actual_target == "*":
            continue
        if "." not in actual_target:
            continue
        func_lower = func.lower()
        table_name, col_name = actual_target.rsplit(".", 1)
        if table_name in cte_outputs:
            cte_cols = cte_outputs[table_name]
            wanted = col_name.lower()
            matched_key = next((c for c in cte_cols if c.lower() == wanted), None)
            if matched_key:
                cte_meta = cte_cols[matched_key]
                # An empty valid_aggregations set means "no restriction recorded".
                if cte_meta.valid_aggregations and func_lower not in cte_meta.valid_aggregations:
                    issues.append(
                        IntentIssue(
                            issue_id=f"having_agg_invalid_for_cte_{context}_{actual_target}_{func}",
                            category="having_validity",
                            severity="error",
                            message=f"Aggregation '{func.upper()}' not valid for CTE column '{actual_target}' (role={cte_meta.role}) in HAVING for {context}. Valid: {sorted(cte_meta.valid_aggregations)}",
                            context={
                                "column": actual_target,
                                "function": func,
                                "role": cte_meta.role,
                                "valid_aggs": sorted(cte_meta.valid_aggregations),
                                "location": context,
                            },
                        )
                    )
            # CTE-qualified targets are never looked up in the base schema.
            continue
        if table_name not in schema.tables:
            continue
        table_meta = schema.tables[table_name]
        col_meta = table_meta.columns.get(col_name) or table_meta.columns.get(col_name.lower())
        if not col_meta:
            continue
        valid_aggs = col_meta.get_valid_aggregations()
        if func_lower not in valid_aggs:
            issues.append(
                IntentIssue(
                    issue_id=f"having_agg_invalid_for_role_{context}_{actual_target}_{func}",
                    category="having_validity",
                    severity="error",
                    message=f"Aggregation '{func.upper()}' not valid for column '{actual_target}' (role={col_meta.role}) in HAVING for {context}. Valid: {sorted(valid_aggs)}",
                    context={
                        "column": actual_target,
                        "function": func,
                        "role": col_meta.role,
                        "valid_aggs": sorted(valid_aggs),
                        "location": context,
                    },
                )
            )
    debug(f"[validation_schema.validate_having_agg_per_role] {len(issues)} issues in {context}")
    return issues
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def validate_select_agg_per_role(
    select_cols: list[SelectCol],
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] | None = None,
    context: str = "main",
) -> list[IntentIssue]:
    """Check every aggregated SELECT column against the aggregations its role permits.

    Columns without an aggregation, ``*`` targets and unqualified column references are ignored. A reference whose table part names a CTE is checked against that CTE column's ``valid_aggregations`` (case-insensitive column match) and is never looked up in the base schema; any other reference is checked against ``get_valid_aggregations()`` of the schema column.

    Args:

        select_cols: List of SelectCol instances to validate.
        schema: The SchemaGraph.
        cte_outputs: Dict of CTE name -> output column metadata.
        context: Label used in issue IDs and messages.

    Returns:

        List of IntentIssue objects.
    """
    if not select_cols:
        return []

    known_ctes = cte_outputs or {}
    found = []

    for item in select_cols:
        _, agg_name = extract_functions_from_term(item.expr.primary_term)
        if not agg_name:
            continue
        raw_column = item.expr.primary_column
        if not raw_column:
            continue
        qualified = extract_col_from_scalar_wrapper(raw_column)
        if qualified == "*" or "." not in qualified:
            continue
        owner, column = qualified.rsplit(".", 1)

        if owner in known_ctes:
            outputs = known_ctes[owner]
            wanted = column.lower()
            hit = next((key for key in outputs if key.lower() == wanted), None)
            if hit:
                meta = outputs[hit]
                # No recorded aggregations means nothing to enforce for this CTE column.
                if meta.valid_aggregations and agg_name.lower() not in meta.valid_aggregations:
                    allowed = sorted(meta.valid_aggregations)
                    details = {
                        "column": qualified,
                        "function": agg_name,
                        "role": meta.role,
                        "valid_aggs": sorted(meta.valid_aggregations),
                        "location": context,
                    }
                    found.append(
                        IntentIssue(
                            issue_id=f"select_agg_invalid_for_cte_{context}_{qualified}_{agg_name}",
                            category="aggregation_validity",
                            severity="error",
                            message=f"Aggregation '{agg_name.upper()}' not valid for CTE column '{qualified}' (role={meta.role}) in {context}. Valid: {allowed}",
                            context=details,
                        )
                    )
            continue

        if owner not in schema.tables:
            continue
        columns = schema.tables[owner].columns
        column_meta = columns.get(column) or columns.get(column.lower())
        if not column_meta:
            continue

        permitted = column_meta.get_valid_aggregations()
        if agg_name.lower() in permitted:
            continue
        found.append(
            IntentIssue(
                issue_id=f"select_agg_invalid_for_role_{context}_{qualified}_{agg_name}",
                category="aggregation_validity",
                severity="error",
                message=f"Aggregation '{agg_name.upper()}' not valid for column '{qualified}' (role={column_meta.role}) in {context}. Valid: {sorted(permitted)}",
                context={
                    "column": qualified,
                    "function": agg_name,
                    "role": column_meta.role,
                    "valid_aggs": sorted(permitted),
                    "location": context,
                },
            )
        )

    debug(f"[validation_schema.validate_select_agg_per_role] {len(found)} issues in {context}")
    return found
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def validate_select_agg_semantics(
    select_cols: list[SelectCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate that SELECT aggregation functions are semantically appropriate for column types.

    Errors for SUM/AVG on columns whose ``value_type`` is not ``integer`` or ``number``; warns for MIN/MAX on columns that are neither numeric nor ``date`` and whose role is FREE_TEXT. Other aggregations, ``*`` targets, unqualified references and unknown tables or columns are skipped.

    The aggregation name is lower-cased before matching, and the column lookup falls back to the lower-cased column name, as the per-role validators do.

    Args:

        select_cols: List of SelectCol instances to validate.
        schema: The SchemaGraph.
        context: Label stored under ``location`` in each issue's context.

    Returns:

        List of IntentIssue objects.
    """
    if not select_cols:
        return []
    issues: list[IntentIssue] = []
    numeric_aggs = {"sum", "avg"}
    extreme_aggs = {"min", "max"}
    for sc in select_cols:
        _, agg_func = extract_functions_from_term(sc.expr.primary_term)
        if not agg_func:
            continue
        func_lower = agg_func.lower()
        if func_lower not in numeric_aggs and func_lower not in extreme_aggs:
            continue
        col_expr = sc.expr.primary_column
        if not col_expr:
            continue
        actual_col = extract_col_from_scalar_wrapper(col_expr)
        if actual_col == "*" or "." not in actual_col:
            continue
        table_name, col_name = actual_col.rsplit(".", 1)
        if table_name not in schema.tables:
            continue
        columns = schema.tables[table_name].columns
        # Fall back to the lower-cased name so mixed-case references are still validated.
        col_meta = columns.get(col_name) or columns.get(col_name.lower())
        if not col_meta:
            continue
        vt = col_meta.value_type
        numeric = vt in ("integer", "number")
        temporal = vt == "date"
        if func_lower in numeric_aggs and not numeric:
            issues.append(
                IntentIssue(
                    issue_id=f"invalid_agg_semantics_{func_lower}_{table_name}_{col_name}",
                    category="aggregation_semantics",
                    severity="error",
                    message=f"Cannot {func_lower.upper()} on {actual_col} (type={col_meta.data_type}): {func_lower.upper()} requires numeric column",
                    context={
                        "aggregation": func_lower,
                        "column": actual_col,
                        "data_type": col_meta.data_type,
                        "location": context,
                    },
                )
            )
            debug(f"[validation_schema.validate_select_agg_semantics] invalid {func_lower.upper()} on {actual_col}")
        elif func_lower in extreme_aggs and not numeric and not temporal:
            # MIN/MAX on non-numeric, non-date data is only flagged for free-text roles.
            if col_meta.role and col_meta.role == ColumnRole.FREE_TEXT.value:
                issues.append(
                    IntentIssue(
                        issue_id=f"questionable_agg_{func_lower}_{table_name}_{col_name}",
                        category="aggregation_semantics",
                        severity="warning",
                        message=f"Questionable {func_lower.upper()} on {actual_col} (type={col_meta.data_type}): {func_lower.upper()} on free text is semantically meaningless",
                        context={
                            "aggregation": func_lower,
                            "column": actual_col,
                            "data_type": col_meta.data_type,
                            "location": context,
                        },
                    )
                )
                debug(
                    f"[validation_schema.validate_select_agg_semantics] questionable {func_lower.upper()} on {actual_col}"
                )
    if issues:
        debug(f"[validation_schema.validate_select_agg_semantics] found {len(issues)} semantic issues")
    else:
        debug("[validation_schema.validate_select_agg_semantics] no semantic issues")
    return issues
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def validate_order_by_agg_per_role(
    order_by_cols: list[OrderByCol],
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] | None = None,
    context: str = "main",
) -> list[IntentIssue]:
    """Check every aggregated ORDER BY column against the aggregations its role permits.

    Entries without an aggregation, ``*`` targets and unqualified column references are ignored. A reference whose table part names a CTE is checked against that CTE column's ``valid_aggregations`` (case-insensitive column match) and is never looked up in the base schema; any other reference is checked against ``get_valid_aggregations()`` of the schema column.

    Args:

        order_by_cols: List of OrderByCol instances to validate.
        schema: The SchemaGraph.
        cte_outputs: Dict of CTE name -> output column metadata.
        context: Label used in issue IDs and messages.

    Returns:

        List of IntentIssue objects.
    """
    if not order_by_cols:
        return []

    known_ctes = cte_outputs or {}
    found = []

    for entry in order_by_cols:
        _, agg_name = extract_functions_from_term(entry.expr.primary_term)
        if not agg_name:
            continue
        raw_column = entry.expr.primary_column
        if not raw_column:
            continue
        qualified = extract_col_from_scalar_wrapper(raw_column)
        if qualified == "*" or "." not in qualified:
            continue
        owner, column = qualified.rsplit(".", 1)

        if owner in known_ctes:
            outputs = known_ctes[owner]
            wanted = column.lower()
            hit = next((key for key in outputs if key.lower() == wanted), None)
            if hit:
                meta = outputs[hit]
                # No recorded aggregations means nothing to enforce for this CTE column.
                if meta.valid_aggregations and agg_name.lower() not in meta.valid_aggregations:
                    allowed = sorted(meta.valid_aggregations)
                    details = {
                        "column": qualified,
                        "function": agg_name,
                        "role": meta.role,
                        "valid_aggs": sorted(meta.valid_aggregations),
                        "location": context,
                    }
                    found.append(
                        IntentIssue(
                            issue_id=f"order_by_agg_invalid_for_cte_{context}_{qualified}_{agg_name}",
                            category="aggregation_validity",
                            severity="error",
                            message=f"Aggregation '{agg_name.upper()}' not valid for CTE column '{qualified}' (role={meta.role}) in order_by for {context}. Valid: {allowed}",
                            context=details,
                        )
                    )
            continue

        if owner not in schema.tables:
            continue
        columns = schema.tables[owner].columns
        column_meta = columns.get(column) or columns.get(column.lower())
        if not column_meta:
            continue

        permitted = column_meta.get_valid_aggregations()
        if agg_name.lower() in permitted:
            continue
        found.append(
            IntentIssue(
                issue_id=f"order_by_agg_invalid_for_role_{context}_{qualified}_{agg_name}",
                category="aggregation_validity",
                severity="error",
                message=f"Aggregation '{agg_name.upper()}' not valid for column '{qualified}' (role={column_meta.role}) in order_by for {context}. Valid: {sorted(permitted)}",
                context={
                    "column": qualified,
                    "function": agg_name,
                    "role": column_meta.role,
                    "valid_aggs": sorted(permitted),
                    "location": context,
                },
            )
        )

    debug(f"[validation_schema.validate_order_by_agg_per_role] {len(found)} issues in {context}")
    return found
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def validate_order_by_agg_semantics(
    order_by_cols: list[OrderByCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate that ORDER BY aggregation functions are semantically appropriate for column types.

    Errors for SUM/AVG on columns whose ``value_type`` is not ``integer`` or ``number``; warns for MIN/MAX on columns that are neither numeric nor ``date`` and whose role is FREE_TEXT. Other aggregations, ``*`` targets, unqualified references and unknown tables or columns are skipped.

    The aggregation name is lower-cased before matching, and the column lookup falls back to the lower-cased column name, as the per-role validators do.

    Args:

        order_by_cols: List of OrderByCol instances to validate.
        schema: The SchemaGraph.
        context: Label stored under ``location`` in each issue's context.

    Returns:

        List of IntentIssue objects.
    """
    if not order_by_cols:
        return []
    issues: list[IntentIssue] = []
    numeric_aggs = {"sum", "avg"}
    extreme_aggs = {"min", "max"}
    for obc in order_by_cols:
        _, agg_func = extract_functions_from_term(obc.expr.primary_term)
        if not agg_func:
            continue
        func_lower = agg_func.lower()
        if func_lower not in numeric_aggs and func_lower not in extreme_aggs:
            continue
        col_expr = obc.expr.primary_column
        if not col_expr:
            continue
        actual_col = extract_col_from_scalar_wrapper(col_expr)
        if actual_col == "*" or "." not in actual_col:
            continue
        table_name, col_name = actual_col.rsplit(".", 1)
        if table_name not in schema.tables:
            continue
        columns = schema.tables[table_name].columns
        # Fall back to the lower-cased name so mixed-case references are still validated.
        col_meta = columns.get(col_name) or columns.get(col_name.lower())
        if not col_meta:
            continue
        vt = col_meta.value_type
        numeric = vt in ("integer", "number")
        temporal = vt == "date"
        if func_lower in numeric_aggs and not numeric:
            issues.append(
                IntentIssue(
                    issue_id=f"invalid_order_by_agg_semantics_{func_lower}_{table_name}_{col_name}",
                    category="aggregation_semantics",
                    severity="error",
                    message=f"Cannot {func_lower.upper()} on {actual_col} (type={col_meta.data_type}) in ORDER BY: {func_lower.upper()} requires numeric column",
                    context={
                        "aggregation": func_lower,
                        "column": actual_col,
                        "data_type": col_meta.data_type,
                        "location": context,
                    },
                )
            )
            debug(f"[validation_schema.validate_order_by_agg_semantics] invalid {func_lower.upper()} on {actual_col}")
        elif func_lower in extreme_aggs and not numeric and not temporal:
            # MIN/MAX on non-numeric, non-date data is only flagged for free-text roles.
            if col_meta.role and col_meta.role == ColumnRole.FREE_TEXT.value:
                issues.append(
                    IntentIssue(
                        issue_id=f"questionable_order_by_agg_{func_lower}_{table_name}_{col_name}",
                        category="aggregation_semantics",
                        severity="warning",
                        message=f"Questionable {func_lower.upper()} on {actual_col} (type={col_meta.data_type}) in ORDER BY: {func_lower.upper()} on free text is semantically meaningless",
                        context={
                            "aggregation": func_lower,
                            "column": actual_col,
                            "data_type": col_meta.data_type,
                            "location": context,
                        },
                    )
                )
                debug(
                    f"[validation_schema.validate_order_by_agg_semantics] questionable {func_lower.upper()} on {actual_col}"
                )
    if issues:
        debug(f"[validation_schema.validate_order_by_agg_semantics] found {len(issues)} semantic issues")
    else:
        debug("[validation_schema.validate_order_by_agg_semantics] no semantic issues")
    return issues
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def validate_scalar_func_type_semantics(
    select_cols: list[SelectCol],
    order_by_cols: list[OrderByCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Check scalar functions in SELECT and ORDER BY against aggregation context and column type.

    A scalar that wraps an aggregation must belong to ``SCALAR_FUNCTIONS_AGG_COMPATIBLE``; otherwise an error is reported and no type check is made. A scalar applied directly to a ``table.column`` reference found in the schema is reported when it belongs to the string, numeric or temporal function family and the column's ``value_type`` does not match that family. At most one issue is produced per expression.

    Args:

        select_cols: List of SelectCol instances to validate.
        order_by_cols: List of OrderByCol instances to validate.
        schema: The SchemaGraph.
        context: Label used in the debug log lines.

    Returns:

        List of IntentIssue objects.
    """

    def _inspect(scalar_func: str, col_expr: str, agg_func: str | None, location: str) -> list[IntentIssue]:
        # Returns zero or one issue for a single scalar-wrapped expression.
        fn_key = scalar_func.lower()

        if agg_func:
            if fn_key in SCALAR_FUNCTIONS_AGG_COMPATIBLE:
                return []
            permitted = sorted(SCALAR_FUNCTIONS_AGG_COMPATIBLE)
            return [
                IntentIssue(
                    issue_id=f"scalar_on_agg_invalid_{location}_{fn_key}",
                    category="scalar_semantics",
                    severity="error",
                    message=f"Scalar '{scalar_func}' cannot wrap aggregation '{agg_func.upper()}' in {location}. Only {permitted} allowed on aggregates",
                    context={
                        "scalar": scalar_func,
                        "aggregation": agg_func,
                        "location": location,
                        "allowed": sorted(SCALAR_FUNCTIONS_AGG_COMPATIBLE),
                    },
                )
            ]

        qualified = extract_col_from_scalar_wrapper(col_expr)
        if not qualified or "." not in qualified or qualified == "*":
            return []
        owner, column = qualified.rsplit(".", 1)
        if owner not in schema.tables:
            return []
        column_meta = schema.tables[owner].columns.get(column) or schema.tables[owner].columns.get(column.lower())
        if not column_meta:
            return []

        value_type = column_meta.value_type
        # (function family, column matches family, wording in message, expected_type value)
        rules = (
            (SCALAR_FUNCTIONS_STRING, value_type == "string", "string", "string"),
            (SCALAR_FUNCTIONS_NUMERIC, value_type in ("integer", "number"), "numeric", "numeric"),
            (SCALAR_FUNCTIONS_TEMPORAL, value_type == "date", "temporal", "date/timestamp"),
        )
        for family, type_matches, wording, expected in rules:
            if fn_key in family and not type_matches:
                # First mismatching family wins, mirroring an if/elif chain.
                return [
                    IntentIssue(
                        issue_id=f"scalar_type_mismatch_{location}_{fn_key}_{qualified}",
                        category="scalar_semantics",
                        severity="error",
                        message=f"Scalar '{scalar_func}' requires {wording} column, got '{qualified}' (type={column_meta.data_type}) in {location}",
                        context={
                            "scalar": scalar_func,
                            "column": qualified,
                            "data_type": column_meta.data_type,
                            "expected_type": expected,
                            "location": location,
                        },
                    )
                ]
        return []

    collected = []
    # SELECT entries are inspected before ORDER BY entries.
    for clause, entries in (("select_cols", select_cols), ("order_by_cols", order_by_cols)):
        for position, entry in enumerate(entries or []):
            scalar_name, agg_name = extract_functions_from_term(entry.expr.primary_term)
            if not scalar_name:
                continue
            collected.extend(_inspect(scalar_name, entry.expr.primary_column, agg_name, f"{clause}[{position}]"))

    if collected:
        debug(
            f"[validation_schema.validate_scalar_func_type_semantics] found {len(collected)} semantic issues in {context}"
        )
    else:
        debug(f"[validation_schema.validate_scalar_func_type_semantics] no semantic issues in {context}")
    return collected
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
def validate_column_types(
    select_cols: list[SelectCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate that operations match their column types (heuristic checks).

    Warns for numeric aggregations on text columns, date operations on non-date columns, and string operations on numeric columns. The column kind comes from ``value_type`` when it is set; otherwise it is guessed from substrings of the column name. Only aggregated, ``table.column``-qualified SELECT entries found in the schema are inspected.

    A ``None`` or empty ``select_cols`` yields no issues, and the function name is lower-cased before it is matched against the operation sets.

    Args:

        select_cols: List of SelectCol instances to inspect.
        schema: The SchemaGraph.
        context: Label stored under ``location`` in each issue's context.

    Returns:

        List of IntentIssue objects.
    """
    issues: list[IntentIssue] = []
    debug("[validation_schema.validate_column_types] checking type consistency")
    numeric_aggs = {"sum", "avg", "average", "total", "mean"}
    date_ops = {"latest", "earliest", "recent", "oldest", "newest", "before", "after"}
    string_ops = {"contains", "starts", "ends", "like", "match"}
    # Name-based fallbacks, used only when a column carries no value_type.
    numeric_hints = ("amount", "price", "total", "count", "qty", "quantity", "rate", "cost", "num")
    # NOTE(review): these are substring matches, so "at" also matches names such as "rate" or "status".
    date_hints = ("date", "time", "created", "updated", "at", "day", "year", "month")
    text_hints = ("name", "title", "description", "email", "address", "text")
    for sc in select_cols or []:
        _, agg_func = extract_functions_from_term(sc.expr.primary_term)
        if not agg_func:
            continue
        func_lower = agg_func.lower()
        col_expr = sc.expr.primary_column
        if not col_expr:
            continue
        actual_col = extract_col_from_scalar_wrapper(col_expr)
        if "." not in actual_col:
            continue
        table_name, col_name = actual_col.rsplit(".", 1)
        if table_name not in schema.tables:
            continue
        table_meta = schema.tables[table_name]
        col_lower = col_name.lower()
        col_meta = table_meta.columns.get(col_name) or table_meta.columns.get(col_lower)
        if not col_meta:
            continue
        vt = col_meta.value_type
        if vt:
            numeric = vt in ("integer", "number")
            date = vt == "date"
            text = vt == "string"
        else:
            numeric = any(hint in col_lower for hint in numeric_hints)
            date = any(hint in col_lower for hint in date_hints)
            text = any(hint in col_lower for hint in text_hints)
        if func_lower in numeric_aggs and text and not numeric:
            issues.append(
                IntentIssue(
                    issue_id=f"numeric_on_text_{table_name}_{col_name}",
                    category="type_mismatch",
                    severity="warning",
                    message=f"Attempting numeric aggregation ({func_lower}) on text column '{col_name}' (type: {col_meta.data_type})",
                    context={
                        "table": table_name,
                        "column": col_name,
                        "type": col_meta.data_type,
                        "agg": func_lower,
                        "location": context,
                    },
                )
            )
            debug("[validation_schema.validate_column_types] type_mismatch: numeric_on_text")
        if func_lower in date_ops and not date:
            issues.append(
                IntentIssue(
                    issue_id=f"date_on_non_date_{table_name}_{col_name}",
                    category="type_mismatch",
                    severity="warning",
                    message=f"Attempting date operation ({func_lower}) on non-date column '{col_name}' (type: {col_meta.data_type})",
                    context={
                        "table": table_name,
                        "column": col_name,
                        "type": col_meta.data_type,
                        "op": func_lower,
                        "location": context,
                    },
                )
            )
            debug("[validation_schema.validate_column_types] type_mismatch: date_on_non_date")
        # Columns whose name contains "_id" are exempt from the string-on-numeric warning.
        if func_lower in string_ops and numeric and "_id" not in col_lower:
            issues.append(
                IntentIssue(
                    issue_id=f"string_on_numeric_{table_name}_{col_name}",
                    category="type_mismatch",
                    severity="warning",
                    message=f"Attempting string operation ({func_lower}) on numeric column '{col_name}' (type: {col_meta.data_type})",
                    context={
                        "table": table_name,
                        "column": col_name,
                        "type": col_meta.data_type,
                        "op": func_lower,
                        "location": context,
                    },
                )
            )
            debug("[validation_schema.validate_column_types] TYPE MISMATCH: string op on numeric column")
    if issues:
        debug(f"[validation_schema.validate_column_types] FAILED with {len(issues)} issues")
    else:
        debug("[validation_schema.validate_column_types] PASSED")
    return issues
|
|
742
|
+
|
|
743
|
+
|
|
744
|
+
def expr_has_arithmetic(expr: NormalizedExpr) -> bool:
    """Report whether a ``NormalizedExpr`` involves any arithmetic.

    Args:
        expr: The normalised expression to inspect.

    Returns:
        ``True`` when the expression combines more than one add/sub group,
        carries added or subtracted constant values, or contains a group with
        a non-unit coefficient, a divisor, or more than one multiply term;
        ``False`` otherwise.
    """
    group_count = len(expr.add_groups) + len(expr.sub_groups)
    if group_count > 1 or expr.add_values or expr.sub_values:
        return True
    # At most one group is left at this point; inspect its internal structure.
    return any(
        grp.coefficient != 1.0 or grp.divide or len(grp.multiply) > 1
        for grp in expr.add_groups + expr.sub_groups
    )
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
def strip_function_wrappers(term: str) -> str:
    """Strip all nested function call wrappers to expose the innermost column reference.

    Each pass keeps the text between the first ``(`` and the last ``)`` and
    drops a leading ``DISTINCT`` keyword, until no ``(`` is left. Arguments are
    not split, so ``ROUND(t.col, 2)`` yields ``t.col, 2``.

    Args:
        term: A SQL term string potentially containing function wrappers such
            as ``UPPER(table.col)`` or ``ABS(SUM(table.col))``.

    Returns:
        The text remaining after all wrapping functions are removed. A term
        whose parentheses are unbalanced (for example ``UPPER(t.col``) is
        peeled best-effort instead of raising ``ValueError``.
    """
    distinct_prefix = "DISTINCT "
    while "(" in term:
        start = term.index("(")
        end = term.rfind(")")
        if end < start:
            # No closing parenthesis follows the opening one (rfind gave -1 or
            # an earlier position): keep everything after "(" rather than
            # crashing or slicing off the wrong characters.
            end = len(term)
        inner = term[start + 1 : end].strip()
        if inner.upper().startswith(distinct_prefix):
            inner = inner[len(distinct_prefix) :].strip()
        # `inner` is always shorter than `term`, so the loop terminates.
        term = inner
    return term
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
def term_result_is_numeric(term: str) -> bool:
    """Return ``True`` if a function wrapper in ``term`` is a known numeric-result function.

    Wrappers are examined from the outermost inward. The search stops with
    ``True`` at the first function name found in ``NUMERIC_RESULT_AGGS`` or
    ``NUMERIC_RESULT_SCALARS``; a wrapper that is in neither set is peeled off
    (together with a leading ``DISTINCT`` keyword) and its argument text is
    examined next.

    Args:
        term: A SQL term string, possibly containing nested function calls.

    Returns:
        ``True`` when any nesting level is a function listed in
        ``NUMERIC_RESULT_AGGS`` or ``NUMERIC_RESULT_SCALARS``; ``False`` once
        the remaining text no longer starts with a function call. A term with
        a missing closing parenthesis is parsed best-effort instead of raising
        ``ValueError``.
    """
    distinct_prefix = "DISTINCT "
    remaining = term.strip()
    while True:
        match = re.match(r"^\s*(\w+)\s*\(", remaining)
        if not match:
            return False
        func = match.group(1).lower()
        if func in NUMERIC_RESULT_AGGS or func in NUMERIC_RESULT_SCALARS:
            return True
        # The pattern is anchored at the start and ends on the first "(", so
        # match.end() is the index just past that parenthesis.
        inner_start = match.end()
        inner_end = remaining.rfind(")")
        if inner_end == -1:
            # Unbalanced term: treat the rest of the string as the argument.
            inner_end = len(remaining)
        remaining = remaining[inner_start:inner_end].strip()
        if remaining.upper().startswith(distinct_prefix):
            remaining = remaining[len(distinct_prefix) :].strip()
|
|
810
|
+
|
|
811
|
+
|
|
812
|
+
def expr_result_is_numeric(
    expr: NormalizedExpr,
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]],
) -> bool | None:
    """Return whether the result of a ``NormalizedExpr`` is numeric.

    Args:
        expr: The normalised expression to inspect.
        schema: Schema graph for resolving column types.
        cte_outputs: Map of CTE name to column output metadata.

    Returns:
        ``True`` if the expression is structurally numeric: a numeric-result
        aggregation or scalar on the expression or one of its groups,
        arithmetic structure, or a ``count``/``sum``/``avg`` primary term.
        Otherwise the value reported by ``is_col_numeric`` for the primary
        column when one is set, and ``None`` when there is no primary column.
    """
    if expr.agg_func and expr.agg_func in NUMERIC_RESULT_AGGS:
        return True
    if expr.scalar_func and expr.scalar_func in NUMERIC_RESULT_SCALARS:
        return True
    if expr.inner_scalar_func and expr.inner_scalar_func in NUMERIC_RESULT_SCALARS:
        return True
    # expr_has_arithmetic already returns True when add_values or sub_values
    # are present, so no separate constant-value check is needed afterwards.
    if expr_has_arithmetic(expr):
        return True
    for g in expr.add_groups + expr.sub_groups:
        if g.agg_func and g.agg_func in NUMERIC_RESULT_AGGS:
            return True
        if g.scalar_func and g.scalar_func in NUMERIC_RESULT_SCALARS:
            return True
        if g.inner_scalar_func and g.inner_scalar_func in NUMERIC_RESULT_SCALARS:
            return True
    if expr.has_aggregation:
        # Fall back to parsing the primary term text for the aggregate name.
        result = extract_agg_col(expr.primary_term)
        if len(result) == 3 and result[0] in {"count", "sum", "avg"}:
            return True
    col = expr.primary_column
    if col:
        return is_col_numeric(col, schema, cte_outputs)
    return None
|
|
855
|
+
|
|
856
|
+
|
|
857
|
+
def validate_scalar_expression_semantics(
    select_cols: list[SelectCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Check that scalar functions in SELECT wrap columns of a compatible type.

    Args:
        select_cols: SELECT column list to inspect for scalar function misuse.
        schema: Schema graph for resolving column types.
        context: Query context label stored in each issue and in the debug log.

    Returns:
        List of ``IntentIssue`` warnings: one for each numeric scalar applied
        to a non-numeric, non-aggregated column, and one for each string
        scalar applied to a non-string column.
    """
    debug("[validation_semantic.validate_scalar_expression_semantics] checking scalar semantics")
    numeric_scalars = {"abs", "round", "ceil", "floor", "sqrt"}
    string_scalars = {"upper", "lower", "trim", "ltrim", "rtrim", "length"}
    issues = []
    for sc in select_cols:
        func_lower, _second, _third = extract_agg_col(sc.expr.primary_term)
        # Only non-aggregate wrappers are of interest here.
        if not func_lower or func_lower in VALID_AGG_FUNCS:
            continue
        col_type = get_col_type(sc.expr.primary_column, schema, {})
        if not col_type:
            # Unknown column type: nothing can be checked.
            continue
        is_numeric = col_type in ("integer", "number")
        is_text = col_type == "string"
        if func_lower in numeric_scalars and not (is_numeric or sc.is_aggregated):
            numeric_details = {
                "column": sc.expr.primary_column,
                "scalar": func_lower,
                "type": col_type,
                "location": context,
            }
            issues.append(
                IntentIssue(
                    issue_id=f"numeric_scalar_on_non_numeric_{sc.expr.primary_column}_{func_lower}",
                    category="scalar_semantic",
                    severity="warning",
                    message=f"Numeric scalar '{func_lower}' on non-numeric column '{sc.expr.primary_column}' (type: {col_type})",
                    context=numeric_details,
                )
            )
        if func_lower in string_scalars and not is_text:
            string_details = {
                "column": sc.expr.primary_column,
                "scalar": func_lower,
                "type": col_type,
                "location": context,
            }
            issues.append(
                IntentIssue(
                    issue_id=f"string_scalar_on_non_string_{sc.expr.primary_column}_{func_lower}",
                    category="scalar_semantic",
                    severity="warning",
                    message=f"String scalar '{func_lower}' on non-string column '{sc.expr.primary_column}' (type: {col_type})",
                    context=string_details,
                )
            )
    debug(f"[validation_semantic.validate_scalar_expression_semantics] {len(issues)} issues in {context}")
    return issues
|
|
921
|
+
|
|
922
|
+
|
|
923
|
+
def _column_name_hints_temporal(col_name: str) -> bool:
    """Return ``True`` when a column name looks like it holds a date or time."""
    lowered = col_name.lower()
    if any(hint in lowered for hint in ("date", "time", "created", "updated")):
        return True
    # "at" only counts as its own word (``deleted_at``, ``deletedAt``); as a
    # bare substring it would also accept ``category``, ``status`` or ``rating``.
    if "at" in lowered.split("_"):
        return True
    return len(col_name) > 2 and col_name.endswith("At") and col_name[-3].islower()


def validate_temporal_columns(
    select_cols: list[SelectCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Warn when a temporal operation is requested but no date-like column is selected.

    The check only runs when an aggregated SELECT column's function name (as
    returned by ``extract_agg_col``) is one of the temporal keywords
    ``latest``, ``recent``, ``last``, ``first``, ``earliest``, ``oldest`` or
    ``newest``. It then looks for a SELECT column that resolves to a schema
    column whose ``value_type`` is ``"date"`` or whose name hints at a date or
    time.

    Args:
        select_cols: SELECT column list to inspect.
        schema: Schema graph for resolving column metadata.
        context: Query context label stored in the issue.

    Returns:
        An empty list when no temporal operation is used or a date-like column
        is present; otherwise a single ``missing_temporal_column`` warning.
    """
    temporal_ops = {"latest", "recent", "last", "first", "earliest", "oldest", "newest"}
    agg_funcs = {extract_agg_col(sc.expr.primary_term)[0] for sc in select_cols if sc.is_aggregated} - {None}
    # Sorted once so the issue id, message and context list the operations in
    # the same, reproducible order (set iteration order varies between runs).
    matched_ops = sorted(agg_funcs & temporal_ops)
    if not matched_ops:
        return []
    debug("[validation_semantic.validate_temporal_columns] checking temporal column presence")
    has_date_column = False
    for sc in select_cols:
        col_expr = sc.expr.primary_column
        if not col_expr:
            continue
        actual_col = extract_col_from_scalar_wrapper(col_expr)
        if "." not in actual_col:
            continue
        table_name, col_name = actual_col.rsplit(".", 1)
        if table_name not in schema.tables:
            continue
        col_meta = schema.tables[table_name].columns.get(col_name)
        # NOTE(review): the name-hint fallback is assumed to apply only to
        # columns found in the schema; confirm against the original nesting.
        if col_meta and (col_meta.value_type == "date" or _column_name_hints_temporal(col_name)):
            has_date_column = True
            break
    if has_date_column:
        return []
    # Same "{'a', 'b'}" shape the set repr produced, but in sorted order.
    ops_display = "{" + ", ".join(repr(op) for op in matched_ops) + "}"
    issue = IntentIssue(
        issue_id=f"temporal_no_date_col_{','.join(matched_ops)}",
        category="missing_temporal_column",
        severity="warning",
        message=f"Intent uses temporal operation ({ops_display}) but no date/time column identified",
        context={
            "temporal_ops": matched_ops,
            "location": context,
        },
    )
    debug("[validation_semantic.validate_temporal_columns] AMBIGUITY: temporal ops but no date column")
    return [issue]
|
|
979
|
+
|
|
980
|
+
|
|
981
|
+
def validate_pk_fk_aggregation(
    select_cols: list[SelectCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Flag ``SUM``/``AVG`` aggregations applied to primary-key or foreign-key columns.

    Args:
        select_cols: SELECT column list to inspect for PK/FK aggregation misuse.
        schema: Schema graph for resolving column metadata.
        context: Query context label stored in each issue.

    Returns:
        List of ``IntentIssue`` warnings, one per aggregated SELECT column
        whose function is ``sum`` or ``avg`` and whose resolved schema column
        is marked as a primary key or foreign key.
    """
    debug("[validation_semantic.validate_pk_fk_aggregation] checking PK/FK aggregation")
    suspicious_aggs = {"sum", "avg"}
    issues = []
    for sc in select_cols:
        if not sc.is_aggregated:
            continue
        func_lower, _second, _third = extract_agg_col(sc.expr.primary_term)
        if not (func_lower and func_lower in suspicious_aggs):
            continue
        col_expr = sc.expr.primary_column
        # An empty reference has no "." and is skipped by the next check.
        actual_col = extract_col_from_scalar_wrapper(col_expr) if col_expr else ""
        if "." not in actual_col:
            continue
        table_name, col_name = actual_col.rsplit(".", 1)
        if table_name in schema.tables:
            col_meta = schema.tables[table_name].columns.get(col_name)
            if col_meta and (col_meta.is_primary_key or col_meta.is_foreign_key):
                details = {
                    "table": table_name,
                    "column": col_name,
                    "agg": func_lower,
                    "location": context,
                }
                issues.append(
                    IntentIssue(
                        issue_id=f"agg_on_pk_fk_{table_name}_{col_name}_{func_lower}",
                        category="aggregation_semantics",
                        severity="warning",
                        message=f"{func_lower.upper()} on PK/FK column {actual_col} is suspicious",
                        context=details,
                    )
                )
                debug(f"[validation_semantic.validate_pk_fk_aggregation] {func_lower.upper()} on PK/FK: {actual_col}")
    return issues
|