aetherdialect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1033 @@
1
+ """Aggregation and type validation for SELECT, ORDER BY, and HAVING clauses.
2
+
3
+ Validates aggregation functions (COUNT, SUM, AVG, MIN, MAX) per column role and semantics: SUM/AVG require numeric columns; MIN/MAX on FREE_TEXT columns trigger warnings. Validates scalar function type compatibility (UPPER on string, ABS on numeric, YEAR on date). Provides heuristic column-type checks, expression arithmetic detection, and helpers for numeric-result detection (strip_function_wrappers, term_result_is_numeric, expr_result_is_numeric). Also validates temporal column presence and PK/FK aggregation misuse.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import re
9
+
10
+ from .config import (
11
+ NUMERIC_RESULT_AGGS,
12
+ NUMERIC_RESULT_SCALARS,
13
+ SCALAR_FUNCTIONS_AGG_COMPATIBLE,
14
+ SCALAR_FUNCTIONS_NUMERIC,
15
+ SCALAR_FUNCTIONS_STRING,
16
+ SCALAR_FUNCTIONS_TEMPORAL,
17
+ VALID_AGG_FUNCS,
18
+ )
19
+ from .contracts_base import ColumnRole, CteOutputColumnMeta, IntentIssue, SchemaGraph
20
+ from .contracts_core import HavingParam, NormalizedExpr, OrderByCol, SelectCol
21
+ from .core_utils import debug
22
+ from .validation_schema import (
23
+ extract_agg_col,
24
+ extract_col_from_scalar_wrapper,
25
+ extract_functions_from_term,
26
+ get_col_type,
27
+ is_col_numeric,
28
+ )
29
+
30
+
31
def validate_having_agg_per_role(
    having_param: list[HavingParam],
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] | None = None,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate that HAVING aggregation functions are valid for each column's role.

    The left-hand primary term of every HAVING condition is parsed with ``extract_agg_col``. Conditions without an aggregation, ``*`` targets and unqualified column names are skipped. A target whose table part names a CTE is checked against that CTE column's ``valid_aggregations``; any other target is looked up in ``schema.tables`` and checked against ``get_valid_aggregations()``. The function name is compared case-insensitively, matching the SELECT and ORDER BY role validators.

    Args:

        having_param: List of HavingParam instances to validate.
        schema: The SchemaGraph.
        cte_outputs: Dict of CTE name -> output column metadata.
        context: Label used in issue IDs and messages.

    Returns:

        List of IntentIssue objects; empty when no condition is invalid.
    """
    if not having_param:
        return []
    issues: list[IntentIssue] = []
    cte_outputs = cte_outputs or {}

    def report(id_tag: str, noun: str, target: str, func: str, role, allowed) -> None:
        # The CTE branch and the base-table branch emit the same issue shape;
        # only the id tag and the noun in the message differ.
        allowed_sorted = sorted(allowed)
        issues.append(
            IntentIssue(
                issue_id=f"having_agg_invalid_for_{id_tag}_{context}_{target}_{func}",
                category="having_validity",
                severity="error",
                message=f"Aggregation '{func.upper()}' not valid for {noun} '{target}' (role={role}) in HAVING for {context}. Valid: {allowed_sorted}",
                context={
                    "column": target,
                    "function": func,
                    "role": role,
                    "valid_aggs": allowed_sorted,
                    "location": context,
                },
            )
        )

    for hp in having_param:
        agg_expr = hp.left_expr.primary_term
        if not agg_expr:
            continue
        parsed = extract_agg_col(agg_expr)
        if len(parsed) != 3:
            continue
        func, actual_target, _ = parsed
        # A missing target is skipped instead of failing on the "." membership test.
        if not func or not actual_target or actual_target == "*" or "." not in actual_target:
            continue
        func_key = func.lower()
        table_name, col_name = actual_target.rsplit(".", 1)
        if table_name in cte_outputs:
            cte_cols = cte_outputs[table_name]
            wanted = col_name.lower()
            matched_key = next((name for name in cte_cols if name.lower() == wanted), None)
            if matched_key:
                cte_meta = cte_cols[matched_key]
                if cte_meta.valid_aggregations and func_key not in cte_meta.valid_aggregations:
                    report("cte", "CTE column", actual_target, func, cte_meta.role, cte_meta.valid_aggregations)
            # A CTE-qualified target is never looked up in the base schema.
            continue
        if table_name not in schema.tables:
            continue
        table_meta = schema.tables[table_name]
        col_meta = table_meta.columns.get(col_name) or table_meta.columns.get(col_name.lower())
        if not col_meta:
            continue
        valid_aggs = col_meta.get_valid_aggregations()
        if func_key not in valid_aggs:
            report("role", "column", actual_target, func, col_meta.role, valid_aggs)
    debug(f"[validation_schema.validate_having_agg_per_role] {len(issues)} issues in {context}")
    return issues
114
+
115
+
116
def validate_select_agg_per_role(
    select_cols: list[SelectCol],
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] | None = None,
    context: str = "main",
) -> list[IntentIssue]:
    """Check each aggregated SELECT column against the aggregations its role allows.

    Columns without an aggregation, ``*`` targets and unqualified names are skipped. CTE-qualified columns are checked against the CTE column metadata; all others against the schema column's ``get_valid_aggregations()``.

    Args:

        select_cols: List of SelectCol instances to validate.
        schema: The SchemaGraph.
        cte_outputs: Dict of CTE name -> output column metadata.
        context: Label used in issue IDs and messages.

    Returns:

        List of IntentIssue objects.
    """
    if not select_cols:
        return []
    found: list[IntentIssue] = []
    cte_lookup = cte_outputs or {}

    def flag(id_tag: str, noun: str, column: str, func: str, role, allowed) -> None:
        # Shared issue builder for the CTE and base-table branches.
        found.append(
            IntentIssue(
                issue_id=f"select_agg_invalid_for_{id_tag}_{context}_{column}_{func}",
                category="aggregation_validity",
                severity="error",
                message=f"Aggregation '{func.upper()}' not valid for {noun} '{column}' (role={role}) in {context}. Valid: {sorted(allowed)}",
                context={
                    "column": column,
                    "function": func,
                    "role": role,
                    "valid_aggs": sorted(allowed),
                    "location": context,
                },
            )
        )

    for item in select_cols:
        _scalar, agg_name = extract_functions_from_term(item.expr.primary_term)
        if not agg_name or not item.expr.primary_column:
            continue
        qualified = extract_col_from_scalar_wrapper(item.expr.primary_column)
        if qualified == "*" or "." not in qualified:
            continue
        owner, _, column = qualified.rpartition(".")

        if owner in cte_lookup:
            cte_columns = cte_lookup[owner]
            hit = next((key for key in cte_columns if key.lower() == column.lower()), None)
            if hit:
                meta = cte_columns[hit]
                allowed = meta.valid_aggregations
                if allowed and agg_name.lower() not in allowed:
                    flag("cte", "CTE column", qualified, agg_name, meta.role, allowed)
            # CTE-qualified columns are never resolved against base tables.
            continue

        if owner not in schema.tables:
            continue
        columns = schema.tables[owner].columns
        meta = columns.get(column) or columns.get(column.lower())
        if not meta:
            continue
        allowed = meta.get_valid_aggregations()
        if agg_name.lower() not in allowed:
            flag("role", "column", qualified, agg_name, meta.role, allowed)

    debug(f"[validation_schema.validate_select_agg_per_role] {len(found)} issues in {context}")
    return found
202
+
203
+
204
def validate_select_agg_semantics(
    select_cols: list[SelectCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate that SELECT aggregation functions are semantically appropriate for column types.

    Errors for SUM/AVG on non-numeric columns; warns for MIN/MAX on FREE_TEXT columns. Column type is taken from ``value_type`` ("integer"/"number" count as numeric, "date" as temporal). The aggregation name is compared case-insensitively and the column is looked up by its exact name first, then by its lower-cased name, matching the per-role validators.

    Args:

        select_cols: List of SelectCol instances to validate.
        schema: The SchemaGraph.
        context: Label used in issue IDs and messages.

    Returns:

        List of IntentIssue objects.
    """
    if not select_cols:
        return []
    issues: list[IntentIssue] = []
    numeric_aggs = {"sum", "avg"}
    extremum_aggs = {"min", "max"}
    for sc in select_cols:
        _, agg_func = extract_functions_from_term(sc.expr.primary_term)
        if not agg_func:
            continue
        func_lower = agg_func.lower()
        if func_lower not in numeric_aggs and func_lower not in extremum_aggs:
            continue
        col_expr = sc.expr.primary_column
        if not col_expr:
            continue
        actual_col = extract_col_from_scalar_wrapper(col_expr)
        if not actual_col or actual_col == "*" or "." not in actual_col:
            continue
        table_name, col_name = actual_col.rsplit(".", 1)
        if table_name not in schema.tables:
            continue
        columns = schema.tables[table_name].columns
        # Fall back to the lower-cased name so a mixed-case reference is still validated.
        col_meta = columns.get(col_name) or columns.get(col_name.lower())
        if not col_meta:
            continue
        vt = col_meta.value_type
        numeric = vt in ("integer", "number")
        temporal = vt == "date"
        func_upper = func_lower.upper()
        if func_lower in numeric_aggs and not numeric:
            issues.append(
                IntentIssue(
                    issue_id=f"invalid_agg_semantics_{func_lower}_{table_name}_{col_name}",
                    category="aggregation_semantics",
                    severity="error",
                    message=f"Cannot {func_upper} on {actual_col} (type={col_meta.data_type}): {func_upper} requires numeric column",
                    context={
                        "aggregation": func_lower,
                        "column": actual_col,
                        "data_type": col_meta.data_type,
                        "location": context,
                    },
                )
            )
            debug(f"[validation_schema.validate_select_agg_semantics] invalid {func_upper} on {actual_col}")
        elif func_lower in extremum_aggs and not numeric and not temporal:
            # MIN/MAX are only questioned on columns whose role is free text.
            if col_meta.role and col_meta.role == ColumnRole.FREE_TEXT.value:
                issues.append(
                    IntentIssue(
                        issue_id=f"questionable_agg_{func_lower}_{table_name}_{col_name}",
                        category="aggregation_semantics",
                        severity="warning",
                        message=f"Questionable {func_upper} on {actual_col} (type={col_meta.data_type}): {func_upper} on free text is semantically meaningless",
                        context={
                            "aggregation": func_lower,
                            "column": actual_col,
                            "data_type": col_meta.data_type,
                            "location": context,
                        },
                    )
                )
                debug(
                    f"[validation_schema.validate_select_agg_semantics] questionable {func_upper} on {actual_col}"
                )
    if issues:
        debug(f"[validation_schema.validate_select_agg_semantics] found {len(issues)} semantic issues")
    else:
        debug("[validation_schema.validate_select_agg_semantics] no semantic issues")
    return issues
292
+
293
+
294
def validate_order_by_agg_per_role(
    order_by_cols: list[OrderByCol],
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] | None = None,
    context: str = "main",
) -> list[IntentIssue]:
    """Check each aggregated ORDER BY column against the aggregations its role allows.

    Entries without an aggregation, ``*`` targets and unqualified names are skipped. CTE-qualified columns are checked against the CTE column metadata; all others against the schema column's ``get_valid_aggregations()``.

    Args:

        order_by_cols: List of OrderByCol instances to validate.
        schema: The SchemaGraph.
        cte_outputs: Dict of CTE name -> output column metadata.
        context: Label used in issue IDs and messages.

    Returns:

        List of IntentIssue objects.
    """
    if not order_by_cols:
        return []
    cte_lookup = cte_outputs or {}
    collected: list[IntentIssue] = []

    def build_issue(id_tag: str, noun: str, column: str, func: str, role, allowed) -> IntentIssue:
        # One issue shape serves both the CTE and the base-table branch.
        return IntentIssue(
            issue_id=f"order_by_agg_invalid_for_{id_tag}_{context}_{column}_{func}",
            category="aggregation_validity",
            severity="error",
            message=f"Aggregation '{func.upper()}' not valid for {noun} '{column}' (role={role}) in order_by for {context}. Valid: {sorted(allowed)}",
            context={
                "column": column,
                "function": func,
                "role": role,
                "valid_aggs": sorted(allowed),
                "location": context,
            },
        )

    for entry in order_by_cols:
        _scalar, agg_name = extract_functions_from_term(entry.expr.primary_term)
        if not agg_name:
            continue
        raw_column = entry.expr.primary_column
        if not raw_column:
            continue
        qualified = extract_col_from_scalar_wrapper(raw_column)
        if qualified == "*" or "." not in qualified:
            continue
        owner, _, column = qualified.rpartition(".")

        if owner not in cte_lookup:
            if owner not in schema.tables:
                continue
            table_columns = schema.tables[owner].columns
            column_meta = table_columns.get(column) or table_columns.get(column.lower())
            if not column_meta:
                continue
            permitted = column_meta.get_valid_aggregations()
            if agg_name.lower() not in permitted:
                collected.append(build_issue("role", "column", qualified, agg_name, column_meta.role, permitted))
            continue

        # CTE-qualified column: resolve case-insensitively within the CTE outputs only.
        cte_columns = cte_lookup[owner]
        hit = next((key for key in cte_columns if key.lower() == column.lower()), None)
        if not hit:
            continue
        cte_meta = cte_columns[hit]
        permitted = cte_meta.valid_aggregations
        if permitted and agg_name.lower() not in permitted:
            collected.append(build_issue("cte", "CTE column", qualified, agg_name, cte_meta.role, permitted))

    debug(f"[validation_schema.validate_order_by_agg_per_role] {len(collected)} issues in {context}")
    return collected
380
+
381
+
382
def validate_order_by_agg_semantics(
    order_by_cols: list[OrderByCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate that ORDER BY aggregation functions are semantically appropriate for column types.

    Errors for SUM/AVG on non-numeric columns; warns for MIN/MAX on FREE_TEXT columns. Column type is taken from ``value_type`` ("integer"/"number" count as numeric, "date" as temporal). The aggregation name is compared case-insensitively and the column is looked up by its exact name first, then by its lower-cased name, matching the per-role validators.

    Args:

        order_by_cols: List of OrderByCol instances to validate.
        schema: The SchemaGraph.
        context: Label used in issue IDs and messages.

    Returns:

        List of IntentIssue objects.
    """
    if not order_by_cols:
        return []
    issues: list[IntentIssue] = []
    numeric_aggs = {"sum", "avg"}
    extremum_aggs = {"min", "max"}
    for obc in order_by_cols:
        _, agg_func = extract_functions_from_term(obc.expr.primary_term)
        if not agg_func:
            continue
        func_lower = agg_func.lower()
        if func_lower not in numeric_aggs and func_lower not in extremum_aggs:
            continue
        col_expr = obc.expr.primary_column
        if not col_expr:
            continue
        actual_col = extract_col_from_scalar_wrapper(col_expr)
        if not actual_col or actual_col == "*" or "." not in actual_col:
            continue
        table_name, col_name = actual_col.rsplit(".", 1)
        if table_name not in schema.tables:
            continue
        columns = schema.tables[table_name].columns
        # Fall back to the lower-cased name so a mixed-case reference is still validated.
        col_meta = columns.get(col_name) or columns.get(col_name.lower())
        if not col_meta:
            continue
        vt = col_meta.value_type
        numeric = vt in ("integer", "number")
        temporal = vt == "date"
        func_upper = func_lower.upper()
        if func_lower in numeric_aggs and not numeric:
            issues.append(
                IntentIssue(
                    issue_id=f"invalid_order_by_agg_semantics_{func_lower}_{table_name}_{col_name}",
                    category="aggregation_semantics",
                    severity="error",
                    message=f"Cannot {func_upper} on {actual_col} (type={col_meta.data_type}) in ORDER BY: {func_upper} requires numeric column",
                    context={
                        "aggregation": func_lower,
                        "column": actual_col,
                        "data_type": col_meta.data_type,
                        "location": context,
                    },
                )
            )
            debug(f"[validation_schema.validate_order_by_agg_semantics] invalid {func_upper} on {actual_col}")
        elif func_lower in extremum_aggs and not numeric and not temporal:
            # MIN/MAX are only questioned on columns whose role is free text.
            if col_meta.role and col_meta.role == ColumnRole.FREE_TEXT.value:
                issues.append(
                    IntentIssue(
                        issue_id=f"questionable_order_by_agg_{func_lower}_{table_name}_{col_name}",
                        category="aggregation_semantics",
                        severity="warning",
                        message=f"Questionable {func_upper} on {actual_col} (type={col_meta.data_type}) in ORDER BY: {func_upper} on free text is semantically meaningless",
                        context={
                            "aggregation": func_lower,
                            "column": actual_col,
                            "data_type": col_meta.data_type,
                            "location": context,
                        },
                    )
                )
                debug(
                    f"[validation_schema.validate_order_by_agg_semantics] questionable {func_upper} on {actual_col}"
                )
    if issues:
        debug(f"[validation_schema.validate_order_by_agg_semantics] found {len(issues)} semantic issues")
    else:
        debug("[validation_schema.validate_order_by_agg_semantics] no semantic issues")
    return issues
470
+
471
+
472
def validate_scalar_func_type_semantics(
    select_cols: list[SelectCol],
    order_by_cols: list[OrderByCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate that scalar functions are appropriate for column types and aggregation context.

    Errors when a non-aggregate-compatible scalar wraps an aggregation, or when a type-specific scalar (string, numeric, temporal) is applied to the wrong column type. A scalar that wraps an aggregation is not type-checked against the column. Entries with no primary column are skipped.

    Args:

        select_cols: List of SelectCol instances to validate.
        order_by_cols: List of OrderByCol instances to validate.
        schema: The SchemaGraph.
        context: Label used in debug messages.

    Returns:

        List of IntentIssue objects.
    """
    issues: list[IntentIssue] = []

    def check_scalar_semantics(
        scalar_func: str, col_expr: str, agg_func: str | None, location: str
    ) -> list[IntentIssue]:
        inner_issues: list[IntentIssue] = []
        func_lower = scalar_func.lower()
        if agg_func:
            if func_lower not in SCALAR_FUNCTIONS_AGG_COMPATIBLE:
                inner_issues.append(
                    IntentIssue(
                        issue_id=f"scalar_on_agg_invalid_{location}_{func_lower}",
                        category="scalar_semantics",
                        severity="error",
                        message=f"Scalar '{scalar_func}' cannot wrap aggregation '{agg_func.upper()}' in {location}. Only {sorted(SCALAR_FUNCTIONS_AGG_COMPATIBLE)} allowed on aggregates",
                        context={
                            "scalar": scalar_func,
                            "aggregation": agg_func,
                            "location": location,
                            "allowed": sorted(SCALAR_FUNCTIONS_AGG_COMPATIBLE),
                        },
                    )
                )
            # The scalar sees the aggregate's result, so the column type is not checked.
            return inner_issues
        if not col_expr:
            # No primary column to resolve; the other validators skip this case too.
            return inner_issues
        actual_col = extract_col_from_scalar_wrapper(col_expr)
        if not actual_col or "." not in actual_col or actual_col == "*":
            return inner_issues
        table_name, col_name = actual_col.rsplit(".", 1)
        if table_name not in schema.tables:
            return inner_issues
        columns = schema.tables[table_name].columns
        col_meta = columns.get(col_name) or columns.get(col_name.lower())
        if not col_meta:
            return inner_issues
        vt = col_meta.value_type
        # (function family, column satisfies it, noun used in the message, expected_type value)
        type_rules = (
            (SCALAR_FUNCTIONS_STRING, vt == "string", "string", "string"),
            (SCALAR_FUNCTIONS_NUMERIC, vt in ("integer", "number"), "numeric", "numeric"),
            (SCALAR_FUNCTIONS_TEMPORAL, vt == "date", "temporal", "date/timestamp"),
        )
        for family, satisfied, noun, expected_type in type_rules:
            if func_lower in family and not satisfied:
                inner_issues.append(
                    IntentIssue(
                        issue_id=f"scalar_type_mismatch_{location}_{func_lower}_{actual_col}",
                        category="scalar_semantics",
                        severity="error",
                        message=f"Scalar '{scalar_func}' requires {noun} column, got '{actual_col}' (type={col_meta.data_type}) in {location}",
                        context={
                            "scalar": scalar_func,
                            "column": actual_col,
                            "data_type": col_meta.data_type,
                            "expected_type": expected_type,
                            "location": location,
                        },
                    )
                )
                # At most one mismatch is reported per scalar.
                break
        return inner_issues

    for idx, sc in enumerate(select_cols or []):
        sc_scalar, sc_agg = extract_functions_from_term(sc.expr.primary_term)
        if sc_scalar:
            issues.extend(check_scalar_semantics(sc_scalar, sc.expr.primary_column, sc_agg, f"select_cols[{idx}]"))
    for idx, obc in enumerate(order_by_cols or []):
        obc_scalar, obc_agg = extract_functions_from_term(obc.expr.primary_term)
        if obc_scalar:
            issues.extend(
                check_scalar_semantics(
                    obc_scalar,
                    obc.expr.primary_column,
                    obc_agg,
                    f"order_by_cols[{idx}]",
                )
            )
    if issues:
        debug(
            f"[validation_schema.validate_scalar_func_type_semantics] found {len(issues)} semantic issues in {context}"
        )
    else:
        debug(f"[validation_schema.validate_scalar_func_type_semantics] no semantic issues in {context}")
    return issues
605
+
606
+
607
def validate_column_types(
    select_cols: list[SelectCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate that operations match their column types (heuristic checks).

    Warns for numeric aggregations on text columns, date operations on non-date columns, and string operations on numeric columns. The column's ``value_type`` decides its kind when set; otherwise the kind is guessed from substrings of the column name. A missing or empty ``select_cols`` yields no issues.

    Args:

        select_cols: List of SelectCol instances to inspect.
        schema: The SchemaGraph.
        context: Label stored as ``location`` in each issue's context.

    Returns:

        List of IntentIssue objects.
    """
    issues: list[IntentIssue] = []
    debug("[validation_schema.validate_column_types] checking type consistency")
    numeric_aggs = {"sum", "avg", "average", "total", "mean"}
    date_ops = {"latest", "earliest", "recent", "oldest", "newest", "before", "after"}
    string_ops = {"contains", "starts", "ends", "like", "match"}
    # Name fragments used only when the column carries no value_type.
    numeric_hints = ("amount", "price", "total", "count", "qty", "quantity", "rate", "cost", "num")
    # NOTE(review): "at" is matched as a bare substring, so it also hits names such as "rate" - confirm intended.
    date_hints = ("date", "time", "created", "updated", "at", "day", "year", "month")
    text_hints = ("name", "title", "description", "email", "address", "text")
    for sc in select_cols or []:
        _, agg_func = extract_functions_from_term(sc.expr.primary_term)
        if not agg_func:
            continue
        func_lower = agg_func.lower()
        col_expr = sc.expr.primary_column
        if not col_expr:
            continue
        actual_col = extract_col_from_scalar_wrapper(col_expr)
        if not actual_col or "." not in actual_col:
            continue
        table_name, col_name = actual_col.rsplit(".", 1)
        if table_name not in schema.tables:
            continue
        table_meta = schema.tables[table_name]
        col_meta = table_meta.columns.get(col_name) or table_meta.columns.get(col_name.lower())
        if not col_meta:
            continue
        vt = col_meta.value_type
        name_lower = col_name.lower()
        if vt:
            numeric = vt in ("integer", "number")
            date = vt == "date"
            text = vt == "string"
        else:
            numeric = any(hint in name_lower for hint in numeric_hints)
            date = any(hint in name_lower for hint in date_hints)
            text = any(hint in name_lower for hint in text_hints)
        if func_lower in numeric_aggs and text and not numeric:
            issues.append(
                IntentIssue(
                    issue_id=f"numeric_on_text_{table_name}_{col_name}",
                    category="type_mismatch",
                    severity="warning",
                    message=f"Attempting numeric aggregation ({func_lower}) on text column '{col_name}' (type: {col_meta.data_type})",
                    context={
                        "table": table_name,
                        "column": col_name,
                        "type": col_meta.data_type,
                        "agg": func_lower,
                        "location": context,
                    },
                )
            )
            debug("[validation_schema.validate_column_types] type_mismatch: numeric_on_text")
        if func_lower in date_ops and not date:
            issues.append(
                IntentIssue(
                    issue_id=f"date_on_non_date_{table_name}_{col_name}",
                    category="type_mismatch",
                    severity="warning",
                    message=f"Attempting date operation ({func_lower}) on non-date column '{col_name}' (type: {col_meta.data_type})",
                    context={
                        "table": table_name,
                        "column": col_name,
                        "type": col_meta.data_type,
                        "op": func_lower,
                        "location": context,
                    },
                )
            )
            debug("[validation_schema.validate_column_types] type_mismatch: date_on_non_date")
        # Columns whose name contains "_id" are exempt from the string-on-numeric warning.
        if func_lower in string_ops and numeric and "_id" not in name_lower:
            issues.append(
                IntentIssue(
                    issue_id=f"string_on_numeric_{table_name}_{col_name}",
                    category="type_mismatch",
                    severity="warning",
                    message=f"Attempting string operation ({func_lower}) on numeric column '{col_name}' (type: {col_meta.data_type})",
                    context={
                        "table": table_name,
                        "column": col_name,
                        "type": col_meta.data_type,
                        "op": func_lower,
                        "location": context,
                    },
                )
            )
            debug("[validation_schema.validate_column_types] TYPE MISMATCH: string op on numeric column")
    if issues:
        debug(f"[validation_schema.validate_column_types] FAILED with {len(issues)} issues")
    else:
        debug("[validation_schema.validate_column_types] PASSED")
    return issues
742
+
743
+
744
def expr_has_arithmetic(expr: NormalizedExpr) -> bool:
    """Tell whether a ``NormalizedExpr`` involves any arithmetic.

    Args:

        expr: The normalised expression to inspect.

    Returns:

        ``True`` if the expression has more than one add/sub group, any add/sub constant values, or a group with a coefficient other than 1.0, a divide part, or more than one multiply term; ``False`` otherwise.
    """
    group_total = len(expr.add_groups) + len(expr.sub_groups)
    if group_total > 1 or expr.add_values or expr.sub_values:
        return True
    # At most one group is left here; arithmetic can only hide inside it.
    return any(
        group.coefficient != 1.0 or group.divide or len(group.multiply) > 1
        for group in [*expr.add_groups, *expr.sub_groups]
    )
763
+
764
+
765
def strip_function_wrappers(term: str) -> str:
    """Strip all nested function call wrappers to expose the innermost column reference.

    Each pass keeps the text between the first ``(`` and the last ``)``, trims it, and drops a leading ``DISTINCT`` keyword. A term with an opening parenthesis but no closing one is unwrapped to the end of the string instead of raising ``ValueError``.

    Args:

        term: A SQL term string potentially containing function wrappers such as ``UPPER(table.col)`` or ``ABS(SUM(table.col))``.

    Returns:

        The bare column reference string after all wrapping functions are removed. A term without parentheses is returned unchanged.
    """
    while "(" in term:
        start = term.index("(")
        end = term.rfind(")")
        if end == -1:
            # Unbalanced wrapper: treat the rest of the string as the argument.
            end = len(term)
        inner = term[start + 1 : end].strip()
        if inner.upper().startswith("DISTINCT "):
            inner = inner[9:].strip()
        term = inner
    return term
784
+
785
+
786
def term_result_is_numeric(term: str) -> bool:
    """Return ``True`` if function wrappers guarantee a numeric result regardless of column type.

    The term is unwrapped from the outside in. The walk stops with ``True`` at the first wrapper whose name is in ``NUMERIC_RESULT_AGGS`` or ``NUMERIC_RESULT_SCALARS``, so a numeric function nested inside a non-numeric one also counts. A term with unbalanced parentheses yields ``False`` instead of raising ``ValueError``.

    Args:

        term: A SQL term string, possibly containing nested function calls.

    Returns:

        ``True`` when any wrapper in the chain is a known numeric-result aggregation or scalar; ``False`` when no such wrapper is found, including when the term has no function call at all.
    """
    remaining = term.strip()
    while True:
        match = re.match(r"^\s*(\w+)\s*\(", remaining)
        if not match:
            return False
        func = match.group(1).lower()
        if func in NUMERIC_RESULT_AGGS or func in NUMERIC_RESULT_SCALARS:
            return True
        # The pattern ends on the first "(", so match.end() is the argument start.
        inner_start = match.end()
        inner_end = remaining.rfind(")")
        if inner_end < inner_start:
            # Missing or misplaced ")": nothing left to unwrap.
            return False
        remaining = remaining[inner_start:inner_end].strip()
        if remaining.upper().startswith("DISTINCT "):
            remaining = remaining[9:].strip()
810
+
811
+
812
def expr_result_is_numeric(
    expr: NormalizedExpr,
    schema: SchemaGraph,
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]],
) -> bool | None:
    """Return whether the result of a ``NormalizedExpr`` is numeric.

    Args:

        expr: The normalised expression to inspect.
        schema: Schema graph for resolving column types.
        cte_outputs: Map of CTE name to column output metadata.

    Returns:

        ``True`` if the expression provably produces a numeric result (numeric aggregation, numeric scalar, arithmetic structure including add/sub constants, or a COUNT/SUM/AVG primary term). Otherwise the result of ``is_col_numeric`` for the primary column, or ``None`` when the expression has no primary column.
    """

    def has_numeric_wrapper(node) -> bool:
        # Applies to the expression itself and to each of its add/sub groups.
        return bool(
            (node.agg_func and node.agg_func in NUMERIC_RESULT_AGGS)
            or (node.scalar_func and node.scalar_func in NUMERIC_RESULT_SCALARS)
            or (node.inner_scalar_func and node.inner_scalar_func in NUMERIC_RESULT_SCALARS)
        )

    # expr_has_arithmetic already returns True for add/sub constant values,
    # so they need no separate check here.
    if has_numeric_wrapper(expr) or expr_has_arithmetic(expr):
        return True
    if any(has_numeric_wrapper(group) for group in expr.add_groups + expr.sub_groups):
        return True
    if expr.has_aggregation:
        parsed = extract_agg_col(expr.primary_term)
        if len(parsed) == 3 and parsed[0] in {"count", "sum", "avg"}:
            return True
    col = expr.primary_column
    if col:
        return is_col_numeric(col, schema, cte_outputs)
    return None
855
+
856
+
857
def validate_scalar_expression_semantics(
    select_cols: list[SelectCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Validate that scalar functions are applied to semantically appropriate column types.

    Only SELECT columns are inspected. The outer function of each primary term is taken from ``extract_agg_col``; terms whose outer function is missing or is an aggregation are skipped, as are terms the parser does not return as a 3-tuple. Columns whose type cannot be resolved are skipped. A missing or empty ``select_cols`` yields no issues.

    Args:

        select_cols: SELECT column list to inspect for scalar function misuse.
        schema: Schema graph for resolving column types and roles.
        context: Query context label stored as ``location`` in each issue.

    Returns:

        List of ``IntentIssue`` instances describing scalar function semantic violations.
    """
    issues: list[IntentIssue] = []
    debug("[validation_semantic.validate_scalar_expression_semantics] checking scalar semantics")
    numeric_scalars = {"abs", "round", "ceil", "floor", "sqrt"}
    string_scalars = {"upper", "lower", "trim", "ltrim", "rtrim", "length"}
    for sc in select_cols or []:
        parsed = extract_agg_col(sc.expr.primary_term)
        # Other callers treat a non-3-tuple as "not parseable"; do the same instead of failing on unpack.
        if len(parsed) != 3:
            continue
        outer_func = parsed[0]
        if not outer_func or outer_func in VALID_AGG_FUNCS:
            continue
        func_lower = outer_func
        column = sc.expr.primary_column
        col_type = get_col_type(column, schema, {})
        if not col_type:
            continue
        numeric = col_type in ("integer", "number")
        text = col_type == "string"
        if func_lower in numeric_scalars and not numeric and not sc.is_aggregated:
            issues.append(
                IntentIssue(
                    issue_id=f"numeric_scalar_on_non_numeric_{column}_{func_lower}",
                    category="scalar_semantic",
                    severity="warning",
                    message=f"Numeric scalar '{func_lower}' on non-numeric column '{column}' (type: {col_type})",
                    context={
                        "column": column,
                        "scalar": func_lower,
                        "type": col_type,
                        "location": context,
                    },
                )
            )
        if func_lower in string_scalars and not text:
            issues.append(
                IntentIssue(
                    issue_id=f"string_scalar_on_non_string_{column}_{func_lower}",
                    category="scalar_semantic",
                    severity="warning",
                    message=f"String scalar '{func_lower}' on non-string column '{column}' (type: {col_type})",
                    context={
                        "column": column,
                        "scalar": func_lower,
                        "type": col_type,
                        "location": context,
                    },
                )
            )
    debug(f"[validation_semantic.validate_scalar_expression_semantics] {len(issues)} issues in {context}")
    return issues
921
+
922
+
923
# Substrings that mark a column name as date/time-like wherever they occur.
_TEMPORAL_NAME_SUBSTRINGS = ("date", "time", "created", "updated")
# Splits snake_case, camelCase and ALL_CAPS identifiers into word tokens.
_NAME_TOKEN_RE = re.compile(r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+|[0-9]+")


def _name_suggests_temporal(col_name: str) -> bool:
    """Return True when ``col_name`` looks like the name of a date/time column."""
    lowered = col_name.lower()
    if any(hint in lowered for hint in _TEMPORAL_NAME_SUBSTRINGS):
        return True
    # "at" only counts as a whole word (``deleted_at``, ``deletedAt``); as a
    # bare substring it would also match "category", "status", "rating", ...
    return any(token.lower() == "at" for token in _NAME_TOKEN_RE.findall(col_name))


def validate_temporal_columns(
    select_cols: list[SelectCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Warn when a temporal operation is requested but no date/time column is selected.

    The check only runs when the function name extracted from an aggregated
    SELECT column is one of the temporal operations (latest, recent, last,
    first, earliest, oldest, newest). A SELECT column satisfies the check when
    it resolves to a schema column whose ``value_type`` is ``"date"`` or whose
    name looks date/time-like.

    Args:

        select_cols: SELECT column list to inspect.
        schema: Schema graph for resolving column metadata.
        context: Query context label stored under ``location`` in the issue.

    Returns:

        Empty list when no temporal operation is used or a date/time column is
        present; otherwise a single warning ``IntentIssue`` of category
        ``missing_temporal_column``.
    """
    temporal_ops = {"latest", "recent", "last", "first", "earliest", "oldest", "newest"}
    agg_funcs = {extract_agg_col(sc.expr.primary_term)[0] for sc in select_cols if sc.is_aggregated} - {None}
    # Sorted so the issue id, message and context are deterministic across runs.
    used_ops = sorted(agg_funcs & temporal_ops)
    if not used_ops:
        return []
    debug("[validation_semantic.validate_temporal_columns] checking temporal column presence")
    for sc in select_cols:
        col_expr = sc.expr.primary_column
        if not col_expr:
            continue
        actual_col = extract_col_from_scalar_wrapper(col_expr)
        if "." not in actual_col:
            continue
        table_name, col_name = actual_col.rsplit(".", 1)
        if table_name not in schema.tables:
            continue
        col_meta = schema.tables[table_name].columns.get(col_name)
        if not col_meta:
            continue
        if col_meta.value_type == "date" or _name_suggests_temporal(col_name):
            # One date/time column is enough to satisfy the check.
            return []
    # Rendered like a set literal, matching the original message format.
    ops_repr = "{" + ", ".join(repr(op) for op in used_ops) + "}"
    issue = IntentIssue(
        issue_id=f"temporal_no_date_col_{','.join(used_ops)}",
        category="missing_temporal_column",
        severity="warning",
        message=f"Intent uses temporal operation ({ops_repr}) but no date/time column identified",
        context={
            "temporal_ops": list(used_ops),
            "location": context,
        },
    )
    debug("[validation_semantic.validate_temporal_columns] AMBIGUITY: temporal ops but no date column")
    return [issue]
979
+
980
+
981
def validate_pk_fk_aggregation(
    select_cols: list[SelectCol],
    schema: SchemaGraph,
    context: str = "main",
) -> list[IntentIssue]:
    """Flag SUM/AVG aggregations whose target is a primary-key or foreign-key column.

    Args:

        select_cols: SELECT column list to scan for aggregated key columns.
        schema: Schema graph supplying per-table column metadata.
        context: Query context label stored under ``location`` in each issue.

    Returns:

        One warning ``IntentIssue`` per aggregated SELECT column that applies
        ``SUM`` or ``AVG`` to a column marked as primary or foreign key.
    """
    debug("[validation_semantic.validate_pk_fk_aggregation] checking PK/FK aggregation")
    flagged: list[IntentIssue] = []
    for select_col in select_cols:
        if not select_col.is_aggregated:
            continue
        agg_name, _, _ = extract_agg_col(select_col.expr.primary_term)
        if agg_name not in ("sum", "avg"):
            continue
        raw_column = select_col.expr.primary_column
        # An absent column yields "", which fails the qualification test below.
        qualified = extract_col_from_scalar_wrapper(raw_column) if raw_column else ""
        if "." not in qualified:
            continue
        table_name, _, col_name = qualified.rpartition(".")
        if table_name in schema.tables:
            meta = schema.tables[table_name].columns.get(col_name)
            if not meta or not (meta.is_primary_key or meta.is_foreign_key):
                continue
            details = {
                "table": table_name,
                "column": col_name,
                "agg": agg_name,
                "location": context,
            }
            flagged.append(
                IntentIssue(
                    issue_id=f"agg_on_pk_fk_{table_name}_{col_name}_{agg_name}",
                    category="aggregation_semantics",
                    severity="warning",
                    message=f"{agg_name.upper()} on PK/FK column {qualified} is suspicious",
                    context=details,
                )
            )
            debug(f"[validation_semantic.validate_pk_fk_aggregation] {agg_name.upper()} on PK/FK: {qualified}")
    return flagged