aetherdialect 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aetherdialect-0.1.0.dist-info/METADATA +197 -0
- aetherdialect-0.1.0.dist-info/RECORD +34 -0
- aetherdialect-0.1.0.dist-info/WHEEL +5 -0
- aetherdialect-0.1.0.dist-info/licenses/LICENSE +7 -0
- aetherdialect-0.1.0.dist-info/top_level.txt +1 -0
- text2sql/__init__.py +7 -0
- text2sql/config.py +1063 -0
- text2sql/contracts_base.py +952 -0
- text2sql/contracts_core.py +1890 -0
- text2sql/core_utils.py +834 -0
- text2sql/dialect.py +1134 -0
- text2sql/expansion_ops.py +1218 -0
- text2sql/expansion_rules.py +496 -0
- text2sql/intent_expr.py +1759 -0
- text2sql/intent_process.py +2133 -0
- text2sql/intent_repair.py +1733 -0
- text2sql/intent_resolve.py +1292 -0
- text2sql/live_testing.py +1117 -0
- text2sql/main_execution.py +799 -0
- text2sql/pipeline.py +1662 -0
- text2sql/qsim_ops.py +1286 -0
- text2sql/qsim_sample.py +609 -0
- text2sql/qsim_struct.py +569 -0
- text2sql/schema.py +973 -0
- text2sql/schema_profiling.py +2075 -0
- text2sql/simulator.py +970 -0
- text2sql/sql_gen.py +1537 -0
- text2sql/templates.py +1037 -0
- text2sql/text2sql.py +726 -0
- text2sql/utils.py +973 -0
- text2sql/validation_agg.py +1033 -0
- text2sql/validation_execute.py +1092 -0
- text2sql/validation_schema.py +1847 -0
- text2sql/validation_semantic.py +2122 -0
|
@@ -0,0 +1,1092 @@
|
|
|
1
|
+
"""SQL execution, validation, confidence scoring, and CTE chain validation.
|
|
2
|
+
|
|
3
|
+
Executes SQL via the configured dialect and validates safety
|
|
4
|
+
(SELECT-only enforcement) and structural integrity of CTE chains
|
|
5
|
+
(dependency order, join paths, table necessity, output column
|
|
6
|
+
types, cardinality). Also provides the confidence scoring formula
|
|
7
|
+
and the LLM-based rejection classifier used during user feedback
|
|
8
|
+
collection.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from .config import EngineConfig, PolicyConfig
|
|
17
|
+
from .schema_profiling import inject_partition_filters
|
|
18
|
+
from .contracts_base import (
|
|
19
|
+
CteOutputColumnMeta,
|
|
20
|
+
IntentIssue,
|
|
21
|
+
IntentValidationResult,
|
|
22
|
+
SchemaGraph,
|
|
23
|
+
)
|
|
24
|
+
from .contracts_core import (
|
|
25
|
+
RuntimeCteStep,
|
|
26
|
+
RuntimeIntent,
|
|
27
|
+
SelectCol,
|
|
28
|
+
)
|
|
29
|
+
from .core_utils import canonicalize_sql, debug, llm_json, stable_json, substitute_params
|
|
30
|
+
from .dialect import Dialect, get_dialect
|
|
31
|
+
from .validation_agg import (
|
|
32
|
+
validate_column_types,
|
|
33
|
+
validate_having_agg_per_role,
|
|
34
|
+
validate_order_by_agg_per_role,
|
|
35
|
+
validate_order_by_agg_semantics,
|
|
36
|
+
validate_pk_fk_aggregation,
|
|
37
|
+
validate_scalar_expression_semantics,
|
|
38
|
+
validate_scalar_func_type_semantics,
|
|
39
|
+
validate_select_agg_per_role,
|
|
40
|
+
validate_select_agg_semantics,
|
|
41
|
+
validate_temporal_columns,
|
|
42
|
+
)
|
|
43
|
+
from .validation_schema import (
|
|
44
|
+
validate_date_diff_units,
|
|
45
|
+
validate_date_window_units,
|
|
46
|
+
validate_filter_ops_per_column,
|
|
47
|
+
validate_filter_value_type_alignment,
|
|
48
|
+
validate_filters_schema,
|
|
49
|
+
validate_group_by_cols_schema,
|
|
50
|
+
validate_having_ops_per_column,
|
|
51
|
+
validate_having_schema,
|
|
52
|
+
validate_no_between_ops,
|
|
53
|
+
validate_null_filters,
|
|
54
|
+
validate_order_by_cols_schema,
|
|
55
|
+
validate_select_cols_schema,
|
|
56
|
+
)
|
|
57
|
+
from .validation_semantic import (
|
|
58
|
+
validate_agg_vs_agg_having,
|
|
59
|
+
validate_arith_expression_semantics,
|
|
60
|
+
validate_count_threshold_missing_having,
|
|
61
|
+
validate_cte_dependency_grains,
|
|
62
|
+
validate_cte_grain_consistency,
|
|
63
|
+
validate_expr_vs_expr_filters,
|
|
64
|
+
validate_filter_expr_types,
|
|
65
|
+
validate_filter_no_aggregation,
|
|
66
|
+
validate_for_each_grouping,
|
|
67
|
+
validate_grain_consistency,
|
|
68
|
+
validate_grouped_requires_aggregation,
|
|
69
|
+
validate_having_expr_types,
|
|
70
|
+
validate_having_requires_aggregation,
|
|
71
|
+
validate_mixed_aggregation_in_mulgroup,
|
|
72
|
+
validate_no_nested_aggregation,
|
|
73
|
+
validate_no_pk_fk_filters,
|
|
74
|
+
validate_order_by_aggregation_context,
|
|
75
|
+
validate_order_by_expr_types,
|
|
76
|
+
validate_question_agg_keyword_coverage,
|
|
77
|
+
validate_question_aggregation_hint,
|
|
78
|
+
validate_question_distinct_hint,
|
|
79
|
+
validate_question_numeric_coverage,
|
|
80
|
+
validate_question_table_mentions,
|
|
81
|
+
validate_select_expr_types,
|
|
82
|
+
validate_select_group_by_membership,
|
|
83
|
+
validate_semantic_contradictions,
|
|
84
|
+
validate_threshold_missing_having,
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _enforce_select_only(sql: str) -> tuple[bool, str]:
|
|
89
|
+
"""Check if SQL is a safe SELECT-only statement.
|
|
90
|
+
|
|
91
|
+
Args:
|
|
92
|
+
|
|
93
|
+
sql: The SQL string to check.
|
|
94
|
+
|
|
95
|
+
Returns:
|
|
96
|
+
|
|
97
|
+
Tuple of ``(ok, reason)`` where ``ok`` is True for safe SELECT statements, and ``reason`` is ``'ok'``, ``'not_select'``, or ``'forbidden_sql'``.
|
|
98
|
+
"""
|
|
99
|
+
s = sql.lower().strip()
|
|
100
|
+
if not s.startswith("select"):
|
|
101
|
+
return False, "not_select"
|
|
102
|
+
for p in PolicyConfig.FORBIDDEN_SQL:
|
|
103
|
+
if re.search(p, s, re.IGNORECASE):
|
|
104
|
+
return False, "forbidden_sql"
|
|
105
|
+
return True, "ok"
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def validate_sql(
    dialect: Dialect,
    sql: str,
    engine: Any | None = None,
    params: dict[str, Any] | None = None,
) -> tuple[bool, str | None]:
    """Check that ``sql`` is a safe SELECT, parses cleanly, and optionally explains.

    Three gates run in order and the first failure is returned:

    1. ``_enforce_select_only`` (SELECT-only / forbidden-pattern check).
    2. ``dialect.ast_validate`` (structural validation).
    3. ``dialect.explain_sql`` -- only when ``engine`` is supplied.

    Args:
        dialect: Dialect instance providing ``ast_validate`` and ``explain_sql``.
        sql: The SQL string to validate.
        engine: Optional engine handed to ``dialect.explain_sql``; when ``None``
            the explain gate is skipped.
        params: Optional bind-parameter values forwarded unchanged to
            ``dialect.explain_sql``.

    Returns:
        ``(True, None)`` on success, otherwise ``(False, message)`` where
        ``message`` is the safety reason, ``"SQL structure error: ..."``, or the
        error reported by ``explain_sql``.
    """
    debug(f"[validation_execute.validate_sql] checking SQL length={len(sql)}")

    is_safe, safety_reason = _enforce_select_only(sql)
    if not is_safe:
        debug(f"[validation_execute.validate_sql] enforce_select_only FAILED: {safety_reason}")
        return False, safety_reason

    parses, ast_err = dialect.ast_validate(sql)
    if not parses:
        debug(f"[validation_execute.validate_sql] AST validation failed: {ast_err}")
        return False, f"SQL structure error: {ast_err}"
    debug("[validation_execute.validate_sql] structural validation succeeded")

    # Without an engine there is nothing to run EXPLAIN against.
    if engine is None:
        return True, None

    explains, explain_err = dialect.explain_sql(engine, sql, params)
    if explains:
        debug("[validation_execute.validate_sql] explain_sql passed")
        return True, None

    debug(f"[validation_execute.validate_sql] explain_sql failed: {explain_err}")
    return False, explain_err
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def get_spark_sql_for_execution(
    sql_param: str,
    params: dict[str, Any],
    schema: SchemaGraph,
    intent: RuntimeIntent,
    dialect: Dialect,
    spark_sql_param_override: str | None = None,
) -> str:
    """Build the Spark SQL string that will actually be executed.

    Only applies when ``EngineConfig.TYPE`` is ``"databricks"``; for any other
    engine type an empty string is returned. Otherwise the steps are:
    take the override (or ``dialect.prepare_for_execution(sql_param)``),
    substitute parameter values, then inject partition filters.

    Args:
        sql_param: Parameterized SQL, passed to ``dialect.prepare_for_execution``
            when no override is given.
        params: Parameter values passed to ``substitute_params``.
        schema: Schema graph passed to ``inject_partition_filters``.
        intent: Runtime intent passed to ``inject_partition_filters``.
        dialect: Dialect instance providing ``prepare_for_execution``.
        spark_sql_param_override: Optional already-prepared Spark SQL; when
            truthy it is used instead of preparing ``sql_param``.

    Returns:
        The Spark SQL string ready for execution, or ``""`` when the engine
        type is not Databricks.
    """
    if EngineConfig.TYPE != "databricks":
        return ""

    if spark_sql_param_override:
        prepared = spark_sql_param_override
    else:
        prepared = dialect.prepare_for_execution(sql_param)

    with_values = substitute_params(prepared, params)
    return inject_partition_filters(with_values, schema, intent)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def execute_sql(
    dialect: Dialect,
    sql: str,
    spark_sql_for_execution: str | None = None,
) -> list[tuple]:
    """Run a query through the configured engine and return its rows.

    When ``EngineConfig.TYPE`` is ``"databricks"`` the query goes through
    ``dialect.execute_sql_spark``: ``spark_sql_for_execution`` is used when it
    is truthy (flagged as ``sql_already_spark=True``), otherwise ``sql`` is
    sent with ``sql_already_spark=False``. For every other engine type ``sql``
    is executed on a connection obtained from ``dialect.engine`` via
    SQLAlchemy's ``text``.

    Args:
        dialect: The database dialect instance.
        sql: The SQL string to execute; on Databricks it is only used when
            ``spark_sql_for_execution`` is not provided.
        spark_sql_for_execution: Optional Spark SQL for Databricks that is
            executed as-is.

    Returns:
        The result rows returned by the dialect / SQLAlchemy call.
    """
    debug("[validation_execute.execute_sql] executing query via dialect")

    if EngineConfig.TYPE != "databricks":
        # Imported lazily so SQLAlchemy is only needed on this path.
        from sqlalchemy import text

        with dialect.engine.connect() as conn:
            rows = conn.execute(text(sql)).fetchall()
    else:
        already_spark = bool(spark_sql_for_execution)
        statement = spark_sql_for_execution if already_spark else sql
        rows = dialect.execute_sql_spark(statement, sql_already_spark=already_spark)

    debug(f"[validation_execute.execute_sql] returned {len(rows)} rows")
    return rows
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def compute_confidence(
    best_score: float,
    score_gap: float,
    used_new_tables: bool,
    shape_penalty: float,
    negative_pen: float,
    colmap_pen: float,
    num_cte_pen: float = 0.0,
) -> float:
    """Fold the individual scoring signals into one confidence value.

    The running total starts from ``0.62 * best_score``, gains up to ``0.18``
    from the score gap (``score_gap * 2`` clamped to [0, 1]), and then loses:
    ``0.12`` when new tables were used, ``0.18 * shape_penalty``, up to
    ``0.35`` for ``negative_pen``, up to ``0.20`` for ``colmap_pen`` and up to
    ``0.10`` for ``num_cte_pen`` (those three inputs are clamped to [0, 1]).
    The total is clamped to [0, 1]. When ``best_score`` equals ``0.0`` a
    cold-start floor of ``0.50`` minus the negative, column-map and CTE
    deductions (never below ``0``) is applied.

    Args:
        best_score: Similarity score of the best matching template.
        score_gap: Gap between best and second-best template scores.
        used_new_tables: Whether the query uses tables not seen in any template.
        shape_penalty: Shape distance penalty (used unclamped).
        negative_pen: Accumulated negative memory penalty.
        colmap_pen: Column-map rejection penalty.
        num_cte_pen: Optional CTE-count penalty.

    Returns:
        The resulting confidence value.
    """
    debug("[validation_execute.compute_confidence] inputs:")
    debug(
        f"[validation_execute.compute_confidence] used_new_tables={used_new_tables}, shape_penalty={shape_penalty:.3f}, num_cte_pen={num_cte_pen:.3f}"
    )

    # --- positive contributions ---
    base_contrib = 0.62 * best_score
    total = base_contrib
    debug(f"[validation_execute.compute_confidence] +{base_contrib:.3f} from best_score")

    gap_bonus = 0.18 * max(0.0, min(1.0, score_gap * 2.0))
    total += gap_bonus
    debug(f"[validation_execute.compute_confidence] +{gap_bonus:.3f} from score_gap")

    # --- deductions ---
    if used_new_tables:
        total -= 0.12
        debug("[validation_execute.compute_confidence] -0.120 from used_new_tables")

    shape_cost = 0.18 * shape_penalty
    total -= shape_cost
    debug(f"[validation_execute.compute_confidence] -{shape_cost:.3f} from shape_penalty")

    negative_cost = 0.35 * min(1.0, max(0.0, negative_pen))
    total -= negative_cost
    debug(f"[validation_execute.compute_confidence] -{negative_cost:.3f} from negative_pen")

    colmap_cost = 0.20 * min(1.0, max(0.0, colmap_pen))
    total -= colmap_cost
    debug(f"[validation_execute.compute_confidence] -{colmap_cost:.3f} from colmap_pen")

    cte_cost = 0.10 * min(1.0, max(0.0, num_cte_pen))
    total -= cte_cost
    debug(f"[validation_execute.compute_confidence] -{cte_cost:.3f} from num_cte_pen")

    confidence = max(0.0, min(1.0, total))

    # With no template similarity at all, do not let confidence fall below the
    # cold-start floor (which still honours the three clamped deductions).
    if best_score == 0.0:
        floor = max(0.0, 0.50 - negative_cost - colmap_cost - cte_cost)
        if confidence < floor:
            debug(
                f"[validation_execute.compute_confidence] cold-start floor applied: {confidence:.3f} → {floor:.3f}"
            )
            confidence = floor

    debug(f"[validation_execute.compute_confidence] FINAL confidence={confidence:.3f}")
    return confidence
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _keyword_classify_rejection(reason_lower: str) -> str | None:
|
|
281
|
+
"""Fast deterministic pre-classifier for common rejection phrases.
|
|
282
|
+
|
|
283
|
+
Scans the lowered rejection reason for unambiguous keyword patterns and returns a canonical category string if matched, or ``None`` to fall through to the LLM classifier.
|
|
284
|
+
"""
|
|
285
|
+
_KEYWORD_MAP: list[tuple[list[str], str]] = [
|
|
286
|
+
(
|
|
287
|
+
["wrong table", "missing table", "incorrect table", "should use table"],
|
|
288
|
+
"wrong_tables",
|
|
289
|
+
),
|
|
290
|
+
(
|
|
291
|
+
[
|
|
292
|
+
"wrong join",
|
|
293
|
+
"relationship wrong",
|
|
294
|
+
"join issue",
|
|
295
|
+
"self-join",
|
|
296
|
+
"self join",
|
|
297
|
+
"self_join",
|
|
298
|
+
],
|
|
299
|
+
"wrong_join",
|
|
300
|
+
),
|
|
301
|
+
(
|
|
302
|
+
[
|
|
303
|
+
"wrong aggregation",
|
|
304
|
+
"wrong grouping",
|
|
305
|
+
"aggregation or grouping",
|
|
306
|
+
"group by",
|
|
307
|
+
],
|
|
308
|
+
"wrong_aggregation_or_grouping",
|
|
309
|
+
),
|
|
310
|
+
(
|
|
311
|
+
["wrong column", "missing column", "columns selected", "wrong columns"],
|
|
312
|
+
"wrong_columns_selected",
|
|
313
|
+
),
|
|
314
|
+
(
|
|
315
|
+
["wrong filter", "wrong condition", "wrong having", "wrong threshold"],
|
|
316
|
+
"wrong_filters_or_having",
|
|
317
|
+
),
|
|
318
|
+
(["wrong intent", "misunderstood", "question wrong"], "wrong_intent"),
|
|
319
|
+
(["too many rows", "too many results"], "too_many_rows"),
|
|
320
|
+
(["too few rows", "too few results", "missing rows"], "too_few_rows"),
|
|
321
|
+
(
|
|
322
|
+
[
|
|
323
|
+
"invalid structure",
|
|
324
|
+
"query format",
|
|
325
|
+
"sql structure",
|
|
326
|
+
"wrong query structure",
|
|
327
|
+
"wrong structure",
|
|
328
|
+
],
|
|
329
|
+
"invalid_structure",
|
|
330
|
+
),
|
|
331
|
+
]
|
|
332
|
+
for phrases, category in _KEYWORD_MAP:
|
|
333
|
+
if any(p in reason_lower for p in phrases):
|
|
334
|
+
return category
|
|
335
|
+
return None
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def llm_classify_rejection(q_norm: str, intent: RuntimeIntent, sql: str, ux: str, user_reason: str) -> dict[str, str]:
    """Classify a user's rejection reason into a canonical category.

    A deterministic keyword pre-classifier (``_keyword_classify_rejection``)
    runs first; the LLM is only called when it finds no match.

    Args:
        q_norm: The normalised user question (sent to the LLM as context).
        intent: The ``RuntimeIntent`` for the rejected query; only
            ``intent.tables`` is sent to the LLM.
        sql: The rejected SQL (not included in the prompt).
        ux: The UX explanation shown to the user (not included in the prompt).
        user_reason: The free-text reason provided by the user. A ``None`` or
            empty value is tolerated and treated as an empty string.

    Returns:
        Dict with ``"category"`` (a member of ``PolicyConfig.REJECT_CATEGORIES``
        or a keyword-derived category; ``"other"`` when the LLM answer is
        unusable) and ``"normalized_reason"`` (the LLM summary, or the stripped
        user reason as a fallback).
    """
    debug("[validation_execute.llm_classify_rejection] classifying user rejection")
    debug(f"[validation_execute.llm_classify_rejection] user_reason='{user_reason or ''}'")

    reason_lower = (user_reason or "").lower()
    keyword_cat = _keyword_classify_rejection(reason_lower)
    if keyword_cat is not None:
        debug(f"[validation_execute.llm_classify_rejection] keyword hit -> {keyword_cat}")
        # A keyword hit implies user_reason is a non-empty string.
        return {"category": keyword_cat, "normalized_reason": user_reason.strip()}

    system = "You are a deterministic rejection classifier. Output ONLY valid JSON. Identical inputs must produce identical outputs."
    user = stable_json(
        {
            "task": "Classify the user's rejection reason into exactly one category.",
            "categories": PolicyConfig.REJECT_CATEGORIES,
            "category_rules": {
                "wrong_intent": "User says the question was misunderstood, intent is wrong.",
                "wrong_tables": "User says wrong table, missing table, incorrect table, or should use different table.",
                "wrong_join": "User says tables connected wrong, relationship wrong, or join issue.",
                "wrong_filters_or_having": "User says wrong condition, filter, WHERE clause, HAVING clause, or threshold issue.",
                "wrong_aggregation_or_grouping": "User says wrong count/sum/avg, grouping wrong, or should group differently.",
                "wrong_columns_selected": "User says wrong column, missing column, or column from wrong table.",
                "too_many_rows": "User says too many results, needs to be filtered more.",
                "too_few_rows": "User says results are incomplete, missing rows, or too few results.",
                "invalid_structure": "User says SQL structure is wrong, query format is bad.",
                "other": "Cannot classify into above categories.",
            },
            "classification_rules": [
                "Read the user_reason carefully.",
                "Match keywords: 'table'/'tables' -> wrong_tables, 'column' -> wrong_columns_selected, 'join' -> wrong_join, 'filter'/'condition'/'value'/'having'/'threshold' -> wrong_filters_or_having.",
                "If multiple categories apply, choose the most specific one.",
                "Only use 'other' if no category matches at all.",
            ],
            "output_format": {
                "category": "one of the category keys",
                "normalized_reason": "concise 5-10 word summary of the issue",
            },
            "context": {
                "question": q_norm,
                "tables_used": intent.tables or [],
                "user_reason": user_reason,
            },
        }
    )
    parsed = llm_json(system, user, retries=1, task="intent")
    # NOTE(review): llm_json's return type is not visible here; anything that
    # is not a dict is treated as an empty answer instead of raising on .get().
    if not isinstance(parsed, dict):
        parsed = {}

    raw_cat = parsed.get("category")
    cat = raw_cat if isinstance(raw_cat, str) else "other"
    if cat not in PolicyConfig.REJECT_CATEGORIES:
        cat = "other"

    raw_norm = parsed.get("normalized_reason")
    # Fall back to the user's own words; coerce None to "" so that .strip()
    # below cannot raise when both the LLM summary and user_reason are missing.
    norm = raw_norm if isinstance(raw_norm, str) else (user_reason or "")

    debug(f"[validation_execute.llm_classify_rejection] classified: category={cat}")
    return {"category": cat, "normalized_reason": norm.strip()}
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def _validate_main_query_cte_usage(intent: RuntimeIntent, cte_outputs: dict[str, list[str]]) -> list[IntentIssue]:
    """Validate that the main query references CTE outputs correctly.

    Reports:

    * a warning for CTEs that do not appear in ``intent.tables``;
    * an error for each select column, ``column_map`` entry, or filter whose
      referenced column is not among the outputs of the CTE it points at.

    All CTE-name and column-name comparisons are case-insensitive, including
    the lookup of a CTE's output columns.

    Args:
        intent: The main ``RuntimeIntent``; ``tables``, ``select_cols``,
            ``column_map`` and ``filters_param`` are read.
        cte_outputs: Mapping of CTE name to its list of output column names.

    Returns:
        List of ``IntentIssue`` objects (empty when ``cte_outputs`` is empty or
        every reference is valid).
    """
    issues: list[IntentIssue] = []
    debug(
        f"[validation_execute.validate_main_query_cte_usage] checking main query uses CTEs: {list(cte_outputs.keys())}"
    )
    if not cte_outputs:
        return issues

    # Index the CTE outputs by lower-cased name so that the column lookup uses
    # the same case-insensitive key as the membership test. Looking the
    # original-case reference up directly would miss a CTE whose key differs
    # only in case and wrongly report every one of its columns as missing.
    outputs_by_lower: dict[str, list[str]] = {
        name.lower(): (cols or []) for name, cols in cte_outputs.items()
    }
    cte_names = set(outputs_by_lower)

    def _columns_if_missing(cte_ref: str, column: str) -> list[str] | None:
        """Return the CTE's output columns when ``column`` is absent from them.

        Returns ``None`` when ``cte_ref`` is not a CTE or the column exists.
        """
        key = cte_ref.lower()
        if key not in cte_names:
            return None
        available = outputs_by_lower[key]
        if column.lower() in {c.lower() for c in available}:
            return None
        return available

    intent_tables = {t.lower() for t in (intent.tables or [])}
    unreferenced_ctes = cte_names - intent_tables
    if unreferenced_ctes:
        unreferenced_sorted = sorted(unreferenced_ctes)
        issues.append(
            IntentIssue(
                issue_id=f"cte_unreferenced_{','.join(unreferenced_sorted)}",
                category="cte_usage",
                severity="warning",
                message=f"CTEs defined but not used in main query: {unreferenced_sorted}",
                # Sorted so the reported context is deterministic.
                context={"unreferenced": unreferenced_sorted},
            )
        )
        debug(f"[validation_execute.validate_main_query_cte_usage] unreferenced CTEs: {unreferenced_ctes}")

    # Select columns of the form "<cte>.<column>".
    for sc in intent.select_cols or []:
        col_expr = sc.expr.primary_column
        if not col_expr or "." not in col_expr:
            continue
        table_ref, col_name = col_expr.rsplit(".", 1)
        cte_cols = _columns_if_missing(table_ref, col_name)
        if cte_cols is None:
            continue
        issues.append(
            IntentIssue(
                issue_id=f"main_col_not_in_cte_{table_ref}_{col_name}",
                category="cte_column_reference",
                severity="error",
                message=f"Main query references column '{col_name}' not in CTE '{table_ref}'",
                context={
                    "cte": table_ref,
                    "column": col_name,
                    "available": cte_cols,
                },
            )
        )
        debug(f"[validation_execute.validate_main_query_cte_usage] column {col_name} not in CTE {table_ref}")

    # column_map entries whose source is a CTE.
    for col_name, source in (intent.column_map or {}).items():
        cte_cols = _columns_if_missing(source, col_name)
        if cte_cols is None:
            continue
        issues.append(
            IntentIssue(
                issue_id=f"main_colmap_not_in_cte_{source}_{col_name}",
                category="cte_column_reference",
                severity="error",
                message=f"column_map references '{col_name}' from CTE '{source}' but column not in CTE outputs",
                context={
                    "cte": source,
                    "column": col_name,
                    "available": cte_cols,
                },
            )
        )
        debug(f"[validation_execute.validate_main_query_cte_usage] column_map {col_name} not in CTE {source}")

    # Filter left-hand columns of the form "<cte>.<column>".
    for fp in intent.filters_param or []:
        col_expr = fp.left_expr.primary_column
        if not col_expr or "." not in col_expr:
            continue
        table_ref, col_name = col_expr.rsplit(".", 1)
        cte_cols = _columns_if_missing(table_ref, col_name)
        if cte_cols is None:
            continue
        issues.append(
            IntentIssue(
                issue_id=f"main_filter_not_in_cte_{table_ref}_{col_name}",
                category="cte_column_reference",
                severity="error",
                message=f"Main query filter references column '{col_name}' not in CTE '{table_ref}'",
                context={
                    "cte": table_ref,
                    "column": col_name,
                    "available": cte_cols,
                },
            )
        )
        debug(f"[validation_execute.validate_main_query_cte_usage] filter {col_name} not in CTE {table_ref}")

    if issues:
        debug(f"[validation_execute.validate_main_query_cte_usage] found {len(issues)} issues")
    else:
        debug("[validation_execute.validate_main_query_cte_usage] all CTE references valid")
    return issues
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
def _validate_cte_output_types(cte_steps: list[RuntimeCteStep], schema: SchemaGraph) -> list[IntentIssue]:
    """Warn when SUM/AVG in a CTE is applied to a non-numeric upstream CTE column.

    Works in two passes:

    1. Infer a type for every output column of every CTE. A column whose name
       matches an aggregated select column's ``<agg>_<column>`` alias (or
       merely starts with ``<agg>_``) is typed ``"numeric"``; otherwise the
       type comes from the schema table, or from an earlier CTE's inferred
       types.
    2. For each SUM/AVG select column whose source table is a CTE, warn when
       the inferred type of the source column is known and not numeric.

    Args:
        cte_steps: Ordered list of ``RuntimeCteStep`` objects.
        schema: The ``SchemaGraph`` used for column type lookup.

    Returns:
        List of ``IntentIssue`` objects (severity ``'warning'``), one per type
        mismatch.
    """
    issues: list[IntentIssue] = []
    cte_output_types: dict[str, dict[str, str]] = {}
    debug(f"[validation_execute.validate_cte_output_types] validating {len(cte_steps)} CTE output types")

    # ---- pass 1: infer output column types per CTE ----
    for cte in cte_steps:
        select_cols = cte.select_cols or []
        default_table = cte.tables[0] if cte.tables else None
        col_types: dict[str, str] = {}
        for col in cte.output_columns or []:
            if "." in col:
                table, col_name = col.rsplit(".", 1)
            else:
                table, col_name = default_table, col
            col_lower = col_name.lower()
            for sc in select_cols:
                if not sc.is_aggregated:
                    continue
                # Treat a missing term/column as empty rather than crashing;
                # pass 2 already guards against an empty primary_column.
                term = sc.expr.primary_term or ""
                agg_name = term.split("(")[0].lower() if "(" in term else ""
                alias = f"{agg_name}_{(sc.expr.primary_column or '').replace('.', '_')}"
                if col_lower == alias.lower() or col_lower.startswith(f"{agg_name}_"):
                    col_types[col_name] = "numeric"
                    break
            else:
                # No aggregated select column claimed this output: fall back to
                # the schema, then to an upstream CTE's inferred types.
                if table and table in schema.tables:
                    schema_col = schema.tables[table].columns.get(col_name)
                    if schema_col and schema_col.value_type:
                        col_types[col_name] = schema_col.value_type
                elif table and table in cte_output_types:
                    dep_types = cte_output_types[table]
                    if col_name in dep_types:
                        col_types[col_name] = dep_types[col_name]
        cte_output_types[cte.cte_name] = col_types

    # ---- pass 2: check SUM/AVG inputs that come from a CTE ----
    numeric_aggs = {"sum", "avg"}
    # "numeric" is the label pass 1 assigns to aggregated outputs, so it must
    # be accepted here; otherwise SUM/AVG over an upstream aggregate would be
    # reported as "non-numeric ... (type: numeric)".
    numeric_types = {"integer", "number", "numeric"}
    for cte in cte_steps:
        cte_name = cte.cte_name
        default_table = cte.tables[0] if cte.tables else None
        for sc in cte.select_cols or []:
            if not sc.is_aggregated:
                continue
            term = sc.expr.primary_term
            agg_func = term.split("(")[0].lower() if "(" in term else ""
            if agg_func not in numeric_aggs:
                continue
            col_expr = sc.expr.primary_column
            if not col_expr:
                continue
            if "." in col_expr:
                table, col_name = col_expr.rsplit(".", 1)
            else:
                table, col_name = default_table, col_expr
            if table not in cte_output_types:
                continue
            col_type = cte_output_types[table].get(col_name, "")
            if not col_type or col_type in numeric_types:
                continue
            issues.append(
                IntentIssue(
                    issue_id=f"cte_agg_type_mismatch_{cte_name}_{agg_func}_{col_name}",
                    category="cte_type_consistency",
                    severity="warning",
                    message=f"CTE '{cte_name}' applies {agg_func.upper()} to non-numeric column '{col_name}' (type: {col_type})",
                    context={
                        "cte_name": cte_name,
                        "agg": agg_func,
                        "column": col_name,
                        "type": col_type,
                    },
                )
            )
            debug(
                f"[validation_execute.validate_cte_output_types] type mismatch: {agg_func} on {col_name}({col_type})"
            )

    if issues:
        debug(f"[validation_execute.validate_cte_output_types] found {len(issues)} type issues")
    else:
        debug("[validation_execute.validate_cte_output_types] all CTE output types valid")
    return issues
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
def _validate_cte_cardinality(cte_steps: list[RuntimeCteStep]) -> list[IntentIssue]:
    """Check that declared CTE row expectations agree with grain, LIMIT and dependencies.

    A missing ``expected_rows`` is treated as ``"many"``. For each CTE:

    * warning when ``grain == "scalar"`` but ``expected_rows`` is not ``"one"``;
    * warning when ``limit == 1`` but ``expected_rows`` is not ``"one"``;
    * info when the CTE expects ``"few"``/``"many"`` rows yet lists a table
      that is itself a CTE expecting ``"one"`` row.

    Args:
        cte_steps: Ordered list of ``RuntimeCteStep`` objects.

    Returns:
        List of ``IntentIssue`` objects.
    """
    found: list[IntentIssue] = []
    debug(f"[validation_execute.validate_cte_cardinality] validating {len(cte_steps)} CTE cardinalities")

    # Declared row expectation for every CTE, keyed by name.
    rows_by_cte: dict[str, str] = {step.cte_name: step.expected_rows or "many" for step in cte_steps}

    for step in cte_steps:
        name = step.cte_name
        declared = step.expected_rows or "many"
        grain = step.grain

        if grain == "scalar" and declared != "one":
            found.append(
                IntentIssue(
                    issue_id=f"cte_scalar_cardinality_{name}",
                    category="cte_cardinality",
                    severity="warning",
                    message=f"CTE '{name}' has scalar grain but expected_rows='{declared}'",
                    context={
                        "cte_name": name,
                        "grain": grain,
                        "expected_rows": declared,
                    },
                )
            )
            debug(f"[validation_execute.validate_cte_cardinality] scalar CTE with expected_rows={declared}")

        if step.limit == 1 and declared != "one":
            found.append(
                IntentIssue(
                    issue_id=f"cte_limit1_cardinality_{name}",
                    category="cte_cardinality",
                    severity="warning",
                    message=f"CTE '{name}' has LIMIT 1 but expected_rows='{declared}'",
                    context={
                        "cte_name": name,
                        "limit": 1,
                        "expected_rows": declared,
                    },
                )
            )
            debug(f"[validation_execute.validate_cte_cardinality] LIMIT 1 CTE with expected_rows={declared}")

        for upstream in step.tables or []:
            # Only tables that are themselves CTEs carry a row expectation.
            if upstream not in rows_by_cte:
                continue
            upstream_rows = rows_by_cte[upstream]
            if declared not in {"few", "many"} or upstream_rows != "one":
                continue
            found.append(
                IntentIssue(
                    issue_id=f"cte_cardinality_expansion_{name}_{upstream}",
                    category="cte_cardinality",
                    severity="info",
                    message=f"CTE '{name}' (expected_rows={declared}) depends on single-row CTE '{upstream}'",
                    context={
                        "cte_name": name,
                        "expected": declared,
                        "dep_cte": upstream,
                        "dep_expected": upstream_rows,
                    },
                )
            )
            debug(
                f"[validation_execute.validate_cte_cardinality] cardinality note: {name}({declared}) <- {upstream}({upstream_rows})"
            )

    if found:
        debug(f"[validation_execute.validate_cte_cardinality] found {len(found)} cardinality issues")
    else:
        debug("[validation_execute.validate_cte_cardinality] all cardinalities valid")
    return found
|
|
676
|
+
|
|
677
|
+
|
|
678
|
+
def validate_semantics(intent: RuntimeIntent, schema: SchemaGraph) -> IntentValidationResult:
    """Run the full semantic validation suite against a ``RuntimeIntent``.

    Validators run in a fixed order and their issues are accumulated in
    that same order:

    1. Structural checks: the intent names at least one table and every
       CTE step has a name and output columns.
    2. CTE chain checks (only when CTE steps exist): duplicate names,
       forward references, unknown tables, main-query CTE usage, output
       column types, and cardinality.
    3. Schema-level and semantic-level validators for the main query.
    4. A subset of the same validators for each CTE step, using the CTE
       description in place of the natural-language question.
    5. CTE grain compatibility across the full CTE chain.

    Args:
        intent: The resolved runtime intent to validate.
        schema: Schema graph providing table, column, and relationship
            metadata.

    Returns:
        An ``IntentValidationResult`` containing all ``IntentIssue``
        entries found across the main query and all CTE steps.
    """
    log_tag = "[validation_execute.validate_semantics]"
    all_issues: list[IntentIssue] = []
    debug(f"{log_tag} running semantic validation suite")

    # ------------------------------------------------------------------
    # 1. Structural checks
    # ------------------------------------------------------------------
    if not intent.tables:
        all_issues.append(
            IntentIssue(
                issue_id="tables_empty",
                category="structural",
                severity="error",
                message="Intent has no tables specified",
                context={},
            )
        )
        debug(f"{log_tag} tables empty")

    cte_steps = intent.cte_steps or []
    # Output-column metadata per CTE name, handed to the validators below.
    cte_outputs: dict[str, dict[str, CteOutputColumnMeta]] = {}
    for cte in cte_steps:
        if not cte.cte_name:
            all_issues.append(
                IntentIssue(
                    issue_id="cte_name_empty",
                    category="cte_structure",
                    severity="error",
                    message="CTE step has no name specified",
                    context={},
                )
            )
        if not cte.output_columns:
            all_issues.append(
                IntentIssue(
                    issue_id=f"cte_output_columns_empty_{cte.cte_name}",
                    category="cte_structure",
                    severity="error",
                    message=f"CTE '{cte.cte_name}' has no output columns specified",
                    context={"cte_name": cte.cte_name},
                )
            )
        else:
            cte_outputs[cte.cte_name] = dict(cte.output_column_metadata or {})

    # ------------------------------------------------------------------
    # 2. CTE chain checks
    # ------------------------------------------------------------------
    # NOTE(review): every chain check through _validate_cte_cardinality is
    # assumed to be guarded by ``if cte_steps`` - confirm against the
    # canonical source.
    if cte_steps:
        cte_names = [c.cte_name for c in cte_steps]

        # Single pass duplicate detection; sorted so the issue id, message
        # and context are deterministic across runs.
        seen_names = set()
        repeated_names = set()
        for name in cte_names:
            if name in seen_names:
                repeated_names.add(name)
            else:
                seen_names.add(name)
        if repeated_names:
            duplicates = sorted(repeated_names)
            all_issues.append(
                IntentIssue(
                    issue_id=f"cte_duplicate_names_{','.join(duplicates)}",
                    category="cte_structure",
                    severity="error",
                    message=f"Duplicate CTE names: {duplicates}",
                    context={"duplicates": duplicates},
                )
            )

        # A CTE may only read from CTEs defined before it.
        for i, cte in enumerate(cte_steps):
            later_names = {n.lower() for n in cte_names[i + 1 :]}
            for table in cte.tables or []:
                if table.lower() in later_names:
                    all_issues.append(
                        IntentIssue(
                            issue_id=f"cte_forward_ref_{cte.cte_name}_{table}",
                            category="cte_structure",
                            severity="error",
                            message=f"CTE '{cte.cte_name}' forward-references CTE '{table}' defined later",
                            context={"cte_name": cte.cte_name, "forward_ref": table},
                        )
                    )

        # Every table a CTE reads must be a schema table or an earlier CTE
        # (compared case-insensitively).
        known_lower = {t.lower() for t in schema.tables.keys()}
        for i, cte in enumerate(cte_steps):
            available_lower = known_lower | {c.cte_name.lower() for c in cte_steps[:i]}
            for table in cte.tables or []:
                if table.lower() not in available_lower:
                    all_issues.append(
                        IntentIssue(
                            issue_id=f"cte_unknown_table_{cte.cte_name}_{table}",
                            category="cte_table_reference",
                            severity="error",
                            message=f"CTE '{cte.cte_name}' references unknown table '{table}'",
                            context={"cte_name": cte.cte_name, "table": table},
                        )
                    )

        cte_outputs_list = {c.cte_name: (c.output_columns or []) for c in cte_steps}
        all_issues.extend(_validate_main_query_cte_usage(intent, cte_outputs_list))
        all_issues.extend(_validate_cte_output_types(cte_steps, schema))
        all_issues.extend(_validate_cte_cardinality(cte_steps))

    # ------------------------------------------------------------------
    # 3. Main query validators
    # ------------------------------------------------------------------
    ctx = "main query"
    question = intent.natural_language
    grain = intent.grain
    tables = intent.tables or []
    select_cols = intent.select_cols or []
    order_by_cols = intent.order_by_cols or []
    group_by_cols = intent.group_by_cols or []
    filters = intent.filters_param or []
    having = intent.having_param or []
    allowed_tables = set(tables)

    all_issues.extend(validate_select_cols_schema(select_cols, schema, allowed_tables, cte_outputs, ctx))
    all_issues.extend(validate_order_by_cols_schema(order_by_cols, schema, allowed_tables, cte_outputs, ctx))
    all_issues.extend(validate_group_by_cols_schema(group_by_cols, schema, allowed_tables, cte_outputs, ctx))
    all_issues.extend(validate_filters_schema(filters, schema, allowed_tables, cte_outputs, ctx))
    all_issues.extend(validate_having_schema(having, schema, allowed_tables, cte_outputs, ctx))
    all_issues.extend(validate_filter_ops_per_column(filters, schema, cte_outputs, ctx))
    all_issues.extend(validate_having_agg_per_role(having, schema, cte_outputs, ctx))
    all_issues.extend(validate_having_ops_per_column(having, schema, cte_outputs, ctx))
    all_issues.extend(validate_select_agg_per_role(select_cols, schema, cte_outputs, ctx))
    all_issues.extend(validate_select_agg_semantics(select_cols, schema, ctx))
    all_issues.extend(validate_order_by_agg_per_role(order_by_cols, schema, cte_outputs, ctx))
    all_issues.extend(validate_order_by_agg_semantics(order_by_cols, schema, ctx))
    all_issues.extend(validate_scalar_func_type_semantics(select_cols, order_by_cols, schema, ctx))
    all_issues.extend(validate_null_filters(filters, cte_steps, ctx))
    all_issues.extend(validate_date_window_units(filters, cte_steps, ctx))
    all_issues.extend(validate_date_diff_units(filters, cte_steps, ctx))
    all_issues.extend(validate_column_types(select_cols, schema, ctx))
    all_issues.extend(validate_filter_value_type_alignment(filters, schema, ctx, cte_outputs))
    all_issues.extend(validate_no_between_ops(filters, having, ctx))
    all_issues.extend(validate_grain_consistency(grain, select_cols, group_by_cols, having, ctx))
    all_issues.extend(validate_grouped_requires_aggregation(grain, select_cols, group_by_cols, ctx))
    all_issues.extend(validate_semantic_contradictions(select_cols, question, grain, intent.expected_rows, ctx))
    all_issues.extend(
        validate_question_aggregation_hint(
            question,
            select_cols,
            having,
            grain,
            ctx,
            filters_param=filters,
            schema=schema,
        )
    )
    all_issues.extend(validate_threshold_missing_having(question, select_cols, having, grain, ctx))
    all_issues.extend(validate_count_threshold_missing_having(question, tables, having, schema, ctx))
    all_issues.extend(validate_question_numeric_coverage(question, filters, having, intent.limit, ctx))
    all_issues.extend(validate_question_distinct_hint(question, select_cols, ctx))
    all_issues.extend(validate_no_pk_fk_filters(filters, schema, ctx))
    all_issues.extend(validate_question_table_mentions(question, tables, schema, ctx))
    all_issues.extend(validate_question_agg_keyword_coverage(question, select_cols, having, ctx))
    # The query counts as aggregated if any select column is aggregated
    # or a HAVING clause is present.
    has_agg = any(sc.is_aggregated for sc in select_cols) or bool(having)
    all_issues.extend(validate_for_each_grouping(question, group_by_cols, schema, has_agg, ctx))
    all_issues.extend(validate_expr_vs_expr_filters(filters, schema, cte_outputs, ctx))
    all_issues.extend(validate_agg_vs_agg_having(having, schema, cte_outputs, ctx))
    all_issues.extend(validate_scalar_expression_semantics(select_cols, schema, ctx))
    all_issues.extend(validate_arith_expression_semantics(filters, having, schema, cte_outputs, ctx))
    all_issues.extend(validate_temporal_columns(select_cols, schema, ctx))
    all_issues.extend(validate_pk_fk_aggregation(select_cols, schema, ctx))
    all_issues.extend(validate_select_expr_types(select_cols, schema, cte_outputs, ctx))
    all_issues.extend(validate_order_by_expr_types(order_by_cols, schema, cte_outputs, ctx))
    all_issues.extend(validate_filter_expr_types(filters, schema, cte_outputs, ctx))
    all_issues.extend(validate_having_expr_types(having, schema, cte_outputs, ctx))
    all_issues.extend(validate_filter_no_aggregation(filters, ctx))
    all_issues.extend(validate_having_requires_aggregation(having, ctx))
    all_issues.extend(validate_no_nested_aggregation(select_cols, order_by_cols, filters, having, ctx))
    all_issues.extend(validate_mixed_aggregation_in_mulgroup(select_cols, order_by_cols, filters, having, ctx))
    all_issues.extend(validate_order_by_aggregation_context(order_by_cols, grain, ctx))
    all_issues.extend(validate_select_group_by_membership(select_cols, group_by_cols, grain, ctx))

    # ------------------------------------------------------------------
    # 4. Per-CTE validators
    # ------------------------------------------------------------------
    for cte in cte_steps:
        cte_context = f"CTE '{cte.cte_name}'"
        cte_allowed = set(cte.tables or [])
        cte_select = cte.select_cols or []
        cte_order_by = cte.order_by_cols or []
        cte_group_by = cte.group_by_cols or []
        cte_filters = cte.filters_param or []
        cte_having = cte.having_param or []
        # The CTE description stands in for the natural-language question.
        cte_description = cte.description or ""

        all_issues.extend(validate_select_cols_schema(cte_select, schema, cte_allowed, cte_outputs, cte_context))
        all_issues.extend(validate_order_by_cols_schema(cte_order_by, schema, cte_allowed, cte_outputs, cte_context))
        all_issues.extend(validate_group_by_cols_schema(cte_group_by, schema, cte_allowed, cte_outputs, cte_context))
        all_issues.extend(validate_filters_schema(cte_filters, schema, cte_allowed, cte_outputs, cte_context))
        all_issues.extend(validate_having_schema(cte_having, schema, cte_allowed, cte_outputs, cte_context))
        all_issues.extend(validate_filter_ops_per_column(cte_filters, schema, cte_outputs, cte_context))
        # Date-unit checks receive an empty CTE list inside a CTE step.
        all_issues.extend(validate_date_window_units(cte_filters, [], cte_context))
        all_issues.extend(validate_date_diff_units(cte_filters, [], cte_context))
        all_issues.extend(validate_having_agg_per_role(cte_having, schema, cte_outputs, cte_context))
        all_issues.extend(validate_having_ops_per_column(cte_having, schema, cte_outputs, cte_context))
        all_issues.extend(validate_select_agg_per_role(cte_select, schema, cte_outputs, cte_context))
        all_issues.extend(validate_select_agg_semantics(cte_select, schema, cte_context))
        all_issues.extend(validate_order_by_agg_per_role(cte_order_by, schema, cte_outputs, cte_context))
        all_issues.extend(validate_order_by_agg_semantics(cte_order_by, schema, cte_context))
        all_issues.extend(validate_scalar_func_type_semantics(cte_select, cte_order_by, schema, cte_context))
        all_issues.extend(validate_column_types(cte_select, schema, cte_context))
        all_issues.extend(validate_filter_value_type_alignment(cte_filters, schema, cte_context, cte_outputs))
        all_issues.extend(validate_no_between_ops(cte_filters, cte_having, cte_context))
        all_issues.extend(validate_cte_grain_consistency(cte, cte_context))
        all_issues.extend(validate_grouped_requires_aggregation(cte.grain, cte_select, cte_group_by, cte_context))
        all_issues.extend(
            validate_semantic_contradictions(
                cte_select,
                cte_description,
                cte.grain,
                cte.expected_rows,
                cte_context,
            )
        )
        all_issues.extend(
            validate_question_aggregation_hint(
                cte_description,
                cte_select,
                cte_having,
                cte.grain,
                cte_context,
                filters_param=cte_filters,
                schema=schema,
            )
        )
        all_issues.extend(validate_expr_vs_expr_filters(cte_filters, schema, cte_outputs, cte_context))
        all_issues.extend(validate_agg_vs_agg_having(cte_having, schema, cte_outputs, cte_context))
        all_issues.extend(validate_scalar_expression_semantics(cte_select, schema, cte_context))
        all_issues.extend(
            validate_arith_expression_semantics(cte_filters, cte_having, schema, cte_outputs, cte_context)
        )
        all_issues.extend(validate_temporal_columns(cte_select, schema, cte_context))
        all_issues.extend(validate_pk_fk_aggregation(cte_select, schema, cte_context))
        all_issues.extend(validate_select_expr_types(cte_select, schema, cte_outputs, cte_context))
        all_issues.extend(validate_order_by_expr_types(cte_order_by, schema, cte_outputs, cte_context))
        all_issues.extend(validate_filter_expr_types(cte_filters, schema, cte_outputs, cte_context))
        all_issues.extend(validate_having_expr_types(cte_having, schema, cte_outputs, cte_context))
        all_issues.extend(validate_filter_no_aggregation(cte_filters, cte_context))
        all_issues.extend(validate_having_requires_aggregation(cte_having, cte_context))
        all_issues.extend(
            validate_no_nested_aggregation(cte_select, cte_order_by, cte_filters, cte_having, cte_context)
        )
        all_issues.extend(
            validate_mixed_aggregation_in_mulgroup(cte_select, cte_order_by, cte_filters, cte_having, cte_context)
        )
        all_issues.extend(validate_order_by_aggregation_context(cte_order_by, cte.grain, cte_context))
        all_issues.extend(validate_select_group_by_membership(cte_select, cte_group_by, cte.grain, cte_context))
        # NOTE(review): cte_outputs was already populated for every CTE in
        # the structural pass, so this looks like a redundant refresh -
        # confirm no validator mutates the mapping before removing it.
        if cte.output_columns:
            cte_outputs[cte.cte_name] = dict(cte.output_column_metadata or {})

    # ------------------------------------------------------------------
    # 5. Cross-CTE grain compatibility
    # ------------------------------------------------------------------
    if cte_steps:
        all_issues.extend(validate_cte_dependency_grains(cte_steps, intent.grain))

    if all_issues:
        debug(f"{log_tag} found {len(all_issues)} total issues")
    else:
        debug(f"{log_tag} all validations passed")
    for idx, iss in enumerate(all_issues):
        debug(
            f"{log_tag} issue[{idx}]: "
            f"{iss.issue_id} | {iss.category} | {iss.severity} | {iss.message} | context={iss.context}"
        )
    return IntentValidationResult(issues=all_issues)
|