aetherdialect 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aetherdialect-0.1.0.dist-info/METADATA +197 -0
- aetherdialect-0.1.0.dist-info/RECORD +34 -0
- aetherdialect-0.1.0.dist-info/WHEEL +5 -0
- aetherdialect-0.1.0.dist-info/licenses/LICENSE +7 -0
- aetherdialect-0.1.0.dist-info/top_level.txt +1 -0
- text2sql/__init__.py +7 -0
- text2sql/config.py +1063 -0
- text2sql/contracts_base.py +952 -0
- text2sql/contracts_core.py +1890 -0
- text2sql/core_utils.py +834 -0
- text2sql/dialect.py +1134 -0
- text2sql/expansion_ops.py +1218 -0
- text2sql/expansion_rules.py +496 -0
- text2sql/intent_expr.py +1759 -0
- text2sql/intent_process.py +2133 -0
- text2sql/intent_repair.py +1733 -0
- text2sql/intent_resolve.py +1292 -0
- text2sql/live_testing.py +1117 -0
- text2sql/main_execution.py +799 -0
- text2sql/pipeline.py +1662 -0
- text2sql/qsim_ops.py +1286 -0
- text2sql/qsim_sample.py +609 -0
- text2sql/qsim_struct.py +569 -0
- text2sql/schema.py +973 -0
- text2sql/schema_profiling.py +2075 -0
- text2sql/simulator.py +970 -0
- text2sql/sql_gen.py +1537 -0
- text2sql/templates.py +1037 -0
- text2sql/text2sql.py +726 -0
- text2sql/utils.py +973 -0
- text2sql/validation_agg.py +1033 -0
- text2sql/validation_execute.py +1092 -0
- text2sql/validation_schema.py +1847 -0
- text2sql/validation_semantic.py +2122 -0
text2sql/qsim_ops.py
ADDED
|
@@ -0,0 +1,1286 @@
|
|
|
1
|
+
"""LLM operations and question generation for the question-generation simulator.
|
|
2
|
+
|
|
3
|
+
Provides LLM-based intent filling from structural skeletons, response parsing into ``QSimIntent`` structures, NL question generation, stratified adaptive skeleton selection with coverage guarantees, and intent normalisation utilities.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import random
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from .config import (
|
|
12
|
+
AGG_PATTERN,
|
|
13
|
+
TABLE_COL_PATTERN,
|
|
14
|
+
VALID_FILTER_VALUE_TYPES,
|
|
15
|
+
VALID_HAVING_OPS,
|
|
16
|
+
VALID_HAVING_VALUE_TYPES,
|
|
17
|
+
PolicyConfig,
|
|
18
|
+
QSimConfig,
|
|
19
|
+
)
|
|
20
|
+
from .contracts_base import QSimSkeleton, RetryFailureContext, SchemaGraph, SkeletonPool
|
|
21
|
+
from .contracts_core import QSimFilter, QSimHaving, QSimIntent
|
|
22
|
+
from .core_utils import debug, llm_json
|
|
23
|
+
from .qsim_struct import (
|
|
24
|
+
build_fk_adjacency,
|
|
25
|
+
build_schema_context,
|
|
26
|
+
compute_intent_id,
|
|
27
|
+
decompose_between_filter,
|
|
28
|
+
enumerate_table_sets,
|
|
29
|
+
generate_all_skeletons,
|
|
30
|
+
get_aggregatable_columns,
|
|
31
|
+
get_comparable_column_pairs,
|
|
32
|
+
get_filterable_columns,
|
|
33
|
+
get_groupable_columns,
|
|
34
|
+
is_connected,
|
|
35
|
+
load_or_create_skeletons,
|
|
36
|
+
validate_column_exists,
|
|
37
|
+
)
|
|
38
|
+
from .utils import generate_question
|
|
39
|
+
|
|
40
|
+
# System prompt passed as the first argument to ``llm_json`` by ``_llm_fill_intent``.
# The adjacent string literals are implicitly concatenated into one single-line
# prompt: a role statement, the task description, and twelve numbered rules.
_QSIM_FILL_SYSTEM = (
    "You are a SQL intent generator."
    " Given structural skeleton constraints, generate a valid query intent filling in specific columns and filters from the available schema."
    " Rules:"
    " 1. STRICTLY follow skeleton constraints for tables, aggregation presence, filter count, groupby count, having, orderby."
    " 2. Use ONLY columns from the provided schema in the specified tables."
    " 3. For filters, choose columns with meaningful filter potential (status, category, date columns)."
    " 4. For aggregation, use COUNT/SUM/AVG/MIN/MAX wrapping table.column in select_cols; aggregate numeric columns only (not IDs or foreign keys); COUNT may use any column or *."
    " 5. For groupby, choose categorical or temporal columns that make semantic sense."
    " 6. Ensure all column references use table.column format from the specified tables."
    " 7. Return ONLY valid JSON matching the specified format."
    " 8. For expr_comparison (filter expr-vs-expr), both expressions must be from different tables with compatible types."
    " 9. DISTINCT is only valid for non-aggregated queries."
    " 10. For orderby, include ASC or DESC direction suffix."
    " 11. DO NOT return columns or tables not in the provided schema."
    " 12. For having, expression must be an aggregation matching a select_cols aggregation."
)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _has_aggregation(select_cols: list[str]) -> bool:
|
|
60
|
+
"""Return whether any element of *select_cols* is an aggregation expression.
|
|
61
|
+
|
|
62
|
+
Args:
|
|
63
|
+
|
|
64
|
+
select_cols: List of SQL select-column strings to inspect.
|
|
65
|
+
|
|
66
|
+
Returns:
|
|
67
|
+
|
|
68
|
+
``True`` if at least one string matches the aggregation pattern ``AGG(...)``; ``False`` otherwise.
|
|
69
|
+
"""
|
|
70
|
+
return any(AGG_PATTERN.match(sc) for sc in select_cols)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _extract_agg_info(expr: str) -> tuple[str, str] | None:
|
|
74
|
+
"""Extract the aggregation function and inner column from an expression string.
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
|
|
78
|
+
expr: A SQL expression string such as ``"COUNT(table.col)"`` or ``"SUM(table.amount)"``.
|
|
79
|
+
|
|
80
|
+
Returns:
|
|
81
|
+
|
|
82
|
+
A 2-tuple ``(func, inner_column)`` with lowercase function name and the content inside the parentheses, or ``None`` if the expression does not match an aggregation pattern.
|
|
83
|
+
"""
|
|
84
|
+
m = AGG_PATTERN.match(expr.strip())
|
|
85
|
+
if m:
|
|
86
|
+
return (m.group(1).lower(), m.group(2).strip())
|
|
87
|
+
return None
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _extract_tables_from_expr(expr: str) -> set[str]:
    """Collect the table names referenced by ``table.column`` tokens in *expr*.

    Args:
        expr: A SQL expression string that may contain one or more
            ``table.column`` tokens.

    Returns:
        The set of first-group captures from every ``TABLE_COL_PATTERN`` match
        found in *expr*; empty when nothing matches.
    """
    tables: set[str] = set()
    for token in TABLE_COL_PATTERN.finditer(expr):
        tables.add(token.group(1))
    return tables
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _validate_skeleton_constraints(response: dict[str, Any], skeleton: QSimSkeleton) -> tuple[bool, list[str]]:
|
|
105
|
+
"""Validate an LLM response dict against the structural requirements of a skeleton.
|
|
106
|
+
|
|
107
|
+
Args:
|
|
108
|
+
|
|
109
|
+
response: Parsed LLM response dict with keys ``"select_cols"``, ``"filters"``, ``"groupby_cols"``, ``"having"``, ``"distinct"``, and ``"expr_comparison"``.
|
|
110
|
+
skeleton: The ``QSimSkeleton`` whose constraints the response must satisfy.
|
|
111
|
+
|
|
112
|
+
Returns:
|
|
113
|
+
|
|
114
|
+
A 2-tuple ``(is_valid, violations)`` where *is_valid* is ``True`` when all constraints are met and *violations* is a list of human-readable violation description strings.
|
|
115
|
+
"""
|
|
116
|
+
violations = []
|
|
117
|
+
|
|
118
|
+
select_cols_raw = response.get("select_cols", [])
|
|
119
|
+
has_agg = any(AGG_PATTERN.match(sc) for sc in select_cols_raw if isinstance(sc, str))
|
|
120
|
+
|
|
121
|
+
if skeleton.has_aggregation and not has_agg:
|
|
122
|
+
violations.append("skeleton requires aggregation but no aggregated select_cols found")
|
|
123
|
+
if not skeleton.has_aggregation and has_agg:
|
|
124
|
+
violations.append("skeleton forbids aggregation but aggregated select_cols found")
|
|
125
|
+
|
|
126
|
+
filters = response.get("filters", [])
|
|
127
|
+
if skeleton.num_filters > 0 and len(filters) == 0 and not skeleton.has_expr_comparison:
|
|
128
|
+
violations.append(f"skeleton requires {skeleton.num_filters} filters but got 0")
|
|
129
|
+
|
|
130
|
+
groupby = response.get("groupby_cols", [])
|
|
131
|
+
if skeleton.num_groupby > 0 and len(groupby) == 0:
|
|
132
|
+
violations.append(f"skeleton requires {skeleton.num_groupby} groupby but got 0")
|
|
133
|
+
if skeleton.num_groupby == 0 and len(groupby) > 0:
|
|
134
|
+
violations.append(f"skeleton forbids groupby but got {len(groupby)}")
|
|
135
|
+
|
|
136
|
+
having = response.get("having", [])
|
|
137
|
+
if skeleton.has_having and len(having) == 0:
|
|
138
|
+
violations.append("skeleton requires having but got none")
|
|
139
|
+
if not skeleton.has_having and len(having) > 0:
|
|
140
|
+
violations.append(f"skeleton forbids having but got {len(having)}")
|
|
141
|
+
|
|
142
|
+
has_distinct = response.get("distinct", False)
|
|
143
|
+
if skeleton.has_distinct and not has_distinct:
|
|
144
|
+
violations.append(f"skeleton requires distinct but got distinct={has_distinct}")
|
|
145
|
+
if not skeleton.has_distinct and has_distinct:
|
|
146
|
+
violations.append(f"skeleton forbids distinct but got distinct={has_distinct}")
|
|
147
|
+
|
|
148
|
+
expr_comparison = response.get("expr_comparison") or response.get("column_comparison")
|
|
149
|
+
if skeleton.has_expr_comparison and not expr_comparison:
|
|
150
|
+
violations.append("skeleton requires expr_comparison but got none")
|
|
151
|
+
|
|
152
|
+
orderby_cols = response.get("orderby_cols", [])
|
|
153
|
+
if skeleton.has_orderby and len(orderby_cols) == 0:
|
|
154
|
+
violations.append("skeleton requires orderby but got none")
|
|
155
|
+
if not skeleton.has_orderby and len(orderby_cols) > 0:
|
|
156
|
+
violations.append("skeleton forbids orderby but got orderby_cols")
|
|
157
|
+
|
|
158
|
+
return (len(violations) == 0, violations)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _build_retry_guidance(failure_ctx: RetryFailureContext, schema: SchemaGraph, column_roles: dict[str, str]) -> str:
|
|
162
|
+
"""Build a retry guidance string for the LLM based on a previous failure context.
|
|
163
|
+
|
|
164
|
+
Args:
|
|
165
|
+
|
|
166
|
+
failure_ctx: Context object describing the previous attempt's failure, including required/used/missing tables and attempt number.
|
|
167
|
+
schema: Schema graph used to suggest available columns for missing tables.
|
|
168
|
+
column_roles: Map of column key to role string (unused directly but available for future extensions).
|
|
169
|
+
|
|
170
|
+
Returns:
|
|
171
|
+
|
|
172
|
+
A formatted multi-line string to append to the LLM prompt directing it to correct the missing-table violation.
|
|
173
|
+
"""
|
|
174
|
+
guidance_parts = []
|
|
175
|
+
|
|
176
|
+
guidance_parts.append(f"\n\n RETRY GUIDANCE (Attempt {failure_ctx.attempt_number + 2}):")
|
|
177
|
+
guidance_parts.append(f" Previous attempt failed: {failure_ctx.failure_type}")
|
|
178
|
+
guidance_parts.append(f" Required tables: {failure_ctx.required_tables}")
|
|
179
|
+
guidance_parts.append(f" Tables you used: {list(failure_ctx.used_tables)}")
|
|
180
|
+
guidance_parts.append(f" Tables you MUST include: {list(failure_ctx.missing_tables)}")
|
|
181
|
+
|
|
182
|
+
for missing_table in failure_ctx.missing_tables:
|
|
183
|
+
table_ir = schema.tables.get(missing_table)
|
|
184
|
+
if table_ir:
|
|
185
|
+
cols = list(table_ir.columns.keys())[:5]
|
|
186
|
+
guidance_parts.append(f" Available columns in {missing_table}: {cols}")
|
|
187
|
+
|
|
188
|
+
guidance_parts.append(
|
|
189
|
+
f" FIX: Add filters, select_cols, groupby_cols, or aggregation from {list(failure_ctx.missing_tables)}"
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
return "\n".join(guidance_parts)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _llm_fill_intent(skeleton: QSimSkeleton, schema: SchemaGraph, column_roles: dict[str, str]) -> QSimIntent | None:
    """Use the LLM to fill a structural skeleton with concrete column choices and filters.

    Builds a prompt from the skeleton constraints and schema context, calls the
    LLM via ``llm_json``, checks the response with
    ``_validate_skeleton_constraints`` and ``_parse_llm_response``, and retries
    up to ``PolicyConfig.MAX_REPAIR_LOOPS`` times, appending the previous
    failure (or table-coverage guidance) to the prompt on each retry.

    A response that is empty, not a JSON object, or that carries explicit nulls
    for list fields is handled as an ordinary failed attempt rather than
    raising while logging.

    Args:
        skeleton: The structural query skeleton defining table set, aggregation
            presence, filter count, GROUP BY count, and other constraints.
        schema: Schema graph providing column metadata and FK relationships.
        column_roles: Map of ``table.column`` key to column role string.

    Returns:
        A populated ``QSimIntent`` on success, or ``None`` if all retry
        attempts are exhausted.
    """
    context = build_schema_context(skeleton.tables, schema)

    all_filterable = []
    all_groupable = []
    all_aggregatable = []
    for table in skeleton.tables:
        all_filterable.extend(get_filterable_columns(table, schema, column_roles))
        all_groupable.extend(get_groupable_columns(table, schema, column_roles))
        all_aggregatable.extend(get_aggregatable_columns(table, schema, column_roles))

    filterable_list = [col_key for col_key, _ in all_filterable]

    # An expr-vs-expr comparison occupies one of the skeleton's filter slots.
    effective_filters = skeleton.num_filters
    if skeleton.has_expr_comparison:
        effective_filters = max(0, skeleton.num_filters - 1)

    if skeleton.has_aggregation:
        agg_instruction = (
            "MUST include at least one aggregated select column (COUNT/SUM/AVG/MIN/MAX wrapping table.column)"
        )
    else:
        agg_instruction = "NO aggregation - all select_cols must be plain table.column references"

    filter_instruction = (
        f"MUST include {skeleton.num_filters} filter conditions"
        if skeleton.num_filters > 0
        else "DO NOT include filters"
    )
    groupby_instruction = (
        f"MUST include {skeleton.num_groupby} GROUP BY columns"
        if skeleton.num_groupby > 0
        else "DO NOT include GROUP BY"
    )
    orderby_instruction = (
        "MUST include ORDER BY clause (non-empty orderby_cols)" if skeleton.has_orderby else "DO NOT include ORDER BY"
    )

    if skeleton.has_having:
        having_instruction = (
            "MUST include a HAVING condition with an aggregation expression. DO NOT return an empty having array."
        )
    else:
        having_instruction = "DO NOT include HAVING"

    distinct_instruction = "Use SELECT DISTINCT (no aggregations)" if skeleton.has_distinct else ""

    expr_comparison_instruction = ""
    comparable_pairs = []
    if skeleton.has_expr_comparison:
        comparable_pairs = get_comparable_column_pairs(skeleton.tables, schema, column_roles)
        # NOTE(review): when no comparable pairs are found, no instruction is
        # emitted even though validation still requires expr_comparison -
        # confirm skeletons are only generated when pairs exist.
        if comparable_pairs:
            pairs_str = ", ".join([f"{t1}.{c1} vs {t2}.{c2}" for t1, c1, t2, c2, _ in comparable_pairs[:5]])
            expr_comparison_instruction = (
                f"MUST include an expr-vs-expr comparison (e.g., {pairs_str}). "
                "Choose columns and operator that make logical sense. "
                "DO NOT set expr_comparison to null."
            )

    filterable_constraint = (
        f"\n FILTERABLE COLUMNS (MUST use ONLY these for filters): {filterable_list}"
        if effective_filters > 0
        else ""
    )
    aggregatable_constraint = (
        f"\n AGGREGATABLE COLUMNS (use for SUM/AVG/MIN/MAX): {all_aggregatable}"
        if skeleton.has_aggregation and all_aggregatable
        else ""
    )
    groupable_constraint = (
        f"\n GROUPABLE COLUMNS (MUST use for GROUP BY): {all_groupable}" if skeleton.num_groupby > 0 else ""
    )

    optional_instructions = []
    if distinct_instruction:
        optional_instructions.append(distinct_instruction)
    if expr_comparison_instruction:
        optional_instructions.append(expr_comparison_instruction)
    # The leading "" makes join() emit a "\n - " prefix before the first item,
    # continuing the bullet list that ends with the HAVING instruction.
    optional_str = "\n - ".join([""] + optional_instructions) if optional_instructions else ""

    user_prompt = f"""
Schema:
{context}
{filterable_constraint}{aggregatable_constraint}{groupable_constraint}

CRITICAL REQUIREMENTS (MUST follow exactly):
- Tables: {skeleton.tables}
- {agg_instruction}
- {filter_instruction}
- {groupby_instruction}
- {orderby_instruction}
- {having_instruction}{optional_str}

Return JSON:
{{
"select_cols": ["table.column" | "COUNT(table.column)" | "SUM(table.column)" | "AVG(table.column)" | "MIN(table.column)" | "MAX(table.column)", ...],
"filters": [{{"column": "table.column", "op": "=" | ">" | "<" | ">=" | "<=" | "!=" | "like" | "between" | "in" | "not in" | "is null" | "is not null", "value_type": "categorical" | "numeric_categorical" | "numeric" | "temporal" | "boolean" | "null"}}],
"groupby_cols": ["table.column", ...],
"orderby_cols": ["table.column ASC" | "table.column DESC" | "COUNT(table.column) DESC", ...],
"having": [{{"expression": "COUNT(table.column)" | "SUM(table.column)" | "AVG(table.column)" | "MIN(table.column)" | "MAX(table.column)", "op": "=" | "!=" | ">" | "<" | ">=" | "<=" | "in" | "not in" | "between", "value_type": "number" | "integer"}}],
"expr_comparison": {{"left_column": "table.column", "op": "=" | ">" | "<" | ">=" | "<=" | "!=", "right_column": "table.column"}} | null,
"distinct": true | false
}}
"""

    last_failure_reason = None
    failure_context = None
    for attempt in range(PolicyConfig.MAX_REPAIR_LOOPS):
        # Structured table-coverage guidance takes precedence over the plain
        # textual failure reason; at most one of the two is set per attempt.
        prompt_with_context = user_prompt
        if failure_context and attempt > 0:
            retry_guidance = _build_retry_guidance(failure_context, schema, column_roles)
            prompt_with_context = f"{user_prompt}{retry_guidance}"
        elif last_failure_reason and attempt > 0:
            prompt_with_context = f"{user_prompt}\n\n PREVIOUS ATTEMPT FAILED: {last_failure_reason}\n Please fix this issue in your response."

        result = llm_json(_QSIM_FILL_SYSTEM, prompt_with_context, task="default")
        if not result or not isinstance(result, dict):
            # Everything below calls result.get(), so a non-object payload
            # (list, string, number) is treated as a failed attempt.
            last_failure_reason = (
                "LLM returned empty/null response" if not result else "LLM returned a non-object JSON response"
            )
            failure_context = None
            debug(
                f"[qsim_ops.llm_fill_intent] attempt {attempt + 1}/{PolicyConfig.MAX_REPAIR_LOOPS} failed: {last_failure_reason} for skeleton tables={skeleton.tables}"
            )
            continue

        # ``or []`` keeps logging from raising when a field is an explicit null.
        debug(
            f"[qsim_ops.llm_fill_intent] attempt {attempt + 1} LLM returned: select_cols={len(result.get('select_cols') or [])}, filters_count={len(result.get('filters') or [])}, groupby_count={len(result.get('groupby_cols') or [])}, having_count={len(result.get('having') or [])}, expr_comparison={result.get('expr_comparison') or result.get('column_comparison')}, distinct={result.get('distinct')}"
        )

        is_valid, violations = _validate_skeleton_constraints(result, skeleton)
        if not is_valid:
            last_failure_reason = "; ".join(violations)
            failure_context = None
            debug(
                f"[qsim_ops.llm_fill_intent] attempt {attempt + 1}/{PolicyConfig.MAX_REPAIR_LOOPS} SKELETON_CONSTRAINT_VIOLATION: {violations}"
            )
            continue

        parse_result = _parse_llm_response(result, skeleton, schema, column_roles)

        # A 3-tuple signals a table-coverage failure and carries retry context.
        if isinstance(parse_result, tuple) and len(parse_result) == 3:
            failure_type, used_tables, missing_tables = parse_result
            failure_context = RetryFailureContext(
                failure_type=failure_type,
                required_tables=skeleton.tables,
                used_tables=used_tables,
                missing_tables=missing_tables,
                attempt_number=attempt,
            )
            last_failure_reason = None
            debug(
                f"[qsim_ops.llm_fill_intent] attempt {attempt + 1}/{PolicyConfig.MAX_REPAIR_LOOPS} failed: {failure_type} for skeleton tables={skeleton.tables}, missing={missing_tables}"
            )
            continue

        if parse_result:
            debug(
                f"[qsim_ops.llm_fill_intent] SUCCESS: intent_id={parse_result.intent_id}, grain={parse_result.grain}, filters={len(parse_result.filters_param)}, groupby={len(parse_result.group_by_cols)}, distinct={parse_result.distinct}"
            )
            return parse_result

        last_failure_reason = "Response validation failed (filters/columns rejected)"
        failure_context = None
        debug(
            f"[qsim_ops.llm_fill_intent] attempt {attempt + 1}/{PolicyConfig.MAX_REPAIR_LOOPS} failed: parse_llm_response returned None for skeleton tables={skeleton.tables}, LLM response keys={list(result.keys())}"
        )

    debug(
        f"[qsim_ops.llm_fill_intent] FINAL_FAILURE: exhausted {PolicyConfig.MAX_REPAIR_LOOPS} attempts for skeleton tables={skeleton.tables}, has_agg={skeleton.has_aggregation}, num_filters={skeleton.num_filters}"
    )
    return None
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def _parse_llm_response(
|
|
382
|
+
response: dict[str, Any],
|
|
383
|
+
skeleton: QSimSkeleton,
|
|
384
|
+
schema: SchemaGraph,
|
|
385
|
+
column_roles: dict[str, str],
|
|
386
|
+
) -> Any | None:
|
|
387
|
+
"""Parse an LLM response dict into a validated ``QSimIntent``.
|
|
388
|
+
|
|
389
|
+
Validates and normalises select columns, filters, GROUP BY, ORDER BY, HAVING, and expr comparisons against the skeleton constraints and schema.
|
|
390
|
+
|
|
391
|
+
Args:
|
|
392
|
+
|
|
393
|
+
response: Parsed LLM JSON response with intent fields.
|
|
394
|
+
skeleton: Skeleton whose constraints the response must satisfy.
|
|
395
|
+
schema: Schema graph for column existence and metadata checks.
|
|
396
|
+
column_roles: Map of ``table.column`` key to column role string.
|
|
397
|
+
|
|
398
|
+
Returns:
|
|
399
|
+
|
|
400
|
+
A ``QSimIntent`` on success; ``None`` if validation fails; or a 3-tuple ``(failure_type, used_tables, missing_tables)`` when a three-table coverage violation is detected (used as retry context).
|
|
401
|
+
"""
|
|
402
|
+
select_cols_raw = response.get("select_cols", [])
|
|
403
|
+
filter_dicts = response.get("filters", [])
|
|
404
|
+
groupby_cols = response.get("groupby_cols", [])
|
|
405
|
+
orderby_cols_raw = response.get("orderby_cols", [])
|
|
406
|
+
having_dicts = response.get("having", [])
|
|
407
|
+
expr_comparison_dict = response.get("expr_comparison") or response.get("column_comparison")
|
|
408
|
+
has_distinct = response.get("distinct", False)
|
|
409
|
+
|
|
410
|
+
has_agg = _has_aggregation(select_cols_raw)
|
|
411
|
+
|
|
412
|
+
if skeleton.has_aggregation and not has_agg:
|
|
413
|
+
debug("[qsim_ops.parse_llm_response] REJECTED: skeleton requires aggregation but none in select_cols")
|
|
414
|
+
return None
|
|
415
|
+
if skeleton.num_filters > 0 and len(filter_dicts) == 0 and not skeleton.has_expr_comparison:
|
|
416
|
+
debug(
|
|
417
|
+
f"[qsim_ops.parse_llm_response] REJECTED: skeleton requires {skeleton.num_filters} filters but none provided"
|
|
418
|
+
)
|
|
419
|
+
return None
|
|
420
|
+
if skeleton.num_groupby > 0 and len(groupby_cols) == 0:
|
|
421
|
+
debug(
|
|
422
|
+
f"[qsim_ops.parse_llm_response] REJECTED: skeleton requires {skeleton.num_groupby} groupby cols but none provided"
|
|
423
|
+
)
|
|
424
|
+
return None
|
|
425
|
+
|
|
426
|
+
if skeleton.has_orderby and len(orderby_cols_raw) == 0:
|
|
427
|
+
debug("[qsim_ops.parse_llm_response] REJECTED: skeleton requires orderby but none provided")
|
|
428
|
+
return None
|
|
429
|
+
if not skeleton.has_orderby and len(orderby_cols_raw) > 0:
|
|
430
|
+
debug("[qsim_ops.parse_llm_response] REJECTED: skeleton forbids orderby but orderby_cols provided")
|
|
431
|
+
return None
|
|
432
|
+
|
|
433
|
+
if skeleton.has_distinct and skeleton.has_aggregation:
|
|
434
|
+
debug("[qsim_ops.parse_llm_response] REJECTED: DISTINCT not allowed with aggregation")
|
|
435
|
+
return None
|
|
436
|
+
|
|
437
|
+
select_cols: list[str] = []
|
|
438
|
+
for sc in select_cols_raw:
|
|
439
|
+
if not isinstance(sc, str) or not sc.strip():
|
|
440
|
+
continue
|
|
441
|
+
sc = sc.strip()
|
|
442
|
+
agg_info = _extract_agg_info(sc)
|
|
443
|
+
if agg_info:
|
|
444
|
+
agg_func, agg_inner = agg_info
|
|
445
|
+
if agg_inner != "*":
|
|
446
|
+
if not validate_column_exists(agg_inner, skeleton.tables, schema):
|
|
447
|
+
debug(f"[qsim_ops.parse_llm_response] REJECTED_SELECT: {sc}, reason=agg_column_not_found")
|
|
448
|
+
continue
|
|
449
|
+
select_cols.append(f"{agg_func.upper()}({agg_inner})")
|
|
450
|
+
else:
|
|
451
|
+
if not validate_column_exists(sc, skeleton.tables, schema):
|
|
452
|
+
debug(f"[qsim_ops.parse_llm_response] REJECTED_SELECT: {sc}, reason=column_not_found")
|
|
453
|
+
continue
|
|
454
|
+
select_cols.append(sc)
|
|
455
|
+
|
|
456
|
+
if not select_cols:
|
|
457
|
+
debug("[qsim_ops.parse_llm_response] REJECTED: no valid select_cols remaining")
|
|
458
|
+
return None
|
|
459
|
+
|
|
460
|
+
aggregated_tables: set[str] = set()
|
|
461
|
+
if has_agg and groupby_cols:
|
|
462
|
+
for gcol in groupby_cols:
|
|
463
|
+
if "." in gcol:
|
|
464
|
+
aggregated_tables.add(gcol.split(".")[0])
|
|
465
|
+
|
|
466
|
+
filters: list[QSimFilter] = []
|
|
467
|
+
filter_columns_used: set[str] = set()
|
|
468
|
+
for _filter_idx, fd in enumerate(filter_dicts):
|
|
469
|
+
col = fd.get("column", "")
|
|
470
|
+
if not validate_column_exists(col, skeleton.tables, schema):
|
|
471
|
+
debug(f"[qsim_ops.parse_llm_response] REJECTED_FILTER: col={col}, reason=column_not_found")
|
|
472
|
+
continue
|
|
473
|
+
|
|
474
|
+
table, col_name = col.split(".", 1)
|
|
475
|
+
col_meta = schema.tables[table].columns.get(col_name)
|
|
476
|
+
if not col_meta or not col_meta.is_filterable or not col_meta.is_usable:
|
|
477
|
+
debug(f"[qsim_ops.parse_llm_response] REJECTED_FILTER: col={col}, reason=not_filterable")
|
|
478
|
+
continue
|
|
479
|
+
|
|
480
|
+
if col not in filter_columns_used and len(filter_columns_used) >= QSimConfig.MAX_FILTER_COLUMNS + 1:
|
|
481
|
+
debug(
|
|
482
|
+
f"[qsim_ops.parse_llm_response] REJECTED_FILTER: col={col}, reason=max_filter_columns_exceeded (>{QSimConfig.MAX_FILTER_COLUMNS + 1})"
|
|
483
|
+
)
|
|
484
|
+
continue
|
|
485
|
+
|
|
486
|
+
if col not in filter_columns_used and len(filter_columns_used) >= QSimConfig.MAX_FILTER_COLUMNS:
|
|
487
|
+
debug(
|
|
488
|
+
f"[qsim_ops.parse_llm_response] WARNING_FILTER: col={col}, using {len(filter_columns_used) + 1} distinct columns (preferred max={QSimConfig.MAX_FILTER_COLUMNS})"
|
|
489
|
+
)
|
|
490
|
+
|
|
491
|
+
op = fd.get("op", "=")
|
|
492
|
+
valid_ops = col_meta.get_valid_filter_ops()
|
|
493
|
+
|
|
494
|
+
if op not in valid_ops:
|
|
495
|
+
debug(f"[qsim_ops.parse_llm_response] REJECTED_FILTER: col={col}, reason=invalid_operator_{op}_for_type")
|
|
496
|
+
continue
|
|
497
|
+
|
|
498
|
+
if has_agg and col_meta.is_foreign_key and op == "=":
|
|
499
|
+
fk_target_table = col_meta.fk_target[0] if col_meta.fk_target else None
|
|
500
|
+
|
|
501
|
+
if fk_target_table and fk_target_table in aggregated_tables:
|
|
502
|
+
debug(
|
|
503
|
+
f"[qsim_ops.parse_llm_response] REJECTED_FILTER: col={col}, reason=circular_fk_to_aggregated_table"
|
|
504
|
+
)
|
|
505
|
+
continue
|
|
506
|
+
|
|
507
|
+
if table in aggregated_tables:
|
|
508
|
+
debug(
|
|
509
|
+
f"[qsim_ops.parse_llm_response] REJECTED_FILTER: col={col}, reason=fk_filter_on_aggregated_source_table"
|
|
510
|
+
)
|
|
511
|
+
continue
|
|
512
|
+
|
|
513
|
+
value_type = fd.get("value_type", "categorical")
|
|
514
|
+
if op in ("is null", "is not null"):
|
|
515
|
+
value_type = "null"
|
|
516
|
+
elif value_type not in VALID_FILTER_VALUE_TYPES and value_type != "null":
|
|
517
|
+
value_type = "categorical"
|
|
518
|
+
|
|
519
|
+
filter_columns_used.add(col)
|
|
520
|
+
|
|
521
|
+
qf = QSimFilter(column=col, op=op, value_type=value_type)
|
|
522
|
+
if op == "between":
|
|
523
|
+
decomposed = decompose_between_filter(qf)
|
|
524
|
+
filters.extend(decomposed)
|
|
525
|
+
debug(f"[qsim_ops.parse_llm_response] DECOMPOSED_BETWEEN: col={col} into >= and <=")
|
|
526
|
+
else:
|
|
527
|
+
filters.append(qf)
|
|
528
|
+
debug(f"[qsim_ops.parse_llm_response] ACCEPTED_FILTER: col={col}, op={op}, value_type={value_type}")
|
|
529
|
+
|
|
530
|
+
if skeleton.has_expr_comparison and expr_comparison_dict:
|
|
531
|
+
left_col_full = expr_comparison_dict.get("left_column", "")
|
|
532
|
+
right_col_full = expr_comparison_dict.get("right_column", "")
|
|
533
|
+
cmp_op = expr_comparison_dict.get("op", "=")
|
|
534
|
+
|
|
535
|
+
if left_col_full and right_col_full and "." in left_col_full and "." in right_col_full:
|
|
536
|
+
left_table, left_col_name = left_col_full.split(".", 1)
|
|
537
|
+
right_table, right_col_name = right_col_full.split(".", 1)
|
|
538
|
+
|
|
539
|
+
left_valid = validate_column_exists(left_col_full, skeleton.tables, schema)
|
|
540
|
+
right_valid = validate_column_exists(right_col_full, skeleton.tables, schema)
|
|
541
|
+
|
|
542
|
+
if left_valid and right_valid:
|
|
543
|
+
left_meta = schema.tables[left_table].columns.get(left_col_name)
|
|
544
|
+
right_meta = schema.tables[right_table].columns.get(right_col_name)
|
|
545
|
+
|
|
546
|
+
if left_meta and right_meta:
|
|
547
|
+
left_is_numeric = left_meta.value_type in ("integer", "number")
|
|
548
|
+
right_is_numeric = right_meta.value_type in ("integer", "number")
|
|
549
|
+
left_is_temporal = left_meta.value_type == "date"
|
|
550
|
+
right_is_temporal = right_meta.value_type == "date"
|
|
551
|
+
|
|
552
|
+
left_role = column_roles.get(f"{left_table}.{left_col_name}", left_meta.role or "unknown")
|
|
553
|
+
right_role = column_roles.get(f"{right_table}.{right_col_name}", right_meta.role or "unknown")
|
|
554
|
+
|
|
555
|
+
semantic_compatible = False
|
|
556
|
+
rejection_reason = None
|
|
557
|
+
|
|
558
|
+
if left_role == right_role and left_role != "unknown":
|
|
559
|
+
semantic_compatible = True
|
|
560
|
+
elif left_is_temporal and right_is_temporal:
|
|
561
|
+
semantic_compatible = True
|
|
562
|
+
elif left_is_numeric and right_is_numeric and left_role == right_role:
|
|
563
|
+
semantic_compatible = True
|
|
564
|
+
else:
|
|
565
|
+
rejection_reason = f"role_mismatch: left={left_col_full}(role={left_role}) vs right={right_col_full}(role={right_role})"
|
|
566
|
+
|
|
567
|
+
if semantic_compatible:
|
|
568
|
+
value_type = "temporal" if left_is_temporal else "numeric"
|
|
569
|
+
filters.append(
|
|
570
|
+
QSimFilter(
|
|
571
|
+
column=left_col_full,
|
|
572
|
+
op=cmp_op,
|
|
573
|
+
value_type=value_type,
|
|
574
|
+
right_column=right_col_full,
|
|
575
|
+
)
|
|
576
|
+
)
|
|
577
|
+
debug(
|
|
578
|
+
f"[qsim_ops.parse_llm_response] ACCEPTED_COLUMN_COMPARISON: {left_col_full} {cmp_op} {right_col_full}, roles={left_role}={right_role}"
|
|
579
|
+
)
|
|
580
|
+
else:
|
|
581
|
+
debug(
|
|
582
|
+
f"[qsim_ops.parse_llm_response] DISCARDED_EXPR_COMPARISON: {left_col_full} {cmp_op} {right_col_full}, reason={rejection_reason}"
|
|
583
|
+
)
|
|
584
|
+
else:
|
|
585
|
+
debug("[qsim_ops.parse_llm_response] DISCARDED_EXPR_COMPARISON: column metadata not found")
|
|
586
|
+
else:
|
|
587
|
+
debug(
|
|
588
|
+
f"[qsim_ops.parse_llm_response] DISCARDED_EXPR_COMPARISON: column validation failed left={left_valid} right={right_valid}"
|
|
589
|
+
)
|
|
590
|
+
else:
|
|
591
|
+
debug("[qsim_ops.parse_llm_response] DISCARDED_EXPR_COMPARISON: invalid column format")
|
|
592
|
+
|
|
593
|
+
total_filter_elements = len(filters)
|
|
594
|
+
if skeleton.num_filters > 0 and total_filter_elements == 0:
|
|
595
|
+
debug(
|
|
596
|
+
f"[qsim_ops.parse_llm_response] INSUFFICIENT_FILTERS: requested={skeleton.num_filters}, validated_filters={len(filters)}, rejecting_intent"
|
|
597
|
+
)
|
|
598
|
+
return None
|
|
599
|
+
|
|
600
|
+
if has_agg and groupby_cols:
|
|
601
|
+
for sc in select_cols:
|
|
602
|
+
agg_info = _extract_agg_info(sc)
|
|
603
|
+
if agg_info:
|
|
604
|
+
_, agg_inner = agg_info
|
|
605
|
+
agg_inner_base = agg_inner.split(".")[-1] if "." in agg_inner else agg_inner
|
|
606
|
+
for gcol in groupby_cols:
|
|
607
|
+
gcol_base = gcol.split(".")[-1] if "." in gcol else gcol
|
|
608
|
+
if agg_inner == gcol:
|
|
609
|
+
debug(
|
|
610
|
+
f"[qsim_ops.parse_llm_response] REJECTED: agg_inner={agg_inner} matches groupby_col={gcol}, reason=exact_self_grouping"
|
|
611
|
+
)
|
|
612
|
+
return None
|
|
613
|
+
if agg_inner_base == gcol_base:
|
|
614
|
+
debug(
|
|
615
|
+
f"[qsim_ops.parse_llm_response] REJECTED: agg_inner={agg_inner} matches groupby_col={gcol}, reason=base_name_self_grouping"
|
|
616
|
+
)
|
|
617
|
+
return None
|
|
618
|
+
|
|
619
|
+
having: list[QSimHaving] = []
|
|
620
|
+
for hd in having_dicts:
|
|
621
|
+
h_expression = hd.get("expression", "")
|
|
622
|
+
h_op = hd.get("op", ">")
|
|
623
|
+
if h_op not in VALID_HAVING_OPS:
|
|
624
|
+
h_op = ">"
|
|
625
|
+
h_value_type = hd.get("value_type", "number")
|
|
626
|
+
if h_value_type not in VALID_HAVING_VALUE_TYPES:
|
|
627
|
+
h_value_type = "number"
|
|
628
|
+
right_expr = hd.get("right_expression", "")
|
|
629
|
+
|
|
630
|
+
h_agg_info = _extract_agg_info(h_expression)
|
|
631
|
+
if not h_agg_info:
|
|
632
|
+
debug(
|
|
633
|
+
f"[qsim_ops.parse_llm_response] REJECTED_HAVING: expression={h_expression}, reason=no_aggregation_pattern"
|
|
634
|
+
)
|
|
635
|
+
continue
|
|
636
|
+
|
|
637
|
+
h_agg_func, h_agg_inner = h_agg_info
|
|
638
|
+
if h_agg_inner != "*" and not validate_column_exists(h_agg_inner, skeleton.tables, schema):
|
|
639
|
+
debug(f"[qsim_ops.parse_llm_response] REJECTED_HAVING: expression={h_expression}, reason=column_not_found")
|
|
640
|
+
continue
|
|
641
|
+
|
|
642
|
+
if right_expr:
|
|
643
|
+
right_agg_info = _extract_agg_info(right_expr)
|
|
644
|
+
if not right_agg_info:
|
|
645
|
+
debug(
|
|
646
|
+
f"[qsim_ops.parse_llm_response] REJECTED_HAVING: right_expression={right_expr}, reason=no_aggregation_pattern"
|
|
647
|
+
)
|
|
648
|
+
continue
|
|
649
|
+
right_agg_func, right_agg_inner = right_agg_info
|
|
650
|
+
if right_agg_inner != "*" and not validate_column_exists(right_agg_inner, skeleton.tables, schema):
|
|
651
|
+
debug(
|
|
652
|
+
f"[qsim_ops.parse_llm_response] REJECTED_HAVING: right_expression={right_expr}, reason=column_not_found"
|
|
653
|
+
)
|
|
654
|
+
continue
|
|
655
|
+
having.append(
|
|
656
|
+
QSimHaving(
|
|
657
|
+
expression=f"{h_agg_func.upper()}({h_agg_inner})",
|
|
658
|
+
op=h_op,
|
|
659
|
+
value_type="expression",
|
|
660
|
+
right_expression=f"{right_agg_func.upper()}({right_agg_inner})",
|
|
661
|
+
)
|
|
662
|
+
)
|
|
663
|
+
else:
|
|
664
|
+
having.append(
|
|
665
|
+
QSimHaving(
|
|
666
|
+
expression=f"{h_agg_func.upper()}({h_agg_inner})",
|
|
667
|
+
op=h_op,
|
|
668
|
+
value_type=h_value_type,
|
|
669
|
+
)
|
|
670
|
+
)
|
|
671
|
+
|
|
672
|
+
validated_groupby: list[str] = []
|
|
673
|
+
for gcol in groupby_cols:
|
|
674
|
+
if validate_column_exists(gcol, skeleton.tables, schema):
|
|
675
|
+
validated_groupby.append(gcol)
|
|
676
|
+
else:
|
|
677
|
+
debug(f"[qsim_ops.parse_llm_response] REJECTED_GROUPBY: col={gcol}, reason=column_not_found")
|
|
678
|
+
|
|
679
|
+
order_by_cols: list[str] = []
|
|
680
|
+
for ob in orderby_cols_raw:
|
|
681
|
+
ob_clean = ob.strip()
|
|
682
|
+
direction = "ASC"
|
|
683
|
+
if ob_clean.upper().endswith(" DESC"):
|
|
684
|
+
direction = "DESC"
|
|
685
|
+
ob_clean = ob_clean[:-5].strip()
|
|
686
|
+
elif ob_clean.upper().endswith(" ASC"):
|
|
687
|
+
ob_clean = ob_clean[:-4].strip()
|
|
688
|
+
|
|
689
|
+
agg_info = _extract_agg_info(ob_clean)
|
|
690
|
+
if agg_info:
|
|
691
|
+
agg_func, agg_inner = agg_info
|
|
692
|
+
if agg_inner != "*" and not validate_column_exists(agg_inner, skeleton.tables, schema):
|
|
693
|
+
debug(f"[qsim_ops.parse_llm_response] REJECTED_ORDERBY: {ob}, reason=column_not_found")
|
|
694
|
+
continue
|
|
695
|
+
order_by_cols.append(f"{agg_func.upper()}({agg_inner}) {direction}")
|
|
696
|
+
else:
|
|
697
|
+
if not validate_column_exists(ob_clean, skeleton.tables, schema):
|
|
698
|
+
debug(f"[qsim_ops.parse_llm_response] REJECTED_ORDERBY: {ob}, reason=column_not_found")
|
|
699
|
+
continue
|
|
700
|
+
order_by_cols.append(f"{ob_clean} {direction}")
|
|
701
|
+
|
|
702
|
+
grain = "row_level"
|
|
703
|
+
if has_agg:
|
|
704
|
+
grain = "grouped" if validated_groupby else "scalar"
|
|
705
|
+
|
|
706
|
+
use_distinct = skeleton.has_distinct and has_distinct and grain == "row_level"
|
|
707
|
+
|
|
708
|
+
if skeleton.has_distinct and not use_distinct:
|
|
709
|
+
if not has_distinct:
|
|
710
|
+
debug(
|
|
711
|
+
"[qsim_ops.parse_llm_response] DISTINCT_REJECTED: LLM returned distinct=false despite skeleton.has_distinct=True"
|
|
712
|
+
)
|
|
713
|
+
elif grain != "row_level":
|
|
714
|
+
debug(
|
|
715
|
+
f"[qsim_ops.parse_llm_response] DISTINCT_REJECTED: grain={grain} incompatible with DISTINCT (requires row_level)"
|
|
716
|
+
)
|
|
717
|
+
|
|
718
|
+
if len(skeleton.tables) >= 3:
|
|
719
|
+
tables_used: set[str] = set()
|
|
720
|
+
for sc in select_cols:
|
|
721
|
+
tables_used.update(_extract_tables_from_expr(sc))
|
|
722
|
+
for col in validated_groupby:
|
|
723
|
+
tables_used.update(_extract_tables_from_expr(col))
|
|
724
|
+
for f in filters:
|
|
725
|
+
tables_used.update(_extract_tables_from_expr(f.column))
|
|
726
|
+
if f.right_column:
|
|
727
|
+
tables_used.update(_extract_tables_from_expr(f.right_column))
|
|
728
|
+
for ob in order_by_cols:
|
|
729
|
+
tables_used.update(_extract_tables_from_expr(ob))
|
|
730
|
+
|
|
731
|
+
missing_tables = set(skeleton.tables) - tables_used
|
|
732
|
+
if missing_tables:
|
|
733
|
+
debug(
|
|
734
|
+
f"[qsim_ops.parse_llm_response] REJECTED_THREE_TABLE: tables={skeleton.tables}, used={tables_used}, missing={missing_tables}"
|
|
735
|
+
)
|
|
736
|
+
return ("three_table_violation", tables_used, missing_tables)
|
|
737
|
+
|
|
738
|
+
intent_id_val = compute_intent_id(
|
|
739
|
+
{
|
|
740
|
+
"tables": skeleton.tables,
|
|
741
|
+
"grain": grain,
|
|
742
|
+
"select_cols": select_cols,
|
|
743
|
+
"group_by_cols": validated_groupby,
|
|
744
|
+
"filters_param": [f.to_dict() for f in filters],
|
|
745
|
+
"having_param": [h.to_dict() for h in having],
|
|
746
|
+
"distinct": use_distinct,
|
|
747
|
+
}
|
|
748
|
+
)
|
|
749
|
+
|
|
750
|
+
return QSimIntent(
|
|
751
|
+
intent_id=intent_id_val,
|
|
752
|
+
tables=skeleton.tables,
|
|
753
|
+
grain=grain,
|
|
754
|
+
select_cols=select_cols,
|
|
755
|
+
group_by_cols=validated_groupby,
|
|
756
|
+
order_by_cols=order_by_cols,
|
|
757
|
+
filters_param=filters,
|
|
758
|
+
having_param=having,
|
|
759
|
+
param_values={},
|
|
760
|
+
distinct=use_distinct,
|
|
761
|
+
)
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
def _generate_question_from_intent(intent: QSimIntent, schema: SchemaGraph) -> str | None:
    """Turn a structured ``QSimIntent`` into a natural-language question.

    Every filter and HAVING entry is summarised as a small dict holding its target and a textual condition.
    Column/expression comparisons render the right-hand side verbatim; value comparisons look up the sampled value in ``intent.param_values`` under ``f<idx>`` / ``h<idx>`` and fall back to ``"?"`` when it is absent.

    Args:

        intent: The structured intent whose tables, SELECT columns, filters, GROUP BY columns and HAVING clauses are forwarded to ``generate_question``.
        schema: Schema graph passed through to ``generate_question`` unchanged.

    Returns:

        Whatever ``generate_question`` returns for the assembled description (annotated as a question string or ``None``).
    """
    sampled_values = intent.param_values

    filter_notes = [
        {
            "column": flt.column,
            "condition": (
                f"{flt.op} {flt.right_column}"
                if flt.is_expr_comparison
                else f"{flt.op} {sampled_values.get(f'f{position}', '?')}"
            ),
        }
        for position, flt in enumerate(intent.filters_param)
    ]

    having_notes = [
        {
            "expression": clause.expression,
            "condition": (
                f"{clause.op} {clause.right_expression}"
                if clause.is_expression_comparison
                else f"{clause.op} {sampled_values.get(f'h{position}', '?')}"
            ),
        }
        for position, clause in enumerate(intent.having_param)
    ]

    return generate_question(
        intent.tables,
        intent.select_cols,
        filter_notes,
        intent.group_by_cols,
        having_notes,
        schema,
    )
|
|
801
|
+
|
|
802
|
+
|
|
803
|
+
def generate_all_questions(intents: list[QSimIntent], schema: SchemaGraph) -> list[QSimIntent]:
    """Attach a generated NL question to every intent for which generation succeeds.

    Args:

        intents: ``QSimIntent`` instances to generate questions for.
        schema: Schema graph forwarded to ``_generate_question_from_intent``.

    Returns:

        New ``QSimIntent`` instances, in input order, carrying the same field values as their source plus the ``question`` text; intents whose question came back empty/``None`` are left out.
    """
    # Fields copied verbatim from the source intent onto the question-bearing copy.
    carried_fields = (
        "intent_id",
        "tables",
        "grain",
        "select_cols",
        "group_by_cols",
        "order_by_cols",
        "filters_param",
        "having_param",
        "param_values",
        "variant_idx",
        "limit",
        "distinct",
    )

    debug(f"[qsim_ops.generate_all_questions] generating: {len(intents)} questions")

    answered: list[QSimIntent] = []
    for position, source in enumerate(intents):
        # Progress line every tenth intent, but not before the very first one.
        if position and position % 10 == 0:
            debug(f"[qsim_ops.generate_all_questions] progress: {position}/{len(intents)}")

        text = _generate_question_from_intent(source, schema)
        if not text:
            debug(f"[qsim_ops.generate_all_questions] failed: {source.intent_id}")
            continue

        copied = {name: getattr(source, name) for name in carried_fields}
        answered.append(QSimIntent(question=text, **copied))

    debug(f"[qsim_ops.generate_all_questions] complete: {len(answered)} questions")
    return answered
|
|
847
|
+
|
|
848
|
+
|
|
849
|
+
def _is_no_variance_skeleton(skeleton: QSimSkeleton) -> bool:
    """Tell whether a skeleton offers no variance, i.e. has neither filters nor HAVING.

    Such skeletons yield the same query no matter which values are sampled, so callers ration them.

    Args:

        skeleton: The skeleton to inspect (``num_filters`` and ``has_having`` are read).

    Returns:

        ``True`` only when ``num_filters`` is zero and ``has_having`` is falsy.
    """
    if skeleton.num_filters != 0:
        return False
    return not skeleton.has_having
|
|
863
|
+
|
|
864
|
+
|
|
865
|
+
def _compute_skeleton_complexity_tier(skeleton: QSimSkeleton) -> str:
    """Bucket a skeleton into a complexity tier used for stratified pool construction.

    The score is a weighted sum of structural features: 2 per filter, 3 for aggregation, 2 per GROUP BY column, 3 for HAVING, 1 for ORDER BY, 2 for DISTINCT and 3 for an expression comparison.

    Args:

        skeleton: The skeleton to score.

    Returns:

        ``"A"`` when the score is at least 8, ``"B"`` when it is at least 4, otherwise ``"C"``.
    """
    contributions = (
        2 * skeleton.num_filters,
        3 if skeleton.has_aggregation else 0,
        2 * skeleton.num_groupby,
        3 if skeleton.has_having else 0,
        1 if skeleton.has_orderby else 0,
        2 if skeleton.has_distinct else 0,
        3 if skeleton.has_expr_comparison else 0,
    )
    total = sum(contributions)

    if total < 4:
        return "C"
    if total < 8:
        return "B"
    return "A"
|
|
893
|
+
|
|
894
|
+
|
|
895
|
+
def _compute_table_set_richness(tables: list[str], schema: SchemaGraph, column_roles: dict[str, str]) -> int:
    """Score a candidate table set by how many query-building opportunities it offers.

    Args:

        tables: Table names in the candidate set.
        schema: Schema graph handed to the column capability helpers.
        column_roles: Map of column key to role string, handed to the same helpers.

    Returns:

        Integer score: per table, 2 for each filterable column, 3 for each aggregatable column and 2 for each groupable column, plus 2 for each comparable column pair across the whole set.
    """
    richness = 0
    for table_name in tables:
        richness += 2 * len(get_filterable_columns(table_name, schema, column_roles))
        richness += 3 * len(get_aggregatable_columns(table_name, schema, column_roles))
        richness += 2 * len(get_groupable_columns(table_name, schema, column_roles))

    # Comparable pairs are evaluated once for the whole set rather than per table.
    richness += 2 * len(get_comparable_column_pairs(tables, schema, column_roles))
    return richness
|
|
920
|
+
|
|
921
|
+
|
|
922
|
+
def _build_skeleton_pool(
    schema: SchemaGraph, column_roles: dict[str, str], num_tables: int | None = None
) -> SkeletonPool:
    """Assemble the tiered ``SkeletonPool`` consumed by adaptive skeleton selection.

    Table sets are enumerated, optionally restricted to one size, ordered by descending richness, and each of their skeletons is filed under tier A, B or C according to its complexity tier.
    Every per-table-set list is shuffled, and all cursor indices start at zero.

    Args:

        schema: Schema graph used for table-set enumeration, skeleton generation and richness scoring.
        column_roles: Map of column key to role string.
        num_tables: When given, only table sets with exactly this many tables are kept.

    Returns:

        A ``SkeletonPool`` holding the three tier maps, the table-set keys in richness order, and zeroed per-tier cursors.
    """
    candidates = enumerate_table_sets(schema)
    if num_tables is not None:
        candidates = [members for members in candidates if len(members) == num_tables]

    # Richest table sets first; the sort is stable, so ties keep enumeration order.
    ranked = sorted(
        candidates,
        key=lambda members: _compute_table_set_richness(members, schema, column_roles),
        reverse=True,
    )

    high: dict[str, list[QSimSkeleton]] = {}
    medium: dict[str, list[QSimSkeleton]] = {}
    low: dict[str, list[QSimSkeleton]] = {}
    bucket_for_tier = {"A": high, "B": medium}

    for members in ranked:
        key = "|".join(sorted(members))
        high[key], medium[key], low[key] = [], [], []

        for skel in generate_all_skeletons(members, schema, column_roles):
            # Anything that is not tier A or B lands in the low-complexity bucket.
            target = bucket_for_tier.get(_compute_skeleton_complexity_tier(skel), low)
            target[key].append(skel)

    # Shuffle A, then B, then C for each key in turn so seeded runs stay reproducible.
    for key in high:
        for bucket in (high, medium, low):
            random.shuffle(bucket[key])

    keys = list(high)

    total_a = sum(len(entries) for entries in high.values())
    total_b = sum(len(entries) for entries in medium.values())
    total_c = sum(len(entries) for entries in low.values())
    debug(f"[qsim_ops.build_skeleton_pool] built pool: tier_a={total_a}, tier_b={total_b}, tier_c={total_c}")

    return SkeletonPool(
        tier_a_by_table_set=high,
        tier_b_by_table_set=medium,
        tier_c_by_table_set=low,
        table_set_keys=keys,
        tier_a_indices=dict.fromkeys(keys, 0),
        tier_b_indices=dict.fromkeys(keys, 0),
        tier_c_indices=dict.fromkeys(keys, 0),
    )
|
|
991
|
+
|
|
992
|
+
|
|
993
|
+
def _select_next_skeleton(
|
|
994
|
+
pool: SkeletonPool, need_filters: bool, need_having: bool
|
|
995
|
+
) -> tuple[QSimSkeleton, list[str]] | None:
|
|
996
|
+
"""Select the next skeleton from the pool using round-robin table-set iteration.
|
|
997
|
+
|
|
998
|
+
Prefers tier A skeletons, then B, then C, honouring coverage needs for filters and HAVING when specified.
|
|
999
|
+
|
|
1000
|
+
Args:
|
|
1001
|
+
|
|
1002
|
+
pool: The ``SkeletonPool`` with current iteration state.
|
|
1003
|
+
need_filters: When ``True``, only select skeletons that require at least one filter.
|
|
1004
|
+
need_having: When ``True``, only select skeletons that require HAVING.
|
|
1005
|
+
|
|
1006
|
+
Returns:
|
|
1007
|
+
|
|
1008
|
+
A 2-tuple ``(skeleton, table_set)`` for the selected skeleton, or ``None`` if no suitable skeleton is found across all table sets.
|
|
1009
|
+
"""
|
|
1010
|
+
|
|
1011
|
+
def matches_needs(skel: QSimSkeleton) -> bool:
|
|
1012
|
+
if need_filters and skel.num_filters == 0:
|
|
1013
|
+
return False
|
|
1014
|
+
if need_having and not skel.has_having:
|
|
1015
|
+
return False
|
|
1016
|
+
return True
|
|
1017
|
+
|
|
1018
|
+
start_idx = pool.current_table_idx
|
|
1019
|
+
attempts = 0
|
|
1020
|
+
max_attempts = len(pool.table_set_keys)
|
|
1021
|
+
|
|
1022
|
+
tiers = [
|
|
1023
|
+
("a", pool.tier_a_by_table_set, pool.tier_a_indices),
|
|
1024
|
+
("b", pool.tier_b_by_table_set, pool.tier_b_indices),
|
|
1025
|
+
("c", pool.tier_c_by_table_set, pool.tier_c_indices),
|
|
1026
|
+
]
|
|
1027
|
+
|
|
1028
|
+
while attempts < max_attempts:
|
|
1029
|
+
table_idx = (start_idx + attempts) % len(pool.table_set_keys)
|
|
1030
|
+
table_key = pool.table_set_keys[table_idx]
|
|
1031
|
+
table_set = table_key.split("|")
|
|
1032
|
+
|
|
1033
|
+
for _tier_name, tier_dict, indices_dict in tiers:
|
|
1034
|
+
skeletons = tier_dict[table_key]
|
|
1035
|
+
current_idx = indices_dict[table_key]
|
|
1036
|
+
|
|
1037
|
+
for i in range(current_idx, len(skeletons)):
|
|
1038
|
+
skel = skeletons[i]
|
|
1039
|
+
if matches_needs(skel):
|
|
1040
|
+
indices_dict[table_key] = i + 1
|
|
1041
|
+
pool.current_table_idx = (table_idx + 1) % len(pool.table_set_keys)
|
|
1042
|
+
return skel, table_set
|
|
1043
|
+
|
|
1044
|
+
attempts += 1
|
|
1045
|
+
|
|
1046
|
+
return None
|
|
1047
|
+
|
|
1048
|
+
|
|
1049
|
+
def _normalize_qsim_intent(intent: QSimIntent, schema: SchemaGraph) -> QSimIntent:
    """Normalise a ``QSimIntent`` for canonical deduplication and consistency.

    Steps performed:

    * reconcile ``grain`` with aggregation and GROUP BY presence;
    * deduplicate and sort the SELECT columns, and sort (without deduplicating) the ORDER BY columns;
    * drop tables that no clause references, provided the referenced tables are still FK-connected;
    * prefix unqualified GROUP BY columns with the first normalised table;
    * recompute the intent ID from the normalised fields.

    Args:

        intent: The ``QSimIntent`` to normalise.
        schema: Schema graph used for the FK connectivity check.

    Returns:

        A new ``QSimIntent`` with canonical field values and a recomputed ``intent_id``; filters, HAVING clauses, parameter values, question, variant index, limit and distinct flag are carried over unchanged.
    """
    grain = intent.grain
    has_agg = _has_aggregation(intent.select_cols)

    if grain == "grouped":
        # NOTE(review): a "grouped" intent with aggregates but no GROUP BY becomes
        # "row_level" here, whereas the branch below would label the same shape
        # "scalar" - confirm which is intended before changing it.
        if not intent.group_by_cols:
            grain = "row_level"
    elif has_agg:
        grain = "grouped" if intent.group_by_cols else "scalar"

    normalized_select = sorted(set(intent.select_cols))
    normalized_orderby = sorted(intent.order_by_cols)

    # Gather every expression that can mention a table, so pruning never removes a table still in use.
    referenced_exprs: list[str] = [*normalized_select, *intent.group_by_cols, *normalized_orderby]
    for f in intent.filters_param:
        referenced_exprs.append(f.column)
        if f.right_column:
            referenced_exprs.append(f.right_column)
    for h in intent.having_param:
        referenced_exprs.append(h.expression)
        # The right-hand aggregate of a HAVING expression comparison may be the only
        # reference to its table; ignoring it could prune a table the query still needs.
        if h.right_expression:
            referenced_exprs.append(h.right_expression)

    tables_used: set[str] = set()
    for expr in referenced_exprs:
        tables_used.update(_extract_tables_from_expr(expr))
    tables_used.discard("")

    normalized_tables = intent.tables
    if tables_used and len(tables_used) < len(intent.tables):
        adj = build_fk_adjacency(schema)
        # Only shrink the table list when the remaining tables can still be joined.
        if is_connected(list(tables_used), adj):
            normalized_tables = sorted(tables_used)
            debug(f"[qsim_ops.normalize_qsim_intent] removed unnecessary tables: {set(intent.tables) - tables_used}")

    # Unqualified GROUP BY columns are attributed to the first normalised table.
    table_prefixed_group_by = [
        f"{normalized_tables[0]}.{col}" if "." not in col and normalized_tables else col
        for col in intent.group_by_cols
    ]

    intent_id_val = compute_intent_id(
        {
            "tables": normalized_tables,
            "grain": grain,
            "select_cols": normalized_select,
            "group_by_cols": table_prefixed_group_by,
            "filters_param": [f.to_dict() for f in intent.filters_param],
            "having_param": [h.to_dict() for h in intent.having_param],
            "distinct": intent.distinct,
        }
    )

    return QSimIntent(
        intent_id=intent_id_val,
        tables=normalized_tables,
        grain=grain,
        select_cols=normalized_select,
        group_by_cols=table_prefixed_group_by,
        order_by_cols=normalized_orderby,
        filters_param=intent.filters_param,
        having_param=intent.having_param,
        param_values=intent.param_values,
        question=intent.question,
        variant_idx=intent.variant_idx,
        limit=intent.limit,
        distinct=intent.distinct,
    )
|
|
1133
|
+
|
|
1134
|
+
|
|
1135
|
+
def generate_all_intents(
    schema: SchemaGraph, column_roles: dict[str, str], num_intents: int | None = None
) -> list[QSimIntent]:
    """Generate a diverse set of ``QSimIntent`` instances with stratified coverage targets.

    Uses adaptive skeleton selection across three table-count strata (three-table, single-table, two-table, processed in that order) with a budget for each.
    Targets minimum counts for filter coverage, HAVING coverage and three-table joins, and caps no-variance skeletons.
    Coverage needs steer skeleton selection while matching skeletons remain; once none are left the needs are relaxed step by step, so a stratum is only abandoned when its pool is genuinely empty.

    Args:

        schema: Schema graph used for skeleton pool construction and LLM filling.
        column_roles: Map of column key to role string.
        num_intents: Total number of intents to generate; defaults to ``QSimConfig.INTENT_TYPES``.

    Returns:

        List of normalised, deduplicated ``QSimIntent`` instances, at most *num_intents* long.
    """
    # Seeds the module-level RNG so skeleton shuffling is reproducible across runs.
    random.seed(QSimConfig.RANDOM_SEED)
    load_or_create_skeletons(schema, column_roles)

    if num_intents is None:
        num_intents = QSimConfig.INTENT_TYPES

    min_with_filters = int(num_intents * QSimConfig.MIN_FILTER_RATIO)
    min_with_having = int(num_intents * QSimConfig.MIN_HAVING_RATIO)
    min_three_table = int(num_intents * QSimConfig.MIN_THREE_TABLE_RATIO)
    max_no_variance = int(num_intents * QSimConfig.MAX_NO_VARIANCE_RATIO)

    debug(
        f"[qsim_ops.generate_all_intents] targeting {num_intents} intents, min_filters={min_with_filters}, min_having={min_with_having}, min_three_table={min_three_table}"
    )

    # Three-table budget is reserved first; the remainder is split between single- and two-table strata.
    budget_three = max(int(num_intents * QSimConfig.THREE_TABLE_RATIO), min_three_table)
    remaining_after_three = num_intents - budget_three
    budget_single = int(
        remaining_after_three
        * QSimConfig.SINGLE_TABLE_RATIO
        / (QSimConfig.SINGLE_TABLE_RATIO + QSimConfig.TWO_TABLE_RATIO)
    )
    budget_two = remaining_after_three - budget_single

    intents: list[QSimIntent] = []
    seen_ids: set[str] = set()
    table_set_usage: dict[str, int] = {}
    no_variance_count = 0

    strata = [
        (3, budget_three, "three-table"),
        (1, budget_single, "single-table"),
        (2, budget_two, "two-table"),
    ]

    for num_tables, stratum_budget, stratum_name in strata:
        if stratum_budget == 0:
            continue

        pool = _build_skeleton_pool(schema, column_roles, num_tables=num_tables)

        consecutive_duplicates = 0
        consecutive_failures = 0
        stratum_intents = 0

        while stratum_intents < stratum_budget:
            if consecutive_duplicates >= QSimConfig.MAX_CONSECUTIVE_DUPLICATES:
                debug(
                    f"[qsim_ops.generate_all_intents] EARLY_EXIT ({stratum_name}): {consecutive_duplicates} consecutive duplicates"
                )
                break

            if consecutive_failures >= QSimConfig.MAX_CONSECUTIVE_FAILURES:
                debug(
                    f"[qsim_ops.generate_all_intents] EARLY_EXIT ({stratum_name}): {consecutive_failures} consecutive failures"
                )
                break

            current_with_filters = sum(1 for done in intents if done.filters_param)
            current_with_having = sum(1 for done in intents if done.having_param)
            need_filters = current_with_filters < min_with_filters
            need_having = current_with_having < min_with_having

            # Try the strict coverage needs first, then relax them one at a time.
            # Without this, a pool lacking skeletons that satisfy every outstanding
            # need would be reported as exhausted while usable skeletons remain.
            # dict.fromkeys removes repeated combinations while keeping their order.
            selection = None
            for want_filters, want_having in dict.fromkeys(
                [
                    (need_filters, need_having),
                    (need_filters, False),
                    (False, need_having),
                    (False, False),
                ]
            ):
                selection = _select_next_skeleton(pool, want_filters, want_having)
                if selection:
                    break

            if not selection:
                debug(f"[qsim_ops.generate_all_intents] EARLY_EXIT ({stratum_name}): skeleton pool exhausted")
                break

            skeleton, table_set = selection

            # The pool cursor has already moved on, so skipping here cannot loop forever.
            if _is_no_variance_skeleton(skeleton) and no_variance_count >= max_no_variance:
                debug(
                    f"[qsim_ops.generate_all_intents] SKIPPING ({stratum_name}): no-variance budget exceeded ({no_variance_count}/{max_no_variance})"
                )
                continue

            intent = _llm_fill_intent(skeleton, schema, column_roles)

            if not intent:
                consecutive_failures += 1
                debug(f"[qsim_ops.generate_all_intents] LLM failed, consecutive_failures={consecutive_failures}")
                continue

            consecutive_failures = 0

            normalized = _normalize_qsim_intent(intent, schema)

            # Normalisation may prune unreferenced tables; such intents no longer count as three-table.
            if num_tables == 3 and len(normalized.tables) < 3:
                debug(
                    f"[qsim_ops.generate_all_intents] SKIPPING ({stratum_name}): normalized to {len(normalized.tables)} tables"
                )
                continue

            if normalized.intent_id in seen_ids:
                consecutive_duplicates += 1
                debug(
                    f"[qsim_ops.generate_all_intents] DUPLICATE: intent_id={normalized.intent_id}, consecutive_duplicates={consecutive_duplicates}"
                )
                continue

            consecutive_duplicates = 0

            if _is_no_variance_skeleton(skeleton):
                no_variance_count += 1

            table_set_key = "|".join(sorted(table_set))
            table_set_usage[table_set_key] = table_set_usage.get(table_set_key, 0) + 1

            intents.append(normalized)
            seen_ids.add(normalized.intent_id)
            stratum_intents += 1
            debug(
                f"[qsim_ops.generate_all_intents] ADDED: intent_id={normalized.intent_id}, tables={table_set}, filters={len(normalized.filters_param)}, having={len(normalized.having_param)}, total={len(intents)}/{num_intents}"
            )

    final_with_filters = sum(1 for done in intents if done.filters_param)
    final_with_having = sum(1 for done in intents if done.having_param)
    single_count = sum(1 for done in intents if len(done.tables) == 1)
    two_count = sum(1 for done in intents if len(done.tables) == 2)
    three_count = sum(1 for done in intents if len(done.tables) >= 3)

    debug(
        f"[qsim_ops.generate_all_intents] generated {len(intents)} intents: single={single_count}, two={two_count}, three={three_count}"
    )
    debug(
        f"[qsim_ops.generate_all_intents] coverage: with_filters={final_with_filters}/{min_with_filters}, with_having={final_with_having}/{min_with_having}, three_table={three_count}/{min_three_table}, no_variance={no_variance_count}/{max_no_variance}"
    )

    # Computed outside the f-string: a replacement field spanning several lines
    # inside a single-quoted f-string only parses on Python 3.12+.
    top_table_sets = dict(sorted(table_set_usage.items(), key=lambda item: item[1], reverse=True)[:10])
    debug(f"[qsim_ops.generate_all_intents] table_set_usage: {top_table_sets}")

    return intents
|