aetherdialect 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aetherdialect-0.1.0.dist-info/METADATA +197 -0
- aetherdialect-0.1.0.dist-info/RECORD +34 -0
- aetherdialect-0.1.0.dist-info/WHEEL +5 -0
- aetherdialect-0.1.0.dist-info/licenses/LICENSE +7 -0
- aetherdialect-0.1.0.dist-info/top_level.txt +1 -0
- text2sql/__init__.py +7 -0
- text2sql/config.py +1063 -0
- text2sql/contracts_base.py +952 -0
- text2sql/contracts_core.py +1890 -0
- text2sql/core_utils.py +834 -0
- text2sql/dialect.py +1134 -0
- text2sql/expansion_ops.py +1218 -0
- text2sql/expansion_rules.py +496 -0
- text2sql/intent_expr.py +1759 -0
- text2sql/intent_process.py +2133 -0
- text2sql/intent_repair.py +1733 -0
- text2sql/intent_resolve.py +1292 -0
- text2sql/live_testing.py +1117 -0
- text2sql/main_execution.py +799 -0
- text2sql/pipeline.py +1662 -0
- text2sql/qsim_ops.py +1286 -0
- text2sql/qsim_sample.py +609 -0
- text2sql/qsim_struct.py +569 -0
- text2sql/schema.py +973 -0
- text2sql/schema_profiling.py +2075 -0
- text2sql/simulator.py +970 -0
- text2sql/sql_gen.py +1537 -0
- text2sql/templates.py +1037 -0
- text2sql/text2sql.py +726 -0
- text2sql/utils.py +973 -0
- text2sql/validation_agg.py +1033 -0
- text2sql/validation_execute.py +1092 -0
- text2sql/validation_schema.py +1847 -0
- text2sql/validation_semantic.py +2122 -0
|
@@ -0,0 +1,1218 @@
|
|
|
1
|
+
"""Fully deterministic expansion operators for synthetic intent generation.
|
|
2
|
+
|
|
3
|
+
Implements A-series (attribute: filters, aggregations, GROUP BY, ORDER BY,
|
|
4
|
+
HAVING), B-series (join: dimension, fact, swap, remove, bridge), C-series
|
|
5
|
+
(gold inclusion), T-series (temporal: EXTRACT, DATE_TRUNC, date_window,
|
|
6
|
+
date_diff), N-series (numeric: ROUND, ABS), and structural operators
|
|
7
|
+
(DISTINCT, LIMIT, OR-groups, expression composition).
|
|
8
|
+
|
|
9
|
+
All operators are purely deterministic — no LLM calls. The top-level
|
|
10
|
+
``expand_gold_intents`` function orchestrates multi-depth expansion with
|
|
11
|
+
SHA-256 dedup across all gold intents.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import copy
|
|
17
|
+
from dataclasses import replace
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from .config import (
|
|
21
|
+
VALID_AGGREGATION_FUNCTIONS,
|
|
22
|
+
VALID_FILTER_OPS,
|
|
23
|
+
VALID_HAVING_OPS,
|
|
24
|
+
SimulatorConfig,
|
|
25
|
+
)
|
|
26
|
+
from .contracts_base import (
|
|
27
|
+
ColumnRole,
|
|
28
|
+
ExpansionMetadata,
|
|
29
|
+
SchemaGraph,
|
|
30
|
+
SchemaLimits,
|
|
31
|
+
TableRole,
|
|
32
|
+
)
|
|
33
|
+
from .contracts_core import (
|
|
34
|
+
FilterParam,
|
|
35
|
+
HavingParam,
|
|
36
|
+
MulGroup,
|
|
37
|
+
NormalizedExpr,
|
|
38
|
+
OrderByCol,
|
|
39
|
+
SelectCol,
|
|
40
|
+
SimulatorIntent,
|
|
41
|
+
)
|
|
42
|
+
from .core_utils import debug, log
|
|
43
|
+
from .intent_resolve import enforce_schema
|
|
44
|
+
from .utils import intent_key
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
# Schema helpers
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
def _get_table_role(schema: SchemaGraph, table: str) -> str | None:
|
|
52
|
+
"""Return table role string from schema."""
|
|
53
|
+
tm = schema.tables.get(table)
|
|
54
|
+
return tm.role if tm else None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _table_from_column_ref(col_ref: str) -> str:
|
|
58
|
+
"""Extract table name from ``table.column`` reference."""
|
|
59
|
+
if not col_ref or "." not in col_ref:
|
|
60
|
+
return ""
|
|
61
|
+
return col_ref.split(".", 1)[0]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _build_column_metadata(
|
|
65
|
+
schema: SchemaGraph,
|
|
66
|
+
) -> dict[str, dict[str, dict[str, Any]]]:
|
|
67
|
+
"""Build nested ``table -> column -> metadata`` dict from schema."""
|
|
68
|
+
result: dict[str, dict[str, dict[str, Any]]] = {}
|
|
69
|
+
for table_name, table_obj in schema.tables.items():
|
|
70
|
+
result[table_name] = {}
|
|
71
|
+
for col_name, col in table_obj.columns.items():
|
|
72
|
+
result[table_name][col_name] = {
|
|
73
|
+
"data_type": col.data_type,
|
|
74
|
+
"role": col.role,
|
|
75
|
+
"nullable": col.null_ratio > 0.0,
|
|
76
|
+
"cardinality": getattr(col, "cardinality", None),
|
|
77
|
+
}
|
|
78
|
+
return result
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _build_fk_map(schema: SchemaGraph) -> dict[str, list[dict[str, str]]]:
|
|
82
|
+
"""Build FK adjacency map: ``source_table -> [{source_column, target_table, target_column}]``."""
|
|
83
|
+
fk_map: dict[str, list[dict[str, str]]] = {}
|
|
84
|
+
for fk in schema.fk_edges:
|
|
85
|
+
source = fk.src_table
|
|
86
|
+
if source not in fk_map:
|
|
87
|
+
fk_map[source] = []
|
|
88
|
+
fk_map[source].append({
|
|
89
|
+
"source_column": fk.src_cols[0] if fk.src_cols else "",
|
|
90
|
+
"target_table": fk.dst_table,
|
|
91
|
+
"target_column": fk.dst_cols[0] if fk.dst_cols else "",
|
|
92
|
+
})
|
|
93
|
+
return fk_map
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _tables_are_connected(
|
|
97
|
+
tables: list[str],
|
|
98
|
+
fk_map: dict[str, list[dict[str, str]]],
|
|
99
|
+
) -> bool:
|
|
100
|
+
"""Return True when all *tables* form a connected component via FKs."""
|
|
101
|
+
if len(tables) <= 1:
|
|
102
|
+
return True
|
|
103
|
+
adjacency: dict[str, set[str]] = {t: set() for t in tables}
|
|
104
|
+
for source, fks in fk_map.items():
|
|
105
|
+
if source not in adjacency:
|
|
106
|
+
continue
|
|
107
|
+
for fk in fks:
|
|
108
|
+
target = fk.get("target_table", "")
|
|
109
|
+
if target in adjacency:
|
|
110
|
+
adjacency[source].add(target)
|
|
111
|
+
adjacency[target].add(source)
|
|
112
|
+
visited: set[str] = set()
|
|
113
|
+
stack = [tables[0]]
|
|
114
|
+
while stack:
|
|
115
|
+
current = stack.pop()
|
|
116
|
+
if current in visited:
|
|
117
|
+
continue
|
|
118
|
+
visited.add(current)
|
|
119
|
+
for neighbor in adjacency.get(current, []):
|
|
120
|
+
if neighbor not in visited:
|
|
121
|
+
stack.append(neighbor)
|
|
122
|
+
return len(visited) == len(tables)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _get_filterable_columns(
|
|
126
|
+
schema: SchemaGraph, table_name: str,
|
|
127
|
+
) -> list[str]:
|
|
128
|
+
"""Return ``table.column`` refs suitable for filtering (CATEGORICAL, TEMPORAL, IDENTIFIER)."""
|
|
129
|
+
if table_name not in schema.tables:
|
|
130
|
+
return []
|
|
131
|
+
table = schema.tables[table_name]
|
|
132
|
+
return [
|
|
133
|
+
f"{table_name}.{c}"
|
|
134
|
+
for c, col in table.columns.items()
|
|
135
|
+
if col.role in (
|
|
136
|
+
ColumnRole.CATEGORICAL.value,
|
|
137
|
+
ColumnRole.TEMPORAL.value,
|
|
138
|
+
ColumnRole.IDENTIFIER.value,
|
|
139
|
+
)
|
|
140
|
+
]
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _get_groupable_columns(
|
|
144
|
+
schema: SchemaGraph, table_name: str,
|
|
145
|
+
) -> list[str]:
|
|
146
|
+
"""Return ``table.column`` refs suitable for GROUP BY."""
|
|
147
|
+
if table_name not in schema.tables:
|
|
148
|
+
return []
|
|
149
|
+
table = schema.tables[table_name]
|
|
150
|
+
return [
|
|
151
|
+
f"{table_name}.{c}"
|
|
152
|
+
for c, col in table.columns.items()
|
|
153
|
+
if col.role in (
|
|
154
|
+
ColumnRole.CATEGORICAL.value,
|
|
155
|
+
ColumnRole.TEMPORAL.value,
|
|
156
|
+
)
|
|
157
|
+
]
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _get_temporal_columns(
|
|
161
|
+
schema: SchemaGraph, table_name: str,
|
|
162
|
+
) -> list[str]:
|
|
163
|
+
"""Return ``table.column`` refs for TEMPORAL columns."""
|
|
164
|
+
if table_name not in schema.tables:
|
|
165
|
+
return []
|
|
166
|
+
table = schema.tables[table_name]
|
|
167
|
+
return [
|
|
168
|
+
f"{table_name}.{c}"
|
|
169
|
+
for c, col in table.columns.items()
|
|
170
|
+
if col.role == ColumnRole.TEMPORAL.value
|
|
171
|
+
]
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _get_numeric_measure_columns(
|
|
175
|
+
schema: SchemaGraph, table_name: str,
|
|
176
|
+
) -> list[str]:
|
|
177
|
+
"""Return ``table.column`` refs for NUMERIC_MEASURE columns."""
|
|
178
|
+
if table_name not in schema.tables:
|
|
179
|
+
return []
|
|
180
|
+
table = schema.tables[table_name]
|
|
181
|
+
return [
|
|
182
|
+
f"{table_name}.{c}"
|
|
183
|
+
for c, col in table.columns.items()
|
|
184
|
+
if col.role == ColumnRole.NUMERIC_MEASURE.value
|
|
185
|
+
]
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _get_dimension_tables(schema: SchemaGraph) -> list[str]:
|
|
189
|
+
"""Return all dimension table names."""
|
|
190
|
+
return [
|
|
191
|
+
t for t, info in schema.tables.items()
|
|
192
|
+
if info.role == TableRole.DIMENSION.value
|
|
193
|
+
]
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _add_expansion_metadata(
    intent: SimulatorIntent, operator: str,
) -> None:
    """Record on *intent*, in place, that *operator* produced it.

    With no prior metadata the intent gets depth 1, an empty parent id and
    a one-element path.  Otherwise the parent id is kept (falling back to
    the intent's own ``intent_id`` when empty), depth grows by one and
    *operator* is appended to the expansion path.
    """
    previous = intent.expansion_metadata
    if previous is None:
        parent_id = ""
        depth = 1
        path = [operator]
    else:
        parent_id = previous.parent_intent_id or intent.intent_id
        depth = (previous.depth or 0) + 1
        path = (previous.expansion_path or []) + [operator]
    intent.expansion_metadata = ExpansionMetadata(
        parent_intent_id=parent_id,
        operator=operator,
        depth=depth,
        expansion_path=path,
    )
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _compatible_data_types(type_a: str, type_b: str) -> bool:
|
|
223
|
+
"""Return True when *type_a* and *type_b* belong to the same broad category."""
|
|
224
|
+
numeric = {
|
|
225
|
+
"integer", "decimal", "float", "numeric",
|
|
226
|
+
"double", "bigint", "smallint", "real",
|
|
227
|
+
}
|
|
228
|
+
text = {"character varying", "varchar", "text", "char", "character"}
|
|
229
|
+
temporal = {
|
|
230
|
+
"date", "timestamp", "timestamp without time zone",
|
|
231
|
+
"timestamp with time zone", "time",
|
|
232
|
+
}
|
|
233
|
+
a, b = type_a.lower(), type_b.lower()
|
|
234
|
+
if a == b:
|
|
235
|
+
return True
|
|
236
|
+
for group in (numeric, text, temporal):
|
|
237
|
+
if a in group and b in group:
|
|
238
|
+
return True
|
|
239
|
+
return False
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
# ---------------------------------------------------------------------------
|
|
243
|
+
# A-series operators (attribute modifications)
|
|
244
|
+
# ---------------------------------------------------------------------------
|
|
245
|
+
|
|
246
|
+
def _a1_add_filter(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """A1: Produce one variant per filterable column that is not yet filtered.

    Each variant is a deep copy of *intent* with an extra ``=`` filter of
    value type ``"string"`` on the column.  Nothing is produced once the
    number of distinct filtered columns reaches
    ``SimulatorConfig.MAX_FILTERS``.  *column_metadata* is not used here.
    """
    filtered_cols = {
        existing.left_expr.primary_column
        for existing in (intent.filters_param or [])
    }
    if len(filtered_cols) >= SimulatorConfig.MAX_FILTERS:
        return []

    variants: list[SimulatorIntent] = []
    for table in intent.tables or []:
        for col_ref in _get_filterable_columns(schema, table):
            if col_ref in filtered_cols:
                continue
            variant = copy.deepcopy(intent)
            extra_filter = FilterParam(
                left_expr=NormalizedExpr.from_column(col_ref),
                op="=",
                value_type="string",
                param_key=f"f_{col_ref.replace('.', '_')}",
            )
            variant.filters_param = [
                *(variant.filters_param or []),
                extra_filter,
            ]
            _add_expansion_metadata(variant, "A1_add_filter")
            variants.append(variant)
    return variants
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _a2_add_expr_filter(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """A2: Add column-vs-column comparisons between same-typed columns.

    Columns of the intent's tables are bucketed by their exact
    ``data_type`` string; every unordered pair inside a bucket is compared
    with ``=``, ``>`` and ``<``, skipping triples already present.  Nothing
    is produced once the number of existing column comparisons reaches
    ``SimulatorConfig.MAX_EXPR_COMPARISONS``.
    """
    known_comparisons = {
        (f.left_expr.primary_column, f.op, f.right_expr.primary_column)
        for f in (intent.filters_param or [])
        if f.right_expr
    }
    if len(known_comparisons) >= SimulatorConfig.MAX_EXPR_COMPARISONS:
        return []

    # data_type -> fully qualified columns, in table/column order.
    columns_by_type: dict[str, list[str]] = {}
    for table in intent.tables or []:
        for col_name, col_info in column_metadata.get(table, {}).items():
            bucket = columns_by_type.setdefault(
                col_info.get("data_type", "unknown"), [],
            )
            bucket.append(f"{table}.{col_name}")

    variants: list[SimulatorIntent] = []
    for same_type_cols in columns_by_type.values():
        if len(same_type_cols) < 2:
            continue
        for pos, left_col in enumerate(same_type_cols):
            for right_col in same_type_cols[pos + 1:]:
                for comparison in ("=", ">", "<"):
                    if (left_col, comparison, right_col) in known_comparisons:
                        continue
                    variant = copy.deepcopy(intent)
                    column_filter = FilterParam(
                        left_expr=NormalizedExpr.from_column(left_col),
                        op=comparison,
                        right_expr=NormalizedExpr.from_column(right_col),
                        value_type="column",
                        param_key="",
                    )
                    variant.filters_param = [
                        *(variant.filters_param or []),
                        column_filter,
                    ]
                    _add_expansion_metadata(variant, "A2_add_expr_filter")
                    variants.append(variant)
    return variants
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def _swap_agg_func(expr: NormalizedExpr, new_agg: str) -> NormalizedExpr:
|
|
328
|
+
"""Return *expr* with its aggregation function swapped to *new_agg*."""
|
|
329
|
+
if expr.agg_func:
|
|
330
|
+
return replace(expr, agg_func=new_agg)
|
|
331
|
+
if expr.add_groups and expr.add_groups[0].agg_func:
|
|
332
|
+
new_group = replace(expr.add_groups[0], agg_func=new_agg)
|
|
333
|
+
return replace(
|
|
334
|
+
expr, add_groups=[new_group] + list(expr.add_groups[1:])
|
|
335
|
+
)
|
|
336
|
+
return NormalizedExpr.from_agg(new_agg, expr.primary_column)
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def _a3_change_aggregation(
|
|
340
|
+
intent: SimulatorIntent, schema: SchemaGraph,
|
|
341
|
+
column_metadata: dict[str, dict[str, dict[str, Any]]],
|
|
342
|
+
) -> list[SimulatorIntent]:
|
|
343
|
+
"""A3: Swap aggregation function on each aggregated select column."""
|
|
344
|
+
results: list[SimulatorIntent] = []
|
|
345
|
+
alt_aggs = ["count", "sum", "avg", "min", "max"]
|
|
346
|
+
|
|
347
|
+
for sc in intent.select_cols or []:
|
|
348
|
+
if not sc.is_aggregated:
|
|
349
|
+
continue
|
|
350
|
+
sc_col = sc.expr.primary_column
|
|
351
|
+
sc_term = sc.expr.primary_term
|
|
352
|
+
for new_agg in alt_aggs:
|
|
353
|
+
new_term = f"{new_agg}({sc_col})"
|
|
354
|
+
if new_term.lower() == sc_term.lower():
|
|
355
|
+
continue
|
|
356
|
+
new_intent = copy.deepcopy(intent)
|
|
357
|
+
for i, s in enumerate(new_intent.select_cols or []):
|
|
358
|
+
if (
|
|
359
|
+
s.expr.primary_column == sc_col
|
|
360
|
+
and s.expr.primary_term == sc_term
|
|
361
|
+
):
|
|
362
|
+
new_expr = _swap_agg_func(s.expr, new_agg)
|
|
363
|
+
new_intent.select_cols[i] = SelectCol(expr=new_expr)
|
|
364
|
+
break
|
|
365
|
+
_add_expansion_metadata(new_intent, "A3_change_aggregation")
|
|
366
|
+
results.append(new_intent)
|
|
367
|
+
return results
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _a4_add_groupby(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """A4: Produce one variant per groupable column not yet grouped on.

    The new column is appended to the intent's GROUP BY list, which is
    then ordered by ``signature_key``; a ``row_level`` grain becomes
    ``grouped``.  Nothing is produced once the number of distinct grouped
    columns reaches ``SimulatorConfig.MAX_GROUPBY``.
    """
    grouped_cols = {g.primary_column for g in (intent.group_by_cols or [])}
    if len(grouped_cols) >= SimulatorConfig.MAX_GROUPBY:
        return []

    variants: list[SimulatorIntent] = []
    for table in intent.tables or []:
        for col_ref in _get_groupable_columns(schema, table):
            if col_ref in grouped_cols:
                continue
            variant = copy.deepcopy(intent)
            extended = list(intent.group_by_cols or [])
            extended.append(NormalizedExpr.from_column(col_ref))
            extended.sort(key=lambda g: g.signature_key)
            variant.group_by_cols = extended
            if variant.grain == "row_level":
                variant.grain = "grouped"
            _add_expansion_metadata(variant, "A4_add_groupby")
            variants.append(variant)
    return variants
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def _a5_add_orderby(
|
|
398
|
+
intent: SimulatorIntent, schema: SchemaGraph,
|
|
399
|
+
column_metadata: dict[str, dict[str, dict[str, Any]]],
|
|
400
|
+
) -> list[SimulatorIntent]:
|
|
401
|
+
"""A5: Add ORDER BY for each select/groupby column in ASC and DESC."""
|
|
402
|
+
current_ob = {o.expr.primary_column for o in (intent.order_by_cols or [])}
|
|
403
|
+
candidates = [g.primary_column for g in (intent.group_by_cols or [])]
|
|
404
|
+
for sc in intent.select_cols or []:
|
|
405
|
+
if sc.expr.primary_column not in candidates:
|
|
406
|
+
candidates.append(sc.expr.primary_column)
|
|
407
|
+
|
|
408
|
+
results: list[SimulatorIntent] = []
|
|
409
|
+
for col in candidates:
|
|
410
|
+
if col in current_ob:
|
|
411
|
+
continue
|
|
412
|
+
for direction in ["ASC", "DESC"]:
|
|
413
|
+
new_intent = copy.deepcopy(intent)
|
|
414
|
+
new_order = OrderByCol(
|
|
415
|
+
expr=NormalizedExpr.from_column(col),
|
|
416
|
+
direction=direction,
|
|
417
|
+
)
|
|
418
|
+
new_intent.order_by_cols = list(
|
|
419
|
+
new_intent.order_by_cols or []
|
|
420
|
+
) + [new_order]
|
|
421
|
+
_add_expansion_metadata(new_intent, "A5_add_orderby")
|
|
422
|
+
results.append(new_intent)
|
|
423
|
+
return results
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def _a6_add_having_value(
|
|
427
|
+
intent: SimulatorIntent, schema: SchemaGraph,
|
|
428
|
+
column_metadata: dict[str, dict[str, dict[str, Any]]],
|
|
429
|
+
) -> list[SimulatorIntent]:
|
|
430
|
+
"""A6: Add HAVING with value threshold for grouped intents."""
|
|
431
|
+
if intent.grain != "grouped" or not intent.group_by_cols:
|
|
432
|
+
return []
|
|
433
|
+
existing = {
|
|
434
|
+
(h.left_expr.primary_term, h.op)
|
|
435
|
+
for h in (intent.having_param or [])
|
|
436
|
+
}
|
|
437
|
+
results: list[SimulatorIntent] = []
|
|
438
|
+
for agg_func in ["count", "sum", "avg"]:
|
|
439
|
+
for op in [">", "<", ">=", "<="]:
|
|
440
|
+
left_agg = f"{agg_func}(*)"
|
|
441
|
+
if (left_agg, op) in existing:
|
|
442
|
+
continue
|
|
443
|
+
new_intent = copy.deepcopy(intent)
|
|
444
|
+
new_having = HavingParam(
|
|
445
|
+
left_expr=NormalizedExpr.from_agg(agg_func, "*"),
|
|
446
|
+
op=op,
|
|
447
|
+
value_type="number",
|
|
448
|
+
param_key=f"h_{agg_func}_{op.replace('<', 'lt').replace('>', 'gt').replace('=', 'e')}",
|
|
449
|
+
)
|
|
450
|
+
new_intent.having_param = list(
|
|
451
|
+
new_intent.having_param or []
|
|
452
|
+
) + [new_having]
|
|
453
|
+
_add_expansion_metadata(new_intent, "A6_add_having_value")
|
|
454
|
+
results.append(new_intent)
|
|
455
|
+
return results
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def _a7_add_having_expr(
|
|
459
|
+
intent: SimulatorIntent, schema: SchemaGraph,
|
|
460
|
+
column_metadata: dict[str, dict[str, dict[str, Any]]],
|
|
461
|
+
) -> list[SimulatorIntent]:
|
|
462
|
+
"""A7: Add HAVING agg-vs-agg comparison for grouped intents."""
|
|
463
|
+
if intent.grain != "grouped" or not intent.group_by_cols:
|
|
464
|
+
return []
|
|
465
|
+
existing = {
|
|
466
|
+
(h.left_expr.primary_term, h.op)
|
|
467
|
+
for h in (intent.having_param or [])
|
|
468
|
+
}
|
|
469
|
+
agg_cols = [
|
|
470
|
+
sc.expr.primary_column
|
|
471
|
+
for sc in (intent.select_cols or []) if sc.is_aggregated
|
|
472
|
+
]
|
|
473
|
+
target_col = agg_cols[0] if agg_cols else "*"
|
|
474
|
+
|
|
475
|
+
agg_pairs = [("count", "avg"), ("sum", "count"), ("avg", "min")]
|
|
476
|
+
results: list[SimulatorIntent] = []
|
|
477
|
+
for left_agg, right_agg in agg_pairs:
|
|
478
|
+
left_term = f"{left_agg}({target_col})"
|
|
479
|
+
if (left_term, ">") in existing:
|
|
480
|
+
continue
|
|
481
|
+
new_intent = copy.deepcopy(intent)
|
|
482
|
+
new_having = HavingParam(
|
|
483
|
+
left_expr=NormalizedExpr.from_agg(left_agg, target_col),
|
|
484
|
+
op=">",
|
|
485
|
+
right_expr=NormalizedExpr.from_agg(right_agg, target_col),
|
|
486
|
+
value_type="expression",
|
|
487
|
+
param_key="",
|
|
488
|
+
)
|
|
489
|
+
new_intent.having_param = list(
|
|
490
|
+
new_intent.having_param or []
|
|
491
|
+
) + [new_having]
|
|
492
|
+
_add_expansion_metadata(new_intent, "A7_add_having_expr")
|
|
493
|
+
results.append(new_intent)
|
|
494
|
+
return results
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def _a8_remove_filter(
|
|
498
|
+
intent: SimulatorIntent, schema: SchemaGraph,
|
|
499
|
+
column_metadata: dict[str, dict[str, dict[str, Any]]],
|
|
500
|
+
) -> list[SimulatorIntent]:
|
|
501
|
+
"""A8: Remove each filter one at a time."""
|
|
502
|
+
current = intent.filters_param or []
|
|
503
|
+
if not current:
|
|
504
|
+
return []
|
|
505
|
+
results: list[SimulatorIntent] = []
|
|
506
|
+
for i in range(len(current)):
|
|
507
|
+
new_intent = copy.deepcopy(intent)
|
|
508
|
+
new_intent.filters_param = current[:i] + current[i + 1:]
|
|
509
|
+
_add_expansion_metadata(new_intent, "A8_remove_filter")
|
|
510
|
+
results.append(new_intent)
|
|
511
|
+
return results
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
def _a9_remove_groupby(
|
|
515
|
+
intent: SimulatorIntent, schema: SchemaGraph,
|
|
516
|
+
column_metadata: dict[str, dict[str, dict[str, Any]]],
|
|
517
|
+
) -> list[SimulatorIntent]:
|
|
518
|
+
"""A9: Remove each GROUP BY column one at a time (skip if single)."""
|
|
519
|
+
current = list(intent.group_by_cols or [])
|
|
520
|
+
if len(current) <= 1:
|
|
521
|
+
return []
|
|
522
|
+
results: list[SimulatorIntent] = []
|
|
523
|
+
for gb in current:
|
|
524
|
+
new_intent = copy.deepcopy(intent)
|
|
525
|
+
new_intent.group_by_cols = [
|
|
526
|
+
g for g in current if g.primary_column != gb.primary_column
|
|
527
|
+
]
|
|
528
|
+
_add_expansion_metadata(new_intent, "A9_remove_groupby")
|
|
529
|
+
results.append(new_intent)
|
|
530
|
+
return results
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
def _a10_remove_having(
|
|
534
|
+
intent: SimulatorIntent, schema: SchemaGraph,
|
|
535
|
+
column_metadata: dict[str, dict[str, dict[str, Any]]],
|
|
536
|
+
) -> list[SimulatorIntent]:
|
|
537
|
+
"""A10: Remove each HAVING condition one at a time."""
|
|
538
|
+
current = intent.having_param or []
|
|
539
|
+
if not current:
|
|
540
|
+
return []
|
|
541
|
+
results: list[SimulatorIntent] = []
|
|
542
|
+
for i in range(len(current)):
|
|
543
|
+
new_intent = copy.deepcopy(intent)
|
|
544
|
+
new_intent.having_param = current[:i] + current[i + 1:]
|
|
545
|
+
_add_expansion_metadata(new_intent, "A10_remove_having")
|
|
546
|
+
results.append(new_intent)
|
|
547
|
+
return results
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
# ---------------------------------------------------------------------------
|
|
551
|
+
# B-series operators (join modifications)
|
|
552
|
+
# ---------------------------------------------------------------------------
|
|
553
|
+
|
|
554
|
+
def _b1_add_dimension_join(
    intent: SimulatorIntent, schema: SchemaGraph,
    fk_map: dict[str, list[dict[str, str]]],
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """B1: Add each FK-connected dimension table not already present.

    Current tables are visited in sorted order so the output order does
    not depend on set iteration order, and each target dimension is added
    at most once even when several FKs point at it.

    Args:
        intent: Source intent; it is never modified.
        schema: Schema used to look up table roles.
        fk_map: ``source_table -> [{target_table, ...}]`` adjacency map.
        column_metadata: Unused; kept for the uniform operator signature.

    Returns:
        One deep-copied variant per new dimension table, with ``tables``
        sorted.  Empty when the intent already has
        ``SimulatorConfig.MAX_TABLES`` tables.
    """
    current = set(intent.tables or [])
    if len(current) >= SimulatorConfig.MAX_TABLES:
        return []
    results: list[SimulatorIntent] = []
    seen_targets: set[str] = set()
    for table in sorted(current):
        for fk in fk_map.get(table, []):
            target = fk.get("target_table")
            if not target or target in current or target in seen_targets:
                continue
            # Tables with no recorded role are treated as fact tables.
            target_role = _get_table_role(schema, target) or TableRole.FACT.value
            if target_role != TableRole.DIMENSION.value:
                continue
            new_tables = sorted(current | {target})
            if not _tables_are_connected(new_tables, fk_map):
                continue
            seen_targets.add(target)
            new_intent = copy.deepcopy(intent)
            new_intent.tables = new_tables
            _add_expansion_metadata(new_intent, "B1_add_dimension_join")
            results.append(new_intent)
    return results
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
def _b2_add_fact_join(
    intent: SimulatorIntent, schema: SchemaGraph,
    fk_map: dict[str, list[dict[str, str]]],
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """B2: Add each FK-connected fact table not already present.

    For every current table, fact tables are found in both directions:
    those the table references, and those that reference the table.  Each
    fact table is added at most once.  Current tables are visited in
    sorted order so the output order does not depend on set iteration
    order.

    Args:
        intent: Source intent; it is never modified.
        schema: Schema used to look up table roles.
        fk_map: ``source_table -> [{target_table, ...}]`` adjacency map.
        column_metadata: Unused; kept for the uniform operator signature.

    Returns:
        One deep-copied variant per new fact table, with ``tables``
        sorted.  Empty when the intent already has
        ``SimulatorConfig.MAX_TABLES`` tables.
    """
    current = set(intent.tables or [])
    if len(current) >= SimulatorConfig.MAX_TABLES:
        return []
    results: list[SimulatorIntent] = []
    seen_targets: set[str] = set()
    fact_role = TableRole.FACT.value

    def _emit(added_table: str) -> bool:
        """Append a variant joining *added_table*; False if not connected."""
        new_tables = sorted(current | {added_table})
        if not _tables_are_connected(new_tables, fk_map):
            return False
        seen_targets.add(added_table)
        new_intent = copy.deepcopy(intent)
        new_intent.tables = new_tables
        _add_expansion_metadata(new_intent, "B2_add_fact_join")
        results.append(new_intent)
        return True

    for table in sorted(current):
        # Forward direction: fact tables that `table` references.
        for fk in fk_map.get(table, []):
            target = fk.get("target_table")
            if not target or target in current or target in seen_targets:
                continue
            # Tables with no recorded role are treated as fact tables.
            if (_get_table_role(schema, target) or fact_role) != fact_role:
                continue
            _emit(target)

        # Reverse direction: fact tables holding an FK that points at `table`.
        for other_table, other_fks in fk_map.items():
            if other_table in current or other_table in seen_targets:
                continue
            if (_get_table_role(schema, other_table) or fact_role) != fact_role:
                continue
            for ofk in other_fks:
                if ofk.get("target_table") == table and _emit(other_table):
                    break
    return results
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
def _b3_swap_dimension(
|
|
636
|
+
intent: SimulatorIntent, schema: SchemaGraph,
|
|
637
|
+
fk_map: dict[str, list[dict[str, str]]],
|
|
638
|
+
column_metadata: dict[str, dict[str, dict[str, Any]]],
|
|
639
|
+
) -> list[SimulatorIntent]:
|
|
640
|
+
"""B3: Swap each dimension for an alternative FK-connected dimension."""
|
|
641
|
+
current = list(intent.tables or [])
|
|
642
|
+
results: list[SimulatorIntent] = []
|
|
643
|
+
for i, table in enumerate(current):
|
|
644
|
+
if (
|
|
645
|
+
_get_table_role(schema, table) or TableRole.FACT.value
|
|
646
|
+
) != TableRole.DIMENSION.value:
|
|
647
|
+
continue
|
|
648
|
+
fact_tables = [
|
|
649
|
+
t for t in current
|
|
650
|
+
if (_get_table_role(schema, t) or TableRole.FACT.value)
|
|
651
|
+
== TableRole.FACT.value
|
|
652
|
+
]
|
|
653
|
+
if not fact_tables:
|
|
654
|
+
continue
|
|
655
|
+
for dim in _get_dimension_tables(schema):
|
|
656
|
+
if dim == table or dim in current:
|
|
657
|
+
continue
|
|
658
|
+
can_join = any(
|
|
659
|
+
fk.get("target_table") == dim
|
|
660
|
+
for fact in fact_tables
|
|
661
|
+
for fk in fk_map.get(fact, [])
|
|
662
|
+
)
|
|
663
|
+
if not can_join:
|
|
664
|
+
continue
|
|
665
|
+
new_tables = current[:i] + [dim] + current[i + 1:]
|
|
666
|
+
new_intent = copy.deepcopy(intent)
|
|
667
|
+
new_intent.tables = sorted(new_tables)
|
|
668
|
+
_add_expansion_metadata(new_intent, "B3_swap_dimension")
|
|
669
|
+
results.append(new_intent)
|
|
670
|
+
return results
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
def _b4_remove_table(
|
|
674
|
+
intent: SimulatorIntent, schema: SchemaGraph,
|
|
675
|
+
fk_map: dict[str, list[dict[str, str]]],
|
|
676
|
+
column_metadata: dict[str, dict[str, dict[str, Any]]],
|
|
677
|
+
) -> list[SimulatorIntent]:
|
|
678
|
+
"""B4: Remove each removable dimension table, pruning dependent clauses."""
|
|
679
|
+
current = list(intent.tables or [])
|
|
680
|
+
if len(current) <= 1:
|
|
681
|
+
return []
|
|
682
|
+
results: list[SimulatorIntent] = []
|
|
683
|
+
for i, table in enumerate(current):
|
|
684
|
+
if (
|
|
685
|
+
_get_table_role(schema, table) or TableRole.FACT.value
|
|
686
|
+
) != TableRole.DIMENSION.value:
|
|
687
|
+
continue
|
|
688
|
+
new_tables = current[:i] + current[i + 1:]
|
|
689
|
+
if not new_tables:
|
|
690
|
+
continue
|
|
691
|
+
if not _tables_are_connected(new_tables, fk_map):
|
|
692
|
+
continue
|
|
693
|
+
new_intent = copy.deepcopy(intent)
|
|
694
|
+
new_intent.tables = sorted(new_tables)
|
|
695
|
+
ts = set(new_tables)
|
|
696
|
+
new_intent.filters_param = [
|
|
697
|
+
f for f in (new_intent.filters_param or [])
|
|
698
|
+
if _table_from_column_ref(f.left_expr.primary_column) in ts
|
|
699
|
+
]
|
|
700
|
+
new_intent.group_by_cols = [
|
|
701
|
+
c for c in (new_intent.group_by_cols or [])
|
|
702
|
+
if _table_from_column_ref(c.primary_column) in ts
|
|
703
|
+
]
|
|
704
|
+
new_intent.order_by_cols = [
|
|
705
|
+
o for o in (new_intent.order_by_cols or [])
|
|
706
|
+
if _table_from_column_ref(o.expr.primary_column) in ts
|
|
707
|
+
]
|
|
708
|
+
new_intent.select_cols = [
|
|
709
|
+
sc for sc in (new_intent.select_cols or [])
|
|
710
|
+
if _table_from_column_ref(sc.expr.primary_column) in ts
|
|
711
|
+
]
|
|
712
|
+
if not new_intent.select_cols:
|
|
713
|
+
continue
|
|
714
|
+
_add_expansion_metadata(new_intent, "B4_remove_table")
|
|
715
|
+
results.append(new_intent)
|
|
716
|
+
return results
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
def _b5_bridge_via_intermediate(
    intent: SimulatorIntent, schema: SchemaGraph,
    fk_map: dict[str, list[dict[str, str]]],
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """B5: Add every bridge table that links two or more current tables.

    A candidate must have the BRIDGE role, must not already be in the
    intent, and must hold FKs targeting at least two distinct current
    tables.  Nothing is produced once the intent has
    ``SimulatorConfig.MAX_TABLES`` tables.  The variant's table list is
    sorted.
    """
    present = set(intent.tables or [])
    if len(present) >= SimulatorConfig.MAX_TABLES:
        return []

    variants: list[SimulatorIntent] = []
    for candidate in schema.tables:
        if candidate in present:
            continue
        role = _get_table_role(schema, candidate) or TableRole.FACT.value
        if role != TableRole.BRIDGE.value:
            continue

        # Distinct current tables this candidate points at.
        linked = set()
        for fk in fk_map.get(candidate, []):
            target = fk.get("target_table")
            if target in present:
                linked.add(target)
        if len(linked) < 2:
            continue

        widened = list(present | {candidate})
        if not _tables_are_connected(widened, fk_map):
            continue
        variant = copy.deepcopy(intent)
        variant.tables = sorted(widened)
        _add_expansion_metadata(variant, "B5_bridge_via_intermediate")
        variants.append(variant)
    return variants
|
|
753
|
+
|
|
754
|
+
|
|
755
|
+
# ---------------------------------------------------------------------------
|
|
756
|
+
# C-series (gold inclusion)
|
|
757
|
+
# ---------------------------------------------------------------------------
|
|
758
|
+
|
|
759
|
+
def _c1_include_gold(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """C1: Emit an unmodified copy of the gold intent, tagged as C1.

    Args:
        intent: Gold intent; it is deep-copied so the caller's object is
            left untouched.
        schema: Unused here; kept for a uniform operator signature.
        column_metadata: Unused here; kept for a uniform operator signature.

    Returns:
        A single-element list holding the tagged copy.
    """
    clone = copy.deepcopy(intent)
    _add_expansion_metadata(clone, "C1_include_gold")
    return [clone]
|
|
767
|
+
|
|
768
|
+
|
|
769
|
+
# ---------------------------------------------------------------------------
|
|
770
|
+
# T-series (temporal scalar expansions)
|
|
771
|
+
# ---------------------------------------------------------------------------
|
|
772
|
+
|
|
773
|
+
def _t1_extract_select_groupby(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """T1: Add EXTRACT(unit) of a temporal column to SELECT and GROUP BY.

    One variant is produced for every combination of intent table, temporal
    column of that table, and unit in
    ``SimulatorConfig.EXTRACT_EXPANSION_UNITS``.

    Args:
        intent: Intent to expand; it is deep-copied, never mutated.
        schema: Schema graph used to look up temporal columns.
        column_metadata: Unused here; kept for a uniform operator signature.

    Returns:
        The generated variants.  A variant whose grain was ``"row_level"``
        is switched to ``"grouped"``.
    """
    variants: list[SimulatorIntent] = []
    for table in intent.tables or []:
        for col in _get_temporal_columns(schema, table):
            for unit in SimulatorConfig.EXTRACT_EXPANSION_UNITS:
                wrapped = replace(
                    NormalizedExpr.from_column(col),
                    scalar_func="extract",
                    scalar_func_args=[unit],
                )
                variant = copy.deepcopy(intent)
                variant.select_cols = [
                    *(variant.select_cols or []),
                    SelectCol(expr=wrapped),
                ]
                # GROUP BY is kept ordered by signature key.
                grouping = [*(variant.group_by_cols or []), wrapped]
                grouping.sort(key=lambda g: g.signature_key)
                variant.group_by_cols = grouping
                if variant.grain == "row_level":
                    variant.grain = "grouped"
                _add_expansion_metadata(variant, "T1_extract_select_groupby")
                variants.append(variant)
    return variants
|
|
804
|
+
|
|
805
|
+
|
|
806
|
+
def _t2_date_trunc_groupby(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """T2: Add DATE_TRUNC(unit) of a temporal column to SELECT and GROUP BY.

    One variant is produced for every combination of intent table, temporal
    column of that table, and unit in
    ``SimulatorConfig.DATE_TRUNC_EXPANSION_UNITS``.

    Args:
        intent: Intent to expand; it is deep-copied, never mutated.
        schema: Schema graph used to look up temporal columns.
        column_metadata: Unused here; kept for a uniform operator signature.

    Returns:
        The generated variants.  A variant whose grain was ``"row_level"``
        is switched to ``"grouped"``.
    """
    def _signature(expr):
        return expr.signature_key

    variants: list[SimulatorIntent] = []
    for table in intent.tables or []:
        for col in _get_temporal_columns(schema, table):
            for unit in SimulatorConfig.DATE_TRUNC_EXPANSION_UNITS:
                variant = copy.deepcopy(intent)
                base_expr = NormalizedExpr.from_column(col)
                truncated = replace(
                    base_expr,
                    scalar_func="date_trunc",
                    scalar_func_args=[unit],
                )

                selected = list(variant.select_cols or [])
                selected.append(SelectCol(expr=truncated))
                variant.select_cols = selected

                grouped = list(variant.group_by_cols or [])
                grouped.append(truncated)
                variant.group_by_cols = sorted(grouped, key=_signature)

                if variant.grain == "row_level":
                    variant.grain = "grouped"
                _add_expansion_metadata(variant, "T2_date_trunc_groupby")
                variants.append(variant)
    return variants
|
|
837
|
+
|
|
838
|
+
|
|
839
|
+
def _t3_date_window_filter(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """T3: Add a ``date_window`` filter on temporal columns from presets.

    For each temporal column of each intent table that is not already the
    primary column of an existing filter, one variant is produced per entry
    in ``SimulatorConfig.DATE_WINDOW_EXPANSION_PRESETS``.  The new filter
    uses ``op=">="`` and carries a copy of the preset as ``raw_value``.

    Args:
        intent: Intent to expand; it is deep-copied, never mutated.
        schema: Schema graph used to look up temporal columns.
        column_metadata: Unused here; kept for a uniform operator signature.

    Returns:
        The generated variants.  Empty when the intent already holds
        ``SimulatorConfig.MAX_FILTERS`` filters or more.
    """
    existing_filters = intent.filters_param or []
    # The cap applies to the number of filters.  Counting distinct filtered
    # columns instead would undercount intents that filter one column more
    # than once and let them grow past MAX_FILTERS.
    if len(existing_filters) >= SimulatorConfig.MAX_FILTERS:
        return []
    filtered_cols = {f.left_expr.primary_column for f in existing_filters}

    results: list[SimulatorIntent] = []
    for table in intent.tables or []:
        for col in _get_temporal_columns(schema, table):
            if col in filtered_cols:
                continue
            for preset in SimulatorConfig.DATE_WINDOW_EXPANSION_PRESETS:
                new_intent = copy.deepcopy(intent)
                new_filter = FilterParam(
                    left_expr=NormalizedExpr.from_column(col),
                    op=">=",
                    value_type="date_window",
                    param_key="",
                    # Copy so variants never share the config preset dict.
                    raw_value=dict(preset),
                )
                new_intent.filters_param = list(
                    new_intent.filters_param or []
                ) + [new_filter]
                _add_expansion_metadata(
                    new_intent, "T3_date_window_filter",
                )
                results.append(new_intent)
    return results
|
|
871
|
+
|
|
872
|
+
|
|
873
|
+
def _t4_date_diff_filter(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """T4: Add a ``date_diff`` filter on temporal columns from presets.

    For each temporal column of each intent table that is not already the
    primary column of an existing filter, one variant is produced per entry
    in ``SimulatorConfig.DATE_DIFF_EXPANSION_PRESETS``.  The new filter uses
    ``op="<="`` and carries a copy of the preset as ``raw_value``.

    Args:
        intent: Intent to expand; it is deep-copied, never mutated.
        schema: Schema graph used to look up temporal columns.
        column_metadata: Unused here; kept for a uniform operator signature.

    Returns:
        The generated variants.  Empty when the intent already holds
        ``SimulatorConfig.MAX_FILTERS`` filters or more.
    """
    existing_filters = intent.filters_param or []
    # The cap applies to the number of filters.  Counting distinct filtered
    # columns instead would undercount intents that filter one column more
    # than once and let them grow past MAX_FILTERS.
    if len(existing_filters) >= SimulatorConfig.MAX_FILTERS:
        return []
    filtered_cols = {f.left_expr.primary_column for f in existing_filters}

    results: list[SimulatorIntent] = []
    for table in intent.tables or []:
        for col in _get_temporal_columns(schema, table):
            if col in filtered_cols:
                continue
            for preset in SimulatorConfig.DATE_DIFF_EXPANSION_PRESETS:
                new_intent = copy.deepcopy(intent)
                new_filter = FilterParam(
                    left_expr=NormalizedExpr.from_column(col),
                    op="<=",
                    value_type="date_diff",
                    param_key="",
                    # Copy so variants never share the config preset dict.
                    raw_value=dict(preset),
                )
                new_intent.filters_param = list(
                    new_intent.filters_param or []
                ) + [new_filter]
                _add_expansion_metadata(
                    new_intent, "T4_date_diff_filter",
                )
                results.append(new_intent)
    return results
|
|
905
|
+
|
|
906
|
+
|
|
907
|
+
# ---------------------------------------------------------------------------
|
|
908
|
+
# N-series (numeric scalar expansions)
|
|
909
|
+
# ---------------------------------------------------------------------------
|
|
910
|
+
|
|
911
|
+
def _n1_round_numeric(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """N1: Wrap numeric-measure SELECT columns in ROUND(..., 0).

    Args:
        intent: Intent to expand; it is deep-copied, never mutated.
        schema: Schema graph used to collect the numeric-measure columns of
            every intent table.
        column_metadata: Unused here; kept for a uniform operator signature.

    Returns:
        One variant per SELECT column whose primary column is a numeric
        measure and whose scalar function is not already ``"round"``.
    """
    measure_cols: set[str] = set()
    for table in intent.tables or []:
        measure_cols.update(_get_numeric_measure_columns(schema, table))

    variants: list[SimulatorIntent] = []
    for pos, selected in enumerate(intent.select_cols or []):
        expr = selected.expr
        if expr.primary_column not in measure_cols or expr.scalar_func == "round":
            continue

        variant = copy.deepcopy(intent)
        rounded = replace(
            variant.select_cols[pos].expr,
            scalar_func="round",
            scalar_func_args=[0],
        )
        # NOTE(review): the SelectCol is rebuilt from ``expr`` alone, so any
        # other fields it may carry are reset to their defaults - confirm
        # that is intended.
        variant.select_cols[pos] = SelectCol(expr=rounded)
        _add_expansion_metadata(variant, "N1_round_numeric")
        variants.append(variant)
    return variants
|
|
936
|
+
|
|
937
|
+
|
|
938
|
+
def _n2_abs_filter(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """N2: Wrap the left side of numeric range filters in ABS.

    A filter qualifies when its operator is one of ``>``, ``<``, ``>=``,
    ``<=``, its left expression is not already wrapped in ``"abs"``, and
    *column_metadata* records the NUMERIC_MEASURE role for its primary
    column.

    Args:
        intent: Intent to expand; it is deep-copied, never mutated.
        schema: Unused here; kept for a uniform operator signature.
        column_metadata: Per-table, per-column metadata; only the ``"role"``
            entry is read.

    Returns:
        One variant per qualifying filter.
    """
    range_ops = (">", "<", ">=", "<=")
    variants: list[SimulatorIntent] = []
    for pos, flt in enumerate(intent.filters_param or []):
        lhs = flt.left_expr
        if flt.op not in range_ops or lhs.scalar_func == "abs":
            continue

        column = lhs.primary_column
        owner = _table_from_column_ref(column)
        if not owner or owner not in column_metadata:
            continue

        # Metadata is keyed by the bare column name, without table prefix.
        bare = column.partition(".")[2] if "." in column else column
        info = column_metadata.get(owner, {}).get(bare, {})
        if info.get("role") != ColumnRole.NUMERIC_MEASURE.value:
            continue

        variant = copy.deepcopy(intent)
        target = variant.filters_param[pos]
        # NOTE(review): only ``scalar_func`` is overridden; any existing
        # ``scalar_func_args`` on the expression are kept - confirm.
        wrapped = replace(target.left_expr, scalar_func="abs")
        variant.filters_param[pos] = replace(target, left_expr=wrapped)
        _add_expansion_metadata(variant, "N2_abs_filter")
        variants.append(variant)
    return variants
|
|
968
|
+
|
|
969
|
+
|
|
970
|
+
# ---------------------------------------------------------------------------
|
|
971
|
+
# Structural operators (DISTINCT, LIMIT, OR-groups, expression composition)
|
|
972
|
+
# ---------------------------------------------------------------------------
|
|
973
|
+
|
|
974
|
+
def _d1_add_distinct(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """D1: Produce a copy of the intent with ``distinct`` switched on.

    Args:
        intent: Intent to expand; it is deep-copied, never mutated.
        schema: Unused here; kept for a uniform operator signature.
        column_metadata: Unused here; kept for a uniform operator signature.

    Returns:
        A single-element list with the DISTINCT variant.  Empty when the
        intent is already distinct or has no ``distinct`` attribute at all.
    """
    # An intent without the attribute cannot be made distinct.  Emitting a
    # clone anyway would yield an unchanged intent labelled D1_add_distinct.
    if not hasattr(intent, "distinct") or intent.distinct:
        return []
    new_intent = copy.deepcopy(intent)
    new_intent.distinct = True
    _add_expansion_metadata(new_intent, "D1_add_distinct")
    return [new_intent]
|
|
986
|
+
|
|
987
|
+
|
|
988
|
+
def _l1_add_limit(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """L1: Attach a LIMIT drawn from the configured expansion values.

    Args:
        intent: Intent to expand; it is deep-copied, never mutated.
        schema: Unused here; kept for a uniform operator signature.
        column_metadata: Unused here; kept for a uniform operator signature.

    Returns:
        One variant per value in ``SimulatorConfig.LIMIT_EXPANSION_VALUES``.
        Empty when the intent already has a limit.
    """
    if intent.limit is not None:
        return []

    def _with_limit(value):
        limited = copy.deepcopy(intent)
        limited.limit = value
        _add_expansion_metadata(limited, "L1_add_limit")
        return limited

    return [
        _with_limit(value)
        for value in SimulatorConfig.LIMIT_EXPANSION_VALUES
    ]
|
|
1002
|
+
|
|
1003
|
+
|
|
1004
|
+
def _f1_or_filter_group(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """F1: Turn each eligible pair of filters into a two-member OR group.

    A pair is skipped when either filter has a ``right_expr`` or a
    ``value_type`` of ``"date_window"`` / ``"date_diff"``.

    Args:
        intent: Intent to expand; it is deep-copied, never mutated.
        schema: Unused here; kept for a uniform operator signature.
        column_metadata: Unused here; kept for a uniform operator signature.

    Returns:
        One variant per eligible pair, with both filters set to
        ``bool_op="OR"`` and ``filter_group=1``.  Empty when the intent has
        fewer than two filters.
    """
    filters = intent.filters_param or []
    if len(filters) < 2:
        return []

    temporal_kinds = ("date_window", "date_diff")
    # NOTE(review): the group id is always 1, including for intents whose
    # filters may already carry a filter_group - confirm this cannot merge
    # unrelated groups on deeper expansion passes.
    group_id = 1

    variants: list[SimulatorIntent] = []
    for i, first in enumerate(filters):
        for j in range(i + 1, len(filters)):
            second = filters[j]
            if first.right_expr or second.right_expr:
                continue
            if (
                first.value_type in temporal_kinds
                or second.value_type in temporal_kinds
            ):
                continue

            variant = copy.deepcopy(intent)
            for pos in (i, j):
                variant.filters_param[pos] = replace(
                    variant.filters_param[pos],
                    bool_op="OR",
                    filter_group=group_id,
                )
            _add_expansion_metadata(variant, "F1_or_filter_group")
            variants.append(variant)
    return variants
|
|
1039
|
+
|
|
1040
|
+
|
|
1041
|
+
def _e1_expression_select(
    intent: SimulatorIntent, schema: SchemaGraph,
    column_metadata: dict[str, dict[str, dict[str, Any]]],
) -> list[SimulatorIntent]:
    """E1: Append a product of two numeric-measure columns to SELECT.

    Args:
        intent: Intent to expand; it is deep-copied, never mutated.
        schema: Schema graph used to collect the numeric-measure columns of
            every intent table.
        column_metadata: Unused here; kept for a uniform operator signature.

    Returns:
        One variant per unordered pair of collected columns, each with a
        single ``MulGroup`` multiplying the pair.  Empty when fewer than
        two columns are available.
    """
    measures: list[str] = []
    for table in intent.tables or []:
        measures.extend(_get_numeric_measure_columns(schema, table))
    if len(measures) < 2:
        return []

    variants: list[SimulatorIntent] = []
    for pos, left in enumerate(measures):
        for right in measures[pos + 1:]:
            variant = copy.deepcopy(intent)
            product = NormalizedExpr(
                add_groups=[MulGroup(multiply=[left, right])],
            )
            variant.select_cols = [
                *(variant.select_cols or []),
                SelectCol(expr=product),
            ]
            _add_expansion_metadata(variant, "E1_expression_select")
            variants.append(variant)
    return variants
|
|
1068
|
+
|
|
1069
|
+
|
|
1070
|
+
# ---------------------------------------------------------------------------
|
|
1071
|
+
# Orchestration: single-depth expansion
|
|
1072
|
+
# ---------------------------------------------------------------------------
|
|
1073
|
+
|
|
1074
|
+
# Loose alias for an operator callable held in the registry.  Entries are
# invoked as ``op(intent, schema)`` and return a list of intent variants.
_OperatorFn = Any
|
|
1075
|
+
|
|
1076
|
+
|
|
1077
|
+
def _build_operator_registry(
    column_metadata: dict[str, dict[str, dict[str, Any]]],
    fk_map: dict[str, list[dict[str, str]]],
) -> dict[str, _OperatorFn]:
    """Build the operator map keyed by short code.

    Every value is a two-argument callable ``(intent, schema)`` with
    *column_metadata* (and, for the B-series, *fk_map*) already bound.

    Args:
        column_metadata: Per-table, per-column metadata forwarded to every
            operator.
        fk_map: Foreign-key map forwarded to the B-series join operators.

    Returns:
        The registry, in the insertion order in which operators are run.
    """
    def _plain(op):
        # Operators taking (intent, schema, column_metadata).
        return lambda i, s: op(i, s, column_metadata)

    def _join(op):
        # B-series operators additionally take the foreign-key map.
        return lambda i, s: op(i, s, fk_map, column_metadata)

    return {
        "A1": _plain(_a1_add_filter),
        "A2": _plain(_a2_add_expr_filter),
        "A3": _plain(_a3_change_aggregation),
        "A4": _plain(_a4_add_groupby),
        "A5": _plain(_a5_add_orderby),
        "A6": _plain(_a6_add_having_value),
        "A7": _plain(_a7_add_having_expr),
        "A8": _plain(_a8_remove_filter),
        "A9": _plain(_a9_remove_groupby),
        "A10": _plain(_a10_remove_having),
        "B1": _join(_b1_add_dimension_join),
        "B2": _join(_b2_add_fact_join),
        "B3": _join(_b3_swap_dimension),
        "B4": _join(_b4_remove_table),
        "B5": _join(_b5_bridge_via_intermediate),
        "C1": _plain(_c1_include_gold),
        "T1": _plain(_t1_extract_select_groupby),
        "T2": _plain(_t2_date_trunc_groupby),
        "T3": _plain(_t3_date_window_filter),
        "T4": _plain(_t4_date_diff_filter),
        "N1": _plain(_n1_round_numeric),
        "N2": _plain(_n2_abs_filter),
        "D1": _plain(_d1_add_distinct),
        "L1": _plain(_l1_add_limit),
        "F1": _plain(_f1_or_filter_group),
        "E1": _plain(_e1_expression_select),
    }
|
|
1110
|
+
|
|
1111
|
+
|
|
1112
|
+
def _expand_single_depth(
    intents: list[SimulatorIntent],
    schema: SchemaGraph,
    operators: dict[str, _OperatorFn],
    seen_keys: set[str],
) -> list[SimulatorIntent]:
    """Apply every operator to every intent and keep the unseen variants.

    A variant's ``intent_key`` is recorded in *seen_keys* (mutated in
    place) as soon as it is first encountered, before the grain check and
    before ``enforce_schema`` runs.  Variants that are ``"grouped"`` but
    have no GROUP BY columns are discarded; the rest are passed through
    ``enforce_schema`` and collected.

    Args:
        intents: Intents forming the current expansion layer.
        schema: Schema graph handed to operators and ``enforce_schema``.
        operators: Registry of ``(intent, schema)`` callables, run in
            dictionary order.
        seen_keys: Keys of intents already produced; updated in place.

    Returns:
        The accepted variants, in generation order.
    """
    accepted: list[SimulatorIntent] = []
    for source in intents:
        for operator in operators.values():
            for candidate in operator(source, schema):
                key = intent_key(candidate.to_runtime_intent())
                if key in seen_keys:
                    continue
                seen_keys.add(key)

                lacks_grouping = (
                    candidate.grain == "grouped"
                    and not candidate.group_by_cols
                )
                if lacks_grouping:
                    continue

                normalized, _ = enforce_schema(candidate, schema)
                accepted.append(normalized)
    return accepted
|
|
1137
|
+
|
|
1138
|
+
|
|
1139
|
+
# ---------------------------------------------------------------------------
|
|
1140
|
+
# Public API
|
|
1141
|
+
# ---------------------------------------------------------------------------
|
|
1142
|
+
|
|
1143
|
+
def expand_gold_intents(
    gold_intents: list[SimulatorIntent],
    schema: SchemaGraph,
    limits: SchemaLimits | None = None,
    max_depth: int | None = None,
) -> list[SimulatorIntent]:
    """Expand gold intents into synthetic intents, layer by layer.

    Depth 1 applies every registered operator to every gold intent; each
    later depth re-expands only the variants produced by the previous one.
    Expansion stops after *max_depth* layers or as soon as a layer yields
    nothing new.  Variants are deduplicated across all golds and depths by
    ``intent_key``; the gold intents' own keys are pre-registered, so a
    variant identical to a gold is never returned.

    When *limits* is given, its values are written onto the
    ``SimulatorConfig`` class attributes and are not restored afterwards.

    Args:
        gold_intents: Seed SimulatorIntents to expand.
        schema: Schema graph for column/table introspection.
        limits: Optional SchemaLimits overriding MAX_FILTERS,
            MAX_GROUPBY, MAX_TABLES.
        max_depth: Expansion depth; defaults to
            ``SimulatorConfig.MAX_EXPANSION_DEPTH``.

    Returns:
        List of unique synthetic SimulatorIntents.
    """
    if limits is not None:
        SimulatorConfig.MAX_FILTERS = limits.max_filters
        SimulatorConfig.MAX_GROUPBY = limits.max_groupby
        SimulatorConfig.MAX_TABLES = limits.max_tables
        log(
            f"expand_gold_intents: using SchemaLimits "
            f"max_filters={limits.max_filters}, "
            f"max_groupby={limits.max_groupby}, "
            f"max_tables={limits.max_tables}"
        )

    if max_depth is None:
        max_depth = SimulatorConfig.MAX_EXPANSION_DEPTH

    log(
        f"expand_gold_intents: expanding {len(gold_intents)} gold intents "
        f"with max_depth={max_depth}"
    )

    operators = _build_operator_registry(
        _build_column_metadata(schema),
        _build_fk_map(schema),
    )

    # Seed the dedup set with the golds so no variant duplicates one.
    seen_keys: set[str] = {
        intent_key(gold.to_runtime_intent()) for gold in gold_intents
    }

    frontier = list(gold_intents)
    synthetic: list[SimulatorIntent] = []
    for depth in range(1, max_depth + 1):
        fresh = _expand_single_depth(frontier, schema, operators, seen_keys)
        log(
            f"expand_gold_intents: depth={depth} produced "
            f"{len(fresh)} new variants"
        )
        if not fresh:
            break
        synthetic.extend(fresh)
        # Only the newest layer is expanded on the next pass.
        frontier = fresh

    log(
        f"expand_gold_intents: generated {len(synthetic)} "
        f"unique synthetic intents across {max_depth} depth(s)"
    )
    return synthetic
|