waxsql 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waxsql/__init__.py +158 -0
- waxsql/ast.py +757 -0
- waxsql/catalog.py +363 -0
- waxsql/cli.py +888 -0
- waxsql/config.py +477 -0
- waxsql/context.py +255 -0
- waxsql/data.py +99 -0
- waxsql/gen/__init__.py +51 -0
- waxsql/gen/cte.py +367 -0
- waxsql/gen/data/__init__.py +14 -0
- waxsql/gen/data/columns.py +48 -0
- waxsql/gen/data/emit.py +247 -0
- waxsql/gen/data/rows.py +236 -0
- waxsql/gen/data/strategies.py +299 -0
- waxsql/gen/expr.py +723 -0
- waxsql/gen/select.py +831 -0
- waxsql/gen/setop.py +259 -0
- waxsql/gen/subquery.py +397 -0
- waxsql/gen/window.py +398 -0
- waxsql/pretty.py +81 -0
- waxsql/printer.py +688 -0
- waxsql/py.typed +0 -0
- waxsql/schema.py +557 -0
- waxsql/scope.py +391 -0
- waxsql/types.py +187 -0
- waxsql/validate/__init__.py +52 -0
- waxsql/validate/parse.py +194 -0
- waxsql/validate/plan.py +149 -0
- waxsql/validate/syntax.py +87 -0
- waxsql-1.0.0.dist-info/METADATA +746 -0
- waxsql-1.0.0.dist-info/RECORD +35 -0
- waxsql-1.0.0.dist-info/WHEEL +5 -0
- waxsql-1.0.0.dist-info/entry_points.txt +2 -0
- waxsql-1.0.0.dist-info/licenses/LICENSE +21 -0
- waxsql-1.0.0.dist-info/top_level.txt +1 -0
waxsql/gen/setop.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
"""Set-operation generator (UNION / INTERSECT / EXCEPT).
|
|
2
|
+
|
|
3
|
+
Role: combines N SELECTs into one statement. The hard constraint
|
|
4
|
+
that gives the file its shape: both (all) sides of a set-op must
|
|
5
|
+
produce the SAME column types in the SAME order. PG enforces this
|
|
6
|
+
positionally — column #i of every arm must implicitly-cast to a
|
|
7
|
+
common type with column #i of every other arm. The generator
|
|
8
|
+
sidesteps the cross-arm cast negotiation entirely by FIXING arm 1's
|
|
9
|
+
exact target types and reusing them as the spec for arms 2..N (see
|
|
10
|
+
`target_types` below). Arms 2..N don't get gen_select's full freedom;
|
|
11
|
+
they call _gen_arm_select with a pinned type list.
|
|
12
|
+
|
|
13
|
+
One public entry point: `gen_setop(ctx)` builds a SetOp wrapping
|
|
14
|
+
N Selects with matching column counts and types.
|
|
15
|
+
|
|
16
|
+
Two phases:
|
|
17
|
+
|
|
18
|
+
1. **Arm 1**: a full `gen_select(arm1_ctx)` in its own child scope.
|
|
19
|
+
Could be aggregate, could have any target shape. We extract its
|
|
20
|
+
target types after generation, then strip its with_ctes /
|
|
21
|
+
order_by / limit / offset (those belong to the SetOp wrapper,
|
|
22
|
+
not the arm).
|
|
23
|
+
|
|
24
|
+
2. **Arms 2+**: simpler Selects that match arm 1's target types
|
|
25
|
+
position-by-position. Built via `_gen_arm_select` which produces
|
|
26
|
+
a non-aggregate SELECT with FROM, optional WHERE, and one
|
|
27
|
+
SelectTarget per requested type (generated via gen_expr against
|
|
28
|
+
the arm's local scope).
|
|
29
|
+
|
|
30
|
+
Each arm gets its own scope via `descend_subquery(correlated=False)`
|
|
31
|
+
so arm-local FROM aliases don't collide with the surrounding query
|
|
32
|
+
or with other arms. The shared `alias_counter` (and `cte_counter`)
|
|
33
|
+
ensures unique naming across the whole query.
|
|
34
|
+
|
|
35
|
+
After all arms are generated, the SetOp wrapper optionally gets
|
|
36
|
+
its own ORDER BY (positional reference: `ORDER BY 1`) and LIMIT —
|
|
37
|
+
both PG-valid only at the combined-statement level for milestone 7
|
|
38
|
+
(per-arm ORDER BY/LIMIT requires explicit parens).
|
|
39
|
+
"""
|
|
40
|
+
from __future__ import annotations
|
|
41
|
+
|
|
42
|
+
from dataclasses import replace
|
|
43
|
+
from typing import Optional
|
|
44
|
+
|
|
45
|
+
from ..ast import (
|
|
46
|
+
Expr, Literal, OrderByItem, Select, SelectTarget, SetOp,
|
|
47
|
+
)
|
|
48
|
+
from ..config import (
|
|
49
|
+
FEATURE_LIMIT, FEATURE_ORDER_BY, FEATURE_WHERE,
|
|
50
|
+
)
|
|
51
|
+
from ..context import GenContext
|
|
52
|
+
from ..types import BOOL, INT4, PgType
|
|
53
|
+
from .expr import gen_expr
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
_LIMIT_VALUES: tuple[int, ...] = (1, 5, 10, 25, 50, 100)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def gen_setop(ctx: GenContext) -> SetOp:
    """Build a SetOp that joins several SELECT arms under one operator.

    The first arm is produced by a full `gen_select` call and then has
    its statement-level clauses removed by `_strip_arm_clauses`. The
    pg_type of each of its targets, in order, becomes the positional
    signature that every later arm is asked to reproduce.

    Each later arm is either a plain `_gen_arm_select`, or — when the
    arm's context is not at a subquery leaf and a draw against
    `cfg.p_nested_set_op_arm` succeeds — a nested SetOp built by
    `_gen_nested_setop_arm`.

    Every arm is generated in its own
    `ctx.descend_subquery(correlated=False)` context. The wrapper may
    additionally receive a positional ORDER BY (`ORDER BY <n>`) and a
    LIMIT, each gated on its feature flag and its probability knob.

    NOTE: the sequence of RNG draws below determines the generated
    query, so the statements must not be reordered.
    """
    # Function-scope import of the SELECT generator from the sibling
    # module. NOTE(review): presumably kept lazy to avoid an import
    # cycle between gen/select.py and this module — confirm which
    # module imports the other at top level.
    from .select import gen_select

    config = ctx.config
    rand = ctx.rng

    arm_total = rand.randint(2, config.max_set_op_arms)
    operator = rand.choice(("UNION", "INTERSECT", "EXCEPT"))
    keep_duplicates = rand.random() < config.p_set_op_all

    # First arm: unrestricted SELECT in a fresh child context, with
    # WITH / ORDER BY / LIMIT / OFFSET removed afterwards.
    first_arm = _strip_arm_clauses(
        gen_select(ctx.descend_subquery(correlated=False))
    )

    # Positional type signature that arms 2..N are generated against.
    signature: list[PgType] = [
        target.expr.pg_type for target in first_arm.targets
    ]

    # Mixed list: plain Selects and nested SetOps may both appear.
    collected: list[Select | SetOp] = [first_arm]
    for _arm_index in range(1, arm_total):
        branch_ctx = ctx.descend_subquery(correlated=False)
        # The probability draw happens only when the branch still has
        # subquery budget (mirrors the short-circuiting `and`).
        wants_nested = False
        if not branch_ctx.at_subquery_leaf():
            wants_nested = rand.random() < config.p_nested_set_op_arm
        builder = _gen_nested_setop_arm if wants_nested else _gen_arm_select
        collected.append(builder(branch_ctx, signature))

    # Wrapper-level ORDER BY: an integer literal naming a 1-based
    # output column, so no output-column names are needed.
    ordering: tuple[OrderByItem, ...] = ()
    if FEATURE_ORDER_BY in config.feature_flags:
        if rand.random() < config.p_order_by:
            column_position = rand.randint(1, len(signature))
            sort_direction = rand.choice(("ASC", "DESC"))
            ordering = (
                OrderByItem(
                    expr=Literal(INT4, column_position),
                    direction=sort_direction,
                ),
            )

    # Wrapper-level LIMIT drawn from the fixed value pool.
    row_cap: Optional[Expr] = None
    if FEATURE_LIMIT in config.feature_flags:
        if rand.random() < config.p_limit:
            row_cap = Literal(INT4, rand.choice(_LIMIT_VALUES))

    return SetOp(
        op=operator,
        all=keep_duplicates,
        arms=tuple(collected),
        order_by=ordering,
        limit=row_cap,
    )
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# ===========================================================================
|
|
151
|
+
# Internals
|
|
152
|
+
# ===========================================================================
|
|
153
|
+
|
|
154
|
+
def _gen_nested_setop_arm(
    ctx: GenContext,
    target_types: list[PgType],
) -> SetOp:
    """Produce a two-arm SetOp to be used as one arm of an outer SetOp.

    Both inner arms are built by `_gen_arm_select` against the
    caller-supplied `target_types`, so the nested statement has the
    same positional column signature as the outer set-op's other arms.

    The returned SetOp is constructed with only `op`, `all` and `arms`
    — no ORDER BY or LIMIT is attached here. The arm count is fixed at
    two, and the inner arms are always plain Selects, so this function
    never nests further.
    """
    rand = ctx.rng
    operator = rand.choice(("UNION", "INTERSECT", "EXCEPT"))
    keep_duplicates = rand.random() < ctx.config.p_set_op_all

    arm_count = 2
    # Each inner arm gets its own uncorrelated child context; the
    # generator expression evaluates them left to right.
    members = tuple(
        _gen_arm_select(ctx.descend_subquery(correlated=False), target_types)
        for _ in range(arm_count)
    )

    return SetOp(op=operator, all=keep_duplicates, arms=members)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _strip_arm_clauses(s: Select) -> Select:
    """Return a copy of `s` without its statement-level clauses.

    The copy has `with_ctes` and `order_by` reset to empty tuples and
    `limit` / `offset` reset to None; every other field is carried over
    unchanged. The input object itself is not modified.

    These clauses are dropped from an arm because the SetOp wrapper
    carries its own ORDER BY / LIMIT for the combined statement.
    """
    cleared = {
        "with_ctes": (),
        "order_by": (),
        "limit": None,
        "offset": None,
    }
    return replace(s, **cleared)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
# CONTRACT: every Select returned here has exactly len(target_types)
|
|
208
|
+
# targets, in order, with each target's pg_type implicitly-castable
|
|
209
|
+
# from target_types[i]. This is what the outer SetOp wrapper relies
|
|
210
|
+
# on for cross-arm compatibility. Violating it produces a PG error
|
|
211
|
+
# like "each UNION query must have the same number of columns" or
|
|
212
|
+
# "UNION types int and text cannot be matched".
|
|
213
|
+
def _gen_arm_select(
    ctx: GenContext,
    target_types: list[PgType],
) -> Select:
    """Build a non-aggregate SELECT whose targets follow `target_types`.

    The result has a FROM clause, one SelectTarget per entry of
    `target_types` (same order, generated with aggregates disallowed),
    and an optional WHERE that is emitted only when FEATURE_WHERE is
    enabled and a draw against `cfg.p_where` succeeds.

    This path does not go through gen_select, so no GROUP BY /
    aggregate shape is ever produced here.
    """
    # Function-scope import of the FROM-clause builder from the
    # sibling module. NOTE(review): presumably lazy to avoid an import
    # cycle with gen/select.py — confirm.
    from .select import _gen_from_clause

    config = ctx.config

    # FROM is generated first: it fills the arm-local scope that the
    # target and WHERE expressions below are generated against.
    source = _gen_from_clause(ctx)

    # One context with aggregates switched off, shared by targets and WHERE.
    no_agg_ctx = replace(ctx, allow_aggregates=False)

    projected = []
    for wanted_type in target_types:
        projected.append(SelectTarget(expr=gen_expr(no_agg_ctx, wanted_type)))

    predicate: Optional[Expr] = None
    if FEATURE_WHERE in config.feature_flags:
        # The probability draw happens only when the feature is on.
        if ctx.rng.random() < config.p_where:
            predicate = gen_expr(no_agg_ctx, BOOL)

    return Select(
        targets=tuple(projected),
        from_=source,
        where=predicate,
    )
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
__all__ = ["gen_setop"]
|
waxsql/gen/subquery.py
ADDED
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
"""Subquery generators.
|
|
2
|
+
|
|
3
|
+
Role: covers the four SQL surfaces where a SELECT appears as a value
|
|
4
|
+
or table source inside another query. Each public entry point hands
|
|
5
|
+
back an AST node that gen_expr or gen_select can splice directly into
|
|
6
|
+
the surrounding tree.
|
|
7
|
+
|
|
8
|
+
SCOPE HANDLING is the subtle bit. Correlated subqueries need the
|
|
9
|
+
outer scope visible while the inner is being built (so inner
|
|
10
|
+
expressions can pick outer columns); derived tables produce a fresh
|
|
11
|
+
scope whose *outer* names are projected back as `alias.cN` and whose
|
|
12
|
+
*internals* are NOT visible to siblings. `descend_subquery` on the
|
|
13
|
+
context is the chokepoint that decides which: passing
|
|
14
|
+
correlated=True keeps the outer chain reachable from the child scope;
|
|
15
|
+
correlated=False severs it.
|
|
16
|
+
|
|
17
|
+
Four public entry points. The first three return a complete `Expr`
|
|
18
|
+
for use in `gen/expr.py`'s candidate dispatch; the fourth returns a
|
|
19
|
+
FromItem for `gen/select.py`'s FROM-clause builder:
|
|
20
|
+
|
|
21
|
+
* `gen_scalar_subquery(ctx, target_type, *, correlated)` — returns
|
|
22
|
+
a `Subquery(target_type, inner)`. Inner has a single target of
|
|
23
|
+
`target_type` and `LIMIT 1` for runtime safety.
|
|
24
|
+
|
|
25
|
+
* `gen_exists_subquery(ctx, *, correlated)` — returns
|
|
26
|
+
`Exists(BOOL, inner, negated)`. Inner has the canonical
|
|
27
|
+
`SELECT 1 FROM ...` shape.
|
|
28
|
+
|
|
29
|
+
* `gen_in_subquery(ctx, *, correlated)` — returns
|
|
30
|
+
`InSubquery(BOOL, lhs, inner, negated)`. The LHS is generated
|
|
31
|
+
BEFORE descending (so it references the outer scope, not the
|
|
32
|
+
inner FROM tables); the inner produces a single column of the
|
|
33
|
+
LHS's type.
|
|
34
|
+
|
|
35
|
+
* `gen_derived_table(ctx, alias, *, lateral)` — returns a
|
|
36
|
+
`DerivedTable` FromItem (`[LATERAL ](SELECT ...) AS alias`).
|
|
37
|
+
Inner has 1..N targets aliased c1..cN; with `lateral=True`,
|
|
38
|
+
the same forced-correlation predicate gets injected so LATERAL
|
|
39
|
+
actually exercises its capability.
|
|
40
|
+
|
|
41
|
+
All four share `_build_subquery_select`, which handles the recurring
|
|
42
|
+
pattern: descend, build FROM, build target(s), optionally WHERE,
|
|
43
|
+
force correlation if requested.
|
|
44
|
+
|
|
45
|
+
The "force correlation" path is what gives the `correlated=True`
|
|
46
|
+
flag teeth — without it, the inner WHERE might happen to pick an
|
|
47
|
+
outer column or might not, and the test suite's
|
|
48
|
+
"correlated-references-outer" invariant would be flaky. With it,
|
|
49
|
+
every correlated subquery has at least one outer-column reference
|
|
50
|
+
in its WHERE.
|
|
51
|
+
"""
|
|
52
|
+
from __future__ import annotations
|
|
53
|
+
|
|
54
|
+
from dataclasses import replace
|
|
55
|
+
from typing import Optional
|
|
56
|
+
|
|
57
|
+
from ..ast import (
|
|
58
|
+
BinaryOp, ColumnRef, DerivedTable, Exists, Expr, InSubquery, Literal,
|
|
59
|
+
Select, SelectTarget, Subquery,
|
|
60
|
+
)
|
|
61
|
+
from ..config import FEATURE_WHERE
|
|
62
|
+
from ..context import GenContext
|
|
63
|
+
from ..scope import Scope
|
|
64
|
+
from ..types import BOOL, INT4, INT8, NUMERIC, PgType, TEXT, TIMESTAMPTZ
|
|
65
|
+
from .expr import gen_expr, gen_literal
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# Probability of negating an EXISTS / IN subquery. Kept low: most
|
|
69
|
+
# real SQL uses the affirmative form; NOT EXISTS / NOT IN appear
|
|
70
|
+
# as anti-joins or set differences but less commonly.
|
|
71
|
+
_P_NEGATE_EXISTS = 0.3
|
|
72
|
+
_P_NEGATE_IN = 0.3
|
|
73
|
+
|
|
74
|
+
# Type pool for the IN subquery's comparison. Chosen from types where
|
|
75
|
+
# `=` is in the catalog so the inner SELECT's column type aligns with
|
|
76
|
+
# what the outer LHS expression can produce. Excludes JSONB / arrays
|
|
77
|
+
# since the catalog has no `=` op for those.
|
|
78
|
+
_IN_COMPARISON_TYPES: tuple[PgType, ...] = (
|
|
79
|
+
INT4, INT8, NUMERIC, TEXT, TIMESTAMPTZ,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# ===========================================================================
|
|
84
|
+
# Public entry points
|
|
85
|
+
# ===========================================================================
|
|
86
|
+
|
|
87
|
+
def gen_scalar_subquery(
    ctx: GenContext,
    target_type: PgType,
    *,
    correlated: bool,
) -> Subquery:
    """Return a scalar subquery expression of type `target_type`.

    The body is built by `_build_subquery_select` with a single target
    of `target_type` and `with_limit_1=True`, so the inner SELECT
    carries `LIMIT 1`. `correlated` is forwarded unchanged and decides
    whether the body may reference (and is forced to reference) the
    outer scope.
    """
    return Subquery(
        target_type,
        _build_subquery_select(
            ctx,
            correlated=correlated,
            target_type=target_type,
            with_limit_1=True,
        ),
    )
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def gen_exists_subquery(
    ctx: GenContext,
    *,
    correlated: bool,
) -> Exists:
    """Return a `[NOT ]EXISTS (SELECT 1 FROM ...)` expression.

    The inner SELECT's only target is the integer literal 1, and no
    LIMIT is attached. The body is built first; the negation flag is
    drawn afterwards against `_P_NEGATE_EXISTS`.
    """
    constant_one = Literal(INT4, 1)
    body = _build_subquery_select(
        ctx,
        correlated=correlated,
        target_expr=constant_one,
        with_limit_1=False,
    )
    # Drawn after the body so the RNG sequence stays unchanged.
    is_negated = ctx.rng.random() < _P_NEGATE_EXISTS
    return Exists(BOOL, body, negated=is_negated)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def gen_derived_table(
    ctx: GenContext,
    alias: str,
    *,
    lateral: bool,
) -> DerivedTable:
    """Return a `[LATERAL ](SELECT ...) AS alias` FROM item.

    The number of output columns is drawn from
    1..`cfg.max_derived_table_columns`, and each column's type is drawn
    from `_IN_COMPARISON_TYPES`. After the body is built, every target
    is re-wrapped with the alias `c1`, `c2`, ... (1-based, in target
    order) so the outer query can refer to `<alias>.cN`.

    `lateral` is passed to `_build_subquery_select` as its `correlated`
    flag, so a lateral derived table goes through the same
    forced-correlation path as correlated subqueries. No LIMIT is
    attached to the body.
    """
    rand = ctx.rng
    column_count = rand.randint(1, ctx.config.max_derived_table_columns)

    # One type draw per output column, in column order.
    column_types = []
    for _ in range(column_count):
        column_types.append(rand.choice(_IN_COMPARISON_TYPES))

    body = _build_subquery_select(
        ctx,
        correlated=lateral,
        target_types=tuple(column_types),
        with_limit_1=False,
    )

    # Stable, expression-independent column names for the outer side.
    renamed = tuple(
        SelectTarget(expr=target.expr, alias=f"c{position}")
        for position, target in enumerate(body.targets, start=1)
    )
    return DerivedTable(replace(body, targets=renamed), alias, lateral=lateral)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def gen_in_subquery(
    ctx: GenContext,
    *,
    correlated: bool,
) -> InSubquery:
    """Return a `<lhs> [NOT ]IN (SELECT col FROM ...)` expression.

    A comparison element type is drawn from `_IN_COMPARISON_TYPES`.
    The left-hand side is generated with the caller's `ctx` (the outer
    context), and the inner SELECT is asked for a single target of the
    same element type. The node itself is typed BOOL.
    """
    # Element type shared by the LHS and the inner column; distinct
    # from the BOOL type of the resulting InSubquery node.
    element_type = ctx.rng.choice(_IN_COMPARISON_TYPES)

    # ORDERING DEPENDENCY: the LHS must be generated before the body.
    # It is generated with the outer `ctx`, and building the body
    # afterwards keeps the sequence of RNG draws unchanged.
    left_operand = gen_expr(ctx, element_type)

    body = _build_subquery_select(
        ctx,
        correlated=correlated,
        target_type=element_type,
        with_limit_1=False,
    )

    is_negated = ctx.rng.random() < _P_NEGATE_IN
    return InSubquery(BOOL, left_operand, body, negated=is_negated)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# ===========================================================================
|
|
212
|
+
# Shared subquery body builder
|
|
213
|
+
# ===========================================================================
|
|
214
|
+
|
|
215
|
+
def _build_subquery_select(
    ctx: GenContext,
    *,
    correlated: bool,
    target_type: Optional[PgType] = None,
    target_types: Optional[tuple[PgType, ...]] = None,
    target_expr: Optional[Expr] = None,
    with_limit_1: bool = False,
) -> Select:
    """Construct a SELECT for use as a subquery body.

    Caller provides EXACTLY ONE target spec:
      * `target_type` (singular) — generate a single target of that
        type; convenience for the common single-column case.
      * `target_types` (tuple) — generate len(target_types) targets,
        one of each type, in order.
      * `target_expr` — a pre-built single target expression
        (used by EXISTS's constant `SELECT 1`).

    Args:
        ctx: the OUTER generation context; a child context is derived
            from it via `descend_subquery`.
        correlated: forwarded to `descend_subquery`; when true an
            outer-referencing predicate is AND-injected into WHERE.
        with_limit_1: attach `LIMIT 1` to the returned Select.

    Returns:
        A Select with the requested targets, a generated FROM clause,
        an optional WHERE, and an optional `LIMIT 1`.

    Raises:
        ValueError: if zero or more than one target spec is supplied.
            The check runs before `descend_subquery` and the FROM
            builder are called, so a rejected call does not consume
            RNG draws or advance any shared state through them.

    The flow:
      1. Validate the target spec.
      2. Capture outer scope (needed by the correlation forcer).
      3. Descend via ctx.descend_subquery.
      4. Build FROM clause against the child context.
      5. Build the target list (generated or supplied), with
         aggregates disallowed for generated targets.
      6. Optionally generate WHERE (aggregates disallowed).
      7. If correlated, AND-inject an outer-referencing predicate.
      8. Add LIMIT 1 if requested.
    """
    # Validate up front: exactly one of the three target specs must be
    # given. Doing this before any generation keeps a programming
    # error from perturbing the RNG sequence or shared state.
    set_count = sum(
        1 for x in (target_expr, target_type, target_types) if x is not None
    )
    if set_count != 1:
        raise ValueError(
            "_build_subquery_select needs exactly one of target_expr, "
            f"target_type, target_types (got {set_count})"
        )

    # Lazy import to avoid closing an import loop with gen/select.py.
    from .select import _gen_from_clause

    # Capture the outer scope BEFORE descending; the correlation
    # forcer needs it to pick an outer column for its predicate.
    outer_scope = ctx.scope
    child_ctx = ctx.descend_subquery(correlated=correlated)
    cfg = child_ctx.config
    rng = child_ctx.rng

    from_clause = _gen_from_clause(child_ctx)

    if target_expr is not None:
        target_exprs: tuple[Expr, ...] = (target_expr,)
    else:
        # Generated targets never contain aggregates: without GROUP BY
        # a mix of aggregate and plain column refs in one target is
        # rejected by PG (error 42803).
        target_ctx = replace(child_ctx, allow_aggregates=False)
        if target_types is not None:
            types_tuple: tuple[PgType, ...] = target_types
        else:
            # Guaranteed by the validation above; the assert only
            # narrows the Optional for the type checker.
            assert target_type is not None
            types_tuple = (target_type,)
        target_exprs = tuple(gen_expr(target_ctx, t) for t in types_tuple)

    where: Optional[Expr] = None
    if FEATURE_WHERE in cfg.feature_flags and rng.random() < cfg.p_where:
        where_ctx = replace(child_ctx, allow_aggregates=False)
        where = gen_expr(where_ctx, BOOL)

    if correlated:
        where = _force_correlation_predicate(child_ctx, outer_scope, where)

    return Select(
        targets=tuple(SelectTarget(expr=e) for e in target_exprs),
        from_=from_clause,
        where=where,
        limit=Literal(INT4, 1) if with_limit_1 else None,
    )
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
# ===========================================================================
|
|
315
|
+
# Correlation enforcement
|
|
316
|
+
# ===========================================================================
|
|
317
|
+
|
|
318
|
+
def _force_correlation_predicate(
    child_ctx: GenContext,
    outer_scope: Scope,
    existing_where: Optional[Expr],
) -> Optional[Expr]:
    """AND an `outer_col = <rhs>` predicate onto the inner WHERE.

    An outer column is chosen among `outer_scope.visible_columns()`
    whose type has a same-typed `=` operator in the catalog. The
    right-hand side is an inner column of the same type when one
    exists, otherwise a literal of that type.

    Returns:
        * `existing_where` unchanged when the outer scope offers no
          equality-comparable column (correlation is skipped);
        * the bare equality when there was no WHERE yet;
        * otherwise `equality AND existing_where`.
    """
    rand = child_ctx.rng

    # Types T for which the catalog registers `T = T`. Used only for
    # membership tests, so set ordering never influences output.
    comparable_types = set()
    for op in child_ctx.catalog.binary_ops_returning(BOOL):
        if op.symbol == "=" and op.left == op.right:
            comparable_types.add(op.left)

    outer_pool = [
        col for col in outer_scope.visible_columns()
        if col.type in comparable_types
    ]
    if not outer_pool:
        # Nothing comparable on the outer side: leave WHERE as-is
        # rather than emit an invalid comparison.
        return existing_where

    anchor = rand.choice(outer_pool)

    # Restrict RHS candidates to columns of tables aliased in the
    # child scope itself: a correlated child scope also exposes outer
    # columns, and comparing two outer columns would not tie the
    # predicate to the inner query.
    local_aliases = {name for name, _ in child_ctx.scope.aliased_tables()}
    inner_pool = [
        col for col in child_ctx.scope.visible_columns()
        if col.table_alias in local_aliases and col.type == anchor.type
    ]

    if inner_pool:
        partner = rand.choice(inner_pool)
        right_side: Expr = ColumnRef(
            partner.type, partner.table_alias, partner.column,
        )
    else:
        right_side = gen_literal(rand, anchor.type)

    equality = BinaryOp(
        BOOL, "=",
        ColumnRef(anchor.type, anchor.table_alias, anchor.column),
        right_side,
    )

    if existing_where is None:
        return equality
    # Correlation predicate first, pre-existing WHERE second.
    return BinaryOp(BOOL, "AND", equality, existing_where)
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
__all__ = [
|
|
393
|
+
"gen_scalar_subquery",
|
|
394
|
+
"gen_exists_subquery",
|
|
395
|
+
"gen_in_subquery",
|
|
396
|
+
"gen_derived_table",
|
|
397
|
+
]
|