waxsql 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waxsql/gen/select.py ADDED
@@ -0,0 +1,831 @@
1
+ """SELECT-statement generator.
2
+
3
+ Role: the integration layer between schema (tables, columns, FKs),
4
+ scope (visible bindings), and `gen_expr` (typed expression
5
+ production). Every other generator in `gen/` that emits a SELECT —
6
+ subquery bodies, CTE bodies, set-op arms — ultimately routes through
7
+ `gen_select` or shares its FROM-clause helpers, so the JOIN/scope
8
+ rules here are the canonical SELECT-shape definition for the project.
9
+
10
+ The integration layer: picks tables for the FROM clause, populates the
11
+ scope, decides comma-FROM vs explicit JOIN, biases ON conditions
12
+ toward FK relationships, generates SELECT-list expressions via
13
+ `gen_expr`, and optionally adds WHERE / ORDER BY / LIMIT according
14
+ to the complexity dial.
15
+
16
+ Notable design choices:
17
+
18
+ * The caller owns the scope. `gen_select` does NOT create its own
19
+ scope — it adds tables to whatever scope is on `ctx`. For
20
+ top-level use, the caller passes an empty Scope; for the future
21
+ subquery case, the caller pushes a child scope first. Keeping
22
+ scope ownership outside `gen_select` keeps the generator
23
+ composable.
24
+
25
+ * JOIN tree is left-deep, with each new table added to scope
26
+ BEFORE its ON condition is generated. This matches PostgreSQL's
27
+ rule that the ON of `tA JOIN tB` can only see tA and tB (not
28
+ later tables), so the generator's notion of "visible" tracks PG's.
29
+
30
+ * ON-condition FK biasing looks at FKs in either direction (right
31
+ table → existing tables, OR existing tables → right table). When
32
+ the schema's FK density is non-zero, this dominates over the
33
+ random-BOOL fallback, producing joins that read like real ones.
34
+
35
+ * Two top-level shapes — non-aggregating and aggregating (with
36
+ GROUP BY / HAVING) — are picked between by `gen_select` based
37
+ on the FEATURE_AGGREGATE flag and `p_aggregate_query` config.
38
+ Splitting on the shape keeps each path's invariants local: the
39
+ non-agg path forces `allow_aggregates=False` so aggregates can't
40
+ sneak in via WHERE or SELECT items; the agg path constructs
41
+ every SELECT-list non-aggregate as a verbatim GROUP BY entry so
42
+ PG's "must appear in GROUP BY" rule is satisfied by construction.
43
+ """
44
+ from __future__ import annotations
45
+
46
+ import random
47
+ from dataclasses import replace
48
+
49
+ from typing import Optional
50
+
51
+ from ..ast import (
52
+ BinaryOp, ColumnRef, CteDef, CteRef, Expr, FromItem,
53
+ FuncCall, GroupingSet, JoinExpr, Literal, OrderByItem, Select,
54
+ SelectTarget, TableRef,
55
+ )
56
+ from ..catalog import FuncKind
57
+ from ..config import (
58
+ FEATURE_AGGREGATE, FEATURE_CTE, FEATURE_DERIVED_TABLE, FEATURE_GROUPING_SET,
59
+ FEATURE_HAVING, FEATURE_INNER_JOIN, FEATURE_LATERAL, FEATURE_LEFT_JOIN,
60
+ FEATURE_LIMIT, FEATURE_ORDER_BY, FEATURE_RECURSIVE_CTE, FEATURE_WHERE,
61
+ )
62
+ from ..context import GenContext
63
+ from ..schema import Table
64
+ from ..types import (
65
+ BOOL, INT4, INT8, NUMERIC, PgType, TEXT, TIMESTAMPTZ,
66
+ )
67
+ from .cte import gen_cte_def, gen_recursive_cte_def
68
+ from .expr import (
69
+ ORDERED_SET_AGGREGATES, coerce_to_param_type, gen_expr,
70
+ gen_filter_predicate, gen_literal, gen_ordered_set_agg,
71
+ should_emit_count_star, should_emit_filter,
72
+ )
73
+ from .subquery import gen_derived_table
74
+ from .window import hoist_named_windows
75
+
76
+
77
# Target-type weights for SELECT-list expressions. Biased toward the
# common types so output reads like normal SQL; rare types (UUID,
# JSONB, INTERVAL, FLOAT8) are excluded here because their literals
# and operators tend to produce visually noisy output, and they're
# still reachable via column refs when the schema includes them.
#
# Each entry is (type, relative weight). The weights are relative, not
# probabilities: `_pick_select_type` hands them to `random.choices`,
# which normalizes them itself.
_SELECT_TYPE_WEIGHTS: tuple[tuple[PgType, float], ...] = (
    (INT4, 2.0),
    (INT8, 2.0),
    (NUMERIC, 1.0),
    (TEXT, 3.0),
    (BOOL, 1.5),
    (TIMESTAMPTZ, 1.0),
)
90
+
91
# Limit values to draw from. Small set keeps output predictable;
# fuzzing rare LIMIT-edge-cases (LIMIT 0, LIMIT NULL, LIMIT ALL) is
# its own future concern. `_maybe_gen_limit` picks uniformly from this
# tuple and wraps the pick in an INT4 literal.
_LIMIT_VALUES: tuple[int, ...] = (1, 5, 10, 25, 50, 100)
95
+
96
+
97
+ # ===========================================================================
98
+ # Public entry point
99
+ # ===========================================================================
100
+
101
def gen_select(ctx: GenContext) -> Select:
    """Generate one SELECT statement, optionally prefixed by a WITH clause.

    The caller owns the scope: CTEs produced for the WITH clause and
    the tables chosen for the FROM clause are added to `ctx.scope`, so
    pass an empty scope for a top-level query or a freshly pushed child
    scope for a nested one.

    Shape selection:

    * Aggregating (GROUP BY, optional HAVING) — taken when
      FEATURE_AGGREGATE is in the configured feature flags and a roll
      against `p_aggregate_query` succeeds.
    * Non-aggregating — taken otherwise, on a copy of `ctx` with
      `allow_aggregates=False` so no aggregate can appear in the
      SELECT list or WHERE of a query that has no GROUP BY.

    Returns:
        The generated `Select`, after `hoist_named_windows` has been
        applied to it.
    """
    # ORDERING DEPENDENCY: the WITH clause is generated before either
    # body so its CTEs are already registered in `ctx.scope` when
    # _gen_from_clause / _make_from_item decide between a base table
    # and a CTE reference.
    ctes = _gen_with_clause(ctx)

    settings = ctx.config
    # Short-circuit `and`: the probability roll is only consumed when
    # the aggregate feature is unlocked.
    aggregating = (
        FEATURE_AGGREGATE in settings.feature_flags
        and ctx.rng.random() < settings.p_aggregate_query
    )

    if aggregating:
        statement = _gen_aggregate_select(ctx, with_ctes=ctes)
    else:
        plain_ctx = replace(ctx, allow_aggregates=False)
        statement = _gen_non_aggregate_select(plain_ctx, with_ctes=ctes)

    # Post-pass run unconditionally on whichever shape was produced.
    return hoist_named_windows(statement)
152
+
153
+
154
def _gen_with_clause(ctx: GenContext) -> tuple[CteDef, ...]:
    """Produce the CTE definitions for an optional WITH clause.

    An empty tuple is returned, in this order of checks, when:

    * FEATURE_CTE is not in the configured feature flags, or
    * `ctx.allow_with` is false, or
    * `ctx.at_subquery_leaf()` reports no remaining depth budget, or
    * the roll against `p_with_clause` fails.

    Otherwise between 1 and `max_ctes_per_with` CTEs are generated.
    Each plain CTE is registered in `ctx.scope` right after it is
    built, so a later CTE — and the main query body generated after
    this function returns — can reference it.
    """
    settings = ctx.config
    rand = ctx.rng
    features = settings.feature_flags

    # Guard clauses in the original short-circuit order: the RNG is only
    # touched once every structural precondition holds.
    if FEATURE_CTE not in features:
        return ()
    if not ctx.allow_with:
        return ()
    if ctx.at_subquery_leaf():
        return ()
    if rand.random() >= settings.p_with_clause:
        return ()

    cte_defs: list[CteDef] = []
    for _ in range(rand.randint(1, settings.max_ctes_per_with)):
        cte_name = f"cte{ctx.cte_counter.take()}"
        want_recursive = (
            FEATURE_RECURSIVE_CTE in features
            and rand.random() < settings.p_recursive_when_cte
        )
        if want_recursive:
            # gen_recursive_cte_def registers the name itself (its
            # recursive arm needs it visible while generating), so it
            # must not be registered a second time here.
            definition, _unused_columns = gen_recursive_cte_def(ctx, cte_name)
        else:
            definition, cte_columns = gen_cte_def(ctx, cte_name)
            # Register in the caller's scope so subsequent CTEs and the
            # main body can resolve `cte_name`.
            ctx.scope.add_cte(cte_name, cte_columns)
        cte_defs.append(definition)
    return tuple(cte_defs)
199
+
200
+
201
def _gen_non_aggregate_select(
    ctx: GenContext,
    *,
    with_ctes: tuple[CteDef, ...] = (),
) -> Select:
    """Generate a SELECT without GROUP BY / HAVING.

    Shape: targets / FROM / optional WHERE / optional ORDER BY /
    optional LIMIT, prefixed by `with_ctes` when the caller generated
    a WITH clause.

    Args:
        ctx: Generation context. `ctx.scope` is mutated by the FROM
            clause generation.
        with_ctes: CTE definitions to attach to the statement; they are
            passed through unchanged.

    Returns:
        The assembled `Select`.
    """
    cfg = ctx.config
    rng = ctx.rng

    # FROM first: it populates the scope that every later expression
    # draws its column references from.
    from_ = _gen_from_clause(ctx)

    # ---- SELECT list ---------------------------------------------------
    # Window functions are enabled for the SELECT list only.
    select_ctx = replace(ctx, allow_window=True)
    n_targets = rng.randint(1, cfg.max_select_items)
    targets: list[SelectTarget] = [
        SelectTarget(expr=gen_expr(select_ctx, _pick_select_type(rng)))
        for _ in range(n_targets)
    ]

    # ---- WHERE ---------------------------------------------------------
    where: Expr | None = None
    if FEATURE_WHERE in cfg.feature_flags and rng.random() < cfg.p_where:
        # WHERE may contain neither window functions nor aggregates.
        # Enforce both here rather than trusting the incoming ctx: a ctx
        # derived from a SELECT-list parent may carry allow_window=True.
        # When the flags are already off this is a no-op copy.
        where_ctx = replace(ctx, allow_window=False, allow_aggregates=False)
        where = gen_expr(where_ctx, BOOL)

    # ---- ORDER BY / LIMIT ----------------------------------------------
    order_by = _maybe_gen_order_by(ctx, targets)
    limit = _maybe_gen_limit(ctx)

    return Select(
        targets=tuple(targets),
        from_=from_,
        with_ctes=with_ctes,
        where=where,
        order_by=order_by,
        limit=limit,
    )
246
+
247
+
248
def _gen_aggregate_select(
    ctx: GenContext,
    *,
    with_ctes: tuple[CteDef, ...] = (),
) -> Select:
    """Generate an aggregating SELECT with an explicit GROUP BY.

    Every non-aggregate SELECT-list item is one of the chosen GROUP BY
    column references, so PostgreSQL's "must appear in GROUP BY or be
    used in an aggregate" rule is satisfied by construction. HAVING,
    when emitted, comes from `_gen_having_expr`.

    Implicit-single-group aggregates (no GROUP BY, all SELECT items
    aggregates) are deliberately not generated here.

    Args:
        ctx: Generation context. `ctx.scope` is mutated by the FROM
            clause generation.
        with_ctes: CTE definitions to attach to the statement.

    Returns:
        The assembled `Select`. If the scope exposes no columns after
        the FROM clause is built, a non-aggregate SELECT carrying the
        same `with_ctes` is returned instead.
    """
    cfg = ctx.config
    rng = ctx.rng
    flags = cfg.feature_flags

    from_ = _gen_from_clause(ctx)

    # ---- GROUP BY: pick K column refs from the populated scope ---------
    visible = ctx.scope.visible_columns()
    if not visible:
        # Defensive fallback: nothing to group on. The WITH clause has
        # already been generated and its CTEs are registered in scope,
        # so it must be carried through; dropping it would leave any
        # `FROM cteN` reference in the fallback query undefined.
        return _gen_non_aggregate_select(
            replace(ctx, allow_aggregates=False),
            with_ctes=with_ctes,
        )

    n_group = rng.randint(1, min(cfg.max_group_by_items, len(visible)))
    chosen_bindings = rng.sample(visible, n_group)
    # The same ColumnRef objects are used both as GROUP BY items and as
    # the grouped SELECT-list items, which is what keeps the two
    # clauses verbatim-consistent.
    grouped_exprs: tuple[Expr, ...] = tuple(
        ColumnRef(b.type, b.table_alias, b.column)
        for b in chosen_bindings
    )

    # Optional grouping-set extension: the whole GROUP BY becomes one
    # ROLLUP / CUBE / GROUPING SETS construct. The clause is therefore
    # either the plain column tuple or a one-element tuple holding a
    # GroupingSet.
    group_by_clause: tuple[Expr | GroupingSet, ...] = grouped_exprs
    if (FEATURE_GROUPING_SET in flags
            and rng.random() < cfg.p_grouping_set):
        group_by_clause = (_gen_grouping_set(ctx, grouped_exprs),)

    # ---- SELECT list: mix grouped exprs and aggregates ----------------
    # Window functions are enabled for the SELECT list only.
    select_ctx = replace(ctx, allow_window=True)
    n_targets = rng.randint(1, cfg.max_select_items)
    targets: list[SelectTarget] = []
    for _ in range(n_targets):
        # Even split per item between a grouped column and an aggregate
        # call.
        if rng.random() < 0.5:
            e = rng.choice(grouped_exprs)
        else:
            e = _gen_aggregate_funccall(select_ctx)
        targets.append(SelectTarget(expr=e))

    # ---- WHERE: aggregates and window functions both forbidden --------
    where: Expr | None = None
    if FEATURE_WHERE in flags and rng.random() < cfg.p_where:
        # Enforce both restrictions locally instead of assuming the
        # incoming ctx already has allow_window=False.
        where_ctx = replace(ctx, allow_aggregates=False, allow_window=False)
        where = gen_expr(where_ctx, BOOL)

    # ---- HAVING --------------------------------------------------------
    having: Expr | None = None
    if FEATURE_HAVING in flags and rng.random() < cfg.p_having:
        having = _gen_having_expr(ctx)

    # ---- ORDER BY: drawn from the SELECT list, so it stays consistent
    # with the GROUP BY ---------------------------------------------------
    order_by = _maybe_gen_order_by(ctx, targets)

    # ---- LIMIT ---------------------------------------------------------
    limit = _maybe_gen_limit(ctx)

    return Select(
        targets=tuple(targets),
        from_=from_,
        with_ctes=with_ctes,
        where=where,
        group_by=group_by_clause,
        having=having,
        order_by=order_by,
        limit=limit,
    )
345
+
346
+
347
+ # ---- Shared sub-generators -------------------------------------------------
348
+
349
def _gen_from_clause(ctx: GenContext) -> tuple[FromItem, ...]:
    """Choose the FROM items, register them in scope, return the FROM tuple.

    Items are created strictly left to right, and each one is added to
    `ctx.scope` (by `_make_from_item`) before the next is built. On the
    explicit-JOIN path the ON condition of a join is generated right
    after its right-hand item is registered, so it can only refer to
    items at or before that position — never to a table that joins
    later. The comma-FROM path registers items in the same incremental
    way but emits no ON conditions.

    Returns:
        A one-element tuple holding a left-deep `JoinExpr` tree on the
        explicit-JOIN path, otherwise one entry per FROM item.

    Raises:
        ValueError: from `random.randint` if the schema has no tables
            or `max_from_items` is below 1 (empty range).

    Side effects: mutates `ctx.scope` and draws alias indices from
    `ctx.alias_counter`.
    """
    settings = ctx.config
    rand = ctx.rng

    # DETERMINISM: sampling relies on `ctx.schema.tables` being an
    # ordered sequence; an unordered container here would break the
    # seed → output guarantee.
    tables = ctx.schema.tables
    n_from = rand.randint(1, min(settings.max_from_items, len(tables)))
    picked = rand.sample(tables, n_from)

    # Reserve `n_from` consecutive alias indices from the query-wide
    # counter and name the items t<base>, t<base+1>, ...
    base = ctx.alias_counter.take(n_from)
    aliases = [f"t{base + offset}" for offset in range(n_from)]

    # Short-circuit order matters: the roll is only consumed when there
    # is more than one item and INNER JOIN is unlocked.
    explicit_join = (
        n_from > 1
        and FEATURE_INNER_JOIN in settings.feature_flags
        and rand.random() < settings.p_explicit_join
    )

    if not explicit_join:
        # Comma-FROM: build items in order, keep only the FROM nodes.
        return tuple(
            _make_from_item(ctx, item_alias, table, position=pos)[0]
            for pos, (item_alias, table) in enumerate(zip(aliases, picked))
        )

    # Explicit JOINs: grow a left-deep tree one right-hand item at a time.
    join_tree, _ = _make_from_item(ctx, aliases[0], picked[0], position=0)
    for pos in range(1, n_from):
        right_item, right_base = _make_from_item(
            ctx, aliases[pos], picked[pos], position=pos,
        )
        join_kind = _pick_join_kind(ctx)
        # Scope now holds items 0..pos, which is exactly what this ON
        # condition is allowed to see.
        condition = _gen_join_condition(ctx, aliases[pos], right_base)
        join_tree = JoinExpr(
            left=join_tree, right=right_item, kind=join_kind, on=condition,
        )
    return (join_tree,)
429
+
430
+
431
def _make_from_item(
    ctx: GenContext,
    alias: str,
    table: Table,
    *,
    position: int,
) -> tuple[FromItem, Optional[Table]]:
    """Build a single FROM item under `alias` and register it in scope.

    The item kind is decided in this priority order:

    1. CTE reference — needs FEATURE_CTE, at least one visible CTE and
       a successful roll against `p_cte_in_from`.
    2. Derived table — needs FEATURE_DERIVED_TABLE, remaining subquery
       depth and a successful roll against `p_derived_table_in_from`.
       It is LATERAL only when `position > 0`, FEATURE_LATERAL is
       unlocked and the `p_lateral_when_derived` roll succeeds.
    3. Base table — the fallback, using `table`.

    Returns:
        `(item, base_table)`, where `base_table` is `table` for a
        base-table item and `None` for CTE references and derived
        tables.

    The item is added to `ctx.scope` before returning, so the next FROM
    position sees it as a preceding sibling.
    """
    settings = ctx.config
    rand = ctx.rng
    features = settings.feature_flags

    # ---- 1. CTE reference ---------------------------------------------
    if (FEATURE_CTE in features
            and ctx.scope.has_visible_ctes()
            and rand.random() < settings.p_cte_in_from):
        picked_cte = rand.choice(ctx.scope.visible_cte_names())
        picked_columns = ctx.scope.lookup_cte(picked_cte)
        # The name came from the visible pool, so the lookup must
        # succeed; this is a generator-bug tripwire.
        assert picked_columns is not None
        # Bind the CTE's columns under the local alias.
        ctx.scope.add_derived(alias, picked_columns)
        return CteRef(cte_name=picked_cte, alias=alias), None

    # ---- 2. Derived table ---------------------------------------------
    if (FEATURE_DERIVED_TABLE in features
            and not ctx.at_subquery_leaf()  # inner SELECT needs depth budget
            and rand.random() < settings.p_derived_table_in_from):
        is_lateral = (
            position > 0
            and FEATURE_LATERAL in features
            and rand.random() < settings.p_lateral_when_derived
        )
        derived = gen_derived_table(ctx, alias, lateral=is_lateral)
        # NOTE(review): every un-aliased target falls back to the same
        # name "c1"; presumably gen_derived_table always aliases its
        # targets — confirm, otherwise two columns would share a name.
        out_columns = []
        for target in derived.select.targets:
            column_name = "c1" if target.alias is None else target.alias
            out_columns.append((column_name, target.expr.pg_type))
        ctx.scope.add_derived(alias, out_columns)
        return derived, None

    # ---- 3. Base table ------------------------------------------------
    ctx.scope.add_table(alias, table)
    return TableRef(table.name, alias), table
508
+
509
+
510
def _maybe_gen_order_by(
    ctx: GenContext,
    targets: list[SelectTarget],
) -> tuple[OrderByItem, ...]:
    """Optionally build an ORDER BY from a subset of the SELECT targets.

    Returns an empty tuple when FEATURE_ORDER_BY is not unlocked, when
    there are no targets, or when the roll against `p_order_by` fails.
    Otherwise a random non-empty subset of the targets is picked, kept
    in SELECT-list order, and each gets a random ASC/DESC direction.
    The ORDER BY items reuse the targets' expression objects.
    """
    settings = ctx.config
    rand = ctx.rng

    if FEATURE_ORDER_BY not in settings.feature_flags:
        return ()
    if not targets:
        return ()
    if rand.random() >= settings.p_order_by:
        return ()

    total = len(targets)
    how_many = rand.randint(1, total)
    positions = rand.sample(range(total), how_many)
    positions.sort()  # keep ORDER BY items in SELECT-list order

    items: list[OrderByItem] = []
    for pos in positions:
        direction = rand.choice(("ASC", "DESC"))
        items.append(OrderByItem(expr=targets[pos].expr, direction=direction))
    return tuple(items)
529
+
530
+
531
def _maybe_gen_limit(ctx: GenContext) -> Expr | None:
    """Optionally produce a LIMIT literal.

    Returns an INT4 literal drawn from `_LIMIT_VALUES` when
    FEATURE_LIMIT is unlocked and the roll against `p_limit` succeeds;
    otherwise `None`.
    """
    settings = ctx.config
    unlocked = FEATURE_LIMIT in settings.feature_flags
    # The roll is only consumed when the feature is unlocked.
    if not unlocked or not ctx.rng.random() < settings.p_limit:
        return None
    return Literal(INT4, ctx.rng.choice(_LIMIT_VALUES))
536
+
537
+
538
+ # ---- Grouping-set construction --------------------------------------------
539
+
540
def _gen_grouping_set(
    ctx: GenContext,
    grouped_exprs: tuple[Expr, ...],
) -> GroupingSet:
    """Wrap `grouped_exprs` in a ROLLUP, CUBE, or GROUPING SETS construct.

    The keyword is chosen uniformly among the three.

    Element shape per kind:

    * ROLLUP / CUBE: each grouped expr becomes its own single-expr
      element, e.g. `ROLLUP (a, b, c)`.
    * GROUPING SETS: between 2 and min(4, 2^N) random subsets of
      `grouped_exprs` (N = number of grouped exprs), possibly including
      the empty set `()`. Duplicate sets are accepted. Output looks
      like `GROUPING SETS ((a, b), (c), ())`.

    Invariant: every expression in `grouped_exprs` appears in at least
    one element. The caller may place any grouped expression in the
    SELECT list, and PostgreSQL rejects a SELECT-list column that is in
    none of the grouping sets ("must appear in the GROUP BY clause or
    be used in an aggregate function"). Independent random subsets do
    not guarantee that on their own, so uncovered expressions are
    folded into the first set afterwards.

    Multi-expr elements for ROLLUP/CUBE (e.g. `ROLLUP ((a, b), c)`) are
    not generated here.

    Args:
        ctx: Generation context; only `ctx.rng` is used.
        grouped_exprs: The GROUP BY expressions to wrap.

    Returns:
        A `GroupingSet` with the chosen kind and elements.
    """
    rng = ctx.rng
    kind = rng.choice(("ROLLUP", "CUBE", "GROUPING SETS"))

    # Annotated explicitly so a type checker accepts both the
    # single-expr inner tuples of ROLLUP/CUBE and the variable-length
    # inner tuples of GROUPING SETS.
    elements: tuple[tuple[Expr, ...], ...]
    if kind in ("ROLLUP", "CUBE"):
        elements = tuple((expr,) for expr in grouped_exprs)
    else:  # GROUPING SETS
        max_n = min(4, 2 ** len(grouped_exprs))
        n_sets = rng.randint(2, max_n) if max_n >= 2 else 2
        sets: list[tuple[Expr, ...]] = []
        for _ in range(n_sets):
            n = rng.randint(0, len(grouped_exprs))
            sample = rng.sample(grouped_exprs, n) if n > 0 else []
            sets.append(tuple(sample))

        # Coverage repair. Uses no RNG draws, so seeds whose subsets
        # already covered every expression produce unchanged output.
        # Membership uses `==`, avoiding any hashability assumption.
        uncovered = tuple(
            expr for expr in grouped_exprs
            if not any(expr in one_set for one_set in sets)
        )
        if uncovered:
            sets[0] = sets[0] + uncovered
        elements = tuple(sets)

    return GroupingSet(kind=kind, elements=elements)
587
+
588
+
589
+ # ---- Aggregate construction helpers ---------------------------------------
590
+
591
def _gen_aggregate_funccall(ctx: GenContext) -> FuncCall:
    """Pick a random aggregate from the catalog and build a call to it.

    Candidates are every entry of `ctx.catalog.functions` whose kind is
    `FuncKind.AGGREGATE`, so a name listed several times (one entry per
    overload) is proportionally more likely to be picked.

    Construction rules, in order:

    * Names in `ORDERED_SET_AGGREGATES` are delegated to
      `gen_ordered_set_agg`.
    * A FILTER predicate is generated when `should_emit_filter` says
      so; it is attached to whichever form follows.
    * `count` may be emitted as `count(*)` when
      `should_emit_count_star` says so.
    * Otherwise one argument per declared parameter type is generated
      on a descended context with `in_aggregate=True` and
      `allow_window=False`, and passed through `coerce_to_param_type`.
    """
    rand = ctx.rng
    candidates = [
        fn for fn in ctx.catalog.functions
        if fn.kind == FuncKind.AGGREGATE
    ]
    chosen = rand.choice(candidates)

    # Ordered-set aggregates have their own builder; handle them before
    # anything else consumes randomness.
    if chosen.name in ORDERED_SET_AGGREGATES:
        return gen_ordered_set_agg(ctx, chosen)

    # Decide on FILTER up front so it composes with the star form too.
    filter_clause = None
    if should_emit_filter(rand):
        filter_clause = gen_filter_predicate(ctx)

    if chosen.name == "count" and should_emit_count_star(rand):
        return FuncCall(
            chosen.returns, "count", (), star=True, filter_=filter_clause,
        )

    # Arguments must not contain nested aggregates or window functions,
    # even if the incoming ctx (e.g. a SELECT-list ctx) allowed windows.
    inner_ctx = replace(ctx.descend(), in_aggregate=True, allow_window=False)
    call_args = []
    for param_type in chosen.args:
        raw_arg = gen_expr(inner_ctx, param_type)
        call_args.append(coerce_to_param_type(raw_arg, param_type))
    return FuncCall(
        chosen.returns, chosen.name, tuple(call_args), filter_=filter_clause,
    )
638
+
639
+
640
def _gen_having_expr(ctx: GenContext) -> Expr:
    """Build a HAVING predicate of the form `aggregate OP literal`.

    Only aggregates whose return type T has at least one catalog
    operator `T OP T -> BOOL` are eligible, so an operator is always
    available for the comparison. The right-hand side is a literal of
    the same type, produced by `gen_literal`.

    Returns:
        A BOOL `BinaryOp`, or the literal `TRUE` when the catalog has
        no eligible aggregate at all.
    """
    rand = ctx.rng
    catalog = ctx.catalog
    bool_ops = catalog.binary_ops_returning(BOOL)

    # Types usable on both sides of some BOOL-returning operator. A set
    # is safe here because it is only used for membership tests, never
    # iterated.
    self_comparable = set()
    for candidate_op in bool_ops:
        if candidate_op.left == candidate_op.right:
            self_comparable.add(candidate_op.left)

    eligible = [
        fn for fn in catalog.functions
        if fn.kind == FuncKind.AGGREGATE and fn.returns in self_comparable
    ]
    if not eligible:
        # No comparable aggregate exists: emit a predicate that filters
        # nothing.
        return Literal(BOOL, True)

    chosen = rand.choice(eligible)

    lhs: Expr
    if chosen.name in ORDERED_SET_AGGREGATES:
        # Ordered-set aggregates go through their dedicated builder; no
        # FILTER is attached on this path.
        lhs = gen_ordered_set_agg(ctx, chosen)
    else:
        filter_clause = None
        if should_emit_filter(rand):
            filter_clause = gen_filter_predicate(ctx)

        if chosen.name == "count" and should_emit_count_star(rand):
            lhs = FuncCall(
                chosen.returns, "count", (), star=True, filter_=filter_clause,
            )
        else:
            # No nested aggregates or windows inside the arguments.
            inner_ctx = replace(
                ctx.descend(), in_aggregate=True, allow_window=False,
            )
            call_args = []
            for param_type in chosen.args:
                raw_arg = gen_expr(inner_ctx, param_type)
                call_args.append(coerce_to_param_type(raw_arg, param_type))
            lhs = FuncCall(
                chosen.returns, chosen.name, tuple(call_args),
                filter_=filter_clause,
            )

    usable_ops = [
        candidate_op for candidate_op in bool_ops
        if candidate_op.left == chosen.returns
        and candidate_op.right == chosen.returns
    ]
    picked_op = rand.choice(usable_ops)
    # The filter above pins picked_op.right to chosen.returns; the
    # assert only narrows the type for the checker.
    assert picked_op.right is not None
    rhs = gen_literal(ctx.rng, picked_op.right)
    return BinaryOp(BOOL, picked_op.symbol, lhs, rhs)
710
+
711
+
712
+ # ===========================================================================
713
+ # Internals: JOIN-tree construction and ON-condition FK biasing
714
+ # ===========================================================================
715
+
716
+ # (formerly: _build_join_tree and _alias_of — removed when
717
+ # _gen_from_clause was refactored to interleave scope-population with
718
+ # ON-condition generation, eliminating the forward-reference bug
719
+ # where ON for `t1 JOIN t2` could reference t3+ that wasn't yet in
720
+ # scope at parse-analysis time.)
721
+
722
+
723
def _pick_join_kind(ctx: GenContext) -> str:
    """Choose the join kind for one explicit JOIN.

    Returns "LEFT" when FEATURE_LEFT_JOIN is unlocked and the roll
    against `p_left_join_when_explicit` succeeds, otherwise "INNER".
    No other join kind is produced by this function.
    """
    settings = ctx.config
    left_unlocked = FEATURE_LEFT_JOIN in settings.feature_flags
    # The roll is only consumed when LEFT is actually an option.
    if left_unlocked and ctx.rng.random() < settings.p_left_join_when_explicit:
        return "LEFT"
    return "INNER"
735
+
736
+
737
def _gen_join_condition(
    ctx: GenContext,
    right_alias: str,
    right_table: Optional[Table],
) -> Expr:
    """Generate the ON condition that joins `right_alias` to the scope.

    Strategy:

    1. When the right side is a base table (`right_table` is not
       None), collect single-column foreign keys linking it to any
       other aliased base table, in either direction. Each one yields
       a candidate `(left_alias, left_col, right_alias, right_col)`.
    2. If any candidate exists, pick one at random and return the
       equality `left_alias.left_col = right_alias.right_col`.
    3. Otherwise fall back to a random BOOL expression generated with
       aggregates disabled.

    Foreign keys with more than one column on either side are skipped.
    """
    existing = [
        (other_alias, other_table)
        for other_alias, other_table in ctx.scope.aliased_tables()
        if other_alias != right_alias
    ]

    # Collection order is fixed (right→existing FKs first, then
    # existing→right); the rng.choice below is the only place
    # randomness enters the FK-bias decision.
    fk_pairs: list[tuple[str, str, str, str]] = []
    if right_table is not None:
        # FKs declared on the new right table that point at a table
        # already in the join.
        for fk in right_table.foreign_keys:
            if len(fk.columns) == 1 and len(fk.ref_columns) == 1:
                fk_pairs.extend(
                    (other_alias, fk.ref_columns[0], right_alias, fk.columns[0])
                    for other_alias, other_table in existing
                    if fk.ref_table == other_table.name
                )

        # FKs declared on an already-present table that point at the
        # new right table.
        for other_alias, other_table in existing:
            fk_pairs.extend(
                (other_alias, fk.columns[0], right_alias, fk.ref_columns[0])
                for fk in other_table.foreign_keys
                if len(fk.columns) == 1
                and len(fk.ref_columns) == 1
                and fk.ref_table == right_table.name
            )

    if not fk_pairs:
        # No FK to lean on. Aggregates are never valid in an ON clause,
        # so they are disabled here regardless of the caller's ctx.
        return gen_expr(replace(ctx, allow_aggregates=False), BOOL)

    lhs_alias, lhs_col, rhs_alias, rhs_col = ctx.rng.choice(fk_pairs)
    lhs_table = ctx.scope.lookup_alias(lhs_alias)
    rhs_table = ctx.scope.lookup_alias(rhs_alias)
    # Both aliases were taken from the scope above, so neither lookup
    # is expected to miss; tripwire for generator bugs.
    assert lhs_table is not None and rhs_table is not None
    lhs_type = lhs_table.column(lhs_col).type
    rhs_type = rhs_table.column(rhs_col).type
    return BinaryOp(
        BOOL, "=",
        ColumnRef(lhs_type, lhs_alias, lhs_col),
        ColumnRef(rhs_type, rhs_alias, rhs_col),
    )
819
+
820
+
821
+ # ===========================================================================
822
+ # Internals: target-type picking
823
+ # ===========================================================================
824
+
825
def _pick_select_type(rng: random.Random) -> PgType:
    """Draw one SELECT-list target type, weighted by `_SELECT_TYPE_WEIGHTS`."""
    # Split the (type, weight) pairs into two parallel sequences.
    candidate_types, candidate_weights = zip(*_SELECT_TYPE_WEIGHTS)
    (picked,) = rng.choices(candidate_types, weights=candidate_weights, k=1)
    return picked
829
+
830
+
831
# Only the top-level generator is exported; the underscore-prefixed
# helpers in this module are not part of `__all__`.
__all__ = ["gen_select"]