waxsql 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waxsql/gen/window.py ADDED
@@ -0,0 +1,398 @@
1
+ """Window-spec generator for OVER clauses.
2
+
3
+ Role: builds the contents of an OVER (...) clause attached to a
4
+ windowed function call. Called from `gen_expr`'s window branch
5
+ (`kind == "window"`) and consumed by the printer to render the
6
+ spec inline OR — after `hoist_named_windows` — as a `WINDOW name AS
7
+ (...)` reference. Windows are only valid in SELECT-list and ORDER BY
8
+ positions; `gen_select` enforces this with the `allow_window` flag,
9
+ which is False everywhere else.
10
+
11
+ One public entry point: `gen_window_spec(ctx)` builds a WindowSpec
12
+ with optional PARTITION BY, ORDER BY, and frame clause.
13
+
14
+ Window specs are intentionally simple:
15
+
16
+ * PARTITION BY contents are column refs only — picked from
17
+ `ctx.scope.visible_columns()`. Arbitrary expressions are
18
+ syntactically valid but rare in real SQL; complex window
19
+ specs add visual clutter without exercising new generator
20
+ paths.
21
+
22
+ * ORDER BY in the spec is also column refs only, with random
23
+ ASC/DESC.
24
+
25
+ * Frame clauses use ROWS, RANGE, or GROUPS with BETWEEN bounds.
26
+ `_gen_frame` only picks combinations valid for the spec's
27
+ ORDER BY: ROWS and offset-free RANGE are always valid, GROUPS
28
+ needs an ORDER BY, and RANGE with offsets needs exactly one
29
+ numeric ORDER BY column. An EXCLUDE clause (CURRENT ROW /
30
+ GROUP / TIES / NO OTHERS) is attached with low probability.
31
+
32
+ The probability gates (`p_partition_by`, `p_order_by_in_window`,
33
+ `p_window_frame`) fire independently — all can be False, producing
34
+ the empty `OVER ()` form which PG accepts (entire result set as
35
+ one partition).
36
+ """
37
+ from __future__ import annotations
38
+
39
+ from dataclasses import replace
40
+
41
+ from ..ast import (
42
+ BinaryOp, Cast, ColumnRef, Expr, FrameBound, FrameClause, FuncCall,
43
+ Literal, NamedWindow, OrderByItem, Select, UnaryOp, WindowRef,
44
+ WindowSpec,
45
+ )
46
+ from ..context import GenContext
47
+ from ..types import FLOAT8, INT4, INT8, NUMERIC, PgType
48
+
49
+
50
# Frame bound choices, split by their valid position in the BETWEEN
# range. PG requires the start to come no later than the end in this
# implicit ordering: UNBOUNDED PRECEDING < N PRECEDING < CURRENT ROW
# < N FOLLOWING < UNBOUNDED FOLLOWING. `_START_KINDS` and `_END_KINDS`
# share only "current_row": every start kind sits at or before CURRENT
# ROW and every end kind at or after it, so any (start, end) pair drawn
# from them is valid without a post-pick ordering check.
_START_KINDS: tuple[str, ...] = (
    "unbounded_preceding", "preceding", "current_row",
)
_END_KINDS: tuple[str, ...] = (
    "current_row", "following", "unbounded_following",
)

# Small integer offsets for `N PRECEDING` / `N FOLLOWING`. Kept small
# so generated frames look realistic (centered moving averages with
# huge offsets are rare in practice).
_OFFSETS: tuple[int, ...] = (1, 2, 3, 5, 10)

# EXCLUDE clause body choices. PG accepts these four; "NO OTHERS" is
# the default (semantically the same as omitting the clause), but it is
# still generated sometimes for grammar coverage.
_EXCLUDE_KINDS: tuple[str, ...] = (
    "CURRENT ROW", "GROUP", "TIES", "NO OTHERS",
)

# Probability of attaching an EXCLUDE clause to a generated frame.
# Most real frames omit EXCLUDE entirely; biased moderate-low so
# the path gets exercised without dominating output.
_P_FRAME_EXCLUDE: float = 0.25

# Numeric types eligible to back a RANGE-with-offset frame. PG allows
# any type with `+`/`-` against the offset's type, but this generator
# only produces integer offsets via _OFFSETS, so these are the safe
# ORDER BY column types. Adding INTERVAL+TIMESTAMPTZ would require
# generating INTERVAL offsets — deferred.
_RANGE_OFFSET_NUMERIC_TYPES: frozenset[PgType] = frozenset({INT4, INT8, NUMERIC, FLOAT8})

# Frame-bound subsets with the offset kinds ("preceding"/"following")
# removed. Used for RANGE frames without offsets, which are valid for
# any ORDER BY; RANGE with offsets needs a single numeric ORDER BY
# column and draws from the full pools above instead.
_START_KINDS_NO_OFFSET: tuple[str, ...] = (
    "unbounded_preceding", "current_row",
)
_END_KINDS_NO_OFFSET: tuple[str, ...] = (
    "current_row", "unbounded_following",
)
97
+
98
+
99
def gen_window_spec(ctx: GenContext) -> WindowSpec:
    """Build a random WindowSpec for an OVER clause.

    PARTITION BY, ORDER BY and the frame clause are each gated by an
    independent probability read from `ctx.config` (`p_partition_by`,
    `p_order_by_in_window`, `p_window_frame`). When every gate misses
    the result is an empty spec, which renders as `OVER ()`.

    PARTITION BY and ORDER BY entries are plain column references
    picked with `rng.sample` from `ctx.scope.visible_columns()`; each
    ORDER BY item also gets a random ASC/DESC direction. The frame, when
    present, is produced by `_gen_frame`, which is handed the ORDER BY
    it has to be compatible with.

    If the scope exposes no visible columns an empty `WindowSpec()` is
    returned immediately, before any random draw is made.
    """
    config, rng = ctx.config, ctx.rng

    columns = ctx.scope.visible_columns()
    if not columns:
        # Guard for contexts built by hand (e.g. in tests) that have
        # no columns in scope: fall back to the empty `OVER ()` form.
        return WindowSpec()

    def draw(limit: int) -> list:
        # Pick 1..limit distinct bindings, capped by what is in scope.
        count = min(rng.randint(1, limit), len(columns))
        return rng.sample(columns, count)

    # NOTE: the sequence of RNG draws below (gate, count, sample,
    # directions, gate, frame) determines the output for a given seed;
    # keep it stable.
    partition_by: tuple[ColumnRef, ...] = ()
    if rng.random() < config.p_partition_by:
        partition_by = tuple(
            ColumnRef(col.type, col.table_alias, col.column)
            for col in draw(config.max_partition_by_items)
        )

    order_by: tuple[OrderByItem, ...] = ()
    if rng.random() < config.p_order_by_in_window:
        items = []
        for col in draw(config.max_order_by_in_window_items):
            ref = ColumnRef(col.type, col.table_alias, col.column)
            items.append(
                OrderByItem(expr=ref, direction=rng.choice(("ASC", "DESC")))
            )
        order_by = tuple(items)

    frame = None
    if rng.random() < config.p_window_frame:
        frame = _gen_frame(ctx, order_by)

    return WindowSpec(
        partition_by=partition_by,
        order_by=order_by,
        frame=frame,
    )
160
+
161
+
162
+ # CONSTRAINT: frame syntax depends on the presence and type of the
163
+ # parent spec's ORDER BY. This isn't optional — PG rejects mismatched
164
+ # combos at parse-analysis. The (unit, allow_offset_bounds) option
165
+ # list built inside `_gen_frame` is the explicit encoding of those
166
+ # rules; expanding it later (INTERVAL offsets for TIMESTAMPTZ, etc.)
167
+ # means also expanding `_RANGE_OFFSET_NUMERIC_TYPES` consistently.
168
def _gen_frame(
    ctx: GenContext,
    order_by: tuple[OrderByItem, ...],
) -> FrameClause:
    """Return a frame clause that is valid for the given `order_by`.

    The frame unit and whether offset bounds (`N PRECEDING` /
    `N FOLLOWING`) may be used are chosen together, from a candidate
    list assembled according to these rules:

      * ROWS with offsets: always a candidate.
      * RANGE without offsets: always a candidate.
      * GROUPS with offsets: only when `order_by` is non-empty.
      * RANGE with offsets: only when `order_by` has exactly one item
        and that item's type is in `_RANGE_OFFSET_NUMERIC_TYPES`
        (offsets generated here are integer literals).

    One candidate is picked uniformly. Start and end bounds are then
    drawn from the full kind pools or from the offset-free pools to
    match, and with probability `_P_FRAME_EXCLUDE` an EXCLUDE body is
    chosen uniformly from `_EXCLUDE_KINDS`.
    """
    rng = ctx.rng

    # Each candidate is (unit, offsets_allowed).
    candidates: list[tuple[str, bool]] = [
        ("ROWS", True),
        ("RANGE", False),
    ]
    if order_by:
        lead_type = order_by[0].expr.pg_type
        # GROUPS counts peer groups, so any ORDER BY is enough.
        candidates.append(("GROUPS", True))
        # RANGE offsets are arithmetic on the sort key: one numeric column only.
        if len(order_by) == 1 and lead_type in _RANGE_OFFSET_NUMERIC_TYPES:
            candidates.append(("RANGE", True))

    unit, offsets_allowed = rng.choice(candidates)
    if offsets_allowed:
        start_pool, end_pool = _START_KINDS, _END_KINDS
    else:
        start_pool, end_pool = _START_KINDS_NO_OFFSET, _END_KINDS_NO_OFFSET

    lower = _gen_bound(rng, kinds=start_pool)
    upper = _gen_bound(rng, kinds=end_pool)

    # Roll the gate first; an EXCLUDE body is only drawn when it passes.
    exclude = None
    if rng.random() < _P_FRAME_EXCLUDE:
        exclude = rng.choice(_EXCLUDE_KINDS)

    return FrameClause(unit=unit, start=lower, end=upper, exclude=exclude)
230
+
231
+
232
def _gen_bound(rng, *, kinds: tuple[str, ...]) -> FrameBound:
    """Pick one bound kind from `kinds` and wrap it in a FrameBound.

    The "preceding" and "following" kinds are given an INT4 literal
    offset chosen from `_OFFSETS`; every other kind is built without an
    offset. The original author notes that FrameBound validates this
    kind/offset pairing on construction, so a mismatch would fail
    loudly rather than produce bad SQL.
    """
    picked = rng.choice(kinds)
    if picked not in ("preceding", "following"):
        return FrameBound(kind=picked)
    magnitude = Literal(INT4, rng.choice(_OFFSETS))
    return FrameBound(kind=picked, offset=magnitude)
243
+
244
+
245
+ # ===========================================================================
246
+ # Named-window hoisting (post-pass deduplication)
247
+ # ===========================================================================
248
+ #
249
+ # When the SELECT-list (or HAVING) generation produces multiple windowed
250
+ # aggregates with structurally-identical WindowSpecs, hoisting the
251
+ # common spec into a WINDOW clause and replacing inline OVER (...) with
252
+ # OVER name produces the canonical PG idiom for spec deduplication.
253
+ # Generators don't try to produce duplicates intentionally; this catches
254
+ # the natural ones that emerge from random window-spec generation
255
+ # (more likely with small scopes and biased PARTITION BY weights).
256
+ #
257
+ # We deliberately don't recurse into Subquery/Exists/InSubquery bodies —
258
+ # named windows are scoped to their own SELECT, and inner SELECTs run
259
+ # their own hoisting independently.
260
+
261
def _collect_window_specs(expr: Expr) -> list[WindowSpec]:
    """List every inline WindowSpec attached to a FuncCall inside `expr`.

    Traversal is pre-order and left-to-right: a call's own `over` comes
    first, then its arguments in order, then its FILTER expression.
    BinaryOp, UnaryOp and Cast nodes are descended into; every other
    node type (subqueries, literals, column refs, ...) is treated as a
    leaf, so specs belonging to nested SELECTs are not reported.
    `over` values that are not a WindowSpec (e.g. a WindowRef) are
    skipped.
    """

    def specs_in(node: Expr):
        if isinstance(node, FuncCall):
            if isinstance(node.over, WindowSpec):
                yield node.over
            for arg in node.args:
                yield from specs_in(arg)
            if node.filter_ is not None:
                yield from specs_in(node.filter_)
        elif isinstance(node, BinaryOp):
            yield from specs_in(node.left)
            yield from specs_in(node.right)
        elif isinstance(node, UnaryOp):
            yield from specs_in(node.operand)
        elif isinstance(node, Cast):
            yield from specs_in(node.expr)
        # Anything else is a leaf for this pass: nothing to yield.

    return list(specs_in(expr))
289
+
290
+
291
def _replace_window_specs(
    expr: Expr,
    mapping: dict[WindowSpec, str],
) -> Expr:
    """Return a copy of `expr` with mapped inline specs turned into refs.

    Any FuncCall whose `over` is a WindowSpec present in `mapping` gets
    `over` replaced by `WindowRef(mapping[spec])`. FuncCall arguments
    and FILTER expressions, BinaryOp operands, UnaryOp operands and
    Cast inner expressions are rebuilt recursively via
    `dataclasses.replace`. All other node types are returned as-is, so
    the walk never enters subquery bodies (the same boundary as
    `_collect_window_specs`).
    """

    def rebuild(node: Expr) -> Expr:
        if isinstance(node, FuncCall):
            # Explicitly typed as the wide union: it may start as a
            # spec and end as a ref.
            over: WindowSpec | WindowRef | None = node.over
            if isinstance(over, WindowSpec) and over in mapping:
                over = WindowRef(mapping[over])
            args = tuple(rebuild(arg) for arg in node.args)
            filter_ = None
            if node.filter_ is not None:
                filter_ = rebuild(node.filter_)
            return replace(node, args=args, over=over, filter_=filter_)
        if isinstance(node, BinaryOp):
            lhs = rebuild(node.left)
            rhs = rebuild(node.right)
            return replace(node, left=lhs, right=rhs)
        if isinstance(node, UnaryOp):
            return replace(node, operand=rebuild(node.operand))
        if isinstance(node, Cast):
            return replace(node, expr=rebuild(node.expr))
        return node

    return rebuild(expr)
323
+
324
+
325
+ # INVARIANT: this pass is idempotent — running it twice on the same
326
+ # Select produces the same Select (after the first pass the surviving
327
+ # WindowSpecs each appear once inline at most, so the spec_counts >= 2
328
+ # gate never fires the second time). Useful for tests that compose
329
+ # multiple post-passes; nothing currently exercises this property but
330
+ # it's a free correctness guarantee.
331
def hoist_named_windows(s: Select) -> Select:
    """Post-pass: move repeated inline window specs into a WINDOW clause.

    Scans the expressions of `s.targets` and, when present, `s.having`
    for inline WindowSpecs (via `_collect_window_specs`, which does not
    enter subquery bodies). Every spec that occurs two or more times is
    given a name `w1`, `w2`, ... in first-seen order, recorded as a
    NamedWindow, and each inline occurrence is replaced by a WindowRef
    to that name. Specs used only once stay inline.

    Returns `s` itself when nothing qualifies; otherwise a copy with
    `targets`, `having` and `windows` replaced.

    NOTE(review): only targets and HAVING are scanned. The module
    docstring states windows may also appear in ORDER BY; specs there
    would not be counted or rewritten by this pass — confirm whether
    that is intended.
    """
    # Grouping relies on WindowSpec being hashable with structural
    # equality (a frozen dataclass): distinct but identical specs must
    # land on the same dict key. dicts keep insertion order, so the
    # keys double as the first-seen ordering.
    occurrences: dict[WindowSpec, int] = {}

    hosts = [target.expr for target in s.targets]
    if s.having is not None:
        hosts.append(s.having)
    for host in hosts:
        for spec in _collect_window_specs(host):
            occurrences[spec] = occurrences.get(spec, 0) + 1

    repeated = [spec for spec, seen in occurrences.items() if seen >= 2]
    if not repeated:
        return s

    # Names follow first-seen order so output is deterministic for a
    # given RNG state.
    name_for: dict[WindowSpec, str] = {}
    declared = []
    for index, spec in enumerate(repeated, start=1):
        label = f"w{index}"
        name_for[spec] = label
        declared.append(NamedWindow(name=label, spec=spec))

    rewritten_targets = tuple(
        replace(target, expr=_replace_window_specs(target.expr, name_for))
        for target in s.targets
    )
    rewritten_having = None
    if s.having is not None:
        rewritten_having = _replace_window_specs(s.having, name_for)

    return replace(
        s,
        targets=rewritten_targets,
        having=rewritten_having,
        windows=tuple(declared),
    )
396
+
397
+
398
+ __all__ = ["gen_window_spec", "hoist_named_windows"]
waxsql/pretty.py ADDED
@@ -0,0 +1,81 @@
1
+ """Pretty-printing for generated SQL: reformat + optional terminal color.
2
+
3
+ Display-only transformation behind `waxsql gen --pprint`. Reformats SQL
4
+ via pglast's parse-tree serializer and, when writing to a terminal,
5
+ colorizes via pygments. Deliberately NOT on the generation hot path —
6
+ this is a human-facing presentation layer, kept separate from the
7
+ canonical machine-pipe output (which stays plain so it can be re-fed to
8
+ `validate`).
9
+
10
+ Both pglast and pygments are optional (the `[pprint]` extra). They are
11
+ imported lazily inside `prettify_sql` so that importing this module —
12
+ and therefore `waxsql.cli` — never hard-requires them. Same discipline
13
+ as `validate/parse.py`'s lazy `import psycopg`.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import os
18
+ from typing import TextIO
19
+
20
+
21
# Message carried by the RuntimeError that `prettify_sql` raises when
# pglast or pygments cannot be imported.
_INSTALL_HINT = (
    "--pprint requires pglast and pygments. "
    "Install with: pip install 'waxsql[pprint]'"
)
25
+
26
+
27
def should_colorize(stream: TextIO) -> bool:
    """Return True iff output to `stream` should be colorized.

    Color is enabled only when `stream` is an interactive terminal and
    the user has not opted out via the NO_COLOR convention
    (https://no-color.org). Per that convention NO_COLOR disables color
    only when it is set to a non-empty string; an unset or empty
    variable leaves color enabled.

    Only COLOR is gated on this — reformatting under --pprint always
    happens. The split is deliberate: `gen --pprint > file.sql` and
    `gen --pprint | psql` get clean, escape-code-free SQL, while an
    interactive terminal gets the colorized view.

    Args:
        stream: The output stream. Objects without a callable `isatty`
            attribute are treated as non-interactive.

    Returns:
        True when the stream reports itself as a TTY and NO_COLOR is
        unset or empty, False otherwise.
    """
    isatty = getattr(stream, "isatty", None)
    if not callable(isatty) or not isatty():
        return False
    # An empty NO_COLOR does not count as an opt-out.
    return not os.environ.get("NO_COLOR")
38
+
39
+
40
def prettify_sql(sql: str, *, color: bool) -> str:
    """Reformat `sql` and, when `color` is true, add terminal colors.

    Layout comes from ``pglast.prettify(sql, comma_at_eoln=True)``
    (trailing commas). Because pglast.prettify drops a trailing ``;``,
    one is appended again whenever the input, ignoring trailing
    whitespace, ended with ``;``. This happens before coloring so the
    semicolon is highlighted like the rest of the statement.

    Coloring runs the reformatted text through pygments with the
    PostgreSQL lexer and the 256-color terminal formatter, then strips
    the trailing newline pygments adds.

    Both libraries are imported lazily here so the module itself can be
    imported without them.

    Raises:
        RuntimeError: pglast is not installed, or `color` is true and
            pygments is not installed. The message is the install hint.
    """
    try:
        import pglast
    except ImportError as exc:
        raise RuntimeError(_INSTALL_HINT) from exc

    restore_semicolon = sql.rstrip().endswith(";")
    pretty = pglast.prettify(sql, comma_at_eoln=True)
    if restore_semicolon:
        pretty = pretty + ";"

    if not color:
        return pretty

    try:
        from pygments import highlight
        from pygments.formatters import Terminal256Formatter
        from pygments.lexers import PostgresLexer
    except ImportError as exc:
        raise RuntimeError(_INSTALL_HINT) from exc

    # highlight() ends its output with a newline; drop it so the caller
    # decides how output segments are separated.
    colored = highlight(pretty, PostgresLexer(), Terminal256Formatter())
    return colored.rstrip("\n")
79
+
80
+
81
+ __all__ = ["prettify_sql", "should_colorize"]