waxsql 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waxsql/catalog.py ADDED
@@ -0,0 +1,363 @@
1
+ """Catalog of functions and operators available for query generation.
2
+
3
+ Hand-curated rather than scraped from pg_proc. The reasons:
4
+
5
+ 1. pg_proc has thousands of entries with polymorphic types (anyelement,
6
+ anyarray) and special-case argument handling (variadic, set-returning,
7
+ aggregates with FILTER, etc.) that don't translate cleanly to a
8
+ simple signature model.
9
+
10
+ 2. The generator only needs *enough* surface area to produce varied
11
+ queries. ~50 hand-picked entries give richer output than 5,000
12
+ entries we don't know how to use.
13
+
14
+ 3. Curation lets us exclude things with awkward syntax — POSITION,
15
+ EXTRACT, OVERLAY, TRIM-with-FROM-clause — that need bespoke printer
16
+ handling. Add those when the printer can support them.
17
+
18
+ The catalog is split into FuncSig (callable name + arg list + return
19
+ type) and OpSig (left/right operand types + return). The generator asks
20
+ "what produces type T?" and the catalog answers from both pools.
21
+
22
+ Determinism contract: every lookup that returns a list is responsible
23
+ for sorting that list by a stable key. The expression generator picks
24
+ weighted-randomly from those lists, so a non-deterministic ordering
25
+ would silently desynchronize generated queries from their seed even
26
+ when the underlying tuple of FuncSig/OpSig hasn't changed (e.g. when
27
+ Python's hash randomization shuffles set iteration). See ARCHITECTURE.md
28
+ "Determinism" pillar.
29
+ """
30
+ from __future__ import annotations
31
+
32
+ from dataclasses import dataclass
33
+ from enum import Enum
34
+ from typing import Optional
35
+
36
+ from .types import (
37
+ PgType,
38
+ INT4, INT8, NUMERIC, FLOAT8,
39
+ TEXT, BOOL,
40
+ DATE, TIMESTAMPTZ, INTERVAL,
41
+ UUID, JSONB,
42
+ array_of, implicitly_castable,
43
+ )
44
+
45
+
46
class FuncKind(str, Enum):
    """Call-site category of a catalogued function.

    Each member names the context in which a function may be invoked:

    * ``SCALAR`` - can appear in any expression context.
    * ``AGGREGATE`` - needs a query level with grouping semantics.
    * ``WINDOW`` - needs an OVER clause.
    * ``SET_RETURNING`` - valid in FROM (and a few other places).

    The generator consults the kind so that the function it picks fits
    the place where the expression is being emitted.
    """

    SCALAR = "scalar"
    AGGREGATE = "agg"
    WINDOW = "window"
    SET_RETURNING = "srf"
61
+
62
+
63
@dataclass(frozen=True)
class FuncSig:
    """One concrete, callable signature of a SQL function.

    Overloading is not modelled polymorphically. A function such as
    abs, sum, min or max gets a separate FuncSig for every concrete
    argument list it is registered with - a little more verbose, but
    never ambiguous.

    The dataclass is frozen, so instances are immutable and hashable
    and can be used as keys if a candidate cache is added later.
    """

    # Function name.
    name: str
    # Argument types, in call order.
    args: tuple[PgType, ...]
    # Declared return type.
    returns: PgType
    # Call-site category; scalar unless stated otherwise.
    kind: FuncKind = FuncKind.SCALAR
    # Variadic flag; off unless stated otherwise.
    variadic: bool = False
81
+
82
+
83
@dataclass(frozen=True)
class OpSig:
    """Signature of a single operator, binary or unary.

    Operand slots double as an arity marker:

    * both ``left`` and ``right`` set - binary operator;
    * ``left=None`` - prefix unary (e.g. ``-x``);
    * ``right=None`` - would be postfix unary, which is rare in modern
      SQL (factorial ``!`` was deprecated).

    The starter catalog registers binary operators only.
    """

    # Operator token.
    symbol: str
    # Left operand type, or None for a prefix unary operator.
    left: Optional[PgType]
    # Right operand type, or None for a postfix unary operator.
    right: Optional[PgType]
    # Result type of applying the operator.
    returns: PgType
95
+
96
+
97
@dataclass
class Catalog:
    """Indexed pool of functions and operators.

    Every lookup method returns its matches sorted by a stable key
    (name first, then discriminating attributes), so the order is
    reproducible across Python runs. The generator picks weighted-random
    elements from these lists; a different ordering would silently
    desynchronize generated queries from their seed.
    """

    functions: tuple[FuncSig, ...]
    operators: tuple[OpSig, ...]

    def funcs_returning(self, t: PgType) -> list[FuncSig]:
        """Return all functions whose return type implicitly casts to `t`.

        The result is sorted by (name, kind, arg-type names). Including
        the argument type names means overloads that share a name
        (`abs(int4)` vs `abs(int8)`) still get a fully determined
        position. Do not reduce the key to `f.name` alone: overloads
        would then keep whatever relative order they arrived in.
        """
        return sorted(
            (f for f in self.functions if implicitly_castable(f.returns, t)),
            key=lambda f: (f.name, f.kind.value, tuple(a.name for a in f.args)),
        )

    # The kind-filtered helpers below all go through funcs_returning so
    # that they inherit its sort order. Filtering self.functions directly
    # would skip the sort and break determinism.
    def aggs_returning(self, t: PgType) -> list[FuncSig]:
        """Return the AGGREGATE functions whose result casts to `t`, sorted."""
        return [f for f in self.funcs_returning(t) if f.kind == FuncKind.AGGREGATE]

    def scalar_funcs_returning(self, t: PgType) -> list[FuncSig]:
        """Return the SCALAR functions whose result casts to `t`, sorted."""
        return [f for f in self.funcs_returning(t) if f.kind == FuncKind.SCALAR]

    def window_funcs_returning(self, t: PgType) -> list[FuncSig]:
        """Return the WINDOW functions whose result casts to `t`, sorted."""
        return [f for f in self.funcs_returning(t) if f.kind == FuncKind.WINDOW]

    def srfs_returning(self, t: PgType) -> list[FuncSig]:
        """Return the SET_RETURNING functions whose result casts to `t`, sorted."""
        return [f for f in self.funcs_returning(t) if f.kind == FuncKind.SET_RETURNING]

    def binary_ops_returning(self, t: PgType) -> list[OpSig]:
        """Return all binary operators whose result implicitly casts to `t`.

        Operators with a missing operand (`left is None` or
        `right is None`) are unary and are excluded. The result is
        sorted by (symbol, left type name, right type name).
        """

        def sort_key(o: OpSig) -> tuple[str, str, str]:
            # Only operators that passed the filter below reach this
            # function, so both operands are set. The assert narrows the
            # Optional types for the type-checker and documents that
            # invariant, instead of substituting "" for a falsy operand.
            assert o.left is not None and o.right is not None
            return (o.symbol, o.left.name, o.right.name)

        # The None checks keep unary OpSigs out of the binary pool. The
        # starter catalog registers none, but OpSig can model them, and
        # a leaked unary operator would be printed as a malformed
        # two-operand expression.
        binary = [
            o for o in self.operators
            if o.left is not None and o.right is not None
            and implicitly_castable(o.returns, t)
        ]
        return sorted(binary, key=sort_key)
165
+
166
+
167
def default_catalog() -> Catalog:
    """Build the starter catalog the generator uses out of the box.

    Functions span numeric, string, temporal, JSON, UUID, null-handling,
    aggregate, window and set-returning categories. Operators span
    arithmetic, comparison, logical, string concat, LIKE/ILIKE, temporal
    arithmetic and JSON access.

    Constructs with special-case syntax (POSITION...IN, EXTRACT...FROM,
    OVERLAY...PLACING, TRIM...FROM, SUBSTRING...FOR) are left out until
    the printer can emit them safely.
    """
    agg = FuncKind.AGGREGATE
    win = FuncKind.WINDOW
    srf = FuncKind.SET_RETURNING

    numeric_funcs = [
        FuncSig("abs", (INT4,), INT4),
        FuncSig("abs", (INT8,), INT8),
        FuncSig("abs", (NUMERIC,), NUMERIC),
        FuncSig("abs", (FLOAT8,), FLOAT8),
        FuncSig("ceil", (NUMERIC,), NUMERIC),
        FuncSig("floor", (NUMERIC,), NUMERIC),
        FuncSig("round", (NUMERIC,), NUMERIC),
        FuncSig("round", (NUMERIC, INT4), NUMERIC),
        FuncSig("sqrt", (FLOAT8,), FLOAT8),
        FuncSig("power", (FLOAT8, FLOAT8), FLOAT8),
        FuncSig("mod", (INT4, INT4), INT4),
        FuncSig("mod", (INT8, INT8), INT8),
        FuncSig("greatest", (INT4, INT4), INT4),
        FuncSig("least", (INT4, INT4), INT4),
        FuncSig("greatest", (NUMERIC, NUMERIC), NUMERIC),
        FuncSig("least", (NUMERIC, NUMERIC), NUMERIC),
    ]

    string_funcs = [
        FuncSig("length", (TEXT,), INT4),
        FuncSig("char_length", (TEXT,), INT4),
        FuncSig("upper", (TEXT,), TEXT),
        FuncSig("lower", (TEXT,), TEXT),
        FuncSig("initcap", (TEXT,), TEXT),
        FuncSig("trim", (TEXT,), TEXT),
        FuncSig("btrim", (TEXT,), TEXT),
        FuncSig("ltrim", (TEXT,), TEXT),
        FuncSig("rtrim", (TEXT,), TEXT),
        FuncSig("substr", (TEXT, INT4, INT4), TEXT),
        FuncSig("substr", (TEXT, INT4), TEXT),
        FuncSig("concat", (TEXT, TEXT), TEXT),
        FuncSig("replace", (TEXT, TEXT, TEXT), TEXT),
        FuncSig("md5", (TEXT,), TEXT),
        FuncSig("left", (TEXT, INT4), TEXT),
        FuncSig("right", (TEXT, INT4), TEXT),
        FuncSig("reverse", (TEXT,), TEXT),
        FuncSig("repeat", (TEXT, INT4), TEXT),
    ]

    temporal_funcs = [
        FuncSig("now", (), TIMESTAMPTZ),
        FuncSig("current_date", (), DATE),
        # date_trunc is registered with a plain TEXT first argument so
        # that normal catalog dispatch finds it, but PG accepts only
        # specific unit keywords ('day', 'hour', ...) there. The func
        # branch in gen/expr.py rewrites that argument using
        # `_DATE_TRUNC_UNITS`. A bad unit is not caught at PARSE or
        # PLAN (the cast wrapping hides the constant from PG's
        # constant folding), so without the rewrite the query would
        # fail at EXECUTE.
        FuncSig("date_trunc", (TEXT, TIMESTAMPTZ), TIMESTAMPTZ),
        FuncSig("age", (TIMESTAMPTZ, TIMESTAMPTZ), INTERVAL),
        FuncSig("age", (TIMESTAMPTZ,), INTERVAL),
    ]

    uuid_funcs = [
        FuncSig("gen_random_uuid", (), UUID),
    ]

    json_funcs = [
        FuncSig("jsonb_typeof", (JSONB,), TEXT),
        # jsonb_array_length keeps a generic JSONB signature so that
        # dispatch finds it; the func branch in gen/expr.py replaces
        # the argument with a guaranteed-array literal when emitting
        # it. PG checks array-ness at run time (and at planning time
        # for constants, which the always-typed text literals hide from
        # the planner), so this is a runtime-correctness measure rather
        # than a PLAN-tier one.
        FuncSig("jsonb_array_length", (JSONB,), INT4),
        FuncSig("jsonb_build_object", (TEXT, TEXT), JSONB),
        FuncSig("jsonb_build_array", (TEXT, TEXT), JSONB),
        FuncSig("to_jsonb", (TEXT,), JSONB),
    ]

    null_funcs = [
        FuncSig("coalesce", (TEXT, TEXT), TEXT),
        FuncSig("coalesce", (INT4, INT4), INT4),
        FuncSig("nullif", (INT4, INT4), INT4),
        FuncSig("nullif", (TEXT, TEXT), TEXT),
    ]

    aggregate_funcs = [
        FuncSig("count", (INT4,), INT8, kind=agg),
        FuncSig("count", (TEXT,), INT8, kind=agg),
        FuncSig("sum", (INT4,), INT8, kind=agg),
        FuncSig("sum", (INT8,), NUMERIC, kind=agg),
        FuncSig("sum", (NUMERIC,), NUMERIC, kind=agg),
        FuncSig("avg", (INT4,), NUMERIC, kind=agg),
        FuncSig("avg", (NUMERIC,), NUMERIC, kind=agg),
        FuncSig("min", (INT4,), INT4, kind=agg),
        FuncSig("min", (NUMERIC,), NUMERIC, kind=agg),
        FuncSig("min", (TEXT,), TEXT, kind=agg),
        FuncSig("min", (TIMESTAMPTZ,), TIMESTAMPTZ, kind=agg),
        FuncSig("max", (INT4,), INT4, kind=agg),
        FuncSig("max", (NUMERIC,), NUMERIC, kind=agg),
        FuncSig("max", (TEXT,), TEXT, kind=agg),
        FuncSig("max", (TIMESTAMPTZ,), TIMESTAMPTZ, kind=agg),
        FuncSig("string_agg", (TEXT, TEXT), TEXT, kind=agg),
        FuncSig("array_agg", (INT4,), array_of(INT4), kind=agg),
        FuncSig("array_agg", (TEXT,), array_of(TEXT), kind=agg),
        FuncSig("bool_and", (BOOL,), BOOL, kind=agg),
        FuncSig("bool_or", (BOOL,), BOOL, kind=agg),
    ]

    # Ordered-set aggregates cannot be called without WITHIN GROUP; the
    # generator special-cases them in gen/expr.py so one is always
    # attached. The FLOAT8 return type holds when the WITHIN GROUP
    # ORDER BY column is FLOAT8, which is the case the generator
    # targets. Other ORDER BY types would resolve to a different return
    # type in PG, but those are not exercised.
    ordered_set_funcs = [
        FuncSig("percentile_cont", (FLOAT8,), FLOAT8, kind=agg),
        FuncSig("percentile_disc", (FLOAT8,), FLOAT8, kind=agg),
    ]

    # Window-only functions. Aggregates used as window functions via
    # OVER are handled at the AST level and are not registered again.
    window_funcs = [
        FuncSig("row_number", (), INT8, kind=win),
        FuncSig("rank", (), INT8, kind=win),
        FuncSig("dense_rank", (), INT8, kind=win),
        FuncSig("lag", (INT4,), INT4, kind=win),
        FuncSig("lead", (INT4,), INT4, kind=win),
        FuncSig("first_value", (INT4,), INT4, kind=win),
        FuncSig("last_value", (INT4,), INT4, kind=win),
    ]

    set_returning_funcs = [
        FuncSig("generate_series", (INT4, INT4), INT4, kind=srf),
        FuncSig("generate_series", (INT8, INT8), INT8, kind=srf),
        FuncSig(
            "generate_series",
            (TIMESTAMPTZ, TIMESTAMPTZ, INTERVAL),
            TIMESTAMPTZ,
            kind=srf,
        ),
        FuncSig("unnest", (array_of(INT4),), INT4, kind=srf),
        FuncSig("unnest", (array_of(TEXT),), TEXT, kind=srf),
    ]

    functions = (
        *numeric_funcs,
        *string_funcs,
        *temporal_funcs,
        *uuid_funcs,
        *json_funcs,
        *null_funcs,
        *aggregate_funcs,
        *ordered_set_funcs,
        *window_funcs,
        *set_returning_funcs,
    )

    # Same-type arithmetic over the four numeric types.
    arithmetic_ops = [
        OpSig(symbol, num, num, num)
        for symbol in ("+", "-", "*", "/")
        for num in (INT4, INT8, NUMERIC, FLOAT8)
    ]

    # Modulo is registered for the integer and numeric types only;
    # FLOAT8 is deliberately left out.
    modulo_ops = [OpSig("%", num, num, num) for num in (INT4, INT8, NUMERIC)]

    # A handful of cross-type additions. PG promotes implicitly, so
    # registering these explicitly gives richer output and keeps the
    # cross-type path exercised during fuzzing. The list is deliberately
    # not exhaustive: covering every promotion would multiply the
    # arithmetic pool four-fold for little extra expressive range.
    cross_type_ops = [
        OpSig("+", INT4, INT8, INT8),
        OpSig("+", INT8, INT4, INT8),
        OpSig("+", INT4, NUMERIC, NUMERIC),
        OpSig("+", NUMERIC, INT4, NUMERIC),
    ]

    # Comparison: each comparable type against itself, yielding BOOL.
    comparison_ops = [
        OpSig(symbol, operand, operand, BOOL)
        for symbol in ("=", "<>", "<", "<=", ">", ">=")
        for operand in (
            INT4, INT8, NUMERIC, FLOAT8, TEXT, DATE, TIMESTAMPTZ, BOOL, UUID,
        )
    ]

    logical_ops = [
        OpSig("AND", BOOL, BOOL, BOOL),
        OpSig("OR", BOOL, BOOL, BOOL),
    ]

    string_ops = [
        OpSig("||", TEXT, TEXT, TEXT),
        OpSig("LIKE", TEXT, TEXT, BOOL),
        OpSig("ILIKE", TEXT, TEXT, BOOL),
    ]

    # NOTE: `date +/- interval` is intentionally NOT registered. PG
    # types it as `timestamp` (without time zone), which waxsql does not
    # model; declaring it TIMESTAMPTZ was a type-system lie masked only
    # by PG's implicit timestamp -> timestamptz cast (ISSUES.md #50).
    # `timestamptz +/- interval` is correctly typed and kept.
    temporal_ops = [
        OpSig("+", TIMESTAMPTZ, INTERVAL, TIMESTAMPTZ),
        OpSig("-", TIMESTAMPTZ, INTERVAL, TIMESTAMPTZ),
        OpSig("-", TIMESTAMPTZ, TIMESTAMPTZ, INTERVAL),
    ]

    json_ops = [
        OpSig("->", JSONB, TEXT, JSONB),
        OpSig("->>", JSONB, TEXT, TEXT),
        OpSig("@>", JSONB, JSONB, BOOL),
        OpSig("?", JSONB, TEXT, BOOL),
    ]

    operators = (
        *arithmetic_ops,
        *modulo_ops,
        *cross_type_ops,
        *comparison_ops,
        *logical_ops,
        *string_ops,
        *temporal_ops,
        *json_ops,
    )

    return Catalog(functions=functions, operators=operators)