waxsql 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waxsql/__init__.py +158 -0
- waxsql/ast.py +757 -0
- waxsql/catalog.py +363 -0
- waxsql/cli.py +888 -0
- waxsql/config.py +477 -0
- waxsql/context.py +255 -0
- waxsql/data.py +99 -0
- waxsql/gen/__init__.py +51 -0
- waxsql/gen/cte.py +367 -0
- waxsql/gen/data/__init__.py +14 -0
- waxsql/gen/data/columns.py +48 -0
- waxsql/gen/data/emit.py +247 -0
- waxsql/gen/data/rows.py +236 -0
- waxsql/gen/data/strategies.py +299 -0
- waxsql/gen/expr.py +723 -0
- waxsql/gen/select.py +831 -0
- waxsql/gen/setop.py +259 -0
- waxsql/gen/subquery.py +397 -0
- waxsql/gen/window.py +398 -0
- waxsql/pretty.py +81 -0
- waxsql/printer.py +688 -0
- waxsql/py.typed +0 -0
- waxsql/schema.py +557 -0
- waxsql/scope.py +391 -0
- waxsql/types.py +187 -0
- waxsql/validate/__init__.py +52 -0
- waxsql/validate/parse.py +194 -0
- waxsql/validate/plan.py +149 -0
- waxsql/validate/syntax.py +87 -0
- waxsql-1.0.0.dist-info/METADATA +746 -0
- waxsql-1.0.0.dist-info/RECORD +35 -0
- waxsql-1.0.0.dist-info/WHEEL +5 -0
- waxsql-1.0.0.dist-info/entry_points.txt +2 -0
- waxsql-1.0.0.dist-info/licenses/LICENSE +21 -0
- waxsql-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,746 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: waxsql
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Random PostgreSQL query generator (wax-fruit SQL for fuzzing & test data)
|
|
5
|
+
Author-email: Christophe Pettus <christophe.pettus@pgexperts.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/pgexperts/waxsql
|
|
8
|
+
Project-URL: Repository, https://github.com/pgexperts/waxsql
|
|
9
|
+
Project-URL: Issues, https://github.com/pgexperts/waxsql/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/pgexperts/waxsql/releases
|
|
11
|
+
Keywords: postgres,postgresql,sql,fuzzing,testing,query-generator,deterministic,test-data
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
21
|
+
Classifier: Topic :: Database
|
|
22
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
23
|
+
Classifier: Topic :: Software Development :: Testing
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.10
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Provides-Extra: syntax
|
|
29
|
+
Requires-Dist: pglast<8.0,>=7.0; extra == "syntax"
|
|
30
|
+
Provides-Extra: parse
|
|
31
|
+
Requires-Dist: psycopg[binary]>=3.1; extra == "parse"
|
|
32
|
+
Provides-Extra: plan
|
|
33
|
+
Requires-Dist: psycopg[binary]>=3.1; extra == "plan"
|
|
34
|
+
Provides-Extra: cli
|
|
35
|
+
Requires-Dist: click>=8.0; extra == "cli"
|
|
36
|
+
Provides-Extra: pprint
|
|
37
|
+
Requires-Dist: pglast<8.0,>=7.0; extra == "pprint"
|
|
38
|
+
Requires-Dist: pygments>=2.0; extra == "pprint"
|
|
39
|
+
Provides-Extra: all
|
|
40
|
+
Requires-Dist: pglast<8.0,>=7.0; extra == "all"
|
|
41
|
+
Requires-Dist: psycopg[binary]>=3.1; extra == "all"
|
|
42
|
+
Requires-Dist: click>=8.0; extra == "all"
|
|
43
|
+
Requires-Dist: pygments>=2.0; extra == "all"
|
|
44
|
+
Provides-Extra: dev
|
|
45
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
46
|
+
Requires-Dist: pglast<8.0,>=7.0; extra == "dev"
|
|
47
|
+
Requires-Dist: psycopg[binary]>=3.1; extra == "dev"
|
|
48
|
+
Requires-Dist: click>=8.0; extra == "dev"
|
|
49
|
+
Requires-Dist: pygments>=2.0; extra == "dev"
|
|
50
|
+
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
51
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
52
|
+
Dynamic: license-file
|
|
53
|
+
|
|
54
|
+
# waxsql
|
|
55
|
+
|
|
56
|
+
[![CI](https://github.com/pgexperts/waxsql/actions/workflows/ci.yml/badge.svg)](https://github.com/pgexperts/waxsql/actions/workflows/ci.yml)
|
|
57
|
+
[![PyPI version](https://img.shields.io/pypi/v/waxsql.svg)](https://pypi.org/project/waxsql/)
|
|
58
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/waxsql.svg)](https://pypi.org/project/waxsql/)
|
|
59
|
+
|
|
60
|
+
A deterministic, type-driven random PostgreSQL query generator. The SQL
|
|
61
|
+
equivalent of wax fruit — looks real, doesn't compute anything useful, won't
|
|
62
|
+
spoil. Useful for fuzzing query tools, exercising parsers and planners,
|
|
63
|
+
generating reproducible workloads, and producing realistic-but-meaningless
|
|
64
|
+
SQL on tap.
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from waxsql import generate_query, generate_schema, print_query
|
|
68
|
+
|
|
69
|
+
schema = generate_schema(seed=42, complexity=8)
|
|
70
|
+
query = generate_query(seed=42, schema=schema, complexity=8)
|
|
71
|
+
|
|
72
|
+
print(schema.emit_ddl()) # CREATE TABLE / ALTER TABLE / CREATE INDEX
|
|
73
|
+
print(print_query(query)) # SELECT ... — type-correct against the schema
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Same `(seed, complexity)` always produces the same output, byte for byte.
|
|
77
|
+
The complexity dial scales from `SELECT col FROM t` up through deeply-nested
|
|
78
|
+
CTE / subquery / window-function / grouping-set trees. "Correct" means
|
|
79
|
+
**type-driven correct**: every expression respects PostgreSQL's type system,
|
|
80
|
+
scope rules, aggregate-context restrictions, and overload-resolution
|
|
81
|
+
semantics — generated SQL clears parse-analysis cleanly.
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Contents
|
|
86
|
+
|
|
87
|
+
- [Install](#install)
|
|
88
|
+
- [Quick start](#quick-start)
|
|
89
|
+
- [Theory of operation](#theory-of-operation)
|
|
90
|
+
- [The complexity dial](#the-complexity-dial)
|
|
91
|
+
- [Validation tiers](#validation-tiers)
|
|
92
|
+
- [Public API reference](#public-api-reference)
|
|
93
|
+
- [Surprising corners](#surprising-corners)
|
|
94
|
+
- [Development](#development)
|
|
95
|
+
- [License](#license)
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Install
|
|
100
|
+
|
|
101
|
+
waxsql is a pure-Python package on PyPI. The package itself has zero runtime
|
|
102
|
+
dependencies; everything PostgreSQL-related is an optional extra.
|
|
103
|
+
|
|
104
|
+
### With `pip`
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
# Just the generator. Use this if you only need to produce SQL strings.
|
|
108
|
+
pip install waxsql
|
|
109
|
+
|
|
110
|
+
# Add the SYNTAX-tier validator (uses pglast / libpg_query — no DB needed).
|
|
111
|
+
pip install 'waxsql[syntax]'
|
|
112
|
+
|
|
113
|
+
# Add the live-DB validators (psycopg). PARSE and PLAN are equivalent in
|
|
114
|
+
# the dependency set; the names exist so callers can express intent.
|
|
115
|
+
pip install 'waxsql[parse]'
|
|
116
|
+
pip install 'waxsql[plan]'
|
|
117
|
+
|
|
118
|
+
# Everything (generator + all validators + the CLI + pretty-printing).
|
|
119
|
+
pip install 'waxsql[all]'
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### With `uv`
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
# Standalone install into the active venv.
|
|
126
|
+
uv pip install waxsql
|
|
127
|
+
|
|
128
|
+
# Or as a project dependency:
|
|
129
|
+
uv add waxsql
|
|
130
|
+
uv add 'waxsql[all]'
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Optional-dependency matrix
|
|
134
|
+
|
|
135
|
+
| Extra | Pulls in | Enables |
|
|
136
|
+
|------------|-----------------------------------------|----------------------------------------|
|
|
137
|
+
| (none) | nothing | `generate_schema`, `generate_query`, `print_query` |
|
|
138
|
+
| `[syntax]` | `pglast >=7,<8` | `check_syntax(sql)` — no DB needed |
|
|
139
|
+
| `[parse]` | `psycopg[binary] >=3.1` | `check_parse(sql, conn)` |
|
|
140
|
+
| `[plan]` | `psycopg[binary] >=3.1` | `check_plan(sql, conn)` |
|
|
141
|
+
| `[cli]`    | `click >=8`                             | the `waxsql` console script (`gen`, `data`, `validate`) |
|
|
142
|
+
| `[pprint]` | `pglast >=7,<8` + `pygments >=2` | `gen --pprint` — reformatted and colorized SQL output |
|
|
143
|
+
| `[all]`    | `pglast >=7,<8` + `psycopg[binary] >=3.1` + `click >=8` + `pygments >=2` | all validators + the CLI + `gen --pprint` |
|
|
144
|
+
| `[dev]` | `[all]` plus `pytest`, `ruff`, `mypy` | full test/lint/type pipeline |
|
|
145
|
+
|
|
146
|
+
Python 3.10 or newer is required. `pglast` v7 currently tracks PostgreSQL 17;
|
|
147
|
+
v8 (PG18 support) is in development upstream — when it lands, the `[syntax]`
|
|
148
|
+
pin will move.
|
|
149
|
+
|
|
150
|
+
### Pre-release versions
|
|
151
|
+
|
|
152
|
+
If you want a release candidate or dev build (`waxsql 1.1.0rc1`,
|
|
153
|
+
`waxsql 1.1.0.dev1`), `pip install` ignores those by default. Pass `--pre`:
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
pip install --pre waxsql
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## Quick start
|
|
162
|
+
|
|
163
|
+
### Use it from the command line
|
|
164
|
+
|
|
165
|
+
Install the `[cli]` extra and use the bundled `waxsql` command — no Python code needed:
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
pip install 'waxsql[cli]'
|
|
169
|
+
|
|
170
|
+
# One-shot demo: random seed, default complexity, schema + query both.
|
|
171
|
+
waxsql gen
|
|
172
|
+
|
|
173
|
+
# Reproducible run with a fixed seed.
|
|
174
|
+
waxsql gen --seed 42 --complexity 8
|
|
175
|
+
|
|
176
|
+
# Pipe straight into psql.
|
|
177
|
+
waxsql gen --seed 42 -c 8 | psql -d scratch
|
|
178
|
+
|
|
179
|
+
# Pipe through the validator (gen output's header tells validate
|
|
180
|
+
# which schema to install — auto-schema is on by default).
|
|
181
|
+
waxsql gen --seed 42 -c 8 | waxsql validate --tier plan
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
`waxsql gen --help` and `waxsql validate --help` document all the flags. A few notable `gen` options:
|
|
185
|
+
|
|
186
|
+
- `--pprint` — reformat the generated DDL and queries with indentation and,
|
|
187
|
+
when writing to a terminal, syntax-color them. Display-only: the output
|
|
188
|
+
is for reading, not for piping into `validate` (color codes and the
|
|
189
|
+
reflowed layout aren't meant to round-trip). Requires the `[pprint]`
|
|
190
|
+
extra: `pip install 'waxsql[pprint]'`.
|
|
191
|
+
|
|
192
|
+
The Python API examples below cover the same ground for callers who want to drive the library directly.
|
|
193
|
+
|
|
194
|
+
### 1. Generate a schema and one query against it
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
from waxsql import generate_query, generate_schema, print_query
|
|
198
|
+
|
|
199
|
+
schema = generate_schema(seed=42, complexity=5)
|
|
200
|
+
query = generate_query(seed=42, schema=schema, complexity=5)
|
|
201
|
+
|
|
202
|
+
print(print_query(query))
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
Both functions are deterministic in their seeds. Re-running the snippet on
|
|
206
|
+
any machine, any Python version (≥3.10), produces the same SQL.
|
|
207
|
+
|
|
208
|
+
### 2. Generate many queries against one fixed schema
|
|
209
|
+
|
|
210
|
+
The schema and query generators have **independent RNG streams**. Hold the
|
|
211
|
+
schema seed fixed and vary the query seed to produce a workload of
|
|
212
|
+
unrelated queries against a stable target — useful for soak-testing a
|
|
213
|
+
query-rewriting tool, a planner, or a connection-pooling proxy.
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
from waxsql import generate_query, generate_schema, print_query
|
|
217
|
+
|
|
218
|
+
schema = generate_schema(seed=42, complexity=6)
|
|
219
|
+
|
|
220
|
+
for query_seed in range(100):
|
|
221
|
+
q = generate_query(seed=query_seed, schema=schema, complexity=6)
|
|
222
|
+
print(print_query(q))
|
|
223
|
+
print(";")
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### 3. Validate a generated query through pglast (no DB)
|
|
227
|
+
|
|
228
|
+
```python
|
|
229
|
+
from waxsql import generate_query, generate_schema, print_query
|
|
230
|
+
from waxsql.validate.syntax import check_syntax
|
|
231
|
+
|
|
232
|
+
schema = generate_schema(seed=1, complexity=10)
|
|
233
|
+
q = generate_query(seed=1, schema=schema, complexity=10)
|
|
234
|
+
sql = print_query(q)
|
|
235
|
+
|
|
236
|
+
result = check_syntax(sql)
|
|
237
|
+
assert result.ok, f"pglast rejected: {result.error}\n{sql}"
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
`check_syntax` is microsecond-fast and needs no PostgreSQL connection —
|
|
241
|
+
it calls directly into the libpg_query C library bundled with `pglast`.
|
|
242
|
+
|
|
243
|
+
### 4. Validate against a live PostgreSQL via PREPARE
|
|
244
|
+
|
|
245
|
+
PARSE-tier validation catches name- and type-resolution errors that pure
|
|
246
|
+
syntax-checking can't see. The validator wraps each PREPARE in a savepoint
|
|
247
|
+
so a single failure doesn't abort the surrounding transaction.
|
|
248
|
+
|
|
249
|
+
```python
|
|
250
|
+
import psycopg
|
|
251
|
+
from waxsql import generate_query, generate_schema, print_query
|
|
252
|
+
from waxsql.validate.parse import check_parse, install_schema
|
|
253
|
+
|
|
254
|
+
schema = generate_schema(seed=1, complexity=10)
|
|
255
|
+
q = generate_query(seed=1, schema=schema, complexity=10)
|
|
256
|
+
sql = print_query(q)
|
|
257
|
+
|
|
258
|
+
# autocommit=False is required — the validators use savepoints, which only
|
|
259
|
+
# work inside a transaction. psycopg opens that transaction implicitly on
|
|
260
|
+
# the first statement; no explicit BEGIN needed (and issuing one would
|
|
261
|
+
# produce a spurious "there is already a transaction in progress" warning).
|
|
262
|
+
with psycopg.connect("dbname=waxsql_scratch", autocommit=False) as conn:
|
|
263
|
+
try:
|
|
264
|
+
install_schema(schema, conn) # CREATE TABLE etc.
|
|
265
|
+
result = check_parse(sql, conn)
|
|
266
|
+
assert result.ok, f"PG rejected: {result.error}\n{sql}"
|
|
267
|
+
finally:
|
|
268
|
+
conn.rollback() # discard the throwaway schema
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
`check_plan(sql, conn)` is identical in shape but runs `EXPLAIN` instead
|
|
272
|
+
of `PREPARE`, exercising the planner as well as parse-analysis. See
|
|
273
|
+
[Validation tiers](#validation-tiers) for when to reach for which.
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## Theory of operation
|
|
278
|
+
|
|
279
|
+
waxsql is closer in spirit to SQLsmith than to a yacc-driven fuzzer. Three
|
|
280
|
+
ideas hold the whole thing together:
|
|
281
|
+
|
|
282
|
+
### Type-driven generation
|
|
283
|
+
|
|
284
|
+
Every expression is generated **with a target type**. The catalog
|
|
285
|
+
(`waxsql.catalog`) answers "what produces type T?" from the function pool,
|
|
286
|
+
the operator pool, and (via the scope) column references. The generator
|
|
287
|
+
never emits `int + text` because no operator satisfies that request.
|
|
288
|
+
|
|
289
|
+
The implicit-cast graph (`_IMPLICIT_CASTS` in `waxsql.types`) governs what
|
|
290
|
+
counts as "produces T" — a function returning `int4` satisfies a request
|
|
291
|
+
for `int8` because `int4` implicitly casts to `int8`. The graph is hand-
|
|
292
|
+
curated against PostgreSQL's `pg_cast`.
|
|
293
|
+
|
|
294
|
+
This is why the output isn't just syntactically valid — it clears
|
|
295
|
+
**parse-analysis** at PostgreSQL's full type-resolver. SQL that the
|
|
296
|
+
PostgreSQL grammar accepts but the type system rejects (e.g.
|
|
297
|
+
`WHERE current_timestamp + 'hello'`) is structurally absent from the
|
|
298
|
+
generator's possible outputs.
|
|
299
|
+
|
|
300
|
+
### Determinism
|
|
301
|
+
|
|
302
|
+
Same `(seed, complexity)` produces byte-identical SQL across runs and
|
|
303
|
+
Python versions. This is what makes the generator usable for fuzzing
|
|
304
|
+
(reproduce a bug from a seed) and for golden-output testing.
|
|
305
|
+
|
|
306
|
+
The discipline that keeps it true:
|
|
307
|
+
|
|
308
|
+
- The generator never reads the global `random` module — every randomized
|
|
309
|
+
decision goes through an injected `random.Random` instance.
|
|
310
|
+
- Set iteration is forbidden in any RNG-affecting code path; sets are
|
|
311
|
+
always `sorted(...)` first because Python's set iteration order isn't
|
|
312
|
+
stable across builds. (Dict order is stable since 3.7, so dicts are fine.)
|
|
313
|
+
- Schema and query each get an independently seeded RNG stream, so
|
|
314
|
+
varying the query seed against a fixed schema seed works as expected.
|
|
315
|
+
|
|
316
|
+
### Round-trip validation
|
|
317
|
+
|
|
318
|
+
Every generated artifact can be round-tripped through PostgreSQL's actual
|
|
319
|
+
parser via `pglast` — the `check_syntax` function runs your SQL through
|
|
320
|
+
the real `libpg_query`. The test suite enforces this on every code path
|
|
321
|
+
that emits SQL, parametrized over many seeds. There are no print-time
|
|
322
|
+
shortcuts that would let invalid SQL escape.
|
|
323
|
+
|
|
324
|
+
---
|
|
325
|
+
|
|
326
|
+
## The complexity dial
|
|
327
|
+
|
|
328
|
+
`complexity` is a single integer 0..10 that controls both how rich the
|
|
329
|
+
schema is and how feature-rich the queries are. It works by **unlocking
|
|
330
|
+
features in stages**:
|
|
331
|
+
|
|
332
|
+
| `complexity` | Features unlocked at this notch |
|
|
333
|
+
|--------------|----------------------------------------------------------------------|
|
|
334
|
+
| 0 | Trivial `SELECT col FROM t` only |
|
|
335
|
+
| 1 | `WHERE`, `INNER JOIN`, multiple `FROM` items |
|
|
336
|
+
| 2 | `ORDER BY`, `LIMIT` |
|
|
337
|
+
| 3 | Aggregates and `GROUP BY` |
|
|
338
|
+
| 4 | `LEFT JOIN`, scalar subqueries |
|
|
339
|
+
| 5 | `HAVING`, `EXISTS`, `IN (SELECT ...)`, derived tables |
|
|
340
|
+
| 6 | `LATERAL` (only meaningful with derived tables, hence the gate) |
|
|
341
|
+
| 7 | Common Table Expressions (`WITH`) |
|
|
342
|
+
| 8 | Window functions (`func() OVER (...)`) |
|
|
343
|
+
| 9 | Set operations (`UNION` / `INTERSECT` / `EXCEPT [ALL]`) |
|
|
344
|
+
| 10 | `WITH RECURSIVE`, `ROLLUP` / `CUBE` / `GROUPING SETS` |
|
|
345
|
+
|
|
346
|
+
Structural caps (max expression depth, max FROM items, max subquery nesting,
|
|
347
|
+
max CTEs per WITH list) also grow with the dial — see
|
|
348
|
+
`waxsql.config.query_config_for_complexity` for the formulas.
|
|
349
|
+
|
|
350
|
+
The schema generator's dial is similar in spirit:
|
|
351
|
+
|
|
352
|
+
| `complexity` | Schema effect |
|
|
353
|
+
|--------------|----------------------------------------------------------------------|
|
|
354
|
+
| 0..10 | `table_count = 2 + c` (so 2..12 tables) |
|
|
355
|
+
| 0..10 | `max_columns = 4 + c` (so up to 14 cols/table) |
|
|
356
|
+
| 0..10 | `fk_density = 0.30 + 0.04*c` (more FKs at higher complexity) |
|
|
357
|
+
| ≥5 | Self-referencing FKs allowed (`tbl.parent_id REFERENCES tbl(id)`) |
|
|
358
|
+
| ≥8 | Cyclic FK graphs allowed |
|
|
359
|
+
|
|
360
|
+
If the canned dial doesn't fit your needs, drop down to the underlying
|
|
361
|
+
config objects — see [Public API reference](#public-api-reference) below.
|
|
362
|
+
|
|
363
|
+
---
|
|
364
|
+
|
|
365
|
+
## Validation tiers
|
|
366
|
+
|
|
367
|
+
Three tiers, in increasing order of strength. Each tier catches a strict superset of
|
|
368
|
+
what the previous tier catches.
|
|
369
|
+
|
|
370
|
+
| Tier | Mechanism | Cost | Coverage at c=10 |
|
|
371
|
+
|--------|--------------------------|-----------|------------------|
|
|
372
|
+
| SYNTAX | `pglast` (libpg_query) | µs/check | 100% |
|
|
373
|
+
| PARSE | `PREPARE` + savepoint | ms/check | 100% |
|
|
374
|
+
| PLAN | `EXPLAIN` + savepoint | ms/check | ~95–96% |
|
|
375
|
+
|
|
376
|
+
Rates are observed at c=10 on deterministic hardware; CI gates at floors
|
|
377
|
+
that absorb per-seed variance (PARSE 100% at the tested complexities,
|
|
378
|
+
PLAN ≥90% at c=10).
|
|
379
|
+
|
|
380
|
+
The PLAN-tier residual is constant-foldable runtime errors PG catches
|
|
381
|
+
eagerly during planning (mostly division by zero through arithmetic that
|
|
382
|
+
folds to a literal zero). They'd be the same errors `EXECUTE` would raise
|
|
383
|
+
on real data.
|
|
384
|
+
|
|
385
|
+
### When to use which
|
|
386
|
+
|
|
387
|
+
- **SYNTAX** is the right default. It catches every grammar error, runs
|
|
388
|
+
in microseconds, and needs no PostgreSQL — install with `[syntax]` and
|
|
389
|
+
call `check_syntax(sql)`.
|
|
390
|
+
- **PARSE** is for verifying that name/type resolution succeeds against a
|
|
391
|
+
real schema. If you're feeding waxsql output into a tool that does its
|
|
392
|
+
own parse-analysis (e.g. a query rewriter), PARSE is the floor your
|
|
393
|
+
fuzzer should clear.
|
|
394
|
+
- **PLAN** adds planner-time errors on top of PARSE. Run it when you want
|
|
395
|
+
to be sure PostgreSQL would actually accept the SQL for execution, not
|
|
396
|
+
just compilation.
|
|
397
|
+
|
|
398
|
+
### Live-DB validation pattern
|
|
399
|
+
|
|
400
|
+
Both PARSE and PLAN need a transaction-mode psycopg connection. The
|
|
401
|
+
canonical shape:
|
|
402
|
+
|
|
403
|
+
```python
|
|
404
|
+
import psycopg
|
|
405
|
+
from waxsql import generate_query, generate_schema, print_query
|
|
406
|
+
from waxsql.validate.parse import check_parse, install_schema
|
|
407
|
+
# from waxsql.validate.plan import check_plan # same shape, runs EXPLAIN
|
|
408
|
+
|
|
409
|
+
schema = generate_schema(seed=0, complexity=10)
|
|
410
|
+
|
|
411
|
+
with psycopg.connect("dbname=scratch", autocommit=False) as conn:
|
|
412
|
+
try:
|
|
413
|
+
install_schema(schema, conn)
|
|
414
|
+
|
|
415
|
+
# Sweep many queries against the same installed schema.
|
|
416
|
+
# Each check_parse savepoints around the PREPARE so a single
|
|
417
|
+
# failure doesn't abort the sweep.
|
|
418
|
+
for query_seed in range(1000):
|
|
419
|
+
q = generate_query(seed=query_seed, schema=schema, complexity=10)
|
|
420
|
+
sql = print_query(q)
|
|
421
|
+
r = check_parse(sql, conn)
|
|
422
|
+
if not r.ok:
|
|
423
|
+
print(f"REJECTED at seed={query_seed}: {r.error}\n{sql}\n")
|
|
424
|
+
finally:
|
|
425
|
+
conn.rollback() # nothing persists
|
|
426
|
+
```
|
|
427
|
+
|
|
428
|
+
The default DSN used by the test suite is `dbname=waxsql_test`; override
|
|
429
|
+
via the `WAXSQL_PG_DSN` environment variable if you want to point the
|
|
430
|
+
test suite at a different cluster.
|
|
431
|
+
|
|
432
|
+
---
|
|
433
|
+
|
|
434
|
+
## Public API reference
|
|
435
|
+
|
|
436
|
+
Unless a submodule is noted, everything documented here is exported from the top-level `waxsql` package.
|
|
437
|
+
Internal helpers are not part of the public surface and may change without
|
|
438
|
+
notice; if you reach into a submodule that isn't listed here, you're on
|
|
439
|
+
your own at upgrade time.
|
|
440
|
+
|
|
441
|
+
### Generation
|
|
442
|
+
|
|
443
|
+
```python
|
|
444
|
+
generate_schema(seed: int, complexity: int = 5) -> Schema
|
|
445
|
+
```
|
|
446
|
+
Generate a random schema. Deterministic in `(seed, complexity)`.
|
|
447
|
+
|
|
448
|
+
```python
|
|
449
|
+
generate_query(seed: int, *, schema: Schema, complexity: int = 5,
|
|
450
|
+
catalog: Optional[Catalog] = None) -> Query
|
|
451
|
+
```
|
|
452
|
+
Generate a random `Query` against `schema`. Deterministic in
|
|
453
|
+
`(seed, schema, complexity, catalog)`. Note the `*` — `schema` is
|
|
454
|
+
**keyword-only**.
|
|
455
|
+
|
|
456
|
+
```python
|
|
457
|
+
generate_schema_with_config(rng: random.Random, cfg: SchemaConfig) -> Schema
|
|
458
|
+
```
|
|
459
|
+
Lower-level entry point that accepts a pre-seeded RNG and a hand-built
|
|
460
|
+
`SchemaConfig`. Use this if `schema_config_for_complexity` doesn't fit
|
|
461
|
+
your needs.
|
|
462
|
+
|
|
463
|
+
```python
|
|
464
|
+
generate_data(schema: Schema, *, seed: int, rows: int = 100,
|
|
465
|
+
fanout: int = 5, null_fraction: float = 0.05) -> str
|
|
466
|
+
```
|
|
467
|
+
Emit one deterministic `COPY ... FROM STDIN` block per table, in
|
|
468
|
+
FK-topological order — a fully self-contained data section to pair with
|
|
469
|
+
`schema.emit_ddl()`. Deterministic in `(schema, seed, rows, fanout,
|
|
470
|
+
null_fraction)`; tables deeper in the FK DAG get `rows * fanout ** depth`
|
|
471
|
+
rows. Raises `ValueError` on an FK-cyclic schema (deferred-constraint
|
|
472
|
+
cycle handling is a known follow-up).
|
|
473
|
+
|
|
474
|
+
### Rendering
|
|
475
|
+
|
|
476
|
+
```python
|
|
477
|
+
print_query(q: Query) -> str
|
|
478
|
+
```
|
|
479
|
+
Render a `Query` AST as a SQL string. Despite the name, this **returns**
|
|
480
|
+
the string; it does not write to stdout. (See [Surprising corners](#surprising-corners).)
|
|
481
|
+
|
|
482
|
+
```python
|
|
483
|
+
print_expr(e: Expr) -> str
|
|
484
|
+
```
|
|
485
|
+
Render a single expression. Useful when debugging the generator.
|
|
486
|
+
|
|
487
|
+
```python
|
|
488
|
+
schema.emit_ddl() -> str
|
|
489
|
+
```
|
|
490
|
+
Method on `Schema`. Returns the full `CREATE TABLE` / `ALTER TABLE` /
|
|
491
|
+
`CREATE INDEX` script for the schema. Tables are emitted first, then
|
|
492
|
+
all foreign keys (so cyclic FK graphs work), then indexes.
|
|
493
|
+
|
|
494
|
+
### Schema model
|
|
495
|
+
|
|
496
|
+
All frozen dataclasses; safely hashable, safely shared.
|
|
497
|
+
|
|
498
|
+
| Symbol | Role |
|
|
499
|
+
|----------------|---------------------------------------------------------------|
|
|
500
|
+
| `Schema` | Top-level container. `tables: tuple[Table, ...]`, `.table(name)`, `.emit_ddl()` |
|
|
501
|
+
| `Table` | Has `name`, `columns`, `foreign_keys`, `indexes` |
|
|
502
|
+
| `Column` | Has `name`, `type` (a `PgType`), `not_null` |
|
|
503
|
+
| `ForeignKey` | Source-column → target-table.id reference |
|
|
504
|
+
| `Index` | Single- or multi-column index spec |
|
|
505
|
+
| `SchemaConfig` | The dial-derived knobs for `generate_schema_with_config` |
|
|
506
|
+
| `quote_ident(name)` | Quote a SQL identifier (always; safe to over-quote) |
|
|
507
|
+
|
|
508
|
+
### Type system
|
|
509
|
+
|
|
510
|
+
| Symbol | Notes |
|
|
511
|
+
|-------------------|----------------------------------------------------------|
|
|
512
|
+
| `PgType` | Frozen dataclass. The atomic unit of the type system. |
|
|
513
|
+
| `TypeCategory` | Enum: NUMERIC / STRING / BOOLEAN / TEMPORAL / etc. |
|
|
514
|
+
| `INT4`, `INT8`, `NUMERIC`, `FLOAT8`, `TEXT`, `VARCHAR`, `BOOL`, `DATE`, `TIMESTAMPTZ`, `INTERVAL`, `UUID`, `JSONB` | Built-in singletons |
|
|
515
|
+
| `SCALAR_TYPES` | Tuple of all built-in scalar `PgType` values |
|
|
516
|
+
| `array_of(t)` | Construct an array type (`int4[]`, `text[]`, etc.) |
|
|
517
|
+
| `implicitly_castable(src, dst)` | Walks the implicit-cast graph |
|
|
518
|
+
|
|
519
|
+
### Catalog
|
|
520
|
+
|
|
521
|
+
| Symbol | Notes |
|
|
522
|
+
|-------------------|----------------------------------------------------------|
|
|
523
|
+
| `Catalog` | Function and operator pools, plus type → producer index |
|
|
524
|
+
| `FuncSig` | Function signature: name, arg types, return type, kind |
|
|
525
|
+
| `OpSig` | Binary/unary operator signature |
|
|
526
|
+
| `FuncKind` | Enum: SCALAR / AGGREGATE / WINDOW |
|
|
527
|
+
| `default_catalog()` | The standard hand-curated catalog used by the generator |
|
|
528
|
+
|
|
529
|
+
### Query AST
|
|
530
|
+
|
|
531
|
+
| Symbol | Notes |
|
|
532
|
+
|-------------------|----------------------------------------------------------|
|
|
533
|
+
| `Query` | Outermost node — wraps a `Select` or `SetOp` |
|
|
534
|
+
| `Select` | A single SELECT statement |
|
|
535
|
+
| `SetOp` | UNION / INTERSECT / EXCEPT combining multiple selects |
|
|
536
|
+
| `SelectTarget` | One entry in the SELECT list (expression + optional alias) |
|
|
537
|
+
| `OrderByItem` | One ORDER BY entry (expression + ASC/DESC + nulls placement) |
|
|
538
|
+
| `FromItem` | Anything that can appear in FROM (table, derived, CTE ref) |
|
|
539
|
+
| `TableRef` | A reference to a base table |
|
|
540
|
+
| `JoinExpr` | An explicit JOIN node |
|
|
541
|
+
| `Expr` (Protocol) | Marker protocol for expression nodes |
|
|
542
|
+
| `ColumnRef`, `Literal`, `FuncCall`, `BinaryOp`, `UnaryOp`, `Cast` | Concrete expression nodes |
|
|
543
|
+
|
|
544
|
+
### Validation
|
|
545
|
+
|
|
546
|
+
| Symbol | Module | Notes |
|
|
547
|
+
|-------------------------------------|-------------------------|--------------------------------|
|
|
548
|
+
| `ValidationMode` | `waxsql.validate` | Enum (NONE / SYNTAX / PARSE / PLAN). Informational only — see surprises. |
|
|
549
|
+
| `check_syntax(sql)` | `waxsql.validate.syntax`| Returns `SyntaxResult`. No DB. |
|
|
550
|
+
| `check_parse(sql, conn)` | `waxsql.validate.parse` | Returns `ParseResult`. PREPARE.|
|
|
551
|
+
| `check_plan(sql, conn)` | `waxsql.validate.plan` | Returns `PlanResult`. EXPLAIN. |
|
|
552
|
+
| `install_schema(schema, conn)` | `waxsql.validate.parse` | DDL deploy for live-DB checks. |
|
|
553
|
+
|
|
554
|
+
All three result types share the shape `(ok: bool, error: Optional[str], ...)`.
|
|
555
|
+
|
|
556
|
+
### Configuration & generator internals
|
|
557
|
+
|
|
558
|
+
These are exported for callers who want to drive the generator directly
|
|
559
|
+
rather than through the canned `complexity` dial.
|
|
560
|
+
|
|
561
|
+
| Symbol | Notes |
|
|
562
|
+
|-----------------------------------|------------------------------------------------|
|
|
563
|
+
| `ComplexityConfig` | Dial-derived knobs for query generation |
|
|
564
|
+
| `SchemaConfig` | Dial-derived knobs for schema generation |
|
|
565
|
+
| `query_config_for_complexity(c)` | Build a `ComplexityConfig` from 0..10 |
|
|
566
|
+
| `schema_config_for_complexity(c)` | Build a `SchemaConfig` from 0..10 |
|
|
567
|
+
| `GenContext` | The per-call state object (rng, scope, schema, catalog, config, depth budgets) |
|
|
568
|
+
| `Scope`, `Binding` | Visible-columns lookup used during generation |
|
|
569
|
+
| `FEATURE_*` | String constants for the feature-flag set |
|
|
570
|
+
|
|
571
|
+
### Module map
|
|
572
|
+
|
|
573
|
+
```
|
|
574
|
+
waxsql/
|
|
575
|
+
├── __init__.py ← public surface; everything in __all__ comes from here
|
|
576
|
+
├── types.py PgType, type categories, implicit cast graph
|
|
577
|
+
├── catalog.py FuncSig, OpSig, default catalog
|
|
578
|
+
├── schema.py Schema model + random generator + DDL emitter
|
|
579
|
+
├── data.py generate_data: deterministic COPY blocks for a schema
|
|
580
|
+
├── ast.py AST dataclasses for queries
|
|
581
|
+
├── printer.py AST → SQL with precedence/parens
|
|
582
|
+
├── pretty.py SQL reformat + color for `gen --pprint`
|
|
583
|
+
├── scope.py Binding stack, visible-columns lookup
|
|
584
|
+
├── context.py GenContext: rng, scope, depth budget, dial
|
|
585
|
+
├── config.py Complexity dial → weights/budgets
|
|
586
|
+
├── cli.py `waxsql` console script (gen / data / validate)
|
|
587
|
+
├── gen/
|
|
588
|
+
│ ├── expr.py Typed expression generator
|
|
589
|
+
│ ├── select.py SELECT/FROM/WHERE/GROUP BY/HAVING/ORDER/LIMIT
|
|
590
|
+
│ ├── subquery.py Scalar / EXISTS / IN subqueries + derived tables
|
|
591
|
+
│ ├── window.py Window function specs (PARTITION/ORDER/FRAME)
|
|
592
|
+
│ ├── cte.py WITH (recursive and not)
|
|
593
|
+
│ ├── setop.py UNION/INTERSECT/EXCEPT
|
|
594
|
+
│ └── data/ Data-generator internals
|
|
595
|
+
│ ├── strategies.py Per-type value strategies + wordlist
|
|
596
|
+
│ ├── columns.py Column-name override registry
|
|
597
|
+
│ ├── rows.py Topo walk + row materialization
|
|
598
|
+
│ └── emit.py COPY block formatting
|
|
599
|
+
└── validate/
|
|
600
|
+
├── __init__.py ValidationMode enum
|
|
601
|
+
├── syntax.py pglast wrapper (no DB)
|
|
602
|
+
├── parse.py PREPARE-based + install_schema
|
|
603
|
+
└── plan.py EXPLAIN-based
|
|
604
|
+
```
|
|
605
|
+
|
|
606
|
+
---
|
|
607
|
+
|
|
608
|
+
## Surprising corners
|
|
609
|
+
|
|
610
|
+
A handful of API choices that are likely to trip up a first-time reader.
|
|
611
|
+
|
|
612
|
+
### `schema=` is keyword-only
|
|
613
|
+
|
|
614
|
+
```python
|
|
615
|
+
generate_query(42, schema=schema) # OK
|
|
616
|
+
generate_query(42, schema) # TypeError: schema is keyword-only
|
|
617
|
+
```
|
|
618
|
+
|
|
619
|
+
The signature uses a `*` to force keyword passing — this is intentional
|
|
620
|
+
so that a future addition of a `complexity` positional argument can't
|
|
621
|
+
silently re-bind in existing call sites.
|
|
622
|
+
|
|
623
|
+
### `print_query` returns a string; it does not print
|
|
624
|
+
|
|
625
|
+
```python
|
|
626
|
+
sql = print_query(q) # CORRECT — capture the return value
|
|
627
|
+
print_query(q) # WRONG-ish — you get the string back but discard it
|
|
628
|
+
```
|
|
629
|
+
|
|
630
|
+
The name is a historical artifact (think "pretty-print") rather than an
|
|
631
|
+
imperative. The whole `print_*` family in `waxsql.printer` is functional:
|
|
632
|
+
they convert AST → string.
|
|
633
|
+
|
|
634
|
+
### Same seed for schema and query is by convention only
|
|
635
|
+
|
|
636
|
+
The schema and query generators have **independent** RNG streams. Passing
|
|
637
|
+
the same seed to both is a useful idiom for "fully reproducible session,"
|
|
638
|
+
but the two seeds are otherwise unrelated:
|
|
639
|
+
|
|
640
|
+
```python
|
|
641
|
+
schema = generate_schema(seed=42, complexity=8)
|
|
642
|
+
|
|
643
|
+
# Same schema, 100 different queries:
|
|
644
|
+
for s in range(100):
|
|
645
|
+
q = generate_query(seed=s, schema=schema, complexity=8)
|
|
646
|
+
...
|
|
647
|
+
|
|
648
|
+
# Same query, but only across schemas built with the same SCHEMA seed:
|
|
649
|
+
schema2 = generate_schema(seed=42, complexity=8) # identical to schema
|
|
650
|
+
q2 = generate_query(seed=7, schema=schema2, complexity=8)
|
|
651
|
+
# q2 == generate_query(seed=7, schema=schema, complexity=8) # True
|
|
652
|
+
```
|
|
653
|
+
|
|
654
|
+
### `ValidationMode` is a label, not a dispatcher
|
|
655
|
+
|
|
656
|
+
```python
|
|
657
|
+
from waxsql import ValidationMode
|
|
658
|
+
# This enum exists, but there is NO `validate(sql, mode=ValidationMode.PARSE)`
|
|
659
|
+
# function. It's classification metadata for callers' own code.
|
|
660
|
+
```
|
|
661
|
+
|
|
662
|
+
To actually run a validation, call the tier-specific function directly:
|
|
663
|
+
`check_syntax(sql)`, `check_parse(sql, conn)`, or `check_plan(sql, conn)`.
|
|
664
|
+
|
|
665
|
+
### Live-DB validators need autocommit OFF
|
|
666
|
+
|
|
667
|
+
```python
|
|
668
|
+
conn = psycopg.connect(dsn, autocommit=True) # WRONG — savepoints don't work
|
|
669
|
+
conn = psycopg.connect(dsn, autocommit=False) # CORRECT
|
|
670
|
+
```
|
|
671
|
+
|
|
672
|
+
`check_parse` and `check_plan` use `SAVEPOINT` / `ROLLBACK TO SAVEPOINT`
|
|
673
|
+
to isolate per-query failures, and savepoints only exist inside a
|
|
674
|
+
transaction. Combine with the `BEGIN` / `install_schema` / `ROLLBACK`
|
|
675
|
+
shape shown above.
|
|
676
|
+
|
|
677
|
+
### `install_schema` lives in `waxsql.validate.parse`, not the top level
|
|
678
|
+
|
|
679
|
+
It's not re-exported from the top-level package because it's only
|
|
680
|
+
meaningful in the live-DB validation context. Import it explicitly:
|
|
681
|
+
|
|
682
|
+
```python
|
|
683
|
+
from waxsql.validate.parse import install_schema
|
|
684
|
+
```
|
|
685
|
+
|
|
686
|
+
### The schema generator never produces composite primary keys
|
|
687
|
+
|
|
688
|
+
Every table's primary key is a single `id BIGINT NOT NULL` column. This is deliberate — composites
|
|
689
|
+
complicate FK matching and JOIN generation in ways that don't earn their
|
|
690
|
+
keep. If you need composite-PK coverage for a particular tool, hand-write
|
|
691
|
+
the schema and feed it into `generate_query` directly.
|
|
692
|
+
|
|
693
|
+
### Table and column names in generated DDL are deterministically random
|
|
694
|
+
|
|
695
|
+
You'll get identifiers like `tbl_a3f2`, not `customer` / `order`. The
|
|
696
|
+
generator is type-driven, not domain-driven; column names are opaque
|
|
697
|
+
on purpose to discourage callers from accidentally encoding semantic
|
|
698
|
+
assumptions about the output.
|
|
699
|
+
|
|
700
|
+
### The CLI's auto-schema header is a convention, not a stable file format
|
|
701
|
+
|
|
702
|
+
`waxsql gen` prefixes its output with `-- waxsql <version> seed=N complexity=X`,
|
|
703
|
+
and `waxsql validate --auto-schema` (on by default) parses that header to
|
|
704
|
+
regenerate the matching schema. The `seed=N` and `complexity=X` keys are
|
|
705
|
+
guaranteed to remain parseable across CLI versions, but the surrounding
|
|
706
|
+
format may grow new fields. If you're piping `gen` output into something
|
|
707
|
+
other than `validate`, treat the header as a SQL comment to strip rather
|
|
708
|
+
than a format to depend on.
|
|
709
|
+
|
|
710
|
+
---
|
|
711
|
+
|
|
712
|
+
## Development
|
|
713
|
+
|
|
714
|
+
```bash
|
|
715
|
+
# Editable install with everything (test deps, lint, type-checker, validators).
|
|
716
|
+
pip install -e '.[dev]'
|
|
717
|
+
|
|
718
|
+
# Full test suite. ~70s with PARSE/PLAN tiers if a PG is reachable;
|
|
719
|
+
# the live-DB tiers skip cleanly if not.
|
|
720
|
+
pytest
|
|
721
|
+
|
|
722
|
+
# SYNTAX-tier only — fast, no PG needed.
|
|
723
|
+
pytest --ignore=tests/test_parse.py --ignore=tests/test_plan.py
|
|
724
|
+
|
|
725
|
+
# Lint and type-check (CI runs both on every push).
|
|
726
|
+
ruff check waxsql tests
|
|
727
|
+
mypy waxsql
|
|
728
|
+
|
|
729
|
+
# Quick smoke test from the shell:
|
|
730
|
+
python -c "from waxsql import generate_schema; print(generate_schema(42, 6).emit_ddl())"
|
|
731
|
+
```
|
|
732
|
+
|
|
733
|
+
The default DSN for live-DB tests is `dbname=waxsql_test`; override via
|
|
734
|
+
`WAXSQL_PG_DSN` (e.g. `WAXSQL_PG_DSN='host=localhost port=5433 dbname=fuzz' pytest`).
|
|
735
|
+
|
|
736
|
+
The release procedure is documented in [`RELEASING.md`](RELEASING.md).
|
|
737
|
+
The architecture and design rationale, including the choices behind the
|
|
738
|
+
type-driven approach and the determinism discipline, are in
|
|
739
|
+
[`ARCHITECTURE.md`](ARCHITECTURE.md). Possible future directions are in
|
|
740
|
+
[`FUTURE.md`](FUTURE.md).
|
|
741
|
+
|
|
742
|
+
---
|
|
743
|
+
|
|
744
|
+
## License
|
|
745
|
+
|
|
746
|
+
MIT.
|