waxsql 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waxsql/context.py ADDED
@@ -0,0 +1,255 @@
1
+ """Generation context — the bag of state threaded through every
2
+ generator call.
3
+
4
+ A `GenContext` holds:
5
+
6
+ * `rng` — the random.Random instance. Shared across the whole
7
+ generation run; advancing it in one call affects what later calls
8
+ see, which is exactly what makes the (seed → output) mapping
9
+ deterministic.
10
+
11
+ * `scope` — the current binding scope. Determines which columns are
12
+ visible at expression generation time.
13
+
14
+ * `schema`, `catalog` — read-only references to the world. Passed
15
+ through so generators can look up tables, FKs, functions,
16
+ operators without re-importing globals.
17
+
18
+ * `config` — the ComplexityConfig. Tells generators how aggressive
19
+ to be (how many FROM items, how deep, which features are unlocked).
20
+
21
+ * `depth_remaining` — the depth budget. Decremented on each
22
+ recursive descent; at zero, generators are required to pick leaf
23
+ productions only. This is what guarantees termination.
24
+
25
+ * Expression-context flags (`in_aggregate`, `allow_aggregates`,
26
+ `allow_window`, `allow_subquery`) — track what kind of expression
27
+ we're inside, gating which production candidates the expression
28
+ generator considers at each call site.
29
+
30
+ `GenContext` is frozen. To "modify" it (descend into a sub-expression,
31
+ swap scope for a subquery, flip an aggregate flag), use `replace(...)`
32
+ or one of the helper methods like `descend()`. Forcing all changes
33
+ through replace makes "use this for one descent, then revert" the
34
+ default control-flow shape, which prevents whole categories of
35
+ "forgot to reset the flag" bugs.
36
+
37
+ Why this is threaded explicitly through every call instead of being
38
+ held in a module-level / thread-local / context-var slot:
39
+
40
+ * Testability. A test can build a GenContext directly with a
41
+ hand-chosen rng/scope/config and call a generator function in
42
+ isolation. Thread-local state would force tests through a
43
+ setup-fixture-then-call shape that's harder to read and easier
44
+ to leak between tests.
45
+
46
+ * Parallelism. Multiple generation runs (e.g. parallel fuzzing
47
+ workers in one process) can interleave without contending on
48
+ a shared context slot. Each worker holds its own GenContext.
49
+
50
+ * Reproducibility. The state that controls "what gets generated"
51
+ is visible in the function signature. Reading a generator and
52
+ tracing what's in scope doesn't require knowing about an
53
+ ambient module-level variable that some other call mutated.
54
+
55
+ The cost is verbosity — `ctx` shows up in nearly every signature.
56
+ That's a price worth paying for the property that "given this ctx,
57
+ this function does X" is true regardless of where it's called from.
58
+ """
59
+ from __future__ import annotations
60
+
61
+ import random
62
+ from dataclasses import dataclass, field, replace
63
+
64
+ from .catalog import Catalog
65
+ from .config import ComplexityConfig
66
+ from .schema import Schema
67
+ from .scope import Scope
68
+
69
+
70
@dataclass
class Counter:
    """Mutable monotonic counter used as a cell on the frozen GenContext.

    Replaces the bare `list[int]`-of-one idiom previously used for alias
    and CTE numbering. The point is type-visibility: `alias_counter:
    Counter` says "shared mutable counter" outright, and an accidental
    reset via `dataclasses.replace(ctx, alias_counter=Counter())` reads
    as obviously suspicious in a way `alias_counter=[1]` never did — that
    silent-reset footgun is exactly what this guards against. Determinism
    depends on the produced value sequence being identical across runs,
    so the only operation is a monotonic advance.
    """

    # Next value to hand out. Starts at 1, so the first `take()` returns 1.
    value: int = 1

    def take(self, n: int = 1) -> int:
        """Return the current value, then advance by `n` (default 1).

        The returned value names an alias or CTE; because `replace()`
        preserves this same Counter instance (not a copy), the
        post-advance state is immediately visible to sibling generators
        that share the context — which is what keeps sibling subqueries
        from picking colliding alias/CTE numbers.

        Args:
            n: how many values to reserve. `0` reads the current value
                without advancing.

        Returns:
            The value the counter held before the advance.

        Raises:
            ValueError: if `n` is negative. A negative step would rewind
                the counter and re-issue numbers that were already
                handed out, breaking the monotonic guarantee.
        """
        if n < 0:
            raise ValueError(f"Counter.take: n must be >= 0, got {n}")
        current = self.value
        self.value += n
        return current
98
+
99
+
100
@dataclass(frozen=True)
class GenContext:
    """Generation state passed through the recursive generator calls.

    Frozen — see module docstring. The contained `rng` and `scope`
    objects are themselves mutable; that's intentional. The frozen
    wrapper protects against re-binding the field, not against the
    pointed-to object's natural mutation.
    """

    rng: random.Random
    scope: Scope
    schema: Schema
    catalog: Catalog
    config: ComplexityConfig
    depth_remaining: int

    # Expression-context flags. Defaults reflect "outermost SELECT
    # body": no aggregate context active, but aggregates are allowed
    # (in the SELECT list and HAVING; the SELECT generator flips the
    # `allow_aggregates` flag off when descending into WHERE).
    in_aggregate: bool = False
    allow_aggregates: bool = True
    allow_window: bool = False  # True in SELECT-list expr generation
    in_window: bool = False  # True inside a window FuncCall's args
    allow_subquery: bool = True
    # True iff the current ctx is a body of a correlated subquery
    # (or LATERAL derived table) — anything where the inner can see
    # outer columns via the parent-chain walk in scope. When True
    # AND in_aggregate=True, gen_expr restricts column-ref candidates
    # to the LOCAL scope only — outer-column refs inside aggregates
    # trigger PG's implicit-grouping inference (PARSE-tier 42803).
    in_correlated_subquery: bool = False
    # Whether a WITH clause may be generated at this gen_select call.
    # Top-level queries default to True; descend_subquery flips it
    # off so subqueries / derived tables / CTE bodies don't generate
    # their own nested WITHs (PG allows nested WITH but milestone 5
    # keeps WITH top-level only — defer nested to a later milestone).
    allow_with: bool = True

    # Subquery-nesting budget. 0 (the default) means "no subqueries
    # allowed", so code that builds a GenContext without thinking
    # about subqueries gets the safe behavior. The public
    # generate_query sets this from cfg.max_subquery_depth.
    #
    # Counted separately from depth_remaining because subquery
    # nesting is a different rationing dimension — a deep
    # `a + b + c` expression shouldn't share a budget with
    # `(SELECT ... WHERE x IN (SELECT ...))`.
    subquery_depth_remaining: int = 0

    # Mutable monotonic counter for FROM-clause alias generation.
    # Shared across all GenContexts in a single query (replace()
    # preserves the same Counter instance), so sibling subqueries
    # don't pick colliding aliases. See the Counter docstring for why
    # this is a dedicated type rather than a bare list-of-one cell.
    #
    # Generator-internal: not part of the public API. Tests that
    # build GenContext directly can ignore it.
    alias_counter: Counter = field(default_factory=Counter)

    # Companion monotonic counter for CTE-name generation. Separate
    # from alias_counter because CTE names live in a different
    # namespace (lookup_cte vs lookup_alias) — combining them would
    # interleave `cte1, cte5, ...` with `t2, t3, t4, t6, ...` for
    # no semantic benefit. Same shared-mutable-cell mechanism so
    # nested subqueries' CTE names don't collide either.
    cte_counter: Counter = field(default_factory=Counter)

    def descend(self, cost: int = 1) -> "GenContext":
        """Return a copy with `depth_remaining` decremented by `cost`.

        Every recursive expression-generator call must descend by at
        least 1; that's what enforces termination. Higher-cost
        productions can pass `cost > 1` to penalize expensive shapes
        (e.g. function calls more than column refs); current callers
        all use the default cost of 1, but the parameter is there for
        when biasing depth budget by production cost becomes useful.

        Note this only decrements the expression-depth budget. Use
        `descend_subquery` when entering a SELECT body — that's a
        different rationing dimension and inherits-or-resets a
        different set of flags.

        Raises:
            ValueError: if `cost` is less than 1. A zero or negative
                cost would leave the budget unchanged (or grow it),
                which defeats the termination guarantee above.
        """
        if cost < 1:
            raise ValueError(f"descend cost must be >= 1, got {cost}")
        return replace(self, depth_remaining=self.depth_remaining - cost)

    def at_leaf(self) -> bool:
        """True iff the depth budget is exhausted; generators must
        pick leaf productions only.

        Centralizing this check makes the budget rule one line in
        the generator instead of an inline `<= 0` test scattered
        across files.
        """
        return self.depth_remaining <= 0

    def at_subquery_leaf(self) -> bool:
        """True iff no further subqueries can be opened. Used by
        gen_expr to gate subquery candidates the same way at_leaf
        gates recursive expression productions."""
        return self.subquery_depth_remaining <= 0

    def descend_subquery(self, *, correlated: bool) -> "GenContext":
        """Return a child context for entering a subquery body.

        The child context gets:

          * a fresh child scope (correlated/uncorrelated per arg),
            so inner FROM tables don't pollute the outer scope and
            outer columns are visible only when correlated;
          * decremented `subquery_depth_remaining`;
          * fresh `depth_remaining` (reset to
            `config.max_expr_depth`) — the subquery's expression tree
            starts with a full budget, not whatever's left of the
            outer expression's;
          * `allow_aggregates=True`, `in_aggregate=False` — the
            subquery is its own expression context, so the outer
            "no aggregates here" constraint doesn't propagate;
          * `allow_with=False` — no nested WITH inside a subquery;
          * `allow_window=False`, `in_window=False` — window state
            from the parent must not leak into the child;
          * `in_correlated_subquery` set to the `correlated` argument.

        The shared rng / schema / catalog / config remain shared, as do
        the alias and CTE counters (replace() keeps the same instances).
        `allow_subquery` is not overridden here, so it carries over
        from the parent unchanged.

        IMPORTANT — this method is the single chokepoint for entering
        a child query body. Every expression-context flag must be
        explicitly considered here: inheriting an outer flag through
        replace()'s "preserve unspecified fields" default is exactly
        the bug pattern this central method exists to prevent (e.g.
        an inherited allow_window=True would leak window functions
        into a subquery's WHERE clause). When adding a new flag to
        GenContext, decide here whether it propagates or resets
        before doing anything else.
        """
        # NOTE(review): this method does not check that
        # subquery_depth_remaining is still positive; callers are
        # presumably expected to gate on at_subquery_leaf() first —
        # confirm before relying on the budget never going negative.
        return replace(
            self,
            scope=self.scope.push_subquery(correlated=correlated),
            subquery_depth_remaining=self.subquery_depth_remaining - 1,
            depth_remaining=self.config.max_expr_depth,
            allow_aggregates=True,
            in_aggregate=False,
            # No nested WITH inside a subquery — milestone 5 keeps
            # WITH top-level only.
            allow_with=False,
            # Window flags reset too. allow_window inherited as True
            # from a SELECT-list-context parent would leak windows
            # into the SUBQUERY's WHERE/HAVING/ON clauses (PARSE-tier
            # error 42P20 "window functions are not allowed in WHERE").
            # The subquery's gen_select will independently flip
            # allow_window=True for ITS own SELECT-list expressions.
            allow_window=False,
            in_window=False,
            # Track whether THIS ctx is a correlated-subquery body.
            # Used by gen_expr's aggregate-arg column-ref candidate
            # restriction (see in_correlated_subquery field doc).
            in_correlated_subquery=correlated,
        )
253
+
254
+
255
+ __all__ = ["GenContext"]
waxsql/data.py ADDED
@@ -0,0 +1,99 @@
1
+ """Public API for the data generator.
2
+
3
+ `generate_data(schema, *, seed, rows, fanout, null_fraction)` walks the
4
+ schema's FK DAG topologically and produces a string containing one
5
+ `COPY ... FROM STDIN; ...; \\.` block per table, in dependency order.
6
+
7
+ Determinism contract: same (schema, seed, rows, fanout, null_fraction)
8
+ produces byte-identical output across Python versions. The schema and
9
+ data generators each construct their own `random.Random(seed)`, so
10
+ they share no RNG state even when given the same seed.
11
+
12
+ Role in the system: this is the thin orchestration layer. The heavy
13
+ lifting (topological walk, per-row dispatch, COPY text formatting,
14
+ strategy registries) lives under `waxsql.gen.data.*`; this module
15
+ just stitches them together and owns the public function signature.
16
+ Keeping the public surface here means callers can `from waxsql import
17
+ generate_data` without ever touching the internal package layout.
18
+ """
19
+ from __future__ import annotations
20
+
21
+ import random
22
+
23
+ from waxsql.gen.data.emit import emit_copy_block
24
+ from waxsql.gen.data.rows import (
25
+ generate_row,
26
+ rows_for_table,
27
+ topological_order,
28
+ )
29
+ from waxsql.schema import Schema
30
+
31
+
32
def generate_data(
    schema: Schema,
    *,
    seed: int,
    rows: int = 100,
    fanout: int = 5,
    null_fraction: float = 0.05,
) -> str:
    """Render one COPY block per table of `schema`, parents before children.

    Tables are visited in the order produced by `topological_order`.
    `rows` is the base row count and `fanout` the per-level multiplier;
    the actual per-table count comes from `rows_for_table` (documented
    upstream as `rows * fanout ** depth`). `null_fraction` is forwarded
    to `generate_row` as the per-nullable-column NULL probability.
    Identical arguments and schema yield byte-identical output, because
    a single `random.Random(seed)` drives every row in a fixed order.

    A table's primary keys are published to `id_store` only once all of
    its rows exist. While its own rows are being generated the table is
    therefore absent from the store; self-referencing FKs are resolved
    inside `generate_row` (via `rng.randint(1, pk)`) rather than through
    `id_store`.

    Raises ValueError if the schema contains an FK cycle. Cycle handling
    (deferred constraints + UPDATE-patches) is a known follow-up; today
    the CLI catches this and surfaces a clean usage error.
    """
    # Single RNG for the whole run. It is private to the data generator:
    # the schema generator builds its own `random.Random(seed)`, so the
    # two streams are independent even under the same seed.
    rng = random.Random(seed)
    # table name -> PKs of that table; consumed by later (child) tables
    # when they fill in FK columns. Written exactly once per table.
    id_store: dict[str, list[int]] = {}
    rendered: list[str] = []
    for table in topological_order(schema):
        row_count = rows_for_table(table, schema, base=rows, fanout=fanout)
        # PKs are the dense sequence 1..row_count. The schema emits
        # `id BIGINT NOT NULL` without a sequence, so numbering is owned
        # here — no gaps, no collisions.
        pks = list(range(1, row_count + 1))
        # Rows are produced strictly in PK order so the RNG is consumed
        # in the same order on every run.
        table_rows: list[tuple[object, ...]] = [
            generate_row(
                table=table,
                pk=pk,
                rng=rng,
                id_store=id_store,
                null_fraction=null_fraction,
            )
            for pk in pks
        ]
        # Publish the PKs only now, after every row of this table has
        # been materialized. This ordering is load-bearing: the current
        # table must stay out of id_store during its own row loop, and
        # children only run later in topological order.
        id_store[table.name] = pks
        column_names = tuple(col.name for col in table.columns)
        rendered.append(emit_copy_block(table.name, column_names, table_rows))
    # Blocks are joined with a single "\n" so callers can split/process
    # them; each block already ends with "\.\n" so adjacent blocks are
    # separated by a blank line in the rendered output.
    return "\n".join(rendered)
waxsql/gen/__init__.py ADDED
@@ -0,0 +1,51 @@
1
+ """Query-generation entry points.
2
+
3
+ Role: the public surface of the `gen/` subpackage. Re-exports a flat
4
+ set of generator functions so callers (validators, tests, the CLI)
5
+ can `from waxsql.gen import gen_select` without touching the internal
6
+ file layout. The submodules themselves still import each other
7
+ directly — the re-exports here are for *external* convenience, not
8
+ intra-package wiring.
9
+
10
+ The package exposes a flat set of generator functions, one per
11
+ SQL surface. The two foundational ones are:
12
+
13
+ * `expr.gen_expr(ctx, target_type)` — typed expression generator.
14
+ The recursive core; every other generator that needs to emit an
15
+ expression goes through this.
16
+
17
+ * `select.gen_select(ctx)` — SELECT-statement generator. Picks a
18
+ FROM clause, populates the scope, then asks `gen_expr` for each
19
+ select-list / WHERE / ORDER BY expression.
20
+
21
+ The rest (`subquery`, `cte`, `setop`, `window`) are full peers, each
22
+ emitting a different SQL surface but all called from `gen_select` (or
23
+ recursively from each other) for their part of the tree.
24
+
25
+ CONTRACT: these generators MUTATE state reachable from `ctx` as they run (the GenContext itself is frozen). The rng advances
26
+ deterministically (intentional — that's the determinism guarantee),
27
+ but `ctx.scope`, `ctx.alias_counter`, and `ctx.cte_counter`
28
+ also get mutated as tables are added to the FROM clause and aliases
29
+ are minted. Callers must NOT assume `ctx` is unchanged across a
30
+ generator call. The `descend_subquery` chokepoint on GenContext is
31
+ where children get a fresh scope so cross-arm mutations don't
32
+ collide; production code always goes through it.
33
+ """
34
+ from .cte import gen_cte_def, gen_recursive_cte_def
35
+ from .expr import gen_expr, gen_literal
36
+ from .select import gen_select
37
+ from .setop import gen_setop
38
+ from .subquery import (
39
+ gen_derived_table, gen_exists_subquery, gen_in_subquery,
40
+ gen_scalar_subquery,
41
+ )
42
+ from .window import gen_window_spec
43
+
44
+ __all__ = [
45
+ "gen_expr", "gen_literal", "gen_select",
46
+ "gen_scalar_subquery", "gen_exists_subquery", "gen_in_subquery",
47
+ "gen_derived_table",
48
+ "gen_cte_def", "gen_recursive_cte_def",
49
+ "gen_window_spec",
50
+ "gen_setop",
51
+ ]