waxsql 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waxsql/__init__.py +158 -0
- waxsql/ast.py +757 -0
- waxsql/catalog.py +363 -0
- waxsql/cli.py +888 -0
- waxsql/config.py +477 -0
- waxsql/context.py +255 -0
- waxsql/data.py +99 -0
- waxsql/gen/__init__.py +51 -0
- waxsql/gen/cte.py +367 -0
- waxsql/gen/data/__init__.py +14 -0
- waxsql/gen/data/columns.py +48 -0
- waxsql/gen/data/emit.py +247 -0
- waxsql/gen/data/rows.py +236 -0
- waxsql/gen/data/strategies.py +299 -0
- waxsql/gen/expr.py +723 -0
- waxsql/gen/select.py +831 -0
- waxsql/gen/setop.py +259 -0
- waxsql/gen/subquery.py +397 -0
- waxsql/gen/window.py +398 -0
- waxsql/pretty.py +81 -0
- waxsql/printer.py +688 -0
- waxsql/py.typed +0 -0
- waxsql/schema.py +557 -0
- waxsql/scope.py +391 -0
- waxsql/types.py +187 -0
- waxsql/validate/__init__.py +52 -0
- waxsql/validate/parse.py +194 -0
- waxsql/validate/plan.py +149 -0
- waxsql/validate/syntax.py +87 -0
- waxsql-1.0.0.dist-info/METADATA +746 -0
- waxsql-1.0.0.dist-info/RECORD +35 -0
- waxsql-1.0.0.dist-info/WHEEL +5 -0
- waxsql-1.0.0.dist-info/entry_points.txt +2 -0
- waxsql-1.0.0.dist-info/licenses/LICENSE +21 -0
- waxsql-1.0.0.dist-info/top_level.txt +1 -0
waxsql/gen/cte.py
ADDED
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
"""CTE definition generator.
|
|
2
|
+
|
|
3
|
+
Role: produces the body of one WITH-clause entry. Two public entry
|
|
4
|
+
points — `gen_cte_def` for plain CTEs, `gen_recursive_cte_def` for
|
|
5
|
+
the rigidly-shaped `WITH RECURSIVE` form.
|
|
6
|
+
|
|
7
|
+
INVARIANT on recursive CTEs: the structure is non-negotiable —
|
|
8
|
+
non-recursive "anchor" UNION ALL recursive "step", with the
|
|
9
|
+
self-reference appearing only in the step. Any other shape (anchor
|
|
10
|
+
on the right, no UNION, self-reference in the anchor) is a PG parse
|
|
11
|
+
error. The two-arm builder pair below enforces the shape by
|
|
12
|
+
construction; the printer just renders it.
|
|
13
|
+
|
|
14
|
+
Plain-CTE entry point: `gen_cte_def(ctx, name)` builds one
|
|
15
|
+
CteDef. The orchestration of multi-CTE WITH clauses lives in
|
|
16
|
+
`gen/select.py` (`_gen_with_clause`), which calls `gen_cte_def` in
|
|
17
|
+
a loop and registers each result in the parent scope before
|
|
18
|
+
generating the next — that's what lets later CTEs reference earlier
|
|
19
|
+
ones without forward references.
|
|
20
|
+
|
|
21
|
+
A CTE body is a full SELECT generated via `gen_select` on a child
|
|
22
|
+
context, with a few constraints:
|
|
23
|
+
|
|
24
|
+
* `descend_subquery(correlated=False)` — CTE bodies are
|
|
25
|
+
self-contained; they don't reference outer-query *columns*. They
|
|
26
|
+
DO see outer *CTEs* via lookup_cte's unconditional parent-chain
|
|
27
|
+
walk, but that's a separate visibility rule.
|
|
28
|
+
|
|
29
|
+
* `allow_with=False` (set by descend_subquery) — milestone 5
|
|
30
|
+
keeps WITHs top-level only; the CTE body can't have its own
|
|
31
|
+
nested WITH.
|
|
32
|
+
|
|
33
|
+
* 1..N-target output with explicit `cN` aliases on the targets —
|
|
34
|
+
same predictable-column-resolution pattern as derived tables.
|
|
35
|
+
|
|
36
|
+
The lazy import of `gen_select` is the standard cycle-breaking
|
|
37
|
+
trick: `gen_select` calls into `gen_cte_def` (via
|
|
38
|
+
`_gen_with_clause`), and `gen_cte_def` calls `gen_select` for the
|
|
39
|
+
inner body, so a top-level import at either end would close the
|
|
40
|
+
loop. Lazy import inside the function keeps both modules importable
|
|
41
|
+
in either order.
|
|
42
|
+
"""
|
|
43
|
+
from __future__ import annotations
|
|
44
|
+
|
|
45
|
+
from dataclasses import replace
|
|
46
|
+
|
|
47
|
+
from ..ast import (
|
|
48
|
+
BinaryOp, ColumnRef, CteCycle, CteDef, CteRef, CteSearch, Expr,
|
|
49
|
+
FromItem, Literal, Select, SelectTarget, SetOp, TableRef,
|
|
50
|
+
)
|
|
51
|
+
from ..context import GenContext
|
|
52
|
+
from ..types import BOOL, INT4, INT8, NUMERIC, PgType, TEXT, TIMESTAMPTZ
|
|
53
|
+
from .expr import gen_expr
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Chance of attaching the optional SEARCH / CYCLE clause to a recursive
# CTE. Each clause gets its own independent roll, so a CTE may carry
# both, either, or neither. Bare recursive CTEs are the usual real-world
# shape, hence the moderate-low bias.
_P_RECURSIVE_SEARCH: float = 0.3
_P_RECURSIVE_CYCLE: float = 0.3


# Column types a recursive CTE may expose. Limited to types where
# arithmetic-style recursion makes sense (counter increments, string
# concatenation paths). Both arms of the recursive UNION ALL must agree
# on every column type, and both draw from this pool. gen_cte_def also
# draws the types of its synthetic extra columns from here.
_RECURSIVE_COLUMN_TYPES: tuple[PgType, ...] = (
    INT4,
    INT8,
    NUMERIC,
    TEXT,
)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def gen_cte_def(
    ctx: GenContext,
    name: str,
) -> tuple[CteDef, list[tuple[str, PgType]]]:
    """Build one non-recursive CTE definition called `name`.

    Returns `(cte_def, columns)`: the AST node to place in the WITH
    list, and the `(alias, type)` pairs the caller registers in the
    enclosing scope so later CTEs and the main query can resolve
    `name.cN` references.

    The body starts as a SELECT produced by `gen_select` on a
    non-correlated child context. Its first target becomes `c1`. A
    random number of additional targets (`c2`, `c3`, ...) may then be
    synthesized here, typed from `_RECURSIVE_COLUMN_TYPES` and generated
    in the same child context so they can see the same FROM bindings.
    The total column count is 1..`config.max_cte_columns`.

    Extra targets are skipped whenever the generated SELECT has a
    non-empty GROUP BY: plain column references added next to a grouped
    target would break PostgreSQL's "non-aggregate targets must appear
    in GROUP BY" rule (SQLSTATE 42803).
    """
    # Deferred import: gen/select.py reaches this function through its
    # WITH-clause builder, so importing gen_select at module level would
    # close an import cycle.
    from .select import gen_select

    # correlated=False: a CTE body does not reference outer-query
    # columns. CTE-to-CTE visibility is a separate mechanism (CTE
    # lookup), not column visibility.
    body_ctx = ctx.descend_subquery(correlated=False)
    body = gen_select(body_ctx)

    # randint bounds are inclusive, so drawing 0..max-1 extras on top of
    # the one existing target yields 1..max columns in total.
    # NOTE(review): only a non-empty GROUP BY counts as aggregate mode
    # here; presumably gen_select never emits an ungrouped aggregate
    # target for a CTE body — confirm.
    if body.group_by:
        extra_count = 0
    else:
        extra_count = body_ctx.rng.randint(
            0, body_ctx.config.max_cte_columns - 1
        )

    extras: list[Expr] = []
    if extra_count > 0:
        plain_ctx = replace(body_ctx, allow_aggregates=False)
        # All types are drawn before any expression is generated; this
        # fixes the order in which the shared RNG is consumed.
        drawn_types = [
            body_ctx.rng.choice(_RECURSIVE_COLUMN_TYPES)
            for _ in range(extra_count)
        ]
        for pg_type in drawn_types:
            extras.append(gen_expr(plain_ctx, pg_type))

    # Original target first (c1), synthetic extras after it (c2, c3, ...).
    output_exprs = [body.targets[0].expr, *extras]

    aliased_targets: list[SelectTarget] = []
    for position, expr in enumerate(output_exprs, start=1):
        aliased_targets.append(SelectTarget(expr=expr, alias=f"c{position}"))
    aliased_body = replace(body, targets=tuple(aliased_targets))

    columns = [
        (f"c{position}", expr.pg_type)
        for position, expr in enumerate(output_exprs, start=1)
    ]
    return CteDef(name=name, select=aliased_body), columns
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def gen_recursive_cte_def(
    ctx: GenContext,
    name: str,
) -> tuple[CteDef, list[tuple[str, PgType]]]:
    """Build a recursive CTE: `name AS (anchor UNION ALL step)`.

    The body is a two-arm `SetOp`:

      * Anchor (base) arm: a SELECT with N targets aliased c1..cN whose
        types are drawn from `_RECURSIVE_COLUMN_TYPES`. It is built
        before `name` is registered, so it cannot reference the CTE.

      * Step (recursive) arm: a SELECT whose FROM clause contains a
        CteRef to `name`, with targets matching the anchor's types
        position by position.

    Side effect: `name` is registered in `ctx.scope` by this function,
    between the two arm builds. The caller must not register it again;
    it can tell by checking `cte_def.recursive`.

    Returns the same `(CteDef, columns)` shape as `gen_cte_def`.
    """
    rng = ctx.rng

    # Column count and types are fixed up front so both arms agree on
    # arity and per-position type, as UNION ALL requires.
    column_count = rng.randint(1, ctx.config.max_cte_columns)
    target_types = tuple(
        rng.choice(_RECURSIVE_COLUMN_TYPES) for _ in range(column_count)
    )
    columns = [
        (f"c{position}", pg_type)
        for position, pg_type in enumerate(target_types, start=1)
    ]

    # Anchor arm: fresh, non-correlated subquery context.
    anchor_ctx = ctx.descend_subquery(correlated=False)
    anchor = _build_recursive_arm_base(anchor_ctx, target_types)

    # ORDERING DEPENDENCY: register after the anchor (so the anchor
    # cannot see its own name) and before the step (so the step can
    # resolve it).
    ctx.scope.add_cte(name, columns)

    # Step arm: forced self-reference in FROM.
    step_ctx = ctx.descend_subquery(correlated=False)
    step = _build_recursive_arm_self_ref(step_ctx, name, target_types)

    union_body = SetOp(op="UNION", all=True, arms=(anchor, step))

    # Optional SEARCH and CYCLE clauses, one independent roll each. Both
    # refer to c1 only. Their synthetic output columns (search_seq,
    # is_cycle, cycle_path) are intentionally left out of `columns`, so
    # outer queries never reference them by name.
    search = None
    if rng.random() < _P_RECURSIVE_SEARCH:
        breadth_first = rng.random() < 0.5
        search = CteSearch(
            breadth_first=breadth_first,
            by_columns=("c1",),
            set_column="search_seq",
        )

    cycle = None
    if rng.random() < _P_RECURSIVE_CYCLE:
        cycle = CteCycle(
            columns=("c1",),
            cycle_mark_column="is_cycle",
            path_column="cycle_path",
        )

    cte_def = CteDef(
        name=name,
        select=union_body,
        recursive=True,
        search=search,
        cycle=cycle,
    )
    return cte_def, columns
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _build_recursive_arm_base(
    ctx: GenContext,
    target_types: tuple[PgType, ...],
) -> Select:
    """Build the anchor arm of a recursive CTE.

    Produces a SELECT with a generated FROM clause and one target per
    entry of `target_types`, aliased c1..cN in order. The arm carries no
    self-reference: the caller builds it before the CTE name is
    registered.
    """
    # Deferred import, presumably for the same import-cycle reason as
    # the gen_select import in gen_cte_def.
    from .select import _gen_from_clause

    # FROM is generated first so the target expressions can use its
    # bindings.
    from_clause = _gen_from_clause(ctx)

    # Aggregates are disabled: the anchor is a plain, non-aggregate
    # SELECT, matching the shape of the recursive arm.
    plain_ctx = replace(ctx, allow_aggregates=False)

    anchor_targets: list[SelectTarget] = []
    for position, pg_type in enumerate(target_types, start=1):
        expr = gen_expr(plain_ctx, pg_type)
        anchor_targets.append(SelectTarget(expr=expr, alias=f"c{position}"))

    return Select(targets=tuple(anchor_targets), from_=from_clause)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _build_recursive_arm_self_ref(
    ctx: GenContext,
    cte_name: str,
    target_types: tuple[PgType, ...],
) -> Select:
    """Build the recursive (step) arm of a recursive CTE.

    The returned SELECT always has a CteRef to `cte_name` in its FROM
    clause, which is the forced self-reference. Target cN is a column
    reference to the self-reference's cN, so the step arm's types match
    `target_types` by construction.

    With probability 0.7 one randomly chosen schema table is added to
    FROM next to the self-reference (comma join), mirroring the common
    "CTE plus driver table" shape.

    The WHERE clause is `self.c1 < <typed literal>`, with the literal
    supplied by `_typed_terminator_literal`.
    """
    rng = ctx.rng

    # Self-reference: fresh tN alias from the shared counter, with all N
    # columns registered under that alias so they are resolvable in
    # this scope. This arm is the only place inside the WITH entry where
    # a CteRef to `cte_name` is emitted.
    self_alias = f"t{ctx.alias_counter.take()}"
    self_ref = CteRef(cte_name=cte_name, alias=self_alias)
    self_columns = [
        (f"c{position}", pg_type)
        for position, pg_type in enumerate(target_types, start=1)
    ]
    ctx.scope.add_derived(self_alias, self_columns)

    # Optional driver table next to the self-reference.
    from_items: list[FromItem] = [self_ref]
    if rng.random() < 0.7:
        driver_table = rng.choice(ctx.schema.tables)
        driver_alias = f"t{ctx.alias_counter.take()}"
        ctx.scope.add_table(driver_alias, driver_table)
        from_items.append(TableRef(driver_table.name, driver_alias))

    # Each output column simply re-projects the matching self column.
    # That guarantees the positional type match UNION ALL needs.
    step_targets: list[SelectTarget] = []
    for position, pg_type in enumerate(target_types, start=1):
        column_name = f"c{position}"
        step_targets.append(
            SelectTarget(
                expr=ColumnRef(pg_type, self_alias, column_name),
                alias=column_name,
            )
        )

    # Predicate on self.c1 against a typed bound.
    # NOTE(review): the targets re-project self columns unchanged, so a
    # row that satisfies this predicate is emitted again unchanged on
    # the next iteration. The bound may therefore not limit recursion
    # depth if the SQL is ever executed — confirm it is only
    # parsed/planned.
    first_type = target_types[0]
    where: Expr = BinaryOp(
        BOOL,
        "<",
        ColumnRef(first_type, self_alias, "c1"),
        _typed_terminator_literal(first_type),
    )

    return Select(
        targets=tuple(step_targets),
        from_=tuple(from_items),
        where=where,
    )
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _typed_terminator_literal(t: PgType) -> Literal:
    """Return a `Literal` of type `t` to use as a recursion bound.

    This is the right-hand side of the recursive arm's `c1 < ...`
    predicate. Integer types get 1000, NUMERIC gets 1000.0, TEXT gets
    "zzz", and TIMESTAMPTZ gets a fixed end-of-2024 timestamp string.
    Any other type gets a NULL literal.
    """
    if t in (INT4, INT8):
        bound: object = 1000
    elif t == NUMERIC:
        bound = 1000.0
    elif t == TEXT:
        bound = "zzz"
    elif t == TIMESTAMPTZ:
        bound = "2024-12-31 00:00:00+00"
    else:
        # Tripwire: the caller draws c1's type from
        # _RECURSIVE_COLUMN_TYPES, whose members are all handled above,
        # so this is not expected to run. If the pool grows, the
        # predicate becomes `c1 < NULL`, which is never true, so the
        # step arm yields no rows and the SQL stays valid.
        bound = None
    return Literal(t, bound)
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
__all__ = ["gen_cte_def", "gen_recursive_cte_def"]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Internal package for the data generator.
|
|
2
|
+
|
|
3
|
+
Public entry point is `waxsql.data.generate_data`. Submodules:
|
|
4
|
+
- strategies: per-type value generators
|
|
5
|
+
- columns: column-name override registry (hook for future plausibility)
|
|
6
|
+
- rows: topological row materialization + FK resolution
|
|
7
|
+
- emit: COPY block formatting
|
|
8
|
+
|
|
9
|
+
Role: every module in this package is internal. Callers should import
|
|
10
|
+
through `waxsql.data` (or the top-level `waxsql.generate_data` re-export);
|
|
11
|
+
the layout here may change without notice. The split exists so each
|
|
12
|
+
concern (type-keyed values, name-keyed overrides, FK ordering, COPY
|
|
13
|
+
encoding) lives in one file with one set of invariants.
|
|
14
|
+
"""
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Column-name override registry.
|
|
2
|
+
|
|
3
|
+
This is a hook for the eventual 'column named email actually has emails'
|
|
4
|
+
story. Today the registry is nearly empty and `strategy_for` falls
|
|
5
|
+
through to the type strategy in nearly every case. The registry is a
|
|
6
|
+
tuple (not a dict) because order matters — first match wins — and
|
|
7
|
+
because tuple iteration is deterministic across Python versions.
|
|
8
|
+
|
|
9
|
+
Role in the system: `strategy_for(col)` is the single per-column
|
|
10
|
+
dispatch point used by `rows.generate_row`. Centralizing the lookup
|
|
11
|
+
here means future semantic plausibility (emails, names, URLs, etc.)
|
|
12
|
+
can grow inside this module without touching the row materializer or
|
|
13
|
+
the type-strategy registry.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
|
|
19
|
+
from waxsql.gen.data.strategies import Strategy, strategy_for_type
|
|
20
|
+
from waxsql.schema import Column
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Ordered (compiled-pattern, strategy) pairs; the first matching pattern
# wins. Empty today — this is the seam for future name-driven
# plausibility. Adding an entry needs no schema change because dispatch
# happens per column at row-generation time.
#
# Pairs in a tuple rather than a name → strategy dict because:
#   (1) order is significant when patterns overlap (e.g.
#       `email_verified_at` should hit a timestamp pattern before an
#       email pattern), and the tuple's order is visible in the source;
#   (2) regex search over the column name avoids exact-name brittleness.
_NAME_PATTERNS: tuple[tuple[re.Pattern, Strategy], ...] = ()


def strategy_for(column: Column) -> Strategy:
    """Pick the value strategy for `column`.

    Name patterns in `_NAME_PATTERNS` are tried in order and the first
    one whose regex finds a match in `column.name` supplies the
    strategy. When none matches, the result is
    `strategy_for_type(column.type)`. The registry is empty, so that
    fallback is the normal path, not an error path.
    """
    name_overrides = (
        override
        for pattern, override in _NAME_PATTERNS
        if pattern.search(column.name)
    )
    # Lazy generator: stops at the first hit, so later patterns are
    # never evaluated once one has matched.
    for override in name_overrides:
        return override
    return strategy_for_type(column.type)
|
waxsql/gen/data/emit.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
"""COPY block formatting.
|
|
2
|
+
|
|
3
|
+
PostgreSQL's text-format COPY uses tab as the column separator, `\\N`
|
|
4
|
+
as the NULL sentinel, and backslash-escapes for tab, newline, carriage
|
|
5
|
+
return, and backslash itself. This module owns all of that; strategies
|
|
6
|
+
return native Python values and the emitter formats them.
|
|
7
|
+
|
|
8
|
+
Role in the system: this is the bottom of the value-rendering stack.
|
|
9
|
+
Everything above it (strategies, row materializer) traffics in native
|
|
10
|
+
Python objects; nothing above this module knows anything about tab
|
|
11
|
+
encoding, NULL sentinels, COPY framing, or PG array literal syntax.
|
|
12
|
+
That separation lets the strategy registry stay trivially testable
|
|
13
|
+
(no string round-trips needed to compare values).
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import datetime as _dt
|
|
18
|
+
import json
|
|
19
|
+
import uuid
|
|
20
|
+
from decimal import Decimal
|
|
21
|
+
from collections.abc import Iterable, Sequence
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# PG text-format COPY: the literal two-character sequence `\N` (backslash-N)
|
|
25
|
+
# in an unescaped position is the NULL marker. Raw string so the backslash
|
|
26
|
+
# stays a backslash — not an escape introducer for Python.
|
|
27
|
+
NULL_SENTINEL = r"\N"
|
|
28
|
+
|
|
29
|
+
# COPY text-format escape rules. Order matters: backslash MUST come
|
|
30
|
+
# first so we don't double-escape escapes we just inserted. If `\t`
|
|
31
|
+
# came first, the subsequent backslash pass would turn the `\` in
|
|
32
|
+
# `\t` into `\\`, producing `\\t` instead of the intended `\t`.
|
|
33
|
+
_ESCAPES: tuple[tuple[str, str], ...] = (
|
|
34
|
+
("\\", r"\\"),
|
|
35
|
+
("\t", r"\t"),
|
|
36
|
+
("\n", r"\n"),
|
|
37
|
+
("\r", r"\r"),
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _escape_text(s: str) -> str:
|
|
42
|
+
# Linear pass through the ordered _ESCAPES table. str.replace is
|
|
43
|
+
# already optimized in CPython; rolling our own char-by-char loop
|
|
44
|
+
# would be slower and no clearer.
|
|
45
|
+
for raw, escaped in _ESCAPES:
|
|
46
|
+
s = s.replace(raw, escaped)
|
|
47
|
+
return s
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def encode_value(v: object) -> str:
    """Encode a Python value for PG's text-format COPY.

    Supported inputs and their encodings:

      * None                -> the `\\N` NULL sentinel
      * bool                -> `t` / `f`
      * int, float          -> `str(v)`
      * Decimal, UUID       -> `str(v)`
      * datetime            -> `str(v)` ("YYYY-MM-DD HH:MM:SS[.ffffff][+HH:MM]")
      * date                -> ISO-8601 via `isoformat()`
      * timedelta           -> ISO-8601 duration `P<days>DT<seconds>S`, with
                               a six-digit fractional part on the seconds
                               when the value has non-zero microseconds
      * dict                -> compact JSON, then COPY-escaped
      * list                -> PG array literal `{elem,elem,...}`
      * str                 -> COPY-escaped text

    Raises:
        TypeError: `v` is of a type not listed above.
        ValueError: a dict contains NaN/Infinity (raised by `json.dumps`
            because `allow_nan=False`).

    Dispatch ORDER is load-bearing: bool is tested before int, and
    datetime before date, because the former is a subclass of the latter
    in each pair.
    """
    if v is None:
        return NULL_SENTINEL
    if isinstance(v, bool):
        # Must precede the int branch: isinstance(True, int) is True, so
        # an earlier int test would emit booleans as "1"/"0".
        return "t" if v else "f"
    if isinstance(v, (int, float)):
        return str(v)
    if isinstance(v, Decimal):
        # str(Decimal) keeps trailing zeros, and PG numeric parses the
        # result directly.
        return str(v)
    if isinstance(v, uuid.UUID):
        return str(v)
    if isinstance(v, _dt.datetime):
        # Must precede the date branch: datetime is a subclass of date.
        # str() on a tz-aware datetime yields the "+HH:MM" offset form
        # that PG accepts for timestamptz.
        return str(v)
    if isinstance(v, _dt.date):
        return v.isoformat()
    if isinstance(v, _dt.timedelta):
        # ISO-8601 duration built from timedelta's normalized fields
        # (days, seconds, microseconds); timedelta has no month/year
        # parts. Microseconds are carried as a fractional part of the
        # seconds field so sub-second intervals are not silently
        # truncated. Whole-second values keep the plain `P<d>DT<s>S`
        # form.
        if v.microseconds:
            return f"P{v.days}DT{v.seconds}.{v.microseconds:06d}S"
        return f"P{v.days}DT{v.seconds}S"
    if isinstance(v, dict):
        # Compact separators avoid the ", " / ": " padding json.dumps
        # inserts by default.
        #
        # The JSON text is then COPY-escaped: json.dumps emits backslash
        # sequences (`\t`, `\n`, `\"`, `\uXXXX`), and COPY's reader
        # treats backslash as its own escape introducer. Doubling each
        # backslash lets COPY resolve them back to single backslashes
        # before the JSON parser sees the text.
        #
        # allow_nan=False: fail at generation time rather than emit
        # NaN/Infinity tokens, which are not valid JSON.
        return _escape_text(json.dumps(v, separators=(",", ":"), allow_nan=False))
    if isinstance(v, list):
        # PG array literal. Per-element quoting and escaping is handled
        # by `_array_element`.
        return "{" + ",".join(_array_element(e) for e in v) + "}"
    if isinstance(v, str):
        return _escape_text(v)
    raise TypeError(f"no COPY encoder for type {type(v).__name__}: {v!r}")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _array_element(v: object) -> str:
|
|
125
|
+
"""Format a single element for inclusion in a PG array literal.
|
|
126
|
+
|
|
127
|
+
A PG array literal lives INSIDE a COPY cell, so two parsing
|
|
128
|
+
layers apply to its bytes:
|
|
129
|
+
|
|
130
|
+
1. PG's COPY row reader resolves row-format escapes first
|
|
131
|
+
(`\\\\`, `\\t`, `\\n`, `\\r`).
|
|
132
|
+
2. PG's array_in then parses the resolved cell text, with
|
|
133
|
+
its own escape grammar for quoted elements: `\\\\` resolves
|
|
134
|
+
to `\\`, `\\"` resolves to `"`. (Note: array_in does NOT
|
|
135
|
+
recognize `\\t`/`\\n`/`\\r` — anywhere it sees a literal
|
|
136
|
+
tab/newline/CR inside a quoted element, it just keeps
|
|
137
|
+
the character as part of the string.)
|
|
138
|
+
|
|
139
|
+
String and dict elements need BOTH escape passes; numeric/bool/
|
|
140
|
+
datetime elements need NEITHER (PG's array_in accepts them
|
|
141
|
+
unquoted via the element type's own input function). Lists
|
|
142
|
+
recurse for multidimensional arrays — they do NOT get quoted,
|
|
143
|
+
because PG's array literal supports `{{1,2},{3,4}}` directly.
|
|
144
|
+
|
|
145
|
+
None becomes the bare token `NULL` (case-insensitive), NOT the
|
|
146
|
+
row-level `\\N` sentinel — inside a `{...}` literal, `\\N`
|
|
147
|
+
is interpreted as the two-character string `\\N`, not SQL NULL.
|
|
148
|
+
"""
|
|
149
|
+
if v is None:
|
|
150
|
+
# Bare unquoted token: PG's array_in treats this as SQL NULL.
|
|
151
|
+
# The row-level `\N` sentinel would be misread as the literal
|
|
152
|
+
# 2-char string, not NULL — a silent data corruption.
|
|
153
|
+
return "NULL"
|
|
154
|
+
if isinstance(v, list):
|
|
155
|
+
# Multidimensional array: emit the inner `{...}` literal
|
|
156
|
+
# WITHOUT quoting, so the outer array_in sees it as a
|
|
157
|
+
# sub-array, not a string element. Today `array_of(array_of(
|
|
158
|
+
# ...))` isn't constructible by the data generator, but the
|
|
159
|
+
# path is correct for the eventual extension.
|
|
160
|
+
return encode_value(v)
|
|
161
|
+
if isinstance(v, dict):
|
|
162
|
+
# Array-of-jsonb: each element is a JSON-encoded value
|
|
163
|
+
# wrapped as a quoted array-element. PG's array_in extracts
|
|
164
|
+
# the quoted content (resolving its escapes) and hands the
|
|
165
|
+
# resulting JSON text to jsonb_in for parsing. Two-layer
|
|
166
|
+
# escape handles backslashes from JSON's own escapes plus
|
|
167
|
+
# the quote-and-escape required by the array element format.
|
|
168
|
+
return _quote_array_element(json.dumps(v, separators=(",", ":"), allow_nan=False))
|
|
169
|
+
if isinstance(v, str):
|
|
170
|
+
return _quote_array_element(v)
|
|
171
|
+
# Scalar non-string types (int, float, bool, Decimal, UUID,
|
|
172
|
+
# datetime, date, timedelta): PG's array_in accepts these as
|
|
173
|
+
# unquoted tokens — quoting would actually be wrong here,
|
|
174
|
+
# because array_in would then call the element type's input
|
|
175
|
+
# function on a *string* form rather than the bare token.
|
|
176
|
+
return encode_value(v)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _quote_array_element(s: str) -> str:
    """Wrap *s* as a double-quoted element of a PG array literal in a COPY cell.

    The value passes through two parsers when PostgreSQL loads it: the
    COPY text reader sees the bytes first, and array_in sees whatever COPY
    produced. Escaping is therefore layered in the opposite order — the
    innermost (array_in) layer is applied first, the outermost (COPY)
    layer last.

    Layer 1 (array_in): every backslash is doubled, then every double
    quote is prefixed with a backslash. The backslash pass must run
    before the quote pass; otherwise the backslash introduced for a
    quote would itself be doubled.

    Layer 2 (COPY): the result is handed to `_escape_text`, the COPY
    text-format escaper defined elsewhere in this module.
    NOTE(review): that helper is expected to double backslashes again and
    escape tab/newline/CR — confirm against its definition.

    Returns the escaped text surrounded by a pair of double quotes.
    """
    # Layer 1: array_in escapes — backslashes first, then quotes.
    array_layer = s.replace("\\", "\\\\")
    array_layer = array_layer.replace('"', '\\"')
    # Layer 2: COPY escapes, then the element's enclosing quotes.
    return f'"{_escape_text(array_layer)}"'
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def emit_copy_block(
    table_name: str,
    columns: Sequence[str],
    rows: Iterable[Sequence[object]],
) -> str:
    """Format one PostgreSQL text-format COPY block.

    Output shape:
        COPY "table" ("col1", "col2") FROM STDIN;
        v11<TAB>v12
        v21<TAB>v22
        \\.

    Args:
        table_name: Target table name; emitted as a single quoted identifier.
        columns: Column names, in the order the row values appear.
        rows: Iterable of row tuples; each value is rendered with
            `encode_value` and the cells of a row are joined with tabs.

    Returns:
        The header line, one line per row, and the `\\.` end-of-data
        line, each terminated by a newline.

    Identifiers are always double-quoted so case-sensitive names and
    reserved words are handled uniformly; PG accepts quoted identifiers
    that do not strictly need quoting. Any double quote embedded in an
    identifier is doubled (`"` becomes `""`), which is how PG spells a
    literal quote inside a quoted identifier — without this an
    identifier containing `"` would terminate the quoting early and
    corrupt the statement.

    The empty-rows case is intentionally legal: the header is followed
    immediately by the terminator.
    """

    def quote_ident(name: str) -> str:
        # Double embedded quotes so the identifier cannot break out of
        # its surrounding double quotes.
        return '"' + name.replace('"', '""') + '"'

    col_list = ", ".join(quote_ident(c) for c in columns)
    # Collect pieces and join once: linear in output size, unlike
    # repeated string concatenation.
    parts = [f"COPY {quote_ident(table_name)} ({col_list}) FROM STDIN;\n"]
    for row in rows:
        parts.append("\t".join(encode_value(v) for v in row))
        parts.append("\n")
    # COPY text-format end-of-data marker: a line containing only `\.`,
    # followed by a newline so concatenated blocks separate cleanly.
    parts.append("\\.\n")
    return "".join(parts)
|