waxsql 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waxsql/__init__.py +158 -0
- waxsql/ast.py +757 -0
- waxsql/catalog.py +363 -0
- waxsql/cli.py +888 -0
- waxsql/config.py +477 -0
- waxsql/context.py +255 -0
- waxsql/data.py +99 -0
- waxsql/gen/__init__.py +51 -0
- waxsql/gen/cte.py +367 -0
- waxsql/gen/data/__init__.py +14 -0
- waxsql/gen/data/columns.py +48 -0
- waxsql/gen/data/emit.py +247 -0
- waxsql/gen/data/rows.py +236 -0
- waxsql/gen/data/strategies.py +299 -0
- waxsql/gen/expr.py +723 -0
- waxsql/gen/select.py +831 -0
- waxsql/gen/setop.py +259 -0
- waxsql/gen/subquery.py +397 -0
- waxsql/gen/window.py +398 -0
- waxsql/pretty.py +81 -0
- waxsql/printer.py +688 -0
- waxsql/py.typed +0 -0
- waxsql/schema.py +557 -0
- waxsql/scope.py +391 -0
- waxsql/types.py +187 -0
- waxsql/validate/__init__.py +52 -0
- waxsql/validate/parse.py +194 -0
- waxsql/validate/plan.py +149 -0
- waxsql/validate/syntax.py +87 -0
- waxsql-1.0.0.dist-info/METADATA +746 -0
- waxsql-1.0.0.dist-info/RECORD +35 -0
- waxsql-1.0.0.dist-info/WHEEL +5 -0
- waxsql-1.0.0.dist-info/entry_points.txt +2 -0
- waxsql-1.0.0.dist-info/licenses/LICENSE +21 -0
- waxsql-1.0.0.dist-info/top_level.txt +1 -0
waxsql/scope.py
ADDED
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
"""Binding stack for query generation.
|
|
2
|
+
|
|
3
|
+
A `Scope` answers two questions for the expression generator:
|
|
4
|
+
|
|
5
|
+
1. "Which columns can I reference here?" — `visible_columns(of_type)`.
|
|
6
|
+
2. "Which tables are in the current FROM clause, and under what
|
|
7
|
+
aliases?" — `aliased_tables()` (used by the SELECT generator to
|
|
8
|
+
bias JOIN conditions toward FK-related tables).
|
|
9
|
+
|
|
10
|
+
Scopes form a parent chain. A nested subquery's scope points at the
|
|
11
|
+
outer query's scope; whether the lookup walks up the chain depends on
|
|
12
|
+
whether the subquery is correlated (and, equivalently for FROM-clause
|
|
13
|
+
subqueries, whether it's LATERAL). This is the same mechanism that
|
|
14
|
+
PostgreSQL's parser uses internally for name resolution; modelling it
|
|
15
|
+
the same way means the generator's notion of "what's visible" matches
|
|
16
|
+
PostgreSQL's notion when we eventually wire up PARSE-level validation.
|
|
17
|
+
|
|
18
|
+
The aggregate / GROUP BY / window flags do NOT live here. Those are
|
|
19
|
+
expression-context state (they flip per-call inside an aggregate
|
|
20
|
+
argument, etc.) and belong on GenContext. Scope is purely about
|
|
21
|
+
binding visibility.
|
|
22
|
+
|
|
23
|
+
There's no explicit pop. Scopes nest via parent pointers, and a
|
|
24
|
+
"popped" scope is one the caller simply stops referencing — typically
|
|
25
|
+
by returning from the function that built it, or by replacing the
|
|
26
|
+
GenContext.scope field with the parent. This is enforced structurally
|
|
27
|
+
because GenContext is frozen: `descend_subquery` produces a NEW
|
|
28
|
+
GenContext with a child scope, and once that GenContext goes out of
|
|
29
|
+
scope, the child scope does too. The discipline is "every scope
|
|
30
|
+
borrowed for a subquery is local to that subquery's generation call".
|
|
31
|
+
Violating this means stale bindings leak into sibling generation —
|
|
32
|
+
the prototypical "I see columns from a sibling I shouldn't see" bug.
|
|
33
|
+
|
|
34
|
+
Two related visibility mechanisms:
|
|
35
|
+
|
|
36
|
+
* CTE table-level visibility. A `_cte_defs` dict on each scope holds
|
|
37
|
+
CTE definitions; CTE lookup walks the chain unconditionally — CTEs
|
|
38
|
+
are visible regardless of correlation.
|
|
39
|
+
|
|
40
|
+
* Subquery support. `push_subquery(correlated=...)` creates a child
|
|
41
|
+
scope; the `correlated` flag at construction time decides whether
|
|
42
|
+
parent-chain column lookups walk past this level.
|
|
43
|
+
|
|
44
|
+
What's deliberately not modeled:
|
|
45
|
+
|
|
46
|
+
* Nullability propagation through outer joins. Bindings carry their
|
|
47
|
+
declared `nullable` flag, but the generator currently treats
|
|
48
|
+
everything as potentially-NULL anyway. Refining this requires
|
|
49
|
+
join-tree analysis at generation time and is its own piece of work.
|
|
50
|
+
"""
|
|
51
|
+
from __future__ import annotations
|
|
52
|
+
|
|
53
|
+
from dataclasses import dataclass
|
|
54
|
+
from typing import Optional
|
|
55
|
+
|
|
56
|
+
from .schema import Table
|
|
57
|
+
from .types import PgType, implicitly_castable
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass(frozen=True)
class Binding:
    """A single column that can be referenced at some scope level.

    Instances are immutable (frozen dataclass) and compare by value.

    `table_alias` holds the alias the FROM clause used for the owning
    relation, not the underlying table name: columns are always
    referenced through the alias (the printer's ColumnRef handling
    explains why).
    """
    # Alias under which the owning FROM item was registered.
    table_alias: str
    # Column name as exposed by that FROM item.
    column: str
    # Declared PostgreSQL type of the column.
    type: PgType
    # Declared nullability flag carried over from the column definition.
    nullable: bool
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class Scope:
    """Mutable binding container with an immutable parent link.

    A fresh Scope has no bindings; the caller adds them as it processes
    the FROM clause. Use `push_subquery(...)` to create a child scope
    when entering a subquery; the parent is kept alive (and visible,
    if `correlated=True`) for the lifetime of the child.

    The `correlated` flag is set once at construction and not changed:
    it reflects what kind of nesting created this scope. For LATERAL
    FROM-subqueries and most expression-position subqueries, pass
    `correlated=True`; for non-LATERAL FROM-subqueries that may not
    reference their siblings, pass `correlated=False`.

    Scope is NOT frozen, by deliberate exception to the project-wide
    immutability convention. The reason: binding lists grow over a
    single FROM-clause pass — `add_table` is called once per FROM
    item — and rebuilding the whole Scope object for each addition
    would force the caller into an awkward fold-style loop. The
    mutation is confined to one writer (the FROM-clause builder) per
    scope instance, and the parent link is set in __init__ and never
    changed, so the parent-chain walk remains effectively immutable.
    """

    def __init__(
        self,
        parent: Optional["Scope"] = None,
        *,
        correlated: bool = True,
    ) -> None:
        """Create an empty scope.

        Args:
            parent: Enclosing scope, or None for a top-level query.
            correlated: Whether column lookups may continue into
                `parent` (see `visible_columns`). Has no effect on CTE
                lookups, which always walk the whole chain.
        """
        self._parent = parent
        self._correlated = correlated
        # Bindings in insertion order. The expression generator picks
        # weighted-randomly from this list, so the order has to be
        # stable across runs. Insertion order is naturally stable when
        # `add_table` / `add_derived` is called in a deterministic
        # sequence.
        self._bindings: list[Binding] = []
        # alias -> Table-or-None, in insertion order. Base-table
        # aliases get their Table; derived-table aliases get None
        # (derived tables don't have an underlying base Table to
        # carry, but the alias still occupies a slot for collision
        # detection and for ordered enumeration).
        #
        # Used by the SELECT generator's join-condition FK biasing
        # (which filters out the derived-None entries since FKs only
        # exist on base tables).
        self._aliases: list[tuple[str, Optional[Table]]] = []
        # CTE definitions visible at this scope level. Maps CTE name
        # to its column info — list of (col_name, col_type) pairs.
        # Bindings aren't stored here because the table_alias on a
        # binding depends on the LOCAL alias used in a future CteRef,
        # which doesn't exist yet at CTE-definition time. The CteRef-
        # add-to-scope step constructs Bindings with the right alias
        # via `add_derived`.
        #
        # Lookup walks the parent chain UNCONDITIONALLY — CTEs are
        # visible from any nested scope regardless of `_correlated`.
        # That flag gates column visibility, not CTE visibility:
        # those are two separate static-scoping rules in PG.
        self._cte_defs: dict[str, list[tuple[str, PgType]]] = {}

    # -- mutation -----------------------------------------------------------

    def _require_free_alias(self, alias: str) -> None:
        """Raise ValueError if `alias` is already registered at this level.

        Shared by `add_table` and `add_derived` so base and derived
        aliases go through exactly the same uniqueness check.
        """
        if self.has_alias(alias):
            raise ValueError(f"alias {alias!r} already in scope")

    def add_table(self, alias: str, table: Table) -> None:
        """Register `table` under `alias` and add all its columns as
        bindings visible at this scope level.

        Aliases must be unique within a single Scope (PostgreSQL
        enforces this for FROM-clause aliases). The check is intended
        to catch generator bugs, not to police user input. Both base
        and derived aliases participate in the uniqueness check.

        Raises:
            ValueError: if `alias` is already registered in this scope.
        """
        self._require_free_alias(alias)
        # Build every binding before touching scope state, so a failure
        # while reading `table.columns` cannot leave the alias
        # registered without its columns.
        new_bindings = [
            Binding(
                table_alias=alias,
                column=col.name,
                type=col.type,
                nullable=col.nullable,
            )
            for col in table.columns
        ]
        self._aliases.append((alias, table))
        self._bindings.extend(new_bindings)

    def add_derived(
        self,
        alias: str,
        columns: list[tuple[str, PgType]],
    ) -> None:
        """Register a derived-table alias whose columns come from a
        FROM-clause subquery's targets, not a base Table.

        `columns` is a list of (column_name, column_type) pairs —
        typically one entry per inner SELECT target. The generator
        uses synthetic column names (`c1`, `c2`, ...) on the inner
        targets and passes those names here; that keeps column
        resolution `derived.c1` deterministic regardless of what the
        inner expression evaluated to.

        Derived columns are always treated as nullable: we don't
        propagate NOT NULL constraints through SELECT-list
        expressions, consistent with the generator-wide rule that
        treats everything as potentially-NULL.

        Raises:
            ValueError: if `alias` is already registered in this scope.
        """
        self._require_free_alias(alias)
        # Same build-then-commit order as `add_table`: a malformed
        # `columns` entry must not leave a column-less alias behind.
        new_bindings = [
            Binding(
                table_alias=alias,
                column=col_name,
                type=col_type,
                nullable=True,
            )
            for col_name, col_type in columns
        ]
        self._aliases.append((alias, None))
        self._bindings.extend(new_bindings)

    # -- queries ------------------------------------------------------------

    def local_bindings(
        self,
        of_type: Optional[PgType] = None,
    ) -> list[Binding]:
        """Bindings introduced at THIS scope level only — no parent
        chain walk.

        Used by gen_expr when generating aggregate args inside a
        correlated subquery: outer-column refs inside such aggregates
        trigger PG's implicit-grouping inference on the outer query
        (PARSE-tier error 42803). Restricting to local bindings
        prevents the leak. The visible_columns method (which DOES
        walk the chain) remains the right tool everywhere else.

        With `of_type` set, only bindings whose type implicitly casts
        to `of_type` are returned. The result is a fresh list.
        """
        if of_type is None:
            return list(self._bindings)
        return [
            b for b in self._bindings
            if implicitly_castable(b.type, of_type)
        ]

    def visible_columns(
        self,
        of_type: Optional[PgType] = None,
    ) -> list[Binding]:
        """All bindings visible at this scope level.

        Walks up the parent chain when this scope is correlated;
        stops at the first uncorrelated scope. With `of_type` set,
        filters to bindings whose declared type implicitly casts to
        the requested type — same coercion rule the catalog uses.

        An outer binding is skipped when a closer scope on the walk
        has registered the same alias. Columns are always referenced
        as `alias.column`, and PostgreSQL resolves the alias to the
        innermost relation carrying it, so the outer column could not
        be reached under that alias anyway.

        The returned list is freshly built on each call; the caller
        owns it and may freely sort or sample without affecting Scope.

        The walk order is innermost-first, then outward. Sample
        consumers (gen/expr.py column-ref candidate selection)
        weight earlier entries — i.e., closer scopes — implicitly
        through this ordering, which matches the intuition that
        "the current query's columns" are more relevant than
        "the enclosing query's columns" for a casual column pick.
        """
        out: list[Binding] = []
        # Aliases claimed by scopes closer than the one being visited.
        # Only ever used for membership tests, never iterated, so the
        # result order stays fully determined by insertion order.
        shadowed: set[str] = set()
        s: Optional[Scope] = self
        while s is not None:
            for b in s._bindings:
                if b.table_alias in shadowed:
                    continue
                if of_type is None or implicitly_castable(b.type, of_type):
                    out.append(b)
            if not s._correlated:
                break
            shadowed.update(a for a, _ in s._aliases)
            s = s._parent
        return out

    def aliased_tables(self) -> list[tuple[str, Table]]:
        """Return (alias, table) pairs from this scope only, BASE
        TABLES ONLY — derived-table aliases are filtered out.

        Used for FK-biased JOIN condition generation, which only
        applies to base tables (derived tables don't have FKs).
        Scope-local because the use case is within a single SELECT's
        FROM clause; parent-scope tables aren't JOIN candidates.
        """
        return [(a, t) for a, t in self._aliases if t is not None]

    def lookup_alias(self, alias: str) -> Optional[Table]:
        """Resolve `alias` to its underlying Table at this scope
        level. Returns None if the alias is absent OR if it's a
        derived-table alias (no underlying Table).

        The two None cases are deliberately collapsed because the
        only current caller (FK-biased JOIN-condition generation)
        treats both the same way: "can't generate an FK predicate
        against this." Use `has_alias()` for the unambiguous
        existence check.
        """
        for a, t in self._aliases:
            if a == alias:
                return t  # may legitimately be None for derived
        return None

    def has_alias(self, alias: str) -> bool:
        """True iff `alias` is registered at this scope level,
        regardless of base-table vs derived. Distinct from
        `lookup_alias`, which returns None for both "absent" and
        "present but derived"."""
        return any(a == alias for a, _ in self._aliases)

    # -- CTE management -----------------------------------------------------

    def add_cte(
        self,
        name: str,
        columns: list[tuple[str, PgType]],
    ) -> None:
        """Register a CTE definition under `name` with its column
        info. Same shape as `add_derived`'s columns parameter —
        list of (col_name, col_type) pairs.

        CTE name uniqueness is per-scope (one WITH clause). PG
        enforces this; reject early to surface generator bugs.
        Cross-scope shadowing (an inner WITH defining the same name
        as an outer) is allowed by PG but outside milestone-5 scope.

        Raises:
            ValueError: if `name` is already defined in this scope.
        """
        if name in self._cte_defs:
            raise ValueError(f"CTE name {name!r} already defined in scope")
        # Store a copy so later mutation of the caller's list cannot
        # change the registered definition.
        self._cte_defs[name] = list(columns)

    def lookup_cte(
        self,
        name: str,
    ) -> Optional[list[tuple[str, PgType]]]:
        """Resolve a CTE name to its column info, walking the parent
        chain UNCONDITIONALLY.

        Unconditional walk because CTE visibility is static-scope:
        a CTE defined in an outer query is visible from every nested
        SELECT, regardless of correlation/LATERAL semantics. That's
        different from `visible_columns`, which gates parent-chain
        walking on the `_correlated` flag.

        Returns a fresh list per call so callers can freely mutate it.
        Returns None when the name isn't found anywhere in the chain.
        """
        s: Optional[Scope] = self
        while s is not None:
            if name in s._cte_defs:
                return list(s._cte_defs[name])
            s = s._parent
        return None

    def has_visible_ctes(self) -> bool:
        """True iff at least one CTE is defined in this scope or any
        ancestor. Used by the FROM-clause generator to gate the
        "use a CTE reference?" decision — meaningless when no CTEs
        are in scope."""
        s: Optional[Scope] = self
        while s is not None:
            if s._cte_defs:
                return True
            s = s._parent
        return False

    def visible_cte_names(self) -> list[str]:
        """All CTE names visible at this scope level, walking the
        parent chain. Insertion-order within each scope, child
        scopes' CTEs first — de-duped so a name shadowed by a
        closer scope appears exactly once, with the child binding
        winning (because the child's copy is emitted before the
        walk reaches the parent's entry for that name).

        Today's generator only emits top-level WITHs, so the
        shadow case never fires in production; the dedupe is a
        latent-correctness guard for the eventual nested-WITH
        path. Without it, the list would contain the same name
        twice and a caller picking a CTE to reference by name
        could resolve the wrong binding — `lookup_cte` walks
        closest-first and would return the child's binding,
        producing a name/binding mismatch.

        Used when the generator needs to pick a CTE to reference
        from the FROM clause. Order is deterministic — both dict
        insertion order (Python 3.7+) and the parent-chain walk
        order are stable. The membership-check set is consulted
        only for `in`-tests, never iterated, so this does not
        violate the project's no-set-iteration-in-RNG-paths rule.
        """
        out: list[str] = []
        seen: set[str] = set()
        s: Optional[Scope] = self
        while s is not None:
            for name in s._cte_defs:
                if name not in seen:
                    seen.add(name)
                    out.append(name)
            s = s._parent
        return out

    # -- nesting ------------------------------------------------------------

    def push_subquery(self, *, correlated: bool) -> "Scope":
        """Construct a child scope for a nested query.

        `correlated=True` is the default for subqueries in expression
        position (e.g. `WHERE x = (SELECT ...)`) and for LATERAL FROM
        subqueries. `correlated=False` is for plain FROM subqueries,
        which by SQL standard cannot reference their siblings.

        Caller is responsible for not leaking the child scope past
        the subquery's generation call — see module docstring.
        Practically, that means the child is held only by a
        descended-then-discarded GenContext, never assigned to a
        long-lived attribute.
        """
        return Scope(parent=self, correlated=correlated)
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
# Names exported by `from ... import *` for this module.
__all__ = ["Binding", "Scope"]
|
waxsql/types.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""PostgreSQL type system model.
|
|
2
|
+
|
|
3
|
+
Mirrors the abstractions PostgreSQL itself uses (pg_type.typcategory) so
|
|
4
|
+
that as we add more types and casts later, the structure already lines up
|
|
5
|
+
with how the planner reasons about coercion.
|
|
6
|
+
|
|
7
|
+
This is a deliberately small slice of PostgreSQL's actual type system —
|
|
8
|
+
~12 scalar types plus arrays. Expand `_IMPLICIT_CASTS` and `SCALAR_TYPES`
|
|
9
|
+
as the generator needs more variety.
|
|
10
|
+
|
|
11
|
+
This module is the load-bearing foundation under the type-driven
|
|
12
|
+
expression generator: every "what produces type T?" lookup in the
|
|
13
|
+
catalog, every column visibility filter in scope.py, and every
|
|
14
|
+
function/operator argument check runs through `implicitly_castable`.
|
|
15
|
+
Mistakes here propagate as "valid-looking SQL that fails parse-analysis"
|
|
16
|
+
across the whole generator. Cross-reference with pg_cast when changing
|
|
17
|
+
anything below.
|
|
18
|
+
"""
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from enum import Enum
|
|
23
|
+
from typing import Optional
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TypeCategory(str, Enum):
    """Single-letter type category codes, mirroring pg_type.typcategory.

    PostgreSQL consults the category when deciding implicit coercion in
    contexts such as UNION resolution and operator/function dispatch.
    Every type carries one so the catalog can answer "is this usable
    here?" without re-deriving PostgreSQL's rules.

    The class mixes in `str`, so each member compares equal to its
    one-letter code.
    """
    ARRAY = "A"
    BOOLEAN = "B"
    COMPOSITE = "C"
    DATETIME = "D"
    ENUM = "E"
    GEOMETRIC = "G"
    NETWORK = "I"
    NUMERIC = "N"
    PSEUDO = "P"
    RANGE = "R"
    STRING = "S"
    TIMESPAN = "T"
    USER = "U"
    BITSTRING = "V"
    UNKNOWN = "X"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass(frozen=True)
class PgType:
    """Immutable description of one PostgreSQL type.

    `name` follows pg_type.typname (`int8`, not `bigint`); PostgreSQL
    accepts both spellings in DDL. `element` is populated only for
    array types, whose `name` by convention carries the underscore
    prefix (`_int4` for `int4[]`), again as in pg_type.

    `typmod` is the type-modifier tuple — (10, 2) for `numeric(10,2)`,
    (50,) for `varchar(50)` — and is empty when there is no modifier.

    The dataclass is frozen so instances are hashable and can serve as
    dict keys, which the type weight tables and catalog indexes rely on.
    """
    name: str
    category: TypeCategory
    element: Optional["PgType"] = None
    typmod: tuple[int, ...] = ()

    # Convenience predicates: callers can ask the obvious question
    # without importing TypeCategory, and a future re-categorization
    # (e.g. splitting NUMERIC into INTEGRAL/REAL) stays local to this
    # file.
    def is_array(self) -> bool:
        """True when this type has an element type, i.e. is an array."""
        return self.element is not None

    def is_numeric(self) -> bool:
        """True when the category is NUMERIC."""
        return self.category == TypeCategory.NUMERIC

    def is_string(self) -> bool:
        """True when the category is STRING."""
        return self.category == TypeCategory.STRING

    def sql(self) -> str:
        """Return the SQL spelling of this type for use in DDL or CAST.

        Arrays render as the element's spelling followed by `[]`;
        otherwise the name is used, with the modifier list appended in
        parentheses when `typmod` is non-empty.
        """
        inner = self.element
        if inner is not None:
            return inner.sql() + "[]"
        if not self.typmod:
            return self.name
        modifiers = ",".join(map(str, self.typmod))
        return f"{self.name}({modifiers})"
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Initial scalar set: enough variety for the generator (numeric,
# string, temporal, structured) without pulling every PostgreSQL cast
# rule into the catalog.

# Numeric family.
INT4 = PgType(name="int4", category=TypeCategory.NUMERIC)
INT8 = PgType(name="int8", category=TypeCategory.NUMERIC)
NUMERIC = PgType(name="numeric", category=TypeCategory.NUMERIC)
FLOAT8 = PgType(name="float8", category=TypeCategory.NUMERIC)
# Strings.
TEXT = PgType(name="text", category=TypeCategory.STRING)
VARCHAR = PgType(name="varchar", category=TypeCategory.STRING)
# Boolean.
BOOL = PgType(name="bool", category=TypeCategory.BOOLEAN)
# Temporal.
DATE = PgType(name="date", category=TypeCategory.DATETIME)
TIMESTAMPTZ = PgType(name="timestamptz", category=TypeCategory.DATETIME)
INTERVAL = PgType(name="interval", category=TypeCategory.TIMESPAN)
# Types filed under the USER category.
UUID = PgType(name="uuid", category=TypeCategory.USER)
JSONB = PgType(name="jsonb", category=TypeCategory.USER)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def array_of(t: PgType) -> PgType:
    """Return the array type whose element type is `t`.

    The array's name is `t.name` with a leading underscore, following
    pg_type's `_typname` convention; its category is ARRAY.
    """
    array_name = "_" + t.name
    return PgType(array_name, TypeCategory.ARRAY, element=t)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# Every scalar type defined above, in declaration order.
SCALAR_TYPES: tuple[PgType, ...] = (
    INT4,
    INT8,
    NUMERIC,
    FLOAT8,
    TEXT,
    VARCHAR,
    BOOL,
    DATE,
    TIMESTAMPTZ,
    INTERVAL,
    UUID,
    JSONB,
)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# Implicit cast graph. Each key maps to the set of target type names that
|
|
124
|
+
# the source coerces to *implicitly* (no CAST needed). This is a small
|
|
125
|
+
# subset of pg_cast — enough to keep the generator honest about what it
|
|
126
|
+
# can pass where, without trying to be a complete oracle for PG semantics.
|
|
127
|
+
#
|
|
128
|
+
# Convention: every type implicitly casts to itself, so the target set
|
|
129
|
+
# always contains the source's own name.
|
|
130
|
+
#
|
|
131
|
+
# Direction matters: this is a source→target relation, not symmetric.
|
|
132
|
+
# `int4 → int8` is listed; `int8 → int4` is not. The numeric chain
|
|
133
|
+
# (int4 → int8 → numeric → float8) reflects PG's standard promotion
|
|
134
|
+
# ladder. A type missing from this dict still casts to itself via the
|
|
135
|
+
# `src == tgt` short-circuit in implicitly_castable, so adding a new
|
|
136
|
+
# scalar without an entry here degrades to "no implicit casts" rather
|
|
137
|
+
# than to broken behavior.
|
|
138
|
+
#
|
|
139
|
+
# The closure is precomputed, not derived. PostgreSQL itself never
# chains implicit casts: every pair listed below (including
# `int4` → `float8`) is its own direct implicit entry in pg_cast.
# The lookup must be O(1) because it runs once per candidate type
# per expression-generator decision, so each source lists every
# target it reaches explicitly rather than having the check walk a
# graph. Anyone editing this dict must keep the closure consistent
# and confirm each new pair against pg_cast.
|
|
145
|
+
# Built from (source, wider targets) rows; the source's own name is
# added to every target set, so the "casts to itself" convention holds
# by construction.
_IMPLICIT_CASTS: dict[str, frozenset[str]] = {
    source: frozenset((source, *wider))
    for source, wider in (
        ("int4", ("int8", "numeric", "float8")),
        ("int8", ("numeric", "float8")),
        ("numeric", ("float8",)),
        ("float8", ()),
        ("text", ()),
        ("varchar", ("text",)),
        ("bool", ()),
        ("date", ("timestamptz",)),
        ("timestamptz", ()),
        ("interval", ()),
        ("uuid", ()),
        ("jsonb", ()),
    )
}


def implicitly_castable(src: PgType, tgt: PgType) -> bool:
    """Report whether a `src` value is accepted where `tgt` is expected
    without writing an explicit CAST.

    Rules, checked in order:

    * Equal types always cast. This runs before the table lookup so a
      type with no `_IMPLICIT_CASTS` entry still casts to itself.
    * Two arrays cast iff their element types are equal. This is
      deliberately stricter than PostgreSQL, to keep the generator
      from emitting things that might parse but rarely type-check.
    * An array paired with a scalar never casts. PostgreSQL permits
      some of these, but emitting them needs special-cased SQL
      (e.g. ARRAY[expr]) that the generator does not produce today.
    * Otherwise the answer comes from `_IMPLICIT_CASTS`, keyed by type
      name; an unknown source name has no implicit targets.
    """
    if src == tgt:
        return True

    src_is_array = src.is_array()
    tgt_is_array = tgt.is_array()
    if src_is_array and tgt_is_array:
        return src.element == tgt.element
    if src_is_array or tgt_is_array:
        return False

    targets = _IMPLICIT_CASTS.get(src.name)
    return targets is not None and tgt.name in targets
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Validation modes for generated SQL.
|
|
2
|
+
|
|
3
|
+
Role in the system: the public surface for "how thoroughly do we
|
|
4
|
+
check this query?" Every CLI entry point and test takes a
|
|
5
|
+
ValidationMode and dispatches to the right submodule. Keeping the
|
|
6
|
+
enum here (separate from the implementations in `syntax.py`,
|
|
7
|
+
`parse.py`, `plan.py`) means callers can refer to a mode without
|
|
8
|
+
importing psycopg or pglast — those imports are deferred to the
|
|
9
|
+
submodule that actually needs them.
|
|
10
|
+
|
|
11
|
+
Three layers, each strictly stronger than the previous in BOTH
|
|
12
|
+
cost and catch-rate — pick the cheapest one that catches the
|
|
13
|
+
failure class you care about. The ordering (SYNTAX < PARSE < PLAN)
|
|
14
|
+
is load-bearing: anything PARSE catches, PLAN also catches, and
|
|
15
|
+
anything SYNTAX catches, the other two also catch. That's why a
|
|
16
|
+
test that fails at PARSE is automatically a failure at PLAN — the
|
|
17
|
+
tiers compose.
|
|
18
|
+
|
|
19
|
+
SYNTAX — parse via libpg_query (pglast). No DB needed. Catches every
|
|
20
|
+
grammar error PostgreSQL itself catches but no name/type
|
|
21
|
+
resolution. Microseconds per check.
|
|
22
|
+
|
|
23
|
+
PARSE — PREPARE against a live DB. Runs full parse analysis: name
|
|
24
|
+
resolution, type checking, aggregate/GROUP BY rules, function
|
|
25
|
+
lookup. Milliseconds per check. Implemented in `parse.py`.
|
|
26
|
+
|
|
27
|
+
PLAN — EXPLAIN against a live DB. Runs the full planner pipeline:
|
|
28
|
+
parse-analysis + rewriting + plan-tree construction. Catches
|
|
29
|
+
operator-class lookup failures (ORDER BY / DISTINCT / GROUP
|
|
30
|
+
BY on types without comparison operators) and the subset of
|
|
31
|
+
runtime errors PG can constant-fold at planning time
|
|
32
|
+
(division by zero on literal divisors, etc.). Implemented
|
|
33
|
+
in `plan.py`.
|
|
34
|
+
"""
|
|
35
|
+
from enum import Enum, auto
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ValidationMode(Enum):
    """How thoroughly a generated query is checked.

    This is an Enum rather than a set of string constants so callers
    cannot pass a typo that fails silently: every dispatch path is
    forced through the typed match. Values come from `auto()` because
    only the symbolic names are API; nothing outside this module
    should depend on the integers.
    """
    # Skip validation entirely. Reserved for future "generate-only"
    # callers (benchmarks, reproducer dumps); the test suite always
    # runs at SYNTAX or higher to keep generator bugs visible.
    NONE = auto()
    # Grammar-only check via libpg_query (pglast); no database needed.
    SYNTAX = auto()
    # PREPARE against a live database: full parse analysis.
    PARSE = auto()
    # EXPLAIN against a live database: the full planner pipeline.
    PLAN = auto()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# Names exported by `from ... import *` for this module.
__all__ = ["ValidationMode"]
|