waxsql 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waxsql/gen/cte.py ADDED
@@ -0,0 +1,367 @@
1
+ """CTE definition generator.
2
+
3
+ Role: produces the body of one WITH-clause entry. Two public entry
4
+ points — `gen_cte_def` for plain CTEs, `gen_recursive_cte_def` for
5
+ the rigidly-shaped `WITH RECURSIVE` form.
6
+
7
+ INVARIANT on recursive CTEs: the structure is non-negotiable —
8
+ non-recursive "anchor" UNION ALL recursive "step", with the
9
+ self-reference appearing only in the step. Any other shape (anchor
10
+ on the right, no UNION, self-reference in the anchor) is a PG parse
11
+ error. The two-arm builder pair below enforces the shape by
12
+ construction; the printer just renders it.
13
+
14
+ Plain-CTE entry point: `gen_cte_def(ctx, name)` builds one
15
+ CteDef. The orchestration of multi-CTE WITH clauses lives in
16
+ `gen/select.py` (`_gen_with_clause`), which calls `gen_cte_def` in
17
+ a loop and registers each result in the parent scope before
18
+ generating the next — that's what lets later CTEs reference earlier
19
+ ones without forward references.
20
+
21
+ A CTE body is a full SELECT generated via `gen_select` on a child
22
+ context, with a few constraints:
23
+
24
+ * `descend_subquery(correlated=False)` — CTE bodies are
25
+ self-contained; they don't reference outer-query *columns*. They
26
+ DO see outer *CTEs* via lookup_cte's unconditional parent-chain
27
+ walk, but that's a separate visibility rule.
28
+
29
+ * `allow_with=False` (set by descend_subquery) — milestone 5
30
+ keeps WITHs top-level only; the CTE body can't have its own
31
+ nested WITH.
32
+
33
+ * Explicit `c1`..`cN` aliases on every output target —
34
+ same predictable-column-resolution pattern as derived tables.
35
+
36
+ The lazy import of `gen_select` is the standard cycle-breaking
37
+ trick: `gen_select` calls into `gen_cte_def` (via
38
+ `_gen_with_clause`), and `gen_cte_def` calls `gen_select` for the
39
+ inner body, so a top-level import at either end would close the
40
+ loop. Lazy import inside the function keeps both modules importable
41
+ in either order.
42
+ """
43
+ from __future__ import annotations
44
+
45
+ from dataclasses import replace
46
+
47
+ from ..ast import (
48
+ BinaryOp, ColumnRef, CteCycle, CteDef, CteRef, CteSearch, Expr,
49
+ FromItem, Literal, Select, SelectTarget, SetOp, TableRef,
50
+ )
51
+ from ..context import GenContext
52
+ from ..types import BOOL, INT4, INT8, NUMERIC, PgType, TEXT, TIMESTAMPTZ
53
+ from .expr import gen_expr
54
+
55
+
56
# Probabilities for attaching the optional SEARCH and CYCLE clauses to
# a recursive CTE. `gen_recursive_cte_def` compares one `rng.random()`
# draw against each constant, so the two clauses are decided
# independently: both can fire on the same CTE, both can be skipped.
# Real-world recursive CTEs are usually bare (no SEARCH/CYCLE); these
# probabilities are biased moderate-low.
_P_RECURSIVE_SEARCH: float = 0.3
_P_RECURSIVE_CYCLE: float = 0.3
62
+
63
+
64
# Recursive CTE column type pool. Restricted to types where
# arithmetic-style recursion is meaningful (counter increments,
# string concatenation paths). The base/recursive arms must agree
# on column type; this pool is what both arms produce.
#
# `gen_cte_def` also draws from this pool when it synthesizes the
# extra (c2, c3, ...) columns of a non-recursive CTE, so a change
# here affects plain CTEs as well.
_RECURSIVE_COLUMN_TYPES: tuple[PgType, ...] = (
    INT4, INT8, NUMERIC, TEXT,
)
71
+
72
+
73
def gen_cte_def(
    ctx: GenContext,
    name: str,
) -> tuple[CteDef, list[tuple[str, PgType]]]:
    """Generate one non-recursive CTE definition.

    The body comes from `gen_select` run on a non-correlated child
    context. Its first target is kept and aliased `c1`. When the body
    has no GROUP BY, between 0 and `max_cte_columns - 1` additional
    non-aggregate targets are generated in the same child scope (so
    they see the same FROM-clause bindings) and aliased `c2`, `c3`, ...

    Args:
        ctx: Enclosing generation context; the CTE body is generated
            in a subquery context derived from it.
        name: Name of the CTE being defined.

    Returns:
        A `(CteDef, columns)` pair: the AST node for the WITH list and
        the ordered `(alias, type)` pairs the caller registers so that
        later CTEs and the main query can resolve `name.cN`.
    """
    # Function-scope import: gen/select.py calls back into this module
    # (via `_gen_with_clause`), so importing it at module top-level
    # would create an import cycle.
    from .select import gen_select

    body_ctx = ctx.descend_subquery(correlated=False)
    body = gen_select(body_ctx)

    # Grouped bodies stay single-column: plain column refs added next
    # to a GROUP BY would break the "every non-aggregate target must be
    # in GROUP BY" rule (PG: 42803). randint is inclusive on both ends,
    # so the draw below yields 0..max_cte_columns-1 extras.
    # NOTE(review): only `group_by` is inspected here; if gen_select can
    # emit an aggregate target without GROUP BY, extras would hit the
    # same error — confirm against gen_select.
    if body.group_by:
        extra_count = 0
    else:
        extra_count = body_ctx.rng.randint(
            0, body_ctx.config.max_cte_columns - 1
        )

    extras: list[Expr] = []
    if extra_count > 0:
        no_agg_ctx = replace(body_ctx, allow_aggregates=False)
        # Draw every type before generating any expression so the RNG
        # is consumed in a fixed order: all type picks, then all exprs.
        drawn_types = [
            body_ctx.rng.choice(_RECURSIVE_COLUMN_TYPES)
            for _ in range(extra_count)
        ]
        for pg_type in drawn_types:
            extras.append(gen_expr(no_agg_ctx, pg_type))

    # Original target first (c1), synthesized extras after (c2, c3, ...).
    exprs = [body.targets[0].expr, *extras]
    aliases = [f"c{position}" for position in range(1, len(exprs) + 1)]

    aliased_body = replace(
        body,
        targets=tuple(
            SelectTarget(expr=expr, alias=alias)
            for expr, alias in zip(exprs, aliases)
        ),
    )
    columns = [(alias, expr.pg_type) for alias, expr in zip(aliases, exprs)]
    return CteDef(name=name, select=aliased_body), columns
148
+
149
+
150
def gen_recursive_cte_def(
    ctx: GenContext,
    name: str,
) -> tuple[CteDef, list[tuple[str, PgType]]]:
    """Generate a recursive CTE: `name AS (base UNION ALL recursive)`.

    Both arms project the same N columns `c1`..`cN`
    (1 <= N <= `max_cte_columns`), with positionally matching types
    drawn from `_RECURSIVE_COLUMN_TYPES`:

    * Base arm — built by `_build_recursive_arm_base` in its own
      subquery context, before `name` is registered, so it cannot
      reference the CTE itself.
    * Recursive arm — built by `_build_recursive_arm_self_ref` after
      registration; its FROM clause contains a CteRef to `name`.

    Side effect: `name` is registered on `ctx.scope` between the two
    arm builds. The caller (`_gen_with_clause`) must not register it
    again; it is expected to tell the two cases apart by checking
    `cte_def.recursive`.

    Optional SEARCH and CYCLE clauses are attached with independent
    probabilities `_P_RECURSIVE_SEARCH` / `_P_RECURSIVE_CYCLE`, both
    keyed on `c1`. Their synthetic columns (`search_seq`, `is_cycle`,
    `cycle_path`) are deliberately left out of the returned column list.

    Args:
        ctx: Enclosing generation context (its scope is mutated).
        name: Name of the recursive CTE.

    Returns:
        `(CteDef, columns)` in the same shape as `gen_cte_def`.
    """
    rng = ctx.rng

    # Column count first, then one type per column — both arms are
    # built from this single tuple, which is what keeps them aligned.
    column_count = rng.randint(1, ctx.config.max_cte_columns)
    target_types = tuple(
        rng.choice(_RECURSIVE_COLUMN_TYPES) for _ in range(column_count)
    )
    columns = [
        (f"c{position}", pg_type)
        for position, pg_type in enumerate(target_types, start=1)
    ]

    # ORDERING DEPENDENCY: base arm -> register -> recursive arm.
    # Registering earlier would expose the name to the base arm (PG
    # rejects that); registering later would leave the recursive arm
    # with nothing to reference.
    base_arm = _build_recursive_arm_base(
        ctx.descend_subquery(correlated=False), target_types
    )
    ctx.scope.add_cte(name, columns)
    step_arm = _build_recursive_arm_self_ref(
        ctx.descend_subquery(correlated=False), name, target_types
    )

    body = SetOp(op="UNION", all=True, arms=(base_arm, step_arm))

    # In each conditional expression the probability draw is evaluated
    # before the clause is constructed, so the RNG sequence is:
    # search roll, (breadth/depth roll if search fired), cycle roll.
    search_clause = (
        CteSearch(
            breadth_first=rng.random() < 0.5,
            by_columns=("c1",),
            set_column="search_seq",
        )
        if rng.random() < _P_RECURSIVE_SEARCH
        else None
    )
    cycle_clause = (
        CteCycle(
            columns=("c1",),
            cycle_mark_column="is_cycle",
            path_column="cycle_path",
        )
        if rng.random() < _P_RECURSIVE_CYCLE
        else None
    )

    cte = CteDef(
        name=name,
        select=body,
        recursive=True,
        search=search_clause,
        cycle=cycle_clause,
    )
    return cte, columns
245
+
246
+
247
def _build_recursive_arm_base(
    ctx: GenContext,
    target_types: tuple[PgType, ...],
) -> Select:
    """Build the base (anchor) arm of a recursive CTE.

    Generates a FROM clause in `ctx`, then one non-aggregate target
    expression per entry of `target_types`, aliased `c1`..`cN` in
    order.

    Args:
        ctx: Subquery context dedicated to this arm.
        target_types: Required type of each output column, in order.

    Returns:
        A `Select` carrying only targets and a FROM clause.
    """
    # Function-scope import, same pattern as the gen_select import in
    # gen_cte_def.
    from .select import _gen_from_clause

    from_clause = _gen_from_clause(ctx)

    # Aggregates are switched off so the base arm has the same
    # non-aggregate shape as the recursive arm.
    no_agg_ctx = replace(ctx, allow_aggregates=False)

    targets: list[SelectTarget] = []
    for position, pg_type in enumerate(target_types, start=1):
        targets.append(
            SelectTarget(
                expr=gen_expr(no_agg_ctx, pg_type),
                alias=f"c{position}",
            )
        )

    return Select(targets=tuple(targets), from_=from_clause)
268
+
269
+
270
def _build_recursive_arm_self_ref(
    ctx: GenContext,
    cte_name: str,
    target_types: tuple[PgType, ...],
) -> Select:
    """Build the recursive (step) arm of a recursive CTE.

    The FROM clause always contains a CteRef to `cte_name` under a
    fresh `tN` alias, and with probability 0.7 also one randomly chosen
    schema table (comma-joined). Every target `cN` is a plain column
    reference to the self-reference's `cN`, so the arm's column types
    match `target_types` by construction. The WHERE clause is
    `self.c1 < <bound>`, with the bound supplied by
    `_typed_terminator_literal`.

    Args:
        ctx: Subquery context dedicated to this arm; its scope and
            alias counter are mutated.
        cte_name: Name of the CTE being defined (already registered by
            the caller).
        target_types: Type of each output column, in order.

    Returns:
        The `Select` for the recursive arm.
    """
    rng = ctx.rng

    # Self-reference: fresh alias, registered in scope with all N
    # columns so the targets below can point at any of them.
    recursive_alias = f"t{ctx.alias_counter.take()}"
    recursive_ref = CteRef(cte_name=cte_name, alias=recursive_alias)
    ctx.scope.add_derived(
        recursive_alias,
        [
            (f"c{position}", pg_type)
            for position, pg_type in enumerate(target_types, start=1)
        ],
    )

    # Optional driver table alongside the self-reference, mirroring
    # the common "CTE joined to a base relation" recursive shape.
    sources: list[FromItem] = [recursive_ref]
    if rng.random() < 0.7:
        driver_table = rng.choice(ctx.schema.tables)
        driver_alias = f"t{ctx.alias_counter.take()}"
        ctx.scope.add_table(driver_alias, driver_table)
        sources.append(TableRef(driver_table.name, driver_alias))

    # cN := self.cN — positional type match with the base arm, which
    # UNION ALL requires.
    projected: list[SelectTarget] = []
    for position, pg_type in enumerate(target_types, start=1):
        column_name = f"c{position}"
        projected.append(
            SelectTarget(
                expr=ColumnRef(pg_type, recursive_alias, column_name),
                alias=column_name,
            )
        )

    # Predicate on self.c1 intended as the recursion bound.
    # NOTE(review): the targets re-project self.cN unchanged, so a row
    # satisfying `c1 < bound` appears to satisfy it again on every
    # iteration; confirm whether these queries are ever executed
    # without a CYCLE clause or an outer LIMIT.
    first_type = target_types[0]
    termination: Expr = BinaryOp(
        BOOL,
        "<",
        ColumnRef(first_type, recursive_alias, "c1"),
        _typed_terminator_literal(first_type),
    )

    return Select(
        targets=tuple(projected),
        from_=tuple(sources),
        where=termination,
    )
344
+
345
+
346
def _typed_terminator_literal(t: PgType) -> Literal:
    """Return the literal used as the right-hand side of the recursive
    arm's `c1 < ...` predicate for a column of type `t`.

    Integer types get 1000, NUMERIC gets 1000.0, TEXT gets "zzz" and
    TIMESTAMPTZ gets a fixed timestamp string. Any other type gets a
    `Literal(t, None)`.
    """
    if t in (INT4, INT8):
        bound = 1000
    elif t == NUMERIC:
        bound = 1000.0
    elif t == TEXT:
        bound = "zzz"
    elif t == TIMESTAMPTZ:
        bound = "2024-12-31 00:00:00+00"
    else:
        # Tripwire for a future expansion of the column-type pool:
        # every type currently in _RECURSIVE_COLUMN_TYPES is handled
        # above. The intent is for the predicate to become
        # `c1 < NULL` rather than invalid SQL.
        bound = None
    return Literal(t, bound)
365
+
366
+
367
+ __all__ = ["gen_cte_def", "gen_recursive_cte_def"]
@@ -0,0 +1,14 @@
1
+ """Internal package for the data generator.
2
+
3
+ Public entry point is `waxsql.data.generate_data`. Submodules:
4
+ - strategies: per-type value generators
5
+ - columns: column-name override registry (hook for future plausibility)
6
+ - rows: topological row materialization + FK resolution
7
+ - emit: COPY block formatting
8
+
9
+ Role: every module in this package is internal. Callers should import
10
+ through `waxsql.data` (or the top-level `waxsql.generate_data` re-export);
11
+ the layout here may change without notice. The split exists so each
12
+ concern (type-keyed values, name-keyed overrides, FK ordering, COPY
13
+ encoding) lives in one file with one set of invariants.
14
+ """
@@ -0,0 +1,48 @@
1
+ """Column-name override registry.
2
+
3
+ This is a hook for the eventual 'column named email actually has emails'
4
+ story. Today the registry is empty and `strategy_for` falls
5
+ through to the type strategy in every case. The registry is a
6
+ tuple (not a dict) because order matters — first match wins — and
7
+ because tuple iteration is deterministic across Python versions.
8
+
9
+ Role in the system: `strategy_for(col)` is the single per-column
10
+ dispatch point used by `rows.generate_row`. Centralizing the lookup
11
+ here means future semantic plausibility (emails, names, URLs, etc.)
12
+ can grow inside this module without touching the row materializer or
13
+ the type-strategy registry.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import re
18
+
19
+ from waxsql.gen.data.strategies import Strategy, strategy_for_type
20
+ from waxsql.schema import Column
21
+
22
+
23
# Tuple of (compiled-pattern, strategy) pairs consulted by
# `strategy_for`. First match wins. Empty today — this is the seam for
# future semantic plausibility. Adding an entry here doesn't require
# schema changes; the dispatch happens per column at row-generation
# time.
#
# Why a tuple of (pattern, strategy) pairs and not a dict of name → strategy:
# (1) ordering matters when patterns can overlap (e.g. `email_verified_at`
# should match a timestamp pattern, not the email pattern), and tuple
# iteration order is part of the source; (2) regex matching against
# every column name avoids exact-name brittleness.
#
# Patterns are applied with `.search`, so an entry matches anywhere in
# the column name unless the pattern itself is anchored.
_NAME_PATTERNS: tuple[tuple[re.Pattern, Strategy], ...] = ()
34
+
35
+
36
def strategy_for(column: Column) -> Strategy:
    """Pick the value strategy for `column`.

    `_NAME_PATTERNS` is scanned in order and the strategy of the first
    pattern found anywhere in the column name is returned. If no
    pattern matches, the result is `strategy_for_type(column.type)`.

    With the registry empty, the type-based fallback is the normal
    path, not an error path.
    """
    for pattern, override in _NAME_PATTERNS:
        if pattern.search(column.name) is None:
            continue
        return override
    return strategy_for_type(column.type)
@@ -0,0 +1,247 @@
1
+ """COPY block formatting.
2
+
3
+ PostgreSQL's text-format COPY uses tab as the column separator, `\\N`
4
+ as the NULL sentinel, and backslash-escapes for tab, newline, carriage
5
+ return, and backslash itself. This module owns all of that; strategies
6
+ return native Python values and the emitter formats them.
7
+
8
+ Role in the system: this is the bottom of the value-rendering stack.
9
+ Everything above it (strategies, row materializer) traffics in native
10
+ Python objects; nothing above this module knows anything about tab
11
+ encoding, NULL sentinels, COPY framing, or PG array literal syntax.
12
+ That separation lets the strategy registry stay trivially testable
13
+ (no string round-trips needed to compare values).
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import datetime as _dt
18
+ import json
19
+ import uuid
20
+ from decimal import Decimal
21
+ from collections.abc import Iterable, Sequence
22
+
23
+
24
# PG text-format COPY: the literal two-character sequence `\N` (backslash-N)
# in an unescaped position is the NULL marker. Raw string so the backslash
# stays a backslash — not an escape introducer for Python. `encode_value`
# returns this for a top-level `None`; array elements use the bare token
# `NULL` instead (see `_array_element`).
NULL_SENTINEL = r"\N"
28
+
29
# COPY text-format escape rules, as (raw character, escaped form) pairs
# applied in sequence by `_escape_text`. Order matters: backslash MUST
# come first so we don't double-escape escapes we just inserted. If `\t`
# came first, the subsequent backslash pass would turn the `\` in
# `\t` into `\\`, producing `\\t` instead of the intended `\t`.
_ESCAPES: tuple[tuple[str, str], ...] = (
    ("\\", r"\\"),
    ("\t", r"\t"),
    ("\n", r"\n"),
    ("\r", r"\r"),
)
39
+
40
+
41
def _escape_text(s: str) -> str:
    """Apply the COPY text-format escapes from `_ESCAPES` to `s`.

    The pairs are applied one after another in table order (backslash
    first), each as a whole-string `str.replace`.
    """
    escaped = s
    for needle, replacement in _ESCAPES:
        # Skipping absent characters leaves the result unchanged;
        # replace() on a string without the needle is a no-op anyway.
        if needle in escaped:
            escaped = escaped.replace(needle, replacement)
    return escaped
48
+
49
+
50
def encode_value(v: object) -> str:
    """Encode a Python value for PG's text-format COPY.

    Supported inputs and their encodings:

    * `None` -> the row-level NULL sentinel (`NULL_SENTINEL`).
    * `bool` -> `t` / `f`.
    * `int`, `float`, `Decimal`, `uuid.UUID` -> `str(v)`.
    * `datetime` -> `str(v)` (space-separated date and time, with the
      UTC offset appended when the value is tz-aware).
    * `date` -> `v.isoformat()`.
    * `timedelta` -> ISO-8601 duration `P<days>DT<seconds>S`, with a
      six-digit fractional part on the seconds when the value carries
      microseconds.
    * `dict` -> compact JSON, then COPY-escaped.
    * `list` -> PG array literal `{elem,elem,...}`.
    * `str` -> COPY-escaped text.

    Dispatch ORDER is load-bearing: bool is tested before int, and
    datetime before date, because each is a subclass of the other.

    Raises:
        TypeError: if `v` is of a type with no encoder.
        ValueError: from `json.dumps(..., allow_nan=False)` when a dict
            contains NaN or infinity.
    """
    if v is None:
        return NULL_SENTINEL
    if isinstance(v, bool):
        # bool MUST come before int — bool is a subclass of int in
        # Python (isinstance(True, int) is True), so an earlier int
        # branch would swallow booleans and emit them as "True"/"False".
        return "t" if v else "f"
    if isinstance(v, (int, float)):
        return str(v)
    if isinstance(v, Decimal):
        return str(v)
    if isinstance(v, uuid.UUID):
        return str(v)
    if isinstance(v, _dt.datetime):
        # datetime is a subclass of date, so this branch MUST come
        # before the date branch below.
        return str(v)
    if isinstance(v, _dt.date):
        return v.isoformat()
    if isinstance(v, _dt.timedelta):
        # ISO-8601 duration: PnDTnS. timedelta normalizes itself to
        # (days, seconds, microseconds) with seconds and microseconds
        # always non-negative, so a negative duration shows up as a
        # negative day count plus a positive time part.
        #
        # Microseconds are emitted as a fractional seconds field when
        # present; dropping them would silently truncate sub-second
        # intervals. Whole-second values keep the short `PnDTnS` form.
        days = v.days
        seconds = v.seconds
        micros = v.microseconds
        if micros:
            return f"P{days}DT{seconds}.{micros:06d}S"
        return f"P{days}DT{seconds}S"
    if isinstance(v, dict):
        # Compact JSON (explicit separators; the json.dumps default
        # pads with ", " and ": ").
        #
        # The result is run through `_escape_text` because json.dumps
        # emits backslash sequences of its own, and COPY's reader also
        # treats backslash as an escape introducer. Doubling every
        # backslash lets COPY hand the original JSON text to PG.
        #
        # allow_nan=False: fail at generation time rather than emit
        # Infinity/NaN tokens into the JSON text.
        return _escape_text(json.dumps(v, separators=(",", ":"), allow_nan=False))
    if isinstance(v, list):
        # PG text-format array literal: `{elem,elem,...}`. Per-element
        # formatting is delegated to `_array_element`.
        return "{" + ",".join(_array_element(e) for e in v) + "}"
    if isinstance(v, str):
        return _escape_text(v)
    raise TypeError(f"no COPY encoder for type {type(v).__name__}: {v!r}")
122
+
123
+
124
def _array_element(v: object) -> str:
    """Format one element for inclusion in a PG array literal.

    * `None` becomes the bare token `NULL`, not the row-level NULL
      sentinel, which inside `{...}` would be read as ordinary text.
    * `str` elements are quoted and escaped by `_quote_array_element`.
    * `dict` elements are serialized to compact JSON and then quoted
      the same way.
    * Everything else — nested lists and non-string scalars — is
      emitted unquoted via `encode_value`; for a nested list that
      yields an inner `{...}` literal, i.e. a sub-array.
    """
    if v is None:
        return "NULL"
    if isinstance(v, str):
        return _quote_array_element(v)
    if isinstance(v, dict):
        # allow_nan=False mirrors the top-level dict encoder: reject
        # NaN/Infinity at generation time.
        json_text = json.dumps(v, separators=(",", ":"), allow_nan=False)
        return _quote_array_element(json_text)
    # Lists recurse through encode_value's array branch; remaining
    # scalars use their plain COPY encoding.
    return encode_value(v)
177
+
178
+
179
def _quote_array_element(s: str) -> str:
    """Quote and escape `s` for use as a PG array literal element.

    Two escape layers are applied, in the reverse of the order in
    which PG undoes them when reading:

    1. Array-element escapes: every backslash is doubled and every
       double quote is prefixed with a backslash.
    2. COPY escapes via `_escape_text`: every backslash (including the
       ones added in step 1) is doubled again, and tab / newline /
       carriage return are replaced by their backslash forms.

    The result is wrapped in double quotes. A single source backslash
    therefore ends up as four backslashes in the output.
    """
    array_layer = s.replace("\\", "\\\\")
    array_layer = array_layer.replace('"', '\\"')
    return f'"{_escape_text(array_layer)}"'
212
+
213
+
214
def emit_copy_block(
    table_name: str,
    columns: Sequence[str],
    rows: Iterable[Sequence[object]],
) -> str:
    """Format one COPY block.

    Output shape:
        COPY "table" ("col1", "col2") FROM STDIN;
        v11<TAB>v12
        v21<TAB>v22
        \\.

    Identifiers are always double-quoted, which handles
    case-sensitivity and reserved-word collisions uniformly. A double
    quote inside an identifier is written as two double quotes, so
    such names still form a single valid quoted identifier.

    The empty-rows case is legal: the header is followed directly by
    the terminator line.

    Args:
        table_name: Target table, quoted as a single identifier.
        columns: Column names, in the order values appear in each row.
        rows: Row value sequences; each value goes through
            `encode_value`.

    Returns:
        The complete block, ending with the terminator line and a
        trailing newline.
    """

    def quote_ident(ident: str) -> str:
        # Double any embedded quote so it cannot end the identifier.
        return '"' + ident.replace('"', '""') + '"'

    col_list = ", ".join(quote_ident(c) for c in columns)
    # Collect parts and join once rather than growing a string with +=.
    parts = [f"COPY {quote_ident(table_name)} ({col_list}) FROM STDIN;\n"]
    for row in rows:
        parts.append("\t".join(encode_value(v) for v in row))
        parts.append("\n")
    # End-of-data marker: a line containing only backslash-period,
    # followed by a newline so concatenated blocks separate cleanly.
    parts.append("\\.\n")
    return "".join(parts)