waxsql 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- waxsql/__init__.py +158 -0
- waxsql/ast.py +757 -0
- waxsql/catalog.py +363 -0
- waxsql/cli.py +888 -0
- waxsql/config.py +477 -0
- waxsql/context.py +255 -0
- waxsql/data.py +99 -0
- waxsql/gen/__init__.py +51 -0
- waxsql/gen/cte.py +367 -0
- waxsql/gen/data/__init__.py +14 -0
- waxsql/gen/data/columns.py +48 -0
- waxsql/gen/data/emit.py +247 -0
- waxsql/gen/data/rows.py +236 -0
- waxsql/gen/data/strategies.py +299 -0
- waxsql/gen/expr.py +723 -0
- waxsql/gen/select.py +831 -0
- waxsql/gen/setop.py +259 -0
- waxsql/gen/subquery.py +397 -0
- waxsql/gen/window.py +398 -0
- waxsql/pretty.py +81 -0
- waxsql/printer.py +688 -0
- waxsql/py.typed +0 -0
- waxsql/schema.py +557 -0
- waxsql/scope.py +391 -0
- waxsql/types.py +187 -0
- waxsql/validate/__init__.py +52 -0
- waxsql/validate/parse.py +194 -0
- waxsql/validate/plan.py +149 -0
- waxsql/validate/syntax.py +87 -0
- waxsql-1.0.0.dist-info/METADATA +746 -0
- waxsql-1.0.0.dist-info/RECORD +35 -0
- waxsql-1.0.0.dist-info/WHEEL +5 -0
- waxsql-1.0.0.dist-info/entry_points.txt +2 -0
- waxsql-1.0.0.dist-info/licenses/LICENSE +21 -0
- waxsql-1.0.0.dist-info/top_level.txt +1 -0
waxsql/gen/data/rows.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
"""Per-table row materialization and FK resolution.
|
|
2
|
+
|
|
3
|
+
The walk is one topological pass over the FK DAG. For each table, we
|
|
4
|
+
materialize all rows in memory as Python tuples, capture the PK column
|
|
5
|
+
values into a per-table ID store, and hand the tuples to the emitter.
|
|
6
|
+
Children resolve FK columns by sampling from the parent's ID list,
|
|
7
|
+
which is guaranteed populated by the topological order.
|
|
8
|
+
|
|
9
|
+
Self-referential FKs (a column in table T referencing T.id) are handled
|
|
10
|
+
as a special case: row with pk=N can reference any row in [1..N] including
|
|
11
|
+
itself, because PostgreSQL checks FK constraints per-row by default (NOT
|
|
12
|
+
DEFERRABLE). Nullable self-FK columns honour null_fraction (the null roll
|
|
13
|
+
fires above, before the FK branch, and controls whether the column is NULL);
|
|
14
|
+
NOT NULL self-FK columns always sample from [1..pk].
|
|
15
|
+
|
|
16
|
+
In-memory materialization is fine at demo scales (default 100 rows ×
|
|
17
|
+
fanout 5 across a few levels of depth is comfortably under a megabyte).
|
|
18
|
+
If volume ever becomes a real constraint, a streaming variant can be
|
|
19
|
+
added behind the same public API.
|
|
20
|
+
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import random
|
|
24
|
+
from collections.abc import Mapping
|
|
25
|
+
|
|
26
|
+
from waxsql.gen.data.columns import strategy_for
|
|
27
|
+
from waxsql.schema import Schema, Table
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def topological_order(schema: Schema) -> list[Table]:
    """Order the schema's tables so that FK targets precede their referrers.

    The result is built level by level: each pass collects every table
    whose external FK targets have all been placed already, sorts that
    level by table name, and appends it. Sorting makes the result
    independent of set iteration order.

    Self-referential FKs are ignored for ordering purposes, since a table
    does not need to precede itself.

    Raises:
        ValueError: if some tables remain but none of them is placeable,
            which means the remaining tables form an FK cycle.
    """
    tables_by_name: dict[str, Table] = {}
    blockers: dict[str, set[str]] = {}
    for tbl in schema.tables:
        tables_by_name[tbl.name] = tbl
        # Only FKs that point at a *different* table constrain the order.
        blockers[tbl.name] = {
            fk.ref_table for fk in tbl.foreign_keys if fk.ref_table != tbl.name
        }

    pending = set(tables_by_name)
    ordered: list[Table] = []
    while pending:
        # A table is placeable once none of its FK targets is still pending.
        level = [name for name in pending if pending.isdisjoint(blockers[name])]
        if not level:
            # Nothing placeable while tables remain: the remainder is cyclic.
            raise ValueError(
                f"FK cycle in schema: remaining {sorted(pending)!r}"
            )
        # `pending` is a set, so its iteration order is not stable; the
        # explicit sort is what keeps the output deterministic.
        level.sort()
        ordered.extend(tables_by_name[name] for name in level)
        pending.difference_update(level)
    return ordered
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def depth_of(table: Table, schema: Schema) -> int:
    """Return the length of the longest FK chain leading to `table`.

    A table with no FK to another table has depth 0. Otherwise the depth
    is one more than the greatest depth among the tables it references,
    so multiple parents resolve to the longest chain. Self-referential
    FKs are skipped and contribute nothing.

    Results are cached in a dict local to this call, so shared ancestry
    is walked once per invocation but nothing is reused between calls.

    Raises:
        ValueError: if a table is reached again along the current
            recursion path (an FK cycle).
        KeyError: if an FK references a table name absent from `schema`.
    """
    lookup = {tbl.name: tbl for tbl in schema.tables}
    known: dict[str, int] = {}

    def longest_chain(current: str, path: tuple[str, ...] = ()) -> int:
        # `path` holds the names on the active recursion path; seeing
        # `current` there again means the FKs loop back on themselves.
        if current in path:
            raise ValueError(f"FK cycle through {current}: {path}")
        cached = known.get(current)
        if cached is not None:
            return cached
        extended = path + (current,)
        depth = 0
        for fk in lookup[current].foreign_keys:
            if fk.ref_table == current:
                # A self-reference would recurse forever and adds no depth.
                continue
            depth = max(depth, 1 + longest_chain(fk.ref_table, extended))
        known[current] = depth
        return depth

    return longest_chain(table.name)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# Name of the primary-key column. The schema generator fixes the
# convention: every table has an `id BIGINT NOT NULL` PK in the first
# column position. `generate_row` matches columns against this name
# rather than discovering the PK dynamically; if the invariant ever
# changes, this module will produce nonsense for that table and tests
# will fail loudly, which is the desired behavior.
_PK_COLUMN_NAME = "id"
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def rows_for_table(table: Table, schema: Schema, *, base: int, fanout: int) -> int:
    """Return how many rows to generate for `table`.

    The count is `base * fanout ** depth`, where `depth` comes from
    `depth_of(table, schema)`. Root tables have depth 0 and therefore get
    exactly `base` rows whatever the fanout. No upper bound is applied;
    callers are responsible for choosing sane `base`/`fanout` values.

    The geometric growth makes child tables outnumber their parents, e.g.
    base=100, fanout=5, depth=3 gives 12500 rows.
    """
    depth = depth_of(table, schema)
    multiplier = fanout ** depth
    return base * multiplier
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def generate_row(
|
|
157
|
+
*,
|
|
158
|
+
table: Table,
|
|
159
|
+
pk: int,
|
|
160
|
+
rng: random.Random,
|
|
161
|
+
id_store: Mapping[str, list[int]],
|
|
162
|
+
null_fraction: float,
|
|
163
|
+
) -> tuple[object, ...]:
|
|
164
|
+
"""Generate one row for `table` as a Python tuple.
|
|
165
|
+
|
|
166
|
+
PK column gets `pk`; FK columns sample from `id_store[ref_table]`;
|
|
167
|
+
other columns dispatch through `strategy_for`. Nullable columns —
|
|
168
|
+
including nullable FK columns — roll for NULL first; only columns
|
|
169
|
+
that survive the null roll reach FK or strategy dispatch.
|
|
170
|
+
|
|
171
|
+
Self-referential FKs bypass `id_store` entirely: PostgreSQL's default
|
|
172
|
+
NOT DEFERRABLE constraint checking allows row pk=N to reference any
|
|
173
|
+
row in [1..N] (itself included, since the row exists by check time).
|
|
174
|
+
Both nullable and non-nullable self-FK columns sample from [1..pk] once
|
|
175
|
+
they reach the FK branch — the null roll above already gave nullable
|
|
176
|
+
columns their chance to become NULL, so null_fraction is honoured
|
|
177
|
+
correctly instead of forcing self-FK nullables to always be NULL.
|
|
178
|
+
"""
|
|
179
|
+
# Build a column-name → ref_table lookup once per row. With most
|
|
180
|
+
# schemas having ≤ a handful of FKs per table this is cheap.
|
|
181
|
+
# Dict order doesn't matter here — we lookup by name, not iterate.
|
|
182
|
+
fk_by_col: dict[str, str] = {}
|
|
183
|
+
for fk in table.foreign_keys:
|
|
184
|
+
for child_col, _parent_col in zip(fk.columns, fk.ref_columns, strict=True):
|
|
185
|
+
fk_by_col[child_col] = fk.ref_table
|
|
186
|
+
|
|
187
|
+
values: list[object] = []
|
|
188
|
+
# Column ORDER matters: every consumed rng call advances the shared
|
|
189
|
+
# RNG state, so a swap here changes downstream output. The iteration
|
|
190
|
+
# order follows `table.columns`, which is the schema generator's
|
|
191
|
+
# declared order — same source of truth as the DDL emitter.
|
|
192
|
+
for col in table.columns:
|
|
193
|
+
if col.name == _PK_COLUMN_NAME:
|
|
194
|
+
# PK is supplied by the caller (sequential 1..n). PK never
|
|
195
|
+
# rolls for NULL and never goes through a strategy — bypass.
|
|
196
|
+
values.append(pk)
|
|
197
|
+
continue
|
|
198
|
+
if col.nullable and rng.random() < null_fraction:
|
|
199
|
+
# Null roll consumes one rng tick — this consumption happens
|
|
200
|
+
# for EVERY nullable column regardless of FK status, which
|
|
201
|
+
# keeps the FK and non-FK paths byte-stable when a column's
|
|
202
|
+
# type changes between FK and non-FK in future refactors.
|
|
203
|
+
values.append(None)
|
|
204
|
+
continue
|
|
205
|
+
if col.name in fk_by_col:
|
|
206
|
+
ref_table = fk_by_col[col.name]
|
|
207
|
+
if ref_table == table.name:
|
|
208
|
+
# Self-FK: PG enforces FKs per-row (default NOT DEFERRABLE),
|
|
209
|
+
# so row pk=N can reference any row in [1..N] including itself.
|
|
210
|
+
# Without this branch we'd try to read id_store[table.name]
|
|
211
|
+
# which is empty (populated only after this loop in data.py).
|
|
212
|
+
# For nullable columns, the null roll has already fired above
|
|
213
|
+
# — any column that reaches this branch survived that roll
|
|
214
|
+
# and should get a real value, not unconditional NULL.
|
|
215
|
+
values.append(rng.randint(1, pk))
|
|
216
|
+
continue
|
|
217
|
+
parent_ids = id_store[ref_table]
|
|
218
|
+
# If the parent table has zero rows, this FK column must be NULL.
|
|
219
|
+
# A NOT NULL FK into an empty parent table is a generator error —
|
|
220
|
+
# the topological walk should have materialized parents first.
|
|
221
|
+
# This branch ONLY fires when the user passes --rows=0 (or the
|
|
222
|
+
# parent's depth-adjusted count rounds to 0, which can't happen
|
|
223
|
+
# with the current `base * fanout**depth` formula and base > 0).
|
|
224
|
+
if not parent_ids:
|
|
225
|
+
if not col.nullable:
|
|
226
|
+
raise ValueError(
|
|
227
|
+
f"NOT NULL FK {table.name}.{col.name} references "
|
|
228
|
+
f"empty parent {ref_table}"
|
|
229
|
+
)
|
|
230
|
+
values.append(None)
|
|
231
|
+
continue
|
|
232
|
+
values.append(rng.choice(parent_ids))
|
|
233
|
+
continue
|
|
234
|
+
strat = strategy_for(col)
|
|
235
|
+
values.append(strat(rng, col))
|
|
236
|
+
return tuple(values)
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
"""Per-PgType value strategies. Each strategy maps (rng, Column) → object;
|
|
2
|
+
the emitter is responsible for formatting that object for COPY.
|
|
3
|
+
|
|
4
|
+
The split between strategies and the emitter is deliberate: strategies
|
|
5
|
+
return native Python values (Decimal, datetime, UUID, dict, list) and
|
|
6
|
+
have no knowledge of tab encoding or NULL sentinels. That keeps the
|
|
7
|
+
strategy registry trivially testable and lets the emitter own all of
|
|
8
|
+
PostgreSQL's COPY text-format escape rules in one place.
|
|
9
|
+
|
|
10
|
+
Determinism invariants (load-bearing across this whole module):
|
|
11
|
+
* Only the injected `rng` is allowed as a source of randomness.
|
|
12
|
+
No `uuid.uuid4()`, no `random.gauss()` from the global module,
|
|
13
|
+
no `datetime.now()`, no environment lookups.
|
|
14
|
+
* `_EPOCH` is a fixed date constant — wall-clock input would mean
|
|
15
|
+
the same seed produces different data tomorrow, breaking the
|
|
16
|
+
"reproduce a bug from a seed years later" guarantee.
|
|
17
|
+
* Dict iteration order is stable in Python 3.7+, so `_TYPE_STRATEGIES`
|
|
18
|
+
is fine; but anything that iterates a `set` for rng decisions must
|
|
19
|
+
`sorted()` first (none currently do).
|
|
20
|
+
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import datetime as _dt
|
|
24
|
+
import random
|
|
25
|
+
import uuid
|
|
26
|
+
from dataclasses import dataclass
|
|
27
|
+
from decimal import Decimal
|
|
28
|
+
from collections.abc import Callable
|
|
29
|
+
|
|
30
|
+
from waxsql.schema import Column
|
|
31
|
+
from waxsql.types import PgType
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Hand-curated short list of simple English words. Used as the default
# text/varchar value source: each cell gets one of these. Not a name
# dictionary, not Lorem Ipsum — just enough variety that SELECT * FROM t
# doesn't look like line noise. Grow as taste dictates.
#
# The tuple's order is significant: `pick_word` selects an entry with
# `rng.choice(WORDLIST)`, so reordering, inserting, or removing words
# changes which word a given seed produces.
WORDLIST: tuple[str, ...] = (
    "alpha", "amber", "anchor", "apple", "arrow", "atlas", "azure",
    "badger", "basin", "beacon", "berry", "birch", "blossom", "boulder",
    "breeze", "bridge", "bronze", "buffalo", "cabin", "candle", "canyon",
    "cedar", "cherry", "cinder", "clover", "cobalt", "comet", "copper",
    "coral", "cottage", "crater", "crescent", "crimson", "crystal",
    "dahlia", "dawn", "delta", "diamond", "dolphin", "dove", "dusk",
    "ember", "emerald", "falcon", "feather", "fern", "festival", "fjord",
    "forest", "fossil", "frost", "galaxy", "garnet", "gentian", "geyser",
    "glacier", "glade", "granite", "harbor", "harvest", "haven", "hazel",
    "heron", "hickory", "horizon", "indigo", "iris", "island", "ivory",
    "jasper", "juniper", "kestrel", "lagoon", "lantern", "lavender",
    "library", "linden", "lotus", "magnet", "maple", "marble", "marigold",
    "marsh", "meadow", "mercury", "midnight", "mineral", "mint", "mirage",
    "morning", "mosaic", "mountain", "nectar", "nimbus", "nocturne",
    "oasis", "obsidian", "ocean", "olive", "opal", "orchid", "otter",
    "panther", "parsley", "peach", "pebble", "pelican", "petal", "phlox",
    "pine", "plateau", "poppy", "prairie", "prism", "quartz", "quail",
    "quill", "rainbow", "raven", "redwood", "river", "robin", "rose",
    "ruby", "saffron", "sage", "salmon", "sapphire", "scarlet", "sequoia",
    "shadow", "shoreline", "silver", "slate", "solstice", "sparrow",
    "spruce", "starling", "stone", "summit", "sunset", "swallow", "tangerine",
    "thicket", "thistle", "thunder", "tide", "topaz", "tulip", "turquoise",
    "twilight", "umber", "valley", "velvet", "verdant", "violet", "walnut",
    "waterfall", "willow", "winter", "wisteria", "yarrow", "yew", "zenith",
    "zephyr", "zircon",
)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def pick_word(rng: random.Random) -> str:
    """Return one entry of `WORDLIST`, selected with `rng.choice`.

    The injected `rng` is the only source of randomness, so the same
    generator state always yields the same word.
    """
    chosen = rng.choice(WORDLIST)
    return chosen
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
# Per-type strategy functions
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
|
|
79
|
+
# A strategy is a pure function: (rng, column) -> Python value.
# It draws randomness only from the `rng` it is handed and returns a
# native Python object; the emitter formats that object for COPY.
Strategy = Callable[[random.Random, Column], object]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _int4(rng: random.Random, col: Column) -> int:
    """Return a random integer for an int4 column; `col` is unused."""
    # PG int4 is signed 32-bit: -2147483648..2147483647. The range is
    # kept symmetric, i.e. the lower bound sits one above the true
    # minimum, to avoid the minimum-int corner that some downstream
    # string formatters mishandle; losing that one value is irrelevant.
    magnitude = 2**31 - 1
    return rng.randint(-magnitude, magnitude)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _int8(rng: random.Random, col: Column) -> int:
    """Return a random integer in [-2**62, 2**62] for an int8 column.

    `col` is unused.
    """
    # PG int8 is signed 64-bit, but the range is deliberately narrower:
    # full-range values aren't interesting for demo data and look ugly.
    # The cost is that planner estimates depending on value distribution
    # see a slightly smaller spread.
    bound = 1 << 62
    return rng.randint(-bound, bound)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _text(rng: random.Random, col: Column) -> str:
    """Return one wordlist entry for a text column; `col` is unused."""
    word = pick_word(rng)
    return word
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _varchar(rng: random.Random, col: Column) -> str:
    """Return one wordlist entry, truncated to the column's length limit.

    The limit is the first element of `col.type.typmod` when a typmod is
    present (varchar(N) rejects longer values) and 32 otherwise.
    """
    # The word is drawn before the limit is inspected, so exactly one
    # rng tick is consumed whether or not a typmod is set. Adding a
    # length to a column therefore never shifts downstream values.
    word = pick_word(rng)
    typmod = col.type.typmod
    limit = typmod[0] if typmod else 32
    return word[:limit]
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _bool(rng: random.Random, col: Column) -> bool:
    """Return True or False, chosen with `rng.choice`; `col` is unused."""
    outcomes = (True, False)
    return rng.choice(outcomes)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _uuid(rng: random.Random, col: Column) -> uuid.UUID:
    """Return a UUID built from 128 bits of the injected rng.

    `col` is unused.
    """
    # Drawing the bits from `rng` keeps the value reproducible from the
    # seed; `uuid.uuid4()` uses an external entropy source and would
    # not. The result does not carry the version-4 bit pattern, which is
    # accepted here in exchange for determinism.
    bits = rng.getrandbits(128)
    return uuid.UUID(int=bits)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _float8(rng: random.Random, col: Column) -> float:
    """Return a uniformly distributed float in ±1,000,000.

    `col` is unused.
    """
    # Bounded so output stays human-readable, and flat rather than
    # bell-shaped: `rng.uniform`, not a gaussian draw.
    bound = 1_000_000.0
    return rng.uniform(-bound, bound)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _numeric(rng: random.Random, col: Column) -> Decimal:
    """Return a random Decimal that fits the column's numeric(P, S) typmod.

    `col.type.typmod` is interpreted as:

    * two or more elements: `(precision, scale)`;
    * one element: `(precision,)` with scale 0, mirroring `numeric(P)`;
    * empty or None: the default `(10, 4)`, i.e. six integer digits and
      four fractional digits.

    Exactly one `rng.randint` call is made. The draw is an integer
    coefficient that is then divided by `10**scale`, so no float
    arithmetic is involved.
    """
    typmod = col.type.typmod
    if typmod and len(typmod) >= 2:
        precision, scale = typmod[0], typmod[1]
    elif typmod:
        # PG allows `numeric(P)` (scale implicit 0). Mirror that.
        precision, scale = typmod[0], 0
    else:
        precision, scale = 10, 4

    integer_digits = precision - scale
    if integer_digits >= 1 and scale >= 0:
        # Ordinary shape: cap the integer part at `integer_digits` nines
        # and express the bound in units of 10**-scale. This bound is
        # left as it always was so existing seeds keep producing the
        # same values.
        bound = (10**integer_digits - 1) * 10**scale
    else:
        # Degenerate shapes. With precision == scale the ordinary bound
        # collapses to 0 and every value would be exactly 0; with
        # scale > precision or a negative scale the powers above turn
        # into floats and `randint` rejects them. Any coefficient of up
        # to `precision` digits is valid here, so use that full range.
        bound = 10**precision - 1

    raw = rng.randint(-bound, bound)
    return Decimal(raw) / (Decimal(10) ** scale)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# Fixed reference epoch — NOT `datetime.now()`. Determinism contract:
# same seed must produce same output years from now, which means no
# wall-clock input anywhere in the generator.
_EPOCH = _dt.date(2025, 1, 1)
# Half-width, in days, of the window around `_EPOCH` that the date and
# timestamptz strategies sample from. 5 * 365 = 1825 days, i.e. roughly
# ±5 years (leap days are not accounted for).
_WINDOW_DAYS = 5 * 365  # ±5 years
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _date(rng: random.Random, col: Column) -> _dt.date:
    """Return a date within `_WINDOW_DAYS` days either side of `_EPOCH`.

    Consumes one rng call; `col` is unused.
    """
    offset = _dt.timedelta(days=rng.randint(-_WINDOW_DAYS, _WINDOW_DAYS))
    return _EPOCH + offset
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _timestamptz(rng: random.Random, col: Column) -> _dt.datetime:
    """Return a UTC-aware datetime near `_EPOCH`; `col` is unused.

    The value is midnight UTC on `_EPOCH` plus a random day offset in
    ±`_WINDOW_DAYS`, a second of the day, and a microsecond.
    """
    # Three rng calls in a fixed order — days, seconds, microseconds.
    # Reordering them would shift every value generated afterwards.
    day_offset = rng.randint(-_WINDOW_DAYS, _WINDOW_DAYS)
    second_of_day = rng.randint(0, 86_399)
    microsecond = rng.randint(0, 999_999)
    # Always UTC, so the emitted value does not depend on any local or
    # client timezone setting.
    midnight_utc = _dt.datetime.combine(
        _EPOCH, _dt.time(), tzinfo=_dt.timezone.utc,
    )
    shift = _dt.timedelta(
        days=day_offset, seconds=second_of_day, microseconds=microsecond,
    )
    return midnight_utc + shift
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _interval(rng: random.Random, col: Column) -> _dt.timedelta:
    """Return a non-negative timedelta of at most 120 days plus one day's
    worth of seconds. `col` is unused.
    """
    # Two rng calls in fixed order: days, then seconds. `timedelta` has
    # no year/month parts, so only day and second components are used.
    day_part = rng.randint(0, 120)
    second_part = rng.randint(0, 86_399)
    return _dt.timedelta(days=day_part, seconds=second_part)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _jsonb(rng: random.Random, col: Column) -> dict:
    """Return a flat dict of 1-4 word keys mapped to scalar values.

    Values are never nested: each is a word, an int in [-1000, 1000], a
    float rounded to three places, a bool, or None. `col` is unused.

    If two iterations draw the same key, the later value replaces the
    earlier one, so the dict may hold fewer entries than iterations. The
    loop still runs the drawn number of times, so rng consumption does
    not depend on collisions.
    """
    entry_count = rng.randint(1, 4)
    payload: dict[str, object] = {}
    for _ in range(entry_count):
        # Per entry: one draw for the key, one for the value kind, then
        # at most one more for the value itself.
        key = pick_word(rng)
        kind = rng.randint(0, 4)
        value: object
        if kind == 4:
            # Literal JSON null — the only kind that draws nothing more.
            value = None
        elif kind == 3:
            value = rng.choice((True, False))
        elif kind == 2:
            value = round(rng.uniform(-1000.0, 1000.0), 3)
        elif kind == 1:
            value = rng.randint(-1000, 1000)
        else:
            # kind == 0: a second wordlist entry as a string value.
            value = pick_word(rng)
        payload[key] = value
    return payload
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
@dataclass(frozen=True)
class _ColumnAdapter:
    """Lightweight Column stand-in used when array strategies recurse on
    the element type. Avoids depending on the full Column constructor
    surface in case it grows constraints we don't care about here.

    Immutable (`frozen=True`). `_array` builds one per generated array
    and passes it to the element strategy in place of a real Column.
    """
    # Name of the array column the elements belong to.
    name: str
    # Element type of the array, not the array type itself.
    type: PgType
    # `_array` always passes False here.
    nullable: bool
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _array(element_strategy: Strategy, element_type: PgType) -> Strategy:
    """Wrap an element strategy into a strategy that produces lists.

    The returned strategy draws a length in 0-5 and then calls
    `element_strategy` that many times. Both arguments are captured by
    the closure, so the element strategy is resolved once, when the
    factory is called, rather than each time a value is generated.
    """
    def gen(rng: random.Random, col: Column) -> list:
        length = rng.randint(0, 5)
        # Elements are always generated as non-nullable: NULL injection
        # happens at the row level, and no NULL elements are produced
        # inside arrays.
        elem_col = _ColumnAdapter(name=col.name, type=element_type, nullable=False)
        items: list = []
        for _ in range(length):
            items.append(element_strategy(rng, elem_col))  # type: ignore[arg-type]
        return items

    return gen
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
# Type-name → strategy lookup. Keyed on `PgType.name` (matches pg_type.typname).
# Only scalar types are listed; array types are resolved by
# `strategy_for_type`, which wraps the element type's entry in `_array`.
# Adding a new scalar PgType requires adding an entry here AND making sure
# the emit module's `encode_value` knows how to format the strategy's
# return type. The two registries together define what the data generator
# can produce.
_TYPE_STRATEGIES: dict[str, Strategy] = {
    "int4": _int4,
    "int8": _int8,
    "text": _text,
    "varchar": _varchar,
    "bool": _bool,
    "uuid": _uuid,
    "float8": _float8,
    "numeric": _numeric,
    "date": _date,
    "timestamptz": _timestamptz,
    "interval": _interval,
    "jsonb": _jsonb,
}
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def strategy_for_type(t: PgType) -> Strategy:
    """Look up the value strategy for PostgreSQL type `t`.

    Scalar types are resolved by name in `_TYPE_STRATEGIES`. For an
    array type, the strategy for the element type is resolved
    recursively and wrapped with `_array`.

    Raises:
        KeyError: if no strategy is registered for `t.name`, or for the
            innermost element type of an array.
    """
    if not t.is_array():
        return _TYPE_STRATEGIES[t.name]
    element = t.element
    assert element is not None  # guaranteed by is_array()
    element_strategy = strategy_for_type(element)
    return _array(element_strategy, element)
|