waxsql 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,236 @@
1
+ """Per-table row materialization and FK resolution.
2
+
3
+ The walk is one topological pass over the FK DAG. For each table, we
4
+ materialize all rows in memory as Python tuples, capture the PK column
5
+ values into a per-table ID store, and hand the tuples to the emitter.
6
+ Children resolve FK columns by sampling from the parent's ID list,
7
+ which is guaranteed populated by the topological order.
8
+
9
+ Self-referential FKs (a column in table T referencing T.id) are handled
10
+ as a special case: row with pk=N can reference any row in [1..N] including
11
+ itself, because PostgreSQL checks FK constraints per-row by default (NOT
12
+ DEFERRABLE). Nullable self-FK columns honour null_fraction (the null roll
13
+ fires above, before the FK branch, and controls whether the column is NULL);
14
+ NOT NULL self-FK columns always sample from [1..pk].
15
+
16
+ In-memory materialization is fine at demo scales (default 100 rows ×
17
+ fanout 5 across a few levels of depth is comfortably under a megabyte).
18
+ If volume ever becomes a real constraint, a streaming variant can be
19
+ added behind the same public API.
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import random
24
+ from collections.abc import Mapping
25
+
26
+ from waxsql.gen.data.columns import strategy_for
27
+ from waxsql.schema import Schema, Table
28
+
29
+
30
def topological_order(schema: Schema) -> list[Table]:
    """Return the schema's tables ordered so that parents precede children.

    A table is emitted only once every *other* table it references through
    a foreign key has already been emitted. Tables that become eligible in
    the same round are emitted in ascending name order, so the result never
    depends on set or dict iteration order.

    Self-referential foreign keys are ignored for ordering: a table that
    points at itself places no constraint on where it sits relative to the
    other tables. A reference to a name that is not a table of `schema`
    never blocks a table either, because only still-unplaced schema tables
    count as outstanding parents.

    Raises:
        ValueError: if a round finds no eligible table while some tables
            are still unplaced, i.e. the leftovers form a foreign-key cycle.
    """
    tables_by_name = {}
    outstanding_parents: dict[str, set[str]] = {}
    for tbl in schema.tables:
        tables_by_name[tbl.name] = tbl
        # Only references to *other* tables impose an ordering constraint.
        outstanding_parents[tbl.name] = {
            fk.ref_table for fk in tbl.foreign_keys if fk.ref_table != tbl.name
        }

    unplaced = set(tables_by_name)
    ordered: list[Table] = []
    while unplaced:
        # One "wave" = every table whose parents have all been placed.
        wave = [
            name for name in unplaced
            if unplaced.isdisjoint(outstanding_parents[name])
        ]
        if not wave:
            # Nothing can make progress, yet tables remain: a cycle.
            raise ValueError(
                f"FK cycle in schema: remaining {sorted(unplaced)!r}"
            )
        # Sorting is what makes the walk deterministic: `unplaced` is a set
        # of strings, whose iteration order is not stable across runs.
        wave.sort()
        ordered.extend(tables_by_name[name] for name in wave)
        unplaced.difference_update(wave)
    return ordered
80
+
81
+
82
def depth_of(table: Table, schema: Schema) -> int:
    """Return the length of the longest FK chain leading down to `table`.

    A table with no foreign keys to other tables has depth 0. Otherwise
    the depth is one more than the deepest table it references, so with
    several parents the longest chain wins, not the shortest.

    Self-referential foreign keys are skipped; following them would
    recurse forever and they say nothing about where the table sits in
    the hierarchy.

    The cache of already-computed depths lives only for the duration of
    one call. It avoids re-walking shared ancestors inside that call but
    is not reused by later calls.

    Raises:
        ValueError: if the walk reaches a table that is already on the
            current recursion path (a foreign-key cycle).
        KeyError: if a foreign key names a table that is not in `schema`.
    """
    tables = {t.name: t for t in schema.tables}
    cache: dict[str, int] = {}

    def longest_chain(current: str, path: tuple[str, ...] = ()) -> int:
        # `path` holds the names between the starting table and `current`;
        # meeting `current` again on it means we have walked a cycle.
        if current in path:
            raise ValueError(f"FK cycle through {current}: {path}")
        if current in cache:
            return cache[current]
        node = tables[current]
        extended_path = path + (current,)
        depth = 0
        for fk in node.foreign_keys:
            if fk.ref_table == current:
                # Self-reference: carries no depth information.
                continue
            depth = max(depth, 1 + longest_chain(fk.ref_table, extended_path))
        cache[current] = depth
        return depth

    return longest_chain(table.name)
131
+
132
+
133
# PK column name convention is fixed by the schema generator: every table
# has an `id BIGINT NOT NULL` PK at the first column position. The row
# generator in this module recognises the PK purely by comparing each
# column's name against this constant; it does not look at position or at
# constraint metadata. If that invariant ever changes, this module will
# produce nonsense for that table and tests will fail loudly, which is the
# desired behavior.
_PK_COLUMN_NAME = "id"
139
+
140
+
141
def rows_for_table(table: Table, schema: Schema, *, base: int, fanout: int) -> int:
    """Return how many rows to generate for `table`.

    The count is `base * fanout ** depth`, where `depth` is the table's
    longest-FK-chain depth as computed by `depth_of`. Root tables have
    depth 0 and therefore get exactly `base` rows whatever the fanout.
    No upper bound is applied, so large `fanout`/depth combinations grow
    geometrically (base=100, fanout=5, depth=3 gives 12500 rows).

    Args:
        table: the table to size.
        schema: the schema `table` belongs to; needed to compute its depth.
        base: row count for depth-0 tables.
        fanout: multiplier applied once per level of depth.
    """
    depth = depth_of(table, schema)
    growth = fanout ** depth
    return base * growth
154
+
155
+
156
def generate_row(
    *,
    table: Table,
    pk: int,
    rng: random.Random,
    id_store: Mapping[str, list[int]],
    null_fraction: float,
) -> tuple[object, ...]:
    """Build one row of `table` as a tuple, one value per declared column.

    Columns are visited in `table.columns` order and each one is resolved
    by the first rule that applies:

    1. The column named `_PK_COLUMN_NAME` receives `pk` unchanged. It never
       rolls for NULL and never touches the RNG.
    2. A nullable column draws `rng.random()`; if the draw is below
       `null_fraction` the value is None. Non-nullable columns skip the
       draw entirely.
    3. A foreign-key column that references `table` itself gets
       `rng.randint(1, pk)`, i.e. any PK up to and including this row's
       own, without consulting `id_store`.
    4. Any other foreign-key column gets `rng.choice(id_store[ref_table])`.
       If that list is empty the value is None for a nullable column.
    5. Everything else is produced by `strategy_for(col)(rng, col)`.

    Because every draw advances the shared RNG, the column order and the
    order of the rules above are part of the determinism contract.

    Args:
        table: table whose columns and foreign keys drive the row.
        pk: primary-key value for this row, supplied by the caller.
        rng: the only source of randomness used.
        id_store: maps a table name to the PK values already generated
            for that table.
        null_fraction: probability threshold for a nullable column to
            become NULL.

    Raises:
        ValueError: if a NOT NULL foreign-key column references a parent
            whose ID list is empty, or if a foreign key's `columns` and
            `ref_columns` differ in length.
    """
    # Child column name -> referenced table. Rebuilt for every row; only
    # used for lookups, so its internal order is irrelevant.
    ref_table_of: dict[str, str] = {
        child_col: fk.ref_table
        for fk in table.foreign_keys
        for child_col, _parent_col in zip(fk.columns, fk.ref_columns, strict=True)
    }

    row: list[object] = []
    for column in table.columns:
        if column.name == _PK_COLUMN_NAME:
            # Caller-supplied key: no NULL roll, no strategy, no RNG use.
            row.append(pk)
        elif column.nullable and rng.random() < null_fraction:
            # The roll is taken for every nullable column, FK or not, and
            # only for nullable columns (short-circuit on `nullable`).
            row.append(None)
        elif column.name not in ref_table_of:
            # Plain data column: delegate to its type strategy.
            make_value = strategy_for(column)
            row.append(make_value(rng, column))
        else:
            target = ref_table_of[column.name]
            if target == table.name:
                # Self-reference: this table's IDs are not read from
                # id_store; sample from the keys issued so far, this
                # row's own key included.
                row.append(rng.randint(1, pk))
                continue
            candidates = id_store[target]
            if candidates:
                row.append(rng.choice(candidates))
            elif column.nullable:
                # Empty parent: NULL is the only value that can satisfy
                # the constraint.
                row.append(None)
            else:
                raise ValueError(
                    f"NOT NULL FK {table.name}.{column.name} references "
                    f"empty parent {target}"
                )
    return tuple(row)
@@ -0,0 +1,299 @@
1
+ """Per-PgType value strategies. Each strategy maps (rng, Column) → object;
2
+ the emitter is responsible for formatting that object for COPY.
3
+
4
+ The split between strategies and the emitter is deliberate: strategies
5
+ return native Python values (Decimal, datetime, UUID, dict, list) and
6
+ have no knowledge of tab encoding or NULL sentinels. That keeps the
7
+ strategy registry trivially testable and lets the emitter own all of
8
+ PostgreSQL's COPY text-format escape rules in one place.
9
+
10
+ Determinism invariants (load-bearing across this whole module):
11
+ * Only the injected `rng` is allowed as a source of randomness.
12
+ No `uuid.uuid4()`, no `random.gauss()` from the global module,
13
+ no `datetime.now()`, no environment lookups.
14
+ * `_EPOCH` is a fixed date constant — wall-clock input would mean
15
+ the same seed produces different data tomorrow, breaking the
16
+ "reproduce a bug from a seed years later" guarantee.
17
+ * Dict iteration order is stable in Python 3.7+, so `_TYPE_STRATEGIES`
18
+ is fine; but anything that iterates a `set` for rng decisions must
19
+ `sorted()` first (none currently do).
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import datetime as _dt
24
+ import random
25
+ import uuid
26
+ from dataclasses import dataclass
27
+ from decimal import Decimal
28
+ from collections.abc import Callable
29
+
30
+ from waxsql.schema import Column
31
+ from waxsql.types import PgType
32
+
33
+
34
# Hand-curated short list of simple English words. Used as the default
# text/varchar value source: each cell gets one of these. Not a name
# dictionary, not Lorem Ipsum — just enough variety that SELECT * FROM t
# doesn't look like line noise. Grow as taste dictates.
#
# Entry ORDER is significant: words are picked positionally via
# `rng.choice`, so reordering, inserting, or removing entries changes which
# word a given RNG state produces. The list is only roughly alphabetical
# (e.g. "quartz" precedes "quail"); do not "fix" the ordering casually.
WORDLIST: tuple[str, ...] = (
    "alpha", "amber", "anchor", "apple", "arrow", "atlas", "azure",
    "badger", "basin", "beacon", "berry", "birch", "blossom", "boulder",
    "breeze", "bridge", "bronze", "buffalo", "cabin", "candle", "canyon",
    "cedar", "cherry", "cinder", "clover", "cobalt", "comet", "copper",
    "coral", "cottage", "crater", "crescent", "crimson", "crystal",
    "dahlia", "dawn", "delta", "diamond", "dolphin", "dove", "dusk",
    "ember", "emerald", "falcon", "feather", "fern", "festival", "fjord",
    "forest", "fossil", "frost", "galaxy", "garnet", "gentian", "geyser",
    "glacier", "glade", "granite", "harbor", "harvest", "haven", "hazel",
    "heron", "hickory", "horizon", "indigo", "iris", "island", "ivory",
    "jasper", "juniper", "kestrel", "lagoon", "lantern", "lavender",
    "library", "linden", "lotus", "magnet", "maple", "marble", "marigold",
    "marsh", "meadow", "mercury", "midnight", "mineral", "mint", "mirage",
    "morning", "mosaic", "mountain", "nectar", "nimbus", "nocturne",
    "oasis", "obsidian", "ocean", "olive", "opal", "orchid", "otter",
    "panther", "parsley", "peach", "pebble", "pelican", "petal", "phlox",
    "pine", "plateau", "poppy", "prairie", "prism", "quartz", "quail",
    "quill", "rainbow", "raven", "redwood", "river", "robin", "rose",
    "ruby", "saffron", "sage", "salmon", "sapphire", "scarlet", "sequoia",
    "shadow", "shoreline", "silver", "slate", "solstice", "sparrow",
    "spruce", "starling", "stone", "summit", "sunset", "swallow", "tangerine",
    "thicket", "thistle", "thunder", "tide", "topaz", "tulip", "turquoise",
    "twilight", "umber", "valley", "velvet", "verdant", "violet", "walnut",
    "waterfall", "willow", "winter", "wisteria", "yarrow", "yew", "zenith",
    "zephyr", "zircon",
)
65
+
66
+
67
def pick_word(rng: random.Random) -> str:
    """Return one entry of `WORDLIST`, selected with `rng.choice`.

    The supplied `rng` is the only source of randomness, so a given RNG
    state always maps to the same word for a given `WORDLIST`. Exactly one
    `rng.choice` call is made per invocation.
    """
    chosen = rng.choice(WORDLIST)
    return chosen
73
+
74
+
75
+ # ---------------------------------------------------------------------------
76
+ # Per-type strategy functions
77
+ # ---------------------------------------------------------------------------
78
+
79
# A strategy is a pure function: (rng, column) -> Python value.
# Native types are returned; the emitter formats them for COPY.
# The `rng` argument is the strategy's only permitted source of randomness;
# the column argument lets a strategy read per-column details such as the
# type modifier.
Strategy = Callable[[random.Random, Column], object]
82
+
83
+
84
def _int4(rng: random.Random, col: Column) -> int:
    """Return a random integer in the symmetric range ±(2**31 - 1).

    PG int4 spans -2147483648..2147483647. The lower bound used here is
    one above the true minimum, deliberately skipping the minimum-int
    corner that some downstream string formatters mishandle. `col` is
    accepted for signature compatibility and not read.
    """
    limit = 2**31 - 1
    return rng.randint(-limit, limit)
89
+
90
+
91
def _int8(rng: random.Random, col: Column) -> int:
    """Return a random integer in [-(2**62), 2**62], both ends inclusive.

    PG int8 is a signed 64-bit type, but the range is deliberately kept
    narrower than the full 64 bits: extreme values add nothing to demo
    data and make the output harder to read. `col` is accepted for
    signature compatibility and not read.
    """
    bound = 2**62
    return rng.randint(-bound, bound)
98
+
99
+
100
def _text(rng: random.Random, col: Column) -> str:
    """Return a single word chosen by `pick_word`; `col` is not read."""
    word = pick_word(rng)
    return word
102
+
103
+
104
def _varchar(rng: random.Random, col: Column) -> str:
    """Return a word from `pick_word`, truncated to the column's length.

    When the column type carries a type modifier, its first element is
    used as the maximum length (varchar(N) rejects longer values). With
    no modifier the word is capped at 32 characters.

    The word is always drawn before the modifier is looked at, so the
    number of RNG calls does not depend on the column's declared length.
    """
    word = pick_word(rng)
    typmod = col.type.typmod
    if typmod:
        max_len = typmod[0]
    else:
        # Unbounded `varchar`: fall back to a fixed, readable cap.
        max_len = 32
    return word[:max_len]
115
+
116
+
117
def _bool(rng: random.Random, col: Column) -> bool:
    """Return True or False via a single `rng.choice`; `col` is not read."""
    outcomes = (True, False)
    return rng.choice(outcomes)
119
+
120
+
121
def _uuid(rng: random.Random, col: Column) -> uuid.UUID:
    """Return a UUID built from 128 bits drawn from `rng`.

    `uuid.uuid4()` is deliberately avoided because it does not draw from
    the injected RNG. The bits are used as-is, so the RFC 4122 version and
    variant fields are not forced to the version-4 pattern. `col` is not
    read.
    """
    bits = rng.getrandbits(128)
    return uuid.UUID(int=bits)
128
+
129
+
130
def _float8(rng: random.Random, col: Column) -> float:
    """Return a float drawn uniformly from [-1e6, 1e6].

    The range is bounded to keep output human-readable, and the
    distribution is flat rather than bell-shaped. `col` is not read.
    """
    span = 1_000_000.0
    return rng.uniform(-span, span)
134
+
135
+
136
def _numeric(rng: random.Random, col: Column) -> Decimal:
    """Return a random Decimal that fits the column's numeric(precision, scale).

    The type modifier is interpreted as follows:

    * two or more elements: `(precision, scale)`;
    * one element: `(precision,)` with scale 0, mirroring `numeric(P)`;
    * absent/empty: the arbitrary default `(10, 4)`.

    A raw integer is drawn with a single `rng.randint` call and then
    divided by `10 ** scale`.

    * Common case (scale >= 0 and at least one integer digit): the raw
      integer lies in ±(10**(precision - scale) - 1) * 10**scale. This is
      the original range, so existing seeds keep producing the same values.
    * Degenerate case (scale >= precision, or negative scale): the raw
      integer lies in ±(10**precision - 1), i.e. any coefficient with at
      most `precision` digits. Previously scale == precision could only
      ever yield 0, and scale > precision or a negative scale made
      `10 ** n` a float and broke `rng.randint`.

    The division is carried out under the active decimal context, so the
    result is exact as long as `precision` does not exceed the context
    precision (28 digits by default).

    Precision is assumed to be a non-negative integer.
    """
    typmod = col.type.typmod
    if typmod and len(typmod) >= 2:
        precision, scale = typmod[0], typmod[1]
    elif typmod and len(typmod) == 1:
        # PG allows `numeric(P)` (scale implicit 0). Mirror that.
        precision, scale = typmod[0], 0
    else:
        precision, scale = 10, 4

    integer_digits = precision - scale
    if scale >= 0 and integer_digits >= 1:
        # Unchanged from the original formula so the RNG stream (and thus
        # every previously generated value) stays identical.
        limit = (10**integer_digits - 1) * 10**scale
    else:
        # No integer digits, or a negative scale: every representable value
        # is a coefficient of at most `precision` digits times 10**-scale.
        limit = 10**precision - 1

    # Integer arithmetic first, Decimal division last: no float ever enters
    # the computation. Decimal(10) ** scale also handles a negative scale.
    raw = rng.randint(-limit, limit)
    return Decimal(raw) / (Decimal(10) ** scale)
156
+
157
+
158
# Fixed reference epoch — NOT `datetime.now()`. Determinism contract:
# same seed must produce same output years from now, which means no
# wall-clock input anywhere in the generator.
_EPOCH = _dt.date(2025, 1, 1)
# Half-width, in days, of the window around `_EPOCH` that date-like
# strategies sample from. 5 * 365 ignores leap days, so the window is
# slightly under five calendar years on each side.
_WINDOW_DAYS = 5 * 365  # ±5 years
163
+
164
+
165
def _date(rng: random.Random, col: Column) -> _dt.date:
    """Return a date within `_WINDOW_DAYS` days either side of `_EPOCH`.

    One `rng.randint` call picks the signed day offset; `col` is not read.
    """
    offset = _dt.timedelta(days=rng.randint(-_WINDOW_DAYS, _WINDOW_DAYS))
    return _EPOCH + offset
168
+
169
+
170
def _timestamptz(rng: random.Random, col: Column) -> _dt.datetime:
    """Return a UTC-aware datetime near `_EPOCH`.

    Three RNG draws are made, always in this order: a signed day offset
    within ±`_WINDOW_DAYS`, a second-of-day in [0, 86399], and a
    microsecond in [0, 999999]. The order is part of the determinism
    contract, since reordering would shift every later value. The offsets
    are added to midnight UTC on `_EPOCH`. `col` is not read.
    """
    day_offset = rng.randint(-_WINDOW_DAYS, _WINDOW_DAYS)
    second_of_day = rng.randint(0, 86_399)
    microsecond = rng.randint(0, 999_999)

    # Always UTC: keeps the generated value independent of any local or
    # client time-zone setting.
    midnight_utc = _dt.datetime(
        year=_EPOCH.year,
        month=_EPOCH.month,
        day=_EPOCH.day,
        tzinfo=_dt.timezone.utc,
    )
    shift = _dt.timedelta(
        days=day_offset, seconds=second_of_day, microseconds=microsecond,
    )
    return midnight_utc + shift
183
+
184
+
185
def _interval(rng: random.Random, col: Column) -> _dt.timedelta:
    """Return a non-negative timedelta of at most about four months.

    Two RNG draws, in order: whole days in [0, 120], then seconds in
    [0, 86399]. A `timedelta` cannot express the year/month components a
    PG interval can, so only day and second parts are produced. `col` is
    not read.
    """
    whole_days = rng.randint(0, 120)
    leftover_seconds = rng.randint(0, 86_399)
    return _dt.timedelta(days=whole_days, seconds=leftover_seconds)
192
+
193
+
194
def _jsonb(rng: random.Random, col: Column) -> dict:
    """Return a flat dict with up to four word keys and scalar values.

    The number of iterations is drawn first (1-4). Each iteration draws a
    key with `pick_word`, then a value kind in [0, 4], then the value:

    * 0: another word
    * 1: an integer in [-1000, 1000]
    * 2: a float in [-1000, 1000] rounded to 3 decimals
    * 3: a boolean
    * 4: None (JSON null)

    Kinds 0-3 make exactly one further RNG call; kind 4 makes none. RNG
    consumption per iteration is therefore key + kind + (0 or 1), fully
    determined by the RNG state.

    Values are never nested. If two iterations draw the same key, the
    later value replaces the earlier one, so the result may hold fewer
    entries than iterations. `col` is not read.
    """
    # Indexed by the drawn kind; each entry produces the value lazily so
    # that only the selected producer touches the RNG.
    producers = (
        lambda: pick_word(rng),
        lambda: rng.randint(-1000, 1000),
        lambda: round(rng.uniform(-1000.0, 1000.0), 3),
        lambda: rng.choice((True, False)),
        lambda: None,
    )

    iterations_left = rng.randint(1, 4)
    payload: dict[str, object] = {}
    while iterations_left > 0:
        # Draw order matters: key first, then kind, then the value.
        key = pick_word(rng)
        kind = rng.randint(0, 4)
        payload[key] = producers[kind]()
        iterations_left -= 1
    return payload
230
+
231
+
232
@dataclass(frozen=True)
class _ColumnAdapter:
    """Lightweight Column stand-in used when array strategies recurse on
    the element type. Avoids depending on the full Column constructor
    surface in case it grows constraints we don't care about here.

    It carries only the three attributes listed below and is immutable
    (frozen dataclass).
    """
    # Name of the outer array column; the array strategy copies it through.
    name: str
    # Element type of the array, i.e. the type the element strategy serves.
    type: PgType
    # The array strategy always passes False here: elements are never NULL.
    nullable: bool
241
+
242
+
243
def _array(element_strategy: Strategy, element_type: PgType) -> Strategy:
    """Build a strategy that returns a list of 0-5 generated elements.

    The returned function closes over `element_strategy` and
    `element_type`, so the element strategy is resolved once, when this
    factory is called, and is not looked up again on each invocation of
    the returned strategy.

    Each invocation draws the list length with `rng.randint(0, 5)` and
    then calls `element_strategy` that many times, handing it a
    `_ColumnAdapter` that carries the outer column's name, the element
    type, and `nullable=False`.
    """
    def gen(rng: random.Random, col: Column) -> list:
        length = rng.randint(0, 5)
        # Elements are never NULL: NULL injection is decided per column
        # by the row generator, not per element. PG arrays can hold NULL
        # elements, but none are generated here.
        elem_col = _ColumnAdapter(name=col.name, type=element_type, nullable=False)
        elements = []
        for _ in range(length):
            elements.append(element_strategy(rng, elem_col))  # type: ignore[arg-type]
        return elements
    return gen
264
+
265
+
266
# Type-name → strategy lookup. Keyed on `PgType.name` (matches pg_type.typname).
# Adding a new scalar PgType requires adding an entry here AND making sure
# the emit module's `encode_value` knows how to format the strategy's
# return type. The two registries together define what the data generator
# can produce.
#
# Only scalar types appear here. Array types have no entry of their own:
# `strategy_for_type` resolves them by looking up the element type in this
# table and wrapping the result.
_TYPE_STRATEGIES: dict[str, Strategy] = {
    "int4": _int4,
    "int8": _int8,
    "text": _text,
    "varchar": _varchar,
    "bool": _bool,
    "uuid": _uuid,
    "float8": _float8,
    "numeric": _numeric,
    "date": _date,
    "timestamptz": _timestamptz,
    "interval": _interval,
    "jsonb": _jsonb,
}
285
+
286
+
287
def strategy_for_type(t: PgType) -> Strategy:
    """Return the value strategy for type `t`.

    Scalar types are looked up by `t.name` in `_TYPE_STRATEGIES`. Array
    types are resolved recursively: the element type's strategy is
    obtained first and then wrapped with `_array`, so nested arrays unwind
    until a scalar element type is reached. A new wrapper is built on
    every call for an array type.

    Raises:
        KeyError: if the scalar type reached (either `t` itself or the
            innermost element type) has no entry in `_TYPE_STRATEGIES`.
    """
    if not t.is_array():
        return _TYPE_STRATEGIES[t.name]
    element = t.element
    assert element is not None  # guaranteed by is_array()
    return _array(strategy_for_type(element), element)