waxsql 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
waxsql/cli.py ADDED
@@ -0,0 +1,888 @@
1
+ """Command-line interface to waxsql.
2
+
3
+ Three subcommands mirror the library's halves:
4
+ - gen: produce a random schema + query at a chosen complexity / seed
5
+ (optionally with COPY blocks for table data)
6
+ - data: produce only COPY blocks for a deterministic schema
7
+ - validate: run SQL through the SYNTAX, PARSE, or PLAN tier
8
+
9
+ This module is loaded by the `waxsql` console_scripts entry point in
10
+ pyproject.toml. The CLI's runtime dependency on `click` is declared via
11
+ the optional [cli] extra; a friendly install hint is printed before any
12
+ click code runs if the dep is missing, so a user who installed plain
13
+ `waxsql` and ran the command gets a clear message instead of a raw
14
+ ImportError traceback.
15
+
16
+ Role: this is the user-facing pipe surface. Everything here is glue —
17
+ flag parsing, stream routing, header round-tripping, error formatting.
18
+ No generator logic lives here; if you find yourself reaching for one,
19
+ the right answer is almost always "import it from `waxsql.*`".
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import sys
24
+ from dataclasses import dataclass
25
+
26
+ # ----- Friendly missing-dep check ---------------------------------------
27
+ # Must run before any @click decorator is evaluated. If [cli] isn't
28
+ # installed, exit 3 with an install hint rather than letting the
29
+ # ModuleNotFoundError propagate up through the console-script wrapper.
30
+ try:
31
+ import click
32
+ except ImportError:
33
+ sys.stderr.write(
34
+ "waxsql: the CLI requires the 'click' package, which ships with the\n"
35
+ "[cli] optional extra. Install with:\n\n"
36
+ " pip install 'waxsql[cli]'\n\n"
37
+ )
38
+ sys.exit(3)
39
+
40
+ import contextlib
41
+ import random as _random
42
+ import re
43
+ from typing import Any, Optional
44
+
45
+ from waxsql import __version__, generate_query, generate_schema, print_query
46
+
47
+ # ----- Header format ----------------------------------------------------
48
+ # The `gen` subcommand prefixes its output with a header describing the
49
+ # seed and complexity used; the `validate` subcommand's --auto-schema
50
+ # feature parses that header back to regenerate the matching schema.
51
+ #
52
+ # Base format (stable across CLI versions):
53
+ #
54
+ # -- waxsql <version> seed=<int> complexity=<int>
55
+ #
56
+ # When data generation is requested, additional keys follow:
57
+ #
58
+ # -- waxsql <version> seed=<int> complexity=<int> with-data=true rows=<int> fanout=<int> null-fraction=<float>
59
+ #
60
+ # The regex matches the prefix loosely and we extract key=value pairs
61
+ # post-hoc, which lets unknown future keys be ignored gracefully rather
62
+ # than causing a parse failure. The required keys (seed, complexity) are
63
+ # still enforced after extraction.
64
+
65
+
66
@dataclass(frozen=True)
class Header:
    """Parsed contents of a `-- waxsql ... seed=N complexity=K ...` line.

    Older headers had only seed and complexity; newer headers add keys for
    data generation. Missing keys take sensible defaults so old gen output
    continues to parse cleanly.

    The defaults here (with_data=False, rows=100, fanout=5, null_fraction=0.05)
    mirror the defaults of the `gen` subcommand flags and of `_render_header`,
    so a header without data keys replays with the same values `gen` would
    have used. (They are presumably also the defaults of `generate_data`;
    that function is not defined in this module — verify there.)

    Instances are immutable (`frozen=True`).
    """

    # Required keys: a header line lacking either one is not parsed at all.
    seed: int
    complexity: int
    # Optional data-generation keys; emitted by `gen` only when --with-data
    # is in effect, otherwise the defaults below apply.
    with_data: bool = False
    rows: int = 100
    fanout: int = 5
    null_fraction: float = 0.05
86
+
87
+
88
+ # Matches "-- waxsql X.Y.Z" followed by anything — we parse key=value pairs
89
+ # from the captured rest rather than encoding the full format in the regex.
90
+ # This is deliberately loose: the strictness lives in the required-key check
91
+ # inside _parse_header, not in the regex. Future unknown keys are simply
92
+ # ignored, so older validators reading newer gen output won't reject it.
93
+ _HEADER_RE = re.compile(r"^--\s+waxsql\s+\S+(?P<rest>.*)$")
94
+ # Key names are lowercase letters and hyphens only (e.g. `null-fraction`).
95
+ # Values are non-whitespace tokens — no spaces allowed in values today;
96
+ # the format predates any value that would need it.
97
+ _KV_RE = re.compile(r"(?P<k>[a-z-]+)=(?P<v>\S+)")
98
+
99
+
100
def _render_header(
    seed: int,
    complexity: int,
    *,
    with_data: bool = False,
    rows: int = 100,
    fanout: int = 5,
    null_fraction: float = 0.05,
) -> str:
    """Build the single-line header that `gen` puts at the top of its output.

    The line always starts with `-- waxsql <version> seed=<seed>
    complexity=<complexity>`. The data-generation keys (`with-data`, `rows`,
    `fanout`, `null-fraction`) are appended only when `with_data` is true, so
    output produced without data keeps the short historical form.

    Returns the header text without a trailing newline. `_parse_header` is
    the inverse operation.
    """
    base = f"-- waxsql {__version__} seed={seed} complexity={complexity}"
    if not with_data:
        return base
    # Data keys follow the base keys, space-separated, in a fixed order.
    return (
        f"{base} with-data=true rows={rows} fanout={fanout} "
        f"null-fraction={null_fraction}"
    )
124
+
125
+
126
def _parse_header(text: str) -> Optional[Header]:
    """Read a waxsql header off the first non-blank line of `text`.

    Only the first line that contains something other than whitespace is
    examined; blank lines before it are skipped. The result is None when:

    - `text` has no non-blank line at all,
    - that line does not match `_HEADER_RE`,
    - `seed` or `complexity` is missing from its key=value pairs, or
    - any numeric value fails to convert (e.g. a hand-edited `seed=abc`).

    Otherwise a `Header` is returned, with the dataclass-equivalent defaults
    substituted for any optional key that is absent. Unknown keys are
    ignored. The function never raises on malformed header content.
    """
    # Trailing whitespace (including a stray "\r") is stripped from the
    # candidate line so it cannot leak into the last value and break int().
    candidate = next(
        (raw.rstrip() for raw in text.splitlines() if raw.strip()),
        None,
    )
    if candidate is None:
        return None

    matched = _HEADER_RE.match(candidate)
    if matched is None:
        return None

    # findall yields (key, value) tuples; dict() keeps the last occurrence
    # of a repeated key.
    fields = dict(_KV_RE.findall(matched.group("rest")))
    if not {"seed", "complexity"} <= fields.keys():
        return None

    try:
        return Header(
            seed=int(fields["seed"]),
            complexity=int(fields["complexity"]),
            with_data=fields.get("with-data", "false").lower() == "true",
            rows=int(fields.get("rows", 100)),
            fanout=int(fields.get("fanout", 5)),
            null_fraction=float(fields.get("null-fraction", 0.05)),
        )
    except ValueError:
        # A value that looked like a token but is not a number: treat the
        # whole line as "no usable header" rather than crash the CLI.
        return None
172
+
173
+
174
+ # ----- DSN redaction ----------------------------------------------------
175
+ # Connection-failure error messages echo the DSN to stderr for diagnostics.
176
+ # That stream may be captured by logs or piped through less-trusted
177
+ # tools, so we mask any password material before printing. The user
178
+ # supplied the secret themselves, but echoing it verbatim is a small,
179
+ # avoidable exposure — and matches the discipline psycopg's own
180
+ # `conninfo` redaction follows.
181
+ #
182
+ # Two patterns handle the two DSN forms psycopg accepts:
183
+ # - key-value: `dbname=x password=secret host=h`
184
+ # - URI: `postgresql://user:secret@host/db`
185
+ #
186
+ # Conservative: doesn't try to handle quoted values (`password='se cret'`)
187
+ # because no reasonable user/CI tool produces those; the fallthrough
188
+ # leaves a quote-wrapped password partially exposed, which is still
189
+ # strictly better than the current "echo verbatim" baseline.
190
+ _DSN_PASSWORD_KV_RE = re.compile(r"(password=)\S+")
191
+ _DSN_PASSWORD_URI_RE = re.compile(r"(://[^:/@]*:)[^@]+(@)")
192
+
193
+
194
+ def _redact_dsn(dsn: str) -> str:
195
+ """Replace password fields in a DSN with `***` for safe logging.
196
+
197
+ Used on the connect-failure error message that goes to stderr.
198
+ Other DSN components (host, dbname, user, options, application_name)
199
+ are preserved intact — they're useful for diagnostics and don't
200
+ constitute credentials.
201
+ """
202
+ dsn = _DSN_PASSWORD_KV_RE.sub(r"\1***", dsn)
203
+ dsn = _DSN_PASSWORD_URI_RE.sub(r"\1***\2", dsn)
204
+ return dsn
205
+
206
+
207
+ def _resolve_schema_source(
208
+ *,
209
+ input_text: str,
210
+ schema_from: Optional[str],
211
+ schema_seed: Optional[int],
212
+ schema_complexity: int,
213
+ auto_schema: bool,
214
+ ) -> Optional[str]:
215
+ """Decide which DDL to install for parse/plan validation.
216
+
217
+ Returns the DDL string (a CREATE TABLE / ALTER TABLE / CREATE INDEX
218
+ script) or None if no schema source was supplied. Precedence:
219
+
220
+ 1. --schema-from PATH (most explicit — caller provided a file)
221
+ 2. --schema-seed N (explicit regenerate, uses --schema-complexity)
222
+ 3. --auto-schema parses the input's header and regenerates from it
223
+ (convenience: gen | validate pipelines just work, no extra flags)
224
+ 4. None of the above → return None (caller decides if that's fatal)
225
+
226
+ Pure beyond reading --schema-from PATH. Raises FileNotFoundError if
227
+ the path doesn't exist (let it propagate; click will format it as a
228
+ usage error).
229
+
230
+ All parameters are keyword-only — with six arguments and an order-sensitive
231
+ precedence chain, positional calls would be error-prone and unreadable.
232
+ """
233
+ if schema_from is not None:
234
+ with open(schema_from, encoding="utf-8") as f:
235
+ return f.read()
236
+
237
+ if schema_seed is not None:
238
+ return generate_schema(
239
+ seed=schema_seed, complexity=schema_complexity
240
+ ).emit_ddl()
241
+
242
+ if auto_schema:
243
+ parsed = _parse_header(input_text)
244
+ if parsed is not None:
245
+ return generate_schema(
246
+ seed=parsed.seed, complexity=parsed.complexity
247
+ ).emit_ddl()
248
+
249
+ return None
250
+
251
+
252
+ def _split_copy_blocks(data_sql: str) -> list[str]:
253
+ """Split a `generate_data` output string into individual COPY blocks.
254
+
255
+ Each block is `COPY ... FROM STDIN;\\n<rows>\\n\\.\\n`. They're
256
+ separated by blank lines in the combined stream. Blocks are collected
257
+ by accumulating lines until the `\\.` terminator is found, so partial
258
+ or empty COPY blocks (rows=0) are handled correctly.
259
+
260
+ Leading blank lines between blocks are skipped so every returned block
261
+ starts with its COPY header line. This matters because `_execute_copy_block`
262
+ treats `lines[0]` as the COPY statement to pass to `cur.copy()`.
263
+
264
+ We split on the `\\.` terminator rather than on blank-line boundaries
265
+ because COPY data rows can in principle contain blank-looking content
266
+ (a row with all empty strings would render as a single tab character).
267
+ Anchoring on `\\.` is grammatically correct for PG text-format COPY.
268
+ """
269
+ blocks: list[str] = []
270
+ current: list[str] = []
271
+ for line in data_sql.splitlines(keepends=True):
272
+ # Skip blank lines between blocks (don't accumulate into current).
273
+ # Once `current` has any content, blank lines are treated as
274
+ # row separators within the block (rare but possible) and kept.
275
+ if not current and not line.strip():
276
+ continue
277
+ current.append(line)
278
+ if line.strip() == r"\.":
279
+ blocks.append("".join(current))
280
+ current = []
281
+ # A non-empty `current` here means the stream ended mid-block: the
282
+ # final COPY block never hit its `\.` terminator. generate_data always
283
+ # terminates every block it emits, so this only fires on truncated or
284
+ # hand-edited input — raise rather than silently returning fewer blocks
285
+ # than the caller will go on to load (which would look like "some rows
286
+ # just didn't show up" with no error).
287
+ if current:
288
+ raise ValueError("unterminated COPY block: missing '\\.' terminator")
289
+ return blocks
290
+
291
+
292
+ def _execute_copy_block(cur: Any, block: str) -> None:
293
+ """Execute one COPY block via psycopg's `copy()` context manager.
294
+
295
+ The block's first line is `COPY "t" (...) FROM STDIN;`; the body is
296
+ tab-encoded rows; the terminator is `\\.`. We strip the terminator
297
+ before feeding because psycopg's `copy()` provides its own end-of-data
298
+ marker — sending `\\.` as a data row would corrupt the stream.
299
+
300
+ The `cur` parameter is untyped because psycopg is an optional dep and
301
+ a forward reference to `psycopg.Cursor` would require a TYPE_CHECKING
302
+ guard; `Any` is the same pattern used for `check_fn` above.
303
+
304
+ Side effect: data rows are inserted into the table named by the COPY
305
+ header. The caller must hold a transaction (or a savepoint) around
306
+ this call; the validate command wraps each block in a SAVEPOINT for
307
+ granular error reporting.
308
+ """
309
+ lines = block.splitlines()
310
+ # Strip trailing `;` — psycopg's `cur.copy(stmt)` doesn't want the
311
+ # statement terminator (it's not a normal SQL execute).
312
+ header = lines[0].rstrip(";")
313
+ # Drop the trailing `\.` terminator; psycopg handles end-of-stream.
314
+ # Using a filter here means stray `\.` lines anywhere in the block
315
+ # would be dropped, but `_split_copy_blocks` guarantees there's
316
+ # exactly one terminator at the end.
317
+ body_lines = [line for line in lines[1:] if line.strip() != r"\."]
318
+ with cur.copy(header) as copy:
319
+ for line in body_lines:
320
+ copy.write(line + "\n")
321
+
322
+
323
+ def _extract_first_select(text: str) -> str:
324
+ """Pull the first SELECT or WITH statement out of `text`.
325
+
326
+ The gen subcommand emits its output as multiple `;`-terminated
327
+ statements (CREATE TABLE ...; ALTER TABLE ...; SELECT ...;); the
328
+ parse/plan validators take a single statement at a time. This
329
+ helper finds the first SELECT/WITH and returns it through its
330
+ trailing `;` (exclusive). If no SELECT/WITH is found, returns the
331
+ whole text unchanged so a hand-written single-statement input
332
+ still works without special-casing.
333
+
334
+ The trailing semicolon is stripped because PREPARE and EXPLAIN
335
+ both reject a statement-terminating `;` — the validator supplies
336
+ its own statement boundary.
337
+ """
338
+ lines = text.splitlines()
339
+ select_start = None
340
+ # Use a word-boundary match so that identifiers starting with 'SELECT'
341
+ # or 'WITH' (e.g. 'SELECTED', 'WITHOUT') don't accidentally trigger
342
+ # extraction. re.match is anchored at the start of the stripped line.
343
+ _select_re = re.compile(r"(?i)(SELECT|WITH)\b")
344
+ for i, line in enumerate(lines):
345
+ stripped = line.lstrip()
346
+ if _select_re.match(stripped):
347
+ select_start = i
348
+ break
349
+ if select_start is None:
350
+ return text # no SELECT/WITH found; pass through
351
+
352
+ body = "\n".join(lines[select_start:])
353
+ # Strip trailing semicolon — PREPARE/EXPLAIN don't want one.
354
+ body = body.rstrip().rstrip(";").rstrip()
355
+ return body
356
+
357
+
358
# Root click command group. It has no behavior of its own: the docstring is
# what click shows as the top-level help text, `--version` is wired up by the
# decorator, and the subcommands attach themselves with `@main.command(...)`.
@click.group()
@click.version_option(version=__version__, prog_name="waxsql")
def main() -> None:
    """Random PostgreSQL query generator."""
362
+
363
+
364
+ # Subcommands (gen, validate) are registered further down via @main.command(...).
365
+
366
+
367
+ def _pick_random_seed() -> int:
368
+ """Pick a fresh seed for `waxsql gen` with no `--seed` flag.
369
+
370
+ Uses SystemRandom (backed by os.urandom) rather than the global random
371
+ module to avoid crossing streams with any downstream RNG users. The range
372
+ matches what `random.Random(seed)` accepts comfortably (non-negative 63-bit
373
+ int), and it's large enough that collision across independent invocations
374
+ is astronomically unlikely.
375
+
376
+ This is the ONLY place in the codebase that intentionally consumes
377
+ OS entropy. Everywhere else in waxsql is strictly deterministic; this
378
+ function exists precisely to bootstrap a fresh deterministic seed
379
+ when the user didn't supply one. The seed is then echoed in the
380
+ output header so the run remains reproducible.
381
+ """
382
+ return _random.SystemRandom().randint(0, 2**63 - 1)
383
+
384
+
385
def _apply_pprint(sql: str, *, color: bool) -> str:
    """Return `sql` reformatted by `waxsql.pretty.prettify_sql`.

    Used by `gen --pprint`. If `prettify_sql` raises RuntimeError (raised,
    per the original note on this helper, when the pretty module's optional
    dependency is unavailable), the error text is printed to stderr and the
    process exits with status 3 — the same status as the missing-click guard
    at the top of this module — instead of surfacing a traceback.
    """
    # Imported lazily so the pretty module is only loaded when requested.
    from waxsql.pretty import prettify_sql

    try:
        formatted = prettify_sql(sql, color=color)
    except RuntimeError as exc:
        click.echo(str(exc), err=True)
        sys.exit(3)
    return formatted
396
+
397
+
398
@main.command("gen")
@click.option(
    "-s", "--seed", type=int, default=None,
    help="RNG seed. If omitted, picks one randomly and prints it in the "
         "output header so the run is reproducible.",
)
@click.option(
    "-c", "--complexity", type=click.IntRange(0, 10), default=5, show_default=True,
    help="Complexity dial 0..10.",
)
@click.option(
    "--schema-only", is_flag=True, default=False,
    help="Emit only the CREATE TABLE script (no query).",
)
@click.option(
    "--query-only", is_flag=True, default=False,
    help="Emit only the SELECT statement (no schema DDL).",
)
@click.option(
    "-n", "--count", type=click.IntRange(min=1), default=1, show_default=True,
    help="Number of queries to emit against the same schema. "
         "Each gets its own query seed starting from --seed.",
)
@click.option(
    "--no-header", is_flag=True, default=False,
    help="Suppress the leading `-- waxsql seed=N complexity=X` header line.",
)
@click.option(
    "--with-data", "with_data", is_flag=True, default=False,
    help="Emit COPY blocks for table data between DDL and queries. "
         "Ignored when --schema-only or --query-only is set.",
)
@click.option(
    "--rows", type=click.IntRange(min=0), default=100, show_default=True,
    help="Base row count per table when --with-data is set. "
         "0 emits empty COPY blocks (header + immediate terminator).",
)
@click.option(
    "--fanout", type=click.IntRange(min=1), default=5, show_default=True,
    help="FK-depth row multiplier when --with-data is set.",
)
@click.option(
    "--null-fraction", "null_fraction", type=float, default=0.05, show_default=True,
    help="Per-nullable-column NULL probability when --with-data is set.",
)
@click.option(
    "--pprint", is_flag=True, default=False,
    help="Reformat and (on a terminal) colorize the generated SQL for "
         "human reading. Display-only — not for piping into validate. "
         "Requires the [pprint] extra.",
)
def gen(
    seed: Optional[int],
    complexity: int,
    schema_only: bool,
    query_only: bool,
    count: int,
    no_header: bool,
    with_data: bool,
    rows: int,
    fanout: int,
    null_fraction: float,
    pprint: bool,
) -> None:
    """Generate a random schema and/or query against it.

    By default emits both: schema DDL first, then the query, both
    semicolon-terminated and psql-ready. --schema-only / --query-only
    are mutually exclusive escape hatches. --count N produces N queries
    against the same schema (seeds are seed, seed+1, ..., seed+N-1), so
    `--seed S -n 3` always yields the same three-query batch.

    --with-data inserts COPY blocks between the DDL and queries, producing
    a fully self-contained psql script. It is silently ignored when
    --schema-only or --query-only is set, because the combined stream
    (header + DDL + data + query) requires both halves to be present.
    """
    # NOTE: the docstring above is the text click prints for `gen --help`.
    #
    # Output is assembled completely in memory and written to stdout in one
    # call at the very end. That way a failure part-way through (most notably
    # data generation) exits non-zero WITHOUT having already sent a header
    # and DDL down the pipe to a consumer such as psql.
    if schema_only and query_only:
        raise click.UsageError(
            "--schema-only and --query-only are mutually exclusive."
        )

    # --with-data only makes sense in the combined stream (DDL + data +
    # queries). Silently suppress it for partial outputs so callers don't
    # need to remember to drop the flag when building pipelines.
    emit_data = with_data and not schema_only and not query_only

    # --pprint reformats always; color is gated on should_colorize(stdout).
    # Any optional-dependency failure surfaces later, inside _apply_pprint.
    pprint_color = False
    if pprint:
        from waxsql.pretty import should_colorize
        pprint_color = should_colorize(sys.stdout)

    if seed is None:
        seed = _pick_random_seed()

    schema = generate_schema(seed=seed, complexity=complexity)

    out: list[str] = []
    if not no_header:
        out.append(_render_header(
            seed, complexity,
            with_data=emit_data, rows=rows, fanout=fanout,
            null_fraction=null_fraction,
        ))

    if not query_only:
        out.append("-- schema:")
        ddl = schema.emit_ddl().rstrip()
        if pprint:
            ddl = _apply_pprint(ddl, color=pprint_color)
        out.append(ddl)

    # Text that must precede the query section verbatim (header + DDL +
    # blank line + COPY blocks). Empty unless data is being emitted.
    preamble = ""
    if emit_data:
        from waxsql.data import generate_data
        try:
            data_text = generate_data(
                schema, seed=seed, rows=rows, fanout=fanout,
                null_fraction=null_fraction,
            )
        except ValueError as e:
            # Reported as a plain error message rather than a stack trace.
            # Nothing has been written to stdout at this point.
            click.echo(
                f"waxsql gen: cannot generate data for this schema: {e}\n"
                f"This typically means the schema has FK cycles, which "
                f"data generation does not yet support. Try a lower --complexity.",
                err=True,
            )
            sys.exit(1)
        # Layout: header/DDL lines, newline, one blank separator line, then
        # the COPY text as-is (it supplies its own trailing newline).
        preamble = "\n".join(out) + "\n\n" + data_text
        out = []

    if not schema_only:
        for i in range(count):
            if not query_only or i > 0:
                # Blank separator: between schema (or data) and queries
                # (i==0 with a schema present), and between consecutive queries.
                out.append("")
            label = "-- query:" if count == 1 else f"-- query {i + 1}/{count}"
            out.append(label)
            q = generate_query(
                seed=seed + i, schema=schema, complexity=complexity,
            )
            q_sql = print_query(q) + ";"
            if pprint:
                q_sql = _apply_pprint(q_sql, color=pprint_color)
            out.append(q_sql)

    # Single write; click.echo appends the final newline.
    click.echo(preamble + "\n".join(out))
560
+
561
+
562
+ @main.command("validate")
563
+ @click.argument(
564
+ "sql_file", type=click.Path(exists=True, dir_okay=False, allow_dash=True),
565
+ required=False, default="-",
566
+ )
567
+ @click.option(
568
+ "-t", "--tier",
569
+ type=click.Choice(["syntax", "parse", "plan"], case_sensitive=False),
570
+ default="syntax", show_default=True,
571
+ help="Validation tier.",
572
+ )
573
+ @click.option(
574
+ "--dsn", default=None,
575
+ help="psycopg DSN for parse/plan tiers. Defaults to $WAXSQL_PG_DSN "
576
+ "or 'dbname=waxsql_test'.",
577
+ )
578
+ @click.option(
579
+ "--schema-from", "schema_from", type=click.Path(exists=True, dir_okay=False),
580
+ default=None,
581
+ help="Install DDL from this file before validating (parse/plan only).",
582
+ )
583
+ @click.option(
584
+ "--schema-seed", "schema_seed", type=int, default=None,
585
+ help="Regenerate schema from seed instead (parse/plan only).",
586
+ )
587
+ @click.option(
588
+ "--schema-complexity", "schema_complexity",
589
+ type=click.IntRange(0, 10), default=5, show_default=True,
590
+ help="Companion to --schema-seed.",
591
+ )
592
+ @click.option(
593
+ "--auto-schema/--no-auto-schema", "auto_schema",
594
+ default=True, show_default=True,
595
+ help="If input begins with a `-- waxsql seed=N complexity=X` header, "
596
+ "regenerate that schema automatically.",
597
+ )
598
+ @click.option(
599
+ "-v", "--verbose", is_flag=True, default=False,
600
+ help='Print "OK" on success (silent by default, Unix style).',
601
+ )
602
+ def validate(
603
+ sql_file: str,
604
+ tier: str,
605
+ dsn: Optional[str],
606
+ schema_from: Optional[str],
607
+ schema_seed: Optional[int],
608
+ schema_complexity: int,
609
+ auto_schema: bool,
610
+ verbose: bool,
611
+ ) -> None:
612
+ """Validate SQL through the SYNTAX, PARSE, or PLAN tier.
613
+
614
+ Reads SQL from SQL_FILE, or from stdin if SQL_FILE is omitted or '-'.
615
+ SYNTAX is the default tier (no DB required, fast). PARSE and PLAN
616
+ tiers require a live PG connection and a schema to install. The
617
+ gen-output header (`-- waxsql seed=N complexity=X`) is parsed
618
+ automatically by default so `gen | validate` just works.
619
+ """
620
+ # ----- Read input --------------------------------------------------------
621
+ # click.Path with allow_dash=True does NOT auto-read stdin; we do it here.
622
+ # The distinction between None and '-' matters: both mean stdin, but the
623
+ # user can also explicitly pass '-' as the file argument.
624
+ if sql_file == "-":
625
+ sql_text = click.get_text_stream("stdin").read()
626
+ else:
627
+ with open(sql_file, encoding="utf-8") as f:
628
+ sql_text = f.read()
629
+
630
+ # ----- SYNTAX tier -------------------------------------------------------
631
+ if tier.lower() == "syntax":
632
+ try:
633
+ from waxsql.validate.syntax import check_syntax
634
+ except ImportError:
635
+ # pglast isn't installed — shouldn't happen in normal use (it's a
636
+ # required dep of waxsql itself), but we handle it gracefully for
637
+ # completeness and consistency with the [parse]/[plan] guard below.
638
+ click.echo(
639
+ "waxsql validate --tier syntax requires the [syntax] extra.\n"
640
+ "Install with: pip install 'waxsql[syntax]'",
641
+ err=True,
642
+ )
643
+ sys.exit(3)
644
+
645
+ # pglast handles multi-statement SQL (gen output includes DDL + query)
646
+ # and SQL comments (the header is a comment line) without issue.
647
+ result = check_syntax(sql_text)
648
+ if not result.ok:
649
+ click.echo(f"SYNTAX error: {result.error}", err=True)
650
+ sys.exit(1)
651
+ if verbose:
652
+ click.echo("OK")
653
+ return
654
+
655
+ # ----- PARSE / PLAN tiers ------------------------------------------------
656
+ # Both tiers need psycopg + a live PG. Same connection lifecycle; only
657
+ # the check function differs (PREPARE vs EXPLAIN).
658
+ try:
659
+ import psycopg
660
+ except ImportError:
661
+ click.echo(
662
+ f"waxsql validate --tier {tier} requires psycopg, which ships\n"
663
+ f"with the [parse] (or [plan]) optional extra. Install with:\n\n"
664
+ f" pip install 'waxsql[{tier}]'",
665
+ err=True,
666
+ )
667
+ sys.exit(3)
668
+
669
+ # check_fn typed as Any: check_parse and check_plan have different return
670
+ # types (ParseResult vs PlanResult) that mypy can't reconcile in a single
671
+ # variable without a Union — Any is simpler and correct here since both
672
+ # results have the same .ok / .error / .error_code duck-type interface.
673
+ check_fn: Any
674
+ if tier.lower() == "parse":
675
+ from waxsql.validate.parse import check_parse
676
+ check_fn = check_parse
677
+ else: # "plan"
678
+ from waxsql.validate.plan import check_plan
679
+ check_fn = check_plan
680
+
681
+ # Resolve schema source per the precedence rules in _resolve_schema_source.
682
+ ddl = _resolve_schema_source(
683
+ input_text=sql_text,
684
+ schema_from=schema_from,
685
+ schema_seed=schema_seed,
686
+ schema_complexity=schema_complexity,
687
+ auto_schema=auto_schema,
688
+ )
689
+ if ddl is None:
690
+ raise click.UsageError(
691
+ f"--tier {tier} needs a schema; pass --schema-from PATH, "
692
+ f"--schema-seed N, or pipe in waxsql gen output (which "
693
+ f"includes a parseable header)."
694
+ )
695
+
696
+ # Resolve DSN: --dsn flag > $WAXSQL_PG_DSN > default.
697
+ import os as _os
698
+ resolved_dsn = (
699
+ dsn
700
+ or _os.environ.get("WAXSQL_PG_DSN")
701
+ or "dbname=waxsql_test"
702
+ )
703
+
704
+ # Open a transaction-mode connection, install the schema DDL, run the
705
+ # check, then roll back so nothing persists in the DB. Same pattern as
706
+ # the install_and_check fixture in tests/conftest.py, packaged for
707
+ # one-shot use. We execute the DDL directly (not via install_schema)
708
+ # because _resolve_schema_source returns a raw DDL string, not a Schema
709
+ # object — and both sources (--schema-from file and regenerated) are
710
+ # equally valid as raw SQL.
711
+ try:
712
+ conn = psycopg.connect(resolved_dsn, autocommit=False)
713
+ except psycopg.Error as e:
714
+ # _redact_dsn masks any `password=...` or URI-form credentials
715
+ # so an exposed log (CI artifact, piped-to-less-trusted-tool)
716
+ # doesn't leak the caller's secret. Other DSN parts (host,
717
+ # dbname, etc.) survive intact for diagnostic value.
718
+ click.echo(
719
+ f"could not connect to PG ({_redact_dsn(resolved_dsn)!r}): {e}",
720
+ err=True,
721
+ )
722
+ sys.exit(1)
723
+
724
+ try:
725
+ with conn.cursor() as cur:
726
+ # psycopg (autocommit=False) opens a transaction implicitly on
727
+ # the first statement — no explicit BEGIN needed or wanted.
728
+ # Issuing BEGIN on an already-open psycopg transaction produces
729
+ # a spurious "WARNING: there is already a transaction in progress"
730
+ # from PG and is the psycopg anti-pattern.
731
+ cur.execute(ddl)
732
+
733
+ # If the input was a `gen --with-data` stream, regenerate the data
734
+ # deterministically from the header parameters and load it before
735
+ # the query check. ANALYZE so the planner sees the populated-table
736
+ # statistics. Order matters: DDL → COPY blocks → ANALYZE → EXPLAIN.
737
+ # Running ANALYZE before all COPYs would only see partial data for
738
+ # the tables loaded so far; we want one ANALYZE covering all tables.
739
+ header = _parse_header(sql_text)
740
+ if header is not None and header.with_data:
741
+ from waxsql.data import generate_data
742
+ from waxsql.schema import generate_schema as _gen_schema
743
+
744
+ # Regenerate the data from the header's parameters. Because
745
+ # generate_data is deterministic in (schema, seed, rows, fanout,
746
+ # null_fraction), this produces byte-identical output to what
747
+ # the original gen command emitted — but we don't keep the
748
+ # gen-output COPY text, we regenerate. That avoids needing to
749
+ # parse arbitrary COPY blocks back out of the input stream
750
+ # (which would conflict with extracting the trailing SELECT).
751
+ schema = _gen_schema(seed=header.seed, complexity=header.complexity)
752
+ try:
753
+ data_sql = generate_data(
754
+ schema,
755
+ seed=header.seed,
756
+ rows=header.rows,
757
+ fanout=header.fanout,
758
+ null_fraction=header.null_fraction,
759
+ )
760
+ except ValueError as e:
761
+ # Same cycle-handling branch as in `gen` — kept consistent
762
+ # so failure modes are recognizable across subcommands.
763
+ click.echo(
764
+ f"waxsql validate: cannot generate data for this schema: {e}\n"
765
+ f"This typically means the schema has FK cycles, which "
766
+ f"data generation does not yet support. Try a lower --complexity.",
767
+ err=True,
768
+ )
769
+ sys.exit(1)
770
+ with conn.cursor() as cur:
771
+ # Each COPY block runs in its own savepoint so a single load
772
+ # failure surfaces cleanly with the offending table, rather
773
+ # than aborting the surrounding transaction and losing the
774
+ # entire error context.
775
+ for block in _split_copy_blocks(data_sql):
776
+ cur.execute("SAVEPOINT _waxsql_copy")
777
+ try:
778
+ _execute_copy_block(cur, block)
779
+ except psycopg.Error as e:
780
+ cur.execute("ROLLBACK TO SAVEPOINT _waxsql_copy")
781
+ cur.execute("RELEASE SAVEPOINT _waxsql_copy")
782
+ # PG's error doesn't include the table; the block's
783
+ # first line is the COPY header which does.
784
+ table_header = block.splitlines()[0]
785
+ click.echo(
786
+ f"COPY error loading {table_header!r}: {e}",
787
+ err=True,
788
+ )
789
+ sys.exit(1)
790
+ cur.execute("RELEASE SAVEPOINT _waxsql_copy")
791
+ # ANALYZE after all COPYs so the planner sees statistics
792
+ # for every table, not just those loaded so far.
793
+ cur.execute("ANALYZE")
794
+
795
+ # Extract the first SELECT/WITH statement — PREPARE and EXPLAIN
796
+ # take a single statement; gen output may contain DDL first.
797
+ sql_to_check = _extract_first_select(sql_text)
798
+ # Use a separate variable name to avoid the SyntaxResult type mypy
799
+ # inferred for `result` in the SYNTAX branch above (even though the
800
+ # SYNTAX branch always returns before we get here, mypy tracks the
801
+ # narrowed type across both branches of the if/else chain).
802
+ check_result = check_fn(sql_to_check, conn)
803
+ except psycopg.Error as e:
804
+ # Schema install (`cur.execute(ddl)`) and `ANALYZE` run inside this
805
+ # try with no inner handler; a malformed `--schema-from` DDL or an
806
+ # ANALYZE failure would otherwise escape as a raw traceback. Surface
807
+ # it as a clean CLI error, matching the connect/COPY-load handlers.
808
+ # (check_parse/check_plan don't raise psycopg.Error — they return a
809
+ # result — so this only catches the setup statements.)
810
+ click.echo(f"validation setup failed (schema/ANALYZE): {e}", err=True)
811
+ sys.exit(1)
812
+ finally:
813
+ # If check_fn raised a non-psycopg exception (e.g. BrokenPipeError
814
+ # from a dead connection), rollback/close can raise again and mask
815
+ # the original error. Suppress so the original propagates to the
816
+ # caller — cleanup failures on an already-failing path are noise.
817
+ with contextlib.suppress(Exception):
818
+ conn.rollback()
819
+ with contextlib.suppress(Exception):
820
+ conn.close()
821
+
822
+ if not check_result.ok:
823
+ code_part = f"[{check_result.error_code}] " if check_result.error_code else ""
824
+ click.echo(f"{tier.upper()} error: {code_part}{check_result.error}", err=True)
825
+ sys.exit(1)
826
+ if verbose:
827
+ click.echo("OK")
828
+
829
+
830
@main.command("data")
@click.option(
    "--seed", type=int, required=True,
    help="Schema/data seed (required).",
)
@click.option(
    "--complexity", type=click.IntRange(0, 10), default=5, show_default=True,
    help="Schema complexity dial 0..10.",
)
@click.option(
    "--rows", type=click.IntRange(min=0), default=100, show_default=True,
    help="Base row count per table (multiplied by fanout**depth). "
         "0 emits empty COPY blocks (header + immediate terminator).",
)
@click.option(
    "--fanout", type=click.IntRange(min=1), default=5, show_default=True,
    help="FK-depth row multiplier.",
)
@click.option(
    # A probability only makes sense in [0, 1]. Range-check it at parse time,
    # like the integer options above, so a typo such as `--null-fraction 5`
    # yields click's own usage error instead of reaching the data generator
    # (where any ValueError would be reported with the FK-cycle hint below).
    "--null-fraction", "null_fraction", type=click.FloatRange(0.0, 1.0),
    default=0.05, show_default=True,
    help="Per-nullable-column NULL probability.",
)
def data(
    seed: int,
    complexity: int,
    rows: int,
    fanout: int,
    null_fraction: float,
) -> None:
    """Generate row data (COPY blocks) for a deterministic schema.

    Output is COPY blocks only — no DDL, no queries, no header. Pipe
    `waxsql gen --schema-only` (or your own DDL) before this output to
    produce a loadable psql script. The schema is regenerated from
    `--seed` and `--complexity`, so the same pair always yields the same
    tables and the same data.
    """
    # NOTE: the docstring above is what click prints for `waxsql data --help`,
    # so it is user-facing text; implementation notes belong in comments.
    #
    # Parameters (all validated by click before this function runs):
    #   seed          -- seed for both the schema and the row data.
    #   complexity    -- schema complexity dial, 0..10.
    #   rows          -- base row count per table, >= 0.
    #   fanout        -- FK-depth row multiplier, >= 1.
    #   null_fraction -- per-nullable-column NULL probability, 0.0..1.0.
    #
    # Exits with status 1 (message on stderr) if generate_data raises
    # ValueError; otherwise writes the COPY text to stdout.

    # Local imports keep startup time low when click is installed but the
    # data/schema modules haven't been used yet. The pattern mirrors the
    # optional-import guard in `validate` for psycopg/pglast.
    from waxsql.data import generate_data
    from waxsql.schema import generate_schema

    # The schema is rebuilt from (seed, complexity) rather than read from
    # input, so this command needs nothing on stdin.
    schema = generate_schema(seed=seed, complexity=complexity)
    try:
        text = generate_data(
            schema, seed=seed, rows=rows, fanout=fanout, null_fraction=null_fraction,
        )
    except ValueError as e:
        # Report on stderr so stdout stays clean for whatever is consuming
        # the COPY stream; the underlying message is included verbatim.
        click.echo(
            f"waxsql data: cannot generate data for this schema: {e}\n"
            f"This typically means the schema has FK cycles, which "
            f"data generation does not yet support. Try a lower --complexity.",
            err=True,
        )
        sys.exit(1)
    # nl=False: the COPY block string from emit_copy_block already ends with
    # "\n" after the "\." terminator, so we let the content dictate line endings.
    click.echo(text, nl=False)