claude-sql 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
claude_sql/cli.py ADDED
@@ -0,0 +1,2344 @@
1
+ """Cyclopts CLI entry point for ``claude-sql``.
2
+
3
+ Wires the ``claude-sql`` console script to its thirteen subcommands. Shared
4
+ flags -- ``--verbose`` / ``--quiet``, ``--glob``, ``--subagent-glob``,
5
+ ``--format`` -- live on a flattened :class:`Common` dataclass so callers write
6
+ ``claude-sql query ... --format json`` instead of ``--common.format json``.
7
+
8
+ Agent-friendly defaults
9
+ -----------------------
10
+ * ``--format auto`` emits a human table on a TTY and machine-readable JSON
11
+ when stdout is a pipe, so agents do not have to set a flag.
12
+ * DuckDB errors are classified into parse / catalog / runtime and mapped to
13
+ stable exit codes (64 / 65 / 70) with a JSON error payload on non-TTY.
14
+ * ``--quiet`` is honored by every subcommand; view registration goes to DEBUG
15
+ so the default stderr stays quiet for routine reads.
16
+
17
+ ``asyncio`` and subprocess imports are performed lazily inside the relevant
18
+ commands so that the fast path (``schema``, ``query``, ``explain``) does not
19
+ drag extra modules into startup.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import json
25
+ import os
26
+ import re
27
+ import subprocess
28
+ import sys
29
+ import tempfile
30
+ import time
31
+ from dataclasses import dataclass
32
+ from datetime import UTC, datetime
33
+ from pathlib import Path
34
+ from typing import Annotated
35
+
36
+ import duckdb
37
+ import polars as pl
38
+ from cyclopts import App, Parameter
39
+ from loguru import logger
40
+
41
+ from claude_sql import (
42
+ binding as _binding,
43
+ blind_handover as _blind_handover,
44
+ checkpointer,
45
+ freeze as _freeze,
46
+ judge_worker as _judge_worker,
47
+ judges as _judge_catalog,
48
+ kappa_worker as _kappa_worker,
49
+ skills_catalog as _skills_catalog,
50
+ ungrounded_worker as _ungrounded_worker,
51
+ )
52
+ from claude_sql.cluster_worker import run_clustering
53
+ from claude_sql.community_worker import run_communities
54
+ from claude_sql.config import Settings
55
+ from claude_sql.embed_worker import embed_query, run_backfill
56
+ from claude_sql.friction_worker import detect_user_friction
57
+ from claude_sql.install_source import format_version
58
+ from claude_sql.llm_worker import classify_sessions, detect_conflicts, trajectory_messages
59
+ from claude_sql.logging_setup import configure_logging
60
+ from claude_sql.output import (
61
+ EXIT_CODES,
62
+ ClassifiedError,
63
+ InputValidationError,
64
+ OutputFormat,
65
+ emit_dataframe,
66
+ emit_error,
67
+ emit_json,
68
+ resolve_format,
69
+ run_or_die,
70
+ validate_glob,
71
+ )
72
+ from claude_sql.parquet_shards import (
73
+ count_rows,
74
+ is_sharded_dir,
75
+ iter_part_files,
76
+ )
77
+ from claude_sql.review_sheet_render import render_markdown, render_refusal_markdown
78
+ from claude_sql.review_sheet_worker import generate_review_sheet
79
+ from claude_sql.sql_views import (
80
+ describe_all,
81
+ list_macros,
82
+ register_all,
83
+ register_raw,
84
+ register_views,
85
+ )
86
+ from claude_sql.terms_worker import run_terms
87
+
88
+ _APP_HELP = """\
89
+ Zero-copy SQL + Cohere Embed v4 semantic search + Sonnet 4.6 analytics over
90
+ ~/.claude/ JSONL transcripts (and their subagent sidecars).
91
+
92
+ Surfaces at a glance
93
+ --------------------
94
+ schema / list-cache / explain introspection (read-only, zero cost)
95
+ query / shell run SQL against 18 views + 14 macros
96
+ embed / search Cohere Embed v4 + HNSW cosine search
97
+ classify / trajectory / Sonnet 4.6 analytics -- each defaults to
98
+ conflicts / friction --dry-run; pass --no-dry-run to spend
99
+ cluster / terms / community UMAP+HDBSCAN, c-TF-IDF, Louvain
100
+ analyze composite pipeline over every stage above
101
+
102
+ Flag placement (important for agents)
103
+ -------------------------------------
104
+ All flags attach to a SUBCOMMAND, not the top-level binary. Correct:
105
+ claude-sql query --format json "SELECT 1"
106
+ claude-sql classify --no-dry-run --limit 5
107
+ Incorrect (flag gets swallowed as the subcommand argument):
108
+ claude-sql --format json query "SELECT 1"
109
+
110
+ Output & exit codes
111
+ -------------------
112
+ * --format {auto,table,json,ndjson,csv} on every subcommand. auto = table on
113
+ TTY / json on pipe, so `claude-sql <cmd> | jq` works without a flag.
114
+ * 0 success
115
+ * 2 missing embeddings parquet (run: claude-sql embed --since-days N --no-dry-run)
116
+ * 64 invalid input -- malformed --glob, unparseable SQL, bad flag
117
+ * 65 catalog error -- unknown view/column; run `claude-sql schema` for the catalog
118
+ * 70 runtime error -- everything else DuckDB raises (check --format json stderr)
119
+ * 127 system `duckdb` binary not on PATH (only affects `shell`)
120
+
121
+ Cost guard
122
+ ----------
123
+ Every command that calls Bedrock (embed, classify, trajectory, conflicts,
124
+ friction, analyze) defaults to --dry-run. Dry-run emits a plan JSON to stdout
125
+ with candidate counts, estimated tokens, and dollar estimate -- agents can
126
+ parse that to decide whether to proceed. Real spend requires --no-dry-run.
127
+
128
+ Glob scoping (cheaper workers)
129
+ ------------------------------
130
+ Narrow to one project with --glob to cut worker budget:
131
+ --glob "/home/you/.claude/projects/-efs-you-workplace-bonk/*.jsonl"
132
+ At most one '**' segment is allowed per pattern (DuckDB limitation) -- the
133
+ CLI rejects multi-star globs with a clear hint before DuckDB sees them.
134
+ """
135
+
136
+
137
+ app = App(
138
+ name="claude-sql",
139
+ version=format_version,
140
+ help=_APP_HELP,
141
+ )
142
+
143
+
144
@Parameter(name="*")
@dataclass
class Common:
    """Shared CLI flags flattened onto every subcommand.

    ``verbose`` and its paired ``--quiet`` negation both map to this single
    bool (cyclopts uses the ``negative=`` argument to wire the "opposite"
    flag onto the same field). ``quiet`` is the one extra concept the
    dataclass needs to carry: it cannot piggyback on ``verbose`` because
    the two states are not symmetric (verbose forces DEBUG, quiet forces
    ERROR, and the default is INFO).
    """

    # Force DEBUG-level logging when set (see _configure / configure_logging).
    verbose: bool = False
    # Force ERROR-level logging; separate from `verbose` because the default
    # (INFO) sits between the two extremes.
    quiet: bool = False
    # Narrow the JSONL universe scanned; validated by _resolve_settings
    # before DuckDB ever sees the pattern.
    glob: str | None = None
    # Same narrowing, applied to subagent sidecar files.
    subagent_glob: str | None = None
    # Output format; AUTO resolves to table on a TTY and JSON on a pipe.
    format: Annotated[OutputFormat, Parameter(name="--format")] = OutputFormat.AUTO
162
+
163
+
164
def _configure(common: Common | None) -> None:
    """Wire up logging sinks from the shared ``--verbose`` / ``--quiet`` flags."""
    if common is None:
        configure_logging(verbose=False, quiet=False)
    else:
        configure_logging(verbose=common.verbose, quiet=common.quiet)
170
+
171
+
172
def _fmt(common: Common | None) -> OutputFormat:
    """Resolve the effective output format (AUTO when no shared flags were given)."""
    if common is None:
        return OutputFormat.AUTO
    return common.format
175
+
176
+
177
+ # ---------------------------------------------------------------------------
178
+ # Helpers
179
+ # ---------------------------------------------------------------------------
180
+
181
+
182
def _resolve_settings(common: Common | None) -> Settings:
    """Build :class:`Settings` from env then apply CLI overrides.

    ``--glob`` / ``--subagent-glob`` are validated up front so DuckDB never
    sees a pattern it cannot consume (e.g. ``**/.../**``). A bad pattern
    produces a classified error and exit code 64, giving every subcommand
    identical treatment without wrapping each call site.
    """
    settings = Settings()
    if common is None:
        return settings

    try:
        validate_glob(common.glob, flag="--glob")
        validate_glob(common.subagent_glob, flag="--subagent-glob")
    except InputValidationError as exc:
        err = ClassifiedError(
            kind="invalid_input",
            exit_code=EXIT_CODES["invalid_input"],
            message=str(exc),
            hint=exc.hint,
        )
        emit_error(err, _fmt(common))
        sys.exit(err.exit_code)

    overrides: dict[str, str] = {}
    if common.glob is not None:
        overrides["default_glob"] = common.glob
    if common.subagent_glob is not None:
        overrides["subagent_glob"] = common.subagent_glob
    # Return the original object untouched when there is nothing to override.
    return settings.model_copy(update=overrides) if overrides else settings
213
+
214
+
215
+ _PERCENT_LIMIT_RE = re.compile(r"^\s*(\d+(?:\.\d+)?)\s*%\s*$")
216
+
217
+
218
+ def _resolve_memory_limit(limit: str) -> str:
219
+ """Translate ``"<n>%"`` into an absolute size DuckDB accepts.
220
+
221
+ DuckDB's ``memory_limit`` parser only knows ``KB / MB / GB / TB`` and the
222
+ binary variants. Percentage strings are rejected, so we resolve them
223
+ against the host's reported total memory before the PRAGMA fires. Any
224
+ other form passes through unchanged so the env var can still pin an
225
+ absolute size like ``"4GB"`` directly.
226
+ """
227
+ match = _PERCENT_LIMIT_RE.match(limit)
228
+ if match is None:
229
+ return limit.strip()
230
+ fraction = float(match.group(1)) / 100.0
231
+ try:
232
+ page_size = os.sysconf("SC_PAGE_SIZE")
233
+ phys_pages = os.sysconf("SC_PHYS_PAGES")
234
+ except (AttributeError, ValueError, OSError):
235
+ # Non-POSIX or restricted host — fall back to a conservative 4 GiB.
236
+ total_bytes = 4 * 1024**3
237
+ else:
238
+ total_bytes = page_size * phys_pages
239
+ target_mib = max(1, int((total_bytes * fraction) // (1024 * 1024)))
240
+ return f"{target_mib}MiB"
241
+
242
+
243
def _open_connection(settings: Settings) -> duckdb.DuckDBPyConnection:
    """Open an in-memory DuckDB connection with every claude-sql object wired.

    Tuning PRAGMAs run before view registration so the registration queries
    themselves benefit from the higher thread count and the spill directory
    pointed at real disk (Amazon devboxes ship ``/tmp`` as a 4 GB tmpfs that
    thrashes once a clustering run starts spilling).
    """
    settings.duckdb_temp_dir.mkdir(parents=True, exist_ok=True)
    memory_limit = _resolve_memory_limit(settings.duckdb_memory_limit)
    con = duckdb.connect(":memory:")
    tuning_statements = (
        f"SET threads = {int(settings.duckdb_threads)}",
        f"SET memory_limit = '{memory_limit}'",
        f"SET temp_directory = '{settings.duckdb_temp_dir}'",
        "SET enable_object_cache = true",
        "SET preserve_insertion_order = false",
    )
    for statement in tuning_statements:
        con.execute(statement)
    register_all(con, settings=settings)
    return con
261
+
262
+
263
def _emit_worker_result(result: int | dict, common: Common | None, pipeline: str) -> None:
    """Normalize worker results for stdout.

    Workers return either a plan ``dict`` (under ``--dry-run``) or an
    ``int`` of rows processed. Agents parse stdout JSON, so something
    machine-readable is always emitted: the plan verbatim, or a compact
    summary dict when real work ran.
    """
    fmt = _fmt(common)
    payload = (
        result
        if isinstance(result, dict)
        else {"pipeline": pipeline, "rows_processed": int(result), "dry_run": False}
    )
    emit_json(payload, fmt)
276
+
277
+
278
# EXPLAIN plan markers that indicate pushdown or noteworthy physical ops.
# Plan lines containing any of these substrings are highlighted green in
# `explain`'s TTY output.
_EXPLAIN_MARKERS: tuple[str, ...] = (
    "READ_JSON",
    "Filters:",
    "Projection",
    "Filter",
    "HASH_JOIN",
    "HNSW_INDEX_SCAN",
    "HASH_GROUP_BY",
)
288
+
289
+
290
+ def _describe_checkpoint_entry(path: Path) -> dict[str, object]:
291
+ """Report the persistent DuckDB checkpoint file alongside the parquet caches.
292
+
293
+ Keeps the same ``{name, path, exists[, bytes, mtime, rows]}`` shape as
294
+ :func:`_describe_cache_entry` so ``list-cache`` stays homogeneous. Row
295
+ count is queried via :func:`checkpointer.count_rows`.
296
+ """
297
+ exists = path.exists() and path.is_file()
298
+ entry: dict[str, object] = {"name": "session_checkpoint", "path": str(path), "exists": exists}
299
+ if not exists:
300
+ return entry
301
+ stat = path.stat()
302
+ entry["bytes"] = stat.st_size
303
+ entry["mtime"] = datetime.fromtimestamp(stat.st_mtime, tz=UTC).isoformat()
304
+ try:
305
+ entry["rows"] = checkpointer.count_rows(path)
306
+ except duckdb.Error:
307
+ entry["rows"] = None
308
+ return entry
309
+
310
+
311
def _describe_cache_entry(name: str, path: Path) -> dict[str, object]:
    """Collect filesystem metadata about one parquet cache entry.

    Handles both legacy single-file caches and the sharded directory layout
    (``<dir>/part-*.parquet``). For a sharded directory, ``bytes`` sums the
    parts, ``mtime`` is the newest part's modification time, and ``rows``
    is the union row count via :func:`count_rows` (a footer-only
    ``scan_parquet``, so cheap even on very large caches). A zero-byte /
    corrupt part surfaces ``rows=None`` rather than aborting the listing.
    """
    parts = iter_part_files(path)
    present = bool(parts) or path.exists()
    entry: dict[str, object] = {"name": name, "path": str(path), "exists": present}
    if not present:
        return entry

    if not parts:
        # Path exists (e.g. an empty directory) but has no part files;
        # surface the bare directory mtime so users can still see "we made
        # the dir".
        meta = path.stat()
        entry["bytes"] = 0
        entry["mtime"] = datetime.fromtimestamp(meta.st_mtime, tz=UTC).isoformat()
        entry["rows"] = 0
        return entry

    part_stats = [p.stat() for p in parts]
    entry["bytes"] = sum(s.st_size for s in part_stats)
    newest = max(s.st_mtime for s in part_stats)
    entry["mtime"] = datetime.fromtimestamp(newest, tz=UTC).isoformat()

    if any(s.st_size <= 16 for s in part_stats):
        # A part this small cannot hold a valid parquet footer; skip the scan.
        entry["rows"] = None
        return entry
    try:
        entry["rows"] = count_rows(path)
    except (OSError, ValueError):
        # ``count_rows`` is a polars scan; an unreadable footer surfaces here.
        entry["rows"] = None
    return entry
356
+
357
+
358
+ # ---------------------------------------------------------------------------
359
+ # Subcommands
360
+ # ---------------------------------------------------------------------------
361
+
362
+
363
@app.command
def shell(*, common: Common | None = None) -> None:
    """Launch the interactive duckdb REPL with every view, macro, and the HNSW index pre-registered.

    When to use
    -----------
    Interactive exploration -- iterating on SQL joins, inspecting macros,
    feeling out the catalog. Agents should prefer ``query`` (single-shot)
    or ``shell`` via a subprocess only when they truly need a session.

    What it does
    ------------
    1. Creates a temporary on-disk DuckDB file.
    2. Runs ``register_all`` to materialize 18 views + 14 macros + VSS.
    3. Execs the system ``duckdb`` binary against the file.

    Exit codes
    ----------
    * 127 ``duckdb`` binary not on PATH (install it with `uv tool install
      duckdb` or your OS package manager).

    Notes
    -----
    The temp DB path is printed on startup so you can reopen it later or
    delete it. The path is NOT cleaned up automatically on exit -- that's
    intentional so long-running sessions can be resumed.
    """
    _configure(common)
    settings = _resolve_settings(common)

    # mkstemp returns a tuple of (fd, path); we want the path only — duckdb
    # opens its own handle. Closing the fd immediately keeps the file but
    # releases the descriptor (mkstemp is preferred over NamedTemporaryFile
    # here because we never write to the handle; we just need a unique
    # path that already exists on disk so duckdb can open it).
    fd, db_path = tempfile.mkstemp(suffix=".duckdb")
    os.close(fd)

    # Materialize the catalog into the on-disk DB, then close our handle so
    # the REPL subprocess can reopen the file without a lock conflict.
    con = duckdb.connect(db_path)
    try:
        register_all(con, settings=settings)
    finally:
        con.close()

    logger.info("Opening DuckDB REPL with pre-registered views + macros + HNSW index")
    logger.info("(Exit with .quit; DB persists at {})", db_path)
    try:
        # check=False: whatever status the interactive REPL exits with is
        # not an error from our point of view.
        subprocess.run(["duckdb", db_path], check=False)
    except FileNotFoundError:
        logger.error(
            "`duckdb` binary not found on PATH. Install it or run queries via "
            "`claude-sql query '<sql>'`. DB persists at {}",
            db_path,
        )
        sys.exit(EXIT_CODES["duckdb_missing"])
418
+
419
+
420
+ def _profile_path_for(label: str) -> Path:
421
+ """Build the destination path used by ``--profile-json``.
422
+
423
+ Splits filename composition out of the writer so callers can configure
424
+ DuckDB's ``profiling_output`` PRAGMA before the profiled query runs
425
+ (DuckDB writes the JSON itself; we just read it back to confirm the
426
+ file landed and surface its location to the user).
427
+ """
428
+ profiling_dir = Path(os.path.expanduser("~/.claude/profiling/"))
429
+ profiling_dir.mkdir(parents=True, exist_ok=True)
430
+ safe_label = re.sub(r"[^A-Za-z0-9_-]+", "-", label).strip("-") or "profile"
431
+ return profiling_dir / f"{safe_label}-{int(time.time() * 1000)}.json"
432
+
433
+
434
def _capture_profile(con: duckdb.DuckDBPyConnection, label: str) -> Path:
    """Arm DuckDB JSON profiling and return the path the next query will fill.

    Synthesizes an output path, then sets ``enable_profiling = 'json'`` and
    points ``profiling_output`` at it on *con*. The caller is responsible
    for executing exactly one statement after this returns; DuckDB persists
    the profile JSON to the returned path when that statement runs.
    """
    destination = _profile_path_for(label)
    # Double single-quotes for the SQL string literal; tmp paths can contain
    # unusual characters under pytest.
    escaped = str(destination).replace("'", "''")
    con.execute("SET enable_profiling = 'json'")
    con.execute(f"SET profiling_output = '{escaped}'")
    return destination
450
+
451
+
452
@app.command
def query(
    sql: str,
    /,
    *,
    profile_json: bool = False,
    common: Common | None = None,
) -> None:
    """Run one SQL query against the claude-sql catalog and emit results.

    When to use
    -----------
    Read-only exploration and aggregation against the 18 pre-registered
    views. The catalog is free (no Bedrock, no LLM, no cost), so run queries
    liberally -- they're the cheapest way to introspect sessions / messages
    / tool calls / analytics.

    Positional args
    ---------------
    SQL
        A single SQL statement. Multi-statement scripts are rejected by
        DuckDB's single-exec path -- use ``shell`` for those.

    Key flags
    ---------
    --glob PATTERN
        Narrow the universe of JSONLs scanned. Must have at most one '**'
        segment. Example:
        --glob "/home/you/.claude/projects/-efs-you-bonk/*.jsonl"
    --subagent-glob PATTERN
        Same, for subagent sidecar files.
    --format {auto,table,json,ndjson,csv}
        auto emits table on TTY, json on pipe.

    Output
    ------
    TTY default: Polars-rendered table.
    Non-TTY: JSON array of row dicts (ideal for `jq` / agent parsing).

    Exit codes
    ----------
    * 64 parse_error malformed SQL (see error.hint for the fix)
    * 65 catalog_error unknown view/macro/column (try ``schema``)
    * 70 runtime_error everything else DuckDB raises

    Catalog discovery
    -----------------
    Run ``claude-sql schema --format json`` for the full view + macro list,
    or ``claude-sql list-cache`` to see which analytics parquets exist.

    Examples
    --------
    Session counts:
        claude-sql query "SELECT COUNT(*) FROM sessions"
    Top assistants by token spend:
        claude-sql query --format json "
          SELECT model, SUM(input_tokens + output_tokens) AS toks
          FROM messages GROUP BY 1 ORDER BY 2 DESC LIMIT 5"
    """
    _configure(common)
    settings = _resolve_settings(common)
    fmt = _fmt(common)
    con = _open_connection(settings)
    try:
        profile_path: Path | None = None
        # Profiling PRAGMAs must be armed before the profiled statement runs.
        if profile_json:
            profile_path = _capture_profile(con, label="query")
        # run_or_die classifies DuckDB errors (parse/catalog/runtime) and
        # exits with the stable codes 64/65/70.
        df = run_or_die(lambda: con.execute(sql).pl(), fmt=fmt)
        emit_dataframe(df, fmt)
        if profile_path is not None:
            logger.info("Wrote profile JSON: {}", profile_path)
    finally:
        con.close()
525
+
526
+
527
@app.command
def explain(
    sql: str,
    /,
    *,
    analyze: bool = False,
    profile_json: bool = False,
    common: Common | None = None,
) -> None:
    """Show the DuckDB query plan and highlight pushdown / noteworthy operators.

    When to use
    -----------
    Before running a ``query`` that might scan a lot of JSONLs -- confirm
    filter pushdown, spot accidental full scans, verify HNSW_INDEX_SCAN
    kicks in for vector searches.

    Flags
    -----
    --analyze
        Run ``EXPLAIN ANALYZE`` (executes the query and reports real
        timings). Off by default so probing slow queries is free.
    --format {auto,table,json,...}
        TTY table highlights READ_JSON / Filter / HASH_JOIN / HASH_GROUP_BY
        / HNSW_INDEX_SCAN in green. JSON emits ``{"plan": "<text>"}``.

    Exit codes
    ----------
    Same as ``query``: 64 parse / 65 catalog / 70 runtime.
    """
    _configure(common)
    settings = _resolve_settings(common)
    fmt = resolve_format(_fmt(common))
    con = _open_connection(settings)
    try:
        profile_path: Path | None = None
        if profile_json:
            profile_path = _capture_profile(con, label="explain")
        prefix = "EXPLAIN ANALYZE " if analyze else "EXPLAIN "
        rows = run_or_die(lambda: con.execute(prefix + sql).fetchall(), fmt=fmt)
        # EXPLAIN rows are (type, plan_text) tuples; the plan sits in the last
        # column regardless of row shape.
        text = "\n".join(str(r[-1]) for r in rows)
        if fmt is OutputFormat.TABLE:
            for line in text.splitlines():
                # \033[92m / \033[0m: ANSI green highlight for marker lines.
                if any(m in line for m in _EXPLAIN_MARKERS):
                    print(f"\033[92m{line}\033[0m")
                else:
                    print(line)
        else:
            emit_json({"plan": text}, fmt)
        if profile_path is not None:
            logger.info("Wrote profile JSON: {}", profile_path)
    finally:
        con.close()
582
+
583
+
584
@app.command
def schema(*, common: Common | None = None) -> None:
    """List every registered view (with columns) and every macro in one pass.

    When to use
    -----------
    First thing an agent should call after ``--help``: it's the canonical
    catalog. Use it to discover column names before composing ``query``
    calls -- e.g., ``session_classifications`` uses both ``autonomy_tier``
    (canonical) and ``autonomy`` (alias), and the schema lists both.

    Output shape (non-TTY / JSON)
    -----------------------------
    ::

        {
          "views": {
            "sessions": [{"column": "session_id", "type": "VARCHAR"}, ...],
            "messages": [...],
            "session_classifications": [...],   // only if parquet exists
            ...
          },
          "macros": ["autonomy_trend", "conflict_rate", ...]
        }

    Missing analytics parquets are silently omitted (register_analytics
    skips them). Use ``list-cache`` to see which generators still need to
    run.
    """
    _configure(common)
    settings = _resolve_settings(common)
    fmt = resolve_format(_fmt(common))
    con = _open_connection(settings)
    try:
        views = describe_all(con)
        macros = list_macros(con)
        if fmt is OutputFormat.TABLE:
            # \033[1m / \033[0m: ANSI bold for view and section headings.
            for name, cols in views.items():
                print(f"\n\033[1m{name}\033[0m ({len(cols)} cols)")
                for col, col_type in cols:
                    print(f"  {col:<28} {col_type}")
            print(f"\n\033[1mMacros\033[0m ({len(macros)})")
            for macro in macros:
                print(f"  {macro}")
        else:
            payload = {
                "views": {
                    name: [{"column": c, "type": t} for c, t in cols]
                    for name, cols in views.items()
                },
                "macros": list(macros),
            }
            emit_json(payload, fmt)
    finally:
        con.close()
639
+
640
+
641
@app.command(name="list-cache")
def list_cache(*, common: Common | None = None) -> None:
    """Report each parquet cache's presence, size, freshness, and row count.

    When to use
    -----------
    Before running ``search`` (requires ``embeddings``) or composing
    analytics queries (require ``session_classifications`` /
    ``message_trajectory`` / ``session_conflicts`` / ``message_clusters``
    / ``cluster_terms`` / ``session_communities`` / ``user_friction``).

    What it reports
    ---------------
    One entry per cache (plus the persistent checkpointer DB):
    ``{name, path, exists, bytes, mtime, rows}``. When ``exists`` is
    false, ``bytes`` / ``mtime`` / ``rows`` are omitted.

    How to populate each cache
    --------------------------
    * embeddings              → ``claude-sql embed --no-dry-run``
    * session_classifications → ``claude-sql classify --no-dry-run``
    * message_trajectory      → ``claude-sql trajectory --no-dry-run``
    * session_conflicts       → ``claude-sql conflicts --no-dry-run``
    * message_clusters        → ``claude-sql cluster``
    * cluster_terms           → ``claude-sql terms``
    * session_communities     → ``claude-sql community``
    * user_friction           → ``claude-sql friction --no-dry-run``
    * skills_catalog          → ``claude-sql skills sync``
    """
    _configure(common)
    settings = _resolve_settings(common)
    fmt = resolve_format(_fmt(common))
    entries = [
        _describe_cache_entry("embeddings", settings.embeddings_parquet_path),
        _describe_cache_entry("session_classifications", settings.classifications_parquet_path),
        _describe_cache_entry("message_trajectory", settings.trajectory_parquet_path),
        _describe_cache_entry("session_conflicts", settings.conflicts_parquet_path),
        _describe_cache_entry("message_clusters", settings.clusters_parquet_path),
        _describe_cache_entry("cluster_terms", settings.cluster_terms_parquet_path),
        _describe_cache_entry("session_communities", settings.communities_parquet_path),
        _describe_cache_entry("user_friction", settings.user_friction_parquet_path),
        _describe_cache_entry("skills_catalog", settings.skills_catalog_parquet_path),
        _describe_checkpoint_entry(settings.checkpoint_db_path),
    ]

    if fmt is OutputFormat.TABLE:
        df = pl.DataFrame(entries)
        emit_dataframe(df, OutputFormat.TABLE)
        return
    # JSON / NDJSON / CSV -- emit the list directly so downstream tooling
    # doesn't have to unwrap a wrapper object.
    if fmt is OutputFormat.NDJSON:
        for entry in entries:
            # default=str: mtime values are already ISO strings, but rows
            # may be None and paths are plain strings; str() covers any
            # straggler non-JSON type.
            sys.stdout.write(json.dumps(entry, default=str))
            sys.stdout.write("\n")
        return
    if fmt is OutputFormat.CSV:
        emit_dataframe(pl.DataFrame(entries), OutputFormat.CSV)
        return
    emit_json(entries, fmt)
701
+
702
+
703
+ # ---------------------------------------------------------------------------
704
+ # ``cache`` sub-app — compact / migrate the sharded worker-output parquets.
705
+ # ---------------------------------------------------------------------------
706
+ #
707
+ # Workers (embed, classify, trajectory, conflicts, friction) write each
708
+ # chunk as a fresh ``part-<ts_ns>.parquet`` under their cache directory.
709
+ # Over time many small parts accumulate; ``cache compact`` consolidates
710
+ # them into a single ``part-compacted-<ts>.parquet`` and removes the
711
+ # originals. ``cache migrate`` walks legacy single-file caches that
712
+ # pre-date this layout and moves each one into a sibling directory with
713
+ # its existing mtime preserved so the HNSW persistence and cluster-mtime
714
+ # sidecar logic stay valid.
715
+ #
716
+ # Both commands honour ``--dry-run`` (default ``True``) the same way every
717
+ # Bedrock-bearing command does in this codebase: nothing happens until you
718
+ # pass ``--no-dry-run``.
719
+
720
+ cache_app = App(
721
+ name="cache",
722
+ help=(
723
+ "Manage the sharded worker-output parquet caches.\n\n"
724
+ " cache compact consolidates many ``part-*.parquet`` shards into one.\n"
725
+ " cache migrate moves a legacy single-file cache into the new dir layout.\n\n"
726
+ "Both commands default to --dry-run; pass --no-dry-run to act."
727
+ ),
728
+ )
729
+ app.command(cache_app)
730
+
731
+
732
+ def _resolve_cache_paths(settings: Settings) -> dict[str, Path]:
733
+ """Return ``{cache_name: path}`` for every worker-append cache.
734
+
735
+ These are the five caches with sharded-write semantics: writers append
736
+ by dropping fresh parts, so they accumulate and benefit from ``compact``.
737
+ The four single-write caches (``clusters``, ``cluster_terms``,
738
+ ``communities``, ``skills_catalog``) and the checkpoint DB don't fit
739
+ the same pattern and are intentionally excluded.
740
+ """
741
+ return {
742
+ "embeddings": settings.embeddings_parquet_path,
743
+ "session_classifications": settings.classifications_parquet_path,
744
+ "message_trajectory": settings.trajectory_parquet_path,
745
+ "session_conflicts": settings.conflicts_parquet_path,
746
+ "user_friction": settings.user_friction_parquet_path,
747
+ }
748
+
749
+
750
@cache_app.command(name="compact")
def cache_compact(
    *,
    name: str | None = None,
    dry_run: bool = True,
    common: Common | None = None,
) -> None:
    """Consolidate ``part-*.parquet`` shards into a single compacted part file.

    Walks each sharded cache directory, reads every part, writes a fresh
    ``part-compacted-<ts_ns>.parquet`` containing the union, and only after
    that succeeds removes the originals. Legacy single-file caches and
    caches with zero or one parts are left alone — there is nothing to
    consolidate.

    Flags
    -----
    --name <cache>  Restrict to one of: embeddings, session_classifications,
                    message_trajectory, session_conflicts, user_friction.
                    Default is "all five".
    --dry-run       Default True. Pass ``--no-dry-run`` to actually rewrite.
    """
    _configure(common)
    settings = _resolve_settings(common)
    fmt = resolve_format(_fmt(common))

    targets = _resolve_cache_paths(settings)
    if name is not None:
        if name not in targets:
            err = ClassifiedError(
                kind="invalid_input",
                exit_code=EXIT_CODES["invalid_input"],
                message=f"Unknown cache name: {name!r}",
                hint=f"Pick one of: {', '.join(sorted(targets))}",
            )
            emit_error(err, _fmt(common))
            sys.exit(err.exit_code)
        targets = {name: targets[name]}

    summaries: list[dict[str, object]] = []
    for cache_name, path in targets.items():
        parts = iter_part_files(path)
        # Zero or one parts, or a legacy single-file cache: nothing to merge.
        if len(parts) <= 1 or not is_sharded_dir(path):
            summaries.append(
                {
                    "name": cache_name,
                    "path": str(path),
                    "parts": len(parts),
                    "action": "skip",
                    "reason": "no_compaction_needed",
                }
            )
            continue
        if dry_run:
            total_bytes = sum(p.stat().st_size for p in parts)
            summaries.append(
                {
                    "name": cache_name,
                    "path": str(path),
                    "parts": len(parts),
                    "bytes": total_bytes,
                    "action": "would_compact",
                }
            )
            continue
        # Read the union via polars, write a fresh compacted shard, delete
        # the originals only after the write succeeds. Any IO error here
        # leaves the directory intact so a retry does not lose data.
        df = pl.read_parquet([str(p) for p in parts])
        compacted = path / f"part-compacted-{time.time_ns()}.parquet"
        df.write_parquet(compacted)
        for p in parts:
            p.unlink()
        summaries.append(
            {
                "name": cache_name,
                "path": str(path),
                "parts": len(parts),
                "rows": int(df.height),
                "compacted_to": str(compacted),
                "action": "compacted",
            }
        )

    if fmt is OutputFormat.TABLE:
        emit_dataframe(pl.DataFrame(summaries), OutputFormat.TABLE)
        return
    if fmt is OutputFormat.NDJSON:
        for s in summaries:
            sys.stdout.write(json.dumps(s, default=str))
            sys.stdout.write("\n")
        return
    if fmt is OutputFormat.CSV:
        emit_dataframe(pl.DataFrame(summaries), OutputFormat.CSV)
        return
    emit_json(summaries, fmt)
846
+
847
+
848
@cache_app.command(name="migrate")
def cache_migrate(
    *,
    dry_run: bool = True,
    common: Common | None = None,
) -> None:
    """Move legacy single-file caches into the sharded directory layout.

    For each of the five worker-append caches, looks for the historical
    ``~/.claude/<name>.parquet`` file alongside the new
    ``~/.claude/<name>/`` directory. When a single-file cache exists, the
    file is moved (not copied) into the directory as
    ``part-<original_mtime_ns>.parquet`` so subsequent runs treat it as
    just another shard. The original mtime is preserved on the new file so
    HNSW-persistence freshness checks behave identically.

    Flags
    -----
    --dry-run   Default True. Pass ``--no-dry-run`` to actually move files.
    """
    _configure(common)
    settings = _resolve_settings(common)
    fmt = resolve_format(_fmt(common))

    targets = _resolve_cache_paths(settings)
    summaries: list[dict[str, object]] = []
    for cache_name, dir_path in targets.items():
        # Legacy single-file path is the same parent directory + the cache
        # name + ".parquet" — that's what ``_default_*_parquet`` returned
        # before this PR.
        legacy = dir_path.with_suffix(".parquet")
        # Some users may have customised the cache path explicitly; we only
        # touch the canonical sibling, never an arbitrary user file.
        if not legacy.is_file():
            summaries.append(
                {
                    "name": cache_name,
                    "from": str(legacy),
                    "to": str(dir_path),
                    "action": "skip",
                    "reason": "no_legacy_file",
                }
            )
            continue
        # Capture the mtime BEFORE the move; the shard filename encodes it
        # so freshness checks survive the migration.
        original_ns = legacy.stat().st_mtime_ns
        target = dir_path / f"part-{original_ns}.parquet"
        if dry_run:
            summaries.append(
                {
                    "name": cache_name,
                    "from": str(legacy),
                    "to": str(target),
                    "bytes": legacy.stat().st_size,
                    "action": "would_move",
                }
            )
            continue
        dir_path.mkdir(parents=True, exist_ok=True)
        # ``rename`` preserves contents and mtime when both paths live on
        # the same filesystem — for the canonical ``~/.claude/`` layout
        # they always do. ``os.utime`` is a defensive belt+suspenders.
        legacy.rename(target)
        os.utime(target, ns=(original_ns, original_ns))
        summaries.append(
            {
                "name": cache_name,
                "from": str(legacy),
                "to": str(target),
                "action": "migrated",
            }
        )

    # Same heterogeneous-summary emission pattern as ``cache compact``:
    # NDJSON is written row-by-row; table/CSV go through polars.
    if fmt is OutputFormat.TABLE:
        emit_dataframe(pl.DataFrame(summaries), OutputFormat.TABLE)
        return
    if fmt is OutputFormat.NDJSON:
        for s in summaries:
            sys.stdout.write(json.dumps(s, default=str))
            sys.stdout.write("\n")
        return
    if fmt is OutputFormat.CSV:
        emit_dataframe(pl.DataFrame(summaries), OutputFormat.CSV)
        return
    emit_json(summaries, fmt)
932
+
933
+
934
# ---------------------------------------------------------------------------
# ``skills`` sub-app — catalog of locally-available Skills and slash commands.
# ---------------------------------------------------------------------------

skills_app = App(
    name="skills",
    help=(
        "Seed and inspect the local Skills catalog.\n\n"
        "The catalog binds skill_id (e.g. 'erpaval', 'personal-plugins:erpaval') "
        "to its human description, source plugin, and version so skill_usage can "
        "enrich raw invocations. Seeded from ~/.claude/skills/ and "
        "~/.claude/plugins/cache/**; no Bedrock cost."
    ),
)
# Mount the sub-app so its commands surface as ``claude-sql skills <cmd>``.
app.command(skills_app)
949
+
950
+
951
@skills_app.command(name="sync")
def skills_sync(
    *,
    dry_run: bool = False,
    common: Common | None = None,
) -> None:
    """Walk ``~/.claude/skills`` and ``~/.claude/plugins/cache`` → skills_catalog.parquet.

    Sources
    -------
    * ``~/.claude/skills/<name>/SKILL.md`` → ``user-skill``
    * ``<plugins_cache>/<owner>/<plugin>/<v>/skills/<n>/SKILL.md``
      → ``plugin-skill`` (bare + ``<plugin>:<n>``)
    * ``<plugins_cache>/<owner>/<plugin>/<v>/commands/<n>.md``
      → ``plugin-command`` (bare + ``<plugin>:<n>``)
    * Built-in slash commands (``/clear``, ``/compact``, …) → ``builtin``

    Cost: zero (pure filesystem walk). Run whenever you install or
    upgrade a plugin; ``claude-sql analyze`` runs it automatically.

    Flags
    -----
    --dry-run   Count rows without writing the parquet. Useful for
                previewing how many skills will be catalogued.
    """
    _configure(common)
    settings = _resolve_settings(common)
    catalog_stats = _skills_catalog.sync(settings, dry_run=dry_run)
    # Verb tense mirrors whether the parquet was actually written.
    if dry_run:
        verb = "would write"
    else:
        verb = "wrote"
    logger.info(
        "skills sync: {} {} rows to {} ({} skills, {} commands, {} builtins)",
        verb,
        catalog_stats["rows"],
        settings.skills_catalog_parquet_path,
        catalog_stats["skills"],
        catalog_stats["commands"],
        catalog_stats["builtins"],
    )
989
+
990
+
991
@skills_app.command(name="ls")
def skills_ls(
    *,
    kind: str | None = None,
    plugin: str | None = None,
    common: Common | None = None,
) -> None:
    """List entries from the skills catalog parquet.

    Run ``claude-sql skills sync`` first. Emits the catalog in the
    shared ``--format`` shape (table on TTY, JSON on pipe).

    Flags
    -----
    --kind <value>     Filter by ``source_kind`` (``user-skill``,
                       ``plugin-skill``, ``plugin-command``, ``builtin``).
    --plugin <value>   Filter by plugin name (exact match).
    """
    _configure(common)
    settings = _resolve_settings(common)
    fmt = resolve_format(_fmt(common))
    path = settings.skills_catalog_parquet_path
    if not path.exists():
        logger.error(
            "skills catalog parquet missing at {}. Run `claude-sql skills sync` first.",
            path,
        )
        sys.exit(EXIT_CODES["no_embeddings"])
    df = pl.read_parquet(path)
    # Collect the requested exact-match predicates, then apply them.
    predicates = []
    if kind is not None:
        predicates.append(pl.col("source_kind") == kind)
    if plugin is not None:
        predicates.append(pl.col("plugin") == plugin)
    for predicate in predicates:
        df = df.filter(predicate)
    df = df.sort(["source_kind", "plugin", "name"], nulls_last=True)
    # TABLE and CSV share the dataframe emitter; NDJSON streams one JSON
    # object per row; everything else goes through emit_json.
    if fmt is OutputFormat.TABLE or fmt is OutputFormat.CSV:
        emit_dataframe(df, fmt)
        return
    if fmt is OutputFormat.NDJSON:
        for record in df.iter_rows(named=True):
            sys.stdout.write(json.dumps(record, default=str))
            sys.stdout.write("\n")
        return
    emit_json(df.to_dicts(), fmt)
1037
+
1038
+
1039
@app.command
def embed(
    *,
    since_days: int | None = None,
    limit: int | None = None,
    dry_run: bool = False,
    common: Common | None = None,
) -> None:
    """Embed new messages with Cohere Embed v4 and append to the embeddings parquet.

    Cost
    ----
    Calls Bedrock (``global.cohere.embed-v4:0``) on every unembedded
    message. ``--dry-run`` is OFF by default here (unlike LLM workers);
    pass it if you only want to see the plan.

    Flags
    -----
    --since-days N   Only consider messages newer than N days.
    --limit N        Cap the number of messages embedded this run.
    --dry-run        Preview only; emit plan JSON, no Bedrock calls.
    --glob PATTERN   Narrow the universe (see top-level --help).

    Dry-run output (stdout JSON)
    ----------------------------
    ::

        {
          "pipeline": "embed",
          "candidates": N,
          "batches": B,
          "batch_size": 96,
          "concurrency": 2,
          "model": "...",
          "since_days": null,
          "limit": null,
          "dry_run": true,
        }

    Real-run output
    ---------------
    ``{"pipeline": "embed", "rows_processed": N, "dry_run": false}``

    Exit codes: 0 success, 70 runtime (Bedrock / DuckDB failure).
    """
    # Lazy import keeps asyncio off the fast path for read-only commands
    # (see module docstring).
    import asyncio

    _configure(common)
    settings = _resolve_settings(common)
    # NOTE(review): this opens a bare in-memory DuckDB connection and wires
    # the raw/derived views itself, rather than going through
    # ``_open_connection`` like the other workers — presumably because the
    # embed path does not need the VSS/HNSW setup. Confirm before unifying.
    con = duckdb.connect(":memory:")
    try:
        register_raw(
            con,
            glob=settings.default_glob,
            subagent_glob=settings.subagent_glob,
            subagent_meta_glob=settings.subagent_meta_glob,
        )
        register_views(con)
        result = asyncio.run(
            run_backfill(
                con=con,
                settings=settings,
                since_days=since_days,
                limit=limit,
                dry_run=dry_run,
            )
        )
        logger.info("Embedded {} messages (dry_run={})", result, dry_run)
        _emit_worker_result(result, common, pipeline="embed")
    finally:
        # Always release the connection, even if the backfill raises.
        con.close()
1110
+
1111
+
1112
@app.command
def search(
    query_text: str,
    /,
    *,
    k: int = 10,
    common: Common | None = None,
) -> None:
    """Semantic top-k nearest-neighbor search over message embeddings via HNSW.

    Pipeline
    --------
    1. Embed ``query_text`` with Cohere Embed v4 ``search_query`` mode.
    2. DuckDB VSS HNSW cosine lookup against the existing embeddings parquet.
    3. Join back to ``messages_text`` for a 200-char snippet.

    Prereq
    ------
    The embeddings parquet must exist. If it's empty or missing, the
    command exits with code 2 and a hint. Run
    ``claude-sql embed --since-days 7 --no-dry-run`` to populate.

    Positional args
    ---------------
    QUERY_TEXT   A single natural-language query string.

    Flags
    -----
    --k N            Top-k (default 10).
    --glob PATTERN   Narrow the messages_text view before the HNSW join.
    --format ...     See top-level --help.

    Output columns
    --------------
    uuid, session_id, role, sim (cosine similarity ∈ [-1, 1]), snippet.
    Sorted by cosine distance ascending -- highest sim first.

    When to prefer ``query`` instead
    --------------------------------
    Semantic search is good at recall but bad at tie-breaking when the
    topic is over-represented in the corpus. If you are pinpointing a
    single known session (not a theme) and the subject is frequent --
    "the claude-sql session where I ran over 30 days", "the session
    where the test suite failed" -- a literal ILIKE on a distinctive
    token finds it in one hop:

        claude-sql query "SELECT DISTINCT session_id FROM messages_text
        WHERE text_content ILIKE '%--since-days 30%'"

    Good distinctive tokens: exact CLI flags, dollar amounts from a
    dry-run cost table, precise error strings, the exact command the
    user ran. If the first search returns >3 plausible sessions at
    similar ``sim``, stop rephrasing and switch modality.

    Exit codes: 0 success, 2 no_embeddings, 70 runtime.
    """
    _configure(common)
    settings = _resolve_settings(common)
    fmt = _fmt(common)
    con = _open_connection(settings)
    try:
        # Guard: an empty embeddings table means the HNSW lookup would
        # return nothing; fail fast with an actionable hint instead.
        row = con.execute("SELECT count(*) FROM message_embeddings").fetchone()
        count = int(row[0]) if row else 0
        if count == 0:
            logger.error("No embeddings yet. Run: claude-sql embed --since-days 7")
            sys.exit(EXIT_CODES["no_embeddings"])

        qv = embed_query(query_text, settings=settings)
        dim = int(settings.output_dimension)
        # Rank by cosine similarity descending. The HNSW index was built with
        # metric='cosine', so ORDER BY array_cosine_distance (== 1 - sim) ASC
        # is what triggers the index lookup. Using array_distance here (L2)
        # would silently bypass the index AND give wrong ranks because the
        # raw int8-cast-to-float document vectors have magnitudes in the
        # thousands while the query vector is unit-normalized.
        df = run_or_die(
            lambda: con.execute(
                f"""
                WITH qv AS (SELECT CAST(? AS FLOAT[{dim}]) AS v)
                SELECT CAST(mt.uuid AS VARCHAR) AS uuid,
                       CAST(mt.session_id AS VARCHAR) AS session_id,
                       mt.role,
                       array_cosine_similarity(me.embedding, (SELECT v FROM qv)) AS sim,
                       substr(mt.text_content, 1, 200) AS snippet
                FROM message_embeddings me
                JOIN messages_text mt ON CAST(mt.uuid AS VARCHAR) = me.uuid
                ORDER BY array_cosine_distance(me.embedding, (SELECT v FROM qv)) ASC
                LIMIT ?
                """,
                [qv, k],
            ).pl(),
            fmt=fmt,
        )
        emit_dataframe(df, fmt, table_rows=k, table_str_len=200)
    finally:
        con.close()
1208
+
1209
+
1210
@app.command
def classify(
    *,
    since_days: int | None = None,
    limit: int | None = None,
    dry_run: bool = True,
    no_thinking: bool = False,
    common: Common | None = None,
) -> None:
    """Classify sessions with Sonnet 4.6: autonomy tier, work category, success, goal.

    Output columns (``session_classifications`` view)
    -------------------------------------------------
    session_id, autonomy_tier ∈ {autonomous,assisted,manual},
    work_category (sde/admin/strategy_business/thought_leadership/other),
    success ∈ {success,partial,failure,unknown}, goal (string),
    confidence ∈ [0,1], classified_at.
    Alias columns added by the view layer: ``autonomy``,
    ``success_outcome``, ``category`` (same values as above).

    Cost (defaults to --dry-run)
    ----------------------------
    Back-of-envelope ~8K input + ~300 output tokens per session. With
    Sonnet 4.6 pricing, 1,000 sessions ≈ $25-30. Always start with
    ``--dry-run`` (default) to see the plan JSON, then confirm with
    ``--no-dry-run``.

    Flags
    -----
    --since-days N   Only classify sessions newer than N days.
    --limit N        Cap at N sessions this run.
    --dry-run        (DEFAULT) emit plan JSON, no Bedrock calls.
    --no-dry-run     Spend real money.
    --no-thinking    Disable Sonnet adaptive thinking (cheaper, less precise).
    --glob PATTERN   Narrow the corpus (recommended for first runs).

    Dry-run stdout JSON
    -------------------
    ``{"pipeline":"classify","candidates":N,"llm_calls":N,
    "avg_input_tokens":8000,"avg_output_tokens":300,
    "estimated_cost_usd":X,"model":"...","thinking":"adaptive",
    "since_days":null,"limit":null,"dry_run":true}``

    Checkpointing
    -------------
    Session-level checkpoint in ``~/.claude/claude_sql.duckdb`` means
    reruns on unchanged sessions are free -- only sessions whose JSONL
    mtime changed are re-processed.
    """
    _configure(common)
    settings = _resolve_settings(common)
    con = _open_connection(settings)
    # Delegate to the worker; the connection is closed no matter how the
    # classification run ends.
    try:
        processed = classify_sessions(
            con,
            settings,
            since_days=since_days,
            limit=limit,
            dry_run=dry_run,
            no_thinking=no_thinking,
        )
        logger.info("classify: {} sessions processed (dry_run={})", processed, dry_run)
        _emit_worker_result(processed, common, pipeline="classify")
    finally:
        con.close()
1275
+
1276
+
1277
@app.command
def trajectory(
    *,
    since_days: int | None = None,
    limit: int | None = None,
    dry_run: bool = True,
    no_thinking: bool = False,
    common: Common | None = None,
) -> None:
    """Per-message sentiment + topic-transition classification (regex prefilter → Sonnet 4.6).

    Output columns (``message_trajectory`` view)
    --------------------------------------------
    uuid, sentiment_delta ∈ {positive,neutral,negative},
    is_transition (boolean -- does this message mark a topic shift?),
    confidence ∈ [0,1], classified_at.
    Alias columns: ``sentiment`` (same as sentiment_delta),
    ``transition`` (same as is_transition).

    Pipeline
    --------
    1. Regex prefilter catches ~50% of obvious transitions for free.
    2. Sonnet 4.6 classifies the remainder with structured output.

    Cost: defaults to ``--dry-run``. ~500 input / 50 output tokens per LLM
    call.

    Flags / exit codes identical to ``classify``. See its help for the
    dry-run JSON schema.
    """
    _configure(common)
    settings = _resolve_settings(common)
    con = _open_connection(settings)
    # Worker call bracketed by try/finally so the connection never leaks.
    try:
        processed = trajectory_messages(
            con,
            settings,
            since_days=since_days,
            limit=limit,
            dry_run=dry_run,
            no_thinking=no_thinking,
        )
        logger.info("trajectory: {} messages processed (dry_run={})", processed, dry_run)
        _emit_worker_result(processed, common, pipeline="trajectory")
    finally:
        con.close()
1323
+
1324
+
1325
@app.command
def conflicts(
    *,
    since_days: int | None = None,
    limit: int | None = None,
    dry_run: bool = True,
    no_thinking: bool = False,
    common: Common | None = None,
) -> None:
    """Per-session stance-conflict detection via Sonnet 4.6.

    What it finds
    -------------
    Places where the user and the agent disagreed on approach or scope,
    or where the agent contradicted itself. Each conflict gets two stance
    snippets (``stance_a`` / ``stance_b``), a resolution label
    ∈ {resolved, unresolved, abandoned, null}, and a detected_at timestamp.

    Output columns (``session_conflicts`` view)
    -------------------------------------------
    session_id, conflict_idx, stance_a, stance_b, resolution,
    detected_at, empty. Alias: ``conflict_resolution`` = resolution.

    Cost: defaults to ``--dry-run``. ~6K input / 400 output tokens / session.
    Flags / exit codes identical to ``classify``.
    """
    _configure(common)
    settings = _resolve_settings(common)
    con = _open_connection(settings)
    # Same lifecycle shape as the other LLM workers: open, run, always close.
    try:
        processed = detect_conflicts(
            con,
            settings,
            since_days=since_days,
            limit=limit,
            dry_run=dry_run,
            no_thinking=no_thinking,
        )
        logger.info("conflicts: {} sessions processed (dry_run={})", processed, dry_run)
        _emit_worker_result(processed, common, pipeline="conflicts")
    finally:
        con.close()
1367
+
1368
+
1369
@app.command
def friction(
    *,
    since_days: int | None = None,
    limit: int | None = None,
    dry_run: bool = True,
    no_thinking: bool = False,
    common: Common | None = None,
) -> None:
    """Classify short user messages (≤300 chars) for friction signals.

    Labels
    ------
    status_ping / unmet_expectation / confusion / interruption /
    correction / frustration / none.

    Pipeline
    --------
    1. Pull user-role messages ≤ ``CLAUDE_SQL_FRICTION_MAX_CHARS`` (300).
    2. Regex fast-path catches ``status_ping`` / ``interruption`` /
       ``correction`` at 0.9 confidence.
    3. Everything else → Sonnet 4.6 with the USER_FRICTION_SCHEMA.

    Output columns (``user_friction`` view)
    ---------------------------------------
    uuid, session_id, ts, label, source ∈ {regex, llm, refused},
    confidence, rationale, text (the original user message).

    Cost: defaults to ``--dry-run``. Short prompts (~200 in / 60 out),
    so even 10K candidates cost ≈ $3-4.
    Flags / exit codes identical to ``classify``.
    """
    _configure(common)
    settings = _resolve_settings(common)
    con = _open_connection(settings)
    # Worker call bracketed by try/finally so the connection never leaks.
    try:
        written = detect_user_friction(
            con,
            settings,
            since_days=since_days,
            limit=limit,
            dry_run=dry_run,
            no_thinking=no_thinking,
        )
        logger.info("friction: {} rows written (dry_run={})", written, dry_run)
        _emit_worker_result(written, common, pipeline="friction")
    finally:
        con.close()
1417
+
1418
+
1419
@app.command
def cluster(*, force: bool = False, common: Common | None = None) -> None:
    """Cluster message embeddings with UMAP (8D) + HDBSCAN. Writes clusters.parquet.

    Prereq
    ------
    The embeddings parquet must exist. Run ``embed --no-dry-run`` first.

    Output columns (``message_clusters`` view)
    ------------------------------------------
    uuid, cluster_id (int; -1 = noise), probability (HDBSCAN soft label).

    Cost: zero (CPU-only, no Bedrock). Seeded by ``CLAUDE_SQL_SEED=42`` so
    cluster IDs are stable across reruns unless the embedding set changes.

    Flags
    -----
    --force   Re-cluster even if clusters.parquet already exists.
    """
    _configure(common)
    settings = _resolve_settings(common)
    cstats = run_clustering(settings, force=force)
    total = cstats["total"]
    # Guard the noise ratio against an empty embedding set.
    noise_share = cstats["noise"] / total if total else 0
    logger.info(
        "cluster: {} messages, {} clusters, {} noise ({:.1%})",
        total,
        cstats["clusters"],
        cstats["noise"],
        noise_share,
    )
1448
+
1449
+
1450
@app.command
def terms(*, force: bool = False, common: Common | None = None) -> None:
    """Compute c-TF-IDF per-cluster term labels; writes cluster_terms.parquet.

    Prereq: ``cluster`` (i.e., clusters.parquet must exist).

    Output columns (``cluster_terms`` view)
    ---------------------------------------
    cluster_id (int), term (unigram or bigram), weight (float),
    rank (int, 1 = strongest term in that cluster).

    Math: per-class TF → IDF → L1 normalize, ngram (1,2), min_df=2.
    Cost: zero (sklearn CountVectorizer). See CLAUDE.md for design rationale.

    Flags
    -----
    --force   Recompute even if cluster_terms.parquet already exists.
    """
    _configure(common)
    settings = _resolve_settings(common)
    con = _open_connection(settings)
    # Connection is released regardless of how run_terms exits.
    try:
        term_stats = run_terms(con, settings, force=force)
        logger.info(
            "terms: {} clusters, {} term-rows",
            term_stats["clusters"],
            term_stats["terms"],
        )
    finally:
        con.close()
1480
+
1481
+
1482
@app.command
def community(*, force: bool = False, common: Common | None = None) -> None:
    """Session-level Louvain community detection over a cosine-similarity graph.

    Prereq: ``embed`` (needs the embeddings parquet).

    Output columns (``session_communities`` view)
    ---------------------------------------------
    session_id, community_id (int; -1 = isolated).

    Method: build a session-centroid-cosine KNN graph, then run
    ``networkx.community.louvain_communities`` (networkx ≥3.4).
    Cost: zero. Seeded by ``CLAUDE_SQL_SEED=42``.

    Flags
    -----
    --force   Re-detect even if session_communities.parquet exists.
    """
    _configure(common)
    settings = _resolve_settings(common)
    con = _open_connection(settings)
    # Connection is released regardless of how run_communities exits.
    try:
        comm_stats = run_communities(con, settings, force=force)
        logger.info(
            "community: {} sessions grouped into {} communities",
            comm_stats["sessions"],
            comm_stats["communities"],
        )
    finally:
        con.close()
1512
+
1513
+
1514
@app.command
def analyze(
    *,
    since_days: int | None = 30,
    limit: int | None = None,
    dry_run: bool = True,
    no_thinking: bool = False,
    skip_embed: bool = False,
    skip_classify: bool = False,
    skip_trajectory: bool = False,
    skip_conflicts: bool = False,
    skip_friction: bool = False,
    skip_cluster: bool = False,
    skip_community: bool = False,
    skip_skills_sync: bool = False,
    force_cluster: bool = False,
    force_community: bool = False,
    common: Common | None = None,
) -> None:
    """Run the full analytics pipeline end-to-end: embed → structure → LLM analytics.

    Stages (in order)
    -----------------
    0. skills sync (filesystem walk; zero-cost; produces skills_catalog.parquet)
    1. embed      (Bedrock Cohere Embed v4; honors --dry-run)
    2. cluster    (UMAP+HDBSCAN; zero-cost; --force_cluster to rebuild)
    3. terms      (c-TF-IDF labels for clusters; zero-cost)
    4. community  (Louvain; zero-cost; --force_community to rebuild)
    5. classify   (Sonnet 4.6; honors --dry-run)
    6. trajectory (Sonnet 4.6; honors --dry-run)
    7. conflicts  (Sonnet 4.6; honors --dry-run)
    8. friction   (Sonnet 4.6; honors --dry-run)

    Cost
    ----
    Every LLM-touching stage defaults to ``--dry-run`` -- stdout logs the
    plan per stage. Pass ``--no-dry-run`` to execute for real.

    Flags
    -----
    --since-days N   Scope all stages to the last N days (default 30).
    --limit N        Cap each LLM stage at N items.
    --dry-run / --no-dry-run  (default --dry-run)
    --no-thinking    Disable Sonnet adaptive thinking across all stages.
    --skip-<stage>   Drop a stage:
                     embed, cluster, community, classify, trajectory,
                     conflicts, friction. Terms is bound to cluster.
    --force-cluster  Rebuild clusters.parquet (+ terms) even if present.
    --force-community Rebuild session_communities.parquet even if present.
    --glob / --subagent-glob  Narrow the corpus (applies to every stage).

    Typical recipes
    ---------------
    Preview spend over the last week::

        claude-sql analyze --since-days 7

    Run the non-LLM stages only (cluster + terms + community)::

        claude-sql analyze --skip-embed --skip-classify \
            --skip-trajectory --skip-conflicts --skip-friction \
            --force-cluster --force-community
    """
    # Lazy import: only the embed stage needs asyncio.
    import asyncio

    _configure(common)
    settings = _resolve_settings(common)

    # 0. Skills catalog sync (filesystem walk, zero cost). Runs even in
    #    --dry-run because it does not hit Bedrock; opt out via
    #    --skip-skills-sync if you want to keep the parquet frozen.
    if not skip_skills_sync:
        stats = _skills_catalog.sync(settings)
        logger.info(
            "analyze/skills: wrote {} rows to {} ({} skills, {} commands, {} builtins)",
            stats["rows"],
            settings.skills_catalog_parquet_path,
            stats["skills"],
            stats["commands"],
            stats["builtins"],
        )

    # 1. Embed (reuses embed_worker). Silently skipped if the parquet is up to date.
    #    Each stage opens and closes its own connection so a failure in one
    #    stage cannot leak a handle into the next.
    if not skip_embed:
        con = _open_connection(settings)
        try:
            n = asyncio.run(
                run_backfill(
                    con=con,
                    settings=settings,
                    since_days=since_days,
                    limit=limit,
                    dry_run=dry_run,
                )
            )
            logger.info("analyze/embed: {} new embeddings (dry_run={})", n, dry_run)
        finally:
            con.close()

    # 2. Cluster (reads embeddings parquet, writes clusters.parquet). Non-LLM.
    #    Terms runs right after with the same --force flag: its labels are
    #    only meaningful for the cluster assignment just produced.
    if not skip_cluster:
        stats = run_clustering(settings, force=force_cluster)
        logger.info(
            "analyze/cluster: {} messages, {} clusters, {} noise",
            stats["total"],
            stats["clusters"],
            stats["noise"],
        )
        con = _open_connection(settings)
        try:
            tstats = run_terms(con, settings, force=force_cluster)
            logger.info(
                "analyze/terms: {} clusters, {} term-rows",
                tstats["clusters"],
                tstats["terms"],
            )
        finally:
            con.close()

    # 3. Community detection (non-LLM, runs in parallel conceptually with cluster).
    if not skip_community:
        con = _open_connection(settings)
        try:
            cstats = run_communities(con, settings, force=force_community)
            logger.info(
                "analyze/community: {} sessions, {} communities",
                cstats["sessions"],
                cstats["communities"],
            )
        finally:
            con.close()

    # 4. Session classification (LLM).
    if not skip_classify:
        con = _open_connection(settings)
        try:
            n = classify_sessions(
                con,
                settings,
                since_days=since_days,
                limit=limit,
                dry_run=dry_run,
                no_thinking=no_thinking,
            )
            logger.info("analyze/classify: {} sessions (dry_run={})", n, dry_run)
        finally:
            con.close()

    # 5. Trajectory (LLM).
    if not skip_trajectory:
        con = _open_connection(settings)
        try:
            n = trajectory_messages(
                con,
                settings,
                since_days=since_days,
                limit=limit,
                dry_run=dry_run,
                no_thinking=no_thinking,
            )
            logger.info("analyze/trajectory: {} messages (dry_run={})", n, dry_run)
        finally:
            con.close()

    # 6. Conflicts (LLM, requires full session context).
    if not skip_conflicts:
        con = _open_connection(settings)
        try:
            n = detect_conflicts(
                con,
                settings,
                since_days=since_days,
                limit=limit,
                dry_run=dry_run,
                no_thinking=no_thinking,
            )
            logger.info("analyze/conflicts: {} sessions (dry_run={})", n, dry_run)
        finally:
            con.close()

    # 7. Friction (LLM, short-message scope).
    if not skip_friction:
        con = _open_connection(settings)
        try:
            n = detect_user_friction(
                con,
                settings,
                since_days=since_days,
                limit=limit,
                dry_run=dry_run,
                no_thinking=no_thinking,
            )
            logger.info("analyze/friction: {} rows (dry_run={})", n, dry_run)
        finally:
            con.close()

    logger.info("analyze: done")
1711
+
1712
+
1713
@app.command(name="judges")
def judges_cmd(*, common: Common | None = None) -> None:
    """List the cross-provider Bedrock judge catalog (shortname, model ID, family, notes)."""
    _configure(common)
    fmt = _fmt(common)
    # Flatten each catalog entry into a plain dict row for the dataframe.
    rows = []
    for judge in _judge_catalog.catalog():
        rows.append(
            {
                "shortname": judge.shortname,
                "model_id": judge.model_id,
                "provider": judge.provider,
                "family": judge.family,
                "role": judge.role,
                "notes": judge.notes,
            }
        )
    emit_dataframe(pl.DataFrame(rows), fmt=fmt)
1731
+
1732
+
1733
@app.command(name="freeze")
def freeze_cmd(
    rubric: Path,
    /,
    *,
    panel: str,
    embed_model: str = "global.cohere.embed-v4:0",
    seed: int = 42,
    min_turns: int = 10,
    max_turns: int = 40,
    common: Common | None = None,
) -> None:
    """Pre-register a study: write an immutable manifest under ~/.claude/studies/<sha>/.

    ``panel`` is a comma-separated list of judge shortnames (see ``claude-sql
    judges``). The returned SHA is what every downstream worker consumes.
    """
    _configure(common)
    fmt = _fmt(common)
    # Split the comma-separated shortnames, dropping empty/whitespace tokens.
    shortnames = [token for token in (part.strip() for part in panel.split(",")) if token]
    if not shortnames:
        raise InputValidationError("--panel must have at least one shortname")
    scope = _freeze.SessionScope(min_turns=min_turns, max_turns=max_turns)
    study = _freeze.freeze(
        rubric_path=rubric,
        panel_shortnames=tuple(shortnames),
        embed_model_id=embed_model,
        session_scope=scope,
        seed=seed,
    )
    # Echo the identifying fields so callers can capture the manifest SHA.
    payload = {
        "manifest_sha": study.manifest_sha,
        "rubric_path": study.rubric_path,
        "panel_shortnames": list(study.panel_shortnames),
        "commit_sha": study.commit_sha,
        "created_at_utc": study.created_at_utc,
    }
    emit_json(payload, fmt=fmt)
1773
+
1774
+
1775
@app.command(name="replay")
def replay_cmd(manifest_sha: str, /, *, common: Common | None = None) -> None:
    """Load and echo a frozen study manifest by SHA."""
    _configure(common)
    # Load the frozen manifest and emit it verbatim as a dict.
    emit_json(_freeze.replay(manifest_sha).to_dict(), fmt=_fmt(common))
1782
+
1783
+
1784
@app.command(name="blind-handover")
def blind_handover_cmd(
    input_path: Path,
    /,
    output_path: Path,
    *,
    common: Common | None = None,
) -> None:
    """Strip identity markers from a parquet of sessions for grader-safe handover.

    Input parquet must have (session_id, text) columns. Writes the same
    parquet with text stripped and an ``original_hash`` column added.
    """
    _configure(common)
    frame = pl.read_parquet(input_path)
    missing = {"session_id", "text"} - set(frame.columns)
    if missing:
        raise InputValidationError(f"input parquet missing columns: {sorted(missing)}")
    # Strip each text cell, then derive the per-session provenance hash so a
    # grader can be audited against the original without seeing identities.
    results = [_blind_handover.strip_text(t) for t in frame["text"].to_list()]
    hashes = [_blind_handover.original_hash(s) for s in frame["session_id"].to_list()]
    redacted = frame.with_columns(
        pl.Series("text", [r.text for r in results]),
        pl.Series("original_hash", hashes),
    )
    redacted.write_parquet(output_path)
    logger.info("blind-handover: wrote {} stripped rows to {}", redacted.height, output_path)
1813
+
1814
+
1815
@app.command(name="judge")
def judge_cmd(
    manifest_sha: str,
    /,
    *,
    sessions_parquet: Path,
    output_parquet: Path,
    dry_run: bool = True,
    concurrency: int = 4,
    region: str = "us-east-1",
    common: Common | None = None,
) -> None:
    """Dispatch a frozen study's judge panel over a sessions parquet.

    ``sessions_parquet`` must have (session_id, text) columns. Defaults to
    ``--dry-run`` per the project cost-guard convention.
    """
    _configure(common)
    fmt = _fmt(common)
    study = _freeze.replay(manifest_sha)
    frame = pl.read_parquet(sessions_parquet)
    missing = {"session_id", "text"} - set(frame.columns)
    if missing:
        raise InputValidationError(f"sessions parquet missing columns: {sorted(missing)}")
    # Pair ids with texts; strict=True guards against ragged columns.
    pairs = list(zip(frame["session_id"].to_list(), frame["text"].to_list(), strict=True))
    outcome = _judge_worker.run(
        sessions=pairs,
        panel_shortnames=list(study.panel_shortnames),
        rubric_yaml_path=Path(study.rubric_path),
        freeze_sha=study.manifest_sha,
        out_parquet=output_parquet,
        dry_run=dry_run,
        concurrency=concurrency,
        region=region,
    )
    # A GradePlan signals the dry-run path: emit the cost estimate instead
    # of score counts.
    if isinstance(outcome, _judge_worker.GradePlan):
        payload = {
            "dry_run": True,
            "n_sessions": outcome.n_sessions,
            "n_judges": outcome.n_judges,
            "n_axes": outcome.n_axes,
            "n_calls": outcome.n_calls,
            "est_input_tokens": outcome.est_input_tokens,
            "est_output_tokens": outcome.est_output_tokens,
            "est_usd": outcome.est_usd,
        }
    else:
        payload = {"dry_run": False, "n_scores": len(outcome), "out": str(output_parquet)}
    emit_json(payload, fmt=fmt)
1867
+
1868
+
1869
@app.command(name="ungrounded-claim")
def ungrounded_cmd(
    manifest_sha: str,
    /,
    *,
    turns_parquet: Path,
    output_parquet: Path,
    common: Common | None = None,
) -> None:
    """Run the ungrounded-claim detector over a turns parquet.

    ``turns_parquet`` needs (session_id, turn_idx, assistant_text,
    tool_output_text) columns. Writes per-claim grounded flags.
    """
    _configure(common)
    fmt = _fmt(common)
    study = _freeze.replay(manifest_sha)
    frame = pl.read_parquet(turns_parquet)
    missing = {"session_id", "turn_idx", "assistant_text", "tool_output_text"} - set(frame.columns)
    if missing:
        raise InputValidationError(f"turns parquet missing columns: {sorted(missing)}")
    # Materialize each parquet row into the worker's Turn record.
    turns = []
    for record in frame.iter_rows(named=True):
        turns.append(
            _ungrounded_worker.Turn(
                session_id=record["session_id"],
                turn_idx=int(record["turn_idx"]),
                assistant_text=record["assistant_text"],
                tool_output_text=record["tool_output_text"],
            )
        )
    detections = _ungrounded_worker.detect(turns, freeze_sha=study.manifest_sha)
    _ungrounded_worker.to_parquet(detections, output_parquet)
    # Per-claim rows go to the parquet; stdout gets the summary table.
    emit_dataframe(_ungrounded_worker.summarize(detections), fmt=fmt)
1904
+
1905
+
1906
@app.command(name="kappa")
def kappa_cmd(
    scores_parquet: Path,
    /,
    *,
    bootstrap: int = 1000,
    floor: float = 0.6,
    delta_gate: Path | None = None,
    common: Common | None = None,
) -> None:
    """Compute Cohen's + Fleiss' kappa with bootstrapped 95% CI.

    Exits non-zero (66) if any axis has Fleiss kappa below ``--floor`` OR
    if ``--delta-gate <prior.parquet>`` is set and the delta-kappa CI
    excludes zero on any axis (pre-registered stopping rule).
    """
    _configure(common)
    fmt = _fmt(common)
    scores = _kappa_worker.load_scores(scores_parquet)
    pairwise = _kappa_worker.compute_pairwise(scores, n_bootstrap=bootstrap)
    fleiss = _kappa_worker.compute_fleiss(scores, n_bootstrap=bootstrap)
    pair_rows = [
        {
            "axis": p.axis,
            "judge_a": p.judge_a,
            "judge_b": p.judge_b,
            "n_items": p.n_items,
            "kappa": round(p.kappa, 4),
            "ci_low": round(p.ci_low, 4),
            "ci_high": round(p.ci_high, 4),
        }
        for p in pairwise
    ]
    fleiss_rows = [
        {
            "axis": f.axis,
            "n_judges": f.n_judges,
            "n_items": f.n_items,
            "kappa": round(f.kappa, 4),
            "ci_low": round(f.ci_low, 4),
            "ci_high": round(f.ci_high, 4),
            "below_floor": f.kappa < floor,
        }
        for f in fleiss
    ]
    report = {"pairs": pair_rows, "fleiss": fleiss_rows, "floor": floor}
    # Floor gate: any axis whose Fleiss kappa fell under --floor trips exit 66.
    gate_tripped = any(row["below_floor"] for row in fleiss_rows)
    if delta_gate is not None:
        # Delta gate: compare against a prior scores parquet axis-by-axis;
        # axes missing from the prior run are skipped.
        prior_by_axis = {
            f.axis: f
            for f in _kappa_worker.compute_fleiss(
                _kappa_worker.load_scores(delta_gate), n_bootstrap=bootstrap
            )
        }
        delta_rows = []
        for current in fleiss:
            prior = prior_by_axis.get(current.axis)
            if prior is None:
                continue
            excludes_zero = _kappa_worker.delta_gate_excludes_zero(
                current, prior, n_bootstrap=bootstrap
            )
            delta_rows.append(
                {
                    "axis": current.axis,
                    "delta_excludes_zero": excludes_zero,
                    "current_kappa": current.kappa,
                    "prior_kappa": prior.kappa,
                }
            )
            gate_tripped = gate_tripped or excludes_zero
        report["delta_gate"] = delta_rows
    # Emit the full report before exiting so callers see *why* the gate fired.
    emit_json(report, fmt=fmt)
    if gate_tripped:
        sys.exit(66)
1979
+
1980
+
1981
@app.command(name="bind")
def bind_cmd(
    *,
    repo: Path | None = None,
    commit_msg: Path | None = None,
    dry_run: bool = False,
    common: Common | None = None,
) -> None:
    """Attach the transcript-PR binding (trailers + git-notes JSON) to a commit.

    Pre-commit-hook entry point per RFC 0001 (see
    ``docs/rfc/0001-transcript-pr-binding.md``). Wires into a
    ``prepare-commit-msg`` lefthook job so the trailer lands in the
    user's editor before they confirm the message.

    Discovery order for the commit-message file:

    1. ``--commit-msg PATH`` flag if set.
    2. ``GIT_PARAMS`` / ``$1`` from the hook -- we re-read it from
       the ``CLAUDE_SQL_BIND_COMMIT_MSG`` env var, which is the
       lefthook-friendly way to pass the hook's ``{0}`` arg through.
    3. ``<repo>/.git/COMMIT_EDITMSG`` as a last-ditch fallback.

    Resolves the active transcript via
    :func:`claude_sql.binding.find_active_transcript` (latest mtime
    under ``~/.claude/projects/<projectified-cwd>/*.jsonl``); when no
    transcript is found the command exits 0 cleanly without touching
    the message — bind is best-effort by design.

    With ``--dry-run`` (default ``False``), prints the planned
    binding as JSON and writes nothing. Off ``--dry-run``, writes
    the three trailers in place and a JSON note under
    ``refs/notes/transcripts``.
    """
    _configure(common)
    fmt = _fmt(common)
    # Resolve the repo root eagerly: it is needed both for the
    # COMMIT_EDITMSG fallback and for the rev-parse/notes steps below.
    repo_path = repo.resolve() if repo is not None else _binding._resolve_repo(None)
    cwd = Path.cwd()
    transcript = _binding.find_active_transcript(cwd)
    if transcript is None:
        # Best-effort contract: no transcript is a clean no-op (exit 0),
        # but we still emit a structured explanation for the caller.
        emit_json(
            {
                "bound": False,
                "reason": "no-active-transcript",
                "cwd": str(cwd),
                "projects_dir": f"~/.claude/projects/{_binding.projectify(cwd)}",
            },
            fmt=fmt,
        )
        return
    binding = _binding.build_binding(transcript_path=transcript)

    # Commit-message file discovery, in the documented precedence order:
    # explicit flag > hook env var > .git/COMMIT_EDITMSG (if it exists).
    msg_path: Path | None = commit_msg
    if msg_path is None:
        env_path = os.environ.get("CLAUDE_SQL_BIND_COMMIT_MSG")
        if env_path:
            msg_path = Path(env_path)
    if msg_path is None:
        candidate = repo_path / ".git" / "COMMIT_EDITMSG"
        if candidate.exists():
            msg_path = candidate

    if dry_run:
        # Dry-run reports the full plan — including a possibly-None
        # msg_path — and performs no writes.
        emit_json(
            {
                "bound": False,
                "dry_run": True,
                "transcript_path": str(transcript),
                "binding": binding.to_dict(),
                "note_payload": binding.to_note_payload(),
                "commit_msg_path": str(msg_path) if msg_path else None,
                "repo": str(repo_path),
            },
            fmt=fmt,
        )
        return

    if msg_path is None:
        # Non-dry-run requires a real message file to amend; fail with the
        # canonical invalid-input exit code and an actionable hint.
        err = ClassifiedError(
            kind="invalid_input",
            exit_code=EXIT_CODES["invalid_input"],
            message="no commit-message file found; pass --commit-msg or run from a prepare-commit-msg hook",
            hint="set --commit-msg PATH or CLAUDE_SQL_BIND_COMMIT_MSG=$1 in your hook",
        )
        emit_error(err, fmt)
        sys.exit(err.exit_code)

    try:
        _binding.write_trailer(msg_path, binding)
    except _binding.GitInvocationError as exc:
        # Trailer write is mandatory: a git failure here aborts with the
        # runtime_error exit code (unlike the note write below).
        err = ClassifiedError(
            kind="runtime_error",
            exit_code=EXIT_CODES["runtime_error"],
            message=f"git interpret-trailers failed: {exc.stderr.strip()}",
            hint=None,
        )
        emit_error(err, fmt)
        sys.exit(err.exit_code)

    # Note write is best-effort: we have a HEAD commit only when bind
    # runs *after* the commit (e.g., post-commit hook). In a
    # prepare-commit-msg flow the commit doesn't exist yet, so we skip
    # the note here and the integration relies on a separate
    # post-commit step. When the caller is invoking us with --commit
    # already created (e.g., backfill), they pass --no-dry-run with a
    # repo containing HEAD.
    head_cp = _binding._run_git(
        ["git", "-C", str(repo_path), "rev-parse", "HEAD"],
    )
    if head_cp.returncode == 0:
        commit_sha = head_cp.stdout.strip()
        try:
            _binding.write_note(repo_path, commit_sha, binding)
        except _binding.GitInvocationError as exc:
            # Non-fatal by design: warn, and report an empty commit_sha so
            # the payload doesn't claim a note was attached to that commit.
            logger.warning("git notes write failed (non-fatal): {}", exc.stderr.strip())
            commit_sha = ""
    else:
        # No HEAD yet (fresh repo / prepare-commit-msg) — note skipped.
        commit_sha = ""

    emit_json(
        {
            "bound": True,
            "dry_run": False,
            "transcript_path": str(transcript),
            "binding": binding.to_dict(),
            "commit_msg_path": str(msg_path),
            "repo": str(repo_path),
            "commit_sha": commit_sha,
        },
        fmt=fmt,
    )
2112
+
2113
+
2114
@app.command(name="resolve")
def resolve_cmd(
    commit_sha: str,
    /,
    *,
    repo: Path | None = None,
    all_sources: bool = False,
    common: Common | None = None,
) -> None:
    """Resolve a commit's bound transcript per RFC 0001 §Resolution precedence.

    Reads the ``Claude-Transcript-*`` trailers first; falls back to
    the JSON note under ``refs/notes/transcripts``; raises a loud
    error (exit 70) when both surfaces disagree on the digest.
    Returns the parsed binding as JSON.

    Flags
    -----
    --repo PATH
        Repository root. Defaults to ``git rev-parse --show-toplevel``
        from the current cwd.
    --all-sources
        Return ``{"trailer": ..., "note": ...}`` instead of merging.
        Diagnostic flow for investigating mismatches; never raises on
        disagreement.

    Exit codes
    ----------
    * 0 binding resolved cleanly (or ``--all-sources`` returned both)
    * 2 commit has no binding (no trailer, no note)
    * 65 commit not found / git invocation failed
    * 70 trailer and note disagree on digest
    """
    _configure(common)
    fmt = _fmt(common)
    # repo=None lets the binding layer discover the repo root itself.
    repo_path = repo.resolve() if repo is not None else None
    try:
        if all_sources:
            # Diagnostic branch: show both surfaces raw; either side may be
            # None and no mismatch check is performed.
            sources = _binding.resolve_all_sources(commit_sha, repo=repo_path)
            payload: dict[str, dict[str, str] | None] = {
                "trailer": sources["trailer"].to_dict() if sources["trailer"] is not None else None,
                "note": sources["note"].to_dict() if sources["note"] is not None else None,
            }
            emit_json(payload, fmt=fmt)
            return
        binding = _binding.resolve_commit_to_transcript(commit_sha, repo=repo_path)
    except _binding.BindingMismatchError as exc:
        # Trailer and note disagree on the digest → exit 70 (runtime_error).
        err = ClassifiedError(
            kind="runtime_error",
            exit_code=EXIT_CODES["runtime_error"],
            message=str(exc),
            hint="run `claude-sql resolve <sha> --all-sources` to see both surfaces",
        )
        emit_error(err, fmt)
        sys.exit(err.exit_code)
    except LookupError as exc:
        # Commit exists but carries no binding at all → exit 2.
        err = ClassifiedError(
            kind="no_embeddings",  # re-uses the "absent-but-not-broken" kind
            exit_code=EXIT_CODES["no_embeddings"],
            message=str(exc),
            hint="commit has no Claude-Transcript-* trailer and no refs/notes/transcripts entry",
        )
        emit_error(err, fmt)
        sys.exit(err.exit_code)
    except _binding.GitInvocationError as exc:
        # git itself failed (bad SHA, missing repo) → exit 65 (catalog_error).
        err = ClassifiedError(
            kind="catalog_error",
            exit_code=EXIT_CODES["catalog_error"],
            message=f"git invocation failed: {exc.stderr.strip()}",
            hint="check that the commit SHA exists in --repo",
        )
        emit_error(err, fmt)
        sys.exit(err.exit_code)

    emit_json(binding.to_dict(), fmt=fmt)
2189
+
2190
+
2191
def _review_sheet_format(common: Common | None) -> OutputFormat:
    """Pick the review-sheet effective format.

    Default policy diverges from every other subcommand: review-sheet
    output is human-first prose, so ``AUTO`` resolves to ``MARKDOWN`` on
    a TTY (override of the global ``TABLE`` default) and ``JSON``
    off-TTY. Explicit ``--format`` flags pass through unchanged so
    agents can still pin ``--format json`` regardless of TTY state.
    """
    chosen = _fmt(common)
    if chosen is OutputFormat.AUTO:
        # AUTO resolves by TTY state; any explicit choice passes through.
        chosen = OutputFormat.MARKDOWN if sys.stdout.isatty() else OutputFormat.JSON
    return chosen
2204
+
2205
+
2206
@app.command(name="review-sheet")
def review_sheet_cmd(
    commit_sha: str,
    /,
    *,
    repo: Path | None = None,
    no_thinking: bool = False,
    dry_run: bool = True,
    common: Common | None = None,
) -> None:
    """Render a compressed PR review sheet for a merged commit.

    Resolves the commit's bound transcript via
    :func:`claude_sql.binding.resolve_commit_to_transcript` (RFC 0001
    precedence: trailer first, note fallback, loud failure on
    disagreement), flattens the JSONL into a single review text, and
    asks Sonnet 4.6 — via ``output_config.format`` structured output —
    to populate the :class:`PRReviewSheet` schema.

    Defaults to ``--dry-run`` per the project cost-guard convention.
    Dry-run prints a plan dict (commit_sha, transcript_uri,
    transcript_digest, model_id, prompt_chars_estimate) and skips the
    Bedrock call.

    Output format
    -------------
    On a TTY ``--format auto`` resolves to ``markdown`` (the
    human-readable review-sheet shape). Off-TTY it resolves to
    ``json`` so agents get machine-readable output without a flag. Pass
    ``--format json`` / ``--format markdown`` explicitly to override.
    Dry-run always emits JSON regardless of the selected format —
    plan output is structured by design.

    Exit codes
    ----------
    * 0 review sheet rendered (or refused; refusal still exits 0 with
      ``{"refused": true}`` in the payload).
    * 2 commit has no binding (no trailer, no note).
    * 65 commit not found / git invocation failed.
    * 70 trailer and note disagree on digest.
    """
    _configure(common)
    # Review-sheet-specific AUTO policy (markdown on TTY, json off-TTY).
    fmt = _review_sheet_format(common)
    settings = _resolve_settings(common)
    repo_path = repo.resolve() if repo is not None else None

    try:
        # Resolve up-front so the worker's binding lookup uses the same repo
        # (the worker re-runs resolve internally when ``transcript_uri_override``
        # is unset; we pre-resolve so we can map LookupError / mismatch errors
        # to the canonical CLI exit codes before opening a DuckDB connection).
        binding = _binding.resolve_commit_to_transcript(commit_sha, repo=repo_path)
    except _binding.BindingMismatchError as exc:
        # Trailer/note digest disagreement → exit 70 (runtime_error).
        err = ClassifiedError(
            kind="runtime_error",
            exit_code=EXIT_CODES["runtime_error"],
            message=str(exc),
            hint="run `claude-sql resolve <sha> --all-sources` to see both surfaces",
        )
        emit_error(err, fmt)
        sys.exit(err.exit_code)
    except LookupError as exc:
        # Commit has no binding at all → exit 2 (absent-but-not-broken).
        err = ClassifiedError(
            kind="no_embeddings",
            exit_code=EXIT_CODES["no_embeddings"],
            message=str(exc),
            hint="commit has no Claude-Transcript-* trailer and no refs/notes/transcripts entry",
        )
        emit_error(err, fmt)
        sys.exit(err.exit_code)
    except _binding.GitInvocationError as exc:
        # git invocation failure (bad SHA / repo) → exit 65 (catalog_error).
        err = ClassifiedError(
            kind="catalog_error",
            exit_code=EXIT_CODES["catalog_error"],
            message=f"git invocation failed: {exc.stderr.strip()}",
            hint="check that the commit SHA exists in --repo",
        )
        emit_error(err, fmt)
        sys.exit(err.exit_code)

    # Hand the resolved URI through the override so the worker doesn't
    # round-trip to git twice (and so it stays testable without a repo).
    result = generate_review_sheet(
        None,
        settings,
        commit_sha=commit_sha,
        transcript_uri_override=binding.uri,
        dry_run=dry_run,
        no_thinking=no_thinking,
    )

    if dry_run:
        # Plan output is structured regardless of --format choice; users
        # asking for markdown still get JSON for the plan because there's
        # no narrative to render yet.
        plan = result.get("plan", result)
        emit_json(plan, fmt=OutputFormat.JSON)
        return

    if result.get("refused"):
        # Refusal is still exit 0: render it in the selected format.
        if fmt is OutputFormat.MARKDOWN:
            metadata = result.get("metadata") or {"commit_sha": commit_sha}
            print(render_refusal_markdown(str(result.get("reason", "")), metadata))
            return
        emit_json(result, fmt=fmt)
        return

    sheet = result.get("sheet") or {}
    metadata = result.get("metadata") or {}
    if fmt is OutputFormat.MARKDOWN:
        print(render_markdown(sheet, metadata))
        return
    emit_json({"sheet": sheet, "metadata": metadata}, fmt=fmt)
2319
+
2320
+
2321
@app.default
def _default(*, common: Common | None = None) -> None:
    """Print a hint when ``claude-sql`` is invoked without a subcommand."""
    del common
    # Keep the hint lines in one place so the listing reads as a unit.
    hint_lines = (
        "claude-sql - pass a subcommand or --help",
        " schema | query | explain | shell | list-cache",
        " embed | search",
        " classify | trajectory | conflicts | friction | cluster | terms | community | analyze",
        " judges | freeze | replay | judge | ungrounded-claim | kappa | blind-handover",
        " bind | resolve | review-sheet",
    )
    for line in hint_lines:
        print(line)
2331
+
2332
+
2333
+ # ---------------------------------------------------------------------------
2334
+ # Entry point
2335
+ # ---------------------------------------------------------------------------
2336
+
2337
+
2338
def main() -> None:
    """Entry point wired into ``[project.scripts]`` in ``pyproject.toml``."""
    # Delegate straight to the cyclopts App: it parses sys.argv and
    # dispatches to the registered subcommands.
    app()
2341
+
2342
+
2343
if __name__ == "__main__":
    # Support direct execution (`python -m` / `python cli.py`) in addition
    # to the installed console script.
    main()