npm - ccqa - Versions diffs - 0.6.0 → 0.7.0 - Mend

ccqa 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/bin/ccqa.mjs CHANGED Viewed

@@ -9,11 +9,11 @@ import { query } from "@anthropic-ai/claude-agent-sdk";
 import { ZodError, z } from "zod";
 import { delimiter, dirname, join, relative, resolve } from "node:path";
 import { parse, stringify } from "yaml";
-import { execFile, spawn } from "node:child_process";
+import { execFile, spawn, spawnSync } from "node:child_process";
 import { createInterface } from "node:readline";
 import { homedir, tmpdir } from "node:os";
-import { createInterface as createInterface$1 } from "node:readline/promises";
 import { promisify } from "node:util";
+import { createInterface as createInterface$1 } from "node:readline/promises";
 //#region src/prompts/trace.ts
 function generateSessionName() {
 	return `ccqa-trace-${(/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-")}`;
@@ -3378,10 +3378,36 @@ function previewDiff(before, after) {
 	return out.join("\n");
 }
 //#endregion
+//#region src/prompts/format.ts
+/**
+* Formatting helpers shared by the Claude prompt builders (diagnose, report).
+* Centralised so the prompts cannot drift apart on mechanics that must stay
+* consistent across commands.
+*/
+/** Prefix every line with its 1-based number, the form fix suggestions cite. */
+function numberLines(script) {
+	return script.split("\n").map((l, i) => `${i + 1}: ${l}`).join("\n");
+}
+/**
+* The "## Output language" prompt section. Empty for "auto" so the prompt
+* stays byte-identical to the no-flag baseline. `fields` names the
+* human-readable JSON fields to translate; `verbatimNames` names the
+* enum-like values that must never be translated.
+*/
+function outputLanguageBlock(outputLanguage, fields, verbatimNames) {
+	if (outputLanguage === "auto") return "";
+	return `## Output language
+Write all human-readable fields (${fields}) in **${outputLanguage}** (BCP-47 tag).
+Selectors, file paths, identifiers, ${verbatimNames}, JSON keys, and quoted strings stay verbatim regardless of language.
+`;
+}
+//#endregion
 //#region src/diagnose/prompt.ts
 function buildDiagnosePrompt(input) {
 	const { script, specYaml, actions, failureLog, pageSnapshot, outputLanguage = "auto" } = input;
-	const numbered = script.split("\n").map((l, i) => `${i + 1}: ${l}`).join("\n");
+	const numbered = numberLines(script);
 	const actionsSummary = actions.map((a, i) => {
 		const parts = [`${i + 1}. ${a.command}`];
 		if (a.assertType) parts.push(`assertType="${a.assertType}"`);
@@ -3392,12 +3418,7 @@ function buildDiagnosePrompt(input) {
 	}).join("\n");
 	return `You are diagnosing a failing E2E test. The test was generated from a recorded trace of the original interaction. Compare the failing run against the original spec and recorded actions to determine WHY the test failed and what the right fix is.
-${outputLanguage === "auto" ? "" : `## Output language
-Write all human-readable fields (\`reasoning\`, \`reason\`) in **${outputLanguage}** (BCP-47 tag).
-Selectors, file paths, identifiers, code, type names (TIMING_ISSUE, etc.), JSON keys, and quoted strings stay verbatim regardless of language.
-`}## You have read-only filesystem tools
+${outputLanguageBlock(outputLanguage, "`reasoning`, `reason`", "code, type names (TIMING_ISSUE, etc.)")}## You have read-only filesystem tools
 You can call \`Grep\`, \`Glob\`, and \`Read\` against the current repository before producing the JSON.
@@ -4630,165 +4651,1187 @@ async function checkSpec(target, opts) {
 	};
 }
 //#endregion
-//#region src/drift/format.ts
+//#region src/drift/affected.ts
+const execFileP = promisify(execFile);
 /**
-* Render drift results as a string. The CLI commands and the `run` failure
-* hook are the only callers; both want the formatted output returned so
-* they can prefix / interleave / pipe it as needed.
+* Resolve the base ref to diff against for `ccqa drift --changed`.
+* Precedence: explicit override > GITHUB_BASE_REF > origin/main.
 */
-function renderDrift(results, format, cwd) {
-	if (format === "json") return renderJson(results);
-	if (format === "github") return renderGithub(results, cwd);
-	return renderText(results);
+function resolveBaseRef(explicit) {
+	if (explicit && explicit.length > 0) return explicit;
+	const ghBase = process.env["GITHUB_BASE_REF"];
+	if (ghBase && ghBase.length > 0) return ghBase.startsWith("origin/") ? ghBase : `origin/${ghBase}`;
+	return "origin/main";
 }
-const HEAVY_RULE = "═".repeat(72);
-function renderText(results) {
+/**
+* Run `git diff --name-status base...HEAD` from `cwd` and return one entry per
+* changed file. Renames are reported under their NEW path with status
+* "renamed" — the OLD path is dropped because the spec mapping is against the
+* post-rename layout.
+*
+* Paths are re-rooted to be relative to `cwd`, not the git repo root. In a
+* monorepo where `cwd` is a sub-package (e.g. `apps/foo`), git emits paths
+* relative to the repo root, but specs declare relatedPaths relative to
+* their own package. Changes outside `cwd` are dropped so an unrelated PR
+* can never accidentally scope a sub-package's specs in.
+*/
+async function getChangedFiles(base, cwd) {
+	const [{ stdout: rootOut }, { stdout: diffOut }] = await Promise.all([execFileP("git", ["rev-parse", "--show-toplevel"], { cwd }), execFileP("git", [
+		"diff",
+		"--name-status",
+		"-M",
+		`${base}...HEAD`
+	], {
+		cwd,
+		maxBuffer: 32 * 1024 * 1024
+	})]);
+	return rerootChangedFiles(parseGitDiffOutput(diffOut), rootOut.trim(), cwd);
+}
+/**
+* Convert paths in `entries` from git-repo-root relative to `cwd` relative,
+* dropping anything outside `cwd`. Exported for unit tests.
+*/
+function rerootChangedFiles(entries, repoRoot, cwd) {
+	const prefix = relative(repoRoot, cwd);
+	if (!prefix) return entries;
 	const out = [];
-	for (const r of results) {
-		out.push("");
-		const heading = `══ ${r.target.featureName}/${r.target.specName} `;
-		const tail = "═".repeat(Math.max(3, 72 - heading.length));
-		out.push(`${heading}${tail}`);
-		if (r.error) {
-			out.push(`  ERROR  ${r.error}`);
+	for (const e of entries) {
+		const rel = relative(prefix, e.path);
+		if (rel.startsWith("..") || rel === "") continue;
+		out.push({
+			...e,
+			path: rel
+		});
+	}
+	return out;
+}
+function parseGitDiffOutput(stdout) {
+	const out = [];
+	for (const line of stdout.split("\n")) {
+		if (!line.trim()) continue;
+		const parts = line.split("	");
+		const code = parts[0];
+		if (!code) continue;
+		if (code.startsWith("R")) {
+			const newPath = parts[2];
+			if (newPath) out.push({
+				path: newPath,
+				status: "renamed"
+			});
 			continue;
 		}
-		const errors = r.issues.filter((i) => i.severity === "ERROR");
-		const warnings = r.issues.filter((i) => i.severity === "WARN");
-		const passed = r.issues.filter((i) => i.severity === "OK");
-		if (errors.length === 0 && warnings.length === 0) {
-			const label = passed.length === 1 ? "check" : "checks";
-			const detail = passed.length > 0 ? `all ${passed.length} ${label} passed` : "no issues";
-			out.push(`  ✓  ${detail}`);
+		if (code.startsWith("C")) {
+			const newPath = parts[2];
+			if (newPath) out.push({
+				path: newPath,
+				status: "added"
+			});
 			continue;
 		}
-		for (const issue of errors) appendFinding(out, "ERROR", issue);
-		for (const issue of warnings) appendFinding(out, "WARN", issue);
-		if (passed.length > 0) {
-			const names = passed.map((i) => DRAFT_CATEGORY_LABEL[i.category]).join(", ");
-			out.push("");
-			out.push(`  ✓  passed (${passed.length}): ${names}`);
+		const path = parts[1];
+		if (!path) continue;
+		switch (code[0]) {
+			case "A":
+				out.push({
+					path,
+					status: "added"
+				});
+				break;
+			case "M":
+			case "T":
+				out.push({
+					path,
+					status: "modified"
+				});
+				break;
+			case "D":
+				out.push({
+					path,
+					status: "deleted"
+				});
+				break;
+			default: out.push({
+				path,
+				status: "modified"
+			});
 		}
 	}
-	out.push("");
-	out.push(HEAVY_RULE);
-	const totals = summarize(results);
-	out.push(`  specs    ${results.length} (${totals.errored} errored)`);
-	out.push(`  findings ${totals.error} error, ${totals.warn} warn, ${totals.ok} ok`);
-	out.push("");
-	return out.join("\n");
+	return out;
 }
-function appendFinding(out, level, issue) {
-	const stepPart = issue.stepId ? ` ${issue.stepId}` : "";
-	out.push("");
-	out.push(`  ${level}  ${DRAFT_CATEGORY_LABEL[issue.category]}${stepPart}`);
-	out.push(`    ${issue.message}`);
-	if (issue.detail) out.push(`    └ ${issue.detail.replace(/\n/g, "\n      ")}`);
+function stripLeadingDotSlash(s) {
+	return s.startsWith("./") ? s.slice(2) : s;
 }
-function renderJson(results) {
-	const payload = { specs: results.map((r) => ({
-		feature: r.target.featureName,
-		spec: r.target.specName,
-		ok: r.ok,
-		...r.error ? { error: r.error } : {},
-		issues: r.issues.map((i) => ({
-			severity: i.severity,
-			category: i.category,
-			stepId: i.stepId,
-			message: i.message,
-			...i.detail ? { detail: i.detail } : {}
-		}))
-	})) };
-	return `${JSON.stringify(payload, null, 2)}\n`;
+const REGEX_CACHE = /* @__PURE__ */ new Map();
+/** Compiles `pattern` to a RegExp, memoized so repeated `--changed` matches don't re-build. */
+function compileGlob(pattern) {
+	const cached = REGEX_CACHE.get(pattern);
+	if (cached) return cached;
+	const compiled = globToRegExp(stripLeadingDotSlash(pattern));
+	REGEX_CACHE.set(pattern, compiled);
+	return compiled;
 }
-function renderGithub(results, cwd) {
-	const repoRoot = process.env["GITHUB_WORKSPACE"] ?? process.cwd();
-	const lines = [];
-	for (const r of results) {
-		const file = githubRelPath(cwd, repoRoot, r.target.featureName, r.target.specName);
-		if (r.error) {
-			lines.push(`::error file=${file}::${escapeGhMessage(r.error)}`);
+function globToRegExp(pattern) {
+	let re = "^";
+	let i = 0;
+	while (i < pattern.length) {
+		const ch = pattern[i];
+		if (ch === "?") {
+			re += "[^/]";
+			i++;
 			continue;
 		}
-		for (const issue of r.issues) {
-			if (issue.severity === "OK") continue;
-			const level = issue.severity === "ERROR" ? "error" : "warning";
-			const title = `${r.target.featureName}/${r.target.specName} — ${issue.category}${issue.stepId ? ` (${issue.stepId})` : ""}`;
-			const body = issue.detail ? `${issue.message}\n${issue.detail}` : issue.message;
-			lines.push(`::${level} file=${file},title=${escapeGhProp(title)}::${escapeGhMessage(body)}`);
+		if (ch !== "*") {
+			re += /[.+^${}()|[\]\\]/.test(ch) ? "\\" + ch : ch;
+			i++;
+			continue;
+		}
+		if (pattern[i + 1] !== "*") {
+			re += "[^/]*";
+			i++;
+			continue;
 		}
+		const hasLeadingSlash = re.endsWith("/");
+		const hasTrailingSlash = pattern[i + 2] === "/";
+		if (hasLeadingSlash) re = re.slice(0, -1);
+		if (hasLeadingSlash || hasTrailingSlash) re += "(?:/?.*)?";
+		else re += ".*";
+		i += hasTrailingSlash ? 3 : 2;
 	}
-	return lines.length === 0 ? "" : `${lines.join("\n")}\n`;
-}
-function githubRelPath(cwd, repoRoot, featureName, specName) {
-	const abs = resolve(cwd, ".ccqa", "features", featureName, "test-cases", specName, "spec.yaml");
-	const rel = relative(repoRoot, abs);
-	return rel.startsWith("..") ? abs : rel;
-}
-function escapeGhMessage(s) {
-	return s.replace(/%/g, "%25").replace(/\r/g, "%0D").replace(/\n/g, "%0A");
-}
-function escapeGhProp(s) {
-	return s.replace(/%/g, "%25").replace(/\r/g, "%0D").replace(/\n/g, "%0A").replace(/,/g, "%2C").replace(/:/g, "%3A");
+	return new RegExp(re + "$");
 }
-function summarize(results) {
-	let error = 0;
-	let warn = 0;
-	let ok = 0;
-	let errored = 0;
-	for (const r of results) {
-		if (r.error) errored++;
-		for (const issue of r.issues) if (issue.severity === "ERROR") error++;
-		else if (issue.severity === "WARN") warn++;
-		else ok++;
-	}
-	return {
-		error,
-		warn,
-		ok,
-		errored
-	};
+/**
+* Returns true if `changedPath` is covered by any of `relatedPaths`. An empty
+* `relatedPaths` returns false — callers handle the "unscoped spec" case
+* separately (treat the spec as always-affected) before calling this.
+*/
+function isPathAffectedBy(changedPath, relatedPaths) {
+	const stripped = stripLeadingDotSlash(changedPath);
+	for (const pattern of relatedPaths) if (compileGlob(pattern).test(stripped)) return true;
+	return false;
 }
 //#endregion
-//#region src/drift/exit-code.ts
-/**
-* Map drift results to an exit code. Spec-level errors (Claude call failed)
-* always fail; otherwise ERROR severity always fails, WARN fails only when
-* the threshold is `warn`.
-*/
-function determineExitCode(results, threshold) {
-	for (const r of results) {
-		if (r.error) return 1;
-		for (const issue of r.issues) {
-			if (issue.severity === "ERROR") return 1;
-			if (threshold === "warn" && issue.severity === "WARN") return 1;
-		}
-	}
-	return 0;
-}
-//#endregion
-//#region src/drift/auth.ts
+//#region src/drift/auth.ts
 /**
 * Probe whether the host has any credential the Anthropic SDK can pick up:
 *   1. ANTHROPIC_API_KEY env var (CI / scripted use)
-*   2. ~/.claude/.credentials.json (local Claude Code login)
+*   2. ~/.claude/.credentials.json (Claude Code login, file-based platforms)
+*   3. macOS Keychain item "Claude Code-credentials" (Claude Code login on
+*      darwin stores the OAuth credentials in the Keychain, not on disk)
 *
-* `run --drift` is opt-in, so the caller will only consult this after the
-* user has asked for drift. We never throw — auth absence is a normal flow
-* that surfaces as "drift analysis skipped".
+* Claude-driven hooks are opt-in, so the caller only consults this after the
+* user has asked for analysis. We never throw — auth absence is a normal flow
+* that surfaces as "analysis skipped".
 */
 function driftAuthAvailable() {
 	const key = process.env["ANTHROPIC_API_KEY"];
 	if (typeof key === "string" && key.length > 0) return { ok: true };
 	if (existsSync(join(homedir(), ".claude", ".credentials.json"))) return { ok: true };
+	if (process.platform === "darwin" && keychainHasClaudeCredentials()) return { ok: true };
 	return {
 		ok: false,
 		reason: "no ANTHROPIC_API_KEY / claude login"
 	};
 }
+/**
+* `security find-generic-password` without `-w` only checks the item's
+* existence (exit 0) — it never reads the secret, so no Keychain unlock
+* prompt is triggered. Resolved via PATH so tests can stub the binary.
+*/
+function keychainHasClaudeCredentials() {
+	try {
+		return spawnSync("security", [
+			"find-generic-password",
+			"-s",
+			"Claude Code-credentials"
+		], {
+			stdio: "ignore",
+			timeout: 3e3
+		}).status === 0;
+	} catch {
+		return false;
+	}
+}
+//#endregion
+//#region src/report/prompt.ts
+function buildFailureAnalysisPrompt(input) {
+	const { script, specYaml, failureLog, diffPatch, changedFiles, baseRef, driftIssues, outputLanguage = "auto" } = input;
+	const numbered = numberLines(script);
+	return `You are analyzing a failing E2E regression test right after a source change landed. Your job is a root-cause CALL, not a fix: decide which of three categories explains the failure, using the source diff as your primary context.
+${outputLanguageBlock(outputLanguage, "`reasoning`, `detail`", "label names (TEST_DRIFT, etc.)")}## The three categories
+The question that separates them: **is the behavior the spec describes still what the product intends?**
+1. TEST_DRIFT — what the spec verifies is unchanged; only the test code drifted from the source. Typical: a selector/aria-label/placeholder rename, a timing change, an over-tight assertion. The diff shows a change that is invisible to the user's intent but visible to the test.
+2. SPEC_CHANGE — the thing being verified itself changed: the UI flow, the layout, the feature's intended behavior. The diff deliberately changes what the spec asserts. You MUST cite the diff hunk (file + what changed) as evidence for this label.
+3. PRODUCT_BUG — neither of the above: the failure is not explained by the diff nor by test staleness. The product regressed.
+If the evidence is too weak to choose, answer UNKNOWN — a wrong confident call is worse than an honest UNKNOWN, because humans grade these predictions to measure accuracy.
+## You have read-only filesystem tools
+You can call \`Grep\`, \`Glob\`, and \`Read\` against the current repository (post-change state) before producing the JSON. Use them to:
+- confirm a suspected selector rename (grep for \`aria-label=\`, \`placeholder=\`, \`data-testid\`, i18n strings),
+- read the changed files in full when the truncated patch is not enough,
+- check whether the element/flow the spec describes still exists in the source.
+You have **up to 12 tool turns**. Do NOT write, edit, run shell commands, or hit the network.
+## Decision guidance
+- Diff touches only attributes/identifiers the test selects on (labels, testids, class names, timing) while the user-visible flow is intact → TEST_DRIFT.
+- Diff intentionally removes/reworks the UI or flow that a spec step verifies (component deleted, page restructured, copy redefined, feature flag flipped) → SPEC_CHANGE.
+- Diff UNINTENTIONALLY breaks behavior the spec still intends — e.g. a refactor that drops a side effect, an inverted condition, a regression hiding inside a cleanup commit — → PRODUCT_BUG, citing the diff hunk as evidence. A product bug is often introduced BY the diff; what separates it from SPEC_CHANGE is intent: does the change read as a deliberate redesign of what the spec verifies, or as collateral damage?
+- Diff is unrelated to the failing step (or there is no relevant diff) and the test was passing before → lean PRODUCT_BUG; first rule out timing/data flakiness and infrastructure errors (daemon not running, network down, missing credentials) — those read as UNKNOWN with low confidence, not PRODUCT_BUG.
+- The drift audit findings (when present) flag spec↔code mismatches; an ERROR there usually supports TEST_DRIFT or SPEC_CHANGE over PRODUCT_BUG.
+## Sub-diagnosis vocabulary
+Alongside the label, report the closest fine-grained mechanic:
+- SELECTOR_DRIFT, TIMING_ISSUE, OVER_ASSERTION — usually under TEST_DRIFT
+- DATA_MISSING — missing test data/state; usually UNKNOWN or PRODUCT_BUG depending on cause
+- NONE — when nothing fits (typical for SPEC_CHANGE and PRODUCT_BUG)
+## Output
+Your **final** assistant message must start with \`{\` and end with \`}\` — a single JSON object, nothing before or after. No prose preamble, no markdown fences, no tool calls in the same turn.
+{
+  "label": "TEST_DRIFT" | "SPEC_CHANGE" | "PRODUCT_BUG" | "UNKNOWN",
+  "confidence": <0.0-1.0>,
+  "subDiagnosis": "SELECTOR_DRIFT" | "TIMING_ISSUE" | "OVER_ASSERTION" | "DATA_MISSING" | "NONE",
+  "evidence": [
+    { "file": "<file:line or diff hunk reference, omit if log-only>", "detail": "<what this shows>" }
+  ],
+  "reasoning": "<why this label, citing the evidence>"
+}
+## Confidence guidance
+- 0.9-1.0: the diff (or a file you read) directly shows the cause
+- 0.7-0.9: strong indirect evidence
+- 0.4-0.7: plausible but another category could explain it
+- < 0.4: answer UNKNOWN instead of guessing
+Evidence rules: TEST_DRIFT and SPEC_CHANGE require at least one concrete \`file\` reference (diff hunk or file:line you actually read). PRODUCT_BUG should explain why the diff does NOT account for the failure.
+## Test Spec (spec.yaml)
+${specYaml}
+## Test Script (with line numbers)
+${numbered}
+${diffPatch ? `## Source changes since ${baseRef ?? "base"} (git diff, may be truncated)
+### Changed files (name-status)
+${changedFiles ?? "(unavailable)"}
+### Patch
+\`\`\`diff
+${diffPatch}
+\`\`\`
+` : `## Source changes
+No diff context is available (the base ref could not be resolved, or there are no changes). Classify from the failure log, the spec, and what you can read in the repository — and be correspondingly more conservative: prefer UNKNOWN over a confident SPEC_CHANGE/PRODUCT_BUG call without diff evidence.
+`}
+${driftIssues && driftIssues.length > 0 ? `## Spec↔code drift audit findings
+A separate read-only audit compared the spec against the current source. Treat these as hints, not verdicts:
+${driftIssues.map((i) => `- [${i.severity}] (${DRAFT_CATEGORY_LABEL[i.category]}${i.stepId ? `, step ${i.stepId}` : ""}) ${i.message}${i.detail ? ` — ${i.detail}` : ""}`).join("\n")}
+` : ""}## Failure Log
+${failureLog.slice(0, 8e3)}`;
+}
+//#endregion
+//#region src/diagnose/types.ts
+/**
+* The concrete (fixable) diagnosis tags as a value, for consumers that need
+* to enumerate them (e.g. the run report's subDiagnosis vocabulary). The
+* `satisfies` clause makes renaming a union member without updating this
+* list a compile error.
+*/
+const FIXABLE_DIAGNOSIS_TYPES = [
+	"SELECTOR_DRIFT",
+	"TIMING_ISSUE",
+	"OVER_ASSERTION",
+	"DATA_MISSING"
+];
+//#endregion
+//#region src/report/schema.ts
+/**
+* The three-way root-cause call for a failing spec, framed as drift analysis:
+*  - TEST_DRIFT:  what the spec verifies is unchanged; only the test code
+*                 drifted from the source (selector rename, timing, ...).
+*                 Future iterations may auto-fix these.
+*  - SPEC_CHANGE: the thing being verified itself changed (UI redesign,
+*                 spec change). Never auto-fix — a human must re-draft.
+*  - PRODUCT_BUG: neither of the above explains the failure — treat it as
+*                 a product regression.
+*
+* The stakeholder ask behind this module is measurement-first: the call is
+* known to be hard, so every prediction is embedded in the HTML report where
+* a human records the ground truth and the report computes the confusion
+* matrix client-side. Accuracy may start low; it must be *visible*.
+*/
+const FAILURE_LABELS = [
+	"TEST_DRIFT",
+	"SPEC_CHANGE",
+	"PRODUCT_BUG"
+];
+const FailureLabelSchema = z.enum(FAILURE_LABELS);
+/** What the model may answer: the three labels, or UNKNOWN when evidence is weak. */
+const PREDICTED_LABELS = [...FAILURE_LABELS, "UNKNOWN"];
+const PredictedLabelSchema = z.enum(PREDICTED_LABELS);
+const SUB_DIAGNOSES = [...FIXABLE_DIAGNOSIS_TYPES, "NONE"];
+const FailureEvidenceSchema = z.object({
+	file: z.string().optional(),
+	detail: z.string()
+});
+/**
+* LLM output shape. Deliberately NOT .strict(): the model occasionally adds
+* keys, and rejecting the whole analysis over an extra field would collapse
+* a usable prediction into UNKNOWN. Zod's default strips unknown keys.
+*/
+const FailureAnalysisSchema = z.object({
+	label: PredictedLabelSchema,
+	confidence: z.number().min(0).max(1),
+	subDiagnosis: z.enum(SUB_DIAGNOSES).optional(),
+	evidence: z.array(FailureEvidenceSchema),
+	reasoning: z.string()
+});
+const ReportAssertionSchema = z.object({
+	name: z.string(),
+	status: z.enum([
+		"passed",
+		"failed",
+		"skipped"
+	]),
+	durationMs: z.number().nullable()
+});
+const ReportSpecResultSchema = z.object({
+	feature: z.string(),
+	spec: z.string(),
+	status: z.enum(["passed", "failed"]),
+	testCounts: z.object({
+		total: z.number(),
+		passed: z.number(),
+		failed: z.number()
+	}).nullable(),
+	durationMs: z.number().nullable(),
+	assertions: z.array(ReportAssertionSchema).nullable(),
+	analysis: FailureAnalysisSchema.nullable(),
+	analysisSkipped: z.string().nullable(),
+	driftIssues: z.array(DraftIssueSchema).nullable(),
+	failureLogExcerpt: z.string().nullable(),
+	diffExcerpt: z.string().nullable(),
+	specYaml: z.string().nullable()
+});
+z.object({
+	schemaVersion: z.literal(1),
+	createdAt: z.string(),
+	runId: z.string().nullable(),
+	git: z.object({
+		head: z.string().nullable(),
+		base: z.string().nullable()
+	}),
+	model: z.string().nullable(),
+	promptVersion: z.string(),
+	results: z.array(ReportSpecResultSchema)
+});
+/** Shape of the "export labels" download produced by the report's client-side JS. */
+const LabelEntrySchema = z.object({
+	feature: z.string(),
+	spec: z.string(),
+	predicted: PredictedLabelSchema,
+	label: FailureLabelSchema,
+	note: z.string().optional()
+});
+z.object({
+	schemaVersion: z.literal(1),
+	runId: z.string().nullable(),
+	promptVersion: z.string(),
+	exportedAt: z.string(),
+	labels: z.array(LabelEntrySchema)
+});
+//#endregion
+//#region src/report/analyze.ts
+/**
+* Classify one failing spec into TEST_DRIFT / SPEC_CHANGE / PRODUCT_BUG /
+* UNKNOWN. Same resilience contract as diagnose(): read-only tools, JSON-only
+* final message, and any parse failure degrades to UNKNOWN with confidence 0
+* rather than throwing — the report must always render.
+*/
+async function analyzeFailure(input, options = {}) {
+	const { result: raw, isError } = await invokeClaudeStreaming({
+		prompt: buildFailureAnalysisPrompt(input),
+		allowedTools: [
+			"Read",
+			"Grep",
+			"Glob"
+		],
+		silenceBashLog: true,
+		maxTurns: 12,
+		...options.model ? { model: options.model } : {},
+		...options.cwd ? { cwd: options.cwd } : {}
+	}, () => {});
+	if (isError || !raw) return {
+		analysis: unknownAnalysis(isError ? "Claude returned an error result" : "Claude returned no output"),
+		raw: raw ?? "",
+		sdkError: isError
+	};
+	for (const candidate of extractJsonCandidates(raw)) {
+		let parsed;
+		try {
+			parsed = JSON.parse(candidate);
+		} catch {
+			continue;
+		}
+		const normalised = normaliseFailureAnalysis(parsed);
+		if (normalised) return {
+			analysis: normalised,
+			raw,
+			sdkError: false
+		};
+	}
+	return {
+		analysis: unknownAnalysis(`analysis returned no parseable JSON: ${truncate$2(raw, 500)}`),
+		raw,
+		sdkError: false
+	};
+}
+function unknownAnalysis(reasoning) {
+	return {
+		label: "UNKNOWN",
+		confidence: 0,
+		subDiagnosis: "NONE",
+		evidence: [],
+		reasoning
+	};
+}
+const LABELS = new Set(PREDICTED_LABELS);
+const SUB_SET = new Set(SUB_DIAGNOSES);
+/**
+* Manual, lenient normalisation (mirrors diagnose's normaliseResult): a
+* missing/extra field should degrade gracefully, not reject the whole
+* prediction — only an unrecognisable label makes the candidate unusable.
+*/
+function normaliseFailureAnalysis(parsed) {
+	if (!isObject(parsed)) return null;
+	const label = parsed["label"];
+	if (typeof label !== "string" || !LABELS.has(label)) return null;
+	const confidence = typeof parsed["confidence"] === "number" ? clamp(parsed["confidence"], 0, 1) : 0;
+	const reasoning = typeof parsed["reasoning"] === "string" ? parsed["reasoning"] : "";
+	const rawSub = parsed["subDiagnosis"];
+	const subDiagnosis = typeof rawSub === "string" && SUB_SET.has(rawSub) ? rawSub : "NONE";
+	const evidence = [];
+	if (Array.isArray(parsed["evidence"])) for (const item of parsed["evidence"]) {
+		if (!isObject(item)) continue;
+		const detail = typeof item["detail"] === "string" ? item["detail"] : null;
+		if (detail === null) continue;
+		const file = typeof item["file"] === "string" ? item["file"] : void 0;
+		evidence.push(file !== void 0 ? {
+			file,
+			detail
+		} : { detail });
+	}
+	return {
+		label,
+		confidence,
+		subDiagnosis,
+		evidence,
+		reasoning
+	};
+}
+/**
+* Capture the PR diff used as context for failure analysis. `--relative`
+* re-roots paths to `cwd` and drops changes outside it, matching how
+* relatedPaths are declared in a monorepo sub-package.
+*
+* Errors (unknown base ref, not a git repo, ...) are returned, not thrown:
+* the report is still worth generating without diff context.
+*/
+async function capturePrDiff(base, cwd) {
+	try {
+		const [{ stdout: head }, { stdout: patch }, { stdout: nameStatus }] = await Promise.all([
+			execFileP("git", [
+				"rev-parse",
+				"--short",
+				"HEAD"
+			], { cwd }),
+			execFileP("git", [
+				"diff",
+				"-M",
+				"--relative",
+				`${base}...HEAD`
+			], {
+				cwd,
+				maxBuffer: 64 * 1024 * 1024
+			}),
+			execFileP("git", [
+				"diff",
+				"--name-status",
+				"-M",
+				"--relative",
+				`${base}...HEAD`
+			], {
+				cwd,
+				maxBuffer: 32 * 1024 * 1024
+			})
+		]);
+		return {
+			ok: true,
+			diff: {
+				patch,
+				nameStatus: nameStatus.trim(),
+				head: head.trim()
+			}
+		};
+	} catch (e) {
+		return {
+			ok: false,
+			error: e.message.split("\n")[0] ?? "git diff failed"
+		};
+	}
+}
+/**
+* Split a unified diff into per-file sections on `diff --git` boundaries.
+* The path is taken from the `b/` side so renames/edits key on the
+* post-change layout — the same side relatedPaths are written against.
+*/
+const DIFF_HEADER = /^diff --git a\/(.+) b\/(.+)$/;
+function splitPatchByFile(patch) {
+	const sections = [];
+	const lines = patch.split("\n");
+	let current = null;
+	const flush = () => {
+		if (current) sections.push({
+			path: current.path,
+			body: current.lines.join("\n")
+		});
+		current = null;
+	};
+	for (const line of lines) {
+		const m = DIFF_HEADER.exec(line);
+		if (m) {
+			flush();
+			current = {
+				path: m[2],
+				lines: [line]
+			};
+		} else if (current) current.lines.push(line);
+	}
+	flush();
+	return sections;
+}
+/**
+* Scope a full patch down to the files a spec depends on, then truncate so
+* the analysis prompt stays bounded. `relatedPaths` null/empty means the
+* spec is unscoped — keep the whole patch (still truncated). Callers scoping
+* the same patch for many specs can pass pre-split sections instead.
+*/
+function scopePatchForSpec(patch, relatedPaths, caps = {}) {
+	const perFile = caps.perFile ?? 8192;
+	const total = caps.total ?? 49152;
+	let sections = typeof patch === "string" ? splitPatchByFile(patch) : patch;
+	if (relatedPaths && relatedPaths.length > 0) {
+		const scoped = sections.filter((s) => isPathAffectedBy(s.path, relatedPaths));
+		if (scoped.length > 0) sections = scoped;
+	}
+	const parts = [];
+	let used = 0;
+	let droppedFiles = 0;
+	for (const s of sections) {
+		if (used >= total) {
+			droppedFiles++;
+			continue;
+		}
+		let body = s.body;
+		if (body.length > perFile) body = `${body.slice(0, perFile)}\n[truncated: ${body.length - perFile} more chars of ${s.path}]`;
+		if (used + body.length > total) body = `${body.slice(0, total - used)}\n[truncated: total patch cap reached]`;
+		parts.push(body);
+		used += body.length;
+	}
+	if (droppedFiles > 0) parts.push(`[truncated: ${droppedFiles} more changed file(s) omitted]`);
+	return parts.join("\n");
+}
+//#endregion
+//#region src/report/render.ts
+/**
+* Build the whole run report as a single HTML document string: CSS and
+* JavaScript are inlined and nothing is fetched over the network, so the
+* file can be uploaded as a CI artifact (in the spirit of Playwright's HTML
+* report) and opened straight from disk. The layout follows that report's
+* conventions: header stats that double as filters, a search box,
+* collapsible per-spec cards with steps and durations, and an automatic
+* light/dark theme.
+*
+* Grading happens in the browser. Every analyzed failure gets ground-truth
+* radio buttons and the inlined script recomputes accuracy, the confusion
+* matrix and per-class precision/recall on each change. Labels are kept in
+* localStorage and can be exported/imported as JSON (LabelsExportSchema),
+* so the grading work outlives the browser session.
+*
+* @param data Run report data; this function reads `results`, `createdAt`,
+*   `runId`, `git.head`, `git.base` and `promptVersion`, and embeds the
+*   whole object as JSON for the client script.
+* @returns The complete HTML document as a string.
+*/
+function renderRunReport(data) {
+	const { results } = data;
+	const failures = results.filter((entry) => entry.status === "failed");
+	const hasAnalyzedFailure = failures.some((entry) => entry.analysis !== null);
+	const passedCount = results.length - failures.length;
+	let totalDuration = 0;
+	for (const entry of results) totalDuration += entry.durationMs ?? 0;
+	// "<" is rewritten so the embedded JSON can never close the <script> element early.
+	const dataJson = JSON.stringify(data).replace(/</g, "\\u003c");
+	// Optional header fragments: each collapses to "" when its source value is absent.
+	const durationMeta = totalDuration > 0 ? `<span>${formatDuration$1(totalDuration)}</span>` : "";
+	const runMeta = data.runId ? `<span>CI run ${esc(data.runId)}</span>` : "";
+	const gitMeta = data.git.head ? `<span><code>${esc(data.git.head)}</code>${data.git.base ? ` vs <code>${esc(data.git.base)}</code>` : ""}</span>` : "";
+	// The accuracy panel is only useful when at least one failure carries a prediction.
+	const metrics = hasAnalyzedFailure ? metricsPanel() : "";
+	const specCards = results.map((entry, position) => renderResult(entry, position)).join("\n");
+	return `<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>ccqa run report</title>
+<style>${CSS}</style>
+</head>
+<body>
+<header>
+  <div class="header-inner">
+    <div class="header-top">
+      <h1>ccqa run report</h1>
+      <div class="meta">
+        <span title="generated at">${esc(formatDate(data.createdAt))}</span>
+        ${durationMeta}
+        ${runMeta}
+        ${gitMeta}
+        <span class="dim">prompt v${esc(data.promptVersion)}</span>
+      </div>
+    </div>
+    <div class="toolbar">
+      <div class="chips" id="filter-chips">
+        <button type="button" class="chip active" data-filter="all">All <span class="count">${results.length}</span></button>
+        <button type="button" class="chip chip-pass" data-filter="passed">${passedCount} passed</button>
+        <button type="button" class="chip chip-fail" data-filter="failed">${failures.length} failed</button>
+      </div>
+      <input type="search" id="search" placeholder="Filter by name…" autocomplete="off">
+    </div>
+  </div>
+</header>
+<div class="page">
+${metrics}
+<main id="spec-list">
+${specCards}
+</main>
+<p class="empty-note" id="no-match" hidden>No specs match the current filter.</p>
+</div>
+<script type="application/json" id="ccqa-report-data">${dataJson}<\/script>
+<script>${CLIENT_JS}<\/script>
+</body>
+</html>
+`;
+}
+/**
+* Static markup for the "Prediction accuracy" panel: a heading, the
+* export/import label controls, a usage hint, and the empty `#metrics`
+* container that the client script fills in.
+*
+* @returns HTML markup for the panel (no trailing newline).
+*/
+function metricsPanel() {
+	const markup = [
+		`<section class="panel" id="measure-panel">`,
+		`  <div class="panel-head">`,
+		`    <h2>Prediction accuracy</h2>`,
+		`    <div class="measure-actions">`,
+		`      <button type="button" id="export-labels">Export labels (JSON)</button>`,
+		`      <label class="import-label">Import labels<input type="file" id="import-labels" accept="application/json"></label>`,
+		`    </div>`,
+		`  </div>`,
+		`  <p class="hint">Grade each failed case below with its true cause; the matrix updates live. Labels are saved in this browser (localStorage) — export them to keep or merge.</p>`,
+		`  <div id="metrics"></div>`,
+		`</section>`
+	];
+	return markup.join("\n");
+}
+/**
+* Render one spec result as a collapsible `<details>` card.
+*
+* The summary row shows the status icon, the `<feature>/<spec>` id, the
+* predicted label (failed specs with an analysis only), the passed/total
+* test counts and the duration. The body lists the assertions, then, for
+* failed specs, the analysis block or the "analysis skipped" note, followed
+* by the drift audit and the raw failure log / scoped diff / spec.yaml
+* excerpts. Failed specs are rendered expanded.
+*
+* Every value taken from the result is HTML-escaped before interpolation,
+* including the ones written into `class` / `data-*` attributes, so report
+* data cannot break out of the attribute it is placed in.
+*
+* @param r One entry of the report's `results` array.
+* @param index Position of `r` in `results`; forwarded to renderAnalysis.
+* @returns HTML markup for the card.
+*/
+function renderResult(r, index) {
+	const id = `${r.feature}/${r.spec}`;
+	const failed = r.status === "failed";
+	// Escaped once and reused for both the class name and the data attribute.
+	const status = esc(String(r.status));
+	const duration = r.durationMs != null && r.durationMs > 0 ? `<span class="duration">${formatDuration$1(r.durationMs)}</span>` : "";
+	const counts = r.testCounts ? `<span class="counts">${esc(String(r.testCounts.passed))}/${esc(String(r.testCounts.total))}</span>` : "";
+	let predictionChip = "";
+	if (failed && r.analysis) {
+		const label = esc(String(r.analysis.label));
+		predictionChip = `<span class="badge ${label}">${label}</span>`;
+	}
+	return `<details class="spec ${status}" data-status="${status}" data-case-id="${esc(id)}"${failed ? " open" : ""}>
+  <summary>
+    ${statusIcon(r.status)}
+    <span class="spec-name">${esc(id)}</span>
+    ${predictionChip}
+    <span class="spacer"></span>
+    ${counts}
+    ${duration}
+  </summary>
+  <div class="spec-body">
+    ${renderAssertions(r)}
+    ${failed ? r.analysis ? renderAnalysis(r, index) : renderSkipped(r) : ""}
+    ${renderDriftIssues(r)}
+    ${collapsible("Failure log", r.failureLogExcerpt)}
+    ${collapsible("Source diff (scoped)", r.diffExcerpt, "diff")}
+    ${collapsible("spec.yaml", r.specYaml)}
+  </div>
+</details>`;
+}
+/**
+* Map a status to its icon markup: a check mark for "passed", a cross for
+* "failed", and the dotted-circle "skipped" icon for any other value.
+*
+* @param status Status string of a spec or assertion.
+* @returns A `<span class="status-icon …">` element as a string.
+*/
+function statusIcon(status) {
+	switch (status) {
+		case "passed":
+			return `<span class="status-icon pass" aria-label="passed">✓</span>`;
+		case "failed":
+			return `<span class="status-icon fail" aria-label="failed">✕</span>`;
+		default:
+			return `<span class="status-icon skip" aria-label="skipped">◌</span>`;
+	}
+}
+/**
+* Render the assertions of a result as the step list (`<ul class="steps">`).
+* Each item shows the status icon, the escaped assertion name and, when
+* `durationMs` is present (0 included), the formatted duration.
+*
+* @param r Result entry; only `r.assertions` is read.
+* @returns The list markup, or "" when there are no assertions.
+*/
+function renderAssertions(r) {
+	const hasAssertions = Boolean(r.assertions) && r.assertions.length !== 0;
+	if (!hasAssertions) return "";
+	let items = "";
+	for (const assertion of r.assertions) {
+		let timing = "";
+		if (assertion.durationMs != null) timing = `<span class="duration">${formatDuration$1(assertion.durationMs)}</span>`;
+		items += `<li>${statusIcon(assertion.status)}<span class="step-name">${esc(assertion.name)}</span><span class="spacer"></span>${timing}</li>`;
+	}
+	return `<ul class="steps">${items}</ul>`;
+}
+/**
+* Render the analysis block of a failed spec: the predicted label badge,
+* a confidence bar with its percentage, the optional sub-diagnosis (hidden
+* when it is "NONE"), the reasoning text, the evidence list, and the
+* "True cause" grading row (one radio button per FAILURE_LABELS entry plus
+* a free-text note).
+*
+* The radio group name (`label--<index>`) and the note's `data-case-index`
+* carry `index` so the client script can map an input back to its result.
+*
+* The predicted label is HTML-escaped like every other analysis field; it
+* is written into a `class` attribute as well as shown as text, so an
+* unexpected value must not be able to alter the markup.
+*
+* @param r Result entry whose `analysis` is set.
+* @param index Position of `r` in the report's `results` array.
+* @returns HTML markup for the analysis block.
+*/
+function renderAnalysis(r, index) {
+	const a = r.analysis;
+	const pct = Math.round(a.confidence * 100);
+	// Escaped once, reused for the class name and the visible badge text.
+	const label = esc(String(a.label));
+	const evidence = a.evidence.length > 0 ? `<ul class="evidence">${a.evidence.map((e) => `<li>${e.file ? `<code>${esc(e.file)}</code> — ` : ""}${esc(e.detail)}</li>`).join("")}</ul>` : "";
+	return `<div class="analysis">
+  <div class="prediction">
+    <span class="badge ${label}">${label}</span>
+    <span class="confidence" title="confidence"><span class="confidence-bar"><span style="width:${pct}%"></span></span>${pct}%</span>
+    ${a.subDiagnosis && a.subDiagnosis !== "NONE" ? `<span class="sub">${esc(a.subDiagnosis)}</span>` : ""}
+  </div>
+  <p class="reasoning">${esc(a.reasoning)}</p>
+  ${evidence}
+  <div class="truth">
+    <span class="truth-title">True cause</span>
+    ${FAILURE_LABELS.map((label) => `<label class="truth-option ${label}"><input type="radio" name="label--${index}" value="${label}"><span>${label}</span></label>`).join("\n    ")}
+    <input type="text" class="note" placeholder="note (optional)" data-case-index="${index}">
+  </div>
+</div>`;
+}
+/**
+* Placeholder shown for a failed spec that has no analysis. When
+* `r.analysisSkipped` is set, its escaped text is appended after a colon.
+*
+* @param r Result entry; only `r.analysisSkipped` is read.
+* @returns HTML markup for the "analysis skipped" note.
+*/
+function renderSkipped(r) {
+	let reason = "";
+	if (r.analysisSkipped) reason = `: ${esc(r.analysisSkipped)}`;
+	return `<div class="analysis skipped">analysis skipped${reason}</div>`;
+}
+/**
+* Render the drift-audit findings of a result as a collapsible list. Each
+* item shows the severity badge, the category label (plus the step id when
+* present), the message and the optional detail.
+*
+* A category without an entry in DRAFT_CATEGORY_LABEL falls back to the raw
+* category value instead of throwing, so one unexpected finding cannot
+* abort rendering of the whole report. The severity is escaped because it
+* is written into a `class` attribute as well as shown as text.
+*
+* @param r Result entry; only `r.driftIssues` is read.
+* @returns The `<details class="drift">` markup, or "" when there are no issues.
+*/
+function renderDriftIssues(r) {
+	if (!r.driftIssues || r.driftIssues.length === 0) return "";
+	const items = r.driftIssues.map((i) => {
+		const severity = esc(String(i.severity));
+		const category = esc(String(DRAFT_CATEGORY_LABEL[i.category] ?? i.category));
+		const step = i.stepId ? `, step ${esc(i.stepId)}` : "";
+		const detail = i.detail ? ` — ${esc(i.detail)}` : "";
+		return `<li><span class="severity ${severity}">${severity}</span> (${category}${step}) ${esc(i.message)}${detail}</li>`;
+	}).join("");
+	return `<details class="drift"><summary>Spec↔code drift audit (${r.driftIssues.length})</summary><ul>${items}</ul></details>`;
+}
+/**
+* Wrap a block of raw text in a collapsed `<details>` with a `<pre>` body.
+*
+* @param title Summary line (escaped).
+* @param content Text to show (escaped); any falsy value yields no output.
+* @param kind Extra class name appended after `raw`; defaults to "".
+* @returns The markup, or "" when `content` is falsy.
+*/
+function collapsible(title, content, kind = "") {
+	if (content) {
+		const heading = `<summary>${esc(title)}</summary>`;
+		const body = `<pre>${esc(content)}</pre>`;
+		return `<details class="raw ${kind}">${heading}${body}</details>`;
+	}
+	return "";
+}
+/** HTML-significant characters and the entity each one is replaced with. */
+const ESC_MAP = {
+	"&": "&amp;",
+	"<": "&lt;",
+	">": "&gt;",
+	"\"": "&quot;",
+	"'": "&#39;"
+};
+/**
+* Escape a value for safe use in HTML text and quoted attribute values.
+*
+* The value is converted to a string first, so numbers (a numeric run id,
+* for example) are rendered instead of raising a TypeError; `null` and
+* `undefined` become the empty string. Strings behave exactly as before.
+*
+* @param s Value to escape.
+* @returns The escaped string.
+*/
+function esc(s) {
+	return String(s ?? "").replace(/[&<>"']/g, (c) => ESC_MAP[c]);
+}
+/**
+* Format a duration in milliseconds for display: whole milliseconds below
+* one second, seconds with one decimal below one minute, and
+* `<minutes>m <seconds>s` from one minute up.
+*
+* For the minute form the total is rounded to whole seconds before it is
+* split, so the seconds part is always 0–59 (119600 ms is "2m 0s", never
+* "1m 60s").
+*
+* @param ms Duration in milliseconds.
+* @returns Human-readable duration.
+*/
+function formatDuration$1(ms) {
+	if (ms < 1e3) return `${Math.round(ms)}ms`;
+	if (ms < 6e4) return `${(ms / 1e3).toFixed(1)}s`;
+	const totalSeconds = Math.round(ms / 1e3);
+	return `${Math.floor(totalSeconds / 60)}m ${totalSeconds % 60}s`;
+}
+/**
+* Turn an ISO-8601 UTC timestamp into `YYYY-MM-DD HH:MM:SS UTC`: the `T`
+* separator becomes a space and the trailing `Z`, together with any
+* fractional seconds in front of it, becomes ` UTC`. Timestamps without
+* fractional seconds are handled too; a string that does not end in `Z`
+* only has its `T` replaced.
+*
+* @param iso ISO timestamp string.
+* @returns The display form of the timestamp.
+*/
+function formatDate(iso) {
+	return iso.replace("T", " ").replace(/(?:\.\d+)?Z$/, " UTC");
+}
+/**
+* Stylesheet of the report, inlined into the page's `<style>` element by
+* renderRunReport. Colours are CSS custom properties declared on `:root`;
+* the `prefers-color-scheme: dark` block overrides them, which is how the
+* report follows the viewer's light/dark setting without any script.
+*/
+const CSS = `
+:root {
+  color-scheme: light dark;
+  --bg: #f4f5f7;
+  --surface: #ffffff;
+  --surface-2: #f8f9fa;
+  --border: #e1e4e8;
+  --text: #1f2328;
+  --text-dim: #656d76;
+  --accent: #1f6feb;
+  --pass: #1a7f37;
+  --pass-bg: #dafbe1;
+  --fail: #cf222e;
+  --fail-bg: #ffebe9;
+  --skip: #9a6700;
+  --code-bg: #0d1117;
+  --code-text: #e6edf3;
+  --shadow: 0 1px 3px rgba(31, 35, 40, 0.06);
+}
+@media (prefers-color-scheme: dark) {
+  :root {
+    --bg: #0d1117;
+    --surface: #161b22;
+    --surface-2: #1c2129;
+    --border: #30363d;
+    --text: #e6edf3;
+    --text-dim: #8b949e;
+    --accent: #58a6ff;
+    --pass: #3fb950;
+    --pass-bg: rgba(63, 185, 80, 0.15);
+    --fail: #f85149;
+    --fail-bg: rgba(248, 81, 73, 0.15);
+    --skip: #d29922;
+    --code-bg: #010409;
+    --code-text: #e6edf3;
+    --shadow: none;
+  }
+}
+* { box-sizing: border-box; }
+body {
+  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Hiragino Sans", "Noto Sans JP", sans-serif;
+  margin: 0; background: var(--bg); color: var(--text); font-size: 14px;
+}
+header {
+  position: sticky; top: 0; z-index: 10;
+  background: var(--surface); border-bottom: 1px solid var(--border);
+}
+.header-inner { max-width: 1080px; margin: 0 auto; padding: 14px 24px 10px; }
+.header-top { display: flex; align-items: baseline; gap: 18px; flex-wrap: wrap; }
+h1 { font-size: 17px; margin: 0; font-weight: 650; }
+h2 { font-size: 14px; margin: 0; font-weight: 650; }
+.meta { font-size: 12px; color: var(--text-dim); display: flex; gap: 14px; flex-wrap: wrap; }
+.meta code { background: var(--surface-2); border: 1px solid var(--border); padding: 0 5px; border-radius: 4px; font-size: 11px; }
+.dim { color: var(--text-dim); }
+.toolbar { display: flex; align-items: center; gap: 12px; margin-top: 10px; flex-wrap: wrap; }
+.chips { display: flex; gap: 6px; }
+.chip {
+  font: inherit; font-size: 12.5px; font-weight: 600; cursor: pointer;
+  padding: 3px 12px; border-radius: 999px; border: 1px solid var(--border);
+  background: var(--surface); color: var(--text-dim);
+}
+.chip .count { opacity: 0.7; }
+.chip.active { background: var(--text); color: var(--surface); border-color: var(--text); }
+.chip-pass.active { background: var(--pass); border-color: var(--pass); color: #fff; }
+.chip-fail.active { background: var(--fail); border-color: var(--fail); color: #fff; }
+#search {
+  font: inherit; font-size: 13px; flex: 1; min-width: 180px; max-width: 320px; margin-left: auto;
+  padding: 5px 12px; border: 1px solid var(--border); border-radius: 6px;
+  background: var(--surface-2); color: var(--text);
+}
+#search:focus { outline: 2px solid var(--accent); outline-offset: -1px; }
+.page { max-width: 1080px; margin: 16px auto; padding: 0 24px; }
+.panel {
+  background: var(--surface); border: 1px solid var(--border); border-radius: 8px;
+  padding: 14px 18px; margin-bottom: 16px; box-shadow: var(--shadow);
+}
+.panel-head { display: flex; align-items: center; justify-content: space-between; gap: 12px; flex-wrap: wrap; }
+.hint { font-size: 12px; color: var(--text-dim); margin: 6px 0 10px; }
+.spec {
+  background: var(--surface); border: 1px solid var(--border); border-radius: 8px;
+  margin-bottom: 8px; box-shadow: var(--shadow);
+}
+.spec > summary {
+  display: flex; align-items: center; gap: 10px; padding: 10px 16px;
+  cursor: pointer; list-style: none; user-select: none;
+}
+.spec > summary::-webkit-details-marker { display: none; }
+.spec > summary::before {
+  content: "▸"; color: var(--text-dim); font-size: 11px;
+  transition: transform 0.12s ease; flex: 0 0 auto;
+}
+.spec[open] > summary::before { transform: rotate(90deg); }
+.spec-name { font-weight: 600; font-size: 13.5px; }
+.spacer { flex: 1; }
+.counts { font-size: 12px; color: var(--text-dim); }
+.duration { font-size: 12px; color: var(--text-dim); font-variant-numeric: tabular-nums; }
+.status-icon { font-weight: 700; font-size: 13px; flex: 0 0 auto; }
+.status-icon.pass { color: var(--pass); }
+.status-icon.fail { color: var(--fail); }
+.status-icon.skip { color: var(--skip); }
+.spec-body { padding: 2px 16px 12px 36px; border-top: 1px solid var(--border); }
+.steps { list-style: none; margin: 10px 0; padding: 0; }
+.steps li {
+  display: flex; align-items: center; gap: 8px; padding: 3px 8px;
+  font-size: 13px; border-radius: 5px;
+}
+.steps li:hover { background: var(--surface-2); }
+.step-name { overflow-wrap: anywhere; }
+.analysis {
+  border: 1px solid var(--border); border-left: 3px solid var(--accent);
+  border-radius: 6px; background: var(--surface-2);
+  padding: 10px 14px; margin: 10px 0;
+}
+.analysis.skipped { color: var(--text-dim); font-size: 13px; font-style: italic; border-left-color: var(--border); }
+.prediction { display: flex; align-items: center; gap: 12px; flex-wrap: wrap; }
+.badge {
+  font-size: 11.5px; font-weight: 700; letter-spacing: 0.02em;
+  padding: 2px 10px; border-radius: 4px; color: #fff; flex: 0 0 auto;
+}
+.badge.TEST_DRIFT { background: #b45309; }
+.badge.SPEC_CHANGE { background: #1d4ed8; }
+.badge.PRODUCT_BUG { background: #b91c1c; }
+.badge.UNKNOWN { background: #6b7280; }
+.confidence { display: inline-flex; align-items: center; gap: 7px; font-size: 12.5px; font-weight: 600; color: var(--text-dim); }
+.confidence-bar {
+  display: inline-block; width: 64px; height: 6px; border-radius: 999px;
+  background: var(--border); overflow: hidden;
+}
+.confidence-bar > span { display: block; height: 100%; background: var(--accent); border-radius: 999px; }
+.sub { font-size: 11px; background: var(--surface); border: 1px solid var(--border); color: var(--text-dim); padding: 1px 8px; border-radius: 999px; }
+.reasoning { font-size: 13px; margin: 9px 0; white-space: pre-wrap; line-height: 1.55; }
+.evidence { font-size: 12.5px; color: var(--text-dim); margin: 6px 0; padding-left: 18px; line-height: 1.5; }
+.evidence code { background: var(--surface); border: 1px solid var(--border); padding: 0 5px; border-radius: 4px; font-size: 11px; }
+.truth {
+  display: flex; align-items: center; gap: 10px; flex-wrap: wrap;
+  background: var(--surface); border: 1px dashed var(--border); border-radius: 6px;
+  padding: 8px 12px; margin-top: 10px; font-size: 12.5px;
+}
+.truth-title { font-weight: 650; color: var(--text-dim); }
+.truth-option {
+  display: inline-flex; align-items: center; gap: 5px; cursor: pointer;
+  border: 1px solid var(--border); border-radius: 999px; padding: 2px 10px;
+}
+.truth-option:has(input:checked) { border-color: var(--accent); background: var(--surface-2); font-weight: 650; }
+.note { flex: 1; min-width: 150px; font: inherit; font-size: 12px; padding: 4px 9px; border: 1px solid var(--border); border-radius: 5px; background: var(--surface-2); color: var(--text); }
+details.raw, details.drift { margin: 7px 0; font-size: 13px; }
+details.raw summary, details.drift summary { cursor: pointer; color: var(--text-dim); }
+details.raw pre {
+  background: var(--code-bg); color: var(--code-text);
+  font-size: 11.5px; line-height: 1.5; padding: 12px 14px; border-radius: 6px;
+  overflow-x: auto; white-space: pre-wrap; word-break: break-word; margin: 6px 0;
+}
+.severity { font-size: 10.5px; font-weight: 700; padding: 0 6px; border-radius: 4px; margin-right: 4px; }
+.severity.ERROR { background: var(--fail-bg); color: var(--fail); }
+.severity.WARN { background: rgba(212, 167, 44, 0.18); color: var(--skip); }
+.severity.OK { background: var(--pass-bg); color: var(--pass); }
+.drift ul { padding-left: 18px; font-size: 12.5px; line-height: 1.55; }
+table.matrix { border-collapse: collapse; font-size: 12.5px; margin: 10px 16px 10px 0; display: inline-table; vertical-align: top; }
+table.matrix th, table.matrix td { border: 1px solid var(--border); padding: 4px 12px; text-align: center; }
+table.matrix th { background: var(--surface-2); font-weight: 600; }
+table.matrix td { font-variant-numeric: tabular-nums; }
+table.matrix td.hit { background: var(--pass-bg); font-weight: 700; }
+table.matrix td.miss-nonzero { background: var(--fail-bg); }
+.stats { font-size: 13px; }
+.stats .big { font-size: 17px; font-weight: 700; }
+.measure-actions { display: flex; gap: 14px; align-items: center; font-size: 12.5px; }
+.measure-actions button {
+  font: inherit; font-size: 12.5px; padding: 4px 13px; cursor: pointer;
+  border: 1px solid var(--border); border-radius: 6px; background: var(--surface); color: var(--text);
+}
+.measure-actions button:hover { background: var(--surface-2); }
+.import-label { cursor: pointer; color: var(--text-dim); }
+.import-label input { display: none; }
+.empty-note { color: var(--text-dim); text-align: center; font-size: 13px; }
+`;
+/**
+* Browser-side script of the report, emitted inside the final `<script>`
+* element by renderRunReport. FAILURE_LABELS is the only value interpolated
+* into it (serialised with JSON.stringify); the rest is fixed text written
+* with `var` and function expressions that runs in the reader's browser.
+*
+* What the script does, in order:
+* - parses the run data from the `#ccqa-report-data` JSON element;
+* - wires the status chips and the search box to show/hide `.spec` cards;
+* - collects the failed results that carry an analysis as gradable cases;
+* - restores grading state from localStorage (key `ccqa-report:` + runId,
+*   or createdAt when runId is empty) and mirrors it into the inputs;
+* - recomputes accuracy, the predicted-vs-actual matrix and per-class
+*   precision / recall / F1 / support whenever a label changes;
+* - exports the labels as a JSON download and merges labels from an
+*   imported JSON file.
+*
+* Because the script lives in a template literal, a backslash meant for the
+* browser has to be written doubled in this source.
+*/
+const CLIENT_JS = `
+(function () {
+  var dataEl = document.getElementById('ccqa-report-data');
+  if (!dataEl) return;
+  var data = JSON.parse(dataEl.textContent);
+  var LABELS = ${JSON.stringify(FAILURE_LABELS)};
+  var PRED_LABELS = LABELS.concat(['UNKNOWN']);
+  var storageKey = 'ccqa-report:' + (data.runId || data.createdAt);
+  // ---- filtering ------------------------------------------------------
+  var activeFilter = 'all';
+  var searchQuery = '';
+  function applyFilters() {
+    var sections = document.querySelectorAll('.spec');
+    var visible = 0;
+    sections.forEach(function (el) {
+      var statusOk = activeFilter === 'all' || el.getAttribute('data-status') === activeFilter;
+      var name = (el.getAttribute('data-case-id') || '').toLowerCase();
+      var searchOk = !searchQuery || name.indexOf(searchQuery) >= 0;
+      var show = statusOk && searchOk;
+      el.style.display = show ? '' : 'none';
+      if (show) visible++;
+    });
+    var note = document.getElementById('no-match');
+    if (note) note.hidden = visible > 0;
+  }
+  var chips = document.querySelectorAll('#filter-chips .chip');
+  chips.forEach(function (chip) {
+    chip.addEventListener('click', function () {
+      activeFilter = chip.getAttribute('data-filter') || 'all';
+      chips.forEach(function (c) { c.classList.toggle('active', c === chip); });
+      applyFilters();
+    });
+  });
+  var search = document.getElementById('search');
+  if (search) {
+    search.addEventListener('input', function () {
+      searchQuery = search.value.trim().toLowerCase();
+      applyFilters();
+    });
+  }
+  // ---- measurement ----------------------------------------------------
+  // cases: analyzed failures only — they carry a prediction we can grade.
+  var cases = [];
+  for (var i = 0; i < data.results.length; i++) {
+    var r = data.results[i];
+    if (r.status === 'failed' && r.analysis) {
+      cases.push({ index: i, feature: r.feature, spec: r.spec, predicted: r.analysis.label });
+    }
+  }
+  var state = {};
+  try { state = JSON.parse(localStorage.getItem(storageKey) || '{}'); } catch (e) { state = {}; }
+  function save() {
+    try { localStorage.setItem(storageKey, JSON.stringify(state)); } catch (e) {}
+  }
+  function caseKey(c) { return c.feature + '/' + c.spec; }
+  function applyStateToInputs() {
+    cases.forEach(function (c) {
+      var entry = state[caseKey(c)];
+      if (!entry) return;
+      // Guard: only known labels may flow into the attribute selector below
+      // (localStorage is user-controlled; anything else is dropped).
+      if (entry.label && LABELS.indexOf(entry.label) >= 0) {
+        var radio = document.querySelector('input[name="label--' + c.index + '"][value="' + entry.label + '"]');
+        if (radio) radio.checked = true;
+      }
+      var note = document.querySelector('.note[data-case-index="' + c.index + '"]');
+      if (note && entry.note) note.value = entry.note;
+    });
+  }
+  function renderMetrics() {
+    var target = document.getElementById('metrics');
+    if (!target) return;
+    var m = {};
+    PRED_LABELS.forEach(function (p) {
+      m[p] = {};
+      LABELS.forEach(function (a) { m[p][a] = 0; });
+    });
+    var labeled = 0;
+    var correct = 0;
+    cases.forEach(function (c) {
+      var entry = state[caseKey(c)];
+      if (!entry || !entry.label || LABELS.indexOf(entry.label) < 0) return;
+      labeled++;
+      m[c.predicted][entry.label]++;
+      if (c.predicted === entry.label) correct++;
+    });
+    var html = '';
+    html += '<div class="stats"><span class="big">' +
+      (labeled === 0 ? '–' : Math.round((correct / labeled) * 100) + '%') +
+      '</span> accuracy · ' + labeled + ' labeled / ' + cases.length + ' analyzed failures' +
+      (cases.length - labeled > 0 ? ' · <strong>' + (cases.length - labeled) + ' unlabeled</strong>' : '') +
+      '</div>';
+    html += '<table class="matrix"><thead><tr><th>predicted \\\\ actual</th>';
+    LABELS.forEach(function (a) { html += '<th>' + a + '</th>'; });
+    html += '</tr></thead><tbody>';
+    PRED_LABELS.forEach(function (p) {
+      html += '<tr><th>' + p + '</th>';
+      LABELS.forEach(function (a) {
+        var v = m[p][a];
+        var cls = p === a ? 'hit' : (v > 0 ? 'miss-nonzero' : '');
+        html += '<td class="' + cls + '">' + v + '</td>';
+      });
+      html += '</tr>';
+    });
+    html += '</tbody></table>';
+    html += '<table class="matrix"><thead><tr><th>class</th><th>precision</th><th>recall</th><th>F1</th><th>support</th></tr></thead><tbody>';
+    LABELS.forEach(function (cls) {
+      var tp = m[cls][cls];
+      var predictedAs = 0;
+      LABELS.forEach(function (a) { predictedAs += m[cls][a]; });
+      var actualAs = 0;
+      PRED_LABELS.forEach(function (p) { actualAs += m[p][cls]; });
+      var precision = predictedAs > 0 ? tp / predictedAs : null;
+      var recall = actualAs > 0 ? tp / actualAs : null;
+      var f1 = precision !== null && recall !== null && precision + recall > 0
+        ? (2 * precision * recall) / (precision + recall) : null;
+      html += '<tr><th>' + cls + '</th><td>' + fmt(precision) + '</td><td>' + fmt(recall) +
+        '</td><td>' + fmt(f1) + '</td><td>' + actualAs + '</td></tr>';
+    });
+    html += '</tbody></table>';
+    target.innerHTML = html;
+  }
+  function fmt(v) { return v === null ? '–' : (Math.round(v * 100) / 100).toFixed(2); }
+  function findCaseByIndex(index) {
+    for (var i = 0; i < cases.length; i++) {
+      if (cases[i].index === index) return cases[i];
+    }
+    return null;
+  }
+  document.addEventListener('change', function (e) {
+    var t = e.target;
+    if (t && t.name && t.name.indexOf('label--') === 0) {
+      var index = parseInt(t.name.slice('label--'.length), 10);
+      var c = findCaseByIndex(index);
+      if (!c) return;
+      var key = caseKey(c);
+      state[key] = state[key] || {};
+      state[key].label = t.value;
+      save();
+      renderMetrics();
+    }
+  });
+  document.addEventListener('input', function (e) {
+    var t = e.target;
+    if (t && t.classList && t.classList.contains('note')) {
+      var index = parseInt(t.getAttribute('data-case-index'), 10);
+      var c = findCaseByIndex(index);
+      if (!c) return;
+      var key = caseKey(c);
+      state[key] = state[key] || {};
+      state[key].note = t.value;
+      save();
+    }
+  });
+  var exportBtn = document.getElementById('export-labels');
+  if (exportBtn) {
+    exportBtn.addEventListener('click', function () {
+      var labels = [];
+      cases.forEach(function (c) {
+        var entry = state[caseKey(c)];
+        if (!entry || !entry.label) return;
+        var item = { feature: c.feature, spec: c.spec, predicted: c.predicted, label: entry.label };
+        if (entry.note) item.note = entry.note;
+        labels.push(item);
+      });
+      var payload = {
+        schemaVersion: 1,
+        runId: data.runId,
+        promptVersion: data.promptVersion,
+        exportedAt: new Date().toISOString(),
+        labels: labels
+      };
+      var blob = new Blob([JSON.stringify(payload, null, 2)], { type: 'application/json' });
+      var a = document.createElement('a');
+      a.href = URL.createObjectURL(blob);
+      a.download = 'ccqa-labels-' + (data.runId || data.createdAt).replace(/[^A-Za-z0-9_-]/g, '_') + '.json';
+      a.click();
+      URL.revokeObjectURL(a.href);
+    });
+  }
+  var importInput = document.getElementById('import-labels');
+  if (importInput) {
+    importInput.addEventListener('change', function () {
+      var file = importInput.files && importInput.files[0];
+      if (!file) return;
+      var reader = new FileReader();
+      reader.onload = function () {
+        try {
+          var payload = JSON.parse(String(reader.result));
+          (payload.labels || []).forEach(function (item) {
+            var key = item.feature + '/' + item.spec;
+            state[key] = state[key] || {};
+            if (item.label) state[key].label = item.label;
+            if (item.note) state[key].note = item.note;
+          });
+          save();
+          applyStateToInputs();
+          renderMetrics();
+        } catch (e) {
+          alert('Could not parse labels JSON: ' + e.message);
+        }
+      };
+      reader.readAsText(file);
+    });
+  }
+  applyStateToInputs();
+  renderMetrics();
+})();
+`;
 //#endregion
 //#region src/cli/run.ts
+/** Absolute path of `.ccqa/vitest.config.ts`; resolveVitestConfig probes it with access(). */
 const USER_VITEST_CONFIG = resolve(".ccqa/vitest.config.ts");
+/** Report directory used when `--drift-report` is given without an explicit dir. */
+const DEFAULT_REPORT_DIR = "ccqa-report";
 async function resolveVitestConfig() {
 	try {
 		await access(USER_VITEST_CONFIG);
@@ -4797,7 +5840,7 @@ async function resolveVitestConfig() {
 		return bundledVitestConfigPath();
 	}
 }
-const runCommand = addLanguageOption(new Command("run").argument("[target]", "Spec to run: '<feature>/<spec>', '<feature>', or omit for all").description("Run generated agent-browser test scripts. Pass --drift to invoke a Claude-driven drift analysis on each failing spec (skipped silently when no test fails). Requires ANTHROPIC_API_KEY or a local Claude login.").option("--drift", "On vitest failure, run drift analysis on the failing specs").option("--drift-strict", "Treat drift ERROR findings as a run failure (exit 1 even if vitest passed). Implies --drift.").option("--format <fmt>", "Output format for the drift block: text | json | github", "text").option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID. Used by --drift only. Overrides CCQA_MODEL.")).action(async (target, opts) => {
+/**
+* The `run` subcommand. Takes an optional `[target]` argument and the
+* options `--drift-report [dir]`, `--drift-base <ref>` and `-m, --model`;
+* the command is passed through addLanguageOption (defined elsewhere —
+* presumably it attaches the shared output-language option; confirm there).
+* The action simply awaits runTests(target, opts).
+*/
+const runCommand = addLanguageOption(new Command("run").argument("[target]", "Spec to run: '<feature>/<spec>', '<feature>', or omit for all").description("Run generated agent-browser test scripts. Pass --drift-report to also write a self-contained HTML run report: each failing spec gets a drift audit plus a root-cause call (TEST_DRIFT / SPEC_CHANGE / PRODUCT_BUG), and the report lets a human grade the calls to measure their accuracy. Requires ANTHROPIC_API_KEY or a local Claude login for the analysis part.").option("--drift-report [dir]", `Write an HTML run report with drift analysis of failures (default dir: ${DEFAULT_REPORT_DIR}/)`).option("--drift-base <ref>", "Base ref the source diff is taken against for failure analysis (default: GITHUB_BASE_REF, then origin/main)").option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID. Used by --drift-report only. Overrides CCQA_MODEL.")).action(async (target, opts) => {
 	await runTests(target, opts);
 });
 async function runTests(target, opts) {
@@ -4812,6 +5855,7 @@ async function runTests(target, opts) {
 	const summaries = [];
 	let overallExitCode = 0;
 	const vitestConfig = await resolveVitestConfig();
+	const captureOutput = Boolean(opts.driftReport);
 	try {
 		for (let i = 0; i < specs.length; i++) {
 			const { featureName, specName } = specs[i];
@@ -4832,7 +5876,8 @@ async function runTests(target, opts) {
 				"--reporter=json",
 				`--outputFile.json=${reportFile}`
 			]);
-			await Promise.all([streamFiltered(proc.stdout, process.stdout), streamFiltered(proc.stderr, process.stderr)]);
+			const tail = captureOutput ? new TailBuffer(OUTPUT_TAIL_CAP) : null;
+			await Promise.all([streamFiltered(proc.stdout, process.stdout, tail), streamFiltered(proc.stderr, process.stderr, tail)]);
 			const exitCode = await proc.exited;
 			if (exitCode !== 0) overallExitCode = exitCode;
 			const report = await readReport(reportFile);
@@ -4841,12 +5886,13 @@ async function runTests(target, opts) {
 				specName,
 				scriptFile,
 				report,
-				exitCode
+				exitCode,
+				outputTail: tail ? tail.toString() : null
 			});
 			blank();
 		}
 		printSummary(summaries);
-		overallExitCode = await maybeRunDrift(summaries, opts, overallExitCode);
+		await maybeWriteDriftReport(summaries, opts);
 	} finally {
 		await rm(tmpDir, {
 			recursive: true,
@@ -4859,74 +5905,208 @@ function failedSpec(s) {
 	if (s.exitCode !== 0) return true;
 	return (s.report?.numFailedTests ?? 0) > 0;
 }
-function parseDriftFormat(raw) {
-	const v = raw ?? "text";
-	if (v === "text" || v === "json" || v === "github") return v;
-	error(`invalid --format: ${v} (expected text|json|github)`);
-	process.exit(2);
-}
 /**
-* Choose which specs to drift-check. `--drift` is a fail-supplement: only the
-* specs that failed get a drift analysis (the goal is to *explain* a vitest
-* failure). `--drift-strict` is an audit: even passing specs are checked,
-* because the CI need is "fail loud if the spec lags behind the source",
-* which can absolutely happen while vitest is still green against a stale
-* staging environment.
+* Opt-in post-vitest report hook. With `--drift-report`, a self-contained
+* HTML report is ALWAYS written (a green run is still a useful run summary);
+* failing specs additionally get a spec↔code drift audit and a three-way
+* root-cause call with the PR diff as context. The hook never changes the
+* exit code — the run's outcome is determined by vitest alone — and when
+* Claude auth is unavailable only the analysis is skipped, not the report.
 */
-function selectDriftTargets(summaries, opts) {
-	if (opts.driftStrict) return summaries;
-	if (opts.drift) return summaries.filter(failedSpec);
-	return [];
-}
-/**
-* Opt-in post-vitest drift hook. With `--drift`, fires only when at least
-* one spec failed (supplemental signal). With `--drift-strict`, fires
-* unconditionally so a spec/source divergence is caught even when vitest
-* passed. Skips silently when auth is unavailable so the run's exit code
-* is determined by vitest alone.
-*/
-async function maybeRunDrift(summaries, opts, currentExitCode) {
-	const candidates = selectDriftTargets(summaries, opts);
-	if (candidates.length === 0) return currentExitCode;
+async function maybeWriteDriftReport(summaries, opts) { // has no return value; its effect is writing index.html under the report dir
+	if (!opts.driftReport) return; // flag absent: no report and no analysis
+	const outDir = typeof opts.driftReport === "string" ? opts.driftReport : DEFAULT_REPORT_DIR; // a non-string (bare flag) value falls back to the default dir
+	const cwd = process.cwd();
 	const auth = driftAuthAvailable();
-	if (!auth.ok) {
-		info(`drift analysis skipped (${auth.reason})`);
-		return currentExitCode;
+	const failed = summaries.filter(failedSpec);
+	if (!auth.ok && failed.length > 0) info(`failure analysis skipped (${auth.reason})`); // only announced when there are failures that would have been analyzed
+	const baseRef = resolveBaseRef(opts.driftBase);
+	let diff = { // placeholder kept when no spec failed, so the diff is never captured
+		ok: false,
+		error: "diff not captured (no failures)"
+	};
+	if (failed.length > 0) {
+		diff = await capturePrDiff(baseRef, cwd);
+		if (!diff.ok) info(`drift-report: source diff unavailable (${diff.error}) — analyzing without diff context`);
+	}
+	const tree = failed.length > 0 ? await listFeatureTree(cwd) : []; // the feature tree is only consulted for failing specs
+	const specInfoByKey = new Map(tree.flatMap((f) => f.specs.map((sp) => [`${f.featureName}/${sp.specName}`, sp]))); // keyed by "feature/spec"
+	const findSpecInfo = (s) => specInfoByKey.get(`${s.featureName}/${s.specName}`) ?? null;
+	let driftResults = [];
+	if (auth.ok && failed.length > 0) {
+		const targets = failed.map((s) => {
+			const spec = findSpecInfo(s);
+			if (!spec) return null; // not in the feature tree: left out of the drift audit
+			const t = {
+				featureName: s.featureName,
+				specName: s.specName
+			};
+			if (spec.relatedPaths) t.relatedPaths = spec.relatedPaths;
+			if (spec.includedBlocks) t.includedBlocks = spec.includedBlocks;
+			return t;
+		}).filter((t) => t !== null);
+		if (targets.length > 0) driftResults = await analyzeDrift({
+			targets,
+			cwd,
+			blocks: await loadAvailableBlocks(cwd),
+			concurrency: Math.min(3, targets.length), // never asks for more than 3
+			...opts.model ? { model: opts.model } : {},
+			...opts.language ? { language: opts.language } : {},
+			onSpecStart: (t) => info(`drift audit: ${t.featureName}/${t.specName}`)
+		});
 	}
-	const format = parseDriftFormat(opts.format);
-	const cwd = process.cwd();
-	const tree = await listFeatureTree(cwd);
-	const targets = candidates.map((s) => {
-		const spec = tree.find((f) => f.featureName === s.featureName)?.specs.find((sp) => sp.specName === s.specName);
-		if (!spec) return null;
-		const t = {
-			featureName: s.featureName,
-			specName: s.specName
+	const patchSections = diff.ok && diff.diff.patch.length > 0 ? splitPatchByFile(diff.diff.patch) : null; // null when no diff was captured or the patch is empty
+	let printedHeader = false;
+	const results = [];
+	for (const s of summaries) { // one report row per spec, passing or failing
+		const assertions = collectAssertions(s);
+		const base = {
+			feature: s.featureName,
+			spec: s.specName,
+			testCounts: s.report ? {
+				total: s.report.numTotalTests,
+				passed: s.report.numPassedTests,
+				failed: s.report.numFailedTests
+			} : null,
+			durationMs: assertions ? assertions.reduce((sum, a) => sum + (a.durationMs ?? 0), 0) : null, // sum of assertion durations; a missing duration counts as 0
+			assertions
 		};
-		if (spec.relatedPaths) t.relatedPaths = spec.relatedPaths;
-		if (spec.includedBlocks) t.includedBlocks = spec.includedBlocks;
-		return t;
-	}).filter((t) => t !== null);
-	if (targets.length === 0) {
-		info("drift analysis skipped (no spec.yaml found for failing specs)");
-		return currentExitCode;
-	}
-	const results = await analyzeDrift({
-		targets,
-		cwd,
-		blocks: await loadAvailableBlocks(cwd),
-		concurrency: Math.min(3, targets.length),
-		...opts.model ? { model: opts.model } : {},
-		...opts.language ? { language: opts.language } : {},
-		onSpecStart: (t) => {
-			if (format === "text") info(`drift: checking ${t.featureName}/${t.specName}`);
+		if (!failedSpec(s)) { // passing spec: summary row only, every analysis field is null
+			results.push({
+				...base,
+				status: "passed",
+				analysis: null,
+				analysisSkipped: null,
+				driftIssues: null,
+				failureLogExcerpt: null,
+				diffExcerpt: null,
+				specYaml: null
+			});
+			continue;
+		}
+		const specYaml = await tryReadSpecFile(s.featureName, s.specName, cwd);
+		const relatedPaths = findSpecInfo(s)?.relatedPaths ?? null;
+		const diffExcerpt = patchSections ? scopePatchForSpec(patchSections, relatedPaths) : null;
+		const driftResult = driftResults.find((r) => r.target.featureName === s.featureName && r.target.specName === s.specName);
+		const driftIssues = driftResult?.ok ? driftResult.issues : null; // issues are only kept from an audit whose result is ok
+		const failureLog = buildFailureLog(s);
+		let analysis = null;
+		let analysisSkipped = null;
+		if (!auth.ok) analysisSkipped = auth.reason; // no auth: record the reason instead of analyzing
+		else if (specYaml === null) analysisSkipped = "no spec.yaml found for this spec";
+		else {
+			const script = await readScriptSafe(s.scriptFile);
+			info(`failure analysis: ${s.featureName}/${s.specName}`);
+			const outcome = await analyzeFailure({ // awaited inside the loop, so failure analyses run one spec at a time
+				script,
+				specYaml,
+				failureLog,
+				diffPatch: diffExcerpt,
+				changedFiles: diff.ok ? diff.diff.nameStatus : null,
+				baseRef: diff.ok ? baseRef : null,
+				driftIssues,
+				...opts.language ? { outputLanguage: opts.language } : {}
+			}, {
+				...opts.model ? { model: opts.model } : {},
+				cwd
+			});
+			analysis = outcome.analysis;
+			if (!printedHeader) { // the banner is printed once, before the first result line
+				process.stdout.write(`\n${C.cyan}${C.bold}──────── failure analysis ────────${C.reset}\n`);
+				printedHeader = true;
+			}
+			const pct = Math.round(outcome.analysis.confidence * 100); // printed as a percentage, so presumably a 0..1 fraction — NOTE(review): confirm against analyzeFailure
+			const firstLine = outcome.analysis.reasoning.split("\n")[0] ?? ""; // only the first line of the reasoning is echoed to the console
+			process.stdout.write(`${C.red}✖${C.reset} ${C.bold}${s.featureName}/${s.specName}${C.reset} → ${C.bold}${outcome.analysis.label}${C.reset} (${pct}%)${firstLine ? ` ${C.dim}${firstLine}${C.reset}` : ""}\n`);
 		}
+		results.push({
+			...base,
+			status: "failed",
+			analysis,
+			analysisSkipped,
+			driftIssues,
+			failureLogExcerpt: failureLog.length > 0 ? failureLog : null, // an empty log is stored as null
+			diffExcerpt,
+			specYaml
+		});
+	}
+	const data = {
+		schemaVersion: 1,
+		createdAt: (/* @__PURE__ */ new Date()).toISOString(),
+		runId: process.env["GITHUB_RUN_ID"] ?? null,
+		git: {
+			head: diff.ok ? diff.diff.head : null,
+			base: diff.ok ? baseRef : null
+		},
+		model: opts.model ?? null,
+		promptVersion: "2",
+		results
+	};
+	const reportPath = join(outDir, "index.html");
+	await mkdir(outDir, { recursive: true }); // create the report dir (and any missing parents) before writing
+	await writeFile(reportPath, renderRunReport(data), "utf8");
+	info(`run report written to ${reportPath}`);
+}
+function collectAssertions(s) { // Flatten every assertion of the spec's JSON report into one list; null when the summary has no report.
+	if (!s.report) return null;
+	const out = [];
+	for (const file of s.report.testResults) for (const a of file.assertionResults) out.push({
+		name: a.fullName,
+		status: a.status === "passed" || a.status === "failed" ? a.status : "skipped", // any status other than passed/failed is reported as "skipped"
+		durationMs: a.duration ?? null
 	});
-	if (format === "text") process.stdout.write(`\n${C.cyan}${C.bold}──────── drift analysis ────────${C.reset}\n`);
-	process.stdout.write(renderDrift(results, format, cwd));
-	if (opts.driftStrict && determineExitCode(results, "error") !== 0) return currentExitCode || 1;
-	return currentExitCode;
+	return out;
+}
+/**
+* Compose the failure log fed to the analysis prompt and embedded in the
+* report. With `--reporter=json` vitest writes (almost) nothing to
+* stdout/stderr — the assertion failures live in the JSON report — so the
+* structured failureMessages come first and the raw output tail (console
+* logs, agent-browser noise) is appended as secondary context.
+*
+* Returns the collected parts joined with "\n". The result is "" when the
+* summary has no failed assertions and no non-blank output tail.
+*/
+function buildFailureLog(s) {
+	const parts = [];
+	if (s.report) for (const file of s.report.testResults) for (const a of file.assertionResults) {
+		if (a.status !== "failed") continue; // only failed assertions contribute
+		parts.push(`✖ ${a.fullName}`);
+		for (const m of a.failureMessages ?? []) parts.push(m); // failureMessages may be absent
+	}
+	const tail = s.outputTail?.trim(); // outputTail may be null/undefined; a whitespace-only tail is treated as empty
+	if (tail) {
+		parts.push("--- vitest output (tail) ---");
+		parts.push(tail);
+	}
+	return parts.join("\n");
 }
+async function readScriptSafe(path) { // Best-effort read of the file at `path` as UTF-8 text.
+	try {
+		return await readFile(path, "utf8");
+	} catch { // any error from readFile is swallowed and reported as an empty string
+		return "";
+	}
+}
+/** Cap on the per-spec output tail kept for the report / analysis prompt. Passed to TailBuffer, which compares it against string `.length`, so the unit is UTF-16 code units rather than bytes (64 * 1024 = 65536). */
+const OUTPUT_TAIL_CAP = 64 * 1024;
+/**
+* Keeps the LAST `cap` characters appended. Vitest puts the failure summary
+* at the end of its output, so the tail is the part worth keeping when a
+* noisy spec overflows the cap.
+*
+* `append` lets the buffer grow up to twice `cap` before cutting it back to
+* the last `cap` characters (presumably so the slice does not run on every
+* call). `toString` returns at most `cap` characters of content, preceded
+* by a "[...output truncated...]" line when the buffer currently holds more
+* than `cap`.
+*
+* NOTE(review): right after `append` has trimmed the buffer to exactly
+* `cap` characters, `toString` returns it WITHOUT the truncation marker
+* (length <= cap) even though earlier output was dropped. The marker only
+* reappears once more text is appended. Tracking a "truncated" flag would
+* make the marker reliable.
+*/
+var TailBuffer = class {
+	buf = "";
+	cap;
+	constructor(cap) {
+		this.cap = cap;
+	}
+	append(s) {
+		this.buf += s;
+		if (this.buf.length > this.cap * 2) this.buf = this.buf.slice(-this.cap); // trim only once the buffer exceeds twice the cap
+	}
+	toString() {
+		if (this.buf.length <= this.cap) return this.buf; // fits within the cap: returned as-is, no marker
+		return `[...output truncated...]\n${this.buf.slice(-this.cap)}`; // over the cap: marker line followed by the last cap characters
+	}
+};
 async function readReport(path) {
 	try {
 		const raw = await readFile(path, "utf8");
@@ -4998,7 +6178,7 @@ function formatDuration(ms) {
 	return `${(ms / 1e3).toFixed(2)}s`;
 }
 const NOISE_LINE_PATTERNS = [/^JSON report written to /];
-async function streamFiltered(source, sink) {
+async function streamFiltered(source, sink, capture) {
 	source.setEncoding("utf8");
 	let buffer = "";
 	for await (const chunk of source) {
@@ -5007,11 +6187,17 @@ async function streamFiltered(source, sink) {
 		while (nl !== -1) {
 			const line = buffer.slice(0, nl);
 			buffer = buffer.slice(nl + 1);
-			if (!NOISE_LINE_PATTERNS.some((p) => p.test(line))) sink.write(line + "\n");
+			if (!NOISE_LINE_PATTERNS.some((p) => p.test(line))) {
+				sink.write(line + "\n");
+				capture?.append(line + "\n");
+			}
 			nl = buffer.indexOf("\n");
 		}
 	}
-	if (buffer.length > 0 && !NOISE_LINE_PATTERNS.some((p) => p.test(buffer))) sink.write(buffer);
+	if (buffer.length > 0 && !NOISE_LINE_PATTERNS.some((p) => p.test(buffer))) {
+		sink.write(buffer);
+		capture?.append(buffer);
+	}
 }
 async function resolveSpecs(target) {
 	if (!target) return listAllSpecs();
@@ -5397,163 +6583,141 @@ function truncate(s, n) {
 	return s.slice(s.length - n);
 }
 //#endregion
-//#region src/drift/affected.ts
-const execFileP = promisify(execFile);
-/**
-* Resolve the base ref to diff against for `ccqa drift --changed`.
-* Precedence: explicit override > GITHUB_BASE_REF > origin/main.
-*/
-function resolveBaseRef(explicit) {
-	if (explicit && explicit.length > 0) return explicit;
-	const ghBase = process.env["GITHUB_BASE_REF"];
-	if (ghBase && ghBase.length > 0) return ghBase.startsWith("origin/") ? ghBase : `origin/${ghBase}`;
-	return "origin/main";
-}
-/**
-* Run `git diff --name-status base...HEAD` from `cwd` and return one entry per
-* changed file. Renames are reported under their NEW path with status
-* "renamed" — the OLD path is dropped because the spec mapping is against the
-* post-rename layout.
-*
-* Paths are re-rooted to be relative to `cwd`, not the git repo root. In a
-* monorepo where `cwd` is a sub-package (e.g. `apps/foo`), git emits paths
-* relative to the repo root, but specs declare relatedPaths relative to
-* their own package. Changes outside `cwd` are dropped so an unrelated PR
-* can never accidentally scope a sub-package's specs in.
-*/
-async function getChangedFiles(base, cwd) {
-	const [{ stdout: rootOut }, { stdout: diffOut }] = await Promise.all([execFileP("git", ["rev-parse", "--show-toplevel"], { cwd }), execFileP("git", [
-		"diff",
-		"--name-status",
-		"-M",
-		`${base}...HEAD`
-	], {
-		cwd,
-		maxBuffer: 32 * 1024 * 1024
-	})]);
-	return rerootChangedFiles(parseGitDiffOutput(diffOut), rootOut.trim(), cwd);
-}
+//#region src/drift/format.ts
 /**
-* Convert paths in `entries` from git-repo-root relative to `cwd` relative,
-* dropping anything outside `cwd`. Exported for unit tests.
+* Render drift results as a string. The CLI commands and the `run` failure
+* hook are the only callers; both want the formatted output returned so
+* they can prefix / interleave / pipe it as needed.
+*
+* `format` values "json" and "github" select their dedicated renderers; any
+* other value renders as text. `cwd` is only forwarded to the github
+* renderer.
+*
+* NOTE(review): this same diff removes the `run` hook's call to
+* renderDrift, so the "`run` failure hook" caller mentioned above may be
+* stale — confirm against the remaining call sites.
 */
-function rerootChangedFiles(entries, repoRoot, cwd) {
-	const prefix = relative(repoRoot, cwd);
-	if (!prefix) return entries;
-	const out = [];
-	for (const e of entries) {
-		const rel = relative(prefix, e.path);
-		if (rel.startsWith("..") || rel === "") continue;
-		out.push({
-			...e,
-			path: rel
-		});
-	}
-	return out;
+function renderDrift(results, format, cwd) {
+	if (format === "json") return renderJson(results);
+	if (format === "github") return renderGithub(results, cwd);
+	return renderText(results); // fallback for every other format value
 }
-function parseGitDiffOutput(stdout) {
+const HEAVY_RULE = "═".repeat(72); // 72-character rule printed above the totals footer
+function renderText(results) { // Human-readable rendering: one section per result, then a totals footer.
 	const out = [];
-	for (const line of stdout.split("\n")) {
-		if (!line.trim()) continue;
-		const parts = line.split("	");
-		const code = parts[0];
-		if (!code) continue;
-		if (code.startsWith("R")) {
-			const newPath = parts[2];
-			if (newPath) out.push({
-				path: newPath,
-				status: "renamed"
-			});
+	for (const r of results) {
+		out.push("");
+		const heading = `══ ${r.target.featureName}/${r.target.specName} `;
+		const tail = "═".repeat(Math.max(3, 72 - heading.length)); // fill the heading line out to 72 characters, with at least 3 rule characters
+		out.push(`${heading}${tail}`);
+		if (r.error) { // spec-level error: show it instead of any findings
+			out.push(`  ERROR  ${r.error}`);
 			continue;
 		}
-		if (code.startsWith("C")) {
-			const newPath = parts[2];
-			if (newPath) out.push({
-				path: newPath,
-				status: "added"
-			});
+		const errors = r.issues.filter((i) => i.severity === "ERROR");
+		const warnings = r.issues.filter((i) => i.severity === "WARN");
+		const passed = r.issues.filter((i) => i.severity === "OK");
+		if (errors.length === 0 && warnings.length === 0) { // nothing but OK findings (or none at all): one-line summary
+			const label = passed.length === 1 ? "check" : "checks";
+			const detail = passed.length > 0 ? `all ${passed.length} ${label} passed` : "no issues";
+			out.push(`  ✓  ${detail}`);
 			continue;
 		}
-		const path = parts[1];
-		if (!path) continue;
-		switch (code[0]) {
-			case "A":
-				out.push({
-					path,
-					status: "added"
-				});
-				break;
-			case "M":
-			case "T":
-				out.push({
-					path,
-					status: "modified"
-				});
-				break;
-			case "D":
-				out.push({
-					path,
-					status: "deleted"
-				});
-				break;
-			default: out.push({
-				path,
-				status: "modified"
-			});
+		for (const issue of errors) appendFinding(out, "ERROR", issue); // errors are listed before warnings
+		for (const issue of warnings) appendFinding(out, "WARN", issue);
+		if (passed.length > 0) {
+			const names = passed.map((i) => DRAFT_CATEGORY_LABEL[i.category]).join(", "); // DRAFT_CATEGORY_LABEL is defined outside this excerpt
+			out.push("");
+			out.push(`  ✓  passed (${passed.length}): ${names}`);
 		}
 	}
-	return out;
+	out.push("");
+	out.push(HEAVY_RULE);
+	const totals = summarize(results);
+	out.push(`  specs    ${results.length} (${totals.errored} errored)`);
+	out.push(`  findings ${totals.error} error, ${totals.warn} warn, ${totals.ok} ok`);
+	out.push(""); // makes the joined string end with a newline
+	return out.join("\n");
 }
-function stripLeadingDotSlash(s) {
-	return s.startsWith("./") ? s.slice(2) : s;
+function appendFinding(out, level, issue) { // Push one finding onto `out`: blank line, "level  category [stepId]" header, message, optional detail.
+	const stepPart = issue.stepId ? ` ${issue.stepId}` : "";
+	out.push("");
+	out.push(`  ${level}  ${DRAFT_CATEGORY_LABEL[issue.category]}${stepPart}`);
+	out.push(`    ${issue.message}`);
+	if (issue.detail) out.push(`    └ ${issue.detail.replace(/\n/g, "\n      ")}`); // continuation lines of a multi-line detail are indented under the └ marker
 }
-const REGEX_CACHE = /* @__PURE__ */ new Map();
-/** Compiles `pattern` to a RegExp, memoized so repeated `--changed` matches don't re-build. */
-function compileGlob(pattern) {
-	const cached = REGEX_CACHE.get(pattern);
-	if (cached) return cached;
-	const compiled = globToRegExp(stripLeadingDotSlash(pattern));
-	REGEX_CACHE.set(pattern, compiled);
-	return compiled;
+function renderJson(results) { // Serialize results as 2-space-indented JSON of the form { specs: [...] }, with a trailing newline.
+	const payload = { specs: results.map((r) => ({
+		feature: r.target.featureName,
+		spec: r.target.specName,
+		ok: r.ok,
+		...r.error ? { error: r.error } : {}, // "error" key is only present when r.error is truthy
+		issues: r.issues.map((i) => ({
+			severity: i.severity,
+			category: i.category,
+			stepId: i.stepId,
+			message: i.message,
+			...i.detail ? { detail: i.detail } : {} // "detail" key is omitted when empty or missing
+		}))
+	})) };
+	return `${JSON.stringify(payload, null, 2)}\n`;
 }
-function globToRegExp(pattern) {
-	let re = "^";
-	let i = 0;
-	while (i < pattern.length) {
-		const ch = pattern[i];
-		if (ch === "?") {
-			re += "[^/]";
-			i++;
-			continue;
-		}
-		if (ch !== "*") {
-			re += /[.+^${}()|[\]\\]/.test(ch) ? "\\" + ch : ch;
-			i++;
+function renderGithub(results, cwd) { // Emit one "::error" / "::warning" line per spec-level error and per non-OK finding.
+	const repoRoot = process.env["GITHUB_WORKSPACE"] ?? process.cwd(); // annotation paths are made relative to GITHUB_WORKSPACE when it is set
+	const lines = [];
+	for (const r of results) {
+		const file = githubRelPath(cwd, repoRoot, r.target.featureName, r.target.specName);
+		if (r.error) { // spec-level error: a single ::error line without a title
+			lines.push(`::error file=${file}::${escapeGhMessage(r.error)}`);
 			continue;
 		}
-		if (pattern[i + 1] !== "*") {
-			re += "[^/]*";
-			i++;
-			continue;
+		for (const issue of r.issues) {
+			if (issue.severity === "OK") continue; // OK findings produce no output
+			const level = issue.severity === "ERROR" ? "error" : "warning"; // every non-ERROR, non-OK severity is emitted as a warning
+			const title = `${r.target.featureName}/${r.target.specName} — ${issue.category}${issue.stepId ? ` (${issue.stepId})` : ""}`;
+			const body = issue.detail ? `${issue.message}\n${issue.detail}` : issue.message;
+			lines.push(`::${level} file=${file},title=${escapeGhProp(title)}::${escapeGhMessage(body)}`);
 		}
-		const hasLeadingSlash = re.endsWith("/");
-		const hasTrailingSlash = pattern[i + 2] === "/";
-		if (hasLeadingSlash) re = re.slice(0, -1);
-		if (hasLeadingSlash || hasTrailingSlash) re += "(?:/?.*)?";
-		else re += ".*";
-		i += hasTrailingSlash ? 3 : 2;
 	}
-	return new RegExp(re + "$");
+	return lines.length === 0 ? "" : `${lines.join("\n")}\n`; // empty string (no newline) when there is nothing to report
+}
+function githubRelPath(cwd, repoRoot, featureName, specName) { // Location of the spec's spec.yaml under cwd/.ccqa/features, expressed relative to repoRoot.
+	const abs = resolve(cwd, ".ccqa", "features", featureName, "test-cases", specName, "spec.yaml");
+	const rel = relative(repoRoot, abs);
+	return rel.startsWith("..") ? abs : rel; // a relative path starting with ".." (outside repoRoot) falls back to the absolute path
+}
+function escapeGhMessage(s) { // Percent-encode "%", CR and LF so the text stays on one output line.
+	return s.replace(/%/g, "%25").replace(/\r/g, "%0D").replace(/\n/g, "%0A"); // "%" is replaced first so the escapes added afterwards are not escaped again
+}
+function escapeGhProp(s) { // Same replacements as escapeGhMessage, plus "," and ":"; used for the title= property value.
+	return s.replace(/%/g, "%25").replace(/\r/g, "%0D").replace(/\n/g, "%0A").replace(/,/g, "%2C").replace(/:/g, "%3A"); // "%" is replaced first so the escapes added afterwards are not escaped again
+}
+function summarize(results) { // Tally findings by severity across all results, plus the number of results carrying a spec-level error.
+	let error = 0;
+	let warn = 0;
+	let ok = 0;
+	let errored = 0;
+	for (const r of results) {
+		if (r.error) errored++;
+		for (const issue of r.issues) if (issue.severity === "ERROR") error++; // NOTE(review): r.issues is read even for errored results (renderText skips them) — assumes it is always an array, confirm
+		else if (issue.severity === "WARN") warn++;
+		else ok++; // any severity other than ERROR/WARN is counted as ok
+	}
+	return {
+		error,
+		warn,
+		ok,
+		errored
+	};
 }
+//#endregion
+//#region src/drift/exit-code.ts
 /**
-* Returns true if `changedPath` is covered by any of `relatedPaths`. An empty
-* `relatedPaths` returns false — callers handle the "unscoped spec" case
-* separately (treat the spec as always-affected) before calling this.
+* Map drift results to an exit code. Spec-level errors (Claude call failed)
+* always fail; otherwise ERROR severity always fails, WARN fails only when
+* the threshold is `warn`.
+*
+* Returns 1 as soon as the first failing result or issue is found, and 0
+* when the scan finds none (including for an empty `results` array).
 */
-function isPathAffectedBy(changedPath, relatedPaths) {
-	const stripped = stripLeadingDotSlash(changedPath);
-	for (const pattern of relatedPaths) if (compileGlob(pattern).test(stripped)) return true;
-	return false;
+function determineExitCode(results, threshold) {
+	for (const r of results) {
+		if (r.error) return 1; // checked before the result's issues are looked at
+		for (const issue of r.issues) {
+			if (issue.severity === "ERROR") return 1;
+			if (threshold === "warn" && issue.severity === "WARN") return 1; // any threshold value other than "warn" ignores WARN findings
+		}
+	}
+	return 0;
 }
 //#endregion
 //#region src/drift/route-new-files.ts