npm - ccqa - Versions diffs - 0.3.4 → 0.3.6 - Mend

ccqa 0.3.4 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/README.md +20 -1
package/dist/bin/ccqa.mjs +1130 -180
package/dist/package.json +1 -1
package/dist/runtime/test-helpers.mjs +29 -7
package/package.json +1 -1

package/dist/bin/ccqa.mjs CHANGED Viewed

@@ -8,6 +8,7 @@ import { delimiter, dirname, join, resolve } from "node:path";
 import { query } from "@anthropic-ai/claude-agent-sdk";
 import matter from "gray-matter";
 import { spawn } from "node:child_process";
+import { createInterface } from "node:readline";
 import { tmpdir } from "node:os";
 //#region src/prompts/trace.ts
 function generateSessionName() {
@@ -285,14 +286,17 @@ const STEP_ICONS = {
 function header(command, target) {
 	process.stdout.write(`\nccqa ${command}${target ? ` ${target}` : ""}\n\n`);
 }
/**
* Emit one `[scope] message` line to the given sink (stdout by default).
* The scope tag is what users grep for in the log output.
*/
function write(scope, message, sink = process.stdout) {
	const line = `[${scope}] ${message}\n`;
	sink.write(line);
}
 function meta(key, value) {
-	process.stdout.write(`  ${key}: ${value}\n`);
+	write("meta", `${key}: ${value}`);
 }
 function blank() {
 	process.stdout.write("\n");
 }
 function info(message) {
-	process.stdout.write(`${message}\n`);
+	write("info", message);
 }
 function step(type, stepId, detail) {
 	process.stdout.write(`  ${STEP_ICONS[type]} [${stepId}] ${detail}\n`);
@@ -301,13 +305,37 @@ function bash(command) {
 	process.stdout.write(`  $ ${command.slice(0, 120)}\n`);
 }
 function error(message) {
-	process.stderr.write(`error: ${message}\n`);
+	write("error", message, process.stderr);
 }
 function warn(message) {
-	process.stderr.write(`warn: ${message}\n`);
+	write("warn", message, process.stderr);
 }
 function hint(message) {
-	process.stdout.write(`\nhint: ${message}\n`);
+	process.stdout.write("\n");
+	write("hint", message);
+}
/**
* Log a message under the "fix" scope.
*/
function fix(message) {
	const scope = "fix";
	write(scope, message);
}
/**
* Log a message under the "run" scope.
*/
function run(message) {
	const scope = "run";
	write(scope, message);
}
/**
* Run `fn` as a named phase and bracket it with log markers under `scope`:
* `<label> started` before, then either `<label> finished in N.Ns` on
* success or `<label> threw after N.Ns` on failure.
*
* Resolves with whatever `fn` resolves with; an error thrown inside the
* phase is logged and then rethrown unchanged. `scope` defaults to "fix".
*/
async function timedPhase(label, fn, scope = "fix") {
	const begin = Date.now();
	// Elapsed wall-clock seconds since the phase began, to one decimal place.
	const elapsed = () => ((Date.now() - begin) / 1e3).toFixed(1);
	write(scope, `${label} started`);
	try {
		const outcome = await fn();
		write(scope, `${label} finished in ${elapsed()}s`);
		return outcome;
	} catch (failure) {
		write(scope, `${label} threw after ${elapsed()}s`);
		throw failure;
	}
}
 //#endregion
 //#region src/claude/invoke.ts
@@ -488,13 +516,27 @@ const CCQA_DIR = ".ccqa";
 function getCcqaDir(cwd = process.cwd()) {
 	return join(cwd, CCQA_DIR);
 }
/**
* Accepts both the canonical 2-segment alias and the on-disk 4-segment path
* (which is what shell tab-completion produces):
*   - "tasks/create-and-complete"
*   - "features/tasks/test-cases/create-and-complete"
*   - ".ccqa/features/tasks/test-cases/create-and-complete"
* All forms resolve to { featureName: "tasks", specName: "create-and-complete" }.
*
* Empty segments (leading, trailing or doubled slashes) and "." segments are
* ignored wherever they appear. A ".." segment is always rejected: it can
* never name a feature or spec, and accepting it would let the resolved
* directory escape the features tree.
*
* @param {string} specPath Spec identifier as typed on the command line.
* @returns {{ featureName: string, specName: string }}
* @throws {Error} When the path matches neither accepted form.
*/
function parseSpecPath(specPath) {
	const invalid = () => new Error(`Invalid spec path: "${specPath}". Expected "<feature>/<spec>" or "features/<feature>/test-cases/<spec>".`);
	const parts = specPath.split("/").filter((segment) => segment.length > 0 && segment !== ".");
	if (parts.includes("..")) throw invalid();
	if (parts[0] === ".ccqa") parts.shift();
	if (parts.length === 4 && parts[0] === "features" && parts[2] === "test-cases") return {
		featureName: parts[1],
		specName: parts[3]
	};
	if (parts.length === 2) return {
		featureName: parts[0],
		specName: parts[1]
	};
	throw invalid();
}
 function getFeatureDir(featureName, cwd) {
 	return join(getCcqaDir(cwd), "features", featureName);
@@ -706,10 +748,10 @@ function bundledVitestConfigPath() {
 }
 //#endregion
 //#region src/runtime/spawn-vitest.ts
-const require$1 = createRequire(import.meta.url);
+const require$2 = createRequire(import.meta.url);
 function resolveVitestBin() {
-	const pkgPath = require$1.resolve("vitest/package.json");
-	const pkg = require$1(pkgPath);
+	const pkgPath = require$2.resolve("vitest/package.json");
+	const pkg = require$2(pkgPath);
 	const binRel = typeof pkg.bin === "string" ? pkg.bin : pkg.bin?.vitest;
 	if (!binRel) throw new Error(`vitest package.json has no bin entry (resolved at ${pkgPath})`);
 	return resolve(dirname(pkgPath), binRel);
@@ -727,6 +769,19 @@ async function spawnVitestCaptured(args, opts = {}) {
 		stderr
 	};
 }
/**
* Spawn vitest with piped stdio, echoing its stdout/stderr to this process's
* stdout/stderr while also capturing them. Both streams are drained
* concurrently with waiting for the child to exit.
*
* Resolves to `{ exitCode, stdout, stderr }`.
*/
async function spawnVitestTeed(args, opts = {}) {
	const child = spawnVitestChild(args, opts, "pipe");
	const pendingStdout = teeDrain(child.stdout, process.stdout);
	const pendingStderr = teeDrain(child.stderr, process.stderr);
	const pendingExit = waitExit(child);
	const [stdout, stderr, exitCode] = await Promise.all([pendingStdout, pendingStderr, pendingExit]);
	return { exitCode, stdout, stderr };
}
 function spawnVitestStreaming(args, opts = {}) {
 	const child = spawnVitestChild(args, opts, "pipe");
 	return {
@@ -754,6 +809,15 @@ async function drain(stream) {
 	for await (const chunk of stream) buf += chunk;
 	return buf;
 }
/**
* Read `stream` to its end as UTF-8 text, forwarding every chunk to `sink`
* as it arrives, and resolve with the full concatenated text.
*/
async function teeDrain(stream, sink) {
	stream.setEncoding("utf8");
	const pieces = [];
	for await (const piece of stream) {
		pieces.push(piece);
		sink.write(piece);
	}
	return pieces.join("");
}
 function waitExit(child) {
 	return new Promise((resolvePromise, rejectPromise) => {
 		child.once("exit", (code) => resolvePromise(code ?? 0));
@@ -762,7 +826,7 @@ function waitExit(child) {
 }
 //#endregion
 //#region src/runtime/agent-browser-bin.ts
-const require = createRequire(import.meta.url);
+const require$1 = createRequire(import.meta.url);
 /**
 * Resolves the directory containing the `agent-browser` shim that npm/pnpm
 * exposes on PATH for the peer-installed package. Used by `ccqa trace` to
@@ -774,7 +838,7 @@ const require = createRequire(import.meta.url);
 function resolveAgentBrowserBinDir() {
 	let pkgJsonPath;
 	try {
-		pkgJsonPath = require.resolve("agent-browser/package.json");
+		pkgJsonPath = require$1.resolve("agent-browser/package.json");
 	} catch {
 		return null;
 	}
@@ -793,8 +857,58 @@ function pathWithAgentBrowserShim(currentPath) {
 	return dir + delimiter + path;
 }
 //#endregion
+//#region src/runtime/env-vars.ts
/**
* Matches one environment reference: `${VAR}` (name in capture group 1) or
* `$VAR` (name in capture group 2). Names are upper-case identifiers.
*
* The regex is global, so `.test()` / `.exec()` would carry `lastIndex`
* between calls. The helpers below only use APIs that do not depend on or
* leak that state (`search`, `replace`, and `matchAll` on a fresh copy).
*/
const ENV_VAR_RE = /\$\{([A-Z_][A-Z0-9_]*)\}|\$([A-Z_][A-Z0-9_]*)/g;
/**
* Returns true if the value contains at least one `$VAR` or `${VAR}` reference.
*/
function hasEnvRef(value) {
	// search() always scans from index 0 and leaves lastIndex untouched.
	return value.search(ENV_VAR_RE) !== -1;
}
/**
* Resolve every `$VAR` / `${VAR}` reference against the current process env.
*
* Missing variables expand to the empty string, mirroring `sh` behaviour.
* Throwing would force ccqa to be invoked with every var set even for
* unused setups, which is more user-hostile than letting the test fail
* downstream with a clearer message ("login form rejected: empty password").
*/
function resolveEnvRefs(value) {
	return value.replace(ENV_VAR_RE, (_, braced, plain) => {
		const name = braced ?? plain ?? "";
		return process.env[name] ?? "";
	});
}
/**
* Escape literal text for embedding inside a template literal: backslashes,
* backticks and `${` must not be interpreted by the generated script.
*/
function escapeTemplateText(text) {
	return text.replace(/\\/g, "\\\\").replace(/`/g, "\\`").replace(/\$\{/g, "\\${");
}
/**
* Embed `$VAR` / `${VAR}` as a JS template-literal expression that reads
* `process.env.VAR ?? ""` at runtime. Used by `ccqa generate` so the test
* script never bakes in the secret value.
*
* Returns a JavaScript string-literal expression (template literal when env
* refs are present, plain string literal otherwise).
*
* The value is walked once: text between references is escaped, each
* reference becomes an interpolation.
*
* Examples:
*   "${PASSWORD}"             -> '`${process.env.PASSWORD ?? ""}`'
*   "user-${SUFFIX}@x.com"    -> '`user-${process.env.SUFFIX ?? ""}@x.com`'
*   "literal value"           -> '"literal value"'
*/
function envRefsToJsExpression(value) {
	if (!hasEnvRef(value)) return JSON.stringify(value);
	// Fresh copy: matchAll starts from the regex's lastIndex, so never hand
	// it the shared instance.
	const matcher = new RegExp(ENV_VAR_RE.source, ENV_VAR_RE.flags);
	let body = "";
	let cursor = 0;
	for (const match of value.matchAll(matcher)) {
		body += escapeTemplateText(value.slice(cursor, match.index));
		body += `\${process.env.${match[1] ?? match[2] ?? ""} ?? ""}`;
		cursor = match.index + match[0].length;
	}
	body += escapeTemplateText(value.slice(cursor));
	return `\`${body}\``;
}
+//#endregion
 //#region src/cli/trace.ts
// `ccqa trace <feature/spec>`: parse the spec id, then hand off to runTrace.
const traceCommand = new Command("trace");
traceCommand.argument("<feature/spec>", "Spec id in '<feature>/<spec>' form (resolves to .ccqa/features/<feature>/test-cases/<spec>/)");
traceCommand.description("Run agent-browser, verify assertions, and record structured actions");
traceCommand.action(async (specPath) => {
	const parsed = parseSpecPath(specPath);
	await runTrace(parsed.featureName, parsed.specName);
});
@@ -892,8 +1006,8 @@ async function runSetups(setups, sessionName) {
 		let script = await readFile(scriptPath, "utf-8").catch(() => {
 			throw new Error(`Setup test script not found: ${scriptPath}. Run \`ccqa generate-setup ${ref.name}\` first.`);
 		});
-		for (const [key, value] of Object.entries(ref.params ?? {})) script = script.replaceAll(`{{${key}}}`, value);
-		script = script.replace(/process\.env\.AGENT_BROWSER_SESSION\s*=\s*`.+`;/, `process.env.AGENT_BROWSER_SESSION = ${JSON.stringify(sessionName)};`);
+		for (const [key, value] of Object.entries(ref.params ?? {})) script = script.replaceAll(`{{${key}}}`, resolveEnvRefs(value));
+		script = script.replace(/process\.env\.AGENT_BROWSER_SESSION\s*\|?\|?=\s*`.+`;/, `process.env.AGENT_BROWSER_SESSION = ${JSON.stringify(sessionName)};`);
 		const tmpPath = join(getSetupDir(ref.name), `_run.spec.ts`);
 		await writeFile(tmpPath, script, "utf-8");
 		try {
@@ -1012,8 +1126,10 @@ function actionsToScript(actions, title, setupScripts) {
 		`import { spawnSync } from "node:child_process";`,
 		`import { ab, abWait, abAssertTextVisible, abAssertVisible, abAssertNotVisible, abAssertUrl, abAssertEnabled, abAssertDisabled, abAssertChecked, abAssertUnchecked } from "ccqa/test-helpers";`,
 		"",
-		`// Single session shared across all tests — reset per run via cookies clear in first test`,
-		`process.env.AGENT_BROWSER_SESSION = \`ccqa-run-\${Date.now()}\`;`,
+		`// Single session shared across all tests — reset per run via cookies clear in first test.`,
+		`// Use ||= so an outer harness (e.g. ccqa generate's auto-fix loop) can pre-set the session`,
+		`// name and inspect the same session after the run finishes.`,
+		`process.env.AGENT_BROWSER_SESSION ||= \`ccqa-run-\${Date.now()}\`;`,
 		""
 	]];
 	if (setupScripts?.length) for (const setup of setupScripts) parts.push(`test("setup: ${setup.name}", () => {`, setup.body, "}, 3 * 60 * 1000);", "");
@@ -1118,43 +1234,6 @@ function actionToLine(action) {
 const j = (s) => JSON.stringify(s);
 //#endregion
 //#region src/prompts/codegen.ts
-function buildAutoFixPrompt(script, failureLog) {
-	return `You are analyzing a failing E2E test script. The test fails because some browser actions execute before the page has finished loading or navigating.
-Your task: identify which line numbers need a sleep/wait inserted BEFORE them to fix timing issues.
-## Rules
-- ONLY identify lines where a sleep is needed — do NOT suggest any other changes
-- Common patterns that need a sleep:
-  - After \`ab("open", ...)\` when the next line interacts with elements (fill, click, etc.)
-  - After \`ab("press", "Enter")\` or \`ab("click", ...)\` when a page navigation occurs before the next action
-  - After any action that triggers a redirect or page reload
-- Look at the error log to identify WHICH lines failed, then determine if a sleep before that line would fix it
-- If a \`spawnSync("sleep", ...)\` already exists before a failing line, suggest increasing its duration instead
-- Output ONLY a JSON array of objects, no explanation, no markdown code fences
-## Output format
-Each object has:
-- "line": the 1-based line number to insert a sleep BEFORE
-- "seconds": recommended sleep duration (typically 3-5)
-- "reason": very short explanation (e.g., "page navigation after form submit")
-If a sleep already exists and needs to be increased:
-- "line": the line number of the existing sleep
-- "increase_to": the new duration in seconds
-- "reason": explanation
-Example output:
-[{"line": 15, "seconds": 3, "reason": "page navigation after press Enter"}, {"line": 22, "increase_to": 5, "reason": "slow page load"}]
-If no fixes are needed, return: []
-## Test Script (with line numbers)
-${script.split("\n").map((l, i) => `${i + 1}: ${l}`).join("\n")}
-## Failure Log
-${failureLog.slice(0, 3e3)}`;
-}
 function buildCleanupPrompt(actions) {
 	return `You are given a list of browser actions recorded during an E2E test trace.
 The trace contains noise: failed attempts, redundant retries, and duplicate operations recorded because the agent explored multiple strategies.
@@ -1185,54 +1264,890 @@ ${actions.map((a, i) => {
 	}).join("\n")}`;
 }
 //#endregion
+//#region src/diagnose/apply.ts
/**
* Apply a diagnosis to the test script by dispatching on its type.
*
* TIMING_ISSUE, OVER_ASSERTION and SELECTOR_DRIFT are delegated to their
* appliers. DATA_MISSING and UNKNOWN are never auto-fixable and come back as
* `{ applied: false, reason }`. Any other type is also reported as not
* applied, so callers always receive an object to inspect.
*
* @param {string} script Current test script source.
* @param {object} diagnosis Diagnosis with a `type` discriminator.
* @returns Either `{ applied: true, script, summary }` or `{ applied: false, reason }`.
*/
function applyDiagnosis(script, diagnosis) {
	switch (diagnosis.type) {
		case "TIMING_ISSUE": return applyTiming(script, diagnosis.fixes);
		case "OVER_ASSERTION": return applyOverAssertion(script, diagnosis.lines);
		case "SELECTOR_DRIFT": return applySelectorDrift(script, diagnosis.line, diagnosis.oldSelector, diagnosis.newSelector);
		case "DATA_MISSING": return {
			applied: false,
			reason: `data missing — ${diagnosis.reason}`
		};
		case "UNKNOWN": return {
			applied: false,
			reason: `unknown failure — ${diagnosis.reason}`
		};
		default: return {
			applied: false,
			reason: `unsupported diagnosis type — ${String(diagnosis.type)}`
		};
	}
}
/**
* Apply timing fixes to the script.
*
* "increase" fixes rewrite the duration of an existing
* `spawnSync("sleep", ["N"]` call on the given line. "insert" fixes add a new
* sleep line before the given line. All line numbers are 1-based and refer
* to the script as passed in: increases are applied in place first, then
* inserts are applied bottom-up so earlier inserts do not shift later ones.
*
* Fixes with a non-integer or out-of-range line, or a duration that is not
* a positive finite number, are skipped rather than applied blindly.
*
* @param {string} script Current test script source.
* @param {Array<object>} fixes Normalised fixes (`kind`, `line`, `seconds` / `increase_to`).
* @returns Either `{ applied: true, script, summary }` or `{ applied: false, reason }`.
*/
function applyTiming(script, fixes) {
	if (fixes.length === 0) return {
		applied: false,
		reason: "no timing fixes proposed"
	};
	const lines = script.split("\n");
	const summary = [];
	const isDuration = (value) => Number.isFinite(value) && value > 0;
	for (const entry of fixes) {
		if (entry.kind !== "increase") continue;
		const idx = entry.line - 1;
		// Number.isInteger also rejects NaN, which would slip past the range checks.
		if (!Number.isInteger(idx) || idx < 0 || idx >= lines.length) continue;
		if (!isDuration(entry.increase_to)) continue;
		const original = lines[idx];
		const replaced = original.replace(/spawnSync\("sleep",\s*\["\d+"\]/, `spawnSync("sleep", ["${entry.increase_to}"]`);
		if (replaced !== original) {
			lines[idx] = replaced;
			summary.push(`line ${entry.line}: sleep → ${entry.increase_to}s`);
		}
	}
	const inserts = fixes.filter((f) => f.kind === "insert" && Number.isInteger(f.line) && isDuration(f.seconds)).sort((a, b) => b.line - a.line);
	for (const entry of inserts) {
		const idx = entry.line - 1;
		// idx === lines.length is allowed: it appends after the last line.
		if (idx < 0 || idx > lines.length) continue;
		lines.splice(idx, 0, `  spawnSync("sleep", ["${entry.seconds}"], { stdio: "inherit" });`);
		summary.push(`line ${entry.line}: insert sleep ${entry.seconds}s`);
	}
	if (summary.length === 0) return {
		applied: false,
		reason: "timing fixes pointed at out-of-range lines"
	};
	return {
		applied: true,
		script: lines.join("\n"),
		summary: summary.join("; ")
	};
}
/**
* Remove the given 1-based lines from the script, but only those that
* actually contain an `abAssert` call. Duplicate and out-of-range line
* numbers are ignored.
*
* Returns `{ applied: true, script, summary }` when at least one line was
* removed, otherwise `{ applied: false, reason }`.
*/
function applyOverAssertion(script, lineNumbers) {
	if (lineNumbers.length === 0) return {
		applied: false,
		reason: "no lines to remove"
	};
	const lines = script.split("\n");
	// Collect the 0-based indices to drop, then filter once.
	const doomed = new Set();
	for (const lineNumber of new Set(lineNumbers)) {
		const idx = lineNumber - 1;
		if (idx < 0 || idx >= lines.length) continue;
		if (!/abAssert/.test(lines[idx])) continue;
		doomed.add(idx);
	}
	if (doomed.size === 0) return {
		applied: false,
		reason: "no abAssert lines matched the proposed line numbers"
	};
	const kept = lines.filter((_, idx) => !doomed.has(idx));
	return {
		applied: true,
		script: kept.join("\n"),
		summary: `removed ${doomed.size} assertion(s)`
	};
}
/**
* Replace every occurrence of `oldSelector` with `newSelector` on one
* 1-based line of the script.
*
* The replacement is inserted literally. Selectors may legitimately contain
* `$` followed by `'`, `&`, `$` or a digit (e.g. `[title='cost$']`), which a
* plain replacement string would expand as a substitution pattern.
*
* @param {string} script Current test script source.
* @param {number} line 1-based line number; non-integers count as out of range.
* @param {string} oldSelector Exact text expected on that line.
* @param {string} newSelector Text to put in its place.
* @returns Either `{ applied: true, script, summary }` or `{ applied: false, reason }`.
*/
function applySelectorDrift(script, line, oldSelector, newSelector) {
	const lines = script.split("\n");
	const idx = line - 1;
	if (!Number.isInteger(idx) || idx < 0 || idx >= lines.length) return {
		applied: false,
		reason: `line ${line} out of range`
	};
	const content = lines[idx];
	if (!content.includes(oldSelector)) return {
		applied: false,
		reason: `oldSelector not found on line ${line}`
	};
	// A function replacer bypasses `$`-pattern expansion.
	lines[idx] = content.replaceAll(oldSelector, () => newSelector);
	return {
		applied: true,
		script: lines.join("\n"),
		summary: `line ${line}: "${oldSelector}" → "${newSelector}"`
	};
}
/**
* Build a unified-style diff snippet for showing the user what would change.
* Just the changed lines with -/+ prefixes; not a real patch.
*
* Lines are aligned with a longest-common-subsequence pass after trimming
* the shared head and tail, so inserting or removing a line reports only
* that line rather than everything below it.
*
* @param {string} before Original script text.
* @param {string} after Modified script text.
* @returns {string} Changed lines joined by "\n" ("" when nothing differs).
*/
function previewDiff(before, after) {
	const a = before.split("\n");
	const b = after.split("\n");
	// Trim the common prefix and suffix so the LCS table only covers the
	// region that actually differs.
	let head = 0;
	while (head < a.length && head < b.length && a[head] === b[head]) head++;
	let endA = a.length;
	let endB = b.length;
	while (endA > head && endB > head && a[endA - 1] === b[endB - 1]) {
		endA--;
		endB--;
	}
	const removed = a.slice(head, endA);
	const added = b.slice(head, endB);
	// lcs[i][j] = length of the longest common subsequence of removed[i..] and added[j..].
	const lcs = Array.from({ length: removed.length + 1 }, () => new Array(added.length + 1).fill(0));
	for (let i = removed.length - 1; i >= 0; i--) {
		for (let j = added.length - 1; j >= 0; j--) {
			lcs[i][j] = removed[i] === added[j] ? lcs[i + 1][j + 1] + 1 : Math.max(lcs[i + 1][j], lcs[i][j + 1]);
		}
	}
	const out = [];
	let i = 0;
	let j = 0;
	while (i < removed.length && j < added.length) {
		if (removed[i] === added[j]) {
			i++;
			j++;
		} else if (lcs[i + 1][j] >= lcs[i][j + 1]) out.push(`- ${removed[i++]}`);
		else out.push(`+ ${added[j++]}`);
	}
	while (i < removed.length) out.push(`- ${removed[i++]}`);
	while (j < added.length) out.push(`+ ${added[j++]}`);
	return out.join("\n");
}
+//#endregion
+//#region src/diagnose/prompt.ts
/**
* Assemble the diagnose prompt for a failing test.
*
* Reads `script`, `specMarkdown`, `actions`, `failureLog`, `pageSnapshot`
* and `outputLanguage` (default "en") from `input`. The script is embedded
* with 1-based line numbers, each recorded action is summarised on one
* line, the failure log is cut to its first 4000 characters, and the page
* snapshot section is appended only when a snapshot is provided.
*/
function buildDiagnosePrompt(input) {
	const { script, specMarkdown, actions, failureLog, pageSnapshot, outputLanguage = "en" } = input;
	const numbered = script.split("\n").map((text, index) => `${index + 1}: ${text}`).join("\n");
	// One line per recorded action; optional fields only when present.
	const describeAction = (a, i) => {
		const parts = [`${i + 1}. ${a.command}`];
		if (a.assertType) parts.push(`assertType="${a.assertType}"`);
		if (a.selector) parts.push(`selector="${a.selector}"`);
		if (a.value) parts.push(`value="${a.value}"`);
		if (a.observation) parts.push(`→ ${a.observation}`);
		return parts.join(" ");
	};
	const recordedActions = actions.map(describeAction).join("\n");
	const snapshotSection = pageSnapshot ? formatPageSnapshot(pageSnapshot) : "";
	return `You are diagnosing a failing E2E test. The test was generated from a recorded trace of the original interaction. Compare the failing run against the original spec and recorded actions to determine WHY the test failed and what the right fix is.
## Output language
Write all human-readable fields (\`reasoning\`, \`reason\`) in **${outputLanguage}** (BCP-47 tag).
Selectors, file paths, identifiers, code, type names (TIMING_ISSUE, etc.), JSON keys, and quoted strings stay verbatim regardless of language.
## You have read-only filesystem tools
You can call \`Grep\`, \`Glob\`, and \`Read\` against the current repository before producing the JSON.
For SELECTOR_DRIFT specifically the failure log is usually NOT enough on its own — the runner only reports "selector X not visible". To confirm a rename, search the application source for the *type* of selector that's failing:
- For \`[aria-label='OLD']\` failures: \`Grep\` for \`aria-label=\` (or i18n key \`OLD\`) in the app source. If you find a near-miss like \`aria-label="NEW"\` whose text is a superset/rephrase of the failing label, that is your evidence.
- For \`[placeholder='OLD']\` failures: \`Grep\` for \`placeholder=\`.
- For \`[role='OLD']\` or \`[data-testid='OLD']\`: same pattern.
- For \`text=OLD\` failures: \`Grep\` the source / i18n bundles for \`OLD\`. Locale files (\`*.json\`, \`*.yml\`, \`messages.ts\`, etc.) often hold the canonical strings.
You have **up to 10 tool turns**. Spend them on grep/read; do not loop. Only when you have concrete file:line evidence should you emit SELECTOR_DRIFT — otherwise prefer UNKNOWN with confidence < 0.4 and let the human decide.
Do NOT attempt to write, edit, run shell commands, or hit the network. Only Grep/Glob/Read.
## Diagnosis categories
Pick exactly ONE category. The output JSON must follow the shape for that category.
1. TIMING_ISSUE — element not yet present because the page hasn't loaded / navigated. Fix by inserting or extending sleeps.
   {
     "diagnosis": {
       "type": "TIMING_ISSUE",
       "fixes": [
         { "kind": "insert", "line": <1-based>, "seconds": <int>, "reason": "<short>" },
         { "kind": "increase", "line": <1-based of existing sleep>, "increase_to": <int>, "reason": "<short>" }
       ]
     },
     "confidence": <0.0-1.0>,
     "reasoning": "<why timing is the cause>"
   }
2. OVER_ASSERTION — the test is asserting something the spec never required, OR a recorded assertion that is environment-dependent (e.g. a placeholder text that varies). The right fix is to remove those lines from the test.
   {
     "diagnosis": {
       "type": "OVER_ASSERTION",
       "lines": [<1-based line numbers to remove>],
       "reason": "<short>"
     },
     "confidence": <0.0-1.0>,
     "reasoning": "<why this assertion isn't required by the spec>"
   }
3. SELECTOR_DRIFT — the page is healthy but a selector has been renamed/refined since the trace was recorded. The failure log will typically contain a snapshot showing the new selector. ONLY use this when you can name the exact replacement selector.
   {
     "diagnosis": {
       "type": "SELECTOR_DRIFT",
       "line": <1-based>,
       "oldSelector": "<exact string in current line>",
       "newSelector": "<exact replacement>",
       "reason": "<short>"
     },
     "confidence": <0.0-1.0>,
     "reasoning": "<evidence from failure log>"
   }
4. DATA_MISSING — the test depends on data (a record, a setup, a logged-in state) that no longer exists. Not auto-fixable; the human must reseed or update the spec.
   {
     "diagnosis": { "type": "DATA_MISSING", "reason": "<what is missing>" },
     "confidence": <0.0-1.0>,
     "reasoning": "<evidence>"
   }
5. UNKNOWN — none of the above fit, or evidence is too weak to choose.
   {
     "diagnosis": { "type": "UNKNOWN", "reason": "<short>" },
     "confidence": <0.0-1.0>,
     "reasoning": "<what you saw and why you can't classify>"
   }
## Confidence guidance
- 0.9-1.0: failure log directly shows the cause (e.g. "selector X not found, snapshot lists Y" → SELECTOR_DRIFT)
- 0.7-0.9: strong indirect evidence (e.g. timing pattern after navigation, or assertion text that doesn't appear in spec)
- 0.4-0.7: plausible classification but multiple categories could explain it
- < 0.4: prefer UNKNOWN over guessing
## Rules
- Your **final** assistant message must start with \`{\` and end with \`}\` — a single JSON object, nothing before or after. No prose preamble like "Confirmed: ...", no markdown fences, no commentary, no tool calls in the same turn. If you have an analysis sentence, put it in the \`reasoning\` field.
- Line numbers refer to the numbered test script below (1-based).
- For SELECTOR_DRIFT, \`oldSelector\` must match a substring of the script at that line; \`newSelector\` must be backed by a concrete file:line you read with Grep/Read (do not invent). Cite the evidence in \`reasoning\`.
- For OVER_ASSERTION, only include lines that contain assert calls (\`abAssert*\`).
- Cross-check assertions against the spec markdown. If the spec doesn't require the assertion, OVER_ASSERTION is the better diagnosis than SELECTOR_DRIFT.
## Test Spec (test-spec.md)
${specMarkdown}
## Recorded Actions (actions.json summary)
${recordedActions}
## Test Script (with line numbers)
${numbered}
## Failure Log
${failureLog.slice(0, 4e3)}${snapshotSection}`;
}
/**
* Render the page-snapshot section of the diagnose prompt: a heading,
* guidance on reading the snapshot (near-miss selector means
* SELECTOR_DRIFT; absent locator means TIMING_ISSUE, OVER_ASSERTION or
* DATA_MISSING; unrelated page means DATA_MISSING), then the snapshot text
* inside a fenced block. The section starts with a newline so it can be
* appended directly after the failure log.
*/
function formatPageSnapshot(snapshot) {
	const fence = "```";
	const sectionLines = [
		"",
		"## Page Snapshot (accessibility tree captured right after the failure)",
		"This is the live state of the page when the test failed. Prefer this over your own assumptions:",
		"- If a near-miss of the failing selector appears here (e.g. failing `[aria-label='A']` and snapshot contains `aria-label=\"A-prime\"`), that is direct evidence of SELECTOR_DRIFT — propose the snapshot's value as `newSelector`.",
		"- If the failing locator is genuinely absent and no near-miss exists, the page may be still loading (TIMING_ISSUE) or the spec is asserting something not on this page (OVER_ASSERTION / DATA_MISSING).",
		"- If the snapshot looks unrelated to the spec (e.g. error page, login wall), DATA_MISSING is likely.",
		fence,
		snapshot,
		fence
	];
	return sectionLines.join("\n");
}
+//#endregion
+//#region src/diagnose/diagnose.ts
/**
* Ask the model to diagnose a failing test and parse its answer.
*
* The model runs with only the Read, Grep and Glob tools and at most 10
* turns. The resolved object always has `{ result, raw, sdkError }`:
*   - SDK error: `result` is null and `sdkError` is true.
*   - Empty output: `result` is null and `sdkError` is false.
*   - Otherwise the first JSON candidate that parses and normalises becomes
*     `result`; when none does, `result` is an UNKNOWN diagnosis with
*     confidence 0 carrying a truncated copy of the raw output.
*/
async function diagnose(input) {
	const outcome = (result, raw, sdkError) => ({ result, raw, sdkError });
	const request = {
		prompt: buildDiagnosePrompt(input),
		allowedTools: ["Read", "Grep", "Glob"],
		maxTurns: 10
	};
	const { result: raw, isError } = await invokeClaudeStreaming(request, () => {});
	if (isError) return outcome(null, raw ?? "", true);
	if (!raw) return outcome(null, "", false);
	for (const candidate of extractJsonCandidates(raw)) {
		let parsed;
		try {
			parsed = JSON.parse(candidate);
		} catch {
			// Not valid JSON; try the next candidate.
			continue;
		}
		const normalised = normaliseResult(parsed);
		if (normalised) return outcome(normalised, raw, false);
	}
	const fallback = {
		diagnosis: {
			type: "UNKNOWN",
			reason: "diagnose returned no parseable diagnosis JSON"
		},
		confidence: 0,
		reasoning: truncate$1(raw, 1e3)
	};
	return outcome(fallback, raw, false);
}
/**
* Pull every plausible JSON object out of `raw`. We try, in order:
*   1. The whole string with code fences stripped (the prompt asks for
*      JSON-only, so this is the happy path).
*   2. Each balanced top-level `{...}` block found by scanning the text. The
*      model sometimes prefixes the JSON with a "Confirmed: ..." sentence or
*      mentions partial JSON in its tool-using reasoning; we want to
*      try the *last* well-formed object first because it's most likely
*      the final answer, then earlier ones as a fallback.
*
* Scanning is anchored on each `{`. Stray `}` or `"` characters in the
* surrounding prose are ignored, and a `{` that never closes is skipped so
* it cannot hide the objects that follow it.
*
* The caller `JSON.parse`s each candidate and stops at the first match
* that normalises to a known DiagnosisResult.
*
* @param {string} raw Raw model output.
* @returns {string[]} Candidate JSON texts, most likely first.
*/
function extractJsonCandidates(raw) {
	const out = [];
	const stripped = stripFence(raw);
	if (stripped) out.push(stripped);
	const blocks = [];
	let open = raw.indexOf("{");
	while (open !== -1) {
		const close = findBalancedEnd(raw, open);
		if (close === -1) {
			// Unclosed brace: step past it and keep looking.
			open = raw.indexOf("{", open + 1);
			continue;
		}
		blocks.push(raw.slice(open, close + 1));
		open = raw.indexOf("{", close + 1);
	}
	for (let i = blocks.length - 1; i >= 0; i--) {
		const block = blocks[i];
		if (!out.includes(block)) out.push(block);
	}
	return out;
}
/**
* Index of the `}` that closes the `{` at `open`, or -1 when it never
* closes. Braces inside double-quoted strings (with backslash escapes) are
* not counted.
*/
function findBalancedEnd(text, open) {
	let depth = 0;
	let inString = false;
	let escaped = false;
	for (let i = open; i < text.length; i++) {
		const ch = text[i];
		if (inString) {
			if (escaped) escaped = false;
			else if (ch === "\\") escaped = true;
			else if (ch === "\"") inString = false;
			continue;
		}
		if (ch === "\"") inString = true;
		else if (ch === "{") depth++;
		else if (ch === "}" && --depth === 0) return i;
	}
	return -1;
}
/**
* Return `s` unchanged when it is at most `max` characters long; otherwise
* its first `max` characters followed by a note saying how many were cut.
*/
function truncate$1(s, max) {
	if (s.length <= max) return s;
	const dropped = s.length - max;
	return `${s.slice(0, max)}... [truncated, ${dropped} more chars]`;
}
/**
* Trim `raw` and remove one leading ``` / ```json fence and one trailing
* ``` fence when present, then trim again.
*/
function stripFence(raw) {
	const trimmed = raw.trim();
	const withoutOpening = trimmed.replace(/^```(?:json)?\s*\n?/i, "");
	const withoutClosing = withoutOpening.replace(/\n?```\s*$/i, "");
	return withoutClosing.trim();
}
/**
* Validate a parsed model answer. Returns null unless `parsed` is an object
* whose `diagnosis` normalises. `confidence` is clamped to [0, 1] and
* defaults to 0 when not a number; `reasoning` defaults to "" when not a
* string.
*/
function normaliseResult(parsed) {
	if (!isObject(parsed)) return null;
	const diagnosis = normaliseDiagnosis(parsed["diagnosis"]);
	if (!diagnosis) return null;
	const { confidence, reasoning } = parsed;
	return {
		diagnosis,
		confidence: typeof confidence === "number" ? clamp(confidence, 0, 1) : 0,
		reasoning: typeof reasoning === "string" ? reasoning : ""
	};
}
/**
* Validate the `diagnosis` part of a model answer and reduce it to the
* fields each type carries. Returns null when the value is not an object,
* the type is unknown, or the fields required for that type are missing.
*
* Line numbers must be positive integers: they index 1-based script lines,
* and a fractional or non-positive value cannot address a line.
*
* @param {unknown} raw Parsed `diagnosis` value.
* @returns A normalised diagnosis object, or null.
*/
function normaliseDiagnosis(raw) {
	if (!isObject(raw)) return null;
	const isLineNumber = (n) => Number.isInteger(n) && n >= 1;
	switch (raw["type"]) {
		case "TIMING_ISSUE": {
			const fixes = normaliseSleepFixes(raw["fixes"]);
			if (fixes.length === 0) return null;
			return {
				type: "TIMING_ISSUE",
				fixes
			};
		}
		case "OVER_ASSERTION": {
			const lines = Array.isArray(raw["lines"]) ? raw["lines"].filter(isLineNumber) : [];
			if (lines.length === 0) return null;
			return {
				type: "OVER_ASSERTION",
				lines,
				reason: typeof raw["reason"] === "string" ? raw["reason"] : ""
			};
		}
		case "SELECTOR_DRIFT": {
			const line = isLineNumber(raw["line"]) ? raw["line"] : null;
			const oldSelector = typeof raw["oldSelector"] === "string" ? raw["oldSelector"] : null;
			const newSelector = typeof raw["newSelector"] === "string" ? raw["newSelector"] : null;
			// Empty selectors are rejected too: there is nothing to match or insert.
			if (line === null || !oldSelector || !newSelector) return null;
			return {
				type: "SELECTOR_DRIFT",
				line,
				oldSelector,
				newSelector,
				reason: typeof raw["reason"] === "string" ? raw["reason"] : ""
			};
		}
		case "DATA_MISSING": return {
			type: "DATA_MISSING",
			reason: typeof raw["reason"] === "string" ? raw["reason"] : ""
		};
		case "UNKNOWN": return {
			type: "UNKNOWN",
			reason: typeof raw["reason"] === "string" ? raw["reason"] : ""
		};
		default: return null;
	}
}
/**
* Validate the `fixes` array of a TIMING_ISSUE diagnosis.
*
* Each kept entry is `{ kind: "insert", line, seconds, reason }` or
* `{ kind: "increase", line, increase_to, reason }`. An entry with no
* explicit `kind` is classified by which duration field it carries.
* Entries are dropped when `line` is not a positive integer or the
* duration is not a positive finite number, since neither can produce a
* usable sleep.
*
* @param {unknown} raw Parsed `fixes` value.
* @returns {Array<object>} The valid fixes, in input order.
*/
function normaliseSleepFixes(raw) {
	if (!Array.isArray(raw)) return [];
	const isLineNumber = (n) => Number.isInteger(n) && n >= 1;
	const isDuration = (n) => typeof n === "number" && Number.isFinite(n) && n > 0;
	const out = [];
	for (const item of raw) {
		if (!isObject(item)) continue;
		const line = item["line"];
		if (!isLineNumber(line)) continue;
		const reason = typeof item["reason"] === "string" ? item["reason"] : "";
		const kind = item["kind"];
		if (kind === "insert" || typeof item["seconds"] === "number" && item["increase_to"] === void 0) {
			const seconds = item["seconds"];
			if (!isDuration(seconds)) continue;
			out.push({
				kind: "insert",
				line,
				seconds,
				reason
			});
			continue;
		}
		if (kind === "increase" || typeof item["increase_to"] === "number") {
			const increaseTo = item["increase_to"];
			if (!isDuration(increaseTo)) continue;
			out.push({
				kind: "increase",
				line,
				increase_to: increaseTo,
				reason
			});
		}
	}
	return out;
}
/**
* True for non-null, non-array values whose `typeof` is "object".
*/
function isObject(v) {
	if (v === null || Array.isArray(v)) return false;
	return typeof v === "object";
}
/**
* Restrict `n` to the inclusive range [lo, hi].
*/
function clamp(n, lo, hi) {
	return n < lo ? lo : n > hi ? hi : n;
}
+//#endregion
+//#region src/diagnose/interactive.ts
/**
* Show the diagnosis context, then keep asking on stdin until the user
* picks one of apply / skip / manual / quit (the single-letter forms are
* accepted too, case-insensitively).
*
* If the input stream closes before an answer arrives (EOF, piped or
* non-interactive stdin) the pending question would never settle, so a
* closed interface is treated as "quit" — the choice that changes nothing.
*
* @param {object} input Context passed through to `printContext`.
* @returns {Promise<"apply" | "skip" | "manual" | "quit">}
*/
async function promptForChoice(input) {
	printContext(input);
	const rl = createInterface({
		input: process.stdin,
		output: process.stdout
	});
	// Settles with null once the interface closes; raced against each question.
	const closed = new Promise((resolve) => rl.once("close", () => resolve(null)));
	try {
		while (true) {
			const answer = await Promise.race([question(rl, "[a]pply / [s]kip / [m]anual / [q]uit > "), closed]);
			if (answer === null) return "quit";
			switch (answer.trim().toLowerCase()) {
				case "a":
				case "apply": return "apply";
				case "s":
				case "skip": return "skip";
				case "m":
				case "manual": return "manual";
				case "q":
				case "quit": return "quit";
				default: process.stdout.write("  please answer a/s/m/q\n");
			}
		}
	} finally {
		rl.close();
	}
}
/**
* Promise wrapper around `rl.question`: resolves with the answer passed to
* its callback.
*/
function question(rl, prompt) {
	return new Promise((settle) => {
		rl.question(prompt, (answer) => settle(answer));
	});
}
/**
* Print what the user needs in order to decide on a proposed fix, every
* line tagged `[fix]`: the diagnosis type with its confidence to two
* decimals, the reasoning when non-empty, the per-type detail lines, and,
* when provided, the failure excerpt and the proposed diff.
*/
function printContext({ result, diff, failureExcerpt }) {
	const { diagnosis, confidence, reasoning } = result;
	const emit = (text) => process.stdout.write(text);
	// A titled, indented section surrounded by blank lines.
	const emitSection = (title, body) => {
		emit(`\n[fix] ${title}:\n`);
		emit(prefixLines(body, "[fix]   "));
		emit("\n");
	};
	emit("\n");
	emit(`[fix] diagnosis: ${diagnosis.type} (confidence ${confidence.toFixed(2)})\n`);
	if (reasoning) emit(`[fix] reasoning: ${reasoning}\n`);
	for (const detail of formatDiagnosisDetail(diagnosis)) emit(`[fix] ${detail}\n`);
	if (failureExcerpt) emitSection("failure excerpt", failureExcerpt);
	if (diff) emitSection("proposed fix", diff);
	emit("\n");
}
/**
* Human-readable detail lines for a diagnosis, by type:
*   - TIMING_ISSUE: one line listing every insert / increase fix.
*   - OVER_ASSERTION: the line numbers, then the reason.
*   - SELECTOR_DRIFT: the old → new selector with its line, then the reason.
*   - DATA_MISSING / UNKNOWN: the reason only.
*/
function formatDiagnosisDetail(diagnosis) {
	const { type } = diagnosis;
	if (type === "TIMING_ISSUE") {
		const described = diagnosis.fixes.map((f) => {
			if (f.kind === "insert") return `insert ${f.seconds}s @ line ${f.line}`;
			return `increase to ${f.increase_to}s @ line ${f.line}`;
		});
		return [`fixes: ${described.join(", ")}`];
	}
	if (type === "OVER_ASSERTION") return [`lines: ${diagnosis.lines.join(", ")}`, `reason: ${diagnosis.reason}`];
	if (type === "SELECTOR_DRIFT") return [`line ${diagnosis.line}: "${diagnosis.oldSelector}" → "${diagnosis.newSelector}"`, `reason: ${diagnosis.reason}`];
	if (type === "DATA_MISSING" || type === "UNKNOWN") return [`reason: ${diagnosis.reason}`];
}
+function prefixLines(text, prefix) {
+	return text.split("\n").map((l) => `${prefix}${l}`).join("\n");
+}
+//#endregion
+//#region src/diagnose/snapshot.ts
+// Resolver created for this ES module; resolveAgentBrowserBin() uses it to locate the agent-browser script.
+const require = createRequire(import.meta.url);
+// Delay, in milliseconds, before captureSnapshot() kills the `agent-browser snapshot` child.
+const SNAPSHOT_TIMEOUT_MS = 1e4;
+// Delay, in milliseconds, before closeSession() kills the `agent-browser close` child.
+const CLOSE_TIMEOUT_MS = 1e4;
+// Snapshot size cap passed to truncate(), which compares it against String#length.
+const MAX_OUTPUT_BYTES = 6e4;
+function resolveAgentBrowserBin() {
+	try {
+		return require.resolve("agent-browser/bin/agent-browser.js");
+	} catch {
+		return null;
+	}
+}
+/**
+* Run `agent-browser snapshot` against the session that the failed vitest
+* run just used, and return its accessibility-tree dump.
+*
+* Returns null when agent-browser is missing, the daemon has no live page
+* for the session, or the call exceeds {@link SNAPSHOT_TIMEOUT_MS}. We
+* never throw — a missing snapshot just means diagnose has less context.
+*
+* The output is truncated to {@link MAX_OUTPUT_BYTES} so the prompt stays
+* within budget on large pages.
+*/
+async function captureSnapshot(sessionName) {
+	const abBin = resolveAgentBrowserBin();
+	if (!abBin) return null;
+	return new Promise((resolve) => {
+		const child = spawn(process.execPath, [abBin, "snapshot"], {
+			env: {
+				...process.env,
+				AGENT_BROWSER_SESSION: sessionName
+			},
+			stdio: [
+				"ignore",
+				"pipe",
+				"pipe"
+			]
+		});
+		let stdout = "";
+		let stderr = "";
+		let timedOut = false;
+		const timer = setTimeout(() => {
+			timedOut = true;
+			child.kill("SIGTERM");
+		}, SNAPSHOT_TIMEOUT_MS);
+		child.stdout.setEncoding("utf8");
+		child.stderr.setEncoding("utf8");
+		child.stdout.on("data", (chunk) => {
+			stdout += chunk;
+		});
+		child.stderr.on("data", (chunk) => {
+			stderr += chunk;
+		});
+		child.on("error", () => {
+			clearTimeout(timer);
+			resolve(null);
+		});
+		child.on("exit", (code) => {
+			clearTimeout(timer);
+			if (timedOut || code !== 0) {
+				resolve(null);
+				return;
+			}
+			const trimmed = stdout.trim();
+			if (!trimmed) {
+				resolve(null);
+				return;
+			}
+			resolve(truncate(trimmed, MAX_OUTPUT_BYTES));
+		});
+	});
+}
+function truncate(s, maxBytes) {
+	if (s.length <= maxBytes) return s;
+	return `${s.slice(0, maxBytes)}\n... [truncated, ${s.length - maxBytes} more chars]`;
+}
+/**
+* Close an agent-browser session by name. Used before/after a `ccqa generate`
+* run so a wedged daemon from a previous attempt can't hang the next one.
+*
+* Always resolves; never throws. If the binary is missing, the session
+* doesn't exist, or the call exceeds {@link CLOSE_TIMEOUT_MS}, we silently
+* return — close is best-effort cleanup, not a precondition.
+*/
+async function closeSession(sessionName) {
+	const abBin = resolveAgentBrowserBin();
+	if (!abBin) return;
+	await new Promise((resolve) => {
+		const child = spawn(process.execPath, [abBin, "close"], {
+			env: {
+				...process.env,
+				AGENT_BROWSER_SESSION: sessionName
+			},
+			stdio: "ignore"
+		});
+		const timer = setTimeout(() => {
+			child.kill("SIGTERM");
+		}, CLOSE_TIMEOUT_MS);
+		const finish = () => {
+			clearTimeout(timer);
+			resolve();
+		};
+		child.on("error", finish);
+		child.on("exit", finish);
+	});
+}
+//#endregion
+//#region src/diagnose/loop.ts
+// Confidence at or above which decide() auto-applies a fix when the mode is not "auto".
+const DEFAULT_CONFIDENCE_THRESHOLD = .8;
+/**
+* Returns true when vitest finally passed; false when retries were exhausted
+* or the diagnose loop chose to bail out early.
+*/
+async function runAutoFixLoop(input) {
+	const { scriptPath, initialRun, specMarkdown, actions, maxRetries, mode, runVitest, agentBrowserSession, outputLanguage } = input;
+	let { exitCode, output, currentScript } = initialRun;
+	if (exitCode === 0) return true;
+	for (let attempt = 1; attempt <= maxRetries; attempt++) {
+		fix(`attempt ${attempt}/${maxRetries}`);
+		blank();
+		const pageSnapshot = agentBrowserSession ? await timedPhase("page snapshot", () => captureSnapshot(agentBrowserSession), "fix") : null;
+		if (agentBrowserSession) if (pageSnapshot) fix(`page snapshot: ${pageSnapshot.length} chars captured`);
+		else fix("page snapshot unavailable; continuing without it");
+		const fixed = await diagnoseAndFix({
+			script: currentScript,
+			specMarkdown,
+			actions,
+			failureLog: output,
+			pageSnapshot: pageSnapshot ?? void 0,
+			mode,
+			outputLanguage
+		});
+		if (!fixed) {
+			fix("bailed out; see diagnosis above");
+			return false;
+		}
+		await writeFile(scriptPath, fixed, "utf-8");
+		fix(`saved: ${scriptPath}`);
+		blank();
+		({exitCode, output, currentScript} = await timedPhase(`vitest run #${attempt + 1}`, () => runVitest(scriptPath), "run"));
+		if (exitCode === 0) return true;
+	}
+	return false;
+}
+/**
+* Run one diagnose pass over a failing script and decide what to do with the
+* proposed fix.
+*
+* Resolves to the patched script text when a fix should be written, or to
+* null when the loop should stop (SDK error, empty response, a diagnosis
+* type that is handed off, a fix that cannot be applied, low confidence
+* without prompting, or the user choosing skip/manual).
+*
+* NOTE: the "quit" choice calls process.exit(1) directly, so no code after
+* this call (including callers' finally blocks) runs in that case.
+*/
+async function diagnoseAndFix(input) {
+	const { script, specMarkdown, actions, failureLog, pageSnapshot, mode, outputLanguage } = input;
+	const outcome = await timedPhase("diagnose", () => diagnose({
+		script,
+		specMarkdown,
+		actions,
+		failureLog,
+		pageSnapshot,
+		outputLanguage
+	}), "fix");
+	// Failure talking to the model: report, optionally dump the raw text, and stop.
+	if (outcome.sdkError) {
+		fix("diagnose: SDK error talking to Claude");
+		if (outcome.raw) fix(`diagnose raw: ${truncateForLog(outcome.raw)}`);
+		hint("re-run later, or check ANTHROPIC_API_KEY / network connectivity");
+		return null;
+	}
+	if (!outcome.result) {
+		fix("diagnose: empty response from LLM");
+		hint("re-run; if this keeps happening the failure log may be too short to diagnose");
+		return null;
+	}
+	const result = outcome.result;
+	reportDiagnosis(result);
+	// These two diagnosis types are never patched here; they go straight to the user.
+	if (result.diagnosis.type === "DATA_MISSING" || result.diagnosis.type === "UNKNOWN") {
+		handoffToUser(result, outcome.raw, outputLanguage);
+		return null;
+	}
+	// The patch is computed before the mode decision, so an inapplicable fix is reported in every mode.
+	const apply = applyDiagnosis(script, result.diagnosis);
+	if (!apply.applied) {
+		fix(`cannot apply: ${apply.reason}`);
+		handoffToUser(result, outcome.raw, outputLanguage);
+		return null;
+	}
+	const decision = decide(result, mode);
+	if (decision === "apply-auto") {
+		fix(`applying automatically: ${apply.summary}`);
+		return apply.script;
+	}
+	if (decision === "skip-low-confidence") {
+		fix(`confidence ${result.confidence.toFixed(2)} below threshold ${DEFAULT_CONFIDENCE_THRESHOLD}; skipping (--no-interactive)`);
+		handoffToUser(result, outcome.raw, outputLanguage);
+		return null;
+	}
+	// Remaining decision is "interactive": show the diff plus the first 800 chars of the log and ask.
+	switch (await promptForChoice({
+		result,
+		diff: previewDiff(script, apply.script),
+		failureExcerpt: failureLog.slice(0, 800)
+	})) {
+		case "apply":
+			fix(`applied: ${apply.summary}`);
+			return apply.script;
+		case "skip":
+			fix("skipped; leaving script untouched");
+			return null;
+		case "manual":
+			fix("paused for manual edit");
+			handoffToUser(result, outcome.raw, outputLanguage);
+			return null;
+		case "quit":
+			fix("user quit");
+			process.exit(1);
+	}
+}
+function decide(result, mode) {
+	if (mode === "auto") return "apply-auto";
+	const highConfidence = result.confidence >= DEFAULT_CONFIDENCE_THRESHOLD;
+	if (mode === "non-interactive") return highConfidence ? "apply-auto" : "skip-low-confidence";
+	return highConfidence ? "apply-auto" : "interactive";
+}
+function reportDiagnosis(result) {
+	fix(`diagnosis: ${result.diagnosis.type}`);
+	fix(`confidence: ${result.confidence.toFixed(2)}`);
+	if (result.reasoning) fix(`reasoning: ${result.reasoning}`);
+}
+/**
+* Emit a category-specific [hint] block that tells the user what to do next.
+* Called whenever the loop has decided it cannot proceed on its own —
+* because the diagnosis is intrinsically not auto-fixable, because the
+* proposed fix wasn't applicable to the current script, or because the
+* confidence was too low under --no-interactive.
+*
+* The goal is to never leave the user with just "auto-fix exhausted" —
+* always state which side (test artifacts vs. application) likely needs
+* the next action.
+*/
+function handoffToUser(result, raw, language) {
+	const lines = handoffMessage(result.diagnosis, normLang(language));
+	for (const line of lines) hint(line);
+	if (raw) fix(`diagnose raw: ${truncateForLog(raw)}`);
+}
+function normLang(language) {
+	if (!language) return "en";
+	return language.toLowerCase().startsWith("ja") ? "ja" : "en";
+}
+// Hint builders keyed by the language codes normLang() produces.
+const HANDOFF = {
+	en: handoffEn,
+	ja: handoffJa
+};
+/** Build the hint lines for `diagnosis` using the builder registered for `language` in HANDOFF. */
+function handoffMessage(diagnosis, language) {
+	return HANDOFF[language](diagnosis);
+}
+/**
+* English hint lines for each diagnosis type: a statement of the likely
+* problem followed by a "next step" line. No value is returned for a type
+* that is not listed.
+*/
+function handoffEn(diagnosis) {
+	switch (diagnosis.type) {
+		case "DATA_MISSING": return [`application-side issue: required data is missing. ${diagnosis.reason}`, "next step: seed the data (or update test-spec.md prerequisites), then re-run trace + generate."];
+		case "UNKNOWN": return [`could not classify the failure. ${diagnosis.reason}`, "next step: read the failure log above, decide whether the test or the app is wrong, and fix manually. consider re-running ccqa trace if the recorded flow no longer matches the live app."];
+		case "SELECTOR_DRIFT": return [
+			`selector likely drifted but auto-apply was not safe.`,
+			`proposed: line ${diagnosis.line}: "${diagnosis.oldSelector}" → "${diagnosis.newSelector}" (${diagnosis.reason}).`,
+			"next step: confirm in the live app and either accept the proposal manually, or re-run ccqa trace to recapture the new selector."
+		];
+		case "OVER_ASSERTION": return [`assertion may not be required by the spec. lines: ${diagnosis.lines.join(", ")} (${diagnosis.reason}).`, "next step: cross-check test-spec.md. either delete the assertion from the test, or tighten the spec to require it."];
+		case "TIMING_ISSUE": return [`timing fix proposed but couldn't be applied automatically.`, "next step: insert a sleep manually before the failing line, or re-run with a higher confidence trace."];
+	}
+}
+/**
+* Japanese counterpart of handoffEn: same diagnosis types, same
+* problem-statement / next-step structure, with the messages in Japanese.
+* No value is returned for a type that is not listed.
+*/
+function handoffJa(diagnosis) {
+	switch (diagnosis.type) {
+		case "DATA_MISSING": return [`アプリ側の問題: 必要なデータが不足しています。${diagnosis.reason}`, "次のステップ: データを seed する（または test-spec.md の prerequisites を更新）してから ccqa trace + generate をやり直してください。"];
+		case "UNKNOWN": return [`失敗を分類できませんでした。${diagnosis.reason}`, "次のステップ: 上の失敗ログを確認し、テストとアプリのどちらが原因か判断して手動で修正してください。記録した手順がアプリの現状と合わない場合は ccqa trace の再実行を検討してください。"];
+		case "SELECTOR_DRIFT": return [
+			"selector が変わった可能性が高いですが、自動適用は安全でないと判断しました。",
+			`提案: 行 ${diagnosis.line}: "${diagnosis.oldSelector}" → "${diagnosis.newSelector}" (${diagnosis.reason})`,
+			"次のステップ: アプリで新 selector を確認し、手動で適用するか ccqa trace をやり直して新しい selector を取り直してください。"
+		];
+		case "OVER_ASSERTION": return [`spec が要求していない assertion の可能性があります。対象行: ${diagnosis.lines.join(", ")} (${diagnosis.reason})`, "次のステップ: test-spec.md と照合して、テスト側の assertion を削るか、spec 側を更新してください。"];
+		case "TIMING_ISSUE": return ["timing 関連の修正案は出ましたが、自動適用できませんでした。", "次のステップ: 失敗行の前に手動で sleep を入れるか、より信頼度の高い trace を取り直してください。"];
+	}
+}
+function truncateForLog(s) {
+	const oneLine = s.replace(/\n+/g, " ⏎ ");
+	return oneLine.length <= 400 ? oneLine : `${oneLine.slice(0, 400)}... [+${oneLine.length - 400} chars]`;
+}
+function resolveMode(opts) {
+	if (opts.auto) return "auto";
+	if (opts.interactive === false || opts.noInteractive) return "non-interactive";
+	return "interactive";
+}
+//#endregion
 //#region src/cli/generate.ts
-const generateCommand = new Command("generate").argument("<feature/spec>", "Spec to generate test for (e.g. tasks/create-and-complete)").description("Generate agent-browser test script from recorded trace actions").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").action(async (specPath, opts) => {
+// `ccqa generate <feature>/<spec>`: the action below turns the parsed options into runGenerate() arguments.
+const generateCommand = new Command("generate").argument("<feature/spec>", "Spec id in '<feature>/<spec>' form (resolves to .ccqa/features/<feature>/test-cases/<spec>/)").description("Generate agent-browser test script from recorded trace actions. test.spec.ts is regenerated from actions.json on every run; pass --force to overwrite manual edits.").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").option("--auto", "Apply auto-fixes without confirmation regardless of confidence (CI use)").option("--no-interactive", "Never prompt; only auto-apply when confidence is high, otherwise give up").option("--force", "Overwrite an existing test.spec.ts without warning").option("--no-snapshot", "Don't pin AGENT_BROWSER_SESSION / capture page snapshots after a failure (debug toggle)").option("--language <bcp47>", "Language for diagnose reasoning / hint text (e.g. 'en', 'ja')", "en").action(async (specPath, opts) => {
 	const { featureName, specName } = parseSpecPath(specPath);
-	await runGenerate(featureName, specName, parseInt(opts.maxRetries, 10));
+	const mode = resolveMode(opts);
+	// Snapshots stay enabled unless opts.snapshot is exactly false.
+	const useSnapshot = opts.snapshot !== false;
+	// maxRetries is parsed base-10 from the option string; a non-numeric value yields NaN.
+	await runGenerate(featureName, specName, parseInt(opts.maxRetries, 10), mode, opts.force ?? false, useSnapshot, opts.language ?? "en");
 });
-async function runGenerate(featureName, specName, maxRetries) {
+async function runGenerate(featureName, specName, maxRetries, mode, force, useSnapshot, outputLanguage) {
 	header("generate", `${featureName}/${specName}`);
 	await ensureCcqaDir();
+	const existingScriptPath = await getTestScript(featureName, specName);
+	if (existingScriptPath && !force) {
+		if (!await confirmOverwrite(existingScriptPath)) {
+			info("aborted; pass --force to overwrite without prompting");
+			return;
+		}
+	}
 	const { path: actionsPath, actions } = await getTraceActions(featureName, specName);
 	meta("trace", actionsPath);
 	meta("actions", actions.length);
-	const spec = parseTestSpec(await readSpecFile(featureName, specName));
+	const specContent = await readSpecFile(featureName, specName);
+	const spec = parseTestSpec(specContent);
 	const setupScripts = await loadSetupScripts(spec.setups);
 	if (setupScripts.length > 0) meta("setups", setupScripts.map((s) => s.name).join(", "));
+	meta("fix-mode", mode);
+	meta("language", outputLanguage);
 	blank();
 	const cleanedActions = await cleanupActions$1(actions);
 	if (cleanedActions.length !== actions.length) meta("cleaned", cleanedActions.length);
 	const scriptPath = await saveTestScript(featureName, specName, actionsToScript(cleanedActions, spec.title, setupScripts.length > 0 ? setupScripts : void 0));
 	meta("saved", scriptPath);
 	blank();
-	let { exitCode, output, currentScript } = await runVitest$1(scriptPath);
-	if (exitCode === 0) {
-		hint(`run 'ccqa run ${featureName}/${specName}' to execute the test`);
-		return;
+	const agentBrowserSession = useSnapshot ? `ccqa-generate-${Date.now()}` : void 0;
+	const runVitestForSession = (path) => runVitest$1(path, agentBrowserSession);
+	let signalHandler = null;
+	if (agentBrowserSession) {
+		await closeSession(agentBrowserSession);
+		signalHandler = () => {
+			closeSession(agentBrowserSession).finally(() => process.exit(130));
+		};
+		process.once("SIGINT", signalHandler);
+		process.once("SIGTERM", signalHandler);
 	}
-	for (let attempt = 1; attempt <= maxRetries; attempt++) {
-		info(`auto-fix attempt ${attempt}/${maxRetries}...`);
-		blank();
-		const fixed = await autoFixWithLLM$1(currentScript, output);
-		if (!fixed) {
-			warn("could not determine fix from failure log");
-			break;
+	try {
+		const initialRun = await timedPhase("vitest run #1", () => runVitestForSession(scriptPath), "run");
+		if (initialRun.exitCode === 0) {
+			hint(`run 'ccqa run ${featureName}/${specName}' to execute the test`);
+			return;
 		}
-		await writeFile(scriptPath, fixed, "utf-8");
-		meta("saved", scriptPath);
-		blank();
-		({exitCode, output, currentScript} = await runVitest$1(scriptPath));
-		if (exitCode === 0) {
+		if (await runAutoFixLoop({
+			scriptPath,
+			initialRun,
+			specMarkdown: specContent,
+			actions: cleanedActions,
+			maxRetries,
+			mode,
+			runVitest: runVitestForSession,
+			agentBrowserSession,
+			outputLanguage
+		})) {
 			hint(`run 'ccqa run ${featureName}/${specName}' to execute the test`);
 			return;
 		}
+		warn("auto-fix exhausted; test still failing");
+		process.exit(1);
+	} finally {
+		if (signalHandler) {
+			process.off("SIGINT", signalHandler);
+			process.off("SIGTERM", signalHandler);
+		}
+		if (agentBrowserSession) await closeSession(agentBrowserSession);
+	}
+}
+async function confirmOverwrite(path) {
+	if (!process.stdin.isTTY) {
+		warn(`${path} exists and stdin is not a TTY; refusing to overwrite. Pass --force to allow.`);
+		return false;
+	}
+	const rl = createInterface({
+		input: process.stdin,
+		output: process.stdout
+	});
+	try {
+		process.stdout.write("\n");
+		process.stdout.write(`[warn] ${path} already exists.\n`);
+		process.stdout.write(`[warn] generate will regenerate it from actions.json and any manual edits will be lost.\n`);
+		const norm = (await new Promise((res) => rl.question("Overwrite? [y/N] ", res))).trim().toLowerCase();
+		return norm === "y" || norm === "yes";
+	} finally {
+		rl.close();
 	}
-	warn("auto-fix exhausted — test still failing");
-	process.exit(1);
 }
-/**
-* Load setup test scripts, extract test body, and replace {{placeholders}} with params values.
-*/
 async function loadSetupScripts(setups) {
 	if (!setups?.length) return [];
 	const result = [];
@@ -1282,48 +2197,28 @@ function extractTestBody(script) {
 }
 function replacePlaceholders(body, params) {
 	let result = body;
-	for (const [key, value] of Object.entries(params)) result = result.replaceAll(`{{${key}}}`, value);
+	for (const [key, value] of Object.entries(params)) if (hasEnvRef(value)) {
+		const expr = envRefsToJsExpression(value);
+		const re = new RegExp(`(["'])\\{\\{${escapeRegExp(key)}\\}\\}\\1`, "g");
+		result = result.replace(re, expr);
+		result = result.replaceAll(`{{${key}}}`, value);
+	} else result = result.replaceAll(`{{${key}}}`, value);
 	return result;
 }
-async function autoFixWithLLM$1(script, failureLog) {
-	try {
-		const { result, isError } = await invokeClaudeStreaming({
-			prompt: buildAutoFixPrompt(script, failureLog),
-			disableBuiltinTools: true,
-			maxTurns: 1
-		}, () => {});
-		if (isError || !result) return null;
-		const json = result.trim().replace(/^```(?:json)?\n?([\s\S]*?)\n?```$/, "$1").trim();
-		const fixes = JSON.parse(json);
-		if (!Array.isArray(fixes) || fixes.length === 0) return null;
-		return applySleepFixes$1(script, fixes);
-	} catch {
-		return null;
-	}
-}
-function applySleepFixes$1(script, fixes) {
-	const lines = script.split("\n");
-	for (const fix of fixes) if ("increase_to" in fix) {
-		const idx = fix.line - 1;
-		if (idx >= 0 && idx < lines.length) lines[idx] = lines[idx].replace(/spawnSync\("sleep",\s*\["\d+"\]/, `spawnSync("sleep", ["${fix.increase_to}"]`);
-	}
-	const inserts = fixes.filter((f) => "seconds" in f && !("increase_to" in f)).sort((a, b) => b.line - a.line);
-	for (const fix of inserts) {
-		const idx = fix.line - 1;
-		if (idx >= 0 && idx <= lines.length) lines.splice(idx, 0, `  spawnSync("sleep", ["${fix.seconds}"], { stdio: "inherit" });`);
-	}
-	return lines.join("\n");
+/** Backslash-escape the regex metacharacters in `s` so it can be embedded in a RegExp source as literal text. */
+function escapeRegExp(s) {
+	return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
 }
-async function runVitest$1(scriptPath) {
-	const { exitCode, stdout, stderr } = await spawnVitestCaptured([
+async function runVitest$1(scriptPath, agentBrowserSession) {
+	const { exitCode, stdout, stderr } = await spawnVitestTeed([
 		"run",
 		"--config",
 		bundledVitestConfigPath(),
 		scriptPath
-	]);
+	], agentBrowserSession ? { env: {
+		...process.env,
+		AGENT_BROWSER_SESSION: agentBrowserSession
+	} } : {});
 	const currentScript = await readFile(scriptPath, "utf8");
-	process.stdout.write(stdout);
-	if (stderr) process.stderr.write(stderr);
 	return {
 		exitCode,
 		output: stdout + stderr,
@@ -1378,7 +2273,7 @@ async function runTests(target) {
 				warn(`${featureName}/${specName}: no test.spec.ts found`);
 				continue;
 			}
-			info(`▶ ${featureName}/${specName}`);
+			run(`${featureName}/${specName}`);
 			meta("test", scriptFile);
 			blank();
 			const reportFile = join(tmpDir, `report-${i}.json`);
@@ -1522,6 +2417,7 @@ async function runTraceSetup(name) {
 	await ensureCcqaDir();
 	const spec = parseSetupSpec(await readSetupSpecFile(name));
 	const resolvedSpec = replacePlaceholdersWithDummies(spec);
+	const secretsToScrub = buildSecretsToScrub(spec);
 	meta("setup", spec.title);
 	meta("steps", spec.steps.length);
 	if (spec.placeholders) meta("placeholders", Object.keys(spec.placeholders).join(", "));
@@ -1542,8 +2438,12 @@ async function runTraceSetup(name) {
 			"Grep",
 			"Glob"
 		],
+		env: {
+			PATH: pathWithAgentBrowserShim(process.env["PATH"]),
+			ANTHROPIC_API_KEY: ""
+		},
 		onAbAction: (abAction) => {
-			const action = parseAbAction(abAction);
+			const action = parseAbAction(scrubSecrets(abAction, secretsToScrub));
 			if (action) traceActions.push(action);
 		},
 		onAbActionFailed: () => {
@@ -1565,7 +2465,7 @@ async function runTraceSetup(name) {
 						if (routeStep.status === "FAILED") overallStatus = "failed";
 					}
 				} else if (trimmed.startsWith("AB_ACTION|snapshot|") || trimmed.startsWith("AB_ACTION|assert|")) {
-					const action = parseAbAction(trimmed);
+					const action = parseAbAction(scrubSecrets(trimmed, secretsToScrub));
 					if (action) traceActions.push(action);
 				}
 			}
@@ -1591,7 +2491,7 @@ function replacePlaceholdersWithDummies(spec) {
 	const dummies = spec.placeholders;
 	const resolve = (text) => {
 		let result = text;
-		for (const [key, def] of Object.entries(dummies)) result = result.replaceAll(`{{${key}}}`, def.dummy);
+		for (const [key, def] of Object.entries(dummies)) result = result.replaceAll(`{{${key}}}`, resolveEnvRefs(def.dummy));
 		return result;
 	};
 	return {
@@ -1603,17 +2503,52 @@ function replacePlaceholdersWithDummies(spec) {
 		}))
 	};
 }
+/**
+* Build the substitution map used to scrub real secret values out of
+* recorded actions before they are written to actions.json.
+*
+* For each placeholder whose dummy contains env refs, store
+*   <resolved-value> -> <original ${VAR} string>
+* so that an `ab fill ... <secret>` line records the placeholder string
+* instead of the secret. Empty resolved values are skipped — they would
+* otherwise replace incidental empty strings in the recorded actions.
+*/
+function buildSecretsToScrub(spec) {
+	const map = /* @__PURE__ */ new Map();
+	if (!spec.placeholders) return map;
+	const dummies = spec.placeholders;
+	for (const def of Object.values(dummies)) {
+		if (!hasEnvRef(def.dummy)) continue;
+		const resolved = resolveEnvRefs(def.dummy);
+		if (!resolved) continue;
+		map.set(resolved, def.dummy);
+	}
+	return map;
+}
+/** Replace every occurrence of a recorded secret with its `${VAR}` placeholder. */
+function scrubSecrets(line, secrets) {
+	if (secrets.size === 0) return line;
+	let result = line;
+	for (const [secret, placeholder] of secrets) {
+		if (!result.includes(secret)) continue;
+		result = result.split(secret).join(placeholder);
+	}
+	return result;
+}
 //#endregion
 //#region src/cli/generate-setup.ts
-const generateSetupCommand = new Command("generate-setup").argument("<name>", "Setup name to generate (e.g. login)").description("Clean up, validate, and templatize setup actions").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").option("--from-dummy", "Resume from existing test.dummy.spec.ts (after manual fix)").action(async (name, opts) => {
-	await runGenerateSetup(name, parseInt(opts.maxRetries, 10), opts.fromDummy ?? false);
+// `ccqa generate-setup <name>`: the action below turns the parsed options into runGenerateSetup() arguments.
+const generateSetupCommand = new Command("generate-setup").argument("<name>", "Setup name to generate (e.g. login)").description("Clean up, validate, and templatize setup actions").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").option("--from-dummy", "Resume from existing test.dummy.spec.ts (after manual fix)").option("--auto", "Apply auto-fixes without confirmation regardless of confidence (CI use)").option("--no-interactive", "Never prompt; only auto-apply when confidence is high, otherwise give up").option("--language <bcp47>", "Language for diagnose reasoning / hint text (e.g. 'en', 'ja')", "en").action(async (name, opts) => {
+	const mode = resolveMode(opts);
+	// maxRetries is parsed base-10 from the option string; a non-numeric value yields NaN.
+	await runGenerateSetup(name, parseInt(opts.maxRetries, 10), opts.fromDummy ?? false, mode, opts.language ?? "en");
 });
-async function runGenerateSetup(name, maxRetries, fromDummy) {
+async function runGenerateSetup(name, maxRetries, fromDummy, mode, outputLanguage) {
 	header("generate-setup", name);
 	await ensureCcqaDir();
-	const spec = parseSetupSpec(await readSetupSpecFile(name));
+	const specContent = await readSetupSpecFile(name);
+	const spec = parseSetupSpec(specContent);
 	const dummyPath = join(getSetupDir(name), "test.dummy.spec.ts");
 	const finalPath = join(getSetupDir(name), "test.spec.ts");
+	let cleanedActions = [];
 	if (fromDummy) {
 		if (!await stat(dummyPath).then(() => true).catch(() => false)) {
 			warn(`test.dummy.spec.ts not found. Run without --from-dummy first.`);
@@ -1624,40 +2559,52 @@ async function runGenerateSetup(name, maxRetries, fromDummy) {
 		const { actions } = await getSetupActions(name);
 		meta("setup", spec.title);
 		meta("actions", actions.length);
+		meta("fix-mode", mode);
+		meta("language", outputLanguage);
 		blank();
-		const cleanedActions = await cleanupActions(actions);
+		cleanedActions = await cleanupActions(actions);
 		if (cleanedActions.length !== actions.length) meta("cleaned", cleanedActions.length);
 		await writeFile(dummyPath, actionsToScript(cleanedActions, spec.title), "utf-8");
 		meta("saved", dummyPath);
 	}
 	blank();
-	let { exitCode, output, currentScript } = await runVitest(dummyPath);
-	if (exitCode !== 0) {
-		for (let attempt = 1; attempt <= maxRetries; attempt++) {
-			info(`auto-fix attempt ${attempt}/${maxRetries}...`);
-			blank();
-			const fixed = await autoFixWithLLM(currentScript, output);
-			if (!fixed) {
-				warn("could not determine fix from failure log");
-				break;
-			}
-			await writeFile(dummyPath, fixed, "utf-8");
-			meta("saved", dummyPath);
-			blank();
-			({exitCode, output, currentScript} = await runVitest(dummyPath));
-			if (exitCode === 0) break;
-		}
-		if (exitCode !== 0) {
-			warn("auto-fix exhausted — setup test still failing");
+	const agentBrowserSession = `ccqa-generate-setup-${name}-${Date.now()}`;
+	const runVitestForSession = (path) => runVitestResolved(path, agentBrowserSession);
+	await closeSession(agentBrowserSession);
+	const signalHandler = () => {
+		closeSession(agentBrowserSession).finally(() => process.exit(130));
+	};
+	process.once("SIGINT", signalHandler);
+	process.once("SIGTERM", signalHandler);
+	try {
+		const initialRun = await timedPhase("vitest run #1", () => runVitestForSession(dummyPath), "run");
+		let passed = initialRun.exitCode === 0;
+		if (!passed) passed = await runAutoFixLoop({
+			scriptPath: dummyPath,
+			initialRun,
+			specMarkdown: specContent,
+			actions: cleanedActions,
+			maxRetries,
+			mode,
+			runVitest: runVitestForSession,
+			agentBrowserSession,
+			outputLanguage
+		});
+		if (!passed) {
+			warn("auto-fix exhausted; setup test still failing");
 			hint(`edit ${dummyPath} manually, then run: ccqa generate-setup ${name} --from-dummy`);
 			process.exit(1);
 		}
+		await writeFile(finalPath, reversePlaceholdersInScript(await readFile(dummyPath, "utf8"), spec.placeholders), "utf-8");
+		await unlink(dummyPath).catch(() => {});
+		blank();
+		meta("saved", finalPath);
+		hint(`setup '${name}' is ready; reference it in test-spec.md with setups: [{name: ${name}, params: {...}}]`);
+	} finally {
+		process.off("SIGINT", signalHandler);
+		process.off("SIGTERM", signalHandler);
+		await closeSession(agentBrowserSession);
 	}
-	await writeFile(finalPath, reversePlaceholdersInScript(currentScript, spec.placeholders), "utf-8");
-	await unlink(dummyPath).catch(() => {});
-	blank();
-	meta("saved", finalPath);
-	hint(`setup '${name}' is ready — reference it in test-spec.md with setups: [{name: ${name}, params: {...}}]`);
 }
 /**
 * Replace dummy values with {{placeholder}} directly in the test script text.
@@ -1670,51 +2617,54 @@ function reversePlaceholdersInScript(script, placeholders) {
 	for (const [key, def] of entries) result = result.replaceAll(def.dummy, `{{${key}}}`);
 	return result;
 }
-async function autoFixWithLLM(script, failureLog) {
-	try {
-		const { result, isError } = await invokeClaudeStreaming({
-			prompt: buildAutoFixPrompt(script, failureLog),
-			disableBuiltinTools: true,
-			maxTurns: 1
-		}, () => {});
-		if (isError || !result) return null;
-		const json = result.trim().replace(/^```(?:json)?\n?([\s\S]*?)\n?```$/, "$1").trim();
-		const fixes = JSON.parse(json);
-		if (!Array.isArray(fixes) || fixes.length === 0) return null;
-		return applySleepFixes(script, fixes);
-	} catch {
-		return null;
-	}
-}
-function applySleepFixes(script, fixes) {
-	const lines = script.split("\n");
-	for (const fix of fixes) if ("increase_to" in fix) {
-		const idx = fix.line - 1;
-		if (idx >= 0 && idx < lines.length) lines[idx] = lines[idx].replace(/spawnSync\("sleep",\s*\["\d+"\]/, `spawnSync("sleep", ["${fix.increase_to}"]`);
-	}
-	const inserts = fixes.filter((f) => "seconds" in f && !("increase_to" in f)).sort((a, b) => b.line - a.line);
-	for (const fix of inserts) {
-		const idx = fix.line - 1;
-		if (idx >= 0 && idx <= lines.length) lines.splice(idx, 0, `  spawnSync("sleep", ["${fix.seconds}"], { stdio: "inherit" });`);
-	}
-	return lines.join("\n");
-}
-async function runVitest(scriptPath) {
-	const { exitCode, stdout, stderr } = await spawnVitestCaptured([
+/**
+* Run vitest on `scriptPath` with the bundled config and return the exit
+* code, the concatenated stdout + stderr, and the script text re-read from
+* disk after the run. When `agentBrowserSession` is given, the child
+* environment is process.env plus AGENT_BROWSER_SESSION set to that name.
+*/
+async function runVitest(scriptPath, agentBrowserSession) {
+	const { exitCode, stdout, stderr } = await spawnVitestTeed([
 		"run",
 		"--config",
 		bundledVitestConfigPath(),
 		scriptPath
-	]);
+	], agentBrowserSession ? { env: {
+		...process.env,
+		AGENT_BROWSER_SESSION: agentBrowserSession
+	} } : {});
 	const currentScript = await readFile(scriptPath, "utf8");
-	process.stdout.write(stdout);
-	if (stderr) process.stderr.write(stderr);
 	return {
 		exitCode,
 		output: stdout + stderr,
 		currentScript
 	};
 }
+/**
+* Run vitest on `test.dummy.spec.ts`, but transparently expand any `${VAR}`
+* env refs to real values for the duration of the run. The original file is
+* preserved unchanged so subsequent reverse-replace still sees the env-ref
+* literals. Auto-fix edits the original file (via writeFile in callers), so
+* we always re-read it before each invocation.
+*/
+async function runVitestResolved(scriptPath, agentBrowserSession) {
+	const original = await readFile(scriptPath, "utf8");
+	if (!hasEnvRef(original)) return runVitest(scriptPath, agentBrowserSession);
+	const tmpPath = scriptPath.replace(/\.ts$/, ".__resolved.spec.ts");
+	await writeFile(tmpPath, resolveEnvRefs(original), "utf-8");
+	try {
+		const { exitCode, stdout, stderr } = await spawnVitestTeed([
+			"run",
+			"--config",
+			bundledVitestConfigPath(),
+			tmpPath
+		], agentBrowserSession ? { env: {
+			...process.env,
+			AGENT_BROWSER_SESSION: agentBrowserSession
+		} } : {});
+		return {
+			exitCode,
+			output: stdout + stderr,
+			currentScript: original
+		};
+	} finally {
+		await unlink(tmpPath).catch(() => {});
+	}
+}
 async function cleanupActions(actions) {
 	try {
 		const { result, isError } = await invokeClaudeStreaming({