ccqa 0.3.4 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/bin/ccqa.mjs CHANGED
@@ -8,6 +8,7 @@ import { delimiter, dirname, join, resolve } from "node:path";
8
8
  import { query } from "@anthropic-ai/claude-agent-sdk";
9
9
  import matter from "gray-matter";
10
10
  import { spawn } from "node:child_process";
11
+ import { createInterface } from "node:readline";
11
12
  import { tmpdir } from "node:os";
12
13
  //#region src/prompts/trace.ts
13
14
  function generateSessionName() {
@@ -285,14 +286,17 @@ const STEP_ICONS = {
285
286
  function header(command, target) {
286
287
  process.stdout.write(`\nccqa ${command}${target ? ` ${target}` : ""}\n\n`);
287
288
  }
289
+ function write(scope, message, sink = process.stdout) {
290
+ sink.write(`[${scope}] ${message}\n`);
291
+ }
288
292
  function meta(key, value) {
289
- process.stdout.write(` ${key}: ${value}\n`);
293
+ write("meta", `${key}: ${value}`);
290
294
  }
291
295
  function blank() {
292
296
  process.stdout.write("\n");
293
297
  }
294
298
  function info(message) {
295
- process.stdout.write(`${message}\n`);
299
+ write("info", message);
296
300
  }
297
301
  function step(type, stepId, detail) {
298
302
  process.stdout.write(` ${STEP_ICONS[type]} [${stepId}] ${detail}\n`);
@@ -301,13 +305,37 @@ function bash(command) {
301
305
  process.stdout.write(` $ ${command.slice(0, 120)}\n`);
302
306
  }
303
307
  function error(message) {
304
- process.stderr.write(`error: ${message}\n`);
308
+ write("error", message, process.stderr);
305
309
  }
306
310
  function warn(message) {
307
- process.stderr.write(`warn: ${message}\n`);
311
+ write("warn", message, process.stderr);
308
312
  }
309
313
  function hint(message) {
310
- process.stdout.write(`\nhint: ${message}\n`);
314
+ process.stdout.write("\n");
315
+ write("hint", message);
316
+ }
317
+ function fix(message) {
318
+ write("fix", message);
319
+ }
320
+ function run(message) {
321
+ write("run", message);
322
+ }
323
+ /**
324
+ * Time a long-running step under the given scope, emitting `started` and
325
+ * `finished in N.Ns` markers. Scope must be a tag the user wants to grep
326
+ * for — typically "run" for vitest and "fix" for diagnose-loop steps.
327
+ */
328
+ async function timedPhase(label, fn, scope = "fix") {
329
+ const startedAt = Date.now();
330
+ write(scope, `${label} started`);
331
+ try {
332
+ const result = await fn();
333
+ write(scope, `${label} finished in ${((Date.now() - startedAt) / 1e3).toFixed(1)}s`);
334
+ return result;
335
+ } catch (err) {
336
+ write(scope, `${label} threw after ${((Date.now() - startedAt) / 1e3).toFixed(1)}s`);
337
+ throw err;
338
+ }
311
339
  }
312
340
  //#endregion
313
341
  //#region src/claude/invoke.ts
@@ -488,13 +516,27 @@ const CCQA_DIR = ".ccqa";
488
516
  function getCcqaDir(cwd = process.cwd()) {
489
517
  return join(cwd, CCQA_DIR);
490
518
  }
519
+ /**
520
+ * Accepts both the canonical 2-segment alias and the on-disk 4-segment path
521
+ * (which is what shell tab-completion produces):
522
+ * - "tasks/create-and-complete"
523
+ * - "features/tasks/test-cases/create-and-complete"
524
+ * - ".ccqa/features/tasks/test-cases/create-and-complete"
525
+ * All forms resolve to { featureName: "tasks", specName: "create-and-complete" }.
526
+ * Trailing slashes are tolerated.
527
+ */
491
528
  function parseSpecPath(specPath) {
492
- const parts = specPath.split("/");
493
- if (parts.length !== 2 || !parts[0] || !parts[1]) throw new Error(`Invalid spec path: "${specPath}". Expected format: "<feature>/<spec>"`);
494
- return {
529
+ const parts = specPath.replace(/^\.\/+/, "").replace(/\/+$/, "").split("/").filter((p) => p.length > 0);
530
+ if (parts[0] === ".ccqa") parts.shift();
531
+ if (parts.length === 4 && parts[0] === "features" && parts[2] === "test-cases") return {
532
+ featureName: parts[1],
533
+ specName: parts[3]
534
+ };
535
+ if (parts.length === 2 && parts[0] && parts[1]) return {
495
536
  featureName: parts[0],
496
537
  specName: parts[1]
497
538
  };
539
+ throw new Error(`Invalid spec path: "${specPath}". Expected "<feature>/<spec>" or "features/<feature>/test-cases/<spec>".`);
498
540
  }
499
541
  function getFeatureDir(featureName, cwd) {
500
542
  return join(getCcqaDir(cwd), "features", featureName);
@@ -706,10 +748,10 @@ function bundledVitestConfigPath() {
706
748
  }
707
749
  //#endregion
708
750
  //#region src/runtime/spawn-vitest.ts
709
- const require$1 = createRequire(import.meta.url);
751
+ const require$2 = createRequire(import.meta.url);
710
752
  function resolveVitestBin() {
711
- const pkgPath = require$1.resolve("vitest/package.json");
712
- const pkg = require$1(pkgPath);
753
+ const pkgPath = require$2.resolve("vitest/package.json");
754
+ const pkg = require$2(pkgPath);
713
755
  const binRel = typeof pkg.bin === "string" ? pkg.bin : pkg.bin?.vitest;
714
756
  if (!binRel) throw new Error(`vitest package.json has no bin entry (resolved at ${pkgPath})`);
715
757
  return resolve(dirname(pkgPath), binRel);
@@ -727,6 +769,19 @@ async function spawnVitestCaptured(args, opts = {}) {
727
769
  stderr
728
770
  };
729
771
  }
772
+ async function spawnVitestTeed(args, opts = {}) {
773
+ const child = spawnVitestChild(args, opts, "pipe");
774
+ const [stdout, stderr, exitCode] = await Promise.all([
775
+ teeDrain(child.stdout, process.stdout),
776
+ teeDrain(child.stderr, process.stderr),
777
+ waitExit(child)
778
+ ]);
779
+ return {
780
+ exitCode,
781
+ stdout,
782
+ stderr
783
+ };
784
+ }
730
785
  function spawnVitestStreaming(args, opts = {}) {
731
786
  const child = spawnVitestChild(args, opts, "pipe");
732
787
  return {
@@ -754,6 +809,15 @@ async function drain(stream) {
754
809
  for await (const chunk of stream) buf += chunk;
755
810
  return buf;
756
811
  }
812
+ async function teeDrain(stream, sink) {
813
+ stream.setEncoding("utf8");
814
+ let buf = "";
815
+ for await (const chunk of stream) {
816
+ buf += chunk;
817
+ sink.write(chunk);
818
+ }
819
+ return buf;
820
+ }
757
821
  function waitExit(child) {
758
822
  return new Promise((resolvePromise, rejectPromise) => {
759
823
  child.once("exit", (code) => resolvePromise(code ?? 0));
@@ -762,7 +826,7 @@ function waitExit(child) {
762
826
  }
763
827
  //#endregion
764
828
  //#region src/runtime/agent-browser-bin.ts
765
- const require = createRequire(import.meta.url);
829
+ const require$1 = createRequire(import.meta.url);
766
830
  /**
767
831
  * Resolves the directory containing the `agent-browser` shim that npm/pnpm
768
832
  * exposes on PATH for the peer-installed package. Used by `ccqa trace` to
@@ -774,7 +838,7 @@ const require = createRequire(import.meta.url);
774
838
  function resolveAgentBrowserBinDir() {
775
839
  let pkgJsonPath;
776
840
  try {
777
- pkgJsonPath = require.resolve("agent-browser/package.json");
841
+ pkgJsonPath = require$1.resolve("agent-browser/package.json");
778
842
  } catch {
779
843
  return null;
780
844
  }
@@ -793,8 +857,58 @@ function pathWithAgentBrowserShim(currentPath) {
793
857
  return dir + delimiter + path;
794
858
  }
795
859
  //#endregion
860
+ //#region src/runtime/env-vars.ts
861
+ const ENV_VAR_RE = /\$\{([A-Z_][A-Z0-9_]*)\}|\$([A-Z_][A-Z0-9_]*)/g;
862
+ /**
863
+ * Returns true if the value contains at least one `$VAR` or `${VAR}` reference.
864
+ */
865
+ function hasEnvRef(value) {
866
+ ENV_VAR_RE.lastIndex = 0;
867
+ return ENV_VAR_RE.test(value);
868
+ }
869
+ /**
870
+ * Resolve every `$VAR` / `${VAR}` reference against the current process env.
871
+ *
872
+ * Missing variables expand to the empty string, mirroring `sh` behaviour.
873
+ * Throwing would force ccqa to be invoked with every var set even for
874
+ * unused setups, which is more user-hostile than letting the test fail
875
+ * downstream with a clearer message ("login form rejected: empty password").
876
+ */
877
+ function resolveEnvRefs(value) {
878
+ return value.replace(ENV_VAR_RE, (_, braced, plain) => {
879
+ const name = braced ?? plain ?? "";
880
+ return process.env[name] ?? "";
881
+ });
882
+ }
883
+ /**
884
+ * Embed `$VAR` / `${VAR}` as a JS template-literal expression that reads
885
+ * `process.env.VAR ?? ""` at runtime. Used by `ccqa generate` so the test
886
+ * script never bakes in the secret value.
887
+ *
888
+ * Returns a JavaScript string-literal expression (template literal when env
889
+ * refs are present, plain string literal otherwise).
890
+ *
891
+ * Examples:
892
+ * "${PASSWORD}" -> '`${process.env.PASSWORD ?? ""}`'
893
+ * "user-${SUFFIX}@x.com" -> '`user-${process.env.SUFFIX ?? ""}@x.com`'
894
+ * "literal value" -> '"literal value"'
895
+ */
896
+ function envRefsToJsExpression(value) {
897
+ if (!hasEnvRef(value)) return JSON.stringify(value);
898
+ const escaped = value.replace(/\\/g, "\\\\").replace(/`/g, "\\`").replace(/\$\{/g, (match, offset, source) => {
899
+ ENV_VAR_RE.lastIndex = 0;
900
+ let m;
901
+ while ((m = ENV_VAR_RE.exec(source)) !== null) if (m.index === offset) return "${";
902
+ return "\\${";
903
+ });
904
+ ENV_VAR_RE.lastIndex = 0;
905
+ return `\`${escaped.replace(ENV_VAR_RE, (_, braced, plain) => {
906
+ return `\${process.env.${braced ?? plain ?? ""} ?? ""}`;
907
+ })}\``;
908
+ }
909
+ //#endregion
796
910
  //#region src/cli/trace.ts
797
- const traceCommand = new Command("trace").argument("<feature/spec>", "Spec to trace (e.g. tasks/create-and-complete)").description("Run agent-browser, verify assertions, and record structured actions").action(async (specPath) => {
911
+ const traceCommand = new Command("trace").argument("<feature/spec>", "Spec id in '<feature>/<spec>' form (resolves to .ccqa/features/<feature>/test-cases/<spec>/)").description("Run agent-browser, verify assertions, and record structured actions").action(async (specPath) => {
798
912
  const { featureName, specName } = parseSpecPath(specPath);
799
913
  await runTrace(featureName, specName);
800
914
  });
@@ -892,8 +1006,8 @@ async function runSetups(setups, sessionName) {
892
1006
  let script = await readFile(scriptPath, "utf-8").catch(() => {
893
1007
  throw new Error(`Setup test script not found: ${scriptPath}. Run \`ccqa generate-setup ${ref.name}\` first.`);
894
1008
  });
895
- for (const [key, value] of Object.entries(ref.params ?? {})) script = script.replaceAll(`{{${key}}}`, value);
896
- script = script.replace(/process\.env\.AGENT_BROWSER_SESSION\s*=\s*`.+`;/, `process.env.AGENT_BROWSER_SESSION = ${JSON.stringify(sessionName)};`);
1009
+ for (const [key, value] of Object.entries(ref.params ?? {})) script = script.replaceAll(`{{${key}}}`, resolveEnvRefs(value));
1010
+ script = script.replace(/process\.env\.AGENT_BROWSER_SESSION\s*\|?\|?=\s*`.+`;/, `process.env.AGENT_BROWSER_SESSION = ${JSON.stringify(sessionName)};`);
897
1011
  const tmpPath = join(getSetupDir(ref.name), `_run.spec.ts`);
898
1012
  await writeFile(tmpPath, script, "utf-8");
899
1013
  try {
@@ -1012,8 +1126,10 @@ function actionsToScript(actions, title, setupScripts) {
1012
1126
  `import { spawnSync } from "node:child_process";`,
1013
1127
  `import { ab, abWait, abAssertTextVisible, abAssertVisible, abAssertNotVisible, abAssertUrl, abAssertEnabled, abAssertDisabled, abAssertChecked, abAssertUnchecked } from "ccqa/test-helpers";`,
1014
1128
  "",
1015
- `// Single session shared across all tests — reset per run via cookies clear in first test`,
1016
- `process.env.AGENT_BROWSER_SESSION = \`ccqa-run-\${Date.now()}\`;`,
1129
+ `// Single session shared across all tests — reset per run via cookies clear in first test.`,
1130
+ `// Use ||= so an outer harness (e.g. ccqa generate's auto-fix loop) can pre-set the session`,
1131
+ `// name and inspect the same session after the run finishes.`,
1132
+ `process.env.AGENT_BROWSER_SESSION ||= \`ccqa-run-\${Date.now()}\`;`,
1017
1133
  ""
1018
1134
  ]];
1019
1135
  if (setupScripts?.length) for (const setup of setupScripts) parts.push(`test("setup: ${setup.name}", () => {`, setup.body, "}, 3 * 60 * 1000);", "");
@@ -1118,43 +1234,6 @@ function actionToLine(action) {
1118
1234
  const j = (s) => JSON.stringify(s);
1119
1235
  //#endregion
1120
1236
  //#region src/prompts/codegen.ts
1121
- function buildAutoFixPrompt(script, failureLog) {
1122
- return `You are analyzing a failing E2E test script. The test fails because some browser actions execute before the page has finished loading or navigating.
1123
-
1124
- Your task: identify which line numbers need a sleep/wait inserted BEFORE them to fix timing issues.
1125
-
1126
- ## Rules
1127
- - ONLY identify lines where a sleep is needed — do NOT suggest any other changes
1128
- - Common patterns that need a sleep:
1129
- - After \`ab("open", ...)\` when the next line interacts with elements (fill, click, etc.)
1130
- - After \`ab("press", "Enter")\` or \`ab("click", ...)\` when a page navigation occurs before the next action
1131
- - After any action that triggers a redirect or page reload
1132
- - Look at the error log to identify WHICH lines failed, then determine if a sleep before that line would fix it
1133
- - If a \`spawnSync("sleep", ...)\` already exists before a failing line, suggest increasing its duration instead
1134
- - Output ONLY a JSON array of objects, no explanation, no markdown code fences
1135
-
1136
- ## Output format
1137
- Each object has:
1138
- - "line": the 1-based line number to insert a sleep BEFORE
1139
- - "seconds": recommended sleep duration (typically 3-5)
1140
- - "reason": very short explanation (e.g., "page navigation after form submit")
1141
-
1142
- If a sleep already exists and needs to be increased:
1143
- - "line": the line number of the existing sleep
1144
- - "increase_to": the new duration in seconds
1145
- - "reason": explanation
1146
-
1147
- Example output:
1148
- [{"line": 15, "seconds": 3, "reason": "page navigation after press Enter"}, {"line": 22, "increase_to": 5, "reason": "slow page load"}]
1149
-
1150
- If no fixes are needed, return: []
1151
-
1152
- ## Test Script (with line numbers)
1153
- ${script.split("\n").map((l, i) => `${i + 1}: ${l}`).join("\n")}
1154
-
1155
- ## Failure Log
1156
- ${failureLog.slice(0, 3e3)}`;
1157
- }
1158
1237
  function buildCleanupPrompt(actions) {
1159
1238
  return `You are given a list of browser actions recorded during an E2E test trace.
1160
1239
  The trace contains noise: failed attempts, redundant retries, and duplicate operations recorded because the agent explored multiple strategies.
@@ -1185,54 +1264,890 @@ ${actions.map((a, i) => {
1185
1264
  }).join("\n")}`;
1186
1265
  }
1187
1266
  //#endregion
1267
+ //#region src/diagnose/apply.ts
1268
+ function applyDiagnosis(script, diagnosis) {
1269
+ switch (diagnosis.type) {
1270
+ case "TIMING_ISSUE": return applyTiming(script, diagnosis.fixes);
1271
+ case "OVER_ASSERTION": return applyOverAssertion(script, diagnosis.lines);
1272
+ case "SELECTOR_DRIFT": return applySelectorDrift(script, diagnosis.line, diagnosis.oldSelector, diagnosis.newSelector);
1273
+ case "DATA_MISSING": return {
1274
+ applied: false,
1275
+ reason: `data missing — ${diagnosis.reason}`
1276
+ };
1277
+ case "UNKNOWN": return {
1278
+ applied: false,
1279
+ reason: `unknown failure — ${diagnosis.reason}`
1280
+ };
1281
+ }
1282
+ }
1283
+ function applyTiming(script, fixes) {
1284
+ if (fixes.length === 0) return {
1285
+ applied: false,
1286
+ reason: "no timing fixes proposed"
1287
+ };
1288
+ const lines = script.split("\n");
1289
+ const summary = [];
1290
+ for (const fix of fixes) if (fix.kind === "increase") {
1291
+ const idx = fix.line - 1;
1292
+ if (idx < 0 || idx >= lines.length) continue;
1293
+ const original = lines[idx];
1294
+ const replaced = original.replace(/spawnSync\("sleep",\s*\["\d+"\]/, `spawnSync("sleep", ["${fix.increase_to}"]`);
1295
+ if (replaced !== original) {
1296
+ lines[idx] = replaced;
1297
+ summary.push(`line ${fix.line}: sleep → ${fix.increase_to}s`);
1298
+ }
1299
+ }
1300
+ const inserts = fixes.filter((f) => f.kind === "insert").sort((a, b) => b.line - a.line);
1301
+ for (const fix of inserts) {
1302
+ const idx = fix.line - 1;
1303
+ if (idx < 0 || idx > lines.length) continue;
1304
+ lines.splice(idx, 0, ` spawnSync("sleep", ["${fix.seconds}"], { stdio: "inherit" });`);
1305
+ summary.push(`line ${fix.line}: insert sleep ${fix.seconds}s`);
1306
+ }
1307
+ if (summary.length === 0) return {
1308
+ applied: false,
1309
+ reason: "timing fixes pointed at out-of-range lines"
1310
+ };
1311
+ return {
1312
+ applied: true,
1313
+ script: lines.join("\n"),
1314
+ summary: summary.join("; ")
1315
+ };
1316
+ }
1317
+ function applyOverAssertion(script, lineNumbers) {
1318
+ if (lineNumbers.length === 0) return {
1319
+ applied: false,
1320
+ reason: "no lines to remove"
1321
+ };
1322
+ const lines = script.split("\n");
1323
+ const targets = [...new Set(lineNumbers)].sort((a, b) => b - a);
1324
+ const removed = [];
1325
+ for (const line of targets) {
1326
+ const idx = line - 1;
1327
+ if (idx < 0 || idx >= lines.length) continue;
1328
+ const content = lines[idx];
1329
+ if (!/abAssert/.test(content)) continue;
1330
+ removed.push(`line ${line}: ${content.trim()}`);
1331
+ lines.splice(idx, 1);
1332
+ }
1333
+ if (removed.length === 0) return {
1334
+ applied: false,
1335
+ reason: "no abAssert lines matched the proposed line numbers"
1336
+ };
1337
+ return {
1338
+ applied: true,
1339
+ script: lines.join("\n"),
1340
+ summary: `removed ${removed.length} assertion(s)`
1341
+ };
1342
+ }
1343
+ function applySelectorDrift(script, line, oldSelector, newSelector) {
1344
+ const lines = script.split("\n");
1345
+ const idx = line - 1;
1346
+ if (idx < 0 || idx >= lines.length) return {
1347
+ applied: false,
1348
+ reason: `line ${line} out of range`
1349
+ };
1350
+ const content = lines[idx];
1351
+ if (!content.includes(oldSelector)) return {
1352
+ applied: false,
1353
+ reason: `oldSelector not found on line ${line}`
1354
+ };
1355
+ lines[idx] = content.replaceAll(oldSelector, newSelector);
1356
+ return {
1357
+ applied: true,
1358
+ script: lines.join("\n"),
1359
+ summary: `line ${line}: "${oldSelector}" → "${newSelector}"`
1360
+ };
1361
+ }
1362
+ /**
1363
+ * Build a unified-style diff snippet for showing the user what would change.
1364
+ * Just the changed lines with -/+ prefixes; not a real patch.
1365
+ */
1366
+ function previewDiff(before, after) {
1367
+ const a = before.split("\n");
1368
+ const b = after.split("\n");
1369
+ const out = [];
1370
+ const max = Math.max(a.length, b.length);
1371
+ for (let i = 0; i < max; i++) {
1372
+ if (a[i] === b[i]) continue;
1373
+ if (a[i] !== void 0) out.push(`- ${a[i]}`);
1374
+ if (b[i] !== void 0) out.push(`+ ${b[i]}`);
1375
+ }
1376
+ return out.join("\n");
1377
+ }
1378
+ //#endregion
1379
+ //#region src/diagnose/prompt.ts
1380
+ function buildDiagnosePrompt(input) {
1381
+ const { script, specMarkdown, actions, failureLog, pageSnapshot, outputLanguage = "en" } = input;
1382
+ const numbered = script.split("\n").map((l, i) => `${i + 1}: ${l}`).join("\n");
1383
+ return `You are diagnosing a failing E2E test. The test was generated from a recorded trace of the original interaction. Compare the failing run against the original spec and recorded actions to determine WHY the test failed and what the right fix is.
1384
+
1385
+ ## Output language
1386
+
1387
+ Write all human-readable fields (\`reasoning\`, \`reason\`) in **${outputLanguage}** (BCP-47 tag).
1388
+ Selectors, file paths, identifiers, code, type names (TIMING_ISSUE, etc.), JSON keys, and quoted strings stay verbatim regardless of language.
1389
+
1390
+ ## You have read-only filesystem tools
1391
+
1392
+ You can call \`Grep\`, \`Glob\`, and \`Read\` against the current repository before producing the JSON.
1393
+
1394
+ For SELECTOR_DRIFT specifically the failure log is usually NOT enough on its own — the runner only reports "selector X not visible". To confirm a rename, search the application source for the *type* of selector that's failing:
1395
+
1396
+ - For \`[aria-label='OLD']\` failures: \`Grep\` for \`aria-label=\` (or i18n key \`OLD\`) in the app source. If you find a near-miss like \`aria-label="NEW"\` whose text is a superset/rephrase of the failing label, that is your evidence.
1397
+ - For \`[placeholder='OLD']\` failures: \`Grep\` for \`placeholder=\`.
1398
+ - For \`[role='OLD']\` or \`[data-testid='OLD']\`: same pattern.
1399
+ - For \`text=OLD\` failures: \`Grep\` the source / i18n bundles for \`OLD\`. Locale files (\`*.json\`, \`*.yml\`, \`messages.ts\`, etc.) often hold the canonical strings.
1400
+
1401
+ You have **up to 10 tool turns**. Spend them on grep/read; do not loop. Only when you have concrete file:line evidence should you emit SELECTOR_DRIFT — otherwise prefer UNKNOWN with confidence < 0.4 and let the human decide.
1402
+
1403
+ Do NOT attempt to write, edit, run shell commands, or hit the network. Only Grep/Glob/Read.
1404
+
1405
+ ## Diagnosis categories
1406
+
1407
+ Pick exactly ONE category. The output JSON must follow the shape for that category.
1408
+
1409
+ 1. TIMING_ISSUE — element not yet present because the page hasn't loaded / navigated. Fix by inserting or extending sleeps.
1410
+ {
1411
+ "diagnosis": {
1412
+ "type": "TIMING_ISSUE",
1413
+ "fixes": [
1414
+ { "kind": "insert", "line": <1-based>, "seconds": <int>, "reason": "<short>" },
1415
+ { "kind": "increase", "line": <1-based of existing sleep>, "increase_to": <int>, "reason": "<short>" }
1416
+ ]
1417
+ },
1418
+ "confidence": <0.0-1.0>,
1419
+ "reasoning": "<why timing is the cause>"
1420
+ }
1421
+
1422
+ 2. OVER_ASSERTION — the test is asserting something the spec never required, OR a recorded assertion that is environment-dependent (e.g. a placeholder text that varies). The right fix is to remove those lines from the test.
1423
+ {
1424
+ "diagnosis": {
1425
+ "type": "OVER_ASSERTION",
1426
+ "lines": [<1-based line numbers to remove>],
1427
+ "reason": "<short>"
1428
+ },
1429
+ "confidence": <0.0-1.0>,
1430
+ "reasoning": "<why this assertion isn't required by the spec>"
1431
+ }
1432
+
1433
+ 3. SELECTOR_DRIFT — the page is healthy but a selector has been renamed/refined since the trace was recorded. The failure log will typically contain a snapshot showing the new selector. ONLY use this when you can name the exact replacement selector.
1434
+ {
1435
+ "diagnosis": {
1436
+ "type": "SELECTOR_DRIFT",
1437
+ "line": <1-based>,
1438
+ "oldSelector": "<exact string in current line>",
1439
+ "newSelector": "<exact replacement>",
1440
+ "reason": "<short>"
1441
+ },
1442
+ "confidence": <0.0-1.0>,
1443
+ "reasoning": "<evidence from failure log>"
1444
+ }
1445
+
1446
+ 4. DATA_MISSING — the test depends on data (a record, a setup, a logged-in state) that no longer exists. Not auto-fixable; the human must reseed or update the spec.
1447
+ {
1448
+ "diagnosis": { "type": "DATA_MISSING", "reason": "<what is missing>" },
1449
+ "confidence": <0.0-1.0>,
1450
+ "reasoning": "<evidence>"
1451
+ }
1452
+
1453
+ 5. UNKNOWN — none of the above fit, or evidence is too weak to choose.
1454
+ {
1455
+ "diagnosis": { "type": "UNKNOWN", "reason": "<short>" },
1456
+ "confidence": <0.0-1.0>,
1457
+ "reasoning": "<what you saw and why you can't classify>"
1458
+ }
1459
+
1460
+ ## Confidence guidance
1461
+
1462
+ - 0.9-1.0: failure log directly shows the cause (e.g. "selector X not found, snapshot lists Y" → SELECTOR_DRIFT)
1463
+ - 0.7-0.9: strong indirect evidence (e.g. timing pattern after navigation, or assertion text that doesn't appear in spec)
1464
+ - 0.4-0.7: plausible classification but multiple categories could explain it
1465
+ - < 0.4: prefer UNKNOWN over guessing
1466
+
1467
+ ## Rules
1468
+
1469
+ - Your **final** assistant message must start with \`{\` and end with \`}\` — a single JSON object, nothing before or after. No prose preamble like "Confirmed: ...", no markdown fences, no commentary, no tool calls in the same turn. If you have an analysis sentence, put it in the \`reasoning\` field.
1470
+ - Line numbers refer to the numbered test script below (1-based).
1471
+ - For SELECTOR_DRIFT, \`oldSelector\` must match a substring of the script at that line; \`newSelector\` must be backed by a concrete file:line you read with Grep/Read (do not invent). Cite the evidence in \`reasoning\`.
1472
+ - For OVER_ASSERTION, only include lines that contain assert calls (\`abAssert*\`).
1473
+ - Cross-check assertions against the spec markdown. If the spec doesn't require the assertion, OVER_ASSERTION is the better diagnosis than SELECTOR_DRIFT.
1474
+
1475
+ ## Test Spec (test-spec.md)
1476
+ ${specMarkdown}
1477
+
1478
+ ## Recorded Actions (actions.json summary)
1479
+ ${actions.map((a, i) => {
1480
+ const parts = [`${i + 1}. ${a.command}`];
1481
+ if (a.assertType) parts.push(`assertType="${a.assertType}"`);
1482
+ if (a.selector) parts.push(`selector="${a.selector}"`);
1483
+ if (a.value) parts.push(`value="${a.value}"`);
1484
+ if (a.observation) parts.push(`→ ${a.observation}`);
1485
+ return parts.join(" ");
1486
+ }).join("\n")}
1487
+
1488
+ ## Test Script (with line numbers)
1489
+ ${numbered}
1490
+
1491
+ ## Failure Log
1492
+ ${failureLog.slice(0, 4e3)}${pageSnapshot ? formatPageSnapshot(pageSnapshot) : ""}`;
1493
+ }
1494
+ /**
1495
+ * Page snapshot captured by ccqa right after the failure (agent-browser
1496
+ * accessibility tree). When present, it usually decides SELECTOR_DRIFT vs
1497
+ * TIMING_ISSUE: a near-miss aria-label / role / placeholder in the
1498
+ * snapshot is direct evidence of a rename, while a tree that doesn't
1499
+ * contain the failing locator at all (without a near-miss) points to a
1500
+ * still-loading page or genuinely missing element.
1501
+ */
1502
+ function formatPageSnapshot(snapshot) {
1503
+ return `
1504
+
1505
+ ## Page Snapshot (accessibility tree captured right after the failure)
1506
+
1507
+ This is the live state of the page when the test failed. Prefer this over your own assumptions:
1508
+
1509
+ - If a near-miss of the failing selector appears here (e.g. failing \`[aria-label='A']\` and snapshot contains \`aria-label="A-prime"\`), that is direct evidence of SELECTOR_DRIFT — propose the snapshot's value as \`newSelector\`.
1510
+ - If the failing locator is genuinely absent and no near-miss exists, the page may be still loading (TIMING_ISSUE) or the spec is asserting something not on this page (OVER_ASSERTION / DATA_MISSING).
1511
+ - If the snapshot looks unrelated to the spec (e.g. error page, login wall), DATA_MISSING is likely.
1512
+
1513
+ \`\`\`
1514
+ ${snapshot}
1515
+ \`\`\``;
1516
+ }
1517
+ //#endregion
1518
+ //#region src/diagnose/diagnose.ts
1519
+ async function diagnose(input) {
1520
+ const { result: raw, isError } = await invokeClaudeStreaming({
1521
+ prompt: buildDiagnosePrompt(input),
1522
+ allowedTools: [
1523
+ "Read",
1524
+ "Grep",
1525
+ "Glob"
1526
+ ],
1527
+ maxTurns: 10
1528
+ }, () => {});
1529
+ if (isError) return {
1530
+ result: null,
1531
+ raw: raw ?? "",
1532
+ sdkError: true
1533
+ };
1534
+ if (!raw) return {
1535
+ result: null,
1536
+ raw: "",
1537
+ sdkError: false
1538
+ };
1539
+ const candidates = extractJsonCandidates(raw);
1540
+ for (const candidate of candidates) {
1541
+ let parsed;
1542
+ try {
1543
+ parsed = JSON.parse(candidate);
1544
+ } catch {
1545
+ continue;
1546
+ }
1547
+ const normalised = normaliseResult(parsed);
1548
+ if (normalised) return {
1549
+ result: normalised,
1550
+ raw,
1551
+ sdkError: false
1552
+ };
1553
+ }
1554
+ return {
1555
+ result: {
1556
+ diagnosis: {
1557
+ type: "UNKNOWN",
1558
+ reason: "diagnose returned no parseable diagnosis JSON"
1559
+ },
1560
+ confidence: 0,
1561
+ reasoning: truncate$1(raw, 1e3)
1562
+ },
1563
+ raw,
1564
+ sdkError: false
1565
+ };
1566
+ }
1567
+ /**
1568
+ * Pull every plausible JSON object out of `raw`. We try, in order:
1569
+ * 1. The whole string with code fences stripped (the prompt asks for
1570
+ * JSON-only, so this is the happy path).
1571
+ * 2. Each balanced `{...}` block found by scanning the text. The model
1572
+ * sometimes prefixes the JSON with a "Confirmed: ..." sentence or
1573
+ * mentions partial JSON in its tool-using reasoning; we want to
1574
+ * try the *last* well-formed object first because it's most likely
1575
+ * the final answer, then earlier ones as a fallback.
1576
+ *
1577
+ * The caller `JSON.parse`s each candidate and stops at the first match
1578
+ * that normalises to a known DiagnosisResult.
1579
+ */
1580
+ function extractJsonCandidates(raw) {
1581
+ const out = [];
1582
+ const stripped = stripFence(raw);
1583
+ if (stripped) out.push(stripped);
1584
+ const blocks = [];
1585
+ let depth = 0;
1586
+ let start = -1;
1587
+ let inString = false;
1588
+ let escaped = false;
1589
+ for (let i = 0; i < raw.length; i++) {
1590
+ const ch = raw[i];
1591
+ if (inString) {
1592
+ if (escaped) escaped = false;
1593
+ else if (ch === "\\") escaped = true;
1594
+ else if (ch === "\"") inString = false;
1595
+ continue;
1596
+ }
1597
+ if (ch === "\"") {
1598
+ inString = true;
1599
+ continue;
1600
+ }
1601
+ if (ch === "{") {
1602
+ if (depth === 0) start = i;
1603
+ depth++;
1604
+ } else if (ch === "}") {
1605
+ depth--;
1606
+ if (depth === 0 && start >= 0) {
1607
+ blocks.push(raw.slice(start, i + 1));
1608
+ start = -1;
1609
+ }
1610
+ }
1611
+ }
1612
+ for (let i = blocks.length - 1; i >= 0; i--) {
1613
+ const block = blocks[i];
1614
+ if (!out.includes(block)) out.push(block);
1615
+ }
1616
+ return out;
1617
+ }
1618
+ function truncate$1(s, max) {
1619
+ return s.length <= max ? s : `${s.slice(0, max)}... [truncated, ${s.length - max} more chars]`;
1620
+ }
1621
+ function stripFence(raw) {
1622
+ return raw.trim().replace(/^```(?:json)?\s*\n?/i, "").replace(/\n?```\s*$/i, "").trim();
1623
+ }
1624
+ function normaliseResult(parsed) {
1625
+ if (!isObject(parsed)) return null;
1626
+ const diagnosis = normaliseDiagnosis(parsed["diagnosis"]);
1627
+ if (!diagnosis) return null;
1628
+ return {
1629
+ diagnosis,
1630
+ confidence: typeof parsed["confidence"] === "number" ? clamp(parsed["confidence"], 0, 1) : 0,
1631
+ reasoning: typeof parsed["reasoning"] === "string" ? parsed["reasoning"] : ""
1632
+ };
1633
+ }
1634
/**
 * Validate the `diagnosis` object of a diagnose reply and rebuild it with
 * only the fields each diagnosis type defines.
 *
 * Line numbers must be finite numbers for every type. (JSON can yield
 * `Infinity`, e.g. from `1e999`, which is a `number` but not a usable line.)
 *
 * @param {unknown} raw - Candidate diagnosis value.
 * @returns {object | null} A normalised diagnosis, or null when `raw` is not
 *   an object, has an unrecognised `type`, or lacks the data its type needs.
 */
function normaliseDiagnosis(raw) {
  if (!isObject(raw)) return null;

  // A missing or non-string reason degrades to an empty string.
  const reason = typeof raw["reason"] === "string" ? raw["reason"] : "";
  const isLineNumber = (n) => typeof n === "number" && Number.isFinite(n);

  switch (raw["type"]) {
    case "TIMING_ISSUE": {
      const fixes = normaliseSleepFixes(raw["fixes"]);
      if (fixes.length === 0) return null;
      return {
        type: "TIMING_ISSUE",
        fixes
      };
    }
    case "OVER_ASSERTION": {
      const lines = Array.isArray(raw["lines"]) ? raw["lines"].filter((n) => isLineNumber(n)) : [];
      if (lines.length === 0) return null;
      return {
        type: "OVER_ASSERTION",
        lines,
        reason
      };
    }
    case "SELECTOR_DRIFT": {
      // Apply the same finite-number rule as OVER_ASSERTION.
      const line = isLineNumber(raw["line"]) ? raw["line"] : null;
      const oldSelector = typeof raw["oldSelector"] === "string" ? raw["oldSelector"] : null;
      const newSelector = typeof raw["newSelector"] === "string" ? raw["newSelector"] : null;
      // Empty selector strings are rejected along with missing ones.
      if (line === null || !oldSelector || !newSelector) return null;
      return {
        type: "SELECTOR_DRIFT",
        line,
        oldSelector,
        newSelector,
        reason
      };
    }
    case "DATA_MISSING":
      return {
        type: "DATA_MISSING",
        reason
      };
    case "UNKNOWN":
      return {
        type: "UNKNOWN",
        reason
      };
    default:
      return null;
  }
}
1678
/**
 * Normalise the `fixes` array of a TIMING_ISSUE diagnosis.
 *
 * Each usable entry becomes either
 *   { kind: "insert", line, seconds, reason } or
 *   { kind: "increase", line, increase_to, reason }.
 * Entries that are not objects, have no numeric `line`, or lack the number
 * their kind requires are dropped.
 *
 * @param {unknown} raw - Candidate fixes value.
 * @returns {object[]} Normalised fixes in input order (empty when `raw` is not an array).
 */
function normaliseSleepFixes(raw) {
  if (!Array.isArray(raw)) return [];

  const toFix = (item) => {
    if (!isObject(item)) return null;

    const line = item["line"];
    if (typeof line !== "number") return null;

    const reason = typeof item["reason"] === "string" ? item["reason"] : "";
    const seconds = item["seconds"];
    const increaseTo = item["increase_to"];
    const kind = item["kind"];

    // "insert" wins when declared explicitly, or when only `seconds` is given.
    const treatAsInsert = kind === "insert" || (typeof seconds === "number" && increaseTo === undefined);
    if (treatAsInsert) {
      return typeof seconds === "number" ? { kind: "insert", line, seconds, reason } : null;
    }

    const treatAsIncrease = kind === "increase" || typeof increaseTo === "number";
    if (treatAsIncrease && typeof increaseTo === "number") {
      return { kind: "increase", line, increase_to: increaseTo, reason };
    }
    return null;
  };

  return raw.map(toFix).filter((entry) => entry !== null);
}
1712
/**
 * Tell whether a value is a non-null, non-array object.
 *
 * @param {unknown} v - Value to inspect.
 * @returns {boolean} True for objects other than null and arrays.
 */
function isObject(v) {
  if (v === null || Array.isArray(v)) return false;
  return typeof v === "object";
}
1715
/**
 * Restrict a number to the inclusive range [lo, hi].
 *
 * @param {number} n - Value to clamp.
 * @param {number} lo - Lower bound.
 * @param {number} hi - Upper bound.
 * @returns {number} `lo` when n < lo, `hi` when n > hi, otherwise `n` itself
 *   (so NaN is returned unchanged, because both comparisons are false).
 */
function clamp(n, lo, hi) {
  return n < lo ? lo : n > hi ? hi : n;
}
1720
+ //#endregion
1721
+ //#region src/diagnose/interactive.ts
1722
/**
 * Show the diagnosis context and ask the user what to do with the proposed fix.
 *
 * The prompt repeats until a recognised answer is typed. If the input stream
 * closes before an answer arrives (EOF, closed or exhausted stdin), no answer
 * can ever come, so the function resolves to "skip" instead of waiting forever.
 *
 * @param {{result: object, diff?: string, failureExcerpt?: string}} input
 *   Context forwarded to printContext().
 * @returns {Promise<"apply" | "skip" | "manual" | "quit">} The chosen action.
 */
async function promptForChoice(input) {
  printContext(input);
  // A Map avoids accidental hits on Object.prototype keys such as "constructor".
  const choices = new Map([
    ["a", "apply"],
    ["apply", "apply"],
    ["s", "skip"],
    ["skip", "skip"],
    ["m", "manual"],
    ["manual", "manual"],
    ["q", "quit"],
    ["quit", "quit"]
  ]);
  const rl = createInterface({
    input: process.stdin,
    output: process.stdout
  });
  try {
    while (true) {
      const answer = await question(rl, "[a]pply / [s]kip / [m]anual / [q]uit > ");
      // null means the interface closed: asking again would throw or hang.
      if (answer === null) return "skip";
      const choice = choices.get(answer.trim().toLowerCase());
      if (choice !== undefined) return choice;
      process.stdout.write(" please answer a/s/m/q\n");
    }
  } finally {
    rl.close();
  }
}
/**
 * Promise wrapper around `rl.question`.
 *
 * @param {import("node:readline").Interface} rl - Readline interface to ask on.
 * @param {string} prompt - Prompt text.
 * @returns {Promise<string | null>} The typed answer, or null when `rl` emits
 *   "close" before an answer is received.
 */
function question(rl, prompt) {
  return new Promise((resolve) => {
    const onClose = () => resolve(null);
    rl.once("close", onClose);
    rl.question(prompt, (answer) => {
      // Detach so repeated prompts do not accumulate "close" listeners.
      rl.off("close", onClose);
      resolve(answer);
    });
  });
}
1747
/**
 * Write the diagnosis summary, and optionally a failure excerpt and the
 * proposed diff, to stdout ahead of the interactive prompt.
 *
 * @param {object} context
 * @param {{diagnosis: object, confidence: number, reasoning: string}} context.result
 *   Normalised diagnose result.
 * @param {string} [context.diff] - Preview of the proposed change; omitted when falsy.
 * @param {string} [context.failureExcerpt] - Failure log excerpt; omitted when falsy.
 * @returns {void}
 */
function printContext({ result, diff, failureExcerpt }) {
  const emit = (text) => process.stdout.write(text);
  // A titled block whose body lines all carry the "[fix] " prefix.
  const emitSection = (title, body) => {
    emit(`\n[fix] ${title}:\n`);
    emit(prefixLines(body, "[fix] "));
    emit("\n");
  };

  const { diagnosis, confidence, reasoning } = result;
  emit("\n");
  emit(`[fix] diagnosis: ${diagnosis.type} (confidence ${confidence.toFixed(2)})\n`);
  if (reasoning) emit(`[fix] reasoning: ${reasoning}\n`);
  for (const detail of formatDiagnosisDetail(diagnosis)) emit(`[fix] ${detail}\n`);

  if (failureExcerpt) emitSection("failure excerpt", failureExcerpt);
  if (diff) emitSection("proposed fix", diff);
  emit("\n");
}
1765
/**
 * Render the type-specific details of a diagnosis as display lines.
 *
 * @param {object} diagnosis - Normalised diagnosis; `type` selects the layout.
 * @returns {string[] | undefined} One or two lines, or undefined for a type
 *   this function does not know.
 */
function formatDiagnosisDetail(diagnosis) {
  const { type } = diagnosis;

  if (type === "TIMING_ISSUE") {
    const described = diagnosis.fixes.map((f) => {
      if (f.kind === "insert") return `insert ${f.seconds}s @ line ${f.line}`;
      return `increase to ${f.increase_to}s @ line ${f.line}`;
    });
    return [`fixes: ${described.join(", ")}`];
  }
  if (type === "OVER_ASSERTION") {
    return [`lines: ${diagnosis.lines.join(", ")}`, `reason: ${diagnosis.reason}`];
  }
  if (type === "SELECTOR_DRIFT") {
    return [`line ${diagnosis.line}: "${diagnosis.oldSelector}" → "${diagnosis.newSelector}"`, `reason: ${diagnosis.reason}`];
  }
  if (type === "DATA_MISSING" || type === "UNKNOWN") {
    return [`reason: ${diagnosis.reason}`];
  }
  return undefined;
}
1774
/**
 * Put `prefix` in front of every line of `text`.
 *
 * Lines are separated by "\n" only; an empty trailing line (text ending in a
 * newline) is prefixed as well.
 *
 * @param {string} text - Text to prefix.
 * @param {string} prefix - String prepended to each line.
 * @returns {string} The prefixed text.
 */
function prefixLines(text, prefix) {
  const pieces = text.split("\n");
  return prefix + pieces.join(`\n${prefix}`);
}
1777
+ //#endregion
1778
+ //#region src/diagnose/snapshot.ts
1779
// CommonJS-style `require` created from this module's URL; within this region
// it is used by resolveAgentBrowserBin() for `require.resolve` only.
+ const require = createRequire(import.meta.url);
1780
// Time limit, in milliseconds, for one `agent-browser snapshot` call (1e4 = 10 000).
+ const SNAPSHOT_TIMEOUT_MS = 1e4;
1781
// Time limit, in milliseconds, for one `agent-browser close` call.
+ const CLOSE_TIMEOUT_MS = 1e4;
1782
// Cap applied to captured snapshot text. NOTE: truncate() compares this with
// String#length, so despite the name it counts UTF-16 code units, not bytes.
+ const MAX_OUTPUT_BYTES = 6e4;
1783
/**
 * Locate the agent-browser CLI entry script.
 *
 * @returns {string | null} Resolved path of `agent-browser/bin/agent-browser.js`,
 *   or null when `require.resolve` throws.
 */
function resolveAgentBrowserBin() {
  let binPath = null;
  try {
    binPath = require.resolve("agent-browser/bin/agent-browser.js");
  } catch {
    // Resolution failed: report "not available" instead of throwing.
  }
  return binPath;
}
1790
/**
 * Run `agent-browser snapshot` against the session that the failed vitest
 * run just used, and return its accessibility-tree dump.
 *
 * Returns null when agent-browser is missing, the command fails to start,
 * exits with a non-zero code, prints nothing, or exceeds
 * {@link SNAPSHOT_TIMEOUT_MS}. We never throw — a missing snapshot just means
 * diagnose has less context.
 *
 * The output is truncated to {@link MAX_OUTPUT_BYTES} so the prompt stays
 * within budget on large pages.
 *
 * @param {string} sessionName - Value passed to the child as AGENT_BROWSER_SESSION.
 * @returns {Promise<string | null>} Trimmed (and possibly truncated) stdout, or null.
 */
async function captureSnapshot(sessionName) {
  const abBin = resolveAgentBrowserBin();
  if (!abBin) return null;
  return new Promise((resolve) => {
    const child = spawn(process.execPath, [abBin, "snapshot"], {
      env: {
        ...process.env,
        AGENT_BROWSER_SESSION: sessionName
      },
      // stderr is never inspected, so discard it rather than buffering it.
      stdio: ["ignore", "pipe", "ignore"]
    });
    let stdout = "";
    let timer = null;
    let settled = false;
    // Single exit point: the first outcome wins and later events are ignored.
    const settle = (value) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve(value);
    };
    timer = setTimeout(() => {
      child.kill("SIGTERM");
      // Settle right away: a child that ignores SIGTERM must not wedge the caller.
      settle(null);
    }, SNAPSHOT_TIMEOUT_MS);
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.on("error", () => settle(null));
    // "close" fires after the stdio streams have ended, so all output has been
    // received; "exit" can fire while data is still buffered in the pipe.
    child.on("close", (code) => {
      if (code !== 0) {
        settle(null);
        return;
      }
      const trimmed = stdout.trim();
      settle(trimmed ? truncate(trimmed, MAX_OUTPUT_BYTES) : null);
    });
  });
}
1850
/**
 * Cut a string down to `maxBytes` characters and append a truncation marker.
 *
 * The limit is compared with `s.length`, i.e. it counts UTF-16 code units
 * rather than encoded bytes.
 *
 * @param {string} s - Text to shorten.
 * @param {number} maxBytes - Maximum number of characters kept.
 * @returns {string} `s` unchanged when short enough, otherwise its head plus
 *   a "[truncated, N more chars]" line.
 */
function truncate(s, maxBytes) {
  const overflow = s.length - maxBytes;
  if (overflow <= 0) return s;
  const head = s.slice(0, maxBytes);
  return `${head}\n... [truncated, ${overflow} more chars]`;
}
1854
/**
 * Close an agent-browser session by name. Used before/after a `ccqa generate`
 * run so a wedged daemon from a previous attempt can't hang the next one.
 *
 * Always resolves; never throws. If the binary is missing, the session
 * doesn't exist, or the call exceeds {@link CLOSE_TIMEOUT_MS}, we silently
 * return — close is best-effort cleanup, not a precondition.
 *
 * @param {string} sessionName - Value passed to the child as AGENT_BROWSER_SESSION.
 * @returns {Promise<void>}
 */
async function closeSession(sessionName) {
  const abBin = resolveAgentBrowserBin();
  if (!abBin) return;
  await new Promise((resolve) => {
    const child = spawn(process.execPath, [abBin, "close"], {
      env: {
        ...process.env,
        AGENT_BROWSER_SESSION: sessionName
      },
      stdio: "ignore"
    });
    let timer = null;
    const finish = () => {
      clearTimeout(timer);
      resolve();
    };
    timer = setTimeout(() => {
      child.kill("SIGTERM");
      // Resolve now: waiting for "exit" would hang if the child ignores SIGTERM,
      // which would break the "always resolves" contract above.
      finish();
    }, CLOSE_TIMEOUT_MS);
    child.on("error", finish);
    child.on("exit", finish);
  });
}
1884
+ //#endregion
1885
+ //#region src/diagnose/loop.ts
1886
// Minimum diagnosis confidence at which decide() auto-applies a fix when the
// mode is not "auto"; below it the fix is prompted for or skipped.
+ const DEFAULT_CONFIDENCE_THRESHOLD = .8;
1887
/**
 * Repeatedly diagnose a failing test script, write the proposed fix and
 * re-run vitest, for at most `maxRetries` attempts.
 *
 * @param {object} input
 * @param {string} input.scriptPath - File the fixed script is written to.
 * @param {{exitCode: number, output: string, currentScript: string}} input.initialRun
 *   Outcome of the first vitest run.
 * @param {string} input.specMarkdown - Spec text forwarded to diagnose.
 * @param {Array} input.actions - Recorded actions forwarded to diagnose.
 * @param {number} input.maxRetries - Maximum number of fix attempts.
 * @param {string} input.mode - Fix mode forwarded to diagnoseAndFix().
 * @param {(path: string) => Promise<object>} input.runVitest - Runs vitest on the script.
 * @param {string} [input.agentBrowserSession] - When set, a page snapshot is
 *   captured before each diagnosis.
 * @param {string} [input.outputLanguage] - Language forwarded to diagnose.
 * @returns {Promise<boolean>} True when vitest finally passed; false when
 *   retries were exhausted or the diagnose step bailed out early.
 */
async function runAutoFixLoop(input) {
  const {
    scriptPath,
    initialRun,
    specMarkdown,
    actions,
    maxRetries,
    mode,
    runVitest,
    agentBrowserSession,
    outputLanguage
  } = input;

  // `run` always holds the most recent vitest outcome.
  let run = initialRun;
  if (run.exitCode === 0) return true;

  for (let attempt = 1; attempt <= maxRetries; attempt += 1) {
    fix(`attempt ${attempt}/${maxRetries}`);
    blank();

    let pageSnapshot = null;
    if (agentBrowserSession) {
      pageSnapshot = await timedPhase("page snapshot", () => captureSnapshot(agentBrowserSession), "fix");
      fix(pageSnapshot ? `page snapshot: ${pageSnapshot.length} chars captured` : "page snapshot unavailable; continuing without it");
    }

    const fixed = await diagnoseAndFix({
      script: run.currentScript,
      specMarkdown,
      actions,
      failureLog: run.output,
      pageSnapshot: pageSnapshot ?? undefined,
      mode,
      outputLanguage
    });
    if (!fixed) {
      fix("bailed out; see diagnosis above");
      return false;
    }

    await writeFile(scriptPath, fixed, "utf-8");
    fix(`saved: ${scriptPath}`);
    blank();

    run = await timedPhase(`vitest run #${attempt + 1}`, () => runVitest(scriptPath), "run");
    if (run.exitCode === 0) return true;
  }
  return false;
}
1922
/**
 * Run one diagnose pass over a failing script and decide whether to apply
 * the resulting fix.
 *
 * @param {object} input
 * @param {string} input.script - Current script text.
 * @param {string} input.specMarkdown - Spec text forwarded to diagnose.
 * @param {Array} input.actions - Recorded actions forwarded to diagnose.
 * @param {string} input.failureLog - Output of the failing run.
 * @param {string} [input.pageSnapshot] - Optional page snapshot.
 * @param {string} input.mode - Fix mode passed to decide().
 * @param {string} [input.outputLanguage] - Language for diagnose and hints.
 * @returns {Promise<string | null | undefined>} The fixed script text when a
 *   fix is applied; null when the pass gives up (SDK error, empty result,
 *   non-fixable diagnosis, inapplicable fix, low confidence, or the user
 *   chose skip/manual). Choosing "quit" terminates the process with code 1.
 */
async function diagnoseAndFix(input) {
  const { script, specMarkdown, actions, failureLog, pageSnapshot, mode, outputLanguage } = input;

  const runDiagnose = () => diagnose({
    script,
    specMarkdown,
    actions,
    failureLog,
    pageSnapshot,
    outputLanguage
  });
  const outcome = await timedPhase("diagnose", runDiagnose, "fix");

  if (outcome.sdkError) {
    fix("diagnose: SDK error talking to Claude");
    if (outcome.raw) fix(`diagnose raw: ${truncateForLog(outcome.raw)}`);
    hint("re-run later, or check ANTHROPIC_API_KEY / network connectivity");
    return null;
  }

  const { result } = outcome;
  if (!result) {
    fix("diagnose: empty response from LLM");
    hint("re-run; if this keeps happening the failure log may be too short to diagnose");
    return null;
  }
  reportDiagnosis(result);

  // Shared "give up" path: print next-step hints, then report no fix.
  const handOff = () => {
    handoffToUser(result, outcome.raw, outputLanguage);
    return null;
  };

  const diagnosisType = result.diagnosis.type;
  if (diagnosisType === "DATA_MISSING" || diagnosisType === "UNKNOWN") return handOff();

  const apply = applyDiagnosis(script, result.diagnosis);
  if (!apply.applied) {
    fix(`cannot apply: ${apply.reason}`);
    return handOff();
  }

  const decision = decide(result, mode);
  if (decision === "apply-auto") {
    fix(`applying automatically: ${apply.summary}`);
    return apply.script;
  }
  if (decision === "skip-low-confidence") {
    fix(`confidence ${result.confidence.toFixed(2)} below threshold ${DEFAULT_CONFIDENCE_THRESHOLD}; skipping (--no-interactive)`);
    return handOff();
  }

  const choice = await promptForChoice({
    result,
    diff: previewDiff(script, apply.script),
    failureExcerpt: failureLog.slice(0, 800)
  });
  if (choice === "apply") {
    fix(`applied: ${apply.summary}`);
    return apply.script;
  }
  if (choice === "skip") {
    fix("skipped; leaving script untouched");
    return null;
  }
  if (choice === "manual") {
    fix("paused for manual edit");
    return handOff();
  }
  if (choice === "quit") {
    fix("user quit");
    process.exit(1);
  }
  return undefined;
}
1985
/**
 * Choose how to treat an applicable fix, given the fix mode and confidence.
 *
 * @param {{confidence: number}} result - Normalised diagnose result.
 * @param {string} mode - "auto", "non-interactive", or anything else (interactive).
 * @returns {"apply-auto" | "skip-low-confidence" | "interactive"}
 *   "apply-auto" in auto mode or at/above the confidence threshold; otherwise
 *   "skip-low-confidence" in non-interactive mode and "interactive" elsewhere.
 */
function decide(result, mode) {
  if (mode === "auto") return "apply-auto";
  if (result.confidence >= DEFAULT_CONFIDENCE_THRESHOLD) return "apply-auto";
  return mode === "non-interactive" ? "skip-low-confidence" : "interactive";
}
1991
/**
 * Log the diagnosis type, the confidence (two decimals) and, when present,
 * the reasoning through fix().
 *
 * @param {{diagnosis: {type: string}, confidence: number, reasoning: string}} result
 *   Normalised diagnose result.
 * @returns {void}
 */
function reportDiagnosis(result) {
  const { diagnosis, confidence, reasoning } = result;
  fix(`diagnosis: ${diagnosis.type}`);
  fix(`confidence: ${confidence.toFixed(2)}`);
  if (reasoning) {
    fix(`reasoning: ${reasoning}`);
  }
}
1996
/**
 * Emit a category-specific [hint] block that tells the user what to do next.
 * Called whenever the loop has decided it cannot proceed on its own —
 * because the diagnosis is intrinsically not auto-fixable, because the
 * proposed fix wasn't applicable to the current script, or because the
 * confidence was too low under --no-interactive.
 *
 * The goal is to never leave the user with just "auto-fix exhausted" —
 * always state which side (test artifacts vs. application) likely needs
 * the next action.
 *
 * @param {{diagnosis: object}} result - Normalised diagnose result.
 * @param {string} [raw] - Raw model reply; logged (shortened) when truthy.
 * @param {string} [language] - Requested output language tag.
 * @returns {void}
 */
function handoffToUser(result, raw, language) {
  const messages = handoffMessage(result.diagnosis, normLang(language));
  messages.forEach((message) => hint(message));
  if (raw) {
    fix(`diagnose raw: ${truncateForLog(raw)}`);
  }
}
2012
/**
 * Reduce a language tag to one of the two supported hint languages.
 *
 * @param {string} [language] - Language tag such as "en" or "ja-JP".
 * @returns {"en" | "ja"} "ja" when the tag starts with "ja" (case-insensitive);
 *   "en" for everything else, including a missing or empty tag.
 */
function normLang(language) {
  const isJapanese = Boolean(language) && language.toLowerCase().startsWith("ja");
  return isJapanese ? "ja" : "en";
}
2016
/** Hint renderers keyed by normalised language code. */
const HANDOFF = {
  en: handoffEn,
  ja: handoffJa
};
/**
 * Build the hand-off hint lines for a diagnosis in the given language.
 *
 * @param {object} diagnosis - Normalised diagnosis.
 * @param {"en" | "ja"} language - Key of HANDOFF selecting the renderer.
 * @returns {string[]} Whatever the selected renderer returns.
 */
function handoffMessage(diagnosis, language) {
  const render = HANDOFF[language];
  return render(diagnosis);
}
2023
/**
 * English hand-off hints: what was found and what the user should do next.
 *
 * @param {object} diagnosis - Normalised diagnosis; `type` selects the text.
 * @returns {string[] | undefined} Hint lines, or undefined for an unknown type.
 */
function handoffEn(diagnosis) {
  const { type, reason } = diagnosis;

  if (type === "DATA_MISSING") {
    return [
      `application-side issue: required data is missing. ${reason}`,
      "next step: seed the data (or update test-spec.md prerequisites), then re-run trace + generate."
    ];
  }
  if (type === "UNKNOWN") {
    return [
      `could not classify the failure. ${reason}`,
      "next step: read the failure log above, decide whether the test or the app is wrong, and fix manually. consider re-running ccqa trace if the recorded flow no longer matches the live app."
    ];
  }
  if (type === "SELECTOR_DRIFT") {
    const { line, oldSelector, newSelector } = diagnosis;
    return [
      "selector likely drifted but auto-apply was not safe.",
      `proposed: line ${line}: "${oldSelector}" → "${newSelector}" (${reason}).`,
      "next step: confirm in the live app and either accept the proposal manually, or re-run ccqa trace to recapture the new selector."
    ];
  }
  if (type === "OVER_ASSERTION") {
    return [
      `assertion may not be required by the spec. lines: ${diagnosis.lines.join(", ")} (${reason}).`,
      "next step: cross-check test-spec.md. either delete the assertion from the test, or tighten the spec to require it."
    ];
  }
  if (type === "TIMING_ISSUE") {
    return [
      "timing fix proposed but couldn't be applied automatically.",
      "next step: insert a sleep manually before the failing line, or re-run with a higher confidence trace."
    ];
  }
  return undefined;
}
2036
/**
 * Japanese hand-off hints: what was found and what the user should do next.
 *
 * @param {object} diagnosis - Normalised diagnosis; `type` selects the text.
 * @returns {string[] | undefined} Hint lines, or undefined for an unknown type.
 */
function handoffJa(diagnosis) {
  const { type, reason } = diagnosis;

  if (type === "DATA_MISSING") {
    return [
      `アプリ側の問題: 必要なデータが不足しています。${reason}`,
      "次のステップ: データを seed する(または test-spec.md の prerequisites を更新)してから ccqa trace + generate をやり直してください。"
    ];
  }
  if (type === "UNKNOWN") {
    return [
      `失敗を分類できませんでした。${reason}`,
      "次のステップ: 上の失敗ログを確認し、テストとアプリのどちらが原因か判断して手動で修正してください。記録した手順がアプリの現状と合わない場合は ccqa trace の再実行を検討してください。"
    ];
  }
  if (type === "SELECTOR_DRIFT") {
    const { line, oldSelector, newSelector } = diagnosis;
    return [
      "selector が変わった可能性が高いですが、自動適用は安全でないと判断しました。",
      `提案: 行 ${line}: "${oldSelector}" → "${newSelector}" (${reason})`,
      "次のステップ: アプリで新 selector を確認し、手動で適用するか ccqa trace をやり直して新しい selector を取り直してください。"
    ];
  }
  if (type === "OVER_ASSERTION") {
    return [
      `spec が要求していない assertion の可能性があります。対象行: ${diagnosis.lines.join(", ")} (${reason})`,
      "次のステップ: test-spec.md と照合して、テスト側の assertion を削るか、spec 側を更新してください。"
    ];
  }
  if (type === "TIMING_ISSUE") {
    return [
      "timing 関連の修正案は出ましたが、自動適用できませんでした。",
      "次のステップ: 失敗行の前に手動で sleep を入れるか、より信頼度の高い trace を取り直してください。"
    ];
  }
  return undefined;
}
2049
/**
 * Flatten text onto one line and cap it at 400 characters for log output.
 *
 * Each run of newlines becomes " ⏎ "; longer results keep their first 400
 * characters followed by a "[+N chars]" marker.
 *
 * @param {string} s - Text to shorten.
 * @returns {string} Single-line, length-capped text.
 */
function truncateForLog(s) {
  const limit = 400;
  const flattened = s.replace(/\n+/g, " ⏎ ");
  if (flattened.length <= limit) return flattened;
  const hidden = flattened.length - limit;
  return `${flattened.slice(0, limit)}... [+${hidden} chars]`;
}
2053
/**
 * Map parsed CLI options to a fix mode.
 *
 * @param {{auto?: boolean, interactive?: boolean, noInteractive?: boolean}} opts
 *   Parsed command options.
 * @returns {"auto" | "non-interactive" | "interactive"} "auto" when `auto` is
 *   truthy; otherwise "non-interactive" when `interactive` is exactly false
 *   or `noInteractive` is truthy; otherwise "interactive".
 */
function resolveMode(opts) {
  if (opts.auto) return "auto";
  const promptingDisabled = opts.interactive === false || opts.noInteractive;
  return promptingDisabled ? "non-interactive" : "interactive";
}
2058
+ //#endregion
1188
2059
  //#region src/cli/generate.ts
1189
- const generateCommand = new Command("generate").argument("<feature/spec>", "Spec to generate test for (e.g. tasks/create-and-complete)").description("Generate agent-browser test script from recorded trace actions").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").action(async (specPath, opts) => {
2060
+ const generateCommand = new Command("generate").argument("<feature/spec>", "Spec id in '<feature>/<spec>' form (resolves to .ccqa/features/<feature>/test-cases/<spec>/)").description("Generate agent-browser test script from recorded trace actions. test.spec.ts is regenerated from actions.json on every run; pass --force to overwrite manual edits.").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").option("--auto", "Apply auto-fixes without confirmation regardless of confidence (CI use)").option("--no-interactive", "Never prompt; only auto-apply when confidence is high, otherwise give up").option("--force", "Overwrite an existing test.spec.ts without warning").option("--no-snapshot", "Don't pin AGENT_BROWSER_SESSION / capture page snapshots after a failure (debug toggle)").option("--language <bcp47>", "Language for diagnose reasoning / hint text (e.g. 'en', 'ja')", "en").action(async (specPath, opts) => {
1190
2061
  const { featureName, specName } = parseSpecPath(specPath);
1191
- await runGenerate(featureName, specName, parseInt(opts.maxRetries, 10));
2062
+ const mode = resolveMode(opts);
2063
+ const useSnapshot = opts.snapshot !== false;
2064
+ await runGenerate(featureName, specName, parseInt(opts.maxRetries, 10), mode, opts.force ?? false, useSnapshot, opts.language ?? "en");
1192
2065
  });
1193
- async function runGenerate(featureName, specName, maxRetries) {
2066
+ async function runGenerate(featureName, specName, maxRetries, mode, force, useSnapshot, outputLanguage) {
1194
2067
  header("generate", `${featureName}/${specName}`);
1195
2068
  await ensureCcqaDir();
2069
+ const existingScriptPath = await getTestScript(featureName, specName);
2070
+ if (existingScriptPath && !force) {
2071
+ if (!await confirmOverwrite(existingScriptPath)) {
2072
+ info("aborted; pass --force to overwrite without prompting");
2073
+ return;
2074
+ }
2075
+ }
1196
2076
  const { path: actionsPath, actions } = await getTraceActions(featureName, specName);
1197
2077
  meta("trace", actionsPath);
1198
2078
  meta("actions", actions.length);
1199
- const spec = parseTestSpec(await readSpecFile(featureName, specName));
2079
+ const specContent = await readSpecFile(featureName, specName);
2080
+ const spec = parseTestSpec(specContent);
1200
2081
  const setupScripts = await loadSetupScripts(spec.setups);
1201
2082
  if (setupScripts.length > 0) meta("setups", setupScripts.map((s) => s.name).join(", "));
2083
+ meta("fix-mode", mode);
2084
+ meta("language", outputLanguage);
1202
2085
  blank();
1203
2086
  const cleanedActions = await cleanupActions$1(actions);
1204
2087
  if (cleanedActions.length !== actions.length) meta("cleaned", cleanedActions.length);
1205
2088
  const scriptPath = await saveTestScript(featureName, specName, actionsToScript(cleanedActions, spec.title, setupScripts.length > 0 ? setupScripts : void 0));
1206
2089
  meta("saved", scriptPath);
1207
2090
  blank();
1208
- let { exitCode, output, currentScript } = await runVitest$1(scriptPath);
1209
- if (exitCode === 0) {
1210
- hint(`run 'ccqa run ${featureName}/${specName}' to execute the test`);
1211
- return;
2091
+ const agentBrowserSession = useSnapshot ? `ccqa-generate-${Date.now()}` : void 0;
2092
+ const runVitestForSession = (path) => runVitest$1(path, agentBrowserSession);
2093
+ let signalHandler = null;
2094
+ if (agentBrowserSession) {
2095
+ await closeSession(agentBrowserSession);
2096
+ signalHandler = () => {
2097
+ closeSession(agentBrowserSession).finally(() => process.exit(130));
2098
+ };
2099
+ process.once("SIGINT", signalHandler);
2100
+ process.once("SIGTERM", signalHandler);
1212
2101
  }
1213
- for (let attempt = 1; attempt <= maxRetries; attempt++) {
1214
- info(`auto-fix attempt ${attempt}/${maxRetries}...`);
1215
- blank();
1216
- const fixed = await autoFixWithLLM$1(currentScript, output);
1217
- if (!fixed) {
1218
- warn("could not determine fix from failure log");
1219
- break;
2102
+ try {
2103
+ const initialRun = await timedPhase("vitest run #1", () => runVitestForSession(scriptPath), "run");
2104
+ if (initialRun.exitCode === 0) {
2105
+ hint(`run 'ccqa run ${featureName}/${specName}' to execute the test`);
2106
+ return;
1220
2107
  }
1221
- await writeFile(scriptPath, fixed, "utf-8");
1222
- meta("saved", scriptPath);
1223
- blank();
1224
- ({exitCode, output, currentScript} = await runVitest$1(scriptPath));
1225
- if (exitCode === 0) {
2108
+ if (await runAutoFixLoop({
2109
+ scriptPath,
2110
+ initialRun,
2111
+ specMarkdown: specContent,
2112
+ actions: cleanedActions,
2113
+ maxRetries,
2114
+ mode,
2115
+ runVitest: runVitestForSession,
2116
+ agentBrowserSession,
2117
+ outputLanguage
2118
+ })) {
1226
2119
  hint(`run 'ccqa run ${featureName}/${specName}' to execute the test`);
1227
2120
  return;
1228
2121
  }
2122
+ warn("auto-fix exhausted; test still failing");
2123
+ process.exit(1);
2124
+ } finally {
2125
+ if (signalHandler) {
2126
+ process.off("SIGINT", signalHandler);
2127
+ process.off("SIGTERM", signalHandler);
2128
+ }
2129
+ if (agentBrowserSession) await closeSession(agentBrowserSession);
2130
+ }
2131
+ }
2132
+ async function confirmOverwrite(path) {
2133
+ if (!process.stdin.isTTY) {
2134
+ warn(`${path} exists and stdin is not a TTY; refusing to overwrite. Pass --force to allow.`);
2135
+ return false;
2136
+ }
2137
+ const rl = createInterface({
2138
+ input: process.stdin,
2139
+ output: process.stdout
2140
+ });
2141
+ try {
2142
+ process.stdout.write("\n");
2143
+ process.stdout.write(`[warn] ${path} already exists.\n`);
2144
+ process.stdout.write(`[warn] generate will regenerate it from actions.json and any manual edits will be lost.\n`);
2145
+ const norm = (await new Promise((res) => rl.question("Overwrite? [y/N] ", res))).trim().toLowerCase();
2146
+ return norm === "y" || norm === "yes";
2147
+ } finally {
2148
+ rl.close();
1229
2149
  }
1230
- warn("auto-fix exhausted — test still failing");
1231
- process.exit(1);
1232
2150
  }
1233
- /**
1234
- * Load setup test scripts, extract test body, and replace {{placeholders}} with params values.
1235
- */
1236
2151
  async function loadSetupScripts(setups) {
1237
2152
  if (!setups?.length) return [];
1238
2153
  const result = [];
@@ -1282,48 +2197,28 @@ function extractTestBody(script) {
1282
2197
  }
1283
2198
  function replacePlaceholders(body, params) {
1284
2199
  let result = body;
1285
- for (const [key, value] of Object.entries(params)) result = result.replaceAll(`{{${key}}}`, value);
2200
+ for (const [key, value] of Object.entries(params)) if (hasEnvRef(value)) {
2201
+ const expr = envRefsToJsExpression(value);
2202
+ const re = new RegExp(`(["'])\\{\\{${escapeRegExp(key)}\\}\\}\\1`, "g");
2203
+ result = result.replace(re, expr);
2204
+ result = result.replaceAll(`{{${key}}}`, value);
2205
+ } else result = result.replaceAll(`{{${key}}}`, value);
1286
2206
  return result;
1287
2207
  }
1288
- async function autoFixWithLLM$1(script, failureLog) {
1289
- try {
1290
- const { result, isError } = await invokeClaudeStreaming({
1291
- prompt: buildAutoFixPrompt(script, failureLog),
1292
- disableBuiltinTools: true,
1293
- maxTurns: 1
1294
- }, () => {});
1295
- if (isError || !result) return null;
1296
- const json = result.trim().replace(/^```(?:json)?\n?([\s\S]*?)\n?```$/, "$1").trim();
1297
- const fixes = JSON.parse(json);
1298
- if (!Array.isArray(fixes) || fixes.length === 0) return null;
1299
- return applySleepFixes$1(script, fixes);
1300
- } catch {
1301
- return null;
1302
- }
1303
- }
1304
- function applySleepFixes$1(script, fixes) {
1305
- const lines = script.split("\n");
1306
- for (const fix of fixes) if ("increase_to" in fix) {
1307
- const idx = fix.line - 1;
1308
- if (idx >= 0 && idx < lines.length) lines[idx] = lines[idx].replace(/spawnSync\("sleep",\s*\["\d+"\]/, `spawnSync("sleep", ["${fix.increase_to}"]`);
1309
- }
1310
- const inserts = fixes.filter((f) => "seconds" in f && !("increase_to" in f)).sort((a, b) => b.line - a.line);
1311
- for (const fix of inserts) {
1312
- const idx = fix.line - 1;
1313
- if (idx >= 0 && idx <= lines.length) lines.splice(idx, 0, ` spawnSync("sleep", ["${fix.seconds}"], { stdio: "inherit" });`);
1314
- }
1315
- return lines.join("\n");
2208
+ function escapeRegExp(s) {
2209
+ return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
1316
2210
  }
1317
- async function runVitest$1(scriptPath) {
1318
- const { exitCode, stdout, stderr } = await spawnVitestCaptured([
2211
+ async function runVitest$1(scriptPath, agentBrowserSession) {
2212
+ const { exitCode, stdout, stderr } = await spawnVitestTeed([
1319
2213
  "run",
1320
2214
  "--config",
1321
2215
  bundledVitestConfigPath(),
1322
2216
  scriptPath
1323
- ]);
2217
+ ], agentBrowserSession ? { env: {
2218
+ ...process.env,
2219
+ AGENT_BROWSER_SESSION: agentBrowserSession
2220
+ } } : {});
1324
2221
  const currentScript = await readFile(scriptPath, "utf8");
1325
- process.stdout.write(stdout);
1326
- if (stderr) process.stderr.write(stderr);
1327
2222
  return {
1328
2223
  exitCode,
1329
2224
  output: stdout + stderr,
@@ -1378,7 +2273,7 @@ async function runTests(target) {
1378
2273
  warn(`${featureName}/${specName}: no test.spec.ts found`);
1379
2274
  continue;
1380
2275
  }
1381
- info(`▶ ${featureName}/${specName}`);
2276
+ run(`${featureName}/${specName}`);
1382
2277
  meta("test", scriptFile);
1383
2278
  blank();
1384
2279
  const reportFile = join(tmpDir, `report-${i}.json`);
@@ -1522,6 +2417,7 @@ async function runTraceSetup(name) {
1522
2417
  await ensureCcqaDir();
1523
2418
  const spec = parseSetupSpec(await readSetupSpecFile(name));
1524
2419
  const resolvedSpec = replacePlaceholdersWithDummies(spec);
2420
+ const secretsToScrub = buildSecretsToScrub(spec);
1525
2421
  meta("setup", spec.title);
1526
2422
  meta("steps", spec.steps.length);
1527
2423
  if (spec.placeholders) meta("placeholders", Object.keys(spec.placeholders).join(", "));
@@ -1542,8 +2438,12 @@ async function runTraceSetup(name) {
1542
2438
  "Grep",
1543
2439
  "Glob"
1544
2440
  ],
2441
+ env: {
2442
+ PATH: pathWithAgentBrowserShim(process.env["PATH"]),
2443
+ ANTHROPIC_API_KEY: ""
2444
+ },
1545
2445
  onAbAction: (abAction) => {
1546
- const action = parseAbAction(abAction);
2446
+ const action = parseAbAction(scrubSecrets(abAction, secretsToScrub));
1547
2447
  if (action) traceActions.push(action);
1548
2448
  },
1549
2449
  onAbActionFailed: () => {
@@ -1565,7 +2465,7 @@ async function runTraceSetup(name) {
1565
2465
  if (routeStep.status === "FAILED") overallStatus = "failed";
1566
2466
  }
1567
2467
  } else if (trimmed.startsWith("AB_ACTION|snapshot|") || trimmed.startsWith("AB_ACTION|assert|")) {
1568
- const action = parseAbAction(trimmed);
2468
+ const action = parseAbAction(scrubSecrets(trimmed, secretsToScrub));
1569
2469
  if (action) traceActions.push(action);
1570
2470
  }
1571
2471
  }
@@ -1591,7 +2491,7 @@ function replacePlaceholdersWithDummies(spec) {
1591
2491
  const dummies = spec.placeholders;
1592
2492
  const resolve = (text) => {
1593
2493
  let result = text;
1594
- for (const [key, def] of Object.entries(dummies)) result = result.replaceAll(`{{${key}}}`, def.dummy);
2494
+ for (const [key, def] of Object.entries(dummies)) result = result.replaceAll(`{{${key}}}`, resolveEnvRefs(def.dummy));
1595
2495
  return result;
1596
2496
  };
1597
2497
  return {
@@ -1603,17 +2503,52 @@ function replacePlaceholdersWithDummies(spec) {
1603
2503
  }))
1604
2504
  };
1605
2505
  }
2506
/**
 * Build the substitution map used to scrub real secret values out of
 * recorded actions before they are written to actions.json.
 *
 * For each placeholder whose dummy contains env refs, store
 *   <resolved-value> -> <original ${VAR} string>
 * so that an `ab fill ... <secret>` line records the placeholder string
 * instead of the secret. Empty resolved values are skipped — they would
 * otherwise replace incidental empty strings in the recorded actions.
 *
 * @param {{placeholders?: Record<string, {dummy: string}>}} spec - Parsed setup spec.
 * @returns {Map<string, string>} Resolved value -> original dummy string.
 */
function buildSecretsToScrub(spec) {
  const secrets = new Map();
  const placeholders = spec.placeholders;
  if (!placeholders) return secrets;

  for (const { dummy } of Object.values(placeholders)) {
    // Dummies without env refs resolve to "" here and are therefore skipped.
    const resolved = hasEnvRef(dummy) ? resolveEnvRefs(dummy) : "";
    if (resolved) secrets.set(resolved, dummy);
  }
  return secrets;
}
2528
/**
 * Replace every occurrence of a recorded secret with its `${VAR}` placeholder.
 *
 * Secrets are substituted longest-first. If one secret is a substring of
 * another (say "pass" and "password123"), replacing the short one first would
 * mangle the long one and leave its remainder ("word123") in the output.
 *
 * @param {string} line - Recorded action line.
 * @param {Map<string, string>} secrets - Secret value -> placeholder string.
 * @returns {string} The line with every known secret replaced.
 */
function scrubSecrets(line, secrets) {
  if (secrets.size === 0) return line;
  // Array#sort is stable, so equally long secrets keep the map's insertion order.
  const longestFirst = [...secrets].sort(([a], [b]) => b.length - a.length);
  let result = line;
  for (const [secret, placeholder] of longestFirst) {
    // An empty key would match between every character; never substitute it.
    if (!secret || !result.includes(secret)) continue;
    result = result.split(secret).join(placeholder);
  }
  return result;
}
1606
2538
  //#endregion
1607
2539
  //#region src/cli/generate-setup.ts
1608
- const generateSetupCommand = new Command("generate-setup").argument("<name>", "Setup name to generate (e.g. login)").description("Clean up, validate, and templatize setup actions").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").option("--from-dummy", "Resume from existing test.dummy.spec.ts (after manual fix)").action(async (name, opts) => {
1609
- await runGenerateSetup(name, parseInt(opts.maxRetries, 10), opts.fromDummy ?? false);
2540
+ const generateSetupCommand = new Command("generate-setup").argument("<name>", "Setup name to generate (e.g. login)").description("Clean up, validate, and templatize setup actions").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").option("--from-dummy", "Resume from existing test.dummy.spec.ts (after manual fix)").option("--auto", "Apply auto-fixes without confirmation regardless of confidence (CI use)").option("--no-interactive", "Never prompt; only auto-apply when confidence is high, otherwise give up").option("--language <bcp47>", "Language for diagnose reasoning / hint text (e.g. 'en', 'ja')", "en").action(async (name, opts) => {
2541
+ const mode = resolveMode(opts);
2542
+ await runGenerateSetup(name, parseInt(opts.maxRetries, 10), opts.fromDummy ?? false, mode, opts.language ?? "en");
1610
2543
  });
1611
- async function runGenerateSetup(name, maxRetries, fromDummy) {
2544
+ async function runGenerateSetup(name, maxRetries, fromDummy, mode, outputLanguage) {
1612
2545
  header("generate-setup", name);
1613
2546
  await ensureCcqaDir();
1614
- const spec = parseSetupSpec(await readSetupSpecFile(name));
2547
+ const specContent = await readSetupSpecFile(name);
2548
+ const spec = parseSetupSpec(specContent);
1615
2549
  const dummyPath = join(getSetupDir(name), "test.dummy.spec.ts");
1616
2550
  const finalPath = join(getSetupDir(name), "test.spec.ts");
2551
+ let cleanedActions = [];
1617
2552
  if (fromDummy) {
1618
2553
  if (!await stat(dummyPath).then(() => true).catch(() => false)) {
1619
2554
  warn(`test.dummy.spec.ts not found. Run without --from-dummy first.`);
@@ -1624,40 +2559,52 @@ async function runGenerateSetup(name, maxRetries, fromDummy) {
1624
2559
  const { actions } = await getSetupActions(name);
1625
2560
  meta("setup", spec.title);
1626
2561
  meta("actions", actions.length);
2562
+ meta("fix-mode", mode);
2563
+ meta("language", outputLanguage);
1627
2564
  blank();
1628
- const cleanedActions = await cleanupActions(actions);
2565
+ cleanedActions = await cleanupActions(actions);
1629
2566
  if (cleanedActions.length !== actions.length) meta("cleaned", cleanedActions.length);
1630
2567
  await writeFile(dummyPath, actionsToScript(cleanedActions, spec.title), "utf-8");
1631
2568
  meta("saved", dummyPath);
1632
2569
  }
1633
2570
  blank();
1634
- let { exitCode, output, currentScript } = await runVitest(dummyPath);
1635
- if (exitCode !== 0) {
1636
- for (let attempt = 1; attempt <= maxRetries; attempt++) {
1637
- info(`auto-fix attempt ${attempt}/${maxRetries}...`);
1638
- blank();
1639
- const fixed = await autoFixWithLLM(currentScript, output);
1640
- if (!fixed) {
1641
- warn("could not determine fix from failure log");
1642
- break;
1643
- }
1644
- await writeFile(dummyPath, fixed, "utf-8");
1645
- meta("saved", dummyPath);
1646
- blank();
1647
- ({exitCode, output, currentScript} = await runVitest(dummyPath));
1648
- if (exitCode === 0) break;
1649
- }
1650
- if (exitCode !== 0) {
1651
- warn("auto-fix exhausted — setup test still failing");
2571
+ const agentBrowserSession = `ccqa-generate-setup-${name}-${Date.now()}`;
2572
+ const runVitestForSession = (path) => runVitestResolved(path, agentBrowserSession);
2573
+ await closeSession(agentBrowserSession);
2574
+ const signalHandler = () => {
2575
+ closeSession(agentBrowserSession).finally(() => process.exit(130));
2576
+ };
2577
+ process.once("SIGINT", signalHandler);
2578
+ process.once("SIGTERM", signalHandler);
2579
+ try {
2580
+ const initialRun = await timedPhase("vitest run #1", () => runVitestForSession(dummyPath), "run");
2581
+ let passed = initialRun.exitCode === 0;
2582
+ if (!passed) passed = await runAutoFixLoop({
2583
+ scriptPath: dummyPath,
2584
+ initialRun,
2585
+ specMarkdown: specContent,
2586
+ actions: cleanedActions,
2587
+ maxRetries,
2588
+ mode,
2589
+ runVitest: runVitestForSession,
2590
+ agentBrowserSession,
2591
+ outputLanguage
2592
+ });
2593
+ if (!passed) {
2594
+ warn("auto-fix exhausted; setup test still failing");
1652
2595
  hint(`edit ${dummyPath} manually, then run: ccqa generate-setup ${name} --from-dummy`);
1653
2596
  process.exit(1);
1654
2597
  }
2598
+ await writeFile(finalPath, reversePlaceholdersInScript(await readFile(dummyPath, "utf8"), spec.placeholders), "utf-8");
2599
+ await unlink(dummyPath).catch(() => {});
2600
+ blank();
2601
+ meta("saved", finalPath);
2602
+ hint(`setup '${name}' is ready; reference it in test-spec.md with setups: [{name: ${name}, params: {...}}]`);
2603
+ } finally {
2604
+ process.off("SIGINT", signalHandler);
2605
+ process.off("SIGTERM", signalHandler);
2606
+ await closeSession(agentBrowserSession);
1655
2607
  }
1656
- await writeFile(finalPath, reversePlaceholdersInScript(currentScript, spec.placeholders), "utf-8");
1657
- await unlink(dummyPath).catch(() => {});
1658
- blank();
1659
- meta("saved", finalPath);
1660
- hint(`setup '${name}' is ready — reference it in test-spec.md with setups: [{name: ${name}, params: {...}}]`);
1661
2608
  }
1662
2609
  /**
1663
2610
  * Replace dummy values with {{placeholder}} directly in the test script text.
@@ -1670,51 +2617,54 @@ function reversePlaceholdersInScript(script, placeholders) {
1670
2617
  for (const [key, def] of entries) result = result.replaceAll(def.dummy, `{{${key}}}`);
1671
2618
  return result;
1672
2619
  }
1673
- async function autoFixWithLLM(script, failureLog) {
1674
- try {
1675
- const { result, isError } = await invokeClaudeStreaming({
1676
- prompt: buildAutoFixPrompt(script, failureLog),
1677
- disableBuiltinTools: true,
1678
- maxTurns: 1
1679
- }, () => {});
1680
- if (isError || !result) return null;
1681
- const json = result.trim().replace(/^```(?:json)?\n?([\s\S]*?)\n?```$/, "$1").trim();
1682
- const fixes = JSON.parse(json);
1683
- if (!Array.isArray(fixes) || fixes.length === 0) return null;
1684
- return applySleepFixes(script, fixes);
1685
- } catch {
1686
- return null;
1687
- }
1688
- }
1689
- function applySleepFixes(script, fixes) {
1690
- const lines = script.split("\n");
1691
- for (const fix of fixes) if ("increase_to" in fix) {
1692
- const idx = fix.line - 1;
1693
- if (idx >= 0 && idx < lines.length) lines[idx] = lines[idx].replace(/spawnSync\("sleep",\s*\["\d+"\]/, `spawnSync("sleep", ["${fix.increase_to}"]`);
1694
- }
1695
- const inserts = fixes.filter((f) => "seconds" in f && !("increase_to" in f)).sort((a, b) => b.line - a.line);
1696
- for (const fix of inserts) {
1697
- const idx = fix.line - 1;
1698
- if (idx >= 0 && idx <= lines.length) lines.splice(idx, 0, ` spawnSync("sleep", ["${fix.seconds}"], { stdio: "inherit" });`);
1699
- }
1700
- return lines.join("\n");
1701
- }
1702
- async function runVitest(scriptPath) {
1703
- const { exitCode, stdout, stderr } = await spawnVitestCaptured([
2620
+ async function runVitest(scriptPath, agentBrowserSession) {
2621
+ const { exitCode, stdout, stderr } = await spawnVitestTeed([
1704
2622
  "run",
1705
2623
  "--config",
1706
2624
  bundledVitestConfigPath(),
1707
2625
  scriptPath
1708
- ]);
2626
+ ], agentBrowserSession ? { env: {
2627
+ ...process.env,
2628
+ AGENT_BROWSER_SESSION: agentBrowserSession
2629
+ } } : {});
1709
2630
  const currentScript = await readFile(scriptPath, "utf8");
1710
- process.stdout.write(stdout);
1711
- if (stderr) process.stderr.write(stderr);
1712
2631
  return {
1713
2632
  exitCode,
1714
2633
  output: stdout + stderr,
1715
2634
  currentScript
1716
2635
  };
1717
2636
  }
2637
+ /**
2638
+ * Run vitest on `test.dummy.spec.ts`, but transparently expand any `${VAR}`
2639
+ * env refs to real values for the duration of the run. The original file is
2640
+ * preserved unchanged so subsequent reverse-replace still sees the env-ref
2641
+ * literals. Auto-fix edits the original file (via writeFile in callers), so
2642
+ * we always re-read it before each invocation.
2643
+ */
2644
+ async function runVitestResolved(scriptPath, agentBrowserSession) {
2645
+ const original = await readFile(scriptPath, "utf8");
2646
+ if (!hasEnvRef(original)) return runVitest(scriptPath, agentBrowserSession);
2647
+ const tmpPath = scriptPath.replace(/\.ts$/, ".__resolved.spec.ts");
2648
+ await writeFile(tmpPath, resolveEnvRefs(original), "utf-8");
2649
+ try {
2650
+ const { exitCode, stdout, stderr } = await spawnVitestTeed([
2651
+ "run",
2652
+ "--config",
2653
+ bundledVitestConfigPath(),
2654
+ tmpPath
2655
+ ], agentBrowserSession ? { env: {
2656
+ ...process.env,
2657
+ AGENT_BROWSER_SESSION: agentBrowserSession
2658
+ } } : {});
2659
+ return {
2660
+ exitCode,
2661
+ output: stdout + stderr,
2662
+ currentScript: original
2663
+ };
2664
+ } finally {
2665
+ await unlink(tmpPath).catch(() => {});
2666
+ }
2667
+ }
1718
2668
  async function cleanupActions(actions) {
1719
2669
  try {
1720
2670
  const { result, isError } = await invokeClaudeStreaming({