ccqa 0.8.3 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/bin/ccqa.mjs CHANGED
@@ -6,12 +6,14 @@ import { accessSync, existsSync, readFileSync, statSync } from "node:fs";
6
6
  import { fileURLToPath } from "node:url";
7
7
  import { access, mkdir, mkdtemp, readFile, readdir, rm, stat, writeFile } from "node:fs/promises";
8
8
  import { homedir, tmpdir } from "node:os";
9
- import { delimiter, dirname, join, posix, relative, resolve } from "node:path";
9
+ import { delimiter, dirname, isAbsolute, join, posix, relative, resolve } from "node:path";
10
10
  import { parse, stringify } from "yaml";
11
11
  import { ZodError, z } from "zod";
12
12
  import { execFile, spawn, spawnSync } from "node:child_process";
13
13
  import { query } from "@anthropic-ai/claude-agent-sdk";
14
+ import { AsyncLocalStorage } from "node:async_hooks";
14
15
  import { promisify } from "node:util";
16
+ import { randomUUID } from "node:crypto";
15
17
  import { createInterface } from "node:readline";
16
18
  import { createInterface as createInterface$1 } from "node:readline/promises";
17
19
  //#region src/runtime/env-vars.ts
@@ -139,6 +141,7 @@ const TestSpecSchema = z.object({
139
141
  title: z.string().min(1),
140
142
  relatedPaths: z.array(z.string().min(1)).optional(),
141
143
  mode: SpecModeSchema.optional(),
144
+ statePath: z.string().min(1).optional(),
142
145
  steps: z.array(StepSchema).min(1)
143
146
  }).strict();
144
147
  /** Default mode when `mode:` is absent. */
@@ -480,50 +483,62 @@ async function loadAvailableBlocks(cwd) {
480
483
  }))
481
484
  }));
482
485
  }
483
- const TRACE_USER_PROMPT_PATH = ".ccqa/prompts/trace.user.md";
484
- const RUN_ND_USER_PROMPT_PATH = ".ccqa/prompts/run-nd.user.md";
486
+ const RECORD_USER_PROMPT_PATH = ".ccqa/prompts/record.user.md";
487
+ const RECORD_AGENT_PROMPT_PATH = ".ccqa/prompts/record.agent.md";
488
+ const LIVE_USER_PROMPT_PATH = ".ccqa/prompts/live.user.md";
489
+ const LIVE_AGENT_PROMPT_PATH = ".ccqa/prompts/live.agent.md";
485
490
  const USER_PROMPT_MAX_BYTES = 32768;
486
491
  /**
487
- * Load project-specific guidance to append to the trace system prompt.
492
+ * Load the prompt bundle appended to the `ccqa record` (trace) system prompt.
488
493
  *
489
- * Returns the file's contents (trimmed) when `.ccqa/prompts/trace.user.md`
490
- * exists and is non-empty. Missing file, empty file, or read error all
491
- * resolve to `null` so callers can treat the override as strictly optional.
494
+ * Reads `.ccqa/prompts/record.user.md` (human-maintained, stable project
495
+ * rules) and `.ccqa/prompts/record.agent.md` (auto-rewritten by
496
+ * `ccqa record --update-agent-prompt`). Returns null when both files are
497
+ * missing / empty. The combined text is capped at 32,768 characters (not bytes) after concatenation.
492
498
  *
493
- * The file is meant for organisation-specific rules that don't belong in
494
- * the OSS-default prompt — naming conventions, staging URL hints, repeated
495
- * UI quirks that recur across specs. Anything that genuinely belongs in
496
- * one spec should go in that spec's instruction, not here.
497
- *
498
- * Size-capped at 32 KiB to keep accidental commits of huge files from
499
- * blowing up the system prompt; the cap is observable to callers as a
500
- * truncated warning suffix.
499
+ * Use `ccqa init` to scaffold both files.
501
500
  */
502
- async function loadTraceUserPrompt(cwd) {
503
- return loadUserPromptFile(TRACE_USER_PROMPT_PATH, cwd, "trace.user.md");
501
+ async function loadRecordPromptBundle(cwd) {
502
+ return loadPromptBundle(RECORD_USER_PROMPT_PATH, RECORD_AGENT_PROMPT_PATH, cwd);
504
503
  }
505
504
  /**
506
- * Load project-specific guidance to append to the `ccqa run-nd` system prompt.
505
+ * Load the prompt bundle appended to the `ccqa run` (live mode) system prompt.
507
506
  *
508
- * Same shape as `loadTraceUserPrompt`, but reads from
509
- * `.ccqa/prompts/run-nd.user.md`. The non-deterministic test mode delegates
510
- * every step to Claude live, so anything that helps Claude do that job for a
511
- * particular product — domain glossary, staging URL conventions, known
512
- * "this is fine" warnings, login flow quirks — belongs here. Keeping it in the
507
+ * Reads `.ccqa/prompts/live.user.md` (human-maintained, stable project
508
+ * rules) and `.ccqa/prompts/live.agent.md` (auto-rewritten by
509
+ * `ccqa run --update-agent-prompt`). Same null / cap semantics as
510
+ * `loadRecordPromptBundle`. Keeping product-specific context in the
513
511
  * consuming repo (not the ccqa OSS prompt) is the explicit non-contamination
514
- * boundary: ccqa stays product-agnostic, projects can layer in whatever
515
- * context they need.
512
+ * boundary.
516
513
  */
517
- async function loadRunNdUserPrompt(cwd) {
518
- return loadUserPromptFile(RUN_ND_USER_PROMPT_PATH, cwd, "run-nd.user.md");
514
+ async function loadLivePromptBundle(cwd) {
515
+ return loadPromptBundle(LIVE_USER_PROMPT_PATH, LIVE_AGENT_PROMPT_PATH, cwd);
519
516
  }
520
- async function loadUserPromptFile(relPath, cwd, labelForTruncation) {
517
+ async function loadPromptBundle(userRelPath, agentRelPath, cwd) {
518
+ const [userText, agentText] = await Promise.all([readPromptFile(userRelPath, cwd), readPromptFile(agentRelPath, cwd)]);
519
+ if (userText === null && agentText === null) return null;
520
+ const sections = [];
521
+ const loaded = [];
522
+ if (userText !== null) {
523
+ sections.push(`### Project guidance (human-maintained)\n\n${userText}`);
524
+ loaded.push(userRelPath);
525
+ }
526
+ if (agentText !== null) {
527
+ sections.push(`### Agent learnings (auto-updated by ccqa --update-agent-prompt)\n\n${agentText}`);
528
+ loaded.push(agentRelPath);
529
+ }
530
+ let text = sections.join("\n\n");
531
+ if (text.length > USER_PROMPT_MAX_BYTES) text = text.slice(0, USER_PROMPT_MAX_BYTES) + `\n\n[ccqa] (prompt bundle truncated at ${USER_PROMPT_MAX_BYTES} bytes)`;
532
+ return {
533
+ text,
534
+ loaded
535
+ };
536
+ }
537
+ async function readPromptFile(relPath, cwd) {
521
538
  const content = await readFile(join(cwd ?? process.cwd(), relPath), "utf-8").catch(() => null);
522
539
  if (content === null) return null;
523
540
  const trimmed = content.trim();
524
- if (trimmed.length === 0) return null;
525
- if (trimmed.length > USER_PROMPT_MAX_BYTES) return trimmed.slice(0, USER_PROMPT_MAX_BYTES) + `\n\n[ccqa] (${labelForTruncation} truncated at ${USER_PROMPT_MAX_BYTES} bytes)`;
526
- return trimmed;
541
+ return trimmed.length === 0 ? null : trimmed;
527
542
  }
528
543
  /**
529
544
  * Probe for orphaned files left over from earlier ccqa versions inside
@@ -567,9 +582,9 @@ async function getTestScript(featureName, specName, cwd) {
567
582
  }
568
583
  /**
569
584
  * Variant of `listAllSpecs` for callers that care about the spec definition
570
- * itself (spec.yaml) rather than its compiled vitest script. `ccqa run-nd`
571
- * uses this because it skips codegen entirely — a freshly drafted spec with
572
- * no `test.spec.ts` is still a valid target.
585
+ * itself (spec.yaml) rather than its compiled vitest script. `ccqa run` uses
586
+ * this for live-mode specs because they skip codegen entirely — a freshly
587
+ * drafted spec with no `test.spec.ts` is still a valid target.
573
588
  */
574
589
  async function listAllSpecsWithSpecFile(cwd) {
575
590
  return listAllSpecsFilteredBy(SPEC_FILE, cwd);
@@ -589,10 +604,10 @@ async function listAllSpecsFilteredBy(requiredFilename, cwd) {
589
604
  }))).flat();
590
605
  }
591
606
  /**
592
- * Resolve a CLI `<target>` argument into a list of spec refs. Shared between
593
- * `ccqa run` and `ccqa run-nd`. Callers pass the right enumerator for "no
594
- * target" (run wants `test.spec.ts`-having specs; run-nd wants `spec.yaml`-
595
- * having specs).
607
+ * Resolve a CLI `<target>` argument into a list of spec refs. Used by
608
+ * `ccqa run`. Callers pass the right enumerator for "no target" (deterministic
609
+ * runs want specs that have a `test.spec.ts`; live runs want specs that have a
610
+ * `spec.yaml`).
596
611
  */
597
612
  async function resolveSpecTargets(target, enumerateAll, cwd) {
598
613
  if (!target) return enumerateAll();
@@ -745,6 +760,27 @@ function waitExit(child) {
745
760
  });
746
761
  }
747
762
  //#endregion
763
+ //#region src/runtime/pool.ts
764
+ /**
765
+ * Run each item through `fn` with at most `concurrency` running at once.
766
+ * Results preserve input order. A throwing `fn` rejects the whole pool
767
+ * (callers that want per-item isolation should catch inside `fn`).
768
+ */
769
+ async function runPool(items, concurrency, fn) {
770
+ const results = new Array(items.length);
771
+ let cursor = 0;
772
+ const worker = async () => {
773
+ while (true) {
774
+ const idx = cursor++;
775
+ if (idx >= items.length) return;
776
+ results[idx] = await fn(items[idx], idx);
777
+ }
778
+ };
779
+ const n = Math.max(1, Math.min(concurrency, items.length));
780
+ await Promise.all(Array.from({ length: n }, () => worker()));
781
+ return results;
782
+ }
783
+ //#endregion
748
784
  //#region src/claude/extract-json.ts
749
785
  /**
750
786
  * Pulls a JSON object out of a Claude completion. Accepts either a fenced
@@ -767,26 +803,70 @@ const STEP_ICONS = {
767
803
  STEP_SKIPPED: "⊘",
768
804
  RUN_COMPLETED: "■"
769
805
  };
806
+ /**
807
+ * When a `withBuffer` scope is active, every log line (stdout and stderr) is
808
+ * appended to its buffer instead of being written immediately. Parallel spec
809
+ * runs use this so each spec's narration — including logs emitted deep inside
810
+ * the live executor — flushes as one contiguous block, not interleaved.
811
+ */
812
+ const bufferStore = new AsyncLocalStorage();
813
+ /** True while inside a `withBuffer` scope: progress lines avoid TTY cursor tricks. */
814
+ function isBuffered() {
815
+ return bufferStore.getStore() !== void 0;
816
+ }
817
+ function emit(text, sink = process.stdout) {
818
+ const store = bufferStore.getStore();
819
+ if (store) {
820
+ store.out.push(text);
821
+ return;
822
+ }
823
+ sink.write(text);
824
+ }
825
+ /**
826
+ * Write raw text to the active `withBuffer` scope, or straight to stdout when
827
+ * none is active. Lets a runner redirect sub-process output (e.g. a child's
828
+ * stdout) into the same buffer as its `log.*` lines so they flush together.
829
+ */
830
+ function emitRaw(text) {
831
+ emit(text);
832
+ }
833
+ /**
834
+ * Run `fn` with all its log output captured into a buffer, then flush the
835
+ * buffer in one shot under `label`. Used by parallel runners to keep each
836
+ * spec's output legible. Output is flushed even when `fn` throws.
837
+ *
838
+ * When `buffered` is false, `fn` runs with no buffer so its output streams
839
+ * live — this is the sequential (concurrency 1) path, unchanged from before.
840
+ */
841
+ async function withBuffer(label, buffered, fn) {
842
+ if (!buffered) return fn();
843
+ const store = { out: [] };
844
+ try {
845
+ return await bufferStore.run(store, fn);
846
+ } finally {
847
+ process.stdout.write(`\n──── ${label} ────\n${store.out.join("")}`);
848
+ }
849
+ }
770
850
  function header(command, target) {
771
- process.stdout.write(`\nccqa ${command}${target ? ` ${target}` : ""}\n\n`);
851
+ emit(`\nccqa ${command}${target ? ` ${target}` : ""}\n\n`);
772
852
  }
773
853
  function write(scope, message, sink = process.stdout) {
774
- sink.write(`[${scope}] ${message}\n`);
854
+ emit(`[${scope}] ${message}\n`, sink);
775
855
  }
776
856
  function meta(key, value) {
777
857
  write("meta", `${key}: ${value}`);
778
858
  }
779
859
  function blank() {
780
- process.stdout.write("\n");
860
+ emit("\n");
781
861
  }
782
862
  function info(message) {
783
863
  write("info", message);
784
864
  }
785
865
  function step(type, stepId, detail) {
786
- process.stdout.write(` ${STEP_ICONS[type]} [${stepId}] ${detail}\n`);
866
+ emit(` ${STEP_ICONS[type]} [${stepId}] ${detail}\n`);
787
867
  }
788
868
  function bash(command) {
789
- process.stdout.write(` $ ${command.slice(0, 120)}\n`);
869
+ emit(` $ ${command.slice(0, 120)}\n`);
790
870
  }
791
871
  function error(message) {
792
872
  write("error", message, process.stderr);
@@ -795,7 +875,7 @@ function warn(message) {
795
875
  write("warn", message, process.stderr);
796
876
  }
797
877
  function hint(message) {
798
- process.stdout.write("\n");
878
+ emit("\n");
799
879
  write("hint", message);
800
880
  }
801
881
  function fix(message) {
@@ -820,17 +900,17 @@ const PROGRESS_NONTTY_STRIDE = 5;
820
900
  let lastProgressNonTtyEmit = -1;
821
901
  function progress(current, total, label) {
822
902
  const text = `[info] ${current + 1}/${total} ${label}`;
823
- if (process.stdout.isTTY) {
903
+ if (process.stdout.isTTY && !isBuffered()) {
824
904
  process.stdout.write(`\r${text}\x1b[K`);
825
905
  return;
826
906
  }
827
907
  if (current === 0 || current - lastProgressNonTtyEmit >= PROGRESS_NONTTY_STRIDE) {
828
- process.stdout.write(`${text}\n`);
908
+ emit(`${text}\n`);
829
909
  lastProgressNonTtyEmit = current;
830
910
  }
831
911
  }
832
912
  function progressEnd() {
833
- if (process.stdout.isTTY) process.stdout.write(`\r\x1b[K`);
913
+ if (process.stdout.isTTY && !isBuffered()) process.stdout.write(`\r\x1b[K`);
834
914
  lastProgressNonTtyEmit = -1;
835
915
  }
836
916
  /**
@@ -1351,6 +1431,12 @@ function extractAbActionFromBashCommand(cmd) {
1351
1431
  case "type":
1352
1432
  case "select": return `AB_ACTION|${subCmd}|${args[0] ?? ""}|${args[1] ?? ""}|${args[2] ?? ""}`;
1353
1433
  case "drag": return `AB_ACTION|drag|${args[0] ?? ""}|${args[1] ?? ""}|${args[2] ?? ""}`;
1434
+ case "upload": {
1435
+ const sel = args[0] ?? "";
1436
+ const files = args.slice(1);
1437
+ if (!sel || files.length === 0) return null;
1438
+ return `AB_ACTION|upload|${sel}|${files.join("|")}`;
1439
+ }
1354
1440
  case "snapshot": return null;
1355
1441
  case "find": return extractFindAbAction(args);
1356
1442
  default: return null;
@@ -1688,25 +1774,15 @@ const DEFAULT_CONCURRENCY$1 = 3;
1688
1774
  */
1689
1775
  async function analyzeDrift(input) {
1690
1776
  const { targets, cwd, blocks, concurrency = DEFAULT_CONCURRENCY$1, model, language, onSpecStart } = input;
1691
- const results = new Array(targets.length);
1692
- let cursor = 0;
1693
- const worker = async () => {
1694
- while (true) {
1695
- const idx = cursor++;
1696
- if (idx >= targets.length) return;
1697
- const target = targets[idx];
1698
- onSpecStart?.(target);
1699
- results[idx] = await checkSpec(target, {
1700
- cwd,
1701
- blocks,
1702
- model,
1703
- language
1704
- });
1705
- }
1706
- };
1707
- const pool = Array.from({ length: Math.min(concurrency, targets.length) }, () => worker());
1708
- await Promise.all(pool);
1709
- return results;
1777
+ return runPool(targets, concurrency, async (target) => {
1778
+ onSpecStart?.(target);
1779
+ return checkSpec(target, {
1780
+ cwd,
1781
+ blocks,
1782
+ model,
1783
+ language
1784
+ });
1785
+ });
1710
1786
  }
1711
1787
  async function checkSpec(target, opts) {
1712
1788
  const { featureName, specName } = target;
@@ -2324,7 +2400,7 @@ function clamp(n, lo, hi) {
2324
2400
  //#endregion
2325
2401
  //#region src/report/prompt.ts
2326
2402
  function buildFailureAnalysisPrompt(input) {
2327
- const { script, specYaml, failureLog, ndTranscriptExcerpt, diffPatch, changedFiles, baseRef, driftIssues, outputLanguage = "auto" } = input;
2403
+ const { script, specYaml, failureLog, liveTranscriptExcerpt, diffPatch, changedFiles, baseRef, driftIssues, outputLanguage = "auto" } = input;
2328
2404
  return `You are analyzing a failing E2E regression test right after a source change landed. Your job is a root-cause CALL, not a fix: decide which of three categories explains the failure, using the source diff as your primary context.
2329
2405
 
2330
2406
  ${outputLanguageBlock(outputLanguage, "`reasoning`, `detail`", "label names (TEST_DRIFT, etc.)")}## The three categories
@@ -2396,7 +2472,7 @@ Evidence rules: TEST_DRIFT and SPEC_CHANGE require at least one concrete \`file\
2396
2472
  ## Test Spec (spec.yaml)
2397
2473
  ${specYaml}
2398
2474
 
2399
- ${buildExecutionEvidenceBlock(script, failureLog, ndTranscriptExcerpt)}
2475
+ ${buildExecutionEvidenceBlock(script, failureLog, liveTranscriptExcerpt)}
2400
2476
 
2401
2477
  ${diffPatch ? `## Source changes since ${baseRef ?? "base"} (git diff, may be truncated)
2402
2478
 
@@ -2432,14 +2508,14 @@ ${driftIssues.map((i) => `- [${i.severity}] (${DRAFT_CATEGORY_LABEL[i.category]}
2432
2508
  * never has to branch on mode — it just sees "here's what was executed
2433
2509
  * and here's how it failed".
2434
2510
  */
2435
- function buildExecutionEvidenceBlock(script, failureLog, ndTranscriptExcerpt) {
2511
+ function buildExecutionEvidenceBlock(script, failureLog, liveTranscriptExcerpt) {
2436
2512
  const sections = [];
2437
2513
  if (script && script.length > 0) sections.push(`## Test Script (with line numbers)
2438
2514
  ${numberLines(script)}`);
2439
2515
  if (failureLog && failureLog.length > 0) sections.push(`## Failure Log
2440
2516
  ${failureLog.slice(0, 8e3)}`);
2441
- if (ndTranscriptExcerpt && ndTranscriptExcerpt.length > 0) sections.push(`## Live Run Transcript (summary of Claude's per-step execution)
2442
- ${ndTranscriptExcerpt}`);
2517
+ if (liveTranscriptExcerpt && liveTranscriptExcerpt.length > 0) sections.push(`## Live Run Transcript (summary of Claude's per-step execution)
2518
+ ${liveTranscriptExcerpt}`);
2443
2519
  if (sections.length === 0) return `## Execution evidence
2444
2520
 
2445
2521
  (No script, failure log, or live transcript was captured for this run. Classify from spec.yaml + diff only, and be correspondingly more conservative — prefer UNKNOWN over a confident call.)`;
@@ -2535,11 +2611,11 @@ const ReportEvidenceSchema = z.object({
2535
2611
  failureSummary: z.string().nullable().default(null)
2536
2612
  });
2537
2613
  /**
2538
- * Per-step row for a non-deterministic run (`ccqa run-nd`). Mirrors the
2539
- * structure produced by `src/runtime/nd-executor.ts:NdStepResult` but
2614
+ * Per-step row for a live-mode run (spec.yaml `mode: live`). Mirrors the
2615
+ * structure produced by `src/runtime/live-executor.ts:LiveStepResult` but
2540
2616
  * encoded against the report schema so the HTML renderer can carry both
2541
- * deterministic (`evidence`) and non-deterministic (`ndRun`) sources of
2542
- * step-boundary screenshots.
2617
+ * deterministic (`evidence`) and live (`liveRun`) sources of step-boundary
2618
+ * screenshots.
2543
2619
  *
2544
2620
  * `beforePng` / `afterPng` are RELATIVE to the HTML report directory — the
2545
2621
  * caller computes the relative path with `node:path`'s `relative()` so the
@@ -2554,7 +2630,7 @@ const ReportEvidenceSchema = z.object({
2554
2630
  * `models` is the union of model ids the SDK reported using; usually a
2555
2631
  * single element, but the SDK can fan out across models in some modes.
2556
2632
  */
2557
- const NdReportCostSchema = z.object({
2633
+ const LiveReportCostSchema = z.object({
2558
2634
  totalCostUsd: z.number().nullable(),
2559
2635
  durationApiMs: z.number().nullable(),
2560
2636
  numTurns: z.number().nullable(),
@@ -2564,7 +2640,7 @@ const NdReportCostSchema = z.object({
2564
2640
  outputTokens: z.number().nullable(),
2565
2641
  models: z.array(z.string())
2566
2642
  });
2567
- const NdReportStepSchema = z.object({
2643
+ const LiveReportStepSchema = z.object({
2568
2644
  stepId: z.string(),
2569
2645
  source: z.string(),
2570
2646
  instruction: z.string(),
@@ -2578,15 +2654,15 @@ const NdReportStepSchema = z.object({
2578
2654
  beforePng: z.string().nullable(),
2579
2655
  afterPng: z.string().nullable(),
2580
2656
  durationMs: z.number(),
2581
- cost: NdReportCostSchema
2657
+ cost: LiveReportCostSchema
2582
2658
  });
2583
- const NdReportRunSchema = z.object({
2659
+ const LiveReportRunSchema = z.object({
2584
2660
  runId: z.string(),
2585
2661
  sessionName: z.string(),
2586
2662
  startedAt: z.string(),
2587
2663
  durationMs: z.number(),
2588
- steps: z.array(NdReportStepSchema),
2589
- cost: NdReportCostSchema
2664
+ steps: z.array(LiveReportStepSchema),
2665
+ cost: LiveReportCostSchema
2590
2666
  });
2591
2667
  const ReportSpecResultSchema = z.object({
2592
2668
  feature: z.string(),
@@ -2607,7 +2683,7 @@ const ReportSpecResultSchema = z.object({
2607
2683
  diffExcerpt: z.string().nullable(),
2608
2684
  specYaml: z.string().nullable(),
2609
2685
  evidence: z.array(ReportEvidenceSchema).nullable(),
2610
- ndRun: NdReportRunSchema.nullable()
2686
+ liveRun: LiveReportRunSchema.nullable()
2611
2687
  });
2612
2688
  z.object({
2613
2689
  schemaVersion: z.literal(1),
@@ -2846,7 +2922,7 @@ function scopePatchForSpec(patch, relatedPaths, caps = {}) {
2846
2922
  return parts.join("\n");
2847
2923
  }
2848
2924
  //#endregion
2849
- //#region src/runtime/nd-cost-format.ts
2925
+ //#region src/runtime/live-cost-format.ts
2850
2926
  /**
2851
2927
  * Compact one-line cost summary. Format:
2852
2928
  * "$0.1234 · 4 turns · 42 in / 6,511 out · 2.0M cached · sonnet"
@@ -2856,7 +2932,7 @@ function scopePatchForSpec(patch, relatedPaths, caps = {}) {
2856
2932
  * `model=...` segment. `compact: true` (HTML chip) thousand-separates fresh
2857
2933
  * tokens, abbreviates cache-read with K/M, drops the `model=` prefix.
2858
2934
  */
2859
- function formatNdCost(cost, options) {
2935
+ function formatLiveCost(cost, options) {
2860
2936
  if (cost.totalCostUsd === null) return null;
2861
2937
  const compact = options.compact;
2862
2938
  const sep = compact ? " · " : " / ";
@@ -2875,7 +2951,7 @@ function formatNdCost(cost, options) {
2875
2951
  * Sum of per-spec costs for a batch. Used only by the CLI batch summary.
2876
2952
  * Returns null when no spec has cost data.
2877
2953
  */
2878
- function formatNdBatchCost(costs) {
2954
+ function formatLiveBatchCost(costs) {
2879
2955
  let totalUsd = 0;
2880
2956
  let seen = false;
2881
2957
  let totalIn = 0;
@@ -3176,7 +3252,7 @@ function renderResult(r, index, s) {
3176
3252
  const heading = r.title ? `<span class="spec-title">${esc(r.title)}</span><span class="spec-slug">(${esc(id)})</span>` : `<span class="spec-title">${esc(id)}</span>`;
3177
3253
  const predictionLine = r.status === "failed" && r.analysis ? `<span class="label-text label-${r.analysis.label}">${esc(displayLabel(r.analysis.label, s))} · ${Math.round(r.analysis.confidence * 100)}%</span>` : "";
3178
3254
  const needsGradingDot = r.status === "failed" && r.analysis ? `<span class="needs-grading-dot" data-case-id="${esc(id)}" title="${esc(s.needsGrading)}"></span>` : "";
3179
- const modeTag = r.ndRun ? `<span class="mode-tag" title="executed in live mode (Claude drove the browser per step)">LIVE</span>` : `<span class="mode-tag" title="executed in deterministic mode (vitest replayed test.spec.ts)">DETERMINISTIC</span>`;
3255
+ const modeTag = r.liveRun ? `<span class="mode-tag" title="executed in live mode (Claude drove the browser per step)">LIVE</span>` : `<span class="mode-tag" title="executed in deterministic mode (vitest replayed test.spec.ts)">DETERMINISTIC</span>`;
3180
3256
  return `<details class="spec ${r.status}" data-status="${r.status}" data-case-id="${esc(id)}"${r.status === "failed" ? " open" : ""}>
3181
3257
  <summary>
3182
3258
  ${statusIcon(r.status)}
@@ -3189,7 +3265,7 @@ function renderResult(r, index, s) {
3189
3265
  </summary>
3190
3266
  <div class="spec-body">
3191
3267
  ${renderEvidence(r, s)}
3192
- ${r.ndRun ? renderNdRun(r.ndRun, s) : ""}
3268
+ ${r.liveRun ? renderLiveRun(r.liveRun, s) : ""}
3193
3269
  ${renderSpecBody(r, index, s)}
3194
3270
  ${collapsible(s.collSpecYaml, s.collSpecYamlHelp, r.specYaml)}
3195
3271
  </div>
@@ -3200,16 +3276,16 @@ function renderSpecBody(r, index, s) {
3200
3276
  if (r.analysis) return renderAnalysis(r, index, s);
3201
3277
  return renderSkippedWithSupporting(r, s);
3202
3278
  }
3203
- function renderNdRun(nd, strings) {
3204
- const stepItems = nd.steps.map((s) => {
3279
+ function renderLiveRun(live, strings) {
3280
+ const stepItems = live.steps.map((s) => {
3205
3281
  const before = s.beforePng ? `<a class="shot" href="${esc(s.beforePng)}" target="_blank" rel="noopener"><img src="${esc(s.beforePng)}" alt="before ${esc(s.stepId)}" loading="lazy"><span>before</span></a>` : "";
3206
3282
  const after = s.afterPng ? `<a class="shot" href="${esc(s.afterPng)}" target="_blank" rel="noopener"><img src="${esc(s.afterPng)}" alt="after ${esc(s.stepId)}" loading="lazy"><span>after</span></a>` : "";
3207
3283
  const dur = s.durationMs > 0 ? `<span class="duration">${formatDuration$1(s.durationMs)}</span>` : "";
3208
- const stepCost = formatNdCostChip(s.cost);
3284
+ const stepCost = formatLiveCostChip(s.cost);
3209
3285
  const stepModel = formatModelChip(s.cost.models);
3210
- const sourceBadge = s.source && s.source !== "spec" ? `<span class="nd-source">[${esc(s.source)}]</span>` : "";
3211
- return `<li class="nd-step ${s.status}">
3212
- <div class="nd-step-head">
3286
+ const sourceBadge = s.source && s.source !== "spec" ? `<span class="live-source">[${esc(s.source)}]</span>` : "";
3287
+ return `<li class="live-step ${s.status}">
3288
+ <div class="live-step-head">
3213
3289
  ${statusIcon(s.status)}
3214
3290
  <span class="step-name">${esc(s.stepId)}</span>
3215
3291
  ${sourceBadge}
@@ -3218,44 +3294,44 @@ function renderNdRun(nd, strings) {
3218
3294
  ${stepCost}
3219
3295
  ${dur}
3220
3296
  </div>
3221
- <div class="nd-step-body">
3222
- <p class="nd-instr"><strong>${esc(strings.stepDoLabel)}:</strong> ${esc(s.instruction)}</p>
3223
- <p class="nd-instr"><strong>${esc(strings.stepExpectLabel)}:</strong> ${esc(s.expected)}</p>
3224
- ${s.reasoning ? `<p class="nd-reasoning">${esc(s.reasoning)}</p>` : ""}
3225
- ${before || after ? `<div class="nd-shots">${before}${after}</div>` : ""}
3297
+ <div class="live-step-body">
3298
+ <p class="live-instr"><strong>${esc(strings.stepDoLabel)}:</strong> ${esc(s.instruction)}</p>
3299
+ <p class="live-instr"><strong>${esc(strings.stepExpectLabel)}:</strong> ${esc(s.expected)}</p>
3300
+ ${s.reasoning ? `<p class="live-reasoning">${esc(s.reasoning)}</p>` : ""}
3301
+ ${before || after ? `<div class="live-shots">${before}${after}</div>` : ""}
3226
3302
  </div>
3227
3303
  </li>`;
3228
3304
  }).join("\n");
3229
- const runCost = formatNdCostChip(nd.cost);
3230
- const runModel = formatModelChip(nd.cost.models);
3231
- return `<section class="nd-run">
3232
- <details class="nd-run-meta">
3305
+ const runCost = formatLiveCostChip(live.cost);
3306
+ const runModel = formatModelChip(live.cost.models);
3307
+ return `<section class="live-run">
3308
+ <details class="live-run-meta">
3233
3309
  <summary>${labelWithHelp(esc(strings.collLiveRunMeta), strings.collLiveRunMetaHelp)}</summary>
3234
- <div class="nd-run-meta-body">
3310
+ <div class="live-run-meta-body">
3235
3311
  <span class="dim">${esc(strings.liveRunIdLabel)}</span>
3236
- <code>${esc(nd.runId)}</code>
3312
+ <code>${esc(live.runId)}</code>
3237
3313
  <span class="dim">${esc(strings.liveSessionLabel)}</span>
3238
- <code>${esc(nd.sessionName)}</code>
3314
+ <code>${esc(live.sessionName)}</code>
3239
3315
  ${runModel}
3240
3316
  ${runCost}
3241
- <span class="duration">${formatDuration$1(nd.durationMs)}</span>
3317
+ <span class="duration">${formatDuration$1(live.durationMs)}</span>
3242
3318
  </div>
3243
3319
  </details>
3244
- <ol class="nd-steps">${stepItems}</ol>
3320
+ <ol class="live-steps">${stepItems}</ol>
3245
3321
  </section>`;
3246
3322
  }
3247
3323
  /** Compact dot-separated cost chip, e.g. "$0.1234 · 4 turns · 42 in / 6,511 out · 2.0M cached". */
3248
- function formatNdCostChip(cost) {
3249
- const line = formatNdCost(cost, { compact: true });
3324
+ function formatLiveCostChip(cost) {
3325
+ const line = formatLiveCost(cost, { compact: true });
3250
3326
  if (line === null) return "";
3251
- return `<span class="nd-cost" title="cost · turns · fresh-input/output tokens · cache-read input">${esc(line)}</span>`;
3327
+ return `<span class="live-cost" title="cost · turns · fresh-input/output tokens · cache-read input">${esc(line)}</span>`;
3252
3328
  }
3253
3329
  function formatModelChip(models) {
3254
3330
  if (!models || models.length === 0) return "";
3255
- return `<span class="nd-model" title="Claude model id(s) reported by the SDK">${esc(models.join(", "))}</span>`;
3331
+ return `<span class="live-model" title="Claude model id(s) reported by the SDK">${esc(models.join(", "))}</span>`;
3256
3332
  }
3257
3333
  /**
3258
- * Per-step UI for deterministic runs. Adopts the same `nd-step` card layout
3334
+ * Per-step UI for deterministic runs. Adopts the same `live-step` card layout
3259
3335
  * used by live runs so reviewers don't have to context-switch between two
3260
3336
  * visual idioms. We map the evidence entries (which are already keyed by
3261
3337
  * stepId) onto the same shape, leaving live-only fields (before png, cost,
@@ -3263,14 +3339,14 @@ function formatModelChip(models) {
3263
3339
  */
3264
3340
  function renderEvidence(r, s) {
3265
3341
  if (!r.evidence || r.evidence.length === 0) return "";
3266
- return `<section class="nd-run">
3267
- <ol class="nd-steps">${r.evidence.map((e) => renderDetStepCard(e, s)).join("\n")}</ol>
3342
+ return `<section class="live-run">
3343
+ <ol class="live-steps">${r.evidence.map((e) => renderDetStepCard(e, s)).join("\n")}</ol>
3268
3344
  </section>`;
3269
3345
  }
3270
3346
  function renderDetStepCard(e, s) {
3271
3347
  const status = e.status === "failed" ? "failed" : "passed";
3272
- const description = e.description ? `<p class="nd-instr"><strong>${esc(s.stepExpectLabel)}:</strong> ${esc(e.description)}</p>` : "";
3273
- const failureBlock = e.status === "failed" && e.failureSummary ? `<p class="nd-reasoning">${esc(e.failureSummary)}</p>` : "";
3348
+ const description = e.description ? `<p class="live-instr"><strong>${esc(s.stepExpectLabel)}:</strong> ${esc(e.description)}</p>` : "";
3349
+ const failureBlock = e.status === "failed" && e.failureSummary ? `<p class="live-reasoning">${esc(e.failureSummary)}</p>` : "";
3274
3350
  const metaRows = [];
3275
3351
  if (e.url) {
3276
3352
  const shortUrl = shortenUrl(e.url);
@@ -3279,16 +3355,16 @@ function renderDetStepCard(e, s) {
3279
3355
  if (e.title) metaRows.push(`<div class="evidence-meta-row"><span class="evidence-meta-label">${esc(s.metaPage)}</span><span class="evidence-meta-value">${esc(e.title)}</span></div>`);
3280
3356
  const meta = metaRows.length > 0 ? `<div class="evidence-meta">${metaRows.join("")}</div>` : "";
3281
3357
  const after = `<a class="shot" href="${esc(e.pngPath)}" target="_blank" rel="noopener"><img src="${esc(e.pngPath)}" alt="${esc(e.stepId)}" loading="lazy"><span>after</span></a>`;
3282
- return `<li class="nd-step ${status}">
3283
- <div class="nd-step-head">
3358
+ return `<li class="live-step ${status}">
3359
+ <div class="live-step-head">
3284
3360
  ${statusIcon(status)}
3285
3361
  <span class="step-name">${esc(e.stepId)}</span>
3286
3362
  <span class="spacer"></span>
3287
3363
  </div>
3288
- <div class="nd-step-body">
3364
+ <div class="live-step-body">
3289
3365
  ${description}
3290
3366
  ${failureBlock}
3291
- <div class="nd-shots">${after}</div>
3367
+ <div class="live-shots">${after}</div>
3292
3368
  ${meta}
3293
3369
  </div>
3294
3370
  </li>`;
@@ -3726,54 +3802,54 @@ table.matrix td.miss-nonzero { background: var(--fail-bg); }
3726
3802
 
3727
3803
  /* Per-step block: indented + a thin rail under the test title so the
3728
3804
  hierarchy spec → test → step is visible. */
3729
- .nd-run {
3805
+ .live-run {
3730
3806
  padding: 0 0 0 14px;
3731
3807
  margin-left: 6px;
3732
3808
  border-left: 1px solid var(--border-soft);
3733
3809
  }
3734
- .nd-run-meta { margin: 0 0 8px; font-size: 11.5px; }
3735
- .nd-run-meta > summary {
3810
+ .live-run-meta { margin: 0 0 8px; font-size: 11.5px; }
3811
+ .live-run-meta > summary {
3736
3812
  cursor: pointer; color: var(--text-mute); list-style: none;
3737
3813
  padding: 4px 0;
3738
3814
  }
3739
- .nd-run-meta > summary::-webkit-details-marker { display: none; }
3740
- .nd-run-meta > summary::before {
3815
+ .live-run-meta > summary::-webkit-details-marker { display: none; }
3816
+ .live-run-meta > summary::before {
3741
3817
  content: "▸"; color: var(--text-dim); font-size: 10px;
3742
3818
  margin-right: 6px; transition: transform 0.12s ease;
3743
3819
  display: inline-block;
3744
3820
  }
3745
- .nd-run-meta[open] > summary::before { transform: rotate(90deg); }
3746
- .nd-run-meta-body {
3821
+ .live-run-meta[open] > summary::before { transform: rotate(90deg); }
3822
+ .live-run-meta-body {
3747
3823
  display: flex; gap: 12px; align-items: baseline; flex-wrap: wrap;
3748
3824
  color: var(--text-mute); padding: 6px 0 8px 16px;
3749
3825
  }
3750
- .nd-run-meta-body code { background: transparent; padding: 0; font-size: 11.5px; color: var(--text-dim); }
3751
- .nd-run-meta-body .dim { color: var(--text-mute); }
3826
+ .live-run-meta-body code { background: transparent; padding: 0; font-size: 11.5px; color: var(--text-dim); }
3827
+ .live-run-meta-body .dim { color: var(--text-mute); }
3752
3828
 
3753
3829
  /* Steps: flat list. The separator between steps has to outweigh anything
3754
3830
  *inside* a step (e.g. evidence-meta footer) so the eye finds the
3755
3831
  step boundary at a glance — hence a solid var(--border), not the
3756
3832
  softer hairline used inside the step body. */
3757
- .nd-steps { list-style: none; padding: 0; margin: 0; display: flex; flex-direction: column; gap: 0; }
3758
- .nd-step { border-top: 1px solid var(--border); padding: 16px 0; background: transparent; }
3759
- .nd-step:first-child { border-top: 0; padding-top: 0; }
3760
- .nd-step.skipped { opacity: 0.55; }
3761
- .nd-step-head { display: flex; align-items: baseline; gap: 8px; padding: 0; background: transparent; border-bottom: 0; font-size: 13px; margin-bottom: 6px; }
3762
- .nd-step-body { padding: 0; font-size: 12.5px; line-height: 1.55; }
3763
- .nd-step-body p { margin: 4px 0; }
3764
- .nd-instr strong { color: var(--text-mute); font-weight: 600; margin-right: 4px; font-size: 11px; letter-spacing: 0.04em; text-transform: uppercase; }
3833
+ .live-steps { list-style: none; padding: 0; margin: 0; display: flex; flex-direction: column; gap: 0; }
3834
+ .live-step { border-top: 1px solid var(--border); padding: 16px 0; background: transparent; }
3835
+ .live-step:first-child { border-top: 0; padding-top: 0; }
3836
+ .live-step.skipped { opacity: 0.55; }
3837
+ .live-step-head { display: flex; align-items: baseline; gap: 8px; padding: 0; background: transparent; border-bottom: 0; font-size: 13px; margin-bottom: 6px; }
3838
+ .live-step-body { padding: 0; font-size: 12.5px; line-height: 1.55; }
3839
+ .live-step-body p { margin: 4px 0; }
3840
+ .live-instr strong { color: var(--text-mute); font-weight: 600; margin-right: 4px; font-size: 11px; letter-spacing: 0.04em; text-transform: uppercase; }
3765
3841
 
3766
3842
  /* Reasoning: left rail, no fill. */
3767
- .nd-reasoning { color: var(--text-dim); font-style: italic; background: transparent; padding: 4px 0 4px 12px; border-left: 2px solid var(--fail); border-radius: 0; margin: 6px 0; }
3768
- .nd-step.passed .nd-reasoning { border-left-color: var(--border); color: var(--text-mute); font-style: normal; }
3843
+ .live-reasoning { color: var(--text-dim); font-style: italic; background: transparent; padding: 4px 0 4px 12px; border-left: 2px solid var(--fail); border-radius: 0; margin: 6px 0; }
3844
+ .live-step.passed .live-reasoning { border-left-color: var(--border); color: var(--text-mute); font-style: normal; }
3769
3845
 
3770
- .nd-source { font-size: 11px; color: var(--text-mute); }
3771
- .nd-shots { display: flex; gap: 12px; margin-top: 10px; flex-wrap: wrap; }
3772
- .nd-shots .shot { display: flex; flex-direction: column; align-items: center; gap: 4px; text-decoration: none; color: var(--text-mute); font-size: 10px; letter-spacing: 0.08em; }
3773
- .nd-shots .shot img { max-width: 280px; max-height: 180px; border: 1px solid var(--border-soft); border-radius: 3px; object-fit: contain; background: #000; }
3846
+ .live-source { font-size: 11px; color: var(--text-mute); }
3847
+ .live-shots { display: flex; gap: 12px; margin-top: 10px; flex-wrap: wrap; }
3848
+ .live-shots .shot { display: flex; flex-direction: column; align-items: center; gap: 4px; text-decoration: none; color: var(--text-mute); font-size: 10px; letter-spacing: 0.08em; }
3849
+ .live-shots .shot img { max-width: 280px; max-height: 180px; border: 1px solid var(--border-soft); border-radius: 3px; object-fit: contain; background: #000; }
3774
3850
 
3775
3851
  /* Cost / model chips: muted text, no fill. */
3776
- .nd-cost, .nd-model {
3852
+ .live-cost, .live-model {
3777
3853
  font-size: 11px; padding: 0;
3778
3854
  background: transparent;
3779
3855
  color: var(--text-mute);
@@ -4083,6 +4159,123 @@ const CLIENT_JS = `
4083
4159
  })();
4084
4160
  `;
4085
4161
  //#endregion
4162
+ //#region src/runtime/profile-env.ts
4163
+ /**
4164
+ * Profile env (Issue #37). A profile is a named `.env` under
4165
+ * `.ccqa/profiles/<name>.env`; its contents merge into `process.env` before any
4166
+ * spec work, so one spec targets dev/stg/prd without per-environment copies.
4167
+ * Spec `${VAR}` references all resolve against `process.env` downstream.
4168
+ *
4169
+ * The `.env` parser is a small hand-rolled subset (no dotenv dependency).
4170
+ */
4171
/**
 * Parse a `.env` body into a `name → value` map.
 *
 * Supported subset: blank lines and lines starting with `#` are skipped, an
 * optional leading `export ` is removed, each line is split on its first `=`,
 * surrounding quotes are stripped, and an inline `# comment` is dropped.
 * Multi-line values and variable interpolation are not supported.
 *
 * An empty value followed by a comment (`KEY=   # note`) resolves to `""`.
 * The check runs on the untrimmed right-hand side, because once it is trimmed
 * the comment can no longer be told apart from a literal value such as
 * `KEY=#fff` (which is kept verbatim).
 *
 * @param {string} content Raw text of the `.env` file.
 * @returns {Record<string, string>} Parsed variables; later lines win over
 *   earlier ones for the same key.
 */
function parseDotenv(content) {
	const out = {};
	for (const rawLine of content.split(/\r?\n/)) {
		const line = rawLine.trim();
		if (line === "" || line.startsWith("#")) continue;
		const withoutExport = line.replace(/^export\s+/, "");
		const eq = withoutExport.indexOf("=");
		if (eq === -1) continue;
		const key = withoutExport.slice(0, eq).trim();
		if (key === "") continue;
		const rhs = withoutExport.slice(eq + 1);
		// Whitespace then `#` directly after `=` means "no value, only a comment".
		out[key] = /^\s+#/.test(rhs) ? "" : parseValue(rhs.trim());
	}
	return out;
}
/**
 * Normalise the already-trimmed right-hand side of a `.env` assignment.
 *
 * A value that opens with `"` or `'` and has a matching closing quote followed
 * only by whitespace and/or a `#` comment is returned without its quotes.
 * Anything else is treated as unquoted: text from the first
 * whitespace-then-`#` onwards is removed and trailing whitespace trimmed.
 *
 * @param {string} raw Trimmed text after the first `=`.
 * @returns {string} The resolved value.
 */
function parseValue(raw) {
	const quote = raw[0];
	if (quote === "\"" || quote === "'") {
		const close = raw.indexOf(quote, 1);
		// Only accept the quoted form when nothing but a comment follows it.
		if (close !== -1 && /^\s*(#.*)?$/.test(raw.slice(close + 1))) return raw.slice(1, close);
	}
	const hash = raw.search(/\s#/);
	return hash === -1 ? raw : raw.slice(0, hash).trimEnd();
}
4199
/**
 * Error describing a profile whose backing file could not be found. Carries
 * the requested profile name and the path that was looked up.
 */
var ProfileNotFoundError = class extends Error {
	profile;
	path;
	constructor(profile, path) {
		const message = `profile "${profile}" not found: ${path}`;
		super(message);
		Object.assign(this, {
			name: "ProfileNotFoundError",
			profile,
			path
		});
	}
};
4209
/**
 * Error describing a profile name that is not a bare, single-segment name.
 * Carries the rejected name on `profile`.
 */
var InvalidProfileNameError = class extends Error {
	profile;
	constructor(profile) {
		const message = `invalid profile name "${profile}": expected a bare name like "stg" (no path separators, no leading dot)`;
		super(message);
		Object.assign(this, {
			name: "InvalidProfileNameError",
			profile
		});
	}
};
4217
/**
 * Guard that a profile name is one plain path segment: non-empty, free of
 * `/` and `\`, and not starting with a dot. That keeps `--profile <name>` from
 * addressing a file outside the profiles directory (for example
 * `--profile ../../etc/hosts`). Because separators and a leading dot are
 * rejected, `..` traversal is already impossible, so a name that merely
 * contains `..` (such as `v1..2`) is accepted.
 *
 * @param {string} profile Candidate profile name.
 * @throws {InvalidProfileNameError} When the name breaks any rule above.
 */
function assertValidProfileName(profile) {
	const isEmpty = profile === "";
	const hasSeparator = ["/", "\\"].some((sep) => profile.includes(sep));
	const isDotLeading = profile.startsWith(".");
	if (isEmpty || hasSeparator || isDotLeading) throw new InvalidProfileNameError(profile);
}
4226
/**
 * Resolve the `.env` file backing `<profile>`:
 * `<cwd>/.ccqa/profiles/<profile>.env`. The name is validated first, so an
 * invalid name throws before any path is built.
 */
function profilePath(profile, cwd) {
	assertValidProfileName(profile);
	const fileName = `${profile}.env`;
	return join(cwd, ".ccqa", "profiles", fileName);
}
4231
/**
 * Read the file at `path` as UTF-8 and parse it as a `.env` body.
 *
 * @returns the parsed map, or `null` when the file does not exist (`ENOENT`).
 *   Every other read error is rethrown unchanged.
 */
async function readDotenv(path) {
	let body;
	try {
		body = await readFile(path, "utf8");
	} catch (err) {
		// Only "file is absent" is an expected outcome here.
		if (err.code !== "ENOENT") throw err;
		return null;
	}
	return parseDotenv(body);
}
4242
/**
 * Load and parse `.ccqa/profiles/<profile>.env` under `cwd`.
 *
 * A missing file is an error rather than an empty result: a mistyped profile
 * name should fail loudly instead of silently resolving every credential to
 * empty.
 *
 * @throws {ProfileNotFoundError} When the profile file does not exist.
 */
async function loadProfileEnv(profile, cwd) {
	const envFile = profilePath(profile, cwd);
	const parsed = await readDotenv(envFile);
	if (parsed !== null) return parsed;
	throw new ProfileNotFoundError(profile, envFile);
}
4252
/** Location of the default `.env` (directly under `cwd`) that ccqa loads when `--profile` is absent. */
function defaultEnvPath(cwd) {
	const fileName = ".env";
	return join(cwd, fileName);
}
4256
/**
 * Load `<cwd>/.env`, the fallback used when no `--profile` is given.
 *
 * @returns the parsed variables, or `null` when there is no `.env` — in that
 *   case the run simply keeps the existing `process.env`.
 */
async function loadDefaultEnv(cwd) {
	const envFile = defaultEnvPath(cwd);
	return readDotenv(envFile);
}
4263
/**
 * Copy `vars` into `process.env`.
 *
 * @param vars Name → value map to apply.
 * @param opts `override` (default `true`): when true the given values replace
 *   whatever is already set; when false, names that already have a value in
 *   `process.env` are left untouched.
 * @returns the names that were written. Values are deliberately not returned
 *   so callers can log the result without leaking secrets.
 */
function applyProfileEnv(vars, opts = {}) {
	const keepExisting = !(opts.override ?? true);
	const applied = [];
	Object.entries(vars).forEach(([name, value]) => {
		const alreadySet = process.env[name] !== void 0;
		if (keepExisting && alreadySet) return;
		process.env[name] = value;
		applied.push(name);
	});
	return applied;
}
4278
+ //#endregion
4086
4279
  //#region src/cli/options.ts
4087
4280
  /**
4088
4281
  * Shared `--language` flag. Every Claude-driven command writes some
@@ -4093,6 +4286,53 @@ const CLIENT_JS = `
4093
4286
  function addLanguageOption(command) {
4094
4287
  return command.option("--language <bcp47>", "Language for human-readable output (e.g. 'en', 'ja'). Default 'auto' follows the language of the spec/codebase.", DEFAULT_LANGUAGE);
4095
4288
  }
4289
/**
 * Register the shared `--profile <name>` flag on a command. The browser-driving
 * commands (`run`, `record`) both go through this helper so the flag's help
 * text and behaviour cannot drift apart.
 *
 * @returns whatever `command.option(...)` returns.
 */
function addProfileOption(command) {
	const flags = "--profile <name>";
	const description = "Load .ccqa/profiles/<name>.env into the environment before resolving spec ${VAR} references (URLs, credentials), so one spec can target dev/stg/prd without per-environment copies. Profile values override the inherited environment.";
	return command.option(flags, description);
}
4296
/**
 * Populate `process.env` for a `run` / `record` invocation before any spec
 * work starts.
 *
 * - `profile` given: load that named profile.
 * - `profile` absent: load `<cwd>/.env` if it exists.
 *
 * The test is strictly against `undefined`, so `--profile ""` is routed to the
 * named-profile path (where it is rejected) instead of being treated as "no
 * profile".
 */
async function applyProfileFromOption(profile, cwd) {
	if (profile === void 0) {
		await applyDefaultEnv(cwd);
		return;
	}
	await applyNamedProfile(profile, cwd);
}
4307
/** Format a variable count for the meta line: "1 var" for exactly one, "<n> vars" otherwise. */
function varCount(n) {
	let plural = "s";
	if (n === 1) plural = "";
	return `${n} var${plural}`;
}
4311
/**
 * Load the named profile, merge it into `process.env`, and print a meta line
 * with the number of variables applied (plus a warning when that number is
 * zero). Any failure is reported and ends the process with exit code 2.
 */
async function applyNamedProfile(profile, cwd) {
	try {
		const vars = await loadProfileEnv(profile, cwd);
		const appliedNames = applyProfileEnv(vars);
		const count = appliedNames.length;
		meta("profile", `${profile} (${varCount(count)})`);
		if (count === 0) warn(`profile "${profile}" defined no variables — spec $\{VAR} references will resolve to empty`);
	} catch (err) {
		const isMissing = err instanceof ProfileNotFoundError;
		const isBadName = !isMissing && err instanceof InvalidProfileNameError;
		if (isMissing || isBadName) {
			error(err.message);
			// Only a missing file gets the "create it" hint.
			if (isMissing) hint(`create ${err.path} with the environment's $\{VAR} values`);
		} else {
			const detail = err instanceof Error ? err.message : String(err);
			error(`failed to load profile "${profile}": ${detail}`);
		}
		process.exit(2);
	}
}
4325
/**
 * Load `<cwd>/.env` when present and merge it into `process.env` without
 * overriding values that are already set, then print a meta line with the
 * number of variables applied. A missing `.env` is a silent no-op; a read
 * failure is reported and ends the process with exit code 2.
 */
async function applyDefaultEnv(cwd) {
	let vars;
	try {
		vars = await loadDefaultEnv(cwd);
	} catch (err) {
		const envFile = defaultEnvPath(cwd);
		const detail = err instanceof Error ? err.message : String(err);
		error(`failed to load ${envFile}: ${detail}`);
		process.exit(2);
	}
	if (vars === null) return;
	const appliedNames = applyProfileEnv(vars, { override: false });
	meta("env", `.env (${varCount(appliedNames.length)})`);
}
4096
4336
  //#endregion
4097
4337
  //#region src/cli/resolve-cwd.ts
4098
4338
  /**
@@ -4104,7 +4344,7 @@ function addLanguageOption(command) {
4104
4344
  *
4105
4345
  * It's mostly useful in monorepos where you want to invoke ccqa from the
4106
4346
  * repo root but target a subpackage (e.g.
4107
- * `ccqa run --cwd js/apps/knowledge-webapp`).
4347
+ * `ccqa run --cwd apps/web-app`).
4108
4348
  *
4109
4349
  * Falls back to `process.cwd()` when the option is not given.
4110
4350
  */
@@ -4249,7 +4489,7 @@ function formatAgentBrowserUnavailableMessage() {
4249
4489
  //#region src/cli/preflight.ts
4250
4490
  /**
4251
4491
  * Shared startup steps for every command that drives a real `agent-browser`
4252
- * (currently `ccqa trace` and `ccqa run-nd`):
4492
+ * (currently `ccqa record` (trace) and `ccqa run` (live mode)):
4253
4493
  *
4254
4494
  * 1. Verify the peer-installed agent-browser binary is reachable. On
4255
4495
  * failure print the standard guidance and `process.exit(1)`; on
@@ -4276,14 +4516,14 @@ async function preflightAgentBrowserCommand() {
4276
4516
  await warnStaleBlockArtifacts();
4277
4517
  }
4278
4518
  //#endregion
4279
- //#region src/report/nd-transcript-excerpt.ts
4519
+ //#region src/report/live-transcript-excerpt.ts
4280
4520
  /**
4281
4521
  * Build a compact transcript summary for the failure classifier.
4282
4522
  *
4283
4523
  * Returns `null` when the run has no failed step (every step passed/skipped),
4284
4524
  * since the failure analyzer has nothing to explain in that case.
4285
4525
  */
4286
- async function buildNdTranscriptExcerpt(result, options = {}) {
4526
+ async function buildLiveTranscriptExcerpt(result, options = {}) {
4287
4527
  const failingIndex = result.steps.findIndex((s) => s.status === "failed");
4288
4528
  if (failingIndex === -1) return null;
4289
4529
  const failingStep = result.steps[failingIndex];
@@ -4316,7 +4556,7 @@ function oneLine$1(s) {
4316
4556
  return s.replace(/\s+/g, " ").trim();
4317
4557
  }
4318
4558
  //#endregion
4319
- //#region src/runtime/nd-artifacts.ts
4559
+ //#region src/runtime/live-artifacts.ts
4320
4560
  /**
4321
4561
  * Build a sortable run id from the current wall-clock time. ISO8601 with
4322
4562
  * `:` / `.` replaced so it's filename-safe. Caller is expected to mkdir the
@@ -4346,6 +4586,12 @@ function stepArtifactPaths(runDir, stepId) {
4346
4586
  //#endregion
4347
4587
  //#region src/claude/agent-browser-invoke.ts
4348
4588
  function agentBrowserInvokeBase(input) {
4589
+ const env = {
4590
+ AGENT_BROWSER_SESSION: input.sessionName,
4591
+ CCQA_RUN_ID: input.runId,
4592
+ PATH: pathWithAgentBrowserShim(process.env["PATH"])
4593
+ };
4594
+ if (input.statePath) env["CCQA_AB_STATE"] = input.statePath;
4349
4595
  return {
4350
4596
  allowedTools: [
4351
4597
  "Bash(*)",
@@ -4353,23 +4599,25 @@ function agentBrowserInvokeBase(input) {
4353
4599
  "Grep",
4354
4600
  "Glob"
4355
4601
  ],
4356
- env: {
4357
- AGENT_BROWSER_SESSION: input.sessionName,
4358
- CCQA_RUN_ID: input.runId,
4359
- PATH: pathWithAgentBrowserShim(process.env["PATH"])
4360
- }
4602
+ env
4361
4603
  };
4362
4604
  }
4363
4605
  //#endregion
4364
- //#region src/prompts/run-nd.ts
4365
- function generateRunNdSessionName() {
4366
- return `ccqa-run-nd-${buildRunId()}`;
4606
+ //#region src/prompts/live.ts
4607
/**
 * Build a unique agent-browser session name of the form
 * `ccqa-live-<runId>-<8 chars of a random UUID>`.
 *
 * The run id is a millisecond-precision wall-clock stamp, so with
 * `--concurrency > 1` two specs can start within the same millisecond and get
 * the same id. The random suffix keeps their names distinct, so each spec
 * gets its own Chrome session and state never bleeds across parallel runs.
 */
function generateLiveSessionName() {
	const runId = buildRunId();
	const suffix = randomUUID().slice(0, 8);
	return `ccqa-live-${runId}-${suffix}`;
}
4368
4616
  /**
4369
- * Static prefix of the `ccqa run-nd` system prompt. Built once per run and
4370
- * reused across every step's invocation — the only piece that changes per
4371
- * step is the trailing "Your Task: <stepId>" section produced by
4372
- * `buildRunNdSystemPromptStepSection`. Keeping the split here lets the prompt
4617
+ * Static prefix of the `ccqa run` (live spec) system prompt. Built once per
4618
+ * run and reused across every step's invocation — the only piece that
4619
+ * changes per step is the trailing "Your Task: <stepId>" section produced by
4620
+ * `buildLiveSystemPromptStepSection`. Keeping the split here lets the prompt
4373
4621
  * cache absorb the shared bulk and keeps each turn's prompt construction down
4374
4622
  * to a small string concat.
4375
4623
  *
@@ -4378,32 +4626,35 @@ function generateRunNdSessionName() {
4378
4626
  * but never names a specific product, URL, account, role, or UI element.
4379
4627
  * Project-specific guidance ("the admin tenant is foo.example", "session
4380
4628
  * times out at X minutes", …) is appended from
4381
- * `.ccqa/prompts/run-nd.user.md` by the caller, so ccqa stays clean of
4382
- * downstream-product context.
4629
+ * `.ccqa/prompts/live.user.md` (human-maintained) and
4630
+ * `.ccqa/prompts/live.agent.md` (updated by `ccqa run --update-agent-prompt`)
4631
+ * by the caller, so ccqa stays clean of downstream-product context.
4383
4632
  *
4384
- * Constraint posture: `ccqa trace` enforces a strict selector whitelist and
4385
- * blocks `eval` / `@ref` / chained agent-browser invocations because those
4386
- * trace outputs need to replay deterministically. `run-nd` has no replay —
4387
- * the model judges the step live — so those guards are off and the model is
4388
- * told it may use any agent-browser subcommand and any selector strategy.
4389
- */
4390
- function buildRunNdSystemPromptPrefix(input) {
4633
+ * Constraint posture: `ccqa record` (trace) enforces a strict selector
4634
+ * whitelist and blocks `eval` / `@ref` / chained agent-browser invocations
4635
+ * because those trace outputs need to replay deterministically. Live specs
4636
+ * have no replay — the model judges the step live — so those guards are off
4637
+ * and the model is told it may use any agent-browser subcommand and any
4638
+ * selector strategy.
4639
+ */
4640
+ function buildLiveSystemPromptPrefix(input) {
4391
4641
  const stepsText = input.allSteps.map((s) => `### ${s.id} [${s.source}]
4392
4642
  - **Instruction**: ${s.instruction}
4393
4643
  - **Expected**: ${s.expected}`).join("\n\n");
4644
+ const stateLine = input.statePath ? `\n\nA pre-recorded auth-state file is provided at \`${input.statePath}\` (also in the env var \`CCQA_AB_STATE\`). **Always also pass \`--state "$CCQA_AB_STATE"\`** to every \`agent-browser\` command — this restores cookies and localStorage from a prior interactive login, so the user is already signed in to the application under test from step 1. The file is loaded read-only; do not run \`agent-browser state save\`.` : "";
4394
4645
  return `You are a QA execution agent. You are executing ONE step of a browser-based end-to-end test and judging whether the step's expected outcome was achieved. You are NOT recording a replayable test script — be flexible, explore the DOM as needed, and make a clear pass / fail call at the end.
4395
4646
 
4396
4647
  ## Session
4397
4648
 
4398
4649
  SESSION NAME: \`${input.sessionName}\`
4399
4650
 
4400
- Always pass \`--session ${input.sessionName}\` to every \`agent-browser\` command. The session persists across steps within this test run, so the browser state from previous steps is already loaded when this turn starts.
4651
+ Always pass \`--session ${input.sessionName}\` to every \`agent-browser\` command. The session persists across steps within this test run, so the browser state from previous steps is already loaded when this turn starts.${stateLine}
4401
4652
 
4402
4653
  ## Tools
4403
4654
 
4404
4655
  You have:
4405
4656
 
4406
- - **Bash** to run \`agent-browser\` (the full surface — \`open\`, \`snapshot\`, \`click\`, \`fill\`, \`press\`, \`wait\`, \`find\`, \`screenshot\`, \`eval\`, \`js\`, \`get\`, etc.). Any selector form is allowed: \`@ref\` (e.g. \`@e14\`), CSS selectors, \`text=...\`, \`[aria-label='...']\`, \`[data-testid='...']\`, bare tags inside \`find first/last/nth\` — whatever works for this single run. There is no replay contract to honour.
4657
+ - **Bash** to run \`agent-browser\` (the full surface — \`open\`, \`snapshot\`, \`click\`, \`fill\`, \`upload\`, \`press\`, \`wait\`, \`find\`, \`screenshot\`, \`eval\`, \`js\`, \`get\`, etc.). Any selector form is allowed: \`@ref\` (e.g. \`@e14\`), CSS selectors, \`text=...\`, \`[aria-label='...']\`, \`[data-testid='...']\`, bare tags inside \`find first/last/nth\` — whatever works for this single run. There is no replay contract to honour. For file inputs (\`<input type="file">\`) do NOT \`click\` the input — use \`agent-browser upload "<selector>" <path>\` so no OS file-picker dialog opens. Fixtures conventionally live under \`.ccqa/fixtures/\`; reference them via \`\${CCQA_FIXTURES_DIR}/<name>\`.
4407
4658
  - **Read / Grep / Glob** for inspecting the application source code when you need to find a selector or understand routing. Read-only — do not modify source files.
4408
4659
 
4409
4660
  ## Test Specification
@@ -4456,7 +4707,7 @@ Everything else you write (narrative, tool output summaries, etc.) is fine — o
4456
4707
  `;
4457
4708
  }
4458
4709
  /** Per-step trailer with the current step's instruction / expected. */
4459
- function buildRunNdSystemPromptStepSection(step) {
4710
+ function buildLiveSystemPromptStepSection(step) {
4460
4711
  return `
4461
4712
  ## Your Task: ${step.id}
4462
4713
 
@@ -4467,11 +4718,11 @@ Execute the instruction in the running browser session, then judge whether the e
4467
4718
  `;
4468
4719
  }
4469
4720
  /** Per-turn user message — the system prompt already carries all spec context. */
4470
- function buildRunNdUserPrompt(step) {
4721
+ function buildLiveUserPrompt(step) {
4471
4722
  return `Execute step ${step.id} and emit your STEP_RESULT verdict as instructed in the system prompt.`;
4472
4723
  }
4473
4724
  //#endregion
4474
- //#region src/runtime/nd-result-parse.ts
4725
+ //#region src/runtime/live-result-parse.ts
4475
4726
  const MAX_REASON_LEN = 2e3;
4476
4727
  /** Parse a single STEP_RESULT line. Returns null on malformed input. */
4477
4728
  function parseStepResultLine(line) {
@@ -4501,7 +4752,7 @@ function findLastStepResult(text) {
4501
4752
  //#region src/runtime/screenshot.ts
4502
4753
  /**
4503
4754
  * Take a PNG screenshot of the current page in the given agent-browser session
4504
- * and write it to `outPath`. Used by `ccqa run-nd` to capture per-step
4755
+ * and write it to `outPath`. Used by `ccqa run` (live mode) to capture per-step
4505
4756
  * artifacts (before / after the step's actions) so the human-readable run
4506
4757
  * report has a visual trail even though no AB_ACTION stream is recorded.
4507
4758
  *
@@ -4511,11 +4762,9 @@ function findLastStepResult(text) {
4511
4762
  * artifact, not a reason to abort the test step.
4512
4763
  */
4513
4764
  function takeScreenshot(sessionName, outPath, options) {
4514
- const args = [
4515
- "--session",
4516
- sessionName,
4517
- "screenshot"
4518
- ];
4765
+ const args = ["--session", sessionName];
4766
+ if (options?.statePath) args.push("--state", options.statePath);
4767
+ args.push("screenshot");
4519
4768
  if (options?.fullPage) args.push("--full");
4520
4769
  args.push(outPath);
4521
4770
  const res = spawnAB(args);
@@ -4530,10 +4779,10 @@ function takeScreenshot(sessionName, outPath, options) {
4530
4779
  };
4531
4780
  }
4532
4781
  //#endregion
4533
- //#region src/runtime/nd-executor.ts
4782
+ //#region src/runtime/live-executor.ts
4534
4783
  /**
4535
- * Run all spec steps once through Claude (non-deterministic mode). Each step
4536
- * is one Claude invocation that:
4784
+ * Run all spec steps once through Claude (live mode). Each step is one Claude
4785
+ * invocation that:
4537
4786
  * 1. takes a "before" screenshot of the live session
4538
4787
  * 2. lets Claude execute the step's instruction via agent-browser (full
4539
4788
  * surface, no replay-time selector constraints)
@@ -4544,20 +4793,23 @@ function takeScreenshot(sessionName, outPath, options) {
4544
4793
  * the overall run status flips to `failed`. The Chrome session persists
4545
4794
  * across steps so step N+1 starts on whatever page step N left the browser on.
4546
4795
  */
4547
- async function runNdExecutor(input) {
4796
+ async function runLiveExecutor(input) {
4548
4797
  const startedAt = /* @__PURE__ */ new Date();
4549
4798
  const stepResults = [];
4550
4799
  let overallFailed = false;
4551
- const promptPrefix = buildRunNdSystemPromptPrefix({
4800
+ const statePath = input.statePath ?? null;
4801
+ const promptPrefix = buildLiveSystemPromptPrefix({
4552
4802
  title: input.spec.title,
4553
4803
  allSteps: input.steps,
4554
- sessionName: input.sessionName
4804
+ sessionName: input.sessionName,
4805
+ statePath
4555
4806
  });
4556
4807
  const suffixBlock = input.systemPromptSuffix ? `\n## Project-specific guidance\n\n${input.systemPromptSuffix}\n` : "";
4557
4808
  const langDirective = languageDirective(input.language);
4558
4809
  const invokeBase = agentBrowserInvokeBase({
4559
4810
  sessionName: input.sessionName,
4560
- runId: input.runId
4811
+ runId: input.runId,
4812
+ statePath
4561
4813
  });
4562
4814
  const retries = Math.max(0, input.retries ?? 0);
4563
4815
  for (let i = 0; i < input.steps.length; i++) {
@@ -4571,8 +4823,8 @@ async function runNdExecutor(input) {
4571
4823
  const paths = stepArtifactPaths(input.runDir, step$1.id);
4572
4824
  await ensureDir(paths.beforePng);
4573
4825
  const stepStartedAt = Date.now();
4574
- const systemPrompt = promptPrefix + buildRunNdSystemPromptStepSection(step$1) + suffixBlock + langDirective;
4575
- const userPrompt = buildRunNdUserPrompt(step$1);
4826
+ const systemPrompt = promptPrefix + buildLiveSystemPromptStepSection(step$1) + suffixBlock + langDirective;
4827
+ const userPrompt = buildLiveUserPrompt(step$1);
4576
4828
  let attempt = 0;
4577
4829
  let lastOutcome = null;
4578
4830
  while (attempt <= retries) {
@@ -4602,7 +4854,7 @@ async function runNdExecutor(input) {
4602
4854
  }
4603
4855
  }
4604
4856
  async function executeStepAttempt(step, paths, systemPrompt, userPrompt) {
4605
- const before = takeScreenshot(input.sessionName, paths.beforePng);
4857
+ const before = takeScreenshot(input.sessionName, paths.beforePng, { statePath });
4606
4858
  if (!before.ok) warn(`screenshot (before, ${step.id}) failed: ${before.error}`);
4607
4859
  const transcriptParts = [];
4608
4860
  let isError = false;
@@ -4634,7 +4886,10 @@ async function runNdExecutor(input) {
4634
4886
  transcriptParts.push(`[ccqa] invokeClaudeStreaming threw: ${err instanceof Error ? err.message : String(err)}`);
4635
4887
  }
4636
4888
  const transcript = transcriptParts.join("\n");
4637
- const after = takeScreenshot(input.sessionName, paths.afterPng, { fullPage: true });
4889
+ const after = takeScreenshot(input.sessionName, paths.afterPng, {
4890
+ fullPage: true,
4891
+ statePath
4892
+ });
4638
4893
  if (!after.ok) warn(`screenshot (after, ${step.id}) failed: ${after.error}`);
4639
4894
  await writeFile(paths.logTxt, transcript || "(no assistant text captured)", "utf-8");
4640
4895
  const { status, reasoning } = judgeStepOutcome({
@@ -4750,24 +5005,24 @@ function truncateForLog$1(s) {
4750
5005
  return oneLine.length > 100 ? oneLine.slice(0, 100) + "…" : oneLine;
4751
5006
  }
4752
5007
  //#endregion
4753
- //#region src/report/nd-adapter.ts
5008
+ //#region src/report/live-adapter.ts
4754
5009
  /**
4755
- * Convert one `run-nd` execution result into the persistence-layer
4756
- * `ReportSpecResult` shape consumed by `renderRunReport`. The conversion
4757
- * does two non-trivial things:
5010
+ * Convert one live-mode (`mode: live`) execution result into the
5011
+ * persistence-layer `ReportSpecResult` shape consumed by `renderRunReport`.
5012
+ * The conversion does two non-trivial things:
4758
5013
  *
4759
5014
  * - rewrites the executor's absolute `beforePng`/`afterPng` paths as
4760
5015
  * `reportDir`-relative hrefs so the rendered HTML opens its PNGs
4761
5016
  * directly when the report dir + the run dir are downloaded together
4762
5017
  * as a CI artifact bundle
4763
5018
  * - nulls out every vitest-only field so the report renderer falls
4764
- * through to its `ndRun` branch
5019
+ * through to its `liveRun` branch
4765
5020
  *
4766
5021
  * Lives in `src/report/` (not the CLI) because the relative-path contract
4767
- * on `NdReportStep.beforePng`/`afterPng` is a report-layer invariant,
5022
+ * on `LiveReportStep.beforePng`/`afterPng` is a report-layer invariant,
4768
5023
  * documented next to the schema, and the CLI should not own it.
4769
5024
  */
4770
- function ndRunToReportResult(args) {
5025
+ function liveRunToReportResult(args) {
4771
5026
  const { featureName, specName, specYaml, result, reportDir } = args;
4772
5027
  const steps = result.steps.map((s) => ({
4773
5028
  stepId: s.stepId,
@@ -4781,7 +5036,7 @@ function ndRunToReportResult(args) {
4781
5036
  durationMs: s.durationMs,
4782
5037
  cost: { ...s.cost }
4783
5038
  }));
4784
- const ndRun = {
5039
+ const liveRun = {
4785
5040
  runId: result.runId,
4786
5041
  sessionName: result.sessionName,
4787
5042
  startedAt: result.startedAt,
@@ -4804,16 +5059,16 @@ function ndRunToReportResult(args) {
4804
5059
  diffExcerpt: null,
4805
5060
  specYaml,
4806
5061
  evidence: null,
4807
- ndRun
5062
+ liveRun
4808
5063
  };
4809
5064
  }
4810
5065
  function relativeIfPresent(absPath, reportDir) {
4811
5066
  return absPath === null ? null : relative(reportDir, absPath);
4812
5067
  }
4813
5068
  //#endregion
4814
- //#region src/cli/run-nd.ts
5069
+ //#region src/cli/run-live.ts
4815
5070
  /**
4816
- * Run pre-filtered `mode: live` specs through `runNdExecutor` (Claude +
5071
+ * Run pre-filtered `mode: live` specs through `runLiveExecutor` (Claude +
4817
5072
  * agent-browser) and, when `reportDir` is set, run drift audit + failure
4818
5073
  * analysis to produce report rows. Sibling of `runDeterministicSpecs`.
4819
5074
  */
@@ -4825,24 +5080,25 @@ async function runLiveSpecs(specs, opts) {
4825
5080
  const cwd = opts.cwd ?? process.cwd();
4826
5081
  await preflightAgentBrowserCommand();
4827
5082
  meta("live-specs", specs.length);
4828
- const userPromptSuffix = await loadRunNdUserPrompt(cwd);
4829
- if (userPromptSuffix !== null) meta("user-prompt", ".ccqa/prompts/run-nd.user.md");
4830
- const runs = [];
4831
- for (let i = 0; i < specs.length; i++) {
4832
- const { featureName, specName } = specs[i];
4833
- const label = `${featureName}/${specName}`;
4834
- if (specs.length > 1) {
4835
- blank();
4836
- info(`[${i + 1}/${specs.length}] ${label}`);
4837
- }
4838
- runs.push(await runOneSpec({
4839
- featureName,
4840
- specName,
4841
- opts,
4842
- userPromptSuffix,
4843
- cwd
4844
- }));
4845
- }
5083
+ const userPromptBundle = await loadLivePromptBundle(cwd);
5084
+ if (userPromptBundle !== null) meta("prompt", userPromptBundle.loaded.join(" + "));
5085
+ const userPromptSuffix = userPromptBundle?.text ?? null;
5086
+ const concurrency = Math.max(1, opts.concurrency ?? 1);
5087
+ const runs = await runPool(specs, concurrency, (spec, i) => {
5088
+ const label = `${spec.featureName}/${spec.specName}`;
5089
+ return withBuffer(label, concurrency > 1, () => {
5090
+ if (concurrency === 1 && specs.length > 1) {
5091
+ blank();
5092
+ info(`[${i + 1}/${specs.length}] ${label}`);
5093
+ }
5094
+ return runOneSpec({
5095
+ ...spec,
5096
+ opts,
5097
+ userPromptSuffix,
5098
+ cwd
5099
+ });
5100
+ });
5101
+ });
4846
5102
  const failedCount = runs.filter((r) => r.kind === "error" || r.kind === "run" && r.result.status === "failed").length;
4847
5103
  blank();
4848
5104
  meta("live-summary", `${runs.length - failedCount} passed / ${failedCount} failed`);
@@ -4859,7 +5115,7 @@ function buildLiveReportResults(runs, driftBySpec, analysisBySpec, reportDir, fa
4859
5115
  if (r.kind !== "run") return [];
4860
5116
  const key = `${r.featureName}/${r.specName}`;
4861
5117
  return [{
4862
- ...ndRunToReportResult({
5118
+ ...liveRunToReportResult({
4863
5119
  featureName: r.featureName,
4864
5120
  specName: r.specName,
4865
5121
  specYaml: r.specYaml,
@@ -4889,7 +5145,7 @@ function analysisFieldsFor(a, status, failureAnalysisEnabled) {
4889
5145
  /**
4890
5146
  * Run `analyzeDrift` against every successfully-loaded spec and return a
4891
5147
  * `featureName/specName → driftIssues` map. Drift findings are advisory —
4892
- * they show in the HTML report but do not change the run-nd exit code.
5148
+ * they show in the HTML report but do not change the live-run exit code.
4893
5149
  */
4894
5150
  async function runDriftAudit(runs, opts, cwd) {
4895
5151
  const targets = runs.filter((r) => r.kind === "run").map((r) => ({
@@ -4939,18 +5195,36 @@ async function runOneSpec(args) {
4939
5195
  meta("steps", expanded.length);
4940
5196
  const includes = collectIncludedBlockNames(spec);
4941
5197
  if (includes.length > 0) meta("blocks", includes.join(", "));
4942
- const sessionName = generateRunNdSessionName();
5198
+ const sessionName = generateLiveSessionName();
4943
5199
  meta("session", sessionName);
5200
+ let statePath = null;
5201
+ if (spec.statePath) {
5202
+ statePath = isAbsolute(spec.statePath) ? spec.statePath : resolve(cwd, spec.statePath);
5203
+ try {
5204
+ await access(statePath);
5205
+ } catch {
5206
+ const msg = `spec.statePath points to a missing file: ${statePath}`;
5207
+ error(msg);
5208
+ return {
5209
+ kind: "error",
5210
+ featureName,
5211
+ specName,
5212
+ error: msg
5213
+ };
5214
+ }
5215
+ meta("state", statePath);
5216
+ }
4944
5217
  const runId = buildRunId();
4945
5218
  const runDir = opts.out ?? join(specDir, "runs", runId);
4946
5219
  await mkdir(runDir, { recursive: true });
4947
5220
  meta("runDir", runDir);
4948
- const result = await runNdExecutor({
5221
+ const result = await runLiveExecutor({
4949
5222
  spec: { title: spec.title },
4950
5223
  steps: expanded,
4951
5224
  runId,
4952
5225
  runDir,
4953
5226
  sessionName,
5227
+ statePath,
4954
5228
  systemPromptSuffix: userPromptSuffix,
4955
5229
  model: opts.model,
4956
5230
  language: opts.language,
@@ -4963,7 +5237,7 @@ async function runOneSpec(args) {
4963
5237
  meta("saved", runJsonPath);
4964
5238
  meta("status", result.status.toUpperCase());
4965
5239
  meta("step-summary", `${count(result.steps, "passed")} passed / ${count(result.steps, "failed")} failed / ${count(result.steps, "skipped")} skipped`);
4966
- const costLine = formatNdCost(result.cost, { compact: false });
5240
+ const costLine = formatLiveCost(result.cost, { compact: false });
4967
5241
  if (costLine) meta("cost", costLine);
4968
5242
  return {
4969
5243
  kind: "run",
@@ -4975,7 +5249,7 @@ async function runOneSpec(args) {
4975
5249
  };
4976
5250
  }
4977
5251
  function logBatchCost(runs) {
4978
- const line = formatNdBatchCost(runs.flatMap((r) => r.kind === "run" ? [r.result.cost] : []));
5252
+ const line = formatLiveBatchCost(runs.flatMap((r) => r.kind === "run" ? [r.result.cost] : []));
4979
5253
  if (line) meta("total-cost", line);
4980
5254
  }
4981
5255
  /**
@@ -5005,7 +5279,7 @@ async function runFailureAnalysisForLiveRuns(runs, driftBySpec, opts, cwd) {
5005
5279
  for (const r of failed) {
5006
5280
  const key = `${r.featureName}/${r.specName}`;
5007
5281
  info(`failure analysis: ${key}`);
5008
- const excerpt = await buildNdTranscriptExcerpt(r.result);
5282
+ const excerpt = await buildLiveTranscriptExcerpt(r.result);
5009
5283
  if (excerpt === null) {
5010
5284
  out.set(key, {
5011
5285
  analysis: null,
@@ -5016,7 +5290,7 @@ async function runFailureAnalysisForLiveRuns(runs, driftBySpec, opts, cwd) {
5016
5290
  continue;
5017
5291
  }
5018
5292
  const outcome = await analyzeFailure({
5019
- ndTranscriptExcerpt: excerpt,
5293
+ liveTranscriptExcerpt: excerpt,
5020
5294
  specYaml: r.specYaml,
5021
5295
  diffPatch: diff.ok ? diff.diff.patch : null,
5022
5296
  changedFiles: diff.ok ? diff.diff.nameStatus : null,
@@ -5067,6 +5341,100 @@ function oneLine(s) {
5067
5341
  return s.replace(/\s+/g, " ").trim();
5068
5342
  }
5069
5343
  //#endregion
5344
+ //#region src/prompts/agent-update.ts
5345
/**
 * Build the system prompt for the agent-prompt refresh call.
 *
 * @param {object} input
 * @param {string} input.mode - `"live"` selects the live wording; any other
 *   value selects the record wording. Also used to derive the
 *   `<mode>.user.md` / `<mode>.agent.md` file labels.
 * @param {string} [input.language] - Forwarded to `outputLanguageBlock`;
 *   `"auto"` is used when it is null or undefined.
 * @returns {string} The full system prompt text.
 */
function buildAgentUpdateSystemPrompt(input) {
  const { mode } = input;
  const isLive = mode === "live";
  const agentMdLabel = `${mode}.agent.md`;
  const userMdLabel = `${mode}.user.md`;
  const modeLabel = isLive
    ? "live (Claude drives every step at run time)"
    : "record (Claude records browser actions for vitest replay)";
  // Where the agent file gets appended, phrased per mode.
  const appendedFor = isLive ? "step of every `mode: live` spec" : "trace run of `ccqa record`";
  const languageBlock = outputLanguageBlock(input.language ?? "auto", "the bullet text", "headings, agent-browser subcommand names, selector tokens");
  return `You maintain the auto-learned half of ccqa's prompt bundle for ${modeLabel}.

${languageBlock}## What you are updating

\`.ccqa/prompts/${agentMdLabel}\` is appended to ccqa's system prompt for every ${appendedFor}. It is meant to capture **stable lessons learned from past runs** — concrete selectors that worked, login flow quirks the agent kept tripping on, common "this is fine" warnings to ignore.

The sibling file \`${userMdLabel}\` carries human-maintained project guidance (URLs, naming conventions). Rules already well-covered by \`${userMdLabel}\` should NOT be repeated here.

## Output rules

- Emit the COMPLETE replacement contents of \`${agentMdLabel}\`.
- Concise bullet points. No narrative paragraphs. No preamble. No closing summary.
- Each bullet is a single declarative sentence (or one bullet → one short selector / command).
- Group related bullets under \`### …\` subheaders.
- Skip everything that was already true and well-covered by the previous file or \`${userMdLabel}\`. Only persist new lessons.
- Keep the whole file under ~3 KB.
- Output ONLY the new file contents. NO code fences. NO surrounding prose. NO markdown frontmatter.
- If the run summary contains nothing worth learning from, output the previous file unchanged.
`;
}
5369
/**
 * Build the user prompt for the agent-prompt refresh call: the previous file
 * contents, the run summary, and the task statement.
 *
 * @param {object} input
 * @param {string} input.mode - Used to derive the `<mode>.agent.md` label.
 * @param {string | null} [input.currentAgentMd] - Existing file contents. A
 *   missing or whitespace-only value is replaced by a placeholder sentence.
 * @param {string} input.runSummary - Summary text inserted verbatim.
 * @returns {string} The user prompt text.
 */
function buildAgentUpdateUserPrompt(input) {
  const agentMdLabel = `${input.mode}.agent.md`;
  const existing = input.currentAgentMd;
  const previousSection = existing && existing.trim().length > 0
    ? existing
    : "(no existing file — this will create one)";
  const lines = [
    `## Previous \`${agentMdLabel}\``,
    "",
    `${previousSection}`,
    "",
    "## Run summary",
    "",
    `${input.runSummary}`,
    "",
    "## Your task",
    "",
    `Write the new contents of \`${agentMdLabel}\`. Output ONLY the file contents — no preamble, no fences, no closing note.`
  ];
  return lines.join("\n");
}
5383
+ //#endregion
5384
+ //#region src/cli/update-agent-prompt.ts
5385
/**
 * Refresh `.ccqa/prompts/<mode>.agent.md` from the latest run.
 *
 * Reads the existing file (if any) and a caller-supplied run summary, sends
 * both to Claude, and writes the response back over the agent prompt file.
 *
 * This is an opt-in side step, so it degrades gracefully instead of
 * rejecting: missing auth, an unusable Claude response, or a failure while
 * calling Claude / writing the file is logged as a warning and the function
 * returns, so the caller's exit code is unaffected.
 *
 * @param {object} args
 * @param {string} args.mode - Prompt bundle to refresh; selects `<mode>.agent.md`.
 * @param {string} args.runSummary - Summary of the run, embedded in the user prompt.
 * @param {string} args.cwd - Directory that contains the `.ccqa/` tree.
 * @param {string} [args.model] - Forwarded to the Claude call when truthy.
 * @param {string} [args.language] - Forwarded to the prompt builders when truthy.
 * @returns {Promise<void>}
 */
async function updateAgentPrompt(args) {
  const { mode, runSummary, cwd, model, language } = args;
  const agentMdPath = join(cwd, ".ccqa", "prompts", `${mode}.agent.md`);
  const relPath = relative(cwd, agentMdPath);
  const auth = driftAuthAvailable();
  if (!auth.ok) {
    warn(`--update-agent-prompt skipped (${auth.reason})`);
    return;
  }
  const promptInput = {
    mode,
    // Any read failure is treated as "no existing file"; the prompt builder
    // then tells Claude it is creating the file from scratch.
    currentAgentMd: await readFile(agentMdPath, "utf-8").catch(() => null),
    runSummary,
    ...language ? { language } : {}
  };
  const systemPrompt = buildAgentUpdateSystemPrompt(promptInput);
  const userPrompt = buildAgentUpdateUserPrompt(promptInput);
  info(`--update-agent-prompt: refreshing ${relPath}`);
  try {
    const { result, isError } = await invokeClaudeStreaming({
      prompt: userPrompt,
      systemPrompt,
      allowedTools: [],
      disableBuiltinTools: true,
      ...model ? { model } : {}
    }, () => {});
    if (isError || !result || result.trim().length === 0) {
      warn(`--update-agent-prompt: Claude returned no usable output${isError ? " (SDK error)" : ""}; leaving ${relPath} unchanged`);
      return;
    }
    const newText = `${stripCodeFences(result.trim())}\n`;
    await mkdir(dirname(agentMdPath), { recursive: true });
    await writeFile(agentMdPath, newText, "utf-8");
    // Report the encoded size: `newText.length` counts UTF-16 code units and
    // under-reports any non-ASCII content.
    info(`--update-agent-prompt: wrote ${relPath} (${Buffer.byteLength(newText, "utf-8")} bytes)`);
    info(`--update-agent-prompt: review the diff with: git diff -- "${relPath}"`);
  } catch (err) {
    // A rejection here would otherwise escape to the caller and pre-empt its
    // own exit-code handling; log it and carry on.
    const reason = err instanceof Error ? err.message : String(err);
    warn(`--update-agent-prompt: failed to refresh ${relPath} (${reason})`);
  }
}
5428
/**
 * Some models still wrap the answer in a ```markdown fence despite the
 * system prompt asking otherwise. Strip a single outer fence when present so
 * the saved file is clean.
 *
 * The opening fence may carry any info string (`markdown`, `md`, `c++`, …)
 * and either LF or CRLF line endings are accepted. Text that is not wrapped
 * in exactly one outer fence is returned unchanged.
 *
 * @param {string} text - Candidate file contents, expected to be trimmed.
 * @returns {string} The fence body, or `text` itself when no outer fence matches.
 */
function stripCodeFences(text) {
  // Info string: anything up to the end of the opening line except backticks,
  // so an inline ```code``` run on the first line is not mistaken for a fence.
  const m = text.match(/^```[^\n`]*\r?\n([\s\S]*?)\r?\n```\s*$/);
  return m && m[1] !== void 0 ? m[1] : text;
}
5437
+ //#endregion
5070
5438
  //#region src/cli/changed-specs.ts
5071
5439
  /**
5072
5440
  * Filter specs to those affected by the git diff against the resolved base
@@ -5122,28 +5490,57 @@ async function resolveVitestConfig(cwd) {
5122
5490
  return bundledVitestConfigPath();
5123
5491
  }
5124
5492
  }
5125
- const runCommand = addLanguageOption(new Command("run").argument("[target]", "Spec to run: '<feature>/<spec>', '<feature>', or omit for all").description("Run specs. Each spec's execution mode comes from its spec.yaml `mode:` field (default deterministic; set `mode: live` to have Claude drive agent-browser live per step). Deterministic specs replay the recorded test.spec.ts under vitest. Pass --report to write one unified HTML report covering both modes.").option("--report [dir]", `Write a self-contained HTML run report (failure analysis + drift audit by default). Default dir: ${DEFAULT_REPORT_DIR}/`).option("--changed", "Restrict execution to specs whose relatedPaths intersect the git diff against --base (or, in CI, $GITHUB_BASE_REF, else origin/main). Cannot be combined with an explicit spec id.").option("--no-failure-analysis", "Skip the per-failure root-cause classification (TEST_DRIFT / SPEC_CHANGE / PRODUCT_BUG). --report only.").option("--no-drift-audit", "Skip the spec↔code drift audit shown in the report. --report only.").option("--base <ref>", "Base ref the source diff is taken against for failure analysis (default: GITHUB_BASE_REF, then origin/main).").option("--cwd <path>", "Working directory containing the .ccqa/ tree (monorepo support). Defaults to the current directory.").option("--format <fmt>", "Additional output format alongside HTML when --report is set: 'text' (default), 'json' (writes report.json), 'github' (GitHub Actions annotations on stdout).", (raw) => {
5493
+ const runCommand = addProfileOption(addLanguageOption(new Command("run").argument("[targets...]", "Specs to run, space-separated: each '<feature>/<spec>', '<feature>', or omit for all. Duplicates are de-duped.").description("Run specs. Each spec's execution mode comes from its spec.yaml `mode:` field (default deterministic; set `mode: live` to have Claude drive agent-browser live per step). Deterministic specs replay the recorded test.spec.ts under vitest. Pass --report to write one unified HTML report covering both modes.").option("--report [dir]", `Write a self-contained HTML run report (failure analysis + drift audit by default). Default dir: ${DEFAULT_REPORT_DIR}/`).option("--changed", "Restrict execution to specs whose relatedPaths intersect the git diff against --base (or, in CI, $GITHUB_BASE_REF, else origin/main). Cannot be combined with an explicit spec id.").option("--no-failure-analysis", "Skip the per-failure root-cause classification (TEST_DRIFT / SPEC_CHANGE / PRODUCT_BUG). --report only.").option("--no-drift-audit", "Skip the spec↔code drift audit shown in the report. --report only.").option("--base <ref>", "Base ref the source diff is taken against for failure analysis (default: GITHUB_BASE_REF, then origin/main).").option("--cwd <path>", "Working directory containing the .ccqa/ tree (monorepo support). Defaults to the current directory.").option("--format <fmt>", "Additional output format alongside HTML when --report is set: 'text' (default), 'json' (writes report.json), 'github' (GitHub Actions annotations on stdout).", (raw) => {
5126
5494
  if (REPORT_FORMATS.includes(raw)) return raw;
5127
5495
  throw new Error(`--format must be one of ${REPORT_FORMATS.join(" | ")}`);
5128
5496
  }, "text").option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID. Overrides CCQA_MODEL.").option("--no-evidence", `(deterministic only) Skip step-boundary evidence capture (PNG + meta JSON written to ${DEFAULT_REPORT_DIR}/${EVIDENCE_SUBDIR}/ by default).`).option("--retry <n>", "(live only) Retry each failed step up to N more times before recording failure. Default 0.", (raw) => {
5129
5497
  const n = Number(raw);
5130
5498
  if (!Number.isFinite(n) || n < 0 || Math.floor(n) !== n) throw new Error(`--retry must be a non-negative integer, got "${raw}"`);
5131
5499
  return n;
5132
- }, 0).option("--out <dir>", "(live only) Override the per-spec artifact directory. Default: <specDir>/runs/<runId>. Ignored when running multiple specs.")).action(async (target, opts) => {
5133
- await runDispatcher(target, opts);
5500
+ }, 0).option("--out <dir>", "(live only) Override the per-spec artifact directory. Default: <specDir>/runs/<runId>. Ignored when running multiple specs.").option("--update-agent-prompt", "(live only) After the run finishes, ask Claude to refresh .ccqa/prompts/live.agent.md from a summary of the run.").option("--concurrency <n>", "Run up to N specs in parallel within each mode (deterministic / live). Default 1 (sequential). Live specs each get an isolated agent-browser session; high values spawn many headed Chrome instances.", parseConcurrency$1, 1))).action(async (targets, opts) => {
5501
+ await runDispatcher(targets, opts);
5134
5502
  });
5503
/**
 * Parse the `--concurrency` option value.
 *
 * @param {string} raw - Raw option text from the command line.
 * @returns {number} The parsed value when it is an integer >= 1. For any
 *   other input an error is logged and the process exits with code 2.
 */
function parseConcurrency$1(raw) {
  const parsed = Number(raw);
  const isPositiveInteger = Number.isInteger(parsed) && parsed >= 1;
  if (isPositiveInteger) return parsed;
  error(`invalid --concurrency: ${raw} (expected positive integer)`);
  process.exit(2);
  return parsed;
}
5135
5512
  function resolveReportDir(report, cwd) {
5136
5513
  if (report === void 0 || report === false) return void 0;
5137
5514
  return resolve(cwd, typeof report === "string" ? report : DEFAULT_REPORT_DIR);
5138
5515
  }
5139
- async function runDispatcher(target, opts) {
5140
- header("run", target ?? (opts.changed ? "(changed)" : "(all specs)"));
5141
- if (opts.changed && target) {
5516
/**
 * Label shown after `ccqa run` in the header line.
 *
 * @param {string[]} targets - Explicit targets from the command line.
 * @param {{ changed?: boolean }} opts - Parsed options; only `changed` is read.
 * @returns {string} The single target, `"<n> targets"` for several, or a
 *   `(changed)` / `(all specs)` marker when no target was given.
 */
function headerTarget(targets, opts) {
  const count = targets.length;
  if (count === 0) return opts.changed ? "(changed)" : "(all specs)";
  return count === 1 ? targets[0] : `${count} targets`;
}
5522
/**
 * Drop repeated specs, identified by `featureName/specName`.
 *
 * @param {Iterable<{ featureName: string, specName: string }>} specs
 * @returns {Array<object>} A new array holding the first occurrence of each
 *   key, in first-seen order.
 */
function dedupeSpecs(specs) {
  // A Map iterates in insertion order, so first-seen order comes for free.
  const firstByKey = /* @__PURE__ */ new Map();
  for (const spec of specs) {
    const key = `${spec.featureName}/${spec.specName}`;
    if (!firstByKey.has(key)) firstByKey.set(key, spec);
  }
  return [...firstByKey.values()];
}
5534
+ async function runDispatcher(targets, opts) {
5535
+ header("run", headerTarget(targets, opts));
5536
+ if (opts.changed && targets.length > 0) {
5142
5537
  error("--changed and an explicit spec target cannot be combined");
5143
5538
  process.exit(2);
5144
5539
  }
5145
5540
  const cwd = resolveCwd(opts.cwd);
5146
- let specs = await resolveSpecTargets(target, () => listAllSpecsWithSpecFile(cwd), cwd);
5541
+ await applyProfileFromOption(opts.profile, cwd);
5542
+ const enumerateAll = () => listAllSpecsWithSpecFile(cwd);
5543
+ let specs = dedupeSpecs((await Promise.all((targets.length ? targets : [void 0]).map((t) => resolveSpecTargets(t, enumerateAll, cwd)))).flat());
5147
5544
  if (opts.changed) {
5148
5545
  const before = specs.length;
5149
5546
  specs = await collectChangedSpecs(specs, {
@@ -5163,7 +5560,8 @@ async function runDispatcher(target, opts) {
5163
5560
  if (liveSpecs.length === 0) {
5164
5561
  if (typeof opts.retry === "number" && opts.retry > 0) warn("--retry is ignored without any 'mode: live' spec");
5165
5562
  if (opts.out) warn("--out is ignored without any 'mode: live' spec");
5166
- }
5563
+ if (opts.updateAgentPrompt) warn("--update-agent-prompt is ignored without any 'mode: live' spec");
5564
+ } else if (opts.out && liveSpecs.length > 1) warn("--out is ignored when running multiple live specs");
5167
5565
  if (detSpecs.length === 0 && opts.evidence === false) warn("--no-evidence is ignored without any 'mode: deterministic' spec");
5168
5566
  blank();
5169
5567
  const reportDir = resolveReportDir(opts.report, cwd);
@@ -5172,11 +5570,12 @@ async function runDispatcher(target, opts) {
5172
5570
  const live = await runLiveSpecs(liveSpecs, {
5173
5571
  ...opts.model ? { model: opts.model } : {},
5174
5572
  ...opts.language ? { language: opts.language } : {},
5175
- ...opts.out ? { out: opts.out } : {},
5573
+ ...opts.out && liveSpecs.length === 1 ? { out: opts.out } : {},
5176
5574
  cwd,
5177
5575
  ...opts.base ? { base: opts.base } : {},
5178
5576
  ...reportDir ? { reportDir } : {},
5179
5577
  ...typeof opts.retry === "number" ? { retry: opts.retry } : {},
5578
+ concurrency: opts.concurrency ?? 1,
5180
5579
  ...reportDir && opts.driftAudit !== false ? { driftAudit: true } : {},
5181
5580
  ...reportDir && opts.failureAnalysis === false ? { failureAnalysis: false } : {}
5182
5581
  });
@@ -5192,9 +5591,39 @@ async function runDispatcher(target, opts) {
5192
5591
  opts
5193
5592
  });
5194
5593
  }
5594
+ if (opts.updateAgentPrompt && liveSpecs.length > 0) {
5595
+ blank();
5596
+ await updateAgentPrompt({
5597
+ mode: "live",
5598
+ runSummary: buildLiveRunSummary(live.reportResults),
5599
+ cwd,
5600
+ ...opts.model ? { model: opts.model } : {},
5601
+ ...opts.language ? { language: opts.language } : {}
5602
+ });
5603
+ }
5195
5604
  process.exit(overallExitCode);
5196
5605
  }
5197
5606
  /**
5607
+ * Compact, prompt-friendly summary of one ccqa run for the live agent-prompt
5608
+ * update step. One section per spec: header line + per-step verdicts.
5609
+ * Kept to a few KB even with many specs/steps so the prompt cache can absorb
5610
+ * the bulk.
5611
+ */
5612
+ function buildLiveRunSummary(results) {
5613
+ const sections = [];
5614
+ for (const r of results) {
5615
+ if (!r.liveRun) continue;
5616
+ const head = `## ${r.feature}/${r.spec} — ${r.status}`;
5617
+ const steps = r.liveRun.steps.map((s) => `- [${s.status}] ${s.stepId}: ${oneLineSummary$1(s.reasoning)}`).join("\n");
5618
+ sections.push(`${head}\n${steps}`);
5619
+ }
5620
+ return sections.length === 0 ? "(no live runs executed)" : sections.join("\n\n");
5621
+ }
5622
/**
 * Collapse a step's reasoning into one trimmed line for the run summary.
 *
 * Whitespace runs become single spaces. Text longer than 240 UTF-16 code
 * units is cut and suffixed with `…`; the cut backs off by one unit when it
 * would otherwise end on a high surrogate, so the output never contains half
 * of a surrogate pair.
 *
 * @param {string | null | undefined} s - Raw reasoning text.
 * @returns {string} The flattened text, or `"(no reason given)"` when `s` is
 *   null, undefined, or contains only whitespace.
 */
function oneLineSummary$1(s) {
  const limit = 240;
  // `?? ""` lets a missing reasoning reach the placeholder instead of throwing.
  const flat = String(s ?? "").replace(/\s+/g, " ").trim();
  if (flat.length === 0) return "(no reason given)";
  if (flat.length <= limit) return flat;
  const lastUnit = flat.charCodeAt(limit - 1);
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  return `${flat.slice(0, endsOnHighSurrogate ? limit - 1 : limit)}…`;
}
5626
+ /**
5198
5627
  * Run pre-filtered deterministic specs under vitest. Empty input is a no-op.
5199
5628
  * Captures step-boundary evidence under `<reportDir>/evidence/<feature>/<spec>/`
5200
5629
  * when enabled.
@@ -5205,72 +5634,83 @@ async function runDeterministicSpecs(specs, opts, cwd, reportDirAbs) {
5205
5634
  exitCode: 0
5206
5635
  };
5207
5636
  const tmpDir = await mkdtemp(join(tmpdir(), "ccqa-run-"));
5208
- const summaries = [];
5209
- let exitCode = 0;
5210
5637
  const vitestConfig = await resolveVitestConfig(cwd);
5211
5638
  const captureOutput = Boolean(opts.report);
5212
5639
  const evidenceRoot = opts.evidence !== false ? join(reportDirAbs, EVIDENCE_SUBDIR) : null;
5640
+ const concurrency = Math.max(1, opts.concurrency ?? 1);
5641
+ const ctx = {
5642
+ cwd,
5643
+ tmpDir,
5644
+ vitestConfig,
5645
+ captureOutput,
5646
+ evidenceRoot
5647
+ };
5213
5648
  try {
5214
- for (let i = 0; i < specs.length; i++) {
5215
- const { featureName, specName } = specs[i];
5216
- const scriptFile = await getTestScript(featureName, specName, cwd);
5217
- if (!scriptFile) {
5218
- warn(`${featureName}/${specName}: no test.spec.ts found`);
5219
- hint("run 'ccqa record <feature>/<spec>' to record it, or set 'mode: live' in spec.yaml");
5220
- continue;
5221
- }
5222
- run(`${featureName}/${specName}`);
5223
- meta("test", scriptFile);
5224
- blank();
5225
- const reportFile = join(tmpDir, `report-${i}.json`);
5226
- const evidenceDir = evidenceRoot ? join(evidenceRoot, featureName, specName) : null;
5227
- if (evidenceDir) {
5228
- await rm(evidenceDir, {
5229
- recursive: true,
5230
- force: true
5231
- });
5232
- await mkdir(evidenceDir, { recursive: true });
5233
- }
5234
- const proc = spawnVitestStreaming([
5235
- "run",
5236
- "--config",
5237
- vitestConfig,
5238
- scriptFile,
5239
- "--reporter=json",
5240
- `--outputFile.json=${reportFile}`
5241
- ], {
5242
- cwd,
5243
- env: evidenceDir ? {
5244
- ...process.env,
5245
- CCQA_EVIDENCE_DIR: evidenceDir
5246
- } : process.env
5247
- });
5248
- const tail = captureOutput ? new TailBuffer(OUTPUT_TAIL_CAP) : null;
5249
- await Promise.all([streamFiltered(proc.stdout, process.stdout, tail), streamFiltered(proc.stderr, process.stderr, tail)]);
5250
- const specExitCode = await proc.exited;
5251
- if (specExitCode !== 0) exitCode = specExitCode;
5252
- const report = await readReport(reportFile);
5253
- summaries.push({
5254
- featureName,
5255
- specName,
5256
- scriptFile,
5257
- report,
5258
- exitCode: specExitCode,
5259
- outputTail: tail ? tail.toString() : null,
5260
- evidenceDir
5261
- });
5262
- blank();
5263
- }
5649
+ const summaries = (await runPool(specs, concurrency, (spec, i) => withBuffer(`${spec.featureName}/${spec.specName}`, concurrency > 1, () => runOneDeterministicSpec(spec, i, ctx)))).filter((s) => s !== null);
5264
5650
  printSummary(summaries);
5651
+ return {
5652
+ summaries,
5653
+ exitCode: summaries.reduce((acc, s) => s.exitCode !== 0 ? s.exitCode : acc, 0)
5654
+ };
5265
5655
  } finally {
5266
5656
  await rm(tmpDir, {
5267
5657
  recursive: true,
5268
5658
  force: true
5269
5659
  });
5270
5660
  }
5661
+ }
5662
/**
 * Run one spec's recorded test script under vitest.
 *
 * The child's stdout and stderr are piped through `streamFiltered` into a
 * sink whose `write` is `emitRaw`, and into a `TailBuffer` when
 * `ctx.captureOutput` is set. When `ctx.evidenceRoot` is set, the spec's
 * evidence directory is wiped and recreated, and handed to the child via
 * `CCQA_EVIDENCE_DIR`.
 *
 * @param {{ featureName: string, specName: string }} spec - Spec to run.
 * @param {number} index - Position in the batch; keeps the JSON report file
 *   name unique inside `ctx.tmpDir`.
 * @param {object} ctx - Shared batch settings: `cwd`, `tmpDir`,
 *   `vitestConfig`, `captureOutput`, `evidenceRoot`.
 * @returns {Promise<object | null>} The spec's summary (script file, parsed
 *   report, exit code, output tail, evidence dir), or `null` when no
 *   test.spec.ts exists for the spec (skipped).
 */
async function runOneDeterministicSpec(spec, index, ctx) {
  const { featureName, specName } = spec;
  const { cwd, tmpDir, vitestConfig, captureOutput, evidenceRoot } = ctx;
  const specLabel = `${featureName}/${specName}`;
  const scriptFile = await getTestScript(featureName, specName, cwd);
  if (!scriptFile) {
    warn(`${specLabel}: no test.spec.ts found`);
    hint("run 'ccqa record <feature>/<spec>' to record it, or set 'mode: live' in spec.yaml");
    return null;
  }
  run(specLabel);
  meta("test", scriptFile);
  blank();
  const reportFile = join(tmpDir, `report-${index}.json`);
  let evidenceDir = null;
  let childEnv = process.env;
  if (evidenceRoot) {
    evidenceDir = join(evidenceRoot, featureName, specName);
    // Start from an empty directory so stale evidence from an earlier run
    // cannot leak into this one.
    await rm(evidenceDir, {
      recursive: true,
      force: true
    });
    await mkdir(evidenceDir, { recursive: true });
    childEnv = {
      ...process.env,
      CCQA_EVIDENCE_DIR: evidenceDir
    };
  }
  const vitestArgs = [
    "run",
    "--config",
    vitestConfig,
    scriptFile,
    "--reporter=json",
    `--outputFile.json=${reportFile}`
  ];
  const proc = spawnVitestStreaming(vitestArgs, {
    cwd,
    env: childEnv
  });
  const sink = { write: emitRaw };
  const tail = captureOutput ? new TailBuffer(OUTPUT_TAIL_CAP) : null;
  await Promise.all([
    streamFiltered(proc.stdout, sink, tail),
    streamFiltered(proc.stderr, sink, tail)
  ]);
  const specExitCode = await proc.exited;
  blank();
  const report = await readReport(reportFile);
  return {
    featureName,
    specName,
    scriptFile,
    report,
    exitCode: specExitCode,
    outputTail: tail ? tail.toString() : null,
    evidenceDir
  };
}
5276
5716
  function failedSpec(s) {
@@ -5358,7 +5798,7 @@ async function analyzeDeterministicSummaries(summaries, opts, cwd, reportDir) {
5358
5798
  failureLogExcerpt: null,
5359
5799
  diffExcerpt: null,
5360
5800
  specYaml: null,
5361
- ndRun: null
5801
+ liveRun: null
5362
5802
  });
5363
5803
  continue;
5364
5804
  }
@@ -5408,7 +5848,7 @@ async function analyzeDeterministicSummaries(summaries, opts, cwd, reportDir) {
5408
5848
  failureLogExcerpt: failureLog.length > 0 ? failureLog : null,
5409
5849
  diffExcerpt,
5410
5850
  specYaml,
5411
- ndRun: null
5851
+ liveRun: null
5412
5852
  });
5413
5853
  }
5414
5854
  return {
@@ -5719,6 +6159,7 @@ agent-browser --session SESSION wait --load networkidle
5719
6159
  agent-browser --session SESSION get count "<selector>" # element-existence check (returns a number, fast)
5720
6160
  agent-browser --session SESSION cookies clear
5721
6161
  agent-browser --session SESSION find <locator> <value> <action> [<input>] [--name "<n>"] [--exact]
6162
+ agent-browser --session SESSION upload "<input[type=file] selector>" <file> [<file> ...]
5722
6163
  # See "Selector Rules" for the full \`find\` subset.
5723
6164
  # IMPORTANT: do NOT use \`wait "<css-selector>"\`. agent-browser ignores --timeout on a
5724
6165
  # CSS-selector wait and blocks for ~150s when the selector never matches, killing the run.
@@ -5794,6 +6235,8 @@ find nth <index> "<ALLOWED-css>" <action>
5794
6235
 
5795
6236
  **Verifying cleanup / deletion**: assert the *absence* of the deleted thing, not the surrounding listing screen's text. Use \`wait --fn "!document.body.innerText.includes('<unique-label>')"\` (text disappearance) — never \`wait "<css-selector>" --state hidden\` (blocks the daemon) and never \`wait --text "<navbar label>"\` (passes regardless of the deletion).
5796
6237
 
6238
+ **File inputs (\`<input type="file">\`) / OS file-picker dialogs**: do NOT \`click\` the input — that opens the OS picker, which agent-browser cannot drive. Use \`upload "<selector>" <path>\` instead. agent-browser sets the input's files directly via the underlying browser API, no native dialog ever opens. Use an ALLOWED selector to identify the input (\`[aria-label='…']\`, \`[data-testid='…']\`, \`[type='file']\` only when it's unique on the page). File paths must be plain shell args — wrap each in \`"\` for safety. Reference fixtures via \`\${CCQA_FIXTURES_DIR}/<name>\` so the same spec works locally and in CI; conventionally fixtures live under \`.ccqa/fixtures/\` and the env var resolves there. Multi-file inputs accept several positionals: \`upload "[aria-label='Attach']" "\${CCQA_FIXTURES_DIR}/a.pdf" "\${CCQA_FIXTURES_DIR}/b.pdf"\`.
6239
+
5797
6240
  ## Test Specification
5798
6241
 
5799
6242
  Title: ${input.title}
@@ -5876,6 +6319,7 @@ AB_ACTION|select|<selector>|<value>|<aria label>
5876
6319
  AB_ACTION|hover|<selector>|<visible label>
5877
6320
  AB_ACTION|scroll|<direction>|<pixels>
5878
6321
  AB_ACTION|drag|<source selector>|<target selector>|<source label>
6322
+ AB_ACTION|upload|<file-input selector>|<file1>[|<file2>...]
5879
6323
  AB_ACTION|wait|<selector or text>|<label>
5880
6324
  AB_ACTION|snapshot|<key observation, max 100 chars>
5881
6325
  AB_ACTION|assert|<assertType>|<selector or "">|<value or "">|<observation>
@@ -6192,6 +6636,17 @@ function actionToAbArgs(action, sessionName) {
6192
6636
  sub(action.selector),
6193
6637
  sub(action.target)
6194
6638
  ];
6639
+ case "upload": {
6640
+ const sel = sub(action.selector);
6641
+ const files = (action.files ?? []).map((f) => sub(f));
6642
+ if (!sel || files.length === 0) return null;
6643
+ return [
6644
+ ...base,
6645
+ "upload",
6646
+ sel,
6647
+ ...files
6648
+ ];
6649
+ }
6195
6650
  case "wait": {
6196
6651
  const raw = sub(action.selector);
6197
6652
  if (!raw) return null;
@@ -6683,9 +7138,9 @@ async function runTrace(featureName, specName, model, validationMode = "lenient"
6683
7138
  steps: expanded,
6684
7139
  sessionName
6685
7140
  });
6686
- const userPrompt = await loadTraceUserPrompt();
6687
- if (userPrompt !== null) meta("user-prompt", ".ccqa/prompts/trace.user.md");
6688
- const systemPrompt = (userPrompt === null ? baseSystemPrompt : `${baseSystemPrompt}\n## Project-specific guidance\n\n${userPrompt}\n`) + languageDirective(language);
7141
+ const promptBundle = await loadRecordPromptBundle();
7142
+ if (promptBundle !== null) meta("prompt", promptBundle.loaded.join(" + "));
7143
+ const systemPrompt = (promptBundle === null ? baseSystemPrompt : `${baseSystemPrompt}\n## Project-specific guidance\n\n${promptBundle.text}\n`) + languageDirective(language);
6689
7144
  const prompt = buildTracePrompt(spec.title);
6690
7145
  info("Running agent-browser session...");
6691
7146
  blank();
@@ -6767,6 +7222,11 @@ async function runTrace(featureName, specName, model, validationMode = "lenient"
6767
7222
  if (written) meta("relatedPaths", `${relatedPaths.length} path(s) written to ${written}`);
6768
7223
  } else warn("trace did not emit a RELATED_PATHS block; drift --changed cannot scope this spec");
6769
7224
  hint(`run 'ccqa generate ${featureName}/${specName}' to generate a test script`);
7225
+ return {
7226
+ route,
7227
+ actionsKept: validatedActions.length,
7228
+ actionsRecorded: traceActions.length
7229
+ };
6770
7230
  }
6771
7231
  /**
6772
7232
  * Strip actions whose recorded fields contain "unstable literal" values
@@ -6825,7 +7285,7 @@ function dedupAndReport(actions) {
6825
7285
  function isAdjacentDuplicate(a, b) {
6826
7286
  if (a.command !== b.command) return false;
6827
7287
  if ((a.stepId ?? "") !== (b.stepId ?? "")) return false;
6828
- return (a.selector ?? "") === (b.selector ?? "") && (a.value ?? "") === (b.value ?? "") && (a.target ?? "") === (b.target ?? "") && (a.label ?? "") === (b.label ?? "") && (a.assertType ?? "") === (b.assertType ?? "") && (a.findLocator ?? "") === (b.findLocator ?? "") && (a.findValue ?? "") === (b.findValue ?? "") && (a.findName ?? "") === (b.findName ?? "") && (a.findIndex ?? -1) === (b.findIndex ?? -1) && (a.findExact ?? false) === (b.findExact ?? false);
7288
+ return (a.selector ?? "") === (b.selector ?? "") && (a.value ?? "") === (b.value ?? "") && (a.target ?? "") === (b.target ?? "") && (a.label ?? "") === (b.label ?? "") && (a.assertType ?? "") === (b.assertType ?? "") && (a.findLocator ?? "") === (b.findLocator ?? "") && (a.findValue ?? "") === (b.findValue ?? "") && (a.findName ?? "") === (b.findName ?? "") && (a.findIndex ?? -1) === (b.findIndex ?? -1) && (a.findExact ?? false) === (b.findExact ?? false) && (a.files ?? []).join("|") === (b.files ?? []).join("|");
6829
7289
  }
6830
7290
  /**
6831
7291
  * Run the post-trace replay validation and emit user-visible drop reports.
@@ -7047,6 +7507,16 @@ function parseAbAction(line) {
7047
7507
  target: parts[3],
7048
7508
  label: parts[4]
7049
7509
  };
7510
+ case "upload": {
7511
+ const selector = parts[2];
7512
+ const files = parts.slice(3).filter((f) => f !== "");
7513
+ if (!selector || files.length === 0) return null;
7514
+ return {
7515
+ command,
7516
+ selector,
7517
+ files
7518
+ };
7519
+ }
7050
7520
  case "find_click":
7051
7521
  case "find_dblclick":
7052
7522
  case "find_hover":
@@ -7097,6 +7567,7 @@ function actionsToScript(input) {
7097
7567
  `import { ${[
7098
7568
  "ab",
7099
7569
  "abWait",
7570
+ "abUpload",
7100
7571
  "abAssertTextVisible",
7101
7572
  "abAssertVisible",
7102
7573
  "abAssertNotVisible",
@@ -7130,6 +7601,7 @@ const ELEMENT_COMMANDS = new Set([
7130
7601
  "select",
7131
7602
  "hover",
7132
7603
  "drag",
7604
+ "upload",
7133
7605
  "find_click",
7134
7606
  "find_dblclick",
7135
7607
  "find_fill",
@@ -7261,6 +7733,11 @@ function actionToLine(action) {
7261
7733
  case "hover": return `ab("hover", ${j(action.selector)});`;
7262
7734
  case "scroll": return `ab("scroll", ${[action.direction ?? "down", ...action.pixels ? [action.pixels] : []].map(j).join(", ")});`;
7263
7735
  case "drag": return `ab("drag", ${j(action.selector)}, ${j(action.target)});`;
7736
+ case "upload": {
7737
+ const files = action.files ?? [];
7738
+ if (!action.selector || files.length === 0) return null;
7739
+ return `abUpload(${[j(action.selector), ...files.map(jExpr)].join(", ")});`;
7740
+ }
7264
7741
  case "wait": {
7265
7742
  const sel = action.selector;
7266
7743
  if (/^\d+$/.test(sel)) return `spawnSync("sleep", [${j(sel)}], { stdio: "inherit" });`;
@@ -8336,21 +8813,23 @@ function toFixMode(autoFix) {
8336
8813
  case "interactive": return "interactive";
8337
8814
  }
8338
8815
  }
8339
- const recordCommand = addLanguageOption(new Command("record").argument("<feature/spec>", "Spec id in '<feature>/<spec>' form (resolves to .ccqa/features/<feature>/test-cases/<spec>/)").description("Record a deterministic test from a spec: run agent-browser to collect actions (trace), then generate test.spec.ts with auto-fix retries (generate). After recording, `ccqa run <feature/spec>` replays it under vitest (deterministic specs only — live specs do not need recording).").option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID. Overrides CCQA_MODEL.").option("--validation-mode <mode>", "Post-trace validation behaviour: 'lenient' (default) tags failing actions; 'strict' drops them.", (raw) => {
8816
+ const recordCommand = addProfileOption(addLanguageOption(new Command("record").argument("<feature/spec>", "Spec id in '<feature>/<spec>' form (resolves to .ccqa/features/<feature>/test-cases/<spec>/)").description("Record a deterministic test from a spec: run agent-browser to collect actions (trace), then generate test.spec.ts with auto-fix retries (generate). After recording, `ccqa run <feature/spec>` replays it under vitest (deterministic specs only — live specs do not need recording).").option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID. Overrides CCQA_MODEL.").option("--validation-mode <mode>", "Post-trace validation behaviour: 'lenient' (default) tags failing actions; 'strict' drops them.", (raw) => {
8340
8817
  if (VALIDATION_MODES.includes(raw)) return raw;
8341
8818
  throw new Error(`--validation-mode must be one of ${VALIDATION_MODES.join(" | ")}`);
8342
8819
  }, "lenient").option("--auto-fix <mode>", "Auto-fix behaviour during script generation: 'interactive' (default, prompt y/N), 'auto' (apply without prompt, for CI), 'skip' (never prompt, only apply high-confidence fixes).", (raw) => {
8343
8820
  if (AUTO_FIX_MODES.includes(raw)) return raw;
8344
8821
  throw new Error(`--auto-fix must be one of ${AUTO_FIX_MODES.join(" | ")}`);
8345
- }, "interactive").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").option("--force", "Overwrite an existing test.spec.ts without warning").option("--no-snapshot", "Don't pin AGENT_BROWSER_SESSION / capture page snapshots after a failure (debug toggle)").option("--skip-trace", "Skip the trace step and run codegen against an existing actions.json").option("--skip-codegen", "Run only the trace step (do not generate test.spec.ts)")).action(async (specPath, opts) => {
8822
+ }, "interactive").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").option("--force", "Overwrite an existing test.spec.ts without warning").option("--no-snapshot", "Don't pin AGENT_BROWSER_SESSION / capture page snapshots after a failure (debug toggle)").option("--skip-trace", "Skip the trace step and run codegen against an existing actions.json").option("--skip-codegen", "Run only the trace step (do not generate test.spec.ts)").option("--update-agent-prompt", "After the trace finishes, ask Claude to refresh .ccqa/prompts/record.agent.md from a summary of the run.").option("--cwd <path>", "Working directory containing the .ccqa/ tree (monorepo support). Defaults to the current directory."))).action(async (specPath, opts) => {
8346
8823
  const { featureName, specName } = parseSpecPath(specPath);
8347
8824
  const language = opts.language ?? "auto";
8348
8825
  if (opts.skipTrace && opts.skipCodegen) {
8349
8826
  error("--skip-trace and --skip-codegen cannot be combined; nothing would run");
8350
8827
  process.exit(2);
8351
8828
  }
8829
+ await applyProfileFromOption(opts.profile, resolveCwd(opts.cwd));
8830
+ let traceResult = null;
8352
8831
  if (!opts.skipTrace) {
8353
- await runTrace(featureName, specName, opts.model, opts.validationMode ?? "lenient", language);
8832
+ traceResult = await runTrace(featureName, specName, opts.model, opts.validationMode ?? "lenient", language);
8354
8833
  blank();
8355
8834
  }
8356
8835
  if (!opts.skipCodegen) {
@@ -8358,7 +8837,37 @@ const recordCommand = addLanguageOption(new Command("record").argument("<feature
8358
8837
  const useSnapshot = opts.snapshot !== false;
8359
8838
  await runGenerate(featureName, specName, parseInt(opts.maxRetries ?? "3", 10), fixMode, opts.force ?? false, useSnapshot, language, opts.model);
8360
8839
  }
8840
+ if (opts.updateAgentPrompt) if (traceResult === null) warn("--update-agent-prompt is ignored when --skip-trace is set (no run summary available)");
8841
+ else {
8842
+ const cwd = resolveCwd(opts.cwd);
8843
+ blank();
8844
+ await updateAgentPrompt({
8845
+ mode: "record",
8846
+ runSummary: buildRecordRunSummary(featureName, specName, traceResult),
8847
+ cwd,
8848
+ ...opts.model ? { model: opts.model } : {},
8849
+ ...language ? { language } : {}
8850
+ });
8851
+ }
8361
8852
  });
8853
/**
 * Render the outcome of a trace pass as a compact markdown summary, used as
 * the `runSummary` input when refreshing the record agent prompt.
 *
 * The summary opens with a heading (`feature/spec`, route status, and the
 * kept/recorded action counts). It is followed either by one `###` section
 * per route step (action, observation, and reason when present, each
 * flattened by `oneLineSummary`) or by a placeholder line when the route
 * recorded no steps.
 *
 * @param {string} featureName - Feature part of the spec id.
 * @param {string} specName - Spec part of the spec id.
 * @param {{ route: object, actionsKept: number, actionsRecorded: number }} t
 *   Result object returned by the trace pass.
 * @returns {string} Markdown summary text.
 */
function buildRecordRunSummary(featureName, specName, t) {
  const { route, actionsKept, actionsRecorded } = t;
  const heading = `## ${featureName}/${specName} — ${route.status}\nActions: ${actionsKept} kept / ${actionsRecorded} recorded`;
  if (route.steps.length === 0) return `${heading}\n\n(no route steps recorded)`;
  const sections = route.steps.map((step) => {
    const lines = [
      `### ${step.title} (${step.status})`,
      `- action: ${oneLineSummary(step.action)}`,
      `- observation: ${oneLineSummary(step.observation)}`
    ];
    // The reason line is only emitted when the step carries one.
    if (step.reason) lines.push(`- reason: ${oneLineSummary(step.reason)}`);
    return lines.join("\n");
  });
  return `${heading}\n\n${sections.join("\n\n")}`;
}
8867
/**
 * Flatten free-form text into a single trimmed line of bounded length.
 *
 * Every run of whitespace becomes one space. Text longer than 240 UTF-16
 * code units is cut and suffixed with "…". Empty, `null` and `undefined`
 * input all yield "(none)", so a step with a missing field cannot crash
 * the summary.
 *
 * @param {unknown} s - Text to summarise; non-strings are stringified.
 * @returns {string} The flattened text, or "(none)" when nothing is left.
 */
function oneLineSummary(s) {
  const MAX_LENGTH = 240;
  const flat = String(s ?? "").replace(/\s+/g, " ").trim();
  if (flat === "") return "(none)";
  if (flat.length <= MAX_LENGTH) return flat;
  // Do not cut between the two halves of a surrogate pair: if the last kept
  // code unit is a high surrogate, drop it as well.
  const lastKept = flat.charCodeAt(MAX_LENGTH - 1);
  const splitsPair = lastKept >= 0xd800 && lastKept <= 0xdbff;
  return `${flat.slice(0, splitsPair ? MAX_LENGTH - 1 : MAX_LENGTH)}…`;
}
8362
8871
  //#endregion
8363
8872
  //#region src/cli/draft.ts
8364
8873
  const CATEGORY_LABEL = DRAFT_CATEGORY_LABEL;
@@ -9128,6 +9637,64 @@ function parseConcurrency(raw) {
9128
9637
  return n;
9129
9638
  }
9130
9639
  //#endregion
9640
+ //#region src/cli/init.ts
9641
/**
 * Starter prompt files, as `{ relPath, content }` records. Each is declared
 * as a `[relPath, content]` pair and expanded into a record below.
 */
const TEMPLATES = [
  [".ccqa/prompts/live.user.md", `# Project guidance for live specs

Write stable, hand-maintained context here: staging URLs, naming conventions, known "this is fine" warnings. Lines you add will be appended verbatim to the system prompt of every step in 'mode: live' specs.
`],
  [".ccqa/prompts/live.agent.md", `# Agent learnings for live specs

This file is updated by 'ccqa run --update-agent-prompt'. You can edit it by hand, but the next --update-agent-prompt run may rewrite the whole file. Keep stable rules in live.user.md instead.
`],
  [".ccqa/prompts/record.user.md", `# Project guidance for ccqa record (deterministic trace)

Write stable, hand-maintained context here for the trace phase of 'ccqa record'. Lines you add will be appended verbatim to the trace system prompt.
`],
  [".ccqa/prompts/record.agent.md", `# Agent learnings for ccqa record

This file is updated by 'ccqa record --update-agent-prompt'. Same convention as live.agent.md — stable rules go in record.user.md.
`]
].map(([relPath, content]) => ({
  relPath,
  content
}));
9671
/**
 * `ccqa init`: create the `.ccqa/prompts` directory under the resolved
 * working directory and write every entry of `TEMPLATES` into it. Files
 * that `writeTemplate` reports as not written are listed as skipped. The
 * per-file results are logged, followed by created/skipped totals.
 */
const initCommand = new Command("init")
  .description("Create .ccqa/prompts/{live,record}.{user,agent}.md template files (skips existing files unless --force).")
  .option("--cwd <path>", "Working directory (default: cwd)")
  .option("--force", "Overwrite existing files")
  .action(async (opts) => {
    const cwd = resolveCwd(opts.cwd);
    header("init", cwd);
    await mkdir(join(cwd, ".ccqa", "prompts"), { recursive: true });
    const overwrite = opts.force ?? false;
    const created = [];
    const skipped = [];
    // Templates are written one at a time, in declaration order.
    for (const { relPath, content } of TEMPLATES) {
      const wasWritten = await writeTemplate(join(cwd, relPath), content, overwrite);
      (wasWritten ? created : skipped).push(relPath);
    }
    for (const file of created) info(`created ${file}`);
    for (const file of skipped) info(`skipped ${file} (already exists; pass --force to overwrite)`);
    blank();
    meta("created", created.length);
    meta("skipped", skipped.length);
  });
9685
/**
 * Write one template file as UTF-8.
 *
 * Without `force` the file is opened with the exclusive "wx" flag, so an
 * existing file is left untouched and reported as not written.
 *
 * @param {string} absPath - Destination path.
 * @param {string} content - File contents.
 * @param {boolean} force - When true, overwrite an existing file.
 * @returns {Promise<boolean>} True when the file was written, false when it
 *   already existed and `force` was off.
 * @throws Any write error other than EEXIST is rethrown unchanged.
 */
async function writeTemplate(absPath, content, force) {
  const options = { encoding: "utf-8" };
  if (!force) options.flag = "wx";
  try {
    await writeFile(absPath, content, options);
  } catch (err) {
    const alreadyExists = err !== null && typeof err === "object" && err.code === "EEXIST";
    if (!alreadyExists) throw err;
    return false;
  }
  return true;
}
9697
+ //#endregion
9131
9698
  //#region src/prompts/perspectives.ts
9132
9699
  /**
9133
9700
  * Build the system prompt. By default the descriptive fields follow the
@@ -9595,8 +10162,6 @@ function renderSpecMarkdown(spec, labels = LABELS_JA) {
9595
10162
  lines.push("");
9596
10163
  lines.push(`| ${labels.itemCol} | ${labels.valueCol} |`);
9597
10164
  lines.push("| --- | --- |");
9598
- lines.push(`| ${labels.modeLabel} | ${mdCell(modeLabel(spec.status, labels))} |`);
9599
- lines.push(`| ${labels.statusCol} | ${mdCell(statusLabel(spec.status, labels))} |`);
9600
10165
  if (spec.summary) lines.push(`| ${labels.summary} | ${mdCell(spec.summary)} |`);
9601
10166
  if (spec.preconditions && spec.preconditions.length > 0) lines.push(`| ${labels.preconditions} | ${spec.preconditions.map(mdCell).join("<br>")} |`);
9602
10167
  if (spec.startScreen) lines.push(`| ${labels.startScreen} | ${mdCell(spec.startScreen)} |`);
@@ -9628,6 +10193,7 @@ function resolvePackageJson() {
9628
10193
  const { version } = JSON.parse(readFileSync(resolvePackageJson(), "utf8"));
9629
10194
  const program = new Command();
9630
10195
  program.name("ccqa").description("E2E test CLI using Claude Code + agent-browser").version(version);
10196
+ program.addCommand(initCommand);
9631
10197
  program.addCommand(draftCommand);
9632
10198
  program.addCommand(perspectivesCommand);
9633
10199
  program.addCommand(recordCommand);