ccqa 0.8.2 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -8
- package/dist/bin/ccqa.mjs +421 -184
- package/dist/package.json +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -94,6 +94,7 @@ ccqa run --changed --report # only specs whose relatedPaths t
|
|
|
94
94
|
## Commands
|
|
95
95
|
|
|
96
96
|
```
|
|
97
|
+
ccqa init Scaffold .ccqa/prompts/{live,record}.{user,agent}.md templates
|
|
97
98
|
ccqa draft [feature/spec] Co-author a test spec with Claude
|
|
98
99
|
ccqa perspectives Inventory existing test coverage into .ccqa/perspectives.yaml
|
|
99
100
|
ccqa record <feature/spec> (deterministic specs only) Trace browser actions + generate test.spec.ts
|
|
@@ -114,6 +115,7 @@ ccqa drift [feature/spec] Standalone spec ↔ codebase static audit (fo
|
|
|
114
115
|
- `--retry <n>` — (live specs only) retry each failing step up to N more times
|
|
115
116
|
- `--format <fmt>` — `text` (default), `json` (report.json), `github` (Actions annotations)
|
|
116
117
|
- `--out <dir>` — (live specs only, single-spec invocations) override the per-run artifact directory
|
|
118
|
+
- `--update-agent-prompt` — (live specs only) after the run, summarise it back to Claude and rewrite `.ccqa/prompts/live.agent.md` so the next run inherits the lessons learned. `ccqa record` ships the same flag, refreshing `record.agent.md` from the trace summary.
|
|
117
119
|
|
|
118
120
|
All Claude-driven commands accept `-m, --model <name>` (alias `sonnet` | `opus` | `haiku`, or a full model ID). The flag overrides `CCQA_MODEL`; when both are unset, the Claude Code CLI default is used. They also accept `--language <bcp47>` (e.g. `ja`, `en`) to set the language of human-readable output; the default `auto` follows the language of the spec/codebase. `--cwd <path>` works on `record` / `run` / `drift` so you can target a subpackage inside a monorepo from the repo root. Interactive commands authenticate via your local Claude Code login; commands that talk to Claude in CI (`ccqa run --report`, `ccqa drift`) additionally honor `ANTHROPIC_API_KEY`.
|
|
119
121
|
|
|
@@ -125,9 +127,11 @@ All Claude-driven commands accept `-m, --model <name>` (alias `sonnet` | `opus`
|
|
|
125
127
|
.ccqa/
|
|
126
128
|
perspectives.yaml # Inventory of existing coverage (machine-readable, canonical)
|
|
127
129
|
perspectives.md # Category index, regenerated from the YAML
|
|
128
|
-
prompts/
|
|
129
|
-
|
|
130
|
-
|
|
130
|
+
prompts/ # Run `ccqa init` to scaffold these
|
|
131
|
+
record.user.md # Human-maintained guidance appended to `ccqa record` (trace phase)
|
|
132
|
+
record.agent.md # Auto-updated by `ccqa record --update-agent-prompt`
|
|
133
|
+
live.user.md # Human-maintained guidance appended to `ccqa run` (live specs)
|
|
134
|
+
live.agent.md # Auto-updated by `ccqa run --update-agent-prompt`
|
|
131
135
|
blocks/
|
|
132
136
|
login/
|
|
133
137
|
spec.yaml # Reusable block (params + steps)
|
|
@@ -175,11 +179,16 @@ ccqa run --retry 2 tasks/create-and-complete
|
|
|
175
179
|
|
|
176
180
|
Constraints on selectors / `agent-browser` subcommands that apply during `ccqa record` (no `eval`, no `@ref`, no bare-tag positional `find`, no chained agent-browser calls) are **relaxed** for live specs — Claude can use any subcommand and any selector style because there is no replay contract to honour.
|
|
177
181
|
|
|
178
|
-
### Per-project guidance (`.ccqa/prompts/
|
|
182
|
+
### Per-project guidance (`.ccqa/prompts/live.user.md` + `live.agent.md`)
|
|
179
183
|
|
|
180
|
-
ccqa's live-mode system prompt is deliberately product-agnostic. Anything specific to **your** project — staging URLs, login flow quirks, rich-editor types, common access-denied wording — belongs in
|
|
184
|
+
ccqa's live-mode system prompt is deliberately product-agnostic. Anything specific to **your** project — staging URLs, login flow quirks, rich-editor types, common access-denied wording — belongs in two sibling files (run `ccqa init` to scaffold both):
|
|
181
185
|
|
|
182
|
-
|
|
186
|
+
- `.ccqa/prompts/live.user.md` — human-maintained stable guidance.
|
|
187
|
+
- `.ccqa/prompts/live.agent.md` — auto-updated by `ccqa run --update-agent-prompt` from each run's summary. You can hand-edit it, but the next `--update-agent-prompt` run may rewrite the whole file; durable rules should live in `live.user.md`.
|
|
188
|
+
|
|
189
|
+
Both files (when present) are read once per invocation and appended to the system prompt under "Project-specific guidance". The `ccqa record` (trace) side has the same split: `record.user.md` + `record.agent.md`, refreshed by `ccqa record --update-agent-prompt`.
|
|
190
|
+
|
|
191
|
+
Keep them short. A page or two of focused notes beats a long handbook — Claude has the spec's `expected` text to work from, these files are for the *non-obvious* product knowledge that isn't in any single spec. Examples of what's useful here:
|
|
183
192
|
|
|
184
193
|
- "the rich text editor is `[contenteditable='true']` — use `fill`, not keystrokes"
|
|
185
194
|
- "login redirects through an IDP service-selection screen; you can skip it by opening the destination URL directly"
|
|
@@ -189,9 +198,9 @@ Examples of what does **not** belong:
|
|
|
189
198
|
|
|
190
199
|
- per-spec details (those belong in the spec's `instruction` / `expected`)
|
|
191
200
|
- restating the STEP_RESULT contract (already in the system prompt)
|
|
192
|
-
- copy-pasted style guidelines from `
|
|
201
|
+
- copy-pasted style guidelines from `record.user.md` (the relaxed-constraint mode doesn't need them)
|
|
193
202
|
|
|
194
|
-
The
|
|
203
|
+
The combined bundle is capped at 32 KiB; anything beyond that is truncated with a warning.
|
|
195
204
|
|
|
196
205
|
## License
|
|
197
206
|
|
package/dist/bin/ccqa.mjs
CHANGED
|
@@ -480,50 +480,62 @@ async function loadAvailableBlocks(cwd) {
|
|
|
480
480
|
}))
|
|
481
481
|
}));
|
|
482
482
|
}
|
|
483
|
-
const
|
|
484
|
-
const
|
|
483
|
+
const RECORD_USER_PROMPT_PATH = ".ccqa/prompts/record.user.md";
|
|
484
|
+
const RECORD_AGENT_PROMPT_PATH = ".ccqa/prompts/record.agent.md";
|
|
485
|
+
const LIVE_USER_PROMPT_PATH = ".ccqa/prompts/live.user.md";
|
|
486
|
+
const LIVE_AGENT_PROMPT_PATH = ".ccqa/prompts/live.agent.md";
|
|
485
487
|
const USER_PROMPT_MAX_BYTES = 32768;
|
|
486
488
|
/**
|
|
487
|
-
* Load
|
|
489
|
+
* Load the prompt bundle appended to the `ccqa record` (trace) system prompt.
|
|
488
490
|
*
|
|
489
|
-
*
|
|
490
|
-
*
|
|
491
|
-
*
|
|
491
|
+
* Reads `.ccqa/prompts/record.user.md` (human-maintained, stable project
|
|
492
|
+
* rules) and `.ccqa/prompts/record.agent.md` (auto-rewritten by
|
|
493
|
+
* `ccqa record --update-agent-prompt`). Returns null when both files are
|
|
494
|
+
* missing / empty. The combined text is capped at 32 KiB after concatenation.
|
|
492
495
|
*
|
|
493
|
-
*
|
|
494
|
-
* the OSS-default prompt — naming conventions, staging URL hints, repeated
|
|
495
|
-
* UI quirks that recur across specs. Anything that genuinely belongs in
|
|
496
|
-
* one spec should go in that spec's instruction, not here.
|
|
497
|
-
*
|
|
498
|
-
* Size-capped at 32 KiB to keep accidental commits of huge files from
|
|
499
|
-
* blowing up the system prompt; the cap is observable to callers as a
|
|
500
|
-
* truncated warning suffix.
|
|
496
|
+
* Use `ccqa init` to scaffold both files.
|
|
501
497
|
*/
|
|
502
|
-
async function
|
|
503
|
-
return
|
|
498
|
+
async function loadRecordPromptBundle(cwd) {
|
|
499
|
+
return loadPromptBundle(RECORD_USER_PROMPT_PATH, RECORD_AGENT_PROMPT_PATH, cwd);
|
|
504
500
|
}
|
|
505
501
|
/**
|
|
506
|
-
* Load
|
|
502
|
+
* Load the prompt bundle appended to the `ccqa run` (live mode) system prompt.
|
|
507
503
|
*
|
|
508
|
-
*
|
|
509
|
-
* `.ccqa/prompts/
|
|
510
|
-
*
|
|
511
|
-
*
|
|
512
|
-
* "this is fine" warnings, login flow quirks — belongs here. Keeping it in the
|
|
504
|
+
* Reads `.ccqa/prompts/live.user.md` (human-maintained, stable project
|
|
505
|
+
* rules) and `.ccqa/prompts/live.agent.md` (auto-rewritten by
|
|
506
|
+
* `ccqa run --update-agent-prompt`). Same null / cap semantics as
|
|
507
|
+
* `loadRecordPromptBundle`. Keeping product-specific context in the
|
|
513
508
|
* consuming repo (not the ccqa OSS prompt) is the explicit non-contamination
|
|
514
|
-
* boundary
|
|
515
|
-
* context they need.
|
|
509
|
+
* boundary.
|
|
516
510
|
*/
|
|
517
|
-
async function
|
|
518
|
-
return
|
|
511
|
+
async function loadLivePromptBundle(cwd) {
|
|
512
|
+
return loadPromptBundle(LIVE_USER_PROMPT_PATH, LIVE_AGENT_PROMPT_PATH, cwd);
|
|
513
|
+
}
|
|
514
|
+
async function loadPromptBundle(userRelPath, agentRelPath, cwd) {
|
|
515
|
+
const [userText, agentText] = await Promise.all([readPromptFile(userRelPath, cwd), readPromptFile(agentRelPath, cwd)]);
|
|
516
|
+
if (userText === null && agentText === null) return null;
|
|
517
|
+
const sections = [];
|
|
518
|
+
const loaded = [];
|
|
519
|
+
if (userText !== null) {
|
|
520
|
+
sections.push(`### Project guidance (human-maintained)\n\n${userText}`);
|
|
521
|
+
loaded.push(userRelPath);
|
|
522
|
+
}
|
|
523
|
+
if (agentText !== null) {
|
|
524
|
+
sections.push(`### Agent learnings (auto-updated by ccqa --update-agent-prompt)\n\n${agentText}`);
|
|
525
|
+
loaded.push(agentRelPath);
|
|
526
|
+
}
|
|
527
|
+
let text = sections.join("\n\n");
|
|
528
|
+
if (text.length > USER_PROMPT_MAX_BYTES) text = text.slice(0, USER_PROMPT_MAX_BYTES) + `\n\n[ccqa] (prompt bundle truncated at ${USER_PROMPT_MAX_BYTES} bytes)`;
|
|
529
|
+
return {
|
|
530
|
+
text,
|
|
531
|
+
loaded
|
|
532
|
+
};
|
|
519
533
|
}
|
|
520
|
-
async function
|
|
534
|
+
async function readPromptFile(relPath, cwd) {
|
|
521
535
|
const content = await readFile(join(cwd ?? process.cwd(), relPath), "utf-8").catch(() => null);
|
|
522
536
|
if (content === null) return null;
|
|
523
537
|
const trimmed = content.trim();
|
|
524
|
-
|
|
525
|
-
if (trimmed.length > USER_PROMPT_MAX_BYTES) return trimmed.slice(0, USER_PROMPT_MAX_BYTES) + `\n\n[ccqa] (${labelForTruncation} truncated at ${USER_PROMPT_MAX_BYTES} bytes)`;
|
|
526
|
-
return trimmed;
|
|
538
|
+
return trimmed.length === 0 ? null : trimmed;
|
|
527
539
|
}
|
|
528
540
|
/**
|
|
529
541
|
* Probe for orphaned files left over from earlier ccqa versions inside
|
|
@@ -567,9 +579,9 @@ async function getTestScript(featureName, specName, cwd) {
|
|
|
567
579
|
}
|
|
568
580
|
/**
|
|
569
581
|
* Variant of `listAllSpecs` for callers that care about the spec definition
|
|
570
|
-
* itself (spec.yaml) rather than its compiled vitest script. `ccqa run
|
|
571
|
-
*
|
|
572
|
-
* no `test.spec.ts` is still a valid target.
|
|
582
|
+
* itself (spec.yaml) rather than its compiled vitest script. `ccqa run` uses
|
|
583
|
+
* this for live-mode specs because they skip codegen entirely — a freshly
|
|
584
|
+
* drafted spec with no `test.spec.ts` is still a valid target.
|
|
573
585
|
*/
|
|
574
586
|
async function listAllSpecsWithSpecFile(cwd) {
|
|
575
587
|
return listAllSpecsFilteredBy(SPEC_FILE, cwd);
|
|
@@ -589,10 +601,10 @@ async function listAllSpecsFilteredBy(requiredFilename, cwd) {
|
|
|
589
601
|
}))).flat();
|
|
590
602
|
}
|
|
591
603
|
/**
|
|
592
|
-
* Resolve a CLI `<target>` argument into a list of spec refs.
|
|
593
|
-
* `ccqa run
|
|
594
|
-
*
|
|
595
|
-
*
|
|
604
|
+
* Resolve a CLI `<target>` argument into a list of spec refs. Used by
|
|
605
|
+
* `ccqa run`. Callers pass the right enumerator for "no target" (deterministic
|
|
606
|
+
* specs want `test.spec.ts`-having specs; live specs want `spec.yaml`-having
|
|
607
|
+
* specs).
|
|
596
608
|
*/
|
|
597
609
|
async function resolveSpecTargets(target, enumerateAll, cwd) {
|
|
598
610
|
if (!target) return enumerateAll();
|
|
@@ -2324,7 +2336,7 @@ function clamp(n, lo, hi) {
|
|
|
2324
2336
|
//#endregion
|
|
2325
2337
|
//#region src/report/prompt.ts
|
|
2326
2338
|
function buildFailureAnalysisPrompt(input) {
|
|
2327
|
-
const { script, specYaml, failureLog,
|
|
2339
|
+
const { script, specYaml, failureLog, liveTranscriptExcerpt, diffPatch, changedFiles, baseRef, driftIssues, outputLanguage = "auto" } = input;
|
|
2328
2340
|
return `You are analyzing a failing E2E regression test right after a source change landed. Your job is a root-cause CALL, not a fix: decide which of three categories explains the failure, using the source diff as your primary context.
|
|
2329
2341
|
|
|
2330
2342
|
${outputLanguageBlock(outputLanguage, "`reasoning`, `detail`", "label names (TEST_DRIFT, etc.)")}## The three categories
|
|
@@ -2396,7 +2408,7 @@ Evidence rules: TEST_DRIFT and SPEC_CHANGE require at least one concrete \`file\
|
|
|
2396
2408
|
## Test Spec (spec.yaml)
|
|
2397
2409
|
${specYaml}
|
|
2398
2410
|
|
|
2399
|
-
${buildExecutionEvidenceBlock(script, failureLog,
|
|
2411
|
+
${buildExecutionEvidenceBlock(script, failureLog, liveTranscriptExcerpt)}
|
|
2400
2412
|
|
|
2401
2413
|
${diffPatch ? `## Source changes since ${baseRef ?? "base"} (git diff, may be truncated)
|
|
2402
2414
|
|
|
@@ -2432,14 +2444,14 @@ ${driftIssues.map((i) => `- [${i.severity}] (${DRAFT_CATEGORY_LABEL[i.category]}
|
|
|
2432
2444
|
* never has to branch on mode — it just sees "here's what was executed
|
|
2433
2445
|
* and here's how it failed".
|
|
2434
2446
|
*/
|
|
2435
|
-
function buildExecutionEvidenceBlock(script, failureLog,
|
|
2447
|
+
function buildExecutionEvidenceBlock(script, failureLog, liveTranscriptExcerpt) {
|
|
2436
2448
|
const sections = [];
|
|
2437
2449
|
if (script && script.length > 0) sections.push(`## Test Script (with line numbers)
|
|
2438
2450
|
${numberLines(script)}`);
|
|
2439
2451
|
if (failureLog && failureLog.length > 0) sections.push(`## Failure Log
|
|
2440
2452
|
${failureLog.slice(0, 8e3)}`);
|
|
2441
|
-
if (
|
|
2442
|
-
${
|
|
2453
|
+
if (liveTranscriptExcerpt && liveTranscriptExcerpt.length > 0) sections.push(`## Live Run Transcript (summary of Claude's per-step execution)
|
|
2454
|
+
${liveTranscriptExcerpt}`);
|
|
2443
2455
|
if (sections.length === 0) return `## Execution evidence
|
|
2444
2456
|
|
|
2445
2457
|
(No script, failure log, or live transcript was captured for this run. Classify from spec.yaml + diff only, and be correspondingly more conservative — prefer UNKNOWN over a confident call.)`;
|
|
@@ -2535,11 +2547,11 @@ const ReportEvidenceSchema = z.object({
|
|
|
2535
2547
|
failureSummary: z.string().nullable().default(null)
|
|
2536
2548
|
});
|
|
2537
2549
|
/**
|
|
2538
|
-
* Per-step row for a
|
|
2539
|
-
* structure produced by `src/runtime/
|
|
2550
|
+
* Per-step row for a live-mode run (spec.yaml `mode: live`). Mirrors the
|
|
2551
|
+
* structure produced by `src/runtime/live-executor.ts:LiveStepResult` but
|
|
2540
2552
|
* encoded against the report schema so the HTML renderer can carry both
|
|
2541
|
-
* deterministic (`evidence`) and
|
|
2542
|
-
*
|
|
2553
|
+
* deterministic (`evidence`) and live (`liveRun`) sources of step-boundary
|
|
2554
|
+
* screenshots.
|
|
2543
2555
|
*
|
|
2544
2556
|
* `beforePng` / `afterPng` are RELATIVE to the HTML report directory — the
|
|
2545
2557
|
* caller computes the relative path with `node:path`'s `relative()` so the
|
|
@@ -2554,7 +2566,7 @@ const ReportEvidenceSchema = z.object({
|
|
|
2554
2566
|
* `models` is the union of model ids the SDK reported using; usually a
|
|
2555
2567
|
* single element, but the SDK can fan out across models in some modes.
|
|
2556
2568
|
*/
|
|
2557
|
-
const
|
|
2569
|
+
const LiveReportCostSchema = z.object({
|
|
2558
2570
|
totalCostUsd: z.number().nullable(),
|
|
2559
2571
|
durationApiMs: z.number().nullable(),
|
|
2560
2572
|
numTurns: z.number().nullable(),
|
|
@@ -2564,7 +2576,7 @@ const NdReportCostSchema = z.object({
|
|
|
2564
2576
|
outputTokens: z.number().nullable(),
|
|
2565
2577
|
models: z.array(z.string())
|
|
2566
2578
|
});
|
|
2567
|
-
const
|
|
2579
|
+
const LiveReportStepSchema = z.object({
|
|
2568
2580
|
stepId: z.string(),
|
|
2569
2581
|
source: z.string(),
|
|
2570
2582
|
instruction: z.string(),
|
|
@@ -2578,15 +2590,15 @@ const NdReportStepSchema = z.object({
|
|
|
2578
2590
|
beforePng: z.string().nullable(),
|
|
2579
2591
|
afterPng: z.string().nullable(),
|
|
2580
2592
|
durationMs: z.number(),
|
|
2581
|
-
cost:
|
|
2593
|
+
cost: LiveReportCostSchema
|
|
2582
2594
|
});
|
|
2583
|
-
const
|
|
2595
|
+
const LiveReportRunSchema = z.object({
|
|
2584
2596
|
runId: z.string(),
|
|
2585
2597
|
sessionName: z.string(),
|
|
2586
2598
|
startedAt: z.string(),
|
|
2587
2599
|
durationMs: z.number(),
|
|
2588
|
-
steps: z.array(
|
|
2589
|
-
cost:
|
|
2600
|
+
steps: z.array(LiveReportStepSchema),
|
|
2601
|
+
cost: LiveReportCostSchema
|
|
2590
2602
|
});
|
|
2591
2603
|
const ReportSpecResultSchema = z.object({
|
|
2592
2604
|
feature: z.string(),
|
|
@@ -2607,7 +2619,7 @@ const ReportSpecResultSchema = z.object({
|
|
|
2607
2619
|
diffExcerpt: z.string().nullable(),
|
|
2608
2620
|
specYaml: z.string().nullable(),
|
|
2609
2621
|
evidence: z.array(ReportEvidenceSchema).nullable(),
|
|
2610
|
-
|
|
2622
|
+
liveRun: LiveReportRunSchema.nullable()
|
|
2611
2623
|
});
|
|
2612
2624
|
z.object({
|
|
2613
2625
|
schemaVersion: z.literal(1),
|
|
@@ -2846,7 +2858,7 @@ function scopePatchForSpec(patch, relatedPaths, caps = {}) {
|
|
|
2846
2858
|
return parts.join("\n");
|
|
2847
2859
|
}
|
|
2848
2860
|
//#endregion
|
|
2849
|
-
//#region src/runtime/
|
|
2861
|
+
//#region src/runtime/live-cost-format.ts
|
|
2850
2862
|
/**
|
|
2851
2863
|
* Compact one-line cost summary. Format:
|
|
2852
2864
|
* "$0.1234 · 4 turns · 42 in / 6,511 out · 2.0M cached · sonnet"
|
|
@@ -2856,7 +2868,7 @@ function scopePatchForSpec(patch, relatedPaths, caps = {}) {
|
|
|
2856
2868
|
* `model=...` segment. `compact: true` (HTML chip) thousand-separates fresh
|
|
2857
2869
|
* tokens, abbreviates cache-read with K/M, drops the `model=` prefix.
|
|
2858
2870
|
*/
|
|
2859
|
-
function
|
|
2871
|
+
function formatLiveCost(cost, options) {
|
|
2860
2872
|
if (cost.totalCostUsd === null) return null;
|
|
2861
2873
|
const compact = options.compact;
|
|
2862
2874
|
const sep = compact ? " · " : " / ";
|
|
@@ -2875,7 +2887,7 @@ function formatNdCost(cost, options) {
|
|
|
2875
2887
|
* Sum of per-spec costs for a batch. Used only by the CLI batch summary.
|
|
2876
2888
|
* Returns null when no spec has cost data.
|
|
2877
2889
|
*/
|
|
2878
|
-
function
|
|
2890
|
+
function formatLiveBatchCost(costs) {
|
|
2879
2891
|
let totalUsd = 0;
|
|
2880
2892
|
let seen = false;
|
|
2881
2893
|
let totalIn = 0;
|
|
@@ -3176,7 +3188,7 @@ function renderResult(r, index, s) {
|
|
|
3176
3188
|
const heading = r.title ? `<span class="spec-title">${esc(r.title)}</span><span class="spec-slug">(${esc(id)})</span>` : `<span class="spec-title">${esc(id)}</span>`;
|
|
3177
3189
|
const predictionLine = r.status === "failed" && r.analysis ? `<span class="label-text label-${r.analysis.label}">${esc(displayLabel(r.analysis.label, s))} · ${Math.round(r.analysis.confidence * 100)}%</span>` : "";
|
|
3178
3190
|
const needsGradingDot = r.status === "failed" && r.analysis ? `<span class="needs-grading-dot" data-case-id="${esc(id)}" title="${esc(s.needsGrading)}"></span>` : "";
|
|
3179
|
-
const modeTag = r.
|
|
3191
|
+
const modeTag = r.liveRun ? `<span class="mode-tag" title="executed in live mode (Claude drove the browser per step)">LIVE</span>` : `<span class="mode-tag" title="executed in deterministic mode (vitest replayed test.spec.ts)">DETERMINISTIC</span>`;
|
|
3180
3192
|
return `<details class="spec ${r.status}" data-status="${r.status}" data-case-id="${esc(id)}"${r.status === "failed" ? " open" : ""}>
|
|
3181
3193
|
<summary>
|
|
3182
3194
|
${statusIcon(r.status)}
|
|
@@ -3189,7 +3201,7 @@ function renderResult(r, index, s) {
|
|
|
3189
3201
|
</summary>
|
|
3190
3202
|
<div class="spec-body">
|
|
3191
3203
|
${renderEvidence(r, s)}
|
|
3192
|
-
${r.
|
|
3204
|
+
${r.liveRun ? renderLiveRun(r.liveRun, s) : ""}
|
|
3193
3205
|
${renderSpecBody(r, index, s)}
|
|
3194
3206
|
${collapsible(s.collSpecYaml, s.collSpecYamlHelp, r.specYaml)}
|
|
3195
3207
|
</div>
|
|
@@ -3200,16 +3212,16 @@ function renderSpecBody(r, index, s) {
|
|
|
3200
3212
|
if (r.analysis) return renderAnalysis(r, index, s);
|
|
3201
3213
|
return renderSkippedWithSupporting(r, s);
|
|
3202
3214
|
}
|
|
3203
|
-
function
|
|
3204
|
-
const stepItems =
|
|
3215
|
+
function renderLiveRun(live, strings) {
|
|
3216
|
+
const stepItems = live.steps.map((s) => {
|
|
3205
3217
|
const before = s.beforePng ? `<a class="shot" href="${esc(s.beforePng)}" target="_blank" rel="noopener"><img src="${esc(s.beforePng)}" alt="before ${esc(s.stepId)}" loading="lazy"><span>before</span></a>` : "";
|
|
3206
3218
|
const after = s.afterPng ? `<a class="shot" href="${esc(s.afterPng)}" target="_blank" rel="noopener"><img src="${esc(s.afterPng)}" alt="after ${esc(s.stepId)}" loading="lazy"><span>after</span></a>` : "";
|
|
3207
3219
|
const dur = s.durationMs > 0 ? `<span class="duration">${formatDuration$1(s.durationMs)}</span>` : "";
|
|
3208
|
-
const stepCost =
|
|
3220
|
+
const stepCost = formatLiveCostChip(s.cost);
|
|
3209
3221
|
const stepModel = formatModelChip(s.cost.models);
|
|
3210
|
-
const sourceBadge = s.source && s.source !== "spec" ? `<span class="
|
|
3211
|
-
return `<li class="
|
|
3212
|
-
<div class="
|
|
3222
|
+
const sourceBadge = s.source && s.source !== "spec" ? `<span class="live-source">[${esc(s.source)}]</span>` : "";
|
|
3223
|
+
return `<li class="live-step ${s.status}">
|
|
3224
|
+
<div class="live-step-head">
|
|
3213
3225
|
${statusIcon(s.status)}
|
|
3214
3226
|
<span class="step-name">${esc(s.stepId)}</span>
|
|
3215
3227
|
${sourceBadge}
|
|
@@ -3218,44 +3230,44 @@ function renderNdRun(nd, strings) {
|
|
|
3218
3230
|
${stepCost}
|
|
3219
3231
|
${dur}
|
|
3220
3232
|
</div>
|
|
3221
|
-
<div class="
|
|
3222
|
-
<p class="
|
|
3223
|
-
<p class="
|
|
3224
|
-
${s.reasoning ? `<p class="
|
|
3225
|
-
${before || after ? `<div class="
|
|
3233
|
+
<div class="live-step-body">
|
|
3234
|
+
<p class="live-instr"><strong>${esc(strings.stepDoLabel)}:</strong> ${esc(s.instruction)}</p>
|
|
3235
|
+
<p class="live-instr"><strong>${esc(strings.stepExpectLabel)}:</strong> ${esc(s.expected)}</p>
|
|
3236
|
+
${s.reasoning ? `<p class="live-reasoning">${esc(s.reasoning)}</p>` : ""}
|
|
3237
|
+
${before || after ? `<div class="live-shots">${before}${after}</div>` : ""}
|
|
3226
3238
|
</div>
|
|
3227
3239
|
</li>`;
|
|
3228
3240
|
}).join("\n");
|
|
3229
|
-
const runCost =
|
|
3230
|
-
const runModel = formatModelChip(
|
|
3231
|
-
return `<section class="
|
|
3232
|
-
<details class="
|
|
3241
|
+
const runCost = formatLiveCostChip(live.cost);
|
|
3242
|
+
const runModel = formatModelChip(live.cost.models);
|
|
3243
|
+
return `<section class="live-run">
|
|
3244
|
+
<details class="live-run-meta">
|
|
3233
3245
|
<summary>${labelWithHelp(esc(strings.collLiveRunMeta), strings.collLiveRunMetaHelp)}</summary>
|
|
3234
|
-
<div class="
|
|
3246
|
+
<div class="live-run-meta-body">
|
|
3235
3247
|
<span class="dim">${esc(strings.liveRunIdLabel)}</span>
|
|
3236
|
-
<code>${esc(
|
|
3248
|
+
<code>${esc(live.runId)}</code>
|
|
3237
3249
|
<span class="dim">${esc(strings.liveSessionLabel)}</span>
|
|
3238
|
-
<code>${esc(
|
|
3250
|
+
<code>${esc(live.sessionName)}</code>
|
|
3239
3251
|
${runModel}
|
|
3240
3252
|
${runCost}
|
|
3241
|
-
<span class="duration">${formatDuration$1(
|
|
3253
|
+
<span class="duration">${formatDuration$1(live.durationMs)}</span>
|
|
3242
3254
|
</div>
|
|
3243
3255
|
</details>
|
|
3244
|
-
<ol class="
|
|
3256
|
+
<ol class="live-steps">${stepItems}</ol>
|
|
3245
3257
|
</section>`;
|
|
3246
3258
|
}
|
|
3247
3259
|
/** Compact dot-separated cost chip, e.g. "$0.1234 · 4 turns · 42 in / 6,511 out · 2.0M cached". */
|
|
3248
|
-
function
|
|
3249
|
-
const line =
|
|
3260
|
+
function formatLiveCostChip(cost) {
|
|
3261
|
+
const line = formatLiveCost(cost, { compact: true });
|
|
3250
3262
|
if (line === null) return "";
|
|
3251
|
-
return `<span class="
|
|
3263
|
+
return `<span class="live-cost" title="cost · turns · fresh-input/output tokens · cache-read input">${esc(line)}</span>`;
|
|
3252
3264
|
}
|
|
3253
3265
|
function formatModelChip(models) {
|
|
3254
3266
|
if (!models || models.length === 0) return "";
|
|
3255
|
-
return `<span class="
|
|
3267
|
+
return `<span class="live-model" title="Claude model id(s) reported by the SDK">${esc(models.join(", "))}</span>`;
|
|
3256
3268
|
}
|
|
3257
3269
|
/**
|
|
3258
|
-
* Per-step UI for deterministic runs. Adopts the same `
|
|
3270
|
+
* Per-step UI for deterministic runs. Adopts the same `live-step` card layout
|
|
3259
3271
|
* used by live runs so reviewers don't have to context-switch between two
|
|
3260
3272
|
* visual idioms. We map the evidence entries (which are already keyed by
|
|
3261
3273
|
* stepId) onto the same shape, leaving live-only fields (before png, cost,
|
|
@@ -3263,14 +3275,14 @@ function formatModelChip(models) {
|
|
|
3263
3275
|
*/
|
|
3264
3276
|
function renderEvidence(r, s) {
|
|
3265
3277
|
if (!r.evidence || r.evidence.length === 0) return "";
|
|
3266
|
-
return `<section class="
|
|
3267
|
-
<ol class="
|
|
3278
|
+
return `<section class="live-run">
|
|
3279
|
+
<ol class="live-steps">${r.evidence.map((e) => renderDetStepCard(e, s)).join("\n")}</ol>
|
|
3268
3280
|
</section>`;
|
|
3269
3281
|
}
|
|
3270
3282
|
function renderDetStepCard(e, s) {
|
|
3271
3283
|
const status = e.status === "failed" ? "failed" : "passed";
|
|
3272
|
-
const description = e.description ? `<p class="
|
|
3273
|
-
const failureBlock = e.status === "failed" && e.failureSummary ? `<p class="
|
|
3284
|
+
const description = e.description ? `<p class="live-instr"><strong>${esc(s.stepExpectLabel)}:</strong> ${esc(e.description)}</p>` : "";
|
|
3285
|
+
const failureBlock = e.status === "failed" && e.failureSummary ? `<p class="live-reasoning">${esc(e.failureSummary)}</p>` : "";
|
|
3274
3286
|
const metaRows = [];
|
|
3275
3287
|
if (e.url) {
|
|
3276
3288
|
const shortUrl = shortenUrl(e.url);
|
|
@@ -3279,16 +3291,16 @@ function renderDetStepCard(e, s) {
|
|
|
3279
3291
|
if (e.title) metaRows.push(`<div class="evidence-meta-row"><span class="evidence-meta-label">${esc(s.metaPage)}</span><span class="evidence-meta-value">${esc(e.title)}</span></div>`);
|
|
3280
3292
|
const meta = metaRows.length > 0 ? `<div class="evidence-meta">${metaRows.join("")}</div>` : "";
|
|
3281
3293
|
const after = `<a class="shot" href="${esc(e.pngPath)}" target="_blank" rel="noopener"><img src="${esc(e.pngPath)}" alt="${esc(e.stepId)}" loading="lazy"><span>after</span></a>`;
|
|
3282
|
-
return `<li class="
|
|
3283
|
-
<div class="
|
|
3294
|
+
return `<li class="live-step ${status}">
|
|
3295
|
+
<div class="live-step-head">
|
|
3284
3296
|
${statusIcon(status)}
|
|
3285
3297
|
<span class="step-name">${esc(e.stepId)}</span>
|
|
3286
3298
|
<span class="spacer"></span>
|
|
3287
3299
|
</div>
|
|
3288
|
-
<div class="
|
|
3300
|
+
<div class="live-step-body">
|
|
3289
3301
|
${description}
|
|
3290
3302
|
${failureBlock}
|
|
3291
|
-
<div class="
|
|
3303
|
+
<div class="live-shots">${after}</div>
|
|
3292
3304
|
${meta}
|
|
3293
3305
|
</div>
|
|
3294
3306
|
</li>`;
|
|
@@ -3726,54 +3738,54 @@ table.matrix td.miss-nonzero { background: var(--fail-bg); }
|
|
|
3726
3738
|
|
|
3727
3739
|
/* Per-step block: indented + a thin rail under the test title so the
|
|
3728
3740
|
hierarchy spec → test → step is visible. */
|
|
3729
|
-
.
|
|
3741
|
+
.live-run {
|
|
3730
3742
|
padding: 0 0 0 14px;
|
|
3731
3743
|
margin-left: 6px;
|
|
3732
3744
|
border-left: 1px solid var(--border-soft);
|
|
3733
3745
|
}
|
|
3734
|
-
.
|
|
3735
|
-
.
|
|
3746
|
+
.live-run-meta { margin: 0 0 8px; font-size: 11.5px; }
|
|
3747
|
+
.live-run-meta > summary {
|
|
3736
3748
|
cursor: pointer; color: var(--text-mute); list-style: none;
|
|
3737
3749
|
padding: 4px 0;
|
|
3738
3750
|
}
|
|
3739
|
-
.
|
|
3740
|
-
.
|
|
3751
|
+
.live-run-meta > summary::-webkit-details-marker { display: none; }
|
|
3752
|
+
.live-run-meta > summary::before {
|
|
3741
3753
|
content: "▸"; color: var(--text-dim); font-size: 10px;
|
|
3742
3754
|
margin-right: 6px; transition: transform 0.12s ease;
|
|
3743
3755
|
display: inline-block;
|
|
3744
3756
|
}
|
|
3745
|
-
.
|
|
3746
|
-
.
|
|
3757
|
+
.live-run-meta[open] > summary::before { transform: rotate(90deg); }
|
|
3758
|
+
.live-run-meta-body {
|
|
3747
3759
|
display: flex; gap: 12px; align-items: baseline; flex-wrap: wrap;
|
|
3748
3760
|
color: var(--text-mute); padding: 6px 0 8px 16px;
|
|
3749
3761
|
}
|
|
3750
|
-
.
|
|
3751
|
-
.
|
|
3762
|
+
.live-run-meta-body code { background: transparent; padding: 0; font-size: 11.5px; color: var(--text-dim); }
|
|
3763
|
+
.live-run-meta-body .dim { color: var(--text-mute); }
|
|
3752
3764
|
|
|
3753
3765
|
/* Steps: flat list. The separator between steps has to outweigh anything
|
|
3754
3766
|
*inside* a step (e.g. evidence-meta footer) so the eye finds the
|
|
3755
3767
|
step boundary at a glance — hence a solid var(--border), not the
|
|
3756
3768
|
softer hairline used inside the step body. */
|
|
3757
|
-
.
|
|
3758
|
-
.
|
|
3759
|
-
.
|
|
3760
|
-
.
|
|
3761
|
-
.
|
|
3762
|
-
.
|
|
3763
|
-
.
|
|
3764
|
-
.
|
|
3769
|
+
.live-steps { list-style: none; padding: 0; margin: 0; display: flex; flex-direction: column; gap: 0; }
|
|
3770
|
+
.live-step { border-top: 1px solid var(--border); padding: 16px 0; background: transparent; }
|
|
3771
|
+
.live-step:first-child { border-top: 0; padding-top: 0; }
|
|
3772
|
+
.live-step.skipped { opacity: 0.55; }
|
|
3773
|
+
.live-step-head { display: flex; align-items: baseline; gap: 8px; padding: 0; background: transparent; border-bottom: 0; font-size: 13px; margin-bottom: 6px; }
|
|
3774
|
+
.live-step-body { padding: 0; font-size: 12.5px; line-height: 1.55; }
|
|
3775
|
+
.live-step-body p { margin: 4px 0; }
|
|
3776
|
+
.live-instr strong { color: var(--text-mute); font-weight: 600; margin-right: 4px; font-size: 11px; letter-spacing: 0.04em; text-transform: uppercase; }
|
|
3765
3777
|
|
|
3766
3778
|
/* Reasoning: left rail, no fill. */
|
|
3767
|
-
.
|
|
3768
|
-
.
|
|
3779
|
+
.live-reasoning { color: var(--text-dim); font-style: italic; background: transparent; padding: 4px 0 4px 12px; border-left: 2px solid var(--fail); border-radius: 0; margin: 6px 0; }
|
|
3780
|
+
.live-step.passed .live-reasoning { border-left-color: var(--border); color: var(--text-mute); font-style: normal; }
|
|
3769
3781
|
|
|
3770
|
-
.
|
|
3771
|
-
.
|
|
3772
|
-
.
|
|
3773
|
-
.
|
|
3782
|
+
.live-source { font-size: 11px; color: var(--text-mute); }
|
|
3783
|
+
.live-shots { display: flex; gap: 12px; margin-top: 10px; flex-wrap: wrap; }
|
|
3784
|
+
.live-shots .shot { display: flex; flex-direction: column; align-items: center; gap: 4px; text-decoration: none; color: var(--text-mute); font-size: 10px; letter-spacing: 0.08em; }
|
|
3785
|
+
.live-shots .shot img { max-width: 280px; max-height: 180px; border: 1px solid var(--border-soft); border-radius: 3px; object-fit: contain; background: #000; }
|
|
3774
3786
|
|
|
3775
3787
|
/* Cost / model chips: muted text, no fill. */
|
|
3776
|
-
.
|
|
3788
|
+
.live-cost, .live-model {
|
|
3777
3789
|
font-size: 11px; padding: 0;
|
|
3778
3790
|
background: transparent;
|
|
3779
3791
|
color: var(--text-mute);
|
|
@@ -4249,7 +4261,7 @@ function formatAgentBrowserUnavailableMessage() {
|
|
|
4249
4261
|
//#region src/cli/preflight.ts
|
|
4250
4262
|
/**
|
|
4251
4263
|
* Shared startup steps for every command that drives a real `agent-browser`
|
|
4252
|
-
* (currently `ccqa
|
|
4264
|
+
* (currently `ccqa record` (trace) and `ccqa run` (live mode)):
|
|
4253
4265
|
*
|
|
4254
4266
|
* 1. Verify the peer-installed agent-browser binary is reachable. On
|
|
4255
4267
|
* failure print the standard guidance and `process.exit(1)`; on
|
|
@@ -4276,14 +4288,14 @@ async function preflightAgentBrowserCommand() {
|
|
|
4276
4288
|
await warnStaleBlockArtifacts();
|
|
4277
4289
|
}
|
|
4278
4290
|
//#endregion
|
|
4279
|
-
//#region src/report/
|
|
4291
|
+
//#region src/report/live-transcript-excerpt.ts
|
|
4280
4292
|
/**
|
|
4281
4293
|
* Build a compact transcript summary for the failure classifier.
|
|
4282
4294
|
*
|
|
4283
4295
|
* Returns `null` when the run has no failed step (every step passed/skipped),
|
|
4284
4296
|
* since the failure analyzer has nothing to explain in that case.
|
|
4285
4297
|
*/
|
|
4286
|
-
async function
|
|
4298
|
+
async function buildLiveTranscriptExcerpt(result, options = {}) {
|
|
4287
4299
|
const failingIndex = result.steps.findIndex((s) => s.status === "failed");
|
|
4288
4300
|
if (failingIndex === -1) return null;
|
|
4289
4301
|
const failingStep = result.steps[failingIndex];
|
|
@@ -4316,7 +4328,7 @@ function oneLine$1(s) {
|
|
|
4316
4328
|
return s.replace(/\s+/g, " ").trim();
|
|
4317
4329
|
}
|
|
4318
4330
|
//#endregion
|
|
4319
|
-
//#region src/runtime/
|
|
4331
|
+
//#region src/runtime/live-artifacts.ts
|
|
4320
4332
|
/**
|
|
4321
4333
|
* Build a sortable run id from the current wall-clock time. ISO8601 with
|
|
4322
4334
|
* `:` / `.` replaced so it's filename-safe. Caller is expected to mkdir the
|
|
@@ -4361,15 +4373,15 @@ function agentBrowserInvokeBase(input) {
|
|
|
4361
4373
|
};
|
|
4362
4374
|
}
|
|
4363
4375
|
//#endregion
|
|
4364
|
-
//#region src/prompts/
|
|
4365
|
-
function
|
|
4366
|
-
return `ccqa-
|
|
4376
|
+
//#region src/prompts/live.ts
|
|
4377
|
+
/**
 * Produce the agent-browser session name for a live run: the fixed
 * `ccqa-live-` prefix followed by a freshly built run id.
 *
 * @returns {string} Session name of the form `ccqa-live-<runId>`.
 */
function generateLiveSessionName() {
	const runId = buildRunId();
	return `ccqa-live-${runId}`;
}
|
|
4368
4380
|
/**
|
|
4369
|
-
* Static prefix of the `ccqa run
|
|
4370
|
-
* reused across every step's invocation — the only piece that
|
|
4371
|
-
* step is the trailing "Your Task: <stepId>" section produced by
|
|
4372
|
-
* `
|
|
4381
|
+
* Static prefix of the `ccqa run` (live spec) system prompt. Built once per
|
|
4382
|
+
* run and reused across every step's invocation — the only piece that
|
|
4383
|
+
* changes per step is the trailing "Your Task: <stepId>" section produced by
|
|
4384
|
+
* `buildLiveSystemPromptStepSection`. Keeping the split here lets the prompt
|
|
4373
4385
|
* cache absorb the shared bulk and keeps each turn's prompt construction down
|
|
4374
4386
|
* to a small string concat.
|
|
4375
4387
|
*
|
|
@@ -4378,16 +4390,18 @@ function generateRunNdSessionName() {
|
|
|
4378
4390
|
* but never names a specific product, URL, account, role, or UI element.
|
|
4379
4391
|
* Project-specific guidance ("the admin tenant is foo.example", "session
|
|
4380
4392
|
* times out at X minutes", …) is appended from
|
|
4381
|
-
* `.ccqa/prompts/
|
|
4382
|
-
*
|
|
4393
|
+
* `.ccqa/prompts/live.user.md` (human-maintained) and
|
|
4394
|
+
* `.ccqa/prompts/live.agent.md` (updated by `ccqa run --update-agent-prompt`)
|
|
4395
|
+
* by the caller, so ccqa stays clean of downstream-product context.
|
|
4383
4396
|
*
|
|
4384
|
-
* Constraint posture: `ccqa
|
|
4385
|
-
* blocks `eval` / `@ref` / chained agent-browser invocations
|
|
4386
|
-
* trace outputs need to replay deterministically.
|
|
4387
|
-
* the model judges the step live — so those guards are off
|
|
4388
|
-
* told it may use any agent-browser subcommand and any
|
|
4389
|
-
|
|
4390
|
-
|
|
4397
|
+
* Constraint posture: `ccqa record` (trace) enforces a strict selector
|
|
4398
|
+
* whitelist and blocks `eval` / `@ref` / chained agent-browser invocations
|
|
4399
|
+
* because those trace outputs need to replay deterministically. Live specs
|
|
4400
|
+
* have no replay — the model judges the step live — so those guards are off
|
|
4401
|
+
* and the model is told it may use any agent-browser subcommand and any
|
|
4402
|
+
* selector strategy.
|
|
4403
|
+
*/
|
|
4404
|
+
function buildLiveSystemPromptPrefix(input) {
|
|
4391
4405
|
const stepsText = input.allSteps.map((s) => `### ${s.id} [${s.source}]
|
|
4392
4406
|
- **Instruction**: ${s.instruction}
|
|
4393
4407
|
- **Expected**: ${s.expected}`).join("\n\n");
|
|
@@ -4419,7 +4433,8 @@ ${stepsText}
|
|
|
4419
4433
|
1. Take a fresh \`snapshot\` to see the current page.
|
|
4420
4434
|
2. Carry out the instruction. Use whichever agent-browser subcommand and selector style works. If the first attempt fails, take another snapshot and try a different approach — you are not being recorded.
|
|
4421
4435
|
3. After the instruction is performed, take another \`snapshot\` (and optionally a \`get count\` / \`wait --text\` probe) to verify the expected outcome.
|
|
4422
|
-
4.
|
|
4436
|
+
4. **Before emitting STEP_RESULT, make the judgement target visible in the page** so the auto-captured "after" screenshot proves your verdict. Use \`agent-browser eval "<elementRef>.scrollIntoView({block:'center'})"\` or similar to bring the asserted row / banner / URL bar / bot reply into view. A correct verdict with no on-screen evidence is still a weak artifact.
|
|
4437
|
+
5. Decide: did the **Expected** condition hold? Be honest. If the page is in an unexpected state, that is a fail, not something to work around.
|
|
4423
4438
|
|
|
4424
4439
|
### Judgement rules
|
|
4425
4440
|
|
|
@@ -4428,6 +4443,7 @@ ${stepsText}
|
|
|
4428
4443
|
- If the expected outcome is partially satisfied (e.g. the page loaded but the asserted element is missing) — fail, and say which part is missing.
|
|
4429
4444
|
- Pass only when you have *positive* evidence (a successful snapshot, a verified URL, a wait that resolved). "No error shown" is not enough on its own.
|
|
4430
4445
|
- Do not invent success when blocked: fail honestly with a short reason.
|
|
4446
|
+
- **Evidence discipline**: when the assertion target is a specific row / message / banner / URL, scroll it into view (or focus the relevant pane) before letting the step end. The "after" screenshot is captured for you automatically — your job is to make sure that screenshot shows the thing your STEP_RESULT line is talking about.
|
|
4431
4447
|
|
|
4432
4448
|
### Output contract (STRICT)
|
|
4433
4449
|
|
|
@@ -4454,7 +4470,7 @@ Everything else you write (narrative, tool output summaries, etc.) is fine — o
|
|
|
4454
4470
|
`;
|
|
4455
4471
|
}
|
|
4456
4472
|
/** Per-step trailer with the current step's instruction / expected. */
|
|
4457
|
-
function
|
|
4473
|
+
function buildLiveSystemPromptStepSection(step) {
|
|
4458
4474
|
return `
|
|
4459
4475
|
## Your Task: ${step.id}
|
|
4460
4476
|
|
|
@@ -4465,11 +4481,11 @@ Execute the instruction in the running browser session, then judge whether the e
|
|
|
4465
4481
|
`;
|
|
4466
4482
|
}
|
|
4467
4483
|
/**
 * Build the short per-turn user message for one live step. Only the step id
 * is interpolated; the instruction / expected text is not repeated here.
 *
 * @param {{ id: string }} step - The step about to be executed.
 * @returns {string} The user prompt for this turn.
 */
function buildLiveUserPrompt(step) {
	const { id } = step;
	return `Execute step ${id} and emit your STEP_RESULT verdict as instructed in the system prompt.`;
}
|
|
4471
4487
|
//#endregion
|
|
4472
|
-
//#region src/runtime/
|
|
4488
|
+
//#region src/runtime/live-result-parse.ts
|
|
4473
4489
|
const MAX_REASON_LEN = 2e3;
|
|
4474
4490
|
/** Parse a single STEP_RESULT line. Returns null on malformed input. */
|
|
4475
4491
|
function parseStepResultLine(line) {
|
|
@@ -4499,7 +4515,7 @@ function findLastStepResult(text) {
|
|
|
4499
4515
|
//#region src/runtime/screenshot.ts
|
|
4500
4516
|
/**
|
|
4501
4517
|
* Take a PNG screenshot of the current page in the given agent-browser session
|
|
4502
|
-
* and write it to `outPath`. Used by `ccqa run
|
|
4518
|
+
* and write it to `outPath`. Used by `ccqa run` (live mode) to capture per-step
|
|
4503
4519
|
* artifacts (before / after the step's actions) so the human-readable run
|
|
4504
4520
|
* report has a visual trail even though no AB_ACTION stream is recorded.
|
|
4505
4521
|
*
|
|
@@ -4508,13 +4524,15 @@ function findLastStepResult(text) {
|
|
|
4508
4524
|
* and continues. We never throw, because a missing screenshot is a degraded
|
|
4509
4525
|
* artifact, not a reason to abort the test step.
|
|
4510
4526
|
*/
|
|
4511
|
-
function takeScreenshot(sessionName, outPath) {
|
|
4512
|
-
const
|
|
4527
|
+
function takeScreenshot(sessionName, outPath, options) {
|
|
4528
|
+
const args = [
|
|
4513
4529
|
"--session",
|
|
4514
4530
|
sessionName,
|
|
4515
|
-
"screenshot"
|
|
4516
|
-
|
|
4517
|
-
|
|
4531
|
+
"screenshot"
|
|
4532
|
+
];
|
|
4533
|
+
if (options?.fullPage) args.push("--full");
|
|
4534
|
+
args.push(outPath);
|
|
4535
|
+
const res = spawnAB(args);
|
|
4518
4536
|
if (res.status === 0) return {
|
|
4519
4537
|
ok: true,
|
|
4520
4538
|
path: outPath
|
|
@@ -4526,10 +4544,10 @@ function takeScreenshot(sessionName, outPath) {
|
|
|
4526
4544
|
};
|
|
4527
4545
|
}
|
|
4528
4546
|
//#endregion
|
|
4529
|
-
//#region src/runtime/
|
|
4547
|
+
//#region src/runtime/live-executor.ts
|
|
4530
4548
|
/**
|
|
4531
|
-
* Run all spec steps once through Claude (
|
|
4532
|
-
*
|
|
4549
|
+
* Run all spec steps once through Claude (live mode). Each step is one Claude
|
|
4550
|
+
* invocation that:
|
|
4533
4551
|
* 1. takes a "before" screenshot of the live session
|
|
4534
4552
|
* 2. lets Claude execute the step's instruction via agent-browser (full
|
|
4535
4553
|
* surface, no replay-time selector constraints)
|
|
@@ -4540,11 +4558,11 @@ function takeScreenshot(sessionName, outPath) {
|
|
|
4540
4558
|
* the overall run status flips to `failed`. The Chrome session persists
|
|
4541
4559
|
* across steps so step N+1 starts on whatever page step N left the browser on.
|
|
4542
4560
|
*/
|
|
4543
|
-
async function
|
|
4561
|
+
async function runLiveExecutor(input) {
|
|
4544
4562
|
const startedAt = /* @__PURE__ */ new Date();
|
|
4545
4563
|
const stepResults = [];
|
|
4546
4564
|
let overallFailed = false;
|
|
4547
|
-
const promptPrefix =
|
|
4565
|
+
const promptPrefix = buildLiveSystemPromptPrefix({
|
|
4548
4566
|
title: input.spec.title,
|
|
4549
4567
|
allSteps: input.steps,
|
|
4550
4568
|
sessionName: input.sessionName
|
|
@@ -4567,8 +4585,8 @@ async function runNdExecutor(input) {
|
|
|
4567
4585
|
const paths = stepArtifactPaths(input.runDir, step$1.id);
|
|
4568
4586
|
await ensureDir(paths.beforePng);
|
|
4569
4587
|
const stepStartedAt = Date.now();
|
|
4570
|
-
const systemPrompt = promptPrefix +
|
|
4571
|
-
const userPrompt =
|
|
4588
|
+
const systemPrompt = promptPrefix + buildLiveSystemPromptStepSection(step$1) + suffixBlock + langDirective;
|
|
4589
|
+
const userPrompt = buildLiveUserPrompt(step$1);
|
|
4572
4590
|
let attempt = 0;
|
|
4573
4591
|
let lastOutcome = null;
|
|
4574
4592
|
while (attempt <= retries) {
|
|
@@ -4630,7 +4648,7 @@ async function runNdExecutor(input) {
|
|
|
4630
4648
|
transcriptParts.push(`[ccqa] invokeClaudeStreaming threw: ${err instanceof Error ? err.message : String(err)}`);
|
|
4631
4649
|
}
|
|
4632
4650
|
const transcript = transcriptParts.join("\n");
|
|
4633
|
-
const after = takeScreenshot(input.sessionName, paths.afterPng);
|
|
4651
|
+
const after = takeScreenshot(input.sessionName, paths.afterPng, { fullPage: true });
|
|
4634
4652
|
if (!after.ok) warn(`screenshot (after, ${step.id}) failed: ${after.error}`);
|
|
4635
4653
|
await writeFile(paths.logTxt, transcript || "(no assistant text captured)", "utf-8");
|
|
4636
4654
|
const { status, reasoning } = judgeStepOutcome({
|
|
@@ -4746,24 +4764,24 @@ function truncateForLog$1(s) {
|
|
|
4746
4764
|
return oneLine.length > 100 ? oneLine.slice(0, 100) + "…" : oneLine;
|
|
4747
4765
|
}
|
|
4748
4766
|
//#endregion
|
|
4749
|
-
//#region src/report/
|
|
4767
|
+
//#region src/report/live-adapter.ts
|
|
4750
4768
|
/**
|
|
4751
|
-
* Convert one
|
|
4752
|
-
* `ReportSpecResult` shape consumed by `renderRunReport`.
|
|
4753
|
-
* does two non-trivial things:
|
|
4769
|
+
* Convert one live-mode (`mode: live`) execution result into the
|
|
4770
|
+
* persistence-layer `ReportSpecResult` shape consumed by `renderRunReport`.
|
|
4771
|
+
* The conversion does two non-trivial things:
|
|
4754
4772
|
*
|
|
4755
4773
|
* - rewrites the executor's absolute `beforePng`/`afterPng` paths as
|
|
4756
4774
|
* `reportDir`-relative hrefs so the rendered HTML opens its PNGs
|
|
4757
4775
|
* directly when the report dir + the run dir are downloaded together
|
|
4758
4776
|
* as a CI artifact bundle
|
|
4759
4777
|
* - nulls out every vitest-only field so the report renderer falls
|
|
4760
|
-
* through to its `
|
|
4778
|
+
* through to its `liveRun` branch
|
|
4761
4779
|
*
|
|
4762
4780
|
* Lives in `src/report/` (not the CLI) because the relative-path contract
|
|
4763
|
-
* on `
|
|
4781
|
+
* on `LiveReportStep.beforePng`/`afterPng` is a report-layer invariant,
|
|
4764
4782
|
* documented next to the schema, and the CLI should not own it.
|
|
4765
4783
|
*/
|
|
4766
|
-
function
|
|
4784
|
+
function liveRunToReportResult(args) {
|
|
4767
4785
|
const { featureName, specName, specYaml, result, reportDir } = args;
|
|
4768
4786
|
const steps = result.steps.map((s) => ({
|
|
4769
4787
|
stepId: s.stepId,
|
|
@@ -4777,7 +4795,7 @@ function ndRunToReportResult(args) {
|
|
|
4777
4795
|
durationMs: s.durationMs,
|
|
4778
4796
|
cost: { ...s.cost }
|
|
4779
4797
|
}));
|
|
4780
|
-
const
|
|
4798
|
+
const liveRun = {
|
|
4781
4799
|
runId: result.runId,
|
|
4782
4800
|
sessionName: result.sessionName,
|
|
4783
4801
|
startedAt: result.startedAt,
|
|
@@ -4800,16 +4818,16 @@ function ndRunToReportResult(args) {
|
|
|
4800
4818
|
diffExcerpt: null,
|
|
4801
4819
|
specYaml,
|
|
4802
4820
|
evidence: null,
|
|
4803
|
-
|
|
4821
|
+
liveRun
|
|
4804
4822
|
};
|
|
4805
4823
|
}
|
|
4806
4824
|
/**
 * Express an artifact path relative to the report directory, passing a
 * `null` path (no artifact) straight through.
 *
 * @param {string | null} absPath - Artifact path, or `null` when absent.
 * @param {string} reportDir - Directory the result is made relative to.
 * @returns {string | null} `reportDir`-relative path, or `null`.
 */
function relativeIfPresent(absPath, reportDir) {
	if (absPath === null) return null;
	return relative(reportDir, absPath);
}
|
|
4809
4827
|
//#endregion
|
|
4810
|
-
//#region src/cli/run-
|
|
4828
|
+
//#region src/cli/run-live.ts
|
|
4811
4829
|
/**
|
|
4812
|
-
* Run pre-filtered `mode: live` specs through `
|
|
4830
|
+
* Run pre-filtered `mode: live` specs through `runLiveExecutor` (Claude +
|
|
4813
4831
|
* agent-browser) and, when `reportDir` is set, run drift audit + failure
|
|
4814
4832
|
* analysis to produce report rows. Sibling of `runDeterministicSpecs`.
|
|
4815
4833
|
*/
|
|
@@ -4821,8 +4839,9 @@ async function runLiveSpecs(specs, opts) {
|
|
|
4821
4839
|
const cwd = opts.cwd ?? process.cwd();
|
|
4822
4840
|
await preflightAgentBrowserCommand();
|
|
4823
4841
|
meta("live-specs", specs.length);
|
|
4824
|
-
const
|
|
4825
|
-
if (
|
|
4842
|
+
const userPromptBundle = await loadLivePromptBundle(cwd);
|
|
4843
|
+
if (userPromptBundle !== null) meta("user-prompt", userPromptBundle.loaded.join(" + "));
|
|
4844
|
+
const userPromptSuffix = userPromptBundle?.text ?? null;
|
|
4826
4845
|
const runs = [];
|
|
4827
4846
|
for (let i = 0; i < specs.length; i++) {
|
|
4828
4847
|
const { featureName, specName } = specs[i];
|
|
@@ -4855,7 +4874,7 @@ function buildLiveReportResults(runs, driftBySpec, analysisBySpec, reportDir, fa
|
|
|
4855
4874
|
if (r.kind !== "run") return [];
|
|
4856
4875
|
const key = `${r.featureName}/${r.specName}`;
|
|
4857
4876
|
return [{
|
|
4858
|
-
...
|
|
4877
|
+
...liveRunToReportResult({
|
|
4859
4878
|
featureName: r.featureName,
|
|
4860
4879
|
specName: r.specName,
|
|
4861
4880
|
specYaml: r.specYaml,
|
|
@@ -4885,7 +4904,7 @@ function analysisFieldsFor(a, status, failureAnalysisEnabled) {
|
|
|
4885
4904
|
/**
|
|
4886
4905
|
* Run `analyzeDrift` against every successfully-loaded spec and return a
|
|
4887
4906
|
* `featureName/specName → driftIssues` map. Drift findings are advisory —
|
|
4888
|
-
* they show in the HTML report but do not change the run
|
|
4907
|
+
* they show in the HTML report but do not change the live-run exit code.
|
|
4889
4908
|
*/
|
|
4890
4909
|
async function runDriftAudit(runs, opts, cwd) {
|
|
4891
4910
|
const targets = runs.filter((r) => r.kind === "run").map((r) => ({
|
|
@@ -4935,13 +4954,13 @@ async function runOneSpec(args) {
|
|
|
4935
4954
|
meta("steps", expanded.length);
|
|
4936
4955
|
const includes = collectIncludedBlockNames(spec);
|
|
4937
4956
|
if (includes.length > 0) meta("blocks", includes.join(", "));
|
|
4938
|
-
const sessionName =
|
|
4957
|
+
const sessionName = generateLiveSessionName();
|
|
4939
4958
|
meta("session", sessionName);
|
|
4940
4959
|
const runId = buildRunId();
|
|
4941
4960
|
const runDir = opts.out ?? join(specDir, "runs", runId);
|
|
4942
4961
|
await mkdir(runDir, { recursive: true });
|
|
4943
4962
|
meta("runDir", runDir);
|
|
4944
|
-
const result = await
|
|
4963
|
+
const result = await runLiveExecutor({
|
|
4945
4964
|
spec: { title: spec.title },
|
|
4946
4965
|
steps: expanded,
|
|
4947
4966
|
runId,
|
|
@@ -4959,7 +4978,7 @@ async function runOneSpec(args) {
|
|
|
4959
4978
|
meta("saved", runJsonPath);
|
|
4960
4979
|
meta("status", result.status.toUpperCase());
|
|
4961
4980
|
meta("step-summary", `${count(result.steps, "passed")} passed / ${count(result.steps, "failed")} failed / ${count(result.steps, "skipped")} skipped`);
|
|
4962
|
-
const costLine =
|
|
4981
|
+
const costLine = formatLiveCost(result.cost, { compact: false });
|
|
4963
4982
|
if (costLine) meta("cost", costLine);
|
|
4964
4983
|
return {
|
|
4965
4984
|
kind: "run",
|
|
@@ -4971,7 +4990,7 @@ async function runOneSpec(args) {
|
|
|
4971
4990
|
};
|
|
4972
4991
|
}
|
|
4973
4992
|
/**
 * Log the combined cost of a batch under the `total-cost` meta key.
 * Only entries of kind `"run"` contribute their `result.cost`; nothing is
 * logged when the formatter returns a falsy line.
 *
 * @param {Array<object>} runs - Per-spec outcomes of the batch.
 * @returns {void}
 */
function logBatchCost(runs) {
	const costs = runs
		.filter((entry) => entry.kind === "run")
		.map((entry) => entry.result.cost);
	const line = formatLiveBatchCost(costs);
	if (line) meta("total-cost", line);
}
|
|
4977
4996
|
/**
|
|
@@ -5001,7 +5020,7 @@ async function runFailureAnalysisForLiveRuns(runs, driftBySpec, opts, cwd) {
|
|
|
5001
5020
|
for (const r of failed) {
|
|
5002
5021
|
const key = `${r.featureName}/${r.specName}`;
|
|
5003
5022
|
info(`failure analysis: ${key}`);
|
|
5004
|
-
const excerpt = await
|
|
5023
|
+
const excerpt = await buildLiveTranscriptExcerpt(r.result);
|
|
5005
5024
|
if (excerpt === null) {
|
|
5006
5025
|
out.set(key, {
|
|
5007
5026
|
analysis: null,
|
|
@@ -5012,7 +5031,7 @@ async function runFailureAnalysisForLiveRuns(runs, driftBySpec, opts, cwd) {
|
|
|
5012
5031
|
continue;
|
|
5013
5032
|
}
|
|
5014
5033
|
const outcome = await analyzeFailure({
|
|
5015
|
-
|
|
5034
|
+
liveTranscriptExcerpt: excerpt,
|
|
5016
5035
|
specYaml: r.specYaml,
|
|
5017
5036
|
diffPatch: diff.ok ? diff.diff.patch : null,
|
|
5018
5037
|
changedFiles: diff.ok ? diff.diff.nameStatus : null,
|
|
@@ -5063,6 +5082,100 @@ function oneLine(s) {
|
|
|
5063
5082
|
return s.replace(/\s+/g, " ").trim();
|
|
5064
5083
|
}
|
|
5065
5084
|
//#endregion
|
|
5085
|
+
//#region src/prompts/agent-update.ts
|
|
5086
|
+
/**
 * Build the system prompt for the "refresh `<mode>.agent.md`" Claude call.
 *
 * The prompt explains what the agent prompt file is for, how it relates to
 * the human-maintained `<mode>.user.md`, and the strict output format (the
 * model must answer with the complete new file contents and nothing else).
 *
 * Because the answer REPLACES the file, the rules tell the model to carry
 * still-valid lessons from the previous file forward and only avoid
 * duplicates — otherwise each refresh would erase what earlier runs learned.
 *
 * @param {object} input
 * @param {string} input.mode - `"live"` selects the live wording; any other
 *   value selects the record wording.
 * @param {string} [input.language] - Output language; defaults to `"auto"`.
 * @returns {string} The system prompt text.
 */
function buildAgentUpdateSystemPrompt(input) {
	const modeLabel = input.mode === "live" ? "live (Claude drives every step at run time)" : "record (Claude records browser actions for vitest replay)";
	const userMdLabel = `${input.mode}.user.md`;
	const agentMdLabel = `${input.mode}.agent.md`;
	return `You maintain the auto-learned half of ccqa's prompt bundle for ${modeLabel}.

${outputLanguageBlock(input.language ?? "auto", "the bullet text", "headings, agent-browser subcommand names, selector tokens")}## What you are updating

\`.ccqa/prompts/${agentMdLabel}\` is appended to ccqa's system prompt for every ${input.mode === "live" ? "step of every `mode: live` spec" : "trace run of `ccqa record`"}. It is meant to capture **stable lessons learned from past runs** — concrete selectors that worked, login flow quirks the agent kept tripping on, common "this is fine" warnings to ignore.

The sibling file \`${userMdLabel}\` carries human-maintained project guidance (URLs, naming conventions). Rules already well-covered by \`${userMdLabel}\` should NOT be repeated here.

## Output rules

- Emit the COMPLETE replacement contents of \`${agentMdLabel}\`.
- Concise bullet points. No narrative paragraphs. No preamble. No closing summary.
- Each bullet is a single declarative sentence (or one bullet → one short selector / command).
- Group related bullets under \`### …\` subheaders.
- Carry forward every lesson from the previous file that is still valid, then add the new lessons from this run. Never state the same lesson twice, and leave out anything already covered by \`${userMdLabel}\`.
- Keep the whole file under ~3 KB.
- Output ONLY the new file contents. NO code fences. NO surrounding prose. NO markdown frontmatter.
- If the run summary contains nothing worth learning from, output the previous file unchanged.
`;
}
|
|
5110
|
+
/**
 * Build the user message for the agent-prompt refresh call. It has three
 * markdown sections: the previous `<mode>.agent.md` contents (or a
 * placeholder when there is none or it is blank), the run summary, and the
 * task statement.
 *
 * @param {object} input
 * @param {string} input.mode - Prefix of the target file name.
 * @param {string | null} [input.currentAgentMd] - Existing file contents.
 * @param {string} input.runSummary - Summary of the run to learn from.
 * @returns {string} The user prompt text.
 */
function buildAgentUpdateUserPrompt(input) {
	const targetFile = `${input.mode}.agent.md`;
	// A missing or whitespace-only file is presented as "no existing file".
	const previousBody = input.currentAgentMd && input.currentAgentMd.trim().length > 0
		? input.currentAgentMd
		: "(no existing file — this will create one)";
	const lines = [
		`## Previous \`${targetFile}\``,
		"",
		`${previousBody}`,
		"",
		"## Run summary",
		"",
		`${input.runSummary}`,
		"",
		"## Your task",
		"",
		`Write the new contents of \`${targetFile}\`. Output ONLY the file contents — no preamble, no fences, no closing note.`
	];
	return lines.join("\n");
}
|
|
5124
|
+
//#endregion
|
|
5125
|
+
//#region src/cli/update-agent-prompt.ts
|
|
5126
|
+
/**
 * Refresh `.ccqa/prompts/<mode>.agent.md` from the latest run.
 *
 * Reads the existing file (if any) and a caller-supplied run summary, sends
 * both to Claude, and writes the response back over the agent prompt file.
 *
 * This is an opt-in side step, so it never throws: missing auth, a failing
 * Claude call, unusable output, and a failed write are each reported with
 * `warn` and the function returns, leaving the run's exit code to the caller.
 *
 * @param {object} args
 * @param {string} args.mode - Prefix of the prompt file (`"live"` / `"record"`).
 * @param {string} args.runSummary - Summary text handed to Claude.
 * @param {string} args.cwd - Directory containing the `.ccqa/` tree.
 * @param {string} [args.model] - Model override, forwarded only when set.
 * @param {string} [args.language] - Output language, forwarded only when set.
 * @returns {Promise<void>}
 */
async function updateAgentPrompt(args) {
	const { mode, runSummary, cwd, model, language } = args;
	const agentMdPath = join(cwd, ".ccqa", "prompts", `${mode}.agent.md`);
	const relPath = relative(cwd, agentMdPath);
	const auth = driftAuthAvailable();
	if (!auth.ok) {
		warn(`--update-agent-prompt skipped (${auth.reason})`);
		return;
	}
	const promptInput = {
		mode,
		// Any read failure is treated the same as "no existing file".
		currentAgentMd: await readFile(agentMdPath, "utf-8").catch(() => null),
		runSummary,
		...language ? { language } : {}
	};
	const systemPrompt = buildAgentUpdateSystemPrompt(promptInput);
	const userPrompt = buildAgentUpdateUserPrompt(promptInput);
	info(`--update-agent-prompt: refreshing ${relPath}`);
	let result;
	let isError;
	try {
		({ result, isError } = await invokeClaudeStreaming({
			prompt: userPrompt,
			systemPrompt,
			allowedTools: [],
			disableBuiltinTools: true,
			...model ? { model } : {}
		}, () => {}));
	} catch (err) {
		// A thrown SDK error must not abort the command after the run itself finished.
		warn(`--update-agent-prompt: Claude invocation failed (${err instanceof Error ? err.message : String(err)}); leaving ${relPath} unchanged`);
		return;
	}
	if (isError || !result || result.trim().length === 0) {
		warn(`--update-agent-prompt: Claude returned no usable output${isError ? " (SDK error)" : ""}; leaving ${relPath} unchanged`);
		return;
	}
	const newText = stripCodeFences(result.trim()) + "\n";
	try {
		await mkdir(dirname(agentMdPath), { recursive: true });
		await writeFile(agentMdPath, newText, "utf-8");
	} catch (err) {
		warn(`--update-agent-prompt: could not write ${relPath} (${err instanceof Error ? err.message : String(err)})`);
		return;
	}
	// Report the encoded size: `newText.length` counts UTF-16 code units, not bytes.
	info(`--update-agent-prompt: wrote ${relPath} (${Buffer.byteLength(newText, "utf-8")} bytes)`);
	info(`--update-agent-prompt: review the diff with: git diff -- "${relPath}"`);
}
|
|
5169
|
+
/**
 * Some models still wrap the answer in a ```markdown fence despite the
 * system prompt asking otherwise. Strip a single outer fence when present so
 * the saved file is clean.
 *
 * The opening fence may carry any info string (letters, a leading space,
 * `c++`, …) and the text may use LF or CRLF line endings. Fences nested
 * inside the body are left alone because the closing fence must be the last
 * thing in the text. Text that is not wrapped this way is returned unchanged.
 *
 * @param {string} text - Model output, expected to be already trimmed.
 * @returns {string} The fence body, or `text` itself when there is no outer fence.
 */
function stripCodeFences(text) {
	const m = text.match(/^```[^\r\n`]*\r?\n([\s\S]*?)\r?\n```\s*$/);
	return m && m[1] !== void 0 ? m[1] : text;
}
|
|
5178
|
+
//#endregion
|
|
5066
5179
|
//#region src/cli/changed-specs.ts
|
|
5067
5180
|
/**
|
|
5068
5181
|
* Filter specs to those affected by the git diff against the resolved base
|
|
@@ -5125,7 +5238,7 @@ const runCommand = addLanguageOption(new Command("run").argument("[target]", "Sp
|
|
|
5125
5238
|
const n = Number(raw);
|
|
5126
5239
|
if (!Number.isFinite(n) || n < 0 || Math.floor(n) !== n) throw new Error(`--retry must be a non-negative integer, got "${raw}"`);
|
|
5127
5240
|
return n;
|
|
5128
|
-
}, 0).option("--out <dir>", "(live only) Override the per-spec artifact directory. Default: <specDir>/runs/<runId>. Ignored when running multiple specs.")).action(async (target, opts) => {
|
|
5241
|
+
}, 0).option("--out <dir>", "(live only) Override the per-spec artifact directory. Default: <specDir>/runs/<runId>. Ignored when running multiple specs.").option("--update-agent-prompt", "(live only) After the run finishes, ask Claude to refresh .ccqa/prompts/live.agent.md from a summary of the run.")).action(async (target, opts) => {
|
|
5129
5242
|
await runDispatcher(target, opts);
|
|
5130
5243
|
});
|
|
5131
5244
|
function resolveReportDir(report, cwd) {
|
|
@@ -5159,6 +5272,7 @@ async function runDispatcher(target, opts) {
|
|
|
5159
5272
|
if (liveSpecs.length === 0) {
|
|
5160
5273
|
if (typeof opts.retry === "number" && opts.retry > 0) warn("--retry is ignored without any 'mode: live' spec");
|
|
5161
5274
|
if (opts.out) warn("--out is ignored without any 'mode: live' spec");
|
|
5275
|
+
if (opts.updateAgentPrompt) warn("--update-agent-prompt is ignored without any 'mode: live' spec");
|
|
5162
5276
|
}
|
|
5163
5277
|
if (detSpecs.length === 0 && opts.evidence === false) warn("--no-evidence is ignored without any 'mode: deterministic' spec");
|
|
5164
5278
|
blank();
|
|
@@ -5188,9 +5302,39 @@ async function runDispatcher(target, opts) {
|
|
|
5188
5302
|
opts
|
|
5189
5303
|
});
|
|
5190
5304
|
}
|
|
5305
|
+
if (opts.updateAgentPrompt && liveSpecs.length > 0) {
|
|
5306
|
+
blank();
|
|
5307
|
+
await updateAgentPrompt({
|
|
5308
|
+
mode: "live",
|
|
5309
|
+
runSummary: buildLiveRunSummary(live.reportResults),
|
|
5310
|
+
cwd,
|
|
5311
|
+
...opts.model ? { model: opts.model } : {},
|
|
5312
|
+
...opts.language ? { language: opts.language } : {}
|
|
5313
|
+
});
|
|
5314
|
+
}
|
|
5191
5315
|
process.exit(overallExitCode);
|
|
5192
5316
|
}
|
|
5193
5317
|
/**
 * Render report results as a compact markdown summary for the live
 * agent-prompt update step. Each result that carries a `liveRun` becomes one
 * section: a `## feature/spec — status` header followed by one bullet per
 * step with its status, id, and reasoning flattened by `oneLineSummary$1`.
 * Results without a `liveRun` are skipped. The number of sections and steps
 * is not capped here.
 *
 * @param {Iterable<object>} results - Report results of the run.
 * @returns {string} The summary, or `"(no live runs executed)"` when no
 *   result carried a `liveRun`.
 */
function buildLiveRunSummary(results) {
	const sections = [...results].flatMap((r) => {
		if (!r.liveRun) return [];
		const header = `## ${r.feature}/${r.spec} — ${r.status}`;
		const bullets = r.liveRun.steps
			.map((s) => `- [${s.status}] ${s.stepId}: ${oneLineSummary$1(s.reasoning)}`)
			.join("\n");
		return [`${header}\n${bullets}`];
	});
	if (sections.length === 0) return "(no live runs executed)";
	return sections.join("\n\n");
}
|
|
5333
|
+
/**
 * Flatten a step's reasoning to a single line of at most 240 UTF-16 units
 * (plus a trailing `…` when truncated).
 *
 * - Whitespace runs (including newlines) collapse to one space.
 * - A missing (`null` / `undefined`) or blank value yields
 *   `"(no reason given)"` instead of throwing.
 * - Truncation never splits a surrogate pair: when the cut would land
 *   between the two halves, the whole character is dropped.
 *
 * @param {string | null | undefined} s - Raw reasoning text.
 * @returns {string} One-line summary.
 */
function oneLineSummary$1(s) {
	const MAX_LEN = 240;
	const flat = String(s ?? "").replace(/\s+/g, " ").trim();
	if (flat.length === 0) return "(no reason given)";
	if (flat.length <= MAX_LEN) return flat;
	// If the last kept unit is a high surrogate, its pair would be cut off.
	const lastKept = flat.charCodeAt(MAX_LEN - 1);
	const cut = lastKept >= 0xd800 && lastKept <= 0xdbff ? MAX_LEN - 1 : MAX_LEN;
	return `${flat.slice(0, cut)}…`;
}
|
|
5337
|
+
/**
|
|
5194
5338
|
* Run pre-filtered deterministic specs under vitest. Empty input is a no-op.
|
|
5195
5339
|
* Captures step-boundary evidence under `<reportDir>/evidence/<feature>/<spec>/`
|
|
5196
5340
|
* when enabled.
|
|
@@ -5354,7 +5498,7 @@ async function analyzeDeterministicSummaries(summaries, opts, cwd, reportDir) {
|
|
|
5354
5498
|
failureLogExcerpt: null,
|
|
5355
5499
|
diffExcerpt: null,
|
|
5356
5500
|
specYaml: null,
|
|
5357
|
-
|
|
5501
|
+
liveRun: null
|
|
5358
5502
|
});
|
|
5359
5503
|
continue;
|
|
5360
5504
|
}
|
|
@@ -5404,7 +5548,7 @@ async function analyzeDeterministicSummaries(summaries, opts, cwd, reportDir) {
|
|
|
5404
5548
|
failureLogExcerpt: failureLog.length > 0 ? failureLog : null,
|
|
5405
5549
|
diffExcerpt,
|
|
5406
5550
|
specYaml,
|
|
5407
|
-
|
|
5551
|
+
liveRun: null
|
|
5408
5552
|
});
|
|
5409
5553
|
}
|
|
5410
5554
|
return {
|
|
@@ -6679,9 +6823,9 @@ async function runTrace(featureName, specName, model, validationMode = "lenient"
|
|
|
6679
6823
|
steps: expanded,
|
|
6680
6824
|
sessionName
|
|
6681
6825
|
});
|
|
6682
|
-
const
|
|
6683
|
-
if (
|
|
6684
|
-
const systemPrompt = (
|
|
6826
|
+
const promptBundle = await loadRecordPromptBundle();
|
|
6827
|
+
if (promptBundle !== null) meta("user-prompt", promptBundle.loaded.join(" + "));
|
|
6828
|
+
const systemPrompt = (promptBundle === null ? baseSystemPrompt : `${baseSystemPrompt}\n## Project-specific guidance\n\n${promptBundle.text}\n`) + languageDirective(language);
|
|
6685
6829
|
const prompt = buildTracePrompt(spec.title);
|
|
6686
6830
|
info("Running agent-browser session...");
|
|
6687
6831
|
blank();
|
|
@@ -6763,6 +6907,11 @@ async function runTrace(featureName, specName, model, validationMode = "lenient"
|
|
|
6763
6907
|
if (written) meta("relatedPaths", `${relatedPaths.length} path(s) written to ${written}`);
|
|
6764
6908
|
} else warn("trace did not emit a RELATED_PATHS block; drift --changed cannot scope this spec");
|
|
6765
6909
|
hint(`run 'ccqa generate ${featureName}/${specName}' to generate a test script`);
|
|
6910
|
+
return {
|
|
6911
|
+
route,
|
|
6912
|
+
actionsKept: validatedActions.length,
|
|
6913
|
+
actionsRecorded: traceActions.length
|
|
6914
|
+
};
|
|
6766
6915
|
}
|
|
6767
6916
|
/**
|
|
6768
6917
|
* Strip actions whose recorded fields contain "unstable literal" values
|
|
@@ -8338,15 +8487,16 @@ const recordCommand = addLanguageOption(new Command("record").argument("<feature
|
|
|
8338
8487
|
}, "lenient").option("--auto-fix <mode>", "Auto-fix behaviour during script generation: 'interactive' (default, prompt y/N), 'auto' (apply without prompt, for CI), 'skip' (never prompt, only apply high-confidence fixes).", (raw) => {
|
|
8339
8488
|
if (AUTO_FIX_MODES.includes(raw)) return raw;
|
|
8340
8489
|
throw new Error(`--auto-fix must be one of ${AUTO_FIX_MODES.join(" | ")}`);
|
|
8341
|
-
}, "interactive").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").option("--force", "Overwrite an existing test.spec.ts without warning").option("--no-snapshot", "Don't pin AGENT_BROWSER_SESSION / capture page snapshots after a failure (debug toggle)").option("--skip-trace", "Skip the trace step and run codegen against an existing actions.json").option("--skip-codegen", "Run only the trace step (do not generate test.spec.ts)")).action(async (specPath, opts) => {
|
|
8490
|
+
}, "interactive").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").option("--force", "Overwrite an existing test.spec.ts without warning").option("--no-snapshot", "Don't pin AGENT_BROWSER_SESSION / capture page snapshots after a failure (debug toggle)").option("--skip-trace", "Skip the trace step and run codegen against an existing actions.json").option("--skip-codegen", "Run only the trace step (do not generate test.spec.ts)").option("--update-agent-prompt", "After the trace finishes, ask Claude to refresh .ccqa/prompts/record.agent.md from a summary of the run.").option("--cwd <path>", "Working directory containing the .ccqa/ tree (monorepo support). Defaults to the current directory.")).action(async (specPath, opts) => {
|
|
8342
8491
|
const { featureName, specName } = parseSpecPath(specPath);
|
|
8343
8492
|
const language = opts.language ?? "auto";
|
|
8344
8493
|
if (opts.skipTrace && opts.skipCodegen) {
|
|
8345
8494
|
error("--skip-trace and --skip-codegen cannot be combined; nothing would run");
|
|
8346
8495
|
process.exit(2);
|
|
8347
8496
|
}
|
|
8497
|
+
let traceResult = null;
|
|
8348
8498
|
if (!opts.skipTrace) {
|
|
8349
|
-
await runTrace(featureName, specName, opts.model, opts.validationMode ?? "lenient", language);
|
|
8499
|
+
traceResult = await runTrace(featureName, specName, opts.model, opts.validationMode ?? "lenient", language);
|
|
8350
8500
|
blank();
|
|
8351
8501
|
}
|
|
8352
8502
|
if (!opts.skipCodegen) {
|
|
@@ -8354,7 +8504,37 @@ const recordCommand = addLanguageOption(new Command("record").argument("<feature
|
|
|
8354
8504
|
const useSnapshot = opts.snapshot !== false;
|
|
8355
8505
|
await runGenerate(featureName, specName, parseInt(opts.maxRetries ?? "3", 10), fixMode, opts.force ?? false, useSnapshot, language, opts.model);
|
|
8356
8506
|
}
|
|
8507
|
+
if (opts.updateAgentPrompt) if (traceResult === null) warn("--update-agent-prompt is ignored when --skip-trace is set (no run summary available)");
|
|
8508
|
+
else {
|
|
8509
|
+
const cwd = resolveCwd(opts.cwd);
|
|
8510
|
+
blank();
|
|
8511
|
+
await updateAgentPrompt({
|
|
8512
|
+
mode: "record",
|
|
8513
|
+
runSummary: buildRecordRunSummary(featureName, specName, traceResult),
|
|
8514
|
+
cwd,
|
|
8515
|
+
...opts.model ? { model: opts.model } : {},
|
|
8516
|
+
...language ? { language } : {}
|
|
8517
|
+
});
|
|
8518
|
+
}
|
|
8357
8519
|
});
|
|
8520
|
+
/**
 * Render the outcome of a trace pass as a short Markdown digest, used as the
 * `runSummary` input when refreshing the record agent prompt.
 *
 * Layout: a heading with `<feature>/<spec>` and the route status, a line with
 * the kept/recorded action counts, then one `###` section per route step
 * (title, status, action, observation and — when present — reason).
 *
 * @param {string} featureName - Feature part of the spec path.
 * @param {string} specName - Spec part of the spec path.
 * @param {object} t - Trace result; reads `route.status`, `route.steps`,
 *   `actionsKept` and `actionsRecorded`.
 * @returns {string} The Markdown summary.
 */
function buildRecordRunSummary(featureName, specName, t) {
  const { route } = t;
  const heading = `## ${featureName}/${specName} — ${route.status}\nActions: ${t.actionsKept} kept / ${t.actionsRecorded} recorded`;

  if (route.steps.length === 0) {
    return `${heading}\n\n(no route steps recorded)`;
  }

  // One Markdown section per step; the reason bullet is emitted only when set.
  const renderStep = (step) => {
    const bullets = [
      `### ${step.title} (${step.status})`,
      `- action: ${oneLineSummary(step.action)}`,
      `- observation: ${oneLineSummary(step.observation)}`
    ];
    if (step.reason) {
      bullets.push(`- reason: ${oneLineSummary(step.reason)}`);
    }
    return bullets.join("\n");
  };

  const sections = route.steps.map((step) => renderStep(step));
  return `${heading}\n\n${sections.join("\n\n")}`;
}
|
|
8534
|
+
/**
 * Collapse a free-form field into a single trimmed line of at most 240
 * UTF-16 code units, appending "…" when it had to be cut.
 *
 * Missing values (`null` / `undefined`) and whitespace-only text yield the
 * placeholder "(none)" instead of throwing, and non-string values are
 * stringified first, so one malformed step cannot abort the whole summary.
 *
 * @param {unknown} s - Raw field text (may span several lines).
 * @returns {string} The flattened, possibly truncated text, or "(none)".
 */
function oneLineSummary(s) {
  const limit = 240;
  const flat = (s == null ? "" : String(s)).replace(/\s+/g, " ").trim();
  if (flat.length === 0) return "(none)";
  if (flat.length <= limit) return flat;

  // Never cut between the two halves of a surrogate pair: if the last kept
  // code unit is a high surrogate, drop it so the output stays well-formed.
  const lastUnit = flat.charCodeAt(limit - 1);
  const end = lastUnit >= 0xd800 && lastUnit <= 0xdbff ? limit - 1 : limit;
  return `${flat.slice(0, end)}…`;
}
|
|
8358
8538
|
//#endregion
|
|
8359
8539
|
//#region src/cli/draft.ts
|
|
8360
8540
|
const CATEGORY_LABEL = DRAFT_CATEGORY_LABEL;
|
|
@@ -9124,6 +9304,64 @@ function parseConcurrency(raw) {
|
|
|
9124
9304
|
return n;
|
|
9125
9305
|
}
|
|
9126
9306
|
//#endregion
|
|
9307
|
+
//#region src/cli/init.ts
|
|
9308
|
+
/**
 * Starter prompt files scaffolded by the `init` command. Each entry pairs a
 * path relative to the working directory (`relPath`) with the Markdown body
 * (`content`) written to it. Bodies are a heading, a blank line, one
 * explanatory paragraph and a trailing newline.
 */
const TEMPLATES = [
  {
    relPath: ".ccqa/prompts/live.user.md",
    content: [
      "# Project guidance for live specs",
      "",
      "Write stable, hand-maintained context here: staging URLs, naming conventions, known \"this is fine\" warnings. Lines you add will be appended verbatim to the system prompt of every step in 'mode: live' specs.",
      ""
    ].join("\n")
  },
  {
    relPath: ".ccqa/prompts/live.agent.md",
    content: [
      "# Agent learnings for live specs",
      "",
      "This file is updated by 'ccqa run --update-agent-prompt'. You can edit it by hand, but the next --update-agent-prompt run may rewrite the whole file. Keep stable rules in live.user.md instead.",
      ""
    ].join("\n")
  },
  {
    relPath: ".ccqa/prompts/record.user.md",
    content: [
      "# Project guidance for ccqa record (deterministic trace)",
      "",
      "Write stable, hand-maintained context here for the trace phase of 'ccqa record'. Lines you add will be appended verbatim to the trace system prompt.",
      ""
    ].join("\n")
  },
  {
    relPath: ".ccqa/prompts/record.agent.md",
    content: [
      "# Agent learnings for ccqa record",
      "",
      "This file is updated by 'ccqa record --update-agent-prompt'. Same convention as live.agent.md — stable rules go in record.user.md.",
      ""
    ].join("\n")
  }
];
|
|
9338
|
+
/**
 * `init` subcommand: makes sure `<cwd>/.ccqa/prompts` exists, then writes
 * every entry of TEMPLATES beneath the resolved working directory. Each file
 * is reported as "created" or "skipped" according to what writeTemplate
 * returns, followed by the two totals.
 *
 * Options:
 *   --cwd <path>  directory to scaffold into (passed through resolveCwd)
 *   --force       overwrite files that already exist
 */
const initCommand = new Command("init")
  .description("Create .ccqa/prompts/{live,record}.{user,agent}.md template files (skips existing files unless --force).")
  .option("--cwd <path>", "Working directory (default: cwd)")
  .option("--force", "Overwrite existing files")
  .action(async (opts) => {
    const cwd = resolveCwd(opts.cwd);
    header("init", cwd);
    await mkdir(join(cwd, ".ccqa", "prompts"), { recursive: true });

    const overwrite = opts.force ?? false;
    const created = [];
    const skipped = [];
    // Write sequentially and bucket each path by outcome; reporting happens
    // afterwards so all "created" lines precede all "skipped" lines.
    for (const { relPath, content } of TEMPLATES) {
      const written = await writeTemplate(join(cwd, relPath), content, overwrite);
      (written ? created : skipped).push(relPath);
    }

    for (const file of created) {
      info(`created ${file}`);
    }
    for (const file of skipped) {
      info(`skipped ${file} (already exists; pass --force to overwrite)`);
    }
    blank();
    meta("created", created.length);
    meta("skipped", skipped.length);
  });
|
|
9352
|
+
/**
 * Write one template file as UTF-8.
 *
 * Without `force` the file is opened with the exclusive-create flag ("wx"),
 * so an existing file is left untouched and reported as skipped rather than
 * treated as an error.
 *
 * @param {string} absPath - Destination path.
 * @param {string} content - File body.
 * @param {boolean} force - When truthy, overwrite an existing file.
 * @returns {Promise<boolean>} `true` if the file was written, `false` if it
 *   already existed and `force` was not set.
 * @throws Any write error other than EEXIST is rethrown unchanged.
 */
async function writeTemplate(absPath, content, force) {
  const writeOptions = { encoding: "utf-8" };
  if (!force) {
    writeOptions.flag = "wx";
  }

  try {
    await writeFile(absPath, content, writeOptions);
  } catch (err) {
    const alreadyExists = typeof err === "object" && err !== null && err.code === "EEXIST";
    if (!alreadyExists) throw err;
    return false;
  }
  return true;
}
|
|
9364
|
+
//#endregion
|
|
9127
9365
|
//#region src/prompts/perspectives.ts
|
|
9128
9366
|
/**
|
|
9129
9367
|
* Build the system prompt. By default the descriptive fields follow the
|
|
@@ -9591,8 +9829,6 @@ function renderSpecMarkdown(spec, labels = LABELS_JA) {
|
|
|
9591
9829
|
lines.push("");
|
|
9592
9830
|
lines.push(`| ${labels.itemCol} | ${labels.valueCol} |`);
|
|
9593
9831
|
lines.push("| --- | --- |");
|
|
9594
|
-
lines.push(`| ${labels.modeLabel} | ${mdCell(modeLabel(spec.status, labels))} |`);
|
|
9595
|
-
lines.push(`| ${labels.statusCol} | ${mdCell(statusLabel(spec.status, labels))} |`);
|
|
9596
9832
|
if (spec.summary) lines.push(`| ${labels.summary} | ${mdCell(spec.summary)} |`);
|
|
9597
9833
|
if (spec.preconditions && spec.preconditions.length > 0) lines.push(`| ${labels.preconditions} | ${spec.preconditions.map(mdCell).join("<br>")} |`);
|
|
9598
9834
|
if (spec.startScreen) lines.push(`| ${labels.startScreen} | ${mdCell(spec.startScreen)} |`);
|
|
@@ -9624,6 +9860,7 @@ function resolvePackageJson() {
|
|
|
9624
9860
|
const { version } = JSON.parse(readFileSync(resolvePackageJson(), "utf8"));
|
|
9625
9861
|
const program = new Command();
|
|
9626
9862
|
program.name("ccqa").description("E2E test CLI using Claude Code + agent-browser").version(version);
|
|
9863
|
+
program.addCommand(initCommand);
|
|
9627
9864
|
program.addCommand(draftCommand);
|
|
9628
9865
|
program.addCommand(perspectivesCommand);
|
|
9629
9866
|
program.addCommand(recordCommand);
|
package/dist/package.json
CHANGED