ccqa 0.5.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -4
- package/dist/bin/ccqa.mjs +2140 -367
- package/dist/package.json +1 -1
- package/package.json +1 -1
package/dist/bin/ccqa.mjs
CHANGED
|
@@ -9,11 +9,11 @@ import { query } from "@anthropic-ai/claude-agent-sdk";
|
|
|
9
9
|
import { ZodError, z } from "zod";
|
|
10
10
|
import { delimiter, dirname, join, relative, resolve } from "node:path";
|
|
11
11
|
import { parse, stringify } from "yaml";
|
|
12
|
-
import { execFile, spawn } from "node:child_process";
|
|
12
|
+
import { execFile, spawn, spawnSync } from "node:child_process";
|
|
13
13
|
import { createInterface } from "node:readline";
|
|
14
14
|
import { homedir, tmpdir } from "node:os";
|
|
15
|
-
import { createInterface as createInterface$1 } from "node:readline/promises";
|
|
16
15
|
import { promisify } from "node:util";
|
|
16
|
+
import { createInterface as createInterface$1 } from "node:readline/promises";
|
|
17
17
|
//#region src/prompts/trace.ts
|
|
18
18
|
function generateSessionName() {
|
|
19
19
|
return `ccqa-trace-${(/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-")}`;
|
|
@@ -562,6 +562,71 @@ function isParamRequired(param) {
|
|
|
562
562
|
return param.required !== false;
|
|
563
563
|
}
|
|
564
564
|
//#endregion
|
|
565
|
+
//#region src/spec/perspectives-schema.ts
|
|
566
|
+
/**
|
|
567
|
+
* `perspectives.yaml` is an inventory of the test coverage that already
|
|
568
|
+
* exists under `.ccqa/` — the ccqa equivalent of a hand-kept QA spreadsheet,
|
|
569
|
+
* but scoped deliberately to *facts about what is tested today*.
|
|
570
|
+
*
|
|
571
|
+
* It intentionally does NOT carry severity / importance / priority. Deciding
|
|
572
|
+
* "how badly does it hurt the customer if this breaks" is a human + PdM
|
|
573
|
+
* decision, not something ccqa should author or silently overwrite. Keeping
|
|
574
|
+
* those columns out of the schema (and `.strict()` rejecting them) makes the
|
|
575
|
+
* boundary explicit: perspectives is a factual stock-take, severity lives
|
|
576
|
+
* wherever the team decides on it.
|
|
577
|
+
*
|
|
578
|
+
* It also does NOT attempt code-vs-test gap analysis (listing untested
|
|
579
|
+
* areas). A flat dump of "things in code with no test" is noise without
|
|
580
|
+
* prioritisation; that is a separate, later concern.
|
|
581
|
+
*/
|
|
582
|
+
/**
|
|
583
|
+
* Whether the spec has been traced / generated. Both are derived mechanically
|
|
584
|
+
* by the CLI from on-disk artifacts (actions.json / test.spec.ts), never
|
|
585
|
+
* written by Claude — these are facts and must not drift.
|
|
586
|
+
*/
|
|
587
|
+
const PerspectiveStatusSchema = z.object({
|
|
588
|
+
traced: z.boolean(),
|
|
589
|
+
generated: z.boolean()
|
|
590
|
+
}).strict();
|
|
591
|
+
/**
|
|
592
|
+
* One test case in the inventory.
|
|
593
|
+
*
|
|
594
|
+
* - `title` / `relatedPaths` are transcribed verbatim from the spec.yaml.
|
|
595
|
+
* - `status` is mechanically derived (see PerspectiveStatusSchema).
|
|
596
|
+
* - `summary` is a 1–2 sentence description of *what the spec verifies*,
|
|
597
|
+
* derived from its steps by Claude.
|
|
598
|
+
* - `startScreen` / `testCondition` / `preconditions` mirror the columns a
|
|
599
|
+
* hand-kept QA table carries. They are Claude-derived from the spec's
|
|
600
|
+
* steps (the opening screen, the state the test assumes, and the setup
|
|
601
|
+
* prerequisites such as which role logs in). Optional: a spec may not
|
|
602
|
+
* express all of them.
|
|
603
|
+
* - `note` is a human-only field. Regenerating perspectives preserves it.
|
|
604
|
+
*
|
|
605
|
+
* The detailed test procedure and expected results are deliberately NOT
|
|
606
|
+
* duplicated here — the spec.yaml steps are the single source of truth for
|
|
607
|
+
* those. The Markdown view links back to the spec instead of restating them.
|
|
608
|
+
*/
|
|
609
|
+
const PerspectiveSpecSchema = z.object({
|
|
610
|
+
specName: z.string().min(1),
|
|
611
|
+
title: z.string().min(1),
|
|
612
|
+
summary: z.string(),
|
|
613
|
+
startScreen: z.string().optional(),
|
|
614
|
+
testCondition: z.string().optional(),
|
|
615
|
+
preconditions: z.array(z.string().min(1)).optional(),
|
|
616
|
+
relatedPaths: z.array(z.string().min(1)).optional(),
|
|
617
|
+
status: PerspectiveStatusSchema,
|
|
618
|
+
note: z.string().optional()
|
|
619
|
+
}).strict();
|
|
620
|
+
const PerspectiveFeatureSchema = z.object({
|
|
621
|
+
featureName: z.string().min(1),
|
|
622
|
+
specs: z.array(PerspectiveSpecSchema)
|
|
623
|
+
}).strict();
|
|
624
|
+
/** Top-level perspectives schema. `.strict()` rejects any unknown key. */
|
|
625
|
+
const PerspectivesSchema = z.object({
|
|
626
|
+
generatedAt: z.string().optional(),
|
|
627
|
+
features: z.array(PerspectiveFeatureSchema)
|
|
628
|
+
}).strict();
|
|
629
|
+
//#endregion
|
|
565
630
|
//#region src/types.ts
|
|
566
631
|
const RouteStepSchema = z.object({
|
|
567
632
|
title: z.string(),
|
|
@@ -633,7 +698,7 @@ const DraftIssueSchema = z.object({
|
|
|
633
698
|
]),
|
|
634
699
|
stepId: z.string().nullable(),
|
|
635
700
|
message: z.string(),
|
|
636
|
-
detail: z.string().
|
|
701
|
+
detail: z.string().nullish()
|
|
637
702
|
});
|
|
638
703
|
const DraftReportSchema = z.object({
|
|
639
704
|
issues: z.array(DraftIssueSchema),
|
|
@@ -1205,6 +1270,8 @@ function collectIncludedBlockNames(spec) {
|
|
|
1205
1270
|
//#region src/store/index.ts
|
|
1206
1271
|
const CCQA_DIR = ".ccqa";
|
|
1207
1272
|
const SPEC_FILE = "spec.yaml";
|
|
1273
|
+
const PERSPECTIVES_FILE = "perspectives.yaml";
|
|
1274
|
+
const PERSPECTIVES_MD_FILE = "perspectives.md";
|
|
1208
1275
|
function getCcqaDir(cwd = process.cwd()) {
|
|
1209
1276
|
return join(cwd, CCQA_DIR);
|
|
1210
1277
|
}
|
|
@@ -1250,6 +1317,56 @@ async function saveSpecFile(featureName, specName, content, cwd) {
|
|
|
1250
1317
|
await writeFile(specPath, content.endsWith("\n") ? content : content + "\n", "utf-8");
|
|
1251
1318
|
return specPath;
|
|
1252
1319
|
}
|
|
1320
|
+
/** Absolute path to the single repo-wide `.ccqa/perspectives.yaml`. */
|
|
1321
|
+
function getPerspectivesPath(cwd) {
|
|
1322
|
+
return join(getCcqaDir(cwd), PERSPECTIVES_FILE);
|
|
1323
|
+
}
|
|
1324
|
+
/**
|
|
1325
|
+
* Read `.ccqa/perspectives.yaml` raw. Returns null when the file does not
|
|
1326
|
+
* exist (first-ever generation) so callers can treat it as optional.
|
|
1327
|
+
*/
|
|
1328
|
+
async function tryReadPerspectives(cwd) {
|
|
1329
|
+
return readFile(getPerspectivesPath(cwd), "utf-8").catch(() => null);
|
|
1330
|
+
}
|
|
1331
|
+
/**
|
|
1332
|
+
* Write `.ccqa/perspectives.yaml`. Mirrors `saveSpecFile`: ensures the
|
|
1333
|
+
* directory exists and the content ends in a trailing newline.
|
|
1334
|
+
*/
|
|
1335
|
+
async function savePerspectives(content, cwd) {
|
|
1336
|
+
await mkdir(getCcqaDir(cwd), { recursive: true });
|
|
1337
|
+
const path = getPerspectivesPath(cwd);
|
|
1338
|
+
await writeFile(path, content.endsWith("\n") ? content : content + "\n", "utf-8");
|
|
1339
|
+
return path;
|
|
1340
|
+
}
|
|
1341
|
+
/**
|
|
1342
|
+
* Human-readable Markdown companion to perspectives.yaml. The `.yaml` is the
|
|
1343
|
+
* machine-readable source of truth; the `.md` is a rendered view for review.
|
|
1344
|
+
*/
|
|
1345
|
+
function getPerspectivesMarkdownPath(cwd) {
|
|
1346
|
+
return join(getCcqaDir(cwd), PERSPECTIVES_MD_FILE);
|
|
1347
|
+
}
|
|
1348
|
+
async function savePerspectivesMarkdown(content, cwd) {
|
|
1349
|
+
await mkdir(getCcqaDir(cwd), { recursive: true });
|
|
1350
|
+
const path = getPerspectivesMarkdownPath(cwd);
|
|
1351
|
+
await writeFile(path, content.endsWith("\n") ? content : content + "\n", "utf-8");
|
|
1352
|
+
return path;
|
|
1353
|
+
}
|
|
1354
|
+
/**
|
|
1355
|
+
* Per-category detail view: `.ccqa/features/<feature>/perspectives.md`. The
|
|
1356
|
+
* root `perspectives.md` is a thin category index that links here; this file
|
|
1357
|
+
* carries the full per-case tables for one feature. The feature dir already
|
|
1358
|
+
* exists (it holds the test cases), but `mkdir -p` keeps this safe when called
|
|
1359
|
+
* in isolation.
|
|
1360
|
+
*/
|
|
1361
|
+
function getFeaturePerspectivesMarkdownPath(featureName, cwd) {
|
|
1362
|
+
return join(getFeatureDir(featureName, cwd), PERSPECTIVES_MD_FILE);
|
|
1363
|
+
}
|
|
1364
|
+
async function saveFeaturePerspectivesMarkdown(featureName, content, cwd) {
|
|
1365
|
+
await mkdir(getFeatureDir(featureName, cwd), { recursive: true });
|
|
1366
|
+
const path = getFeaturePerspectivesMarkdownPath(featureName, cwd);
|
|
1367
|
+
await writeFile(path, content.endsWith("\n") ? content : content + "\n", "utf-8");
|
|
1368
|
+
return path;
|
|
1369
|
+
}
|
|
1253
1370
|
/**
|
|
1254
1371
|
* Replace (or insert) the `relatedPaths` key in the spec. Preserves every
|
|
1255
1372
|
* other top-level field and the entire steps array. Returns the absolute
|
|
@@ -2188,16 +2305,60 @@ function formatUnstableDrop(drop) {
|
|
|
2188
2305
|
return `${`${action.command}${action.assertType ? " " + action.assertType : ""}`}: contains unstable literal (${ids}) — ${samples}`;
|
|
2189
2306
|
}
|
|
2190
2307
|
//#endregion
|
|
2308
|
+
//#region src/prompts/language.ts
|
|
2309
|
+
/**
|
|
2310
|
+
* Shared language handling for every Claude-driven command. Each command
|
|
2311
|
+
* writes some human-readable text (drift findings, trace observations, draft
|
|
2312
|
+
* prose, diagnose hints, perspectives summaries), so the language policy is a
|
|
2313
|
+
* single cross-cutting concern rather than per-command logic.
|
|
2314
|
+
*
|
|
2315
|
+
* The value is a BCP-47 tag (e.g. "ja", "en") or the sentinel "auto". With
|
|
2316
|
+
* "auto" the model follows the language of the material it is given — Japanese
|
|
2317
|
+
* specs/codebase yield Japanese output — and `languageDirective` returns an
|
|
2318
|
+
* empty string so prompts stay byte-identical to the no-flag baseline.
|
|
2319
|
+
*/
|
|
2320
|
+
const DEFAULT_LANGUAGE = "auto";
|
|
2321
|
+
/**
|
|
2322
|
+
* The instruction appended to a command's system prompt. Empty for "auto"
|
|
2323
|
+
* (and undefined / blank), so the model keeps its natural material-following
|
|
2324
|
+
* behaviour; otherwise it pins every human-readable field to the given tag.
|
|
2325
|
+
*/
|
|
2326
|
+
function languageDirective(language) {
|
|
2327
|
+
const lang = (language ?? "auto").trim();
|
|
2328
|
+
if (lang === "" || lang === "auto") return "";
|
|
2329
|
+
return `\n\nIMPORTANT: Write every human-readable field, message, and explanation in **${lang}** (BCP-47 language tag), regardless of the language of the spec or codebase.`;
|
|
2330
|
+
}
|
|
2331
|
+
/**
|
|
2332
|
+
* Whether the CLI's own interactive prompts (the strings ccqa prints itself,
|
|
2333
|
+
* not the model's output) should be Japanese. Only an explicit Japanese tag
|
|
2334
|
+
* (`ja`, `ja-JP`, …) opts in; `auto` (the default) and every other tag keep
|
|
2335
|
+
* the English prompts, so an English user running with no flag is unaffected.
|
|
2336
|
+
*/
|
|
2337
|
+
function useJapanesePrompts(language) {
|
|
2338
|
+
return /^ja\b/i.test((language ?? "").trim());
|
|
2339
|
+
}
|
|
2340
|
+
//#endregion
|
|
2341
|
+
//#region src/cli/options.ts
|
|
2342
|
+
/**
|
|
2343
|
+
* Shared `--language` flag. Every Claude-driven command writes some
|
|
2344
|
+
* human-readable text, so language is a cross-cutting concern handled the same
|
|
2345
|
+
* way everywhere — much like `--model`. The value is a BCP-47 tag (e.g. "ja",
|
|
2346
|
+
* "en") or "auto" (default), which follows the language of the material.
|
|
2347
|
+
*/
|
|
2348
|
+
function addLanguageOption(command) {
|
|
2349
|
+
return command.option("--language <bcp47>", "Language for human-readable output (e.g. 'en', 'ja'). Default 'auto' follows the language of the spec/codebase.", DEFAULT_LANGUAGE);
|
|
2350
|
+
}
|
|
2351
|
+
//#endregion
|
|
2191
2352
|
//#region src/cli/trace.ts
|
|
2192
2353
|
const VALIDATION_MODES = ["lenient", "strict"];
|
|
2193
|
-
const traceCommand = new Command("trace").argument("<feature/spec>", "Spec id in '<feature>/<spec>' form (resolves to .ccqa/features/<feature>/test-cases/<spec>/)").description("Run agent-browser, verify assertions, and record structured actions").option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID. Overrides CCQA_MODEL.").option("--validation-mode <mode>", "Post-trace validation behaviour: 'lenient' (default) tags failing actions with a warning but keeps them; 'strict' drops them from actions.json.", (raw) => {
|
|
2354
|
+
const traceCommand = addLanguageOption(new Command("trace").argument("<feature/spec>", "Spec id in '<feature>/<spec>' form (resolves to .ccqa/features/<feature>/test-cases/<spec>/)").description("Run agent-browser, verify assertions, and record structured actions").option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID. Overrides CCQA_MODEL.").option("--validation-mode <mode>", "Post-trace validation behaviour: 'lenient' (default) tags failing actions with a warning but keeps them; 'strict' drops them from actions.json.", (raw) => {
|
|
2194
2355
|
if (VALIDATION_MODES.includes(raw)) return raw;
|
|
2195
2356
|
throw new Error(`--validation-mode must be one of ${VALIDATION_MODES.join(" | ")}`);
|
|
2196
|
-
}, "lenient").action(async (specPath, opts) => {
|
|
2357
|
+
}, "lenient")).action(async (specPath, opts) => {
|
|
2197
2358
|
const { featureName, specName } = parseSpecPath(specPath);
|
|
2198
|
-
await runTrace(featureName, specName, opts.model, opts.validationMode ?? "lenient");
|
|
2359
|
+
await runTrace(featureName, specName, opts.model, opts.validationMode ?? "lenient", opts.language);
|
|
2199
2360
|
});
|
|
2200
|
-
async function runTrace(featureName, specName, model, validationMode = "lenient") {
|
|
2361
|
+
async function runTrace(featureName, specName, model, validationMode = "lenient", language) {
|
|
2201
2362
|
header("trace", `${featureName}/${specName}`);
|
|
2202
2363
|
try {
|
|
2203
2364
|
meta("agent-browser", assertAgentBrowserAvailable());
|
|
@@ -2228,7 +2389,7 @@ async function runTrace(featureName, specName, model, validationMode = "lenient"
|
|
|
2228
2389
|
});
|
|
2229
2390
|
const userPrompt = await loadTraceUserPrompt();
|
|
2230
2391
|
if (userPrompt !== null) meta("user-prompt", ".ccqa/prompts/trace.user.md");
|
|
2231
|
-
const systemPrompt = userPrompt === null ? baseSystemPrompt : `${baseSystemPrompt}\n## Project-specific guidance\n\n${userPrompt}\n
|
|
2392
|
+
const systemPrompt = (userPrompt === null ? baseSystemPrompt : `${baseSystemPrompt}\n## Project-specific guidance\n\n${userPrompt}\n`) + languageDirective(language);
|
|
2232
2393
|
const prompt = buildTracePrompt(spec.title);
|
|
2233
2394
|
info("Running agent-browser session...");
|
|
2234
2395
|
blank();
|
|
@@ -3217,18 +3378,47 @@ function previewDiff(before, after) {
|
|
|
3217
3378
|
return out.join("\n");
|
|
3218
3379
|
}
|
|
3219
3380
|
//#endregion
|
|
3381
|
+
//#region src/prompts/format.ts
|
|
3382
|
+
/**
|
|
3383
|
+
* Formatting helpers shared by the Claude prompt builders (diagnose, report).
|
|
3384
|
+
* Centralised so the prompts cannot drift apart on mechanics that must stay
|
|
3385
|
+
* consistent across commands.
|
|
3386
|
+
*/
|
|
3387
|
+
/** Prefix every line with its 1-based number, the form fix suggestions cite. */
|
|
3388
|
+
function numberLines(script) {
|
|
3389
|
+
return script.split("\n").map((l, i) => `${i + 1}: ${l}`).join("\n");
|
|
3390
|
+
}
|
|
3391
|
+
/**
|
|
3392
|
+
* The "## Output language" prompt section. Empty for "auto" so the prompt
|
|
3393
|
+
* stays byte-identical to the no-flag baseline. `fields` names the
|
|
3394
|
+
* human-readable JSON fields to translate; `verbatimNames` names the
|
|
3395
|
+
* enum-like values that must never be translated.
|
|
3396
|
+
*/
|
|
3397
|
+
function outputLanguageBlock(outputLanguage, fields, verbatimNames) {
|
|
3398
|
+
if (outputLanguage === "auto") return "";
|
|
3399
|
+
return `## Output language
|
|
3400
|
+
|
|
3401
|
+
Write all human-readable fields (${fields}) in **${outputLanguage}** (BCP-47 tag).
|
|
3402
|
+
Selectors, file paths, identifiers, ${verbatimNames}, JSON keys, and quoted strings stay verbatim regardless of language.
|
|
3403
|
+
|
|
3404
|
+
`;
|
|
3405
|
+
}
|
|
3406
|
+
//#endregion
|
|
3220
3407
|
//#region src/diagnose/prompt.ts
|
|
3221
3408
|
function buildDiagnosePrompt(input) {
|
|
3222
|
-
const { script, specYaml, actions, failureLog, pageSnapshot, outputLanguage = "
|
|
3223
|
-
const numbered = script
|
|
3409
|
+
const { script, specYaml, actions, failureLog, pageSnapshot, outputLanguage = "auto" } = input;
|
|
3410
|
+
const numbered = numberLines(script);
|
|
3411
|
+
const actionsSummary = actions.map((a, i) => {
|
|
3412
|
+
const parts = [`${i + 1}. ${a.command}`];
|
|
3413
|
+
if (a.assertType) parts.push(`assertType="${a.assertType}"`);
|
|
3414
|
+
if (a.selector) parts.push(`selector="${a.selector}"`);
|
|
3415
|
+
if (a.value) parts.push(`value="${a.value}"`);
|
|
3416
|
+
if (a.observation) parts.push(`→ ${a.observation}`);
|
|
3417
|
+
return parts.join(" ");
|
|
3418
|
+
}).join("\n");
|
|
3224
3419
|
return `You are diagnosing a failing E2E test. The test was generated from a recorded trace of the original interaction. Compare the failing run against the original spec and recorded actions to determine WHY the test failed and what the right fix is.
|
|
3225
3420
|
|
|
3226
|
-
##
|
|
3227
|
-
|
|
3228
|
-
Write all human-readable fields (\`reasoning\`, \`reason\`) in **${outputLanguage}** (BCP-47 tag).
|
|
3229
|
-
Selectors, file paths, identifiers, code, type names (TIMING_ISSUE, etc.), JSON keys, and quoted strings stay verbatim regardless of language.
|
|
3230
|
-
|
|
3231
|
-
## You have read-only filesystem tools
|
|
3421
|
+
${outputLanguageBlock(outputLanguage, "`reasoning`, `reason`", "code, type names (TIMING_ISSUE, etc.)")}## You have read-only filesystem tools
|
|
3232
3422
|
|
|
3233
3423
|
You can call \`Grep\`, \`Glob\`, and \`Read\` against the current repository before producing the JSON.
|
|
3234
3424
|
|
|
@@ -3317,14 +3507,7 @@ Pick exactly ONE category. The output JSON must follow the shape for that catego
|
|
|
3317
3507
|
${specYaml}
|
|
3318
3508
|
|
|
3319
3509
|
## Recorded Actions (actions.json summary)
|
|
3320
|
-
${
|
|
3321
|
-
const parts = [`${i + 1}. ${a.command}`];
|
|
3322
|
-
if (a.assertType) parts.push(`assertType="${a.assertType}"`);
|
|
3323
|
-
if (a.selector) parts.push(`selector="${a.selector}"`);
|
|
3324
|
-
if (a.value) parts.push(`value="${a.value}"`);
|
|
3325
|
-
if (a.observation) parts.push(`→ ${a.observation}`);
|
|
3326
|
-
return parts.join(" ");
|
|
3327
|
-
}).join("\n")}
|
|
3510
|
+
${actionsSummary}
|
|
3328
3511
|
|
|
3329
3512
|
## Test Script (with line numbers)
|
|
3330
3513
|
${numbered}
|
|
@@ -3901,11 +4084,11 @@ function resolveMode(opts) {
|
|
|
3901
4084
|
}
|
|
3902
4085
|
//#endregion
|
|
3903
4086
|
//#region src/cli/generate.ts
|
|
3904
|
-
const generateCommand = new Command("generate").argument("<feature/spec>", "Spec id in '<feature>/<spec>' form (resolves to .ccqa/features/<feature>/test-cases/<spec>/)").description("Generate agent-browser test script from recorded trace actions. test.spec.ts is regenerated from actions.json on every run; pass --force to overwrite manual edits.").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").option("--auto", "Apply auto-fixes without confirmation regardless of confidence (CI use)").option("--no-interactive", "Never prompt; only auto-apply when confidence is high, otherwise give up").option("--force", "Overwrite an existing test.spec.ts without warning").option("--no-snapshot", "Don't pin AGENT_BROWSER_SESSION / capture page snapshots after a failure (debug toggle)").option("
|
|
4087
|
+
const generateCommand = addLanguageOption(new Command("generate").argument("<feature/spec>", "Spec id in '<feature>/<spec>' form (resolves to .ccqa/features/<feature>/test-cases/<spec>/)").description("Generate agent-browser test script from recorded trace actions. test.spec.ts is regenerated from actions.json on every run; pass --force to overwrite manual edits.").option("--max-retries <n>", "Maximum number of auto-fix retries", "3").option("--auto", "Apply auto-fixes without confirmation regardless of confidence (CI use)").option("--no-interactive", "Never prompt; only auto-apply when confidence is high, otherwise give up").option("--force", "Overwrite an existing test.spec.ts without warning").option("--no-snapshot", "Don't pin AGENT_BROWSER_SESSION / capture page snapshots after a failure (debug toggle)").option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID. Overrides CCQA_MODEL.")).action(async (specPath, opts) => {
|
|
3905
4088
|
const { featureName, specName } = parseSpecPath(specPath);
|
|
3906
4089
|
const mode = resolveMode(opts);
|
|
3907
4090
|
const useSnapshot = opts.snapshot !== false;
|
|
3908
|
-
await runGenerate(featureName, specName, parseInt(opts.maxRetries, 10), mode, opts.force ?? false, useSnapshot, opts.language ?? "
|
|
4091
|
+
await runGenerate(featureName, specName, parseInt(opts.maxRetries, 10), mode, opts.force ?? false, useSnapshot, opts.language ?? "auto", opts.model);
|
|
3909
4092
|
});
|
|
3910
4093
|
async function runGenerate(featureName, specName, maxRetries, mode, force, useSnapshot, outputLanguage, model) {
|
|
3911
4094
|
header("generate", `${featureName}/${specName}`);
|
|
@@ -4395,7 +4578,7 @@ const DEFAULT_CONCURRENCY$1 = 3;
|
|
|
4395
4578
|
* `cli/run` calls this with just the failing specs after vitest.
|
|
4396
4579
|
*/
|
|
4397
4580
|
async function analyzeDrift(input) {
|
|
4398
|
-
const { targets, cwd, blocks, concurrency = DEFAULT_CONCURRENCY$1, model, onSpecStart } = input;
|
|
4581
|
+
const { targets, cwd, blocks, concurrency = DEFAULT_CONCURRENCY$1, model, language, onSpecStart } = input;
|
|
4399
4582
|
const results = new Array(targets.length);
|
|
4400
4583
|
let cursor = 0;
|
|
4401
4584
|
const worker = async () => {
|
|
@@ -4407,7 +4590,8 @@ async function analyzeDrift(input) {
|
|
|
4407
4590
|
results[idx] = await checkSpec(target, {
|
|
4408
4591
|
cwd,
|
|
4409
4592
|
blocks,
|
|
4410
|
-
model
|
|
4593
|
+
model,
|
|
4594
|
+
language
|
|
4411
4595
|
});
|
|
4412
4596
|
}
|
|
4413
4597
|
};
|
|
@@ -4426,7 +4610,7 @@ async function checkSpec(target, opts) {
|
|
|
4426
4610
|
};
|
|
4427
4611
|
const { result, isError } = await invokeClaudeStreaming({
|
|
4428
4612
|
prompt: buildDriftUserPrompt(existing),
|
|
4429
|
-
systemPrompt: buildDriftSystemPrompt(opts.blocks),
|
|
4613
|
+
systemPrompt: buildDriftSystemPrompt(opts.blocks) + languageDirective(opts.language),
|
|
4430
4614
|
allowedTools: [
|
|
4431
4615
|
"Read",
|
|
4432
4616
|
"Grep",
|
|
@@ -4467,165 +4651,1187 @@ async function checkSpec(target, opts) {
|
|
|
4467
4651
|
};
|
|
4468
4652
|
}
|
|
4469
4653
|
//#endregion
|
|
4470
|
-
//#region src/drift/
|
|
4654
|
+
//#region src/drift/affected.ts
|
|
4655
|
+
const execFileP = promisify(execFile);
|
|
4471
4656
|
/**
|
|
4472
|
-
*
|
|
4473
|
-
*
|
|
4474
|
-
* they can prefix / interleave / pipe it as needed.
|
|
4657
|
+
* Resolve the base ref to diff against for `ccqa drift --changed`.
|
|
4658
|
+
* Precedence: explicit override > GITHUB_BASE_REF > origin/main.
|
|
4475
4659
|
*/
|
|
4476
|
-
function
|
|
4477
|
-
if (
|
|
4478
|
-
|
|
4479
|
-
return
|
|
4660
|
+
function resolveBaseRef(explicit) {
|
|
4661
|
+
if (explicit && explicit.length > 0) return explicit;
|
|
4662
|
+
const ghBase = process.env["GITHUB_BASE_REF"];
|
|
4663
|
+
if (ghBase && ghBase.length > 0) return ghBase.startsWith("origin/") ? ghBase : `origin/${ghBase}`;
|
|
4664
|
+
return "origin/main";
|
|
4480
4665
|
}
|
|
4481
|
-
|
|
4482
|
-
|
|
4666
|
+
/**
|
|
4667
|
+
* Run `git diff --name-status base...HEAD` from `cwd` and return one entry per
|
|
4668
|
+
* changed file. Renames are reported under their NEW path with status
|
|
4669
|
+
* "renamed" — the OLD path is dropped because the spec mapping is against the
|
|
4670
|
+
* post-rename layout.
|
|
4671
|
+
*
|
|
4672
|
+
* Paths are re-rooted to be relative to `cwd`, not the git repo root. In a
|
|
4673
|
+
* monorepo where `cwd` is a sub-package (e.g. `apps/foo`), git emits paths
|
|
4674
|
+
* relative to the repo root, but specs declare relatedPaths relative to
|
|
4675
|
+
* their own package. Changes outside `cwd` are dropped so an unrelated PR
|
|
4676
|
+
* can never accidentally scope a sub-package's specs in.
|
|
4677
|
+
*/
|
|
4678
|
+
async function getChangedFiles(base, cwd) {
|
|
4679
|
+
const [{ stdout: rootOut }, { stdout: diffOut }] = await Promise.all([execFileP("git", ["rev-parse", "--show-toplevel"], { cwd }), execFileP("git", [
|
|
4680
|
+
"diff",
|
|
4681
|
+
"--name-status",
|
|
4682
|
+
"-M",
|
|
4683
|
+
`${base}...HEAD`
|
|
4684
|
+
], {
|
|
4685
|
+
cwd,
|
|
4686
|
+
maxBuffer: 32 * 1024 * 1024
|
|
4687
|
+
})]);
|
|
4688
|
+
return rerootChangedFiles(parseGitDiffOutput(diffOut), rootOut.trim(), cwd);
|
|
4689
|
+
}
|
|
4690
|
+
/**
|
|
4691
|
+
* Convert paths in `entries` from git-repo-root relative to `cwd` relative,
|
|
4692
|
+
* dropping anything outside `cwd`. Exported for unit tests.
|
|
4693
|
+
*/
|
|
4694
|
+
function rerootChangedFiles(entries, repoRoot, cwd) {
|
|
4695
|
+
const prefix = relative(repoRoot, cwd);
|
|
4696
|
+
if (!prefix) return entries;
|
|
4483
4697
|
const out = [];
|
|
4484
|
-
for (const
|
|
4485
|
-
|
|
4486
|
-
|
|
4487
|
-
|
|
4488
|
-
|
|
4489
|
-
|
|
4490
|
-
|
|
4698
|
+
for (const e of entries) {
|
|
4699
|
+
const rel = relative(prefix, e.path);
|
|
4700
|
+
if (rel.startsWith("..") || rel === "") continue;
|
|
4701
|
+
out.push({
|
|
4702
|
+
...e,
|
|
4703
|
+
path: rel
|
|
4704
|
+
});
|
|
4705
|
+
}
|
|
4706
|
+
return out;
|
|
4707
|
+
}
|
|
4708
|
+
function parseGitDiffOutput(stdout) {
|
|
4709
|
+
const out = [];
|
|
4710
|
+
for (const line of stdout.split("\n")) {
|
|
4711
|
+
if (!line.trim()) continue;
|
|
4712
|
+
const parts = line.split(" ");
|
|
4713
|
+
const code = parts[0];
|
|
4714
|
+
if (!code) continue;
|
|
4715
|
+
if (code.startsWith("R")) {
|
|
4716
|
+
const newPath = parts[2];
|
|
4717
|
+
if (newPath) out.push({
|
|
4718
|
+
path: newPath,
|
|
4719
|
+
status: "renamed"
|
|
4720
|
+
});
|
|
4491
4721
|
continue;
|
|
4492
4722
|
}
|
|
4493
|
-
|
|
4494
|
-
|
|
4495
|
-
|
|
4496
|
-
|
|
4497
|
-
|
|
4498
|
-
|
|
4499
|
-
out.push(` ✓ ${detail}`);
|
|
4723
|
+
if (code.startsWith("C")) {
|
|
4724
|
+
const newPath = parts[2];
|
|
4725
|
+
if (newPath) out.push({
|
|
4726
|
+
path: newPath,
|
|
4727
|
+
status: "added"
|
|
4728
|
+
});
|
|
4500
4729
|
continue;
|
|
4501
4730
|
}
|
|
4502
|
-
|
|
4503
|
-
|
|
4504
|
-
|
|
4505
|
-
|
|
4506
|
-
|
|
4507
|
-
|
|
4731
|
+
const path = parts[1];
|
|
4732
|
+
if (!path) continue;
|
|
4733
|
+
switch (code[0]) {
|
|
4734
|
+
case "A":
|
|
4735
|
+
out.push({
|
|
4736
|
+
path,
|
|
4737
|
+
status: "added"
|
|
4738
|
+
});
|
|
4739
|
+
break;
|
|
4740
|
+
case "M":
|
|
4741
|
+
case "T":
|
|
4742
|
+
out.push({
|
|
4743
|
+
path,
|
|
4744
|
+
status: "modified"
|
|
4745
|
+
});
|
|
4746
|
+
break;
|
|
4747
|
+
case "D":
|
|
4748
|
+
out.push({
|
|
4749
|
+
path,
|
|
4750
|
+
status: "deleted"
|
|
4751
|
+
});
|
|
4752
|
+
break;
|
|
4753
|
+
default: out.push({
|
|
4754
|
+
path,
|
|
4755
|
+
status: "modified"
|
|
4756
|
+
});
|
|
4508
4757
|
}
|
|
4509
4758
|
}
|
|
4510
|
-
out
|
|
4511
|
-
out.push(HEAVY_RULE);
|
|
4512
|
-
const totals = summarize(results);
|
|
4513
|
-
out.push(` specs ${results.length} (${totals.errored} errored)`);
|
|
4514
|
-
out.push(` findings ${totals.error} error, ${totals.warn} warn, ${totals.ok} ok`);
|
|
4515
|
-
out.push("");
|
|
4516
|
-
return out.join("\n");
|
|
4759
|
+
return out;
|
|
4517
4760
|
}
|
|
4518
|
-
function
|
|
4519
|
-
|
|
4520
|
-
out.push("");
|
|
4521
|
-
out.push(` ${level} ${DRAFT_CATEGORY_LABEL[issue.category]}${stepPart}`);
|
|
4522
|
-
out.push(` ${issue.message}`);
|
|
4523
|
-
if (issue.detail) out.push(` └ ${issue.detail.replace(/\n/g, "\n ")}`);
|
|
4761
|
+
function stripLeadingDotSlash(s) {
|
|
4762
|
+
return s.startsWith("./") ? s.slice(2) : s;
|
|
4524
4763
|
}
|
|
4525
|
-
|
|
4526
|
-
|
|
4527
|
-
|
|
4528
|
-
|
|
4529
|
-
|
|
4530
|
-
|
|
4531
|
-
|
|
4532
|
-
|
|
4533
|
-
category: i.category,
|
|
4534
|
-
stepId: i.stepId,
|
|
4535
|
-
message: i.message,
|
|
4536
|
-
...i.detail ? { detail: i.detail } : {}
|
|
4537
|
-
}))
|
|
4538
|
-
})) };
|
|
4539
|
-
return `${JSON.stringify(payload, null, 2)}\n`;
|
|
4764
|
+
const REGEX_CACHE = /* @__PURE__ */ new Map();
|
|
4765
|
+
/** Compiles `pattern` to a RegExp, memoized so repeated `--changed` matches don't re-build. */
|
|
4766
|
+
function compileGlob(pattern) {
|
|
4767
|
+
const cached = REGEX_CACHE.get(pattern);
|
|
4768
|
+
if (cached) return cached;
|
|
4769
|
+
const compiled = globToRegExp(stripLeadingDotSlash(pattern));
|
|
4770
|
+
REGEX_CACHE.set(pattern, compiled);
|
|
4771
|
+
return compiled;
|
|
4540
4772
|
}
|
|
4541
|
-
function
|
|
4542
|
-
|
|
4543
|
-
|
|
4544
|
-
|
|
4545
|
-
const
|
|
4546
|
-
if (
|
|
4547
|
-
|
|
4548
|
-
|
|
4549
|
-
|
|
4550
|
-
for (const issue of r.issues) {
|
|
4551
|
-
if (issue.severity === "OK") continue;
|
|
4552
|
-
const level = issue.severity === "ERROR" ? "error" : "warning";
|
|
4553
|
-
const title = `${r.target.featureName}/${r.target.specName} — ${issue.category}${issue.stepId ? ` (${issue.stepId})` : ""}`;
|
|
4554
|
-
const body = issue.detail ? `${issue.message}\n${issue.detail}` : issue.message;
|
|
4555
|
-
lines.push(`::${level} file=${file},title=${escapeGhProp(title)}::${escapeGhMessage(body)}`);
|
|
4773
|
+
/**
 * Compile a glob pattern into an anchored RegExp over "/"-separated paths.
 *
 * Supported tokens:
 * - `?`  matches exactly one character other than "/".
 * - `*`  matches any run of characters other than "/".
 * - `**` matches across "/" boundaries. When it stands for whole path
 *   segments (`a/**`, `a/**​/b`, `**​/b`) it matches zero or more complete
 *   segments, so `src/**` covers `src` and `src/x/y` but not `src-old/x`,
 *   and `src/**​/foo.ts` covers `src/foo.ts` but not `src/barfoo.ts`.
 * Every other character is matched literally (regex metacharacters are
 * escaped).
 *
 * @param {string} pattern glob pattern
 * @returns {RegExp} regex anchored at both ends
 */
function globToRegExp(pattern) {
	let re = "^";
	let i = 0;
	while (i < pattern.length) {
		const ch = pattern[i];
		if (ch === "?") {
			re += "[^/]";
			i++;
			continue;
		}
		if (ch !== "*") {
			re += /[.+^${}()|[\]\\]/.test(ch) ? "\\" + ch : ch;
			i++;
			continue;
		}
		if (pattern[i + 1] !== "*") {
			re += "[^/]*";
			i++;
			continue;
		}
		const afterStars = i + 2;
		const hasLeadingSlash = re.endsWith("/");
		const hasTrailingSlash = pattern[afterStars] === "/";
		if (hasLeadingSlash && hasTrailingSlash) {
			// `a/**/b`: zero or more whole segments between the two slashes.
			// The separator itself stays mandatory so `a/xb` cannot match.
			re = re.slice(0, -1) + "(?:/.*)?/";
			i += 3;
		} else if (hasLeadingSlash && afterStars === pattern.length) {
			// `a/**`: the directory itself or anything beneath it.
			re = re.slice(0, -1) + "(?:/.*)?";
			i += 2;
		} else if (hasTrailingSlash && re === "^") {
			// `**/b`: `b` at any depth, including the top level.
			re += "(?:.*/)?";
			i += 3;
		} else {
			// `**` glued to other characters: plain "anything" wildcard; any
			// adjacent slash is kept as a literal separator.
			re += ".*";
			i += 2;
		}
	}
	return new RegExp(re + "$");
}
|
|
4589
|
-
//#endregion
|
|
4590
|
-
//#region src/drift/exit-code.ts
|
|
4591
4802
|
/**
|
|
4592
|
-
*
|
|
4593
|
-
*
|
|
4594
|
-
* the
|
|
4803
|
+
* Returns true if `changedPath` is covered by any of `relatedPaths`. An empty
|
|
4804
|
+
* `relatedPaths` returns false — callers handle the "unscoped spec" case
|
|
4805
|
+
* separately (treat the spec as always-affected) before calling this.
|
|
4595
4806
|
*/
|
|
4596
|
-
function
|
|
4597
|
-
|
|
4598
|
-
|
|
4599
|
-
|
|
4600
|
-
if (issue.severity === "ERROR") return 1;
|
|
4601
|
-
if (threshold === "warn" && issue.severity === "WARN") return 1;
|
|
4602
|
-
}
|
|
4603
|
-
}
|
|
4604
|
-
return 0;
|
|
4807
|
+
function isPathAffectedBy(changedPath, relatedPaths) {
|
|
4808
|
+
const stripped = stripLeadingDotSlash(changedPath);
|
|
4809
|
+
for (const pattern of relatedPaths) if (compileGlob(pattern).test(stripped)) return true;
|
|
4810
|
+
return false;
|
|
4605
4811
|
}
|
|
4606
4812
|
//#endregion
|
|
4607
4813
|
//#region src/drift/auth.ts
|
|
4608
4814
|
/**
|
|
4609
4815
|
* Probe whether the host has any credential the Anthropic SDK can pick up:
|
|
4610
4816
|
* 1. ANTHROPIC_API_KEY env var (CI / scripted use)
|
|
4611
|
-
* 2. ~/.claude/.credentials.json (
|
|
4817
|
+
* 2. ~/.claude/.credentials.json (Claude Code login, file-based platforms)
|
|
4818
|
+
* 3. macOS Keychain item "Claude Code-credentials" (Claude Code login on
|
|
4819
|
+
* darwin stores the OAuth credentials in the Keychain, not on disk)
|
|
4612
4820
|
*
|
|
4613
|
-
*
|
|
4614
|
-
* user has asked for
|
|
4615
|
-
* that surfaces as "
|
|
4821
|
+
* Claude-driven hooks are opt-in, so the caller only consults this after the
|
|
4822
|
+
* user has asked for analysis. We never throw — auth absence is a normal flow
|
|
4823
|
+
* that surfaces as "analysis skipped".
|
|
4616
4824
|
*/
|
|
4617
4825
|
function driftAuthAvailable() {
|
|
4618
4826
|
const key = process.env["ANTHROPIC_API_KEY"];
|
|
4619
4827
|
if (typeof key === "string" && key.length > 0) return { ok: true };
|
|
4620
4828
|
if (existsSync(join(homedir(), ".claude", ".credentials.json"))) return { ok: true };
|
|
4829
|
+
if (process.platform === "darwin" && keychainHasClaudeCredentials()) return { ok: true };
|
|
4621
4830
|
return {
|
|
4622
4831
|
ok: false,
|
|
4623
4832
|
reason: "no ANTHROPIC_API_KEY / claude login"
|
|
4624
4833
|
};
|
|
4625
4834
|
}
|
|
4835
|
+
/**
|
|
4836
|
+
* `security find-generic-password` without `-w` only checks the item's
|
|
4837
|
+
* existence (exit 0) — it never reads the secret, so no Keychain unlock
|
|
4838
|
+
* prompt is triggered. Resolved via PATH so tests can stub the binary.
|
|
4839
|
+
*/
|
|
4840
|
+
function keychainHasClaudeCredentials() {
|
|
4841
|
+
try {
|
|
4842
|
+
return spawnSync("security", [
|
|
4843
|
+
"find-generic-password",
|
|
4844
|
+
"-s",
|
|
4845
|
+
"Claude Code-credentials"
|
|
4846
|
+
], {
|
|
4847
|
+
stdio: "ignore",
|
|
4848
|
+
timeout: 3e3
|
|
4849
|
+
}).status === 0;
|
|
4850
|
+
} catch {
|
|
4851
|
+
return false;
|
|
4852
|
+
}
|
|
4853
|
+
}
|
|
4854
|
+
//#endregion
|
|
4855
|
+
//#region src/report/prompt.ts
|
|
4856
|
+
function buildFailureAnalysisPrompt(input) {
|
|
4857
|
+
const { script, specYaml, failureLog, diffPatch, changedFiles, baseRef, driftIssues, outputLanguage = "auto" } = input;
|
|
4858
|
+
const numbered = numberLines(script);
|
|
4859
|
+
return `You are analyzing a failing E2E regression test right after a source change landed. Your job is a root-cause CALL, not a fix: decide which of three categories explains the failure, using the source diff as your primary context.
|
|
4860
|
+
|
|
4861
|
+
${outputLanguageBlock(outputLanguage, "`reasoning`, `detail`", "label names (TEST_DRIFT, etc.)")}## The three categories
|
|
4862
|
+
|
|
4863
|
+
The question that separates them: **is the behavior the spec describes still what the product intends?**
|
|
4864
|
+
|
|
4865
|
+
1. TEST_DRIFT — what the spec verifies is unchanged; only the test code drifted from the source. Typical: a selector/aria-label/placeholder rename, a timing change, an over-tight assertion. The diff shows a change that is invisible to the user's intent but visible to the test.
|
|
4866
|
+
2. SPEC_CHANGE — the thing being verified itself changed: the UI flow, the layout, the feature's intended behavior. The diff deliberately changes what the spec asserts. You MUST cite the diff hunk (file + what changed) as evidence for this label.
|
|
4867
|
+
3. PRODUCT_BUG — neither of the above: the failure is not explained by the diff nor by test staleness. The product regressed.
|
|
4868
|
+
|
|
4869
|
+
If the evidence is too weak to choose, answer UNKNOWN — a wrong confident call is worse than an honest UNKNOWN, because humans grade these predictions to measure accuracy.
|
|
4870
|
+
|
|
4871
|
+
## You have read-only filesystem tools
|
|
4872
|
+
|
|
4873
|
+
You can call \`Grep\`, \`Glob\`, and \`Read\` against the current repository (post-change state) before producing the JSON. Use them to:
|
|
4874
|
+
- confirm a suspected selector rename (grep for \`aria-label=\`, \`placeholder=\`, \`data-testid\`, i18n strings),
|
|
4875
|
+
- read the changed files in full when the truncated patch is not enough,
|
|
4876
|
+
- check whether the element/flow the spec describes still exists in the source.
|
|
4877
|
+
|
|
4878
|
+
You have **up to 12 tool turns**. Do NOT write, edit, run shell commands, or hit the network.
|
|
4879
|
+
|
|
4880
|
+
## Decision guidance
|
|
4881
|
+
|
|
4882
|
+
- Diff touches only attributes/identifiers the test selects on (labels, testids, class names, timing) while the user-visible flow is intact → TEST_DRIFT.
|
|
4883
|
+
- Diff intentionally removes/reworks the UI or flow that a spec step verifies (component deleted, page restructured, copy redefined, feature flag flipped) → SPEC_CHANGE.
|
|
4884
|
+
- Diff UNINTENTIONALLY breaks behavior the spec still intends — e.g. a refactor that drops a side effect, an inverted condition, a regression hiding inside a cleanup commit — → PRODUCT_BUG, citing the diff hunk as evidence. A product bug is often introduced BY the diff; what separates it from SPEC_CHANGE is intent: does the change read as a deliberate redesign of what the spec verifies, or as collateral damage?
|
|
4885
|
+
- Diff is unrelated to the failing step (or there is no relevant diff) and the test was passing before → lean PRODUCT_BUG; first rule out timing/data flakiness and infrastructure errors (daemon not running, network down, missing credentials) — those read as UNKNOWN with low confidence, not PRODUCT_BUG.
|
|
4886
|
+
- The drift audit findings (when present) flag spec↔code mismatches; an ERROR there usually supports TEST_DRIFT or SPEC_CHANGE over PRODUCT_BUG.
|
|
4887
|
+
|
|
4888
|
+
## Sub-diagnosis vocabulary
|
|
4889
|
+
|
|
4890
|
+
Alongside the label, report the closest fine-grained mechanic:
|
|
4891
|
+
- SELECTOR_DRIFT, TIMING_ISSUE, OVER_ASSERTION — usually under TEST_DRIFT
|
|
4892
|
+
- DATA_MISSING — missing test data/state; usually UNKNOWN or PRODUCT_BUG depending on cause
|
|
4893
|
+
- NONE — when nothing fits (typical for SPEC_CHANGE and PRODUCT_BUG)
|
|
4894
|
+
|
|
4895
|
+
## Output
|
|
4896
|
+
|
|
4897
|
+
Your **final** assistant message must start with \`{\` and end with \`}\` — a single JSON object, nothing before or after. No prose preamble, no markdown fences, no tool calls in the same turn.
|
|
4898
|
+
|
|
4899
|
+
{
|
|
4900
|
+
"label": "TEST_DRIFT" | "SPEC_CHANGE" | "PRODUCT_BUG" | "UNKNOWN",
|
|
4901
|
+
"confidence": <0.0-1.0>,
|
|
4902
|
+
"subDiagnosis": "SELECTOR_DRIFT" | "TIMING_ISSUE" | "OVER_ASSERTION" | "DATA_MISSING" | "NONE",
|
|
4903
|
+
"evidence": [
|
|
4904
|
+
{ "file": "<file:line or diff hunk reference, omit if log-only>", "detail": "<what this shows>" }
|
|
4905
|
+
],
|
|
4906
|
+
"reasoning": "<why this label, citing the evidence>"
|
|
4907
|
+
}
|
|
4908
|
+
|
|
4909
|
+
## Confidence guidance
|
|
4910
|
+
|
|
4911
|
+
- 0.9-1.0: the diff (or a file you read) directly shows the cause
|
|
4912
|
+
- 0.7-0.9: strong indirect evidence
|
|
4913
|
+
- 0.4-0.7: plausible but another category could explain it
|
|
4914
|
+
- < 0.4: answer UNKNOWN instead of guessing
|
|
4915
|
+
|
|
4916
|
+
Evidence rules: TEST_DRIFT and SPEC_CHANGE require at least one concrete \`file\` reference (diff hunk or file:line you actually read). PRODUCT_BUG should explain why the diff does NOT account for the failure.
|
|
4917
|
+
|
|
4918
|
+
## Test Spec (spec.yaml)
|
|
4919
|
+
${specYaml}
|
|
4920
|
+
|
|
4921
|
+
## Test Script (with line numbers)
|
|
4922
|
+
${numbered}
|
|
4923
|
+
|
|
4924
|
+
${diffPatch ? `## Source changes since ${baseRef ?? "base"} (git diff, may be truncated)
|
|
4925
|
+
|
|
4926
|
+
### Changed files (name-status)
|
|
4927
|
+
${changedFiles ?? "(unavailable)"}
|
|
4928
|
+
|
|
4929
|
+
### Patch
|
|
4930
|
+
\`\`\`diff
|
|
4931
|
+
${diffPatch}
|
|
4932
|
+
\`\`\`
|
|
4933
|
+
` : `## Source changes
|
|
4934
|
+
|
|
4935
|
+
No diff context is available (the base ref could not be resolved, or there are no changes). Classify from the failure log, the spec, and what you can read in the repository — and be correspondingly more conservative: prefer UNKNOWN over a confident SPEC_CHANGE/PRODUCT_BUG call without diff evidence.
|
|
4936
|
+
`}
|
|
4937
|
+
${driftIssues && driftIssues.length > 0 ? `## Spec↔code drift audit findings
|
|
4938
|
+
|
|
4939
|
+
A separate read-only audit compared the spec against the current source. Treat these as hints, not verdicts:
|
|
4940
|
+
|
|
4941
|
+
${driftIssues.map((i) => `- [${i.severity}] (${DRAFT_CATEGORY_LABEL[i.category]}${i.stepId ? `, step ${i.stepId}` : ""}) ${i.message}${i.detail ? ` — ${i.detail}` : ""}`).join("\n")}
|
|
4942
|
+
` : ""}## Failure Log
|
|
4943
|
+
${failureLog.slice(0, 8e3)}`;
|
|
4944
|
+
}
|
|
4945
|
+
//#endregion
|
|
4946
|
+
//#region src/diagnose/types.ts
|
|
4947
|
+
/**
|
|
4948
|
+
* The concrete (fixable) diagnosis tags as a value, for consumers that need
|
|
4949
|
+
* to enumerate them (e.g. the run report's subDiagnosis vocabulary). The
|
|
4950
|
+
* `satisfies` clause makes renaming a union member without updating this
|
|
4951
|
+
* list a compile error.
|
|
4952
|
+
*/
|
|
4953
|
+
const FIXABLE_DIAGNOSIS_TYPES = [
|
|
4954
|
+
"SELECTOR_DRIFT",
|
|
4955
|
+
"TIMING_ISSUE",
|
|
4956
|
+
"OVER_ASSERTION",
|
|
4957
|
+
"DATA_MISSING"
|
|
4958
|
+
];
|
|
4959
|
+
//#endregion
|
|
4960
|
+
//#region src/report/schema.ts
|
|
4961
|
+
/**
|
|
4962
|
+
* The three-way root-cause call for a failing spec, framed as drift analysis:
|
|
4963
|
+
* - TEST_DRIFT: what the spec verifies is unchanged; only the test code
|
|
4964
|
+
* drifted from the source (selector rename, timing, ...).
|
|
4965
|
+
* Future iterations may auto-fix these.
|
|
4966
|
+
* - SPEC_CHANGE: the thing being verified itself changed (UI redesign,
|
|
4967
|
+
* spec change). Never auto-fix — a human must re-draft.
|
|
4968
|
+
* - PRODUCT_BUG: neither of the above explains the failure — treat it as
|
|
4969
|
+
* a product regression.
|
|
4970
|
+
*
|
|
4971
|
+
* The stakeholder ask behind this module is measurement-first: the call is
|
|
4972
|
+
* known to be hard, so every prediction is embedded in the HTML report where
|
|
4973
|
+
* a human records the ground truth and the report computes the confusion
|
|
4974
|
+
* matrix client-side. Accuracy may start low; it must be *visible*.
|
|
4975
|
+
*/
|
|
4976
|
+
const FAILURE_LABELS = [
|
|
4977
|
+
"TEST_DRIFT",
|
|
4978
|
+
"SPEC_CHANGE",
|
|
4979
|
+
"PRODUCT_BUG"
|
|
4980
|
+
];
|
|
4981
|
+
const FailureLabelSchema = z.enum(FAILURE_LABELS);
|
|
4982
|
+
/** What the model may answer: the three labels, or UNKNOWN when evidence is weak. */
|
|
4983
|
+
const PREDICTED_LABELS = [...FAILURE_LABELS, "UNKNOWN"];
|
|
4984
|
+
const PredictedLabelSchema = z.enum(PREDICTED_LABELS);
|
|
4985
|
+
const SUB_DIAGNOSES = [...FIXABLE_DIAGNOSIS_TYPES, "NONE"];
|
|
4986
|
+
const FailureEvidenceSchema = z.object({
|
|
4987
|
+
file: z.string().optional(),
|
|
4988
|
+
detail: z.string()
|
|
4989
|
+
});
|
|
4990
|
+
/**
|
|
4991
|
+
* LLM output shape. Deliberately NOT .strict(): the model occasionally adds
|
|
4992
|
+
* keys, and rejecting the whole analysis over an extra field would collapse
|
|
4993
|
+
* a usable prediction into UNKNOWN. Zod's default strips unknown keys.
|
|
4994
|
+
*/
|
|
4995
|
+
const FailureAnalysisSchema = z.object({
|
|
4996
|
+
label: PredictedLabelSchema,
|
|
4997
|
+
confidence: z.number().min(0).max(1),
|
|
4998
|
+
subDiagnosis: z.enum(SUB_DIAGNOSES).optional(),
|
|
4999
|
+
evidence: z.array(FailureEvidenceSchema),
|
|
5000
|
+
reasoning: z.string()
|
|
5001
|
+
});
|
|
5002
|
+
const ReportAssertionSchema = z.object({
|
|
5003
|
+
name: z.string(),
|
|
5004
|
+
status: z.enum([
|
|
5005
|
+
"passed",
|
|
5006
|
+
"failed",
|
|
5007
|
+
"skipped"
|
|
5008
|
+
]),
|
|
5009
|
+
durationMs: z.number().nullable()
|
|
5010
|
+
});
|
|
5011
|
+
const ReportSpecResultSchema = z.object({
|
|
5012
|
+
feature: z.string(),
|
|
5013
|
+
spec: z.string(),
|
|
5014
|
+
status: z.enum(["passed", "failed"]),
|
|
5015
|
+
testCounts: z.object({
|
|
5016
|
+
total: z.number(),
|
|
5017
|
+
passed: z.number(),
|
|
5018
|
+
failed: z.number()
|
|
5019
|
+
}).nullable(),
|
|
5020
|
+
durationMs: z.number().nullable(),
|
|
5021
|
+
assertions: z.array(ReportAssertionSchema).nullable(),
|
|
5022
|
+
analysis: FailureAnalysisSchema.nullable(),
|
|
5023
|
+
analysisSkipped: z.string().nullable(),
|
|
5024
|
+
driftIssues: z.array(DraftIssueSchema).nullable(),
|
|
5025
|
+
failureLogExcerpt: z.string().nullable(),
|
|
5026
|
+
diffExcerpt: z.string().nullable(),
|
|
5027
|
+
specYaml: z.string().nullable()
|
|
5028
|
+
});
|
|
5029
|
+
z.object({
|
|
5030
|
+
schemaVersion: z.literal(1),
|
|
5031
|
+
createdAt: z.string(),
|
|
5032
|
+
runId: z.string().nullable(),
|
|
5033
|
+
git: z.object({
|
|
5034
|
+
head: z.string().nullable(),
|
|
5035
|
+
base: z.string().nullable()
|
|
5036
|
+
}),
|
|
5037
|
+
model: z.string().nullable(),
|
|
5038
|
+
promptVersion: z.string(),
|
|
5039
|
+
results: z.array(ReportSpecResultSchema)
|
|
5040
|
+
});
|
|
5041
|
+
/** Shape of the "export labels" download produced by the report's client-side JS. */
|
|
5042
|
+
const LabelEntrySchema = z.object({
|
|
5043
|
+
feature: z.string(),
|
|
5044
|
+
spec: z.string(),
|
|
5045
|
+
predicted: PredictedLabelSchema,
|
|
5046
|
+
label: FailureLabelSchema,
|
|
5047
|
+
note: z.string().optional()
|
|
5048
|
+
});
|
|
5049
|
+
z.object({
|
|
5050
|
+
schemaVersion: z.literal(1),
|
|
5051
|
+
runId: z.string().nullable(),
|
|
5052
|
+
promptVersion: z.string(),
|
|
5053
|
+
exportedAt: z.string(),
|
|
5054
|
+
labels: z.array(LabelEntrySchema)
|
|
5055
|
+
});
|
|
5056
|
+
//#endregion
|
|
5057
|
+
//#region src/report/analyze.ts
|
|
5058
|
+
/**
|
|
5059
|
+
* Classify one failing spec into TEST_DRIFT / SPEC_CHANGE / PRODUCT_BUG /
|
|
5060
|
+
* UNKNOWN. Same resilience contract as diagnose(): read-only tools, JSON-only
|
|
5061
|
+
* final message, and any parse failure degrades to UNKNOWN with confidence 0
|
|
5062
|
+
* rather than throwing — the report must always render.
|
|
5063
|
+
*/
|
|
5064
|
+
async function analyzeFailure(input, options = {}) {
|
|
5065
|
+
const { result: raw, isError } = await invokeClaudeStreaming({
|
|
5066
|
+
prompt: buildFailureAnalysisPrompt(input),
|
|
5067
|
+
allowedTools: [
|
|
5068
|
+
"Read",
|
|
5069
|
+
"Grep",
|
|
5070
|
+
"Glob"
|
|
5071
|
+
],
|
|
5072
|
+
silenceBashLog: true,
|
|
5073
|
+
maxTurns: 12,
|
|
5074
|
+
...options.model ? { model: options.model } : {},
|
|
5075
|
+
...options.cwd ? { cwd: options.cwd } : {}
|
|
5076
|
+
}, () => {});
|
|
5077
|
+
if (isError || !raw) return {
|
|
5078
|
+
analysis: unknownAnalysis(isError ? "Claude returned an error result" : "Claude returned no output"),
|
|
5079
|
+
raw: raw ?? "",
|
|
5080
|
+
sdkError: isError
|
|
5081
|
+
};
|
|
5082
|
+
for (const candidate of extractJsonCandidates(raw)) {
|
|
5083
|
+
let parsed;
|
|
5084
|
+
try {
|
|
5085
|
+
parsed = JSON.parse(candidate);
|
|
5086
|
+
} catch {
|
|
5087
|
+
continue;
|
|
5088
|
+
}
|
|
5089
|
+
const normalised = normaliseFailureAnalysis(parsed);
|
|
5090
|
+
if (normalised) return {
|
|
5091
|
+
analysis: normalised,
|
|
5092
|
+
raw,
|
|
5093
|
+
sdkError: false
|
|
5094
|
+
};
|
|
5095
|
+
}
|
|
5096
|
+
return {
|
|
5097
|
+
analysis: unknownAnalysis(`analysis returned no parseable JSON: ${truncate$2(raw, 500)}`),
|
|
5098
|
+
raw,
|
|
5099
|
+
sdkError: false
|
|
5100
|
+
};
|
|
5101
|
+
}
|
|
5102
|
+
function unknownAnalysis(reasoning) {
|
|
5103
|
+
return {
|
|
5104
|
+
label: "UNKNOWN",
|
|
5105
|
+
confidence: 0,
|
|
5106
|
+
subDiagnosis: "NONE",
|
|
5107
|
+
evidence: [],
|
|
5108
|
+
reasoning
|
|
5109
|
+
};
|
|
5110
|
+
}
|
|
5111
|
+
const LABELS = new Set(PREDICTED_LABELS);
|
|
5112
|
+
const SUB_SET = new Set(SUB_DIAGNOSES);
|
|
5113
|
+
/**
|
|
5114
|
+
* Manual, lenient normalisation (mirrors diagnose's normaliseResult): a
|
|
5115
|
+
* missing/extra field should degrade gracefully, not reject the whole
|
|
5116
|
+
* prediction — only an unrecognisable label makes the candidate unusable.
|
|
5117
|
+
*/
|
|
5118
|
+
function normaliseFailureAnalysis(parsed) {
|
|
5119
|
+
if (!isObject(parsed)) return null;
|
|
5120
|
+
const label = parsed["label"];
|
|
5121
|
+
if (typeof label !== "string" || !LABELS.has(label)) return null;
|
|
5122
|
+
const confidence = typeof parsed["confidence"] === "number" ? clamp(parsed["confidence"], 0, 1) : 0;
|
|
5123
|
+
const reasoning = typeof parsed["reasoning"] === "string" ? parsed["reasoning"] : "";
|
|
5124
|
+
const rawSub = parsed["subDiagnosis"];
|
|
5125
|
+
const subDiagnosis = typeof rawSub === "string" && SUB_SET.has(rawSub) ? rawSub : "NONE";
|
|
5126
|
+
const evidence = [];
|
|
5127
|
+
if (Array.isArray(parsed["evidence"])) for (const item of parsed["evidence"]) {
|
|
5128
|
+
if (!isObject(item)) continue;
|
|
5129
|
+
const detail = typeof item["detail"] === "string" ? item["detail"] : null;
|
|
5130
|
+
if (detail === null) continue;
|
|
5131
|
+
const file = typeof item["file"] === "string" ? item["file"] : void 0;
|
|
5132
|
+
evidence.push(file !== void 0 ? {
|
|
5133
|
+
file,
|
|
5134
|
+
detail
|
|
5135
|
+
} : { detail });
|
|
5136
|
+
}
|
|
5137
|
+
return {
|
|
5138
|
+
label,
|
|
5139
|
+
confidence,
|
|
5140
|
+
subDiagnosis,
|
|
5141
|
+
evidence,
|
|
5142
|
+
reasoning
|
|
5143
|
+
};
|
|
5144
|
+
}
|
|
5145
|
+
/**
 * Capture the PR diff used as context for failure analysis. `--relative`
 * re-roots paths to `cwd` and drops changes outside it, matching how
 * relatedPaths are declared in a monorepo sub-package.
 *
 * Runs three git commands in parallel: the short HEAD hash, the full patch
 * for `<base>...HEAD`, and the matching `--name-status` listing (both with
 * rename detection, `-M`).
 *
 * Errors (unknown base ref, not a git repo, ...) are returned, not thrown:
 * the report is still worth generating without diff context.
 *
 * @param {string} base ref the diff is taken against
 * @param {string} cwd directory git is run in
 * @returns {Promise<{ok: true, diff: {patch: string, nameStatus: string, head: string}} | {ok: false, error: string}>}
 */
async function capturePrDiff(base, cwd) {
	const range = `${base}...HEAD`;
	try {
		const [{ stdout: head }, { stdout: patch }, { stdout: nameStatus }] = await Promise.all([
			execFileP("git", [
				"rev-parse",
				"--short",
				"HEAD"
			], { cwd }),
			execFileP("git", [
				"diff",
				"-M",
				"--relative",
				range
			], {
				cwd,
				maxBuffer: 64 * 1024 * 1024
			}),
			execFileP("git", [
				"diff",
				"--name-status",
				"-M",
				"--relative",
				range
			], {
				cwd,
				maxBuffer: 32 * 1024 * 1024
			})
		]);
		return {
			ok: true,
			diff: {
				patch,
				nameStatus: nameStatus.trim(),
				head: head.trim()
			}
		};
	} catch (e) {
		// A rejection is not guaranteed to be an Error; reading `.message` off
		// anything else would throw from inside the handler that promises not to.
		const message = e instanceof Error ? e.message : String(e);
		return {
			ok: false,
			// `split` always yields at least one element, so fall back on an
			// empty first line rather than on a nullish one.
			error: message.split("\n")[0] || "git diff failed"
		};
	}
}
|
|
5196
|
+
/**
 * Split a unified diff into per-file sections on `diff --git` boundaries.
 * The path is taken from the `b/` side so renames/edits key on the
 * post-change layout — the same side relatedPaths are written against.
 */
const DIFF_HEADER = /^diff --git a\/(.+) b\/(.+)$/;
/** Prefix shared by every per-file header, quoted or not. */
const DIFF_HEADER_PREFIX = "diff --git ";
/** Byte values of the single-character escapes git uses inside quoted paths. */
const GIT_PATH_ESCAPES = {
	a: 7,
	b: 8,
	t: 9,
	n: 10,
	v: 11,
	f: 12,
	r: 13,
	"\"": 34,
	"\\": 92
};
/**
 * Decode the inside of a C-quoted git path: `\ooo` octal escapes are raw
 * bytes (UTF-8 once reassembled), the rest are single-character escapes.
 */
function unquoteGitPath(quoted) {
	const bytes = [];
	let i = 0;
	while (i < quoted.length) {
		if (quoted[i] !== "\\") {
			const chr = String.fromCodePoint(quoted.codePointAt(i));
			bytes.push(...Buffer.from(chr, "utf8"));
			i += chr.length;
			continue;
		}
		const octal = /^[0-3][0-7]{2}/.exec(quoted.slice(i + 1, i + 4));
		if (octal) {
			bytes.push(Number.parseInt(octal[0], 8));
			i += 4;
			continue;
		}
		const next = quoted[i + 1];
		// A trailing lone backslash is kept literally.
		if (next === void 0) bytes.push(92);
		else bytes.push(GIT_PATH_ESCAPES[next] ?? next.charCodeAt(0));
		i += 2;
	}
	return Buffer.from(bytes).toString("utf8");
}
/**
 * Return the post-change path named by a `diff --git` header line, or null
 * when the line is not a header at all.
 */
function diffHeaderPath(line) {
	if (!line.startsWith(DIFF_HEADER_PREFIX)) return null;
	const plain = DIFF_HEADER.exec(line);
	if (plain) return plain[2];
	const quotedB = /"b\/((?:[^"\\]|\\.)*)"$/.exec(line);
	if (quotedB) return unquoteGitPath(quotedB[1]);
	// Only the a-side was quoted: the b-side is still a plain trailing path.
	const plainB = / b\/(.+)$/.exec(line);
	if (plainB) return plainB[1];
	// Unrecognised layout: still a file boundary, keyed by the raw remainder.
	return line.slice(DIFF_HEADER_PREFIX.length);
}
/**
 * @param {string} patch unified diff text
 * @returns {{path: string, body: string}[]} one section per `diff --git`
 *   header, in order; lines before the first header are dropped.
 */
function splitPatchByFile(patch) {
	const sections = [];
	let current = null;
	const flush = () => {
		if (current) sections.push({
			path: current.path,
			body: current.lines.join("\n")
		});
		current = null;
	};
	for (const line of patch.split("\n")) {
		const path = diffHeaderPath(line);
		if (path !== null) {
			// Every header starts a new section, so a quoted header can no
			// longer leak its hunks into the previous file's section.
			flush();
			current = {
				path,
				lines: [line]
			};
		} else if (current) current.lines.push(line);
	}
	flush();
	return sections;
}
|
|
5226
|
+
/**
|
|
5227
|
+
* Scope a full patch down to the files a spec depends on, then truncate so
|
|
5228
|
+
* the analysis prompt stays bounded. `relatedPaths` null/empty means the
|
|
5229
|
+
* spec is unscoped — keep the whole patch (still truncated). Callers scoping
|
|
5230
|
+
* the same patch for many specs can pass pre-split sections instead.
|
|
5231
|
+
*/
|
|
5232
|
+
function scopePatchForSpec(patch, relatedPaths, caps = {}) {
|
|
5233
|
+
const perFile = caps.perFile ?? 8192;
|
|
5234
|
+
const total = caps.total ?? 49152;
|
|
5235
|
+
let sections = typeof patch === "string" ? splitPatchByFile(patch) : patch;
|
|
5236
|
+
if (relatedPaths && relatedPaths.length > 0) {
|
|
5237
|
+
const scoped = sections.filter((s) => isPathAffectedBy(s.path, relatedPaths));
|
|
5238
|
+
if (scoped.length > 0) sections = scoped;
|
|
5239
|
+
}
|
|
5240
|
+
const parts = [];
|
|
5241
|
+
let used = 0;
|
|
5242
|
+
let droppedFiles = 0;
|
|
5243
|
+
for (const s of sections) {
|
|
5244
|
+
if (used >= total) {
|
|
5245
|
+
droppedFiles++;
|
|
5246
|
+
continue;
|
|
5247
|
+
}
|
|
5248
|
+
let body = s.body;
|
|
5249
|
+
if (body.length > perFile) body = `${body.slice(0, perFile)}\n[truncated: ${body.length - perFile} more chars of ${s.path}]`;
|
|
5250
|
+
if (used + body.length > total) body = `${body.slice(0, total - used)}\n[truncated: total patch cap reached]`;
|
|
5251
|
+
parts.push(body);
|
|
5252
|
+
used += body.length;
|
|
5253
|
+
}
|
|
5254
|
+
if (droppedFiles > 0) parts.push(`[truncated: ${droppedFiles} more changed file(s) omitted]`);
|
|
5255
|
+
return parts.join("\n");
|
|
5256
|
+
}
|
|
5257
|
+
//#endregion
|
|
5258
|
+
//#region src/report/render.ts
|
|
5259
|
+
/**
|
|
5260
|
+
* Render the run report as ONE self-contained HTML file (inline CSS/JS, no
|
|
5261
|
+
* network). It is meant to be uploaded as a CI artifact like Playwright's
|
|
5262
|
+
* HTML report and opened locally; the layout deliberately mirrors that
|
|
5263
|
+
* report's conventions — header stats that double as filters, a search box,
|
|
5264
|
+
* collapsible per-spec cards with a step list and durations, automatic
|
|
5265
|
+
* light/dark theme.
|
|
5266
|
+
*
|
|
5267
|
+
* The measurement loop lives client-side: each analyzed failure gets
|
|
5268
|
+
* ground-truth radio buttons, and a vanilla-JS block recomputes accuracy /
|
|
5269
|
+
* confusion matrix / per-class precision-recall on every change. Labels
|
|
5270
|
+
* persist in localStorage and can be exported/imported as JSON
|
|
5271
|
+
* (LabelsExportSchema) so the grading work survives the browser session.
|
|
5272
|
+
*/
|
|
5273
|
+
function renderRunReport(data) {
|
|
5274
|
+
const failed = data.results.filter((r) => r.status === "failed");
|
|
5275
|
+
const analyzed = failed.filter((r) => r.analysis !== null);
|
|
5276
|
+
const passedCount = data.results.length - failed.length;
|
|
5277
|
+
const totalDuration = data.results.reduce((sum, r) => sum + (r.durationMs ?? 0), 0);
|
|
5278
|
+
const dataJson = JSON.stringify(data).replace(/</g, "\\u003c");
|
|
5279
|
+
return `<!DOCTYPE html>
|
|
5280
|
+
<html lang="en">
|
|
5281
|
+
<head>
|
|
5282
|
+
<meta charset="utf-8">
|
|
5283
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
5284
|
+
<title>ccqa run report</title>
|
|
5285
|
+
<style>${CSS}</style>
|
|
5286
|
+
</head>
|
|
5287
|
+
<body>
|
|
5288
|
+
<header>
|
|
5289
|
+
<div class="header-inner">
|
|
5290
|
+
<div class="header-top">
|
|
5291
|
+
<h1>ccqa run report</h1>
|
|
5292
|
+
<div class="meta">
|
|
5293
|
+
<span title="generated at">${esc(formatDate(data.createdAt))}</span>
|
|
5294
|
+
${totalDuration > 0 ? `<span>${formatDuration$1(totalDuration)}</span>` : ""}
|
|
5295
|
+
${data.runId ? `<span>CI run ${esc(data.runId)}</span>` : ""}
|
|
5296
|
+
${data.git.head ? `<span><code>${esc(data.git.head)}</code>${data.git.base ? ` vs <code>${esc(data.git.base)}</code>` : ""}</span>` : ""}
|
|
5297
|
+
<span class="dim">prompt v${esc(data.promptVersion)}</span>
|
|
5298
|
+
</div>
|
|
5299
|
+
</div>
|
|
5300
|
+
<div class="toolbar">
|
|
5301
|
+
<div class="chips" id="filter-chips">
|
|
5302
|
+
<button type="button" class="chip active" data-filter="all">All <span class="count">${data.results.length}</span></button>
|
|
5303
|
+
<button type="button" class="chip chip-pass" data-filter="passed">${passedCount} passed</button>
|
|
5304
|
+
<button type="button" class="chip chip-fail" data-filter="failed">${failed.length} failed</button>
|
|
5305
|
+
</div>
|
|
5306
|
+
<input type="search" id="search" placeholder="Filter by name…" autocomplete="off">
|
|
5307
|
+
</div>
|
|
5308
|
+
</div>
|
|
5309
|
+
</header>
|
|
5310
|
+
|
|
5311
|
+
<div class="page">
|
|
5312
|
+
${analyzed.length > 0 ? metricsPanel() : ""}
|
|
5313
|
+
|
|
5314
|
+
<main id="spec-list">
|
|
5315
|
+
${data.results.map((r, i) => renderResult(r, i)).join("\n")}
|
|
5316
|
+
</main>
|
|
5317
|
+
<p class="empty-note" id="no-match" hidden>No specs match the current filter.</p>
|
|
5318
|
+
</div>
|
|
5319
|
+
|
|
5320
|
+
<script type="application/json" id="ccqa-report-data">${dataJson}<\/script>
|
|
5321
|
+
<script>${CLIENT_JS}<\/script>
|
|
5322
|
+
</body>
|
|
5323
|
+
</html>
|
|
5324
|
+
`;
|
|
5325
|
+
}
|
|
5326
|
+
function metricsPanel() {
|
|
5327
|
+
return `<section class="panel" id="measure-panel">
|
|
5328
|
+
<div class="panel-head">
|
|
5329
|
+
<h2>Prediction accuracy</h2>
|
|
5330
|
+
<div class="measure-actions">
|
|
5331
|
+
<button type="button" id="export-labels">Export labels (JSON)</button>
|
|
5332
|
+
<label class="import-label">Import labels<input type="file" id="import-labels" accept="application/json"></label>
|
|
5333
|
+
</div>
|
|
5334
|
+
</div>
|
|
5335
|
+
<p class="hint">Grade each failed case below with its true cause; the matrix updates live. Labels are saved in this browser (localStorage) — export them to keep or merge.</p>
|
|
5336
|
+
<div id="metrics"></div>
|
|
5337
|
+
</section>`;
|
|
5338
|
+
}
|
|
5339
|
+
function renderResult(r, index) {
|
|
5340
|
+
const id = `${r.feature}/${r.spec}`;
|
|
5341
|
+
const duration = r.durationMs != null && r.durationMs > 0 ? `<span class="duration">${formatDuration$1(r.durationMs)}</span>` : "";
|
|
5342
|
+
const counts = r.testCounts ? `<span class="counts">${r.testCounts.passed}/${r.testCounts.total}</span>` : "";
|
|
5343
|
+
const predictionChip = r.status === "failed" && r.analysis ? `<span class="badge ${r.analysis.label}">${r.analysis.label}</span>` : "";
|
|
5344
|
+
return `<details class="spec ${r.status}" data-status="${r.status}" data-case-id="${esc(id)}"${r.status === "failed" ? " open" : ""}>
|
|
5345
|
+
<summary>
|
|
5346
|
+
${statusIcon(r.status)}
|
|
5347
|
+
<span class="spec-name">${esc(id)}</span>
|
|
5348
|
+
${predictionChip}
|
|
5349
|
+
<span class="spacer"></span>
|
|
5350
|
+
${counts}
|
|
5351
|
+
${duration}
|
|
5352
|
+
</summary>
|
|
5353
|
+
<div class="spec-body">
|
|
5354
|
+
${renderAssertions(r)}
|
|
5355
|
+
${r.status === "failed" ? r.analysis ? renderAnalysis(r, index) : renderSkipped(r) : ""}
|
|
5356
|
+
${renderDriftIssues(r)}
|
|
5357
|
+
${collapsible("Failure log", r.failureLogExcerpt)}
|
|
5358
|
+
${collapsible("Source diff (scoped)", r.diffExcerpt, "diff")}
|
|
5359
|
+
${collapsible("spec.yaml", r.specYaml)}
|
|
5360
|
+
</div>
|
|
5361
|
+
</details>`;
|
|
5362
|
+
}
|
|
5363
|
+
function statusIcon(status) {
|
|
5364
|
+
if (status === "passed") return `<span class="status-icon pass" aria-label="passed">✓</span>`;
|
|
5365
|
+
if (status === "failed") return `<span class="status-icon fail" aria-label="failed">✕</span>`;
|
|
5366
|
+
return `<span class="status-icon skip" aria-label="skipped">◌</span>`;
|
|
5367
|
+
}
|
|
5368
|
+
function renderAssertions(r) {
|
|
5369
|
+
if (!r.assertions || r.assertions.length === 0) return "";
|
|
5370
|
+
return `<ul class="steps">${r.assertions.map((a) => {
|
|
5371
|
+
const dur = a.durationMs != null ? `<span class="duration">${formatDuration$1(a.durationMs)}</span>` : "";
|
|
5372
|
+
return `<li>${statusIcon(a.status)}<span class="step-name">${esc(a.name)}</span><span class="spacer"></span>${dur}</li>`;
|
|
5373
|
+
}).join("")}</ul>`;
|
|
5374
|
+
}
|
|
5375
|
+
function renderAnalysis(r, index) {
|
|
5376
|
+
const a = r.analysis;
|
|
5377
|
+
const pct = Math.round(a.confidence * 100);
|
|
5378
|
+
const evidence = a.evidence.length > 0 ? `<ul class="evidence">${a.evidence.map((e) => `<li>${e.file ? `<code>${esc(e.file)}</code> — ` : ""}${esc(e.detail)}</li>`).join("")}</ul>` : "";
|
|
5379
|
+
return `<div class="analysis">
|
|
5380
|
+
<div class="prediction">
|
|
5381
|
+
<span class="badge ${a.label}">${a.label}</span>
|
|
5382
|
+
<span class="confidence" title="confidence"><span class="confidence-bar"><span style="width:${pct}%"></span></span>${pct}%</span>
|
|
5383
|
+
${a.subDiagnosis && a.subDiagnosis !== "NONE" ? `<span class="sub">${esc(a.subDiagnosis)}</span>` : ""}
|
|
5384
|
+
</div>
|
|
5385
|
+
<p class="reasoning">${esc(a.reasoning)}</p>
|
|
5386
|
+
${evidence}
|
|
5387
|
+
<div class="truth">
|
|
5388
|
+
<span class="truth-title">True cause</span>
|
|
5389
|
+
${FAILURE_LABELS.map((label) => `<label class="truth-option ${label}"><input type="radio" name="label--${index}" value="${label}"><span>${label}</span></label>`).join("\n ")}
|
|
5390
|
+
<input type="text" class="note" placeholder="note (optional)" data-case-index="${index}">
|
|
5391
|
+
</div>
|
|
5392
|
+
</div>`;
|
|
5393
|
+
}
|
|
5394
|
+
function renderSkipped(r) {
|
|
5395
|
+
return `<div class="analysis skipped">analysis skipped${r.analysisSkipped ? `: ${esc(r.analysisSkipped)}` : ""}</div>`;
|
|
5396
|
+
}
|
|
5397
|
+
/**
 * Render the collapsible drift-audit list for one result.
 *
 * @param {object} r - Result entry; `r.driftIssues` may be null/empty, in
 *   which case nothing is rendered.
 * @returns {string} A `<details>` fragment with one `<li>` per issue, or "".
 */
function renderDriftIssues(r) {
  const issues = r.driftIssues;
  if (!issues || issues.length === 0) return "";
  let items = "";
  for (const issue of issues) {
    // Severity is emitted verbatim as both a CSS class and the visible text.
    const severity = `<span class="severity ${issue.severity}">${issue.severity}</span>`;
    const category = esc(DRAFT_CATEGORY_LABEL[issue.category]);
    const step = issue.stepId ? `, step ${esc(issue.stepId)}` : "";
    const message = esc(issue.message);
    const detail = issue.detail ? ` — ${esc(issue.detail)}` : "";
    items += `<li>${severity} (${category}${step}) ${message}${detail}</li>`;
  }
  return `<details class="drift"><summary>Spec↔code drift audit (${issues.length})</summary><ul>${items}</ul></details>`;
}
|
|
5402
|
+
/**
 * Wrap raw text in a collapsed `<details>` block with a `<pre>` body.
 *
 * @param {string} title - Summary text (escaped).
 * @param {string|null} content - Raw text to show (escaped); falsy renders nothing.
 * @param {string} [kind=""] - Extra CSS class appended after `raw` (not escaped).
 * @returns {string} HTML fragment, or "" when there is no content.
 */
function collapsible(title, content, kind = "") {
  if (content) {
    const summary = `<summary>${esc(title)}</summary>`;
    const body = `<pre>${esc(content)}</pre>`;
    return `<details class="raw ${kind}">${summary}${body}</details>`;
  }
  return "";
}
|
|
5406
|
+
/**
 * HTML entity for each character that is significant in element content or
 * in a quoted attribute value.
 */
const ESC_MAP = {
  "&": "&amp;",
  "<": "&lt;",
  ">": "&gt;",
  "\"": "&quot;",
  "'": "&#39;"
};
/**
 * Escape a value for safe interpolation into HTML text or a quoted attribute.
 *
 * @param {unknown} s - Value to escape. Non-strings are coerced with
 *   `String()`; `null`/`undefined` become the empty string so a missing
 *   optional field cannot abort report rendering.
 * @returns {string} The escaped text.
 */
function esc(s) {
  return String(s ?? "").replace(/[&<>"']/g, (c) => ESC_MAP[c]);
}
|
|
5416
|
+
/**
 * Format a duration in milliseconds as a short human-readable string:
 * `123ms` below one second, `12.3s` below one minute, `2m 5s` otherwise.
 *
 * Each branch is selected on the value it will actually print, so rounding
 * can never spill into the next unit (no "1000ms", "60.0s" or "1m 60s").
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} Formatted duration.
 */
function formatDuration$1(ms) {
  const wholeMs = Math.round(ms);
  if (wholeMs < 1e3) return `${wholeMs}ms`;
  // Work in tenths of a second so 59.96s is promoted to the minutes branch.
  const tenths = Math.round(ms / 100);
  if (tenths < 600) return `${(tenths / 10).toFixed(1)}s`;
  // Round to whole seconds first, then split, so the seconds part stays 0–59.
  const totalSeconds = Math.round(ms / 1e3);
  return `${Math.floor(totalSeconds / 60)}m ${totalSeconds % 60}s`;
}
|
|
5421
|
+
/**
 * Turn an ISO-8601 UTC timestamp into a compact display form, e.g.
 * `2024-05-06T07:08:09.123Z` → `2024-05-06 07:08:09 UTC`.
 *
 * The fractional-seconds part is optional, so `...T07:08:09Z` is handled the
 * same way. Strings that do not end in `Z` only have their `T` separator
 * replaced.
 *
 * @param {string} iso - ISO-8601 timestamp.
 * @returns {string} Display string.
 */
function formatDate(iso) {
  return iso.replace("T", " ").replace(/(?:\.\d+)?Z$/, " UTC");
}
|
|
5424
|
+
/**
 * Stylesheet inlined into the self-contained HTML run report.
 *
 * Colors are driven by custom properties on `:root`, with a dark palette
 * swapped in through `prefers-color-scheme`. The selected "true cause" pill
 * is highlighted with `:has()`; a sibling-selector fallback keeps the chosen
 * option visibly emphasized in browsers that do not support `:has()`.
 */
const CSS = `
:root {
  color-scheme: light dark;
  --bg: #f4f5f7;
  --surface: #ffffff;
  --surface-2: #f8f9fa;
  --border: #e1e4e8;
  --text: #1f2328;
  --text-dim: #656d76;
  --accent: #1f6feb;
  --pass: #1a7f37;
  --pass-bg: #dafbe1;
  --fail: #cf222e;
  --fail-bg: #ffebe9;
  --skip: #9a6700;
  --code-bg: #0d1117;
  --code-text: #e6edf3;
  --shadow: 0 1px 3px rgba(31, 35, 40, 0.06);
}
@media (prefers-color-scheme: dark) {
  :root {
    --bg: #0d1117;
    --surface: #161b22;
    --surface-2: #1c2129;
    --border: #30363d;
    --text: #e6edf3;
    --text-dim: #8b949e;
    --accent: #58a6ff;
    --pass: #3fb950;
    --pass-bg: rgba(63, 185, 80, 0.15);
    --fail: #f85149;
    --fail-bg: rgba(248, 81, 73, 0.15);
    --skip: #d29922;
    --code-bg: #010409;
    --code-text: #e6edf3;
    --shadow: none;
  }
}
* { box-sizing: border-box; }
body {
  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Hiragino Sans", "Noto Sans JP", sans-serif;
  margin: 0; background: var(--bg); color: var(--text); font-size: 14px;
}
header {
  position: sticky; top: 0; z-index: 10;
  background: var(--surface); border-bottom: 1px solid var(--border);
}
.header-inner { max-width: 1080px; margin: 0 auto; padding: 14px 24px 10px; }
.header-top { display: flex; align-items: baseline; gap: 18px; flex-wrap: wrap; }
h1 { font-size: 17px; margin: 0; font-weight: 650; }
h2 { font-size: 14px; margin: 0; font-weight: 650; }
.meta { font-size: 12px; color: var(--text-dim); display: flex; gap: 14px; flex-wrap: wrap; }
.meta code { background: var(--surface-2); border: 1px solid var(--border); padding: 0 5px; border-radius: 4px; font-size: 11px; }
.dim { color: var(--text-dim); }
.toolbar { display: flex; align-items: center; gap: 12px; margin-top: 10px; flex-wrap: wrap; }
.chips { display: flex; gap: 6px; }
.chip {
  font: inherit; font-size: 12.5px; font-weight: 600; cursor: pointer;
  padding: 3px 12px; border-radius: 999px; border: 1px solid var(--border);
  background: var(--surface); color: var(--text-dim);
}
.chip .count { opacity: 0.7; }
.chip.active { background: var(--text); color: var(--surface); border-color: var(--text); }
.chip-pass.active { background: var(--pass); border-color: var(--pass); color: #fff; }
.chip-fail.active { background: var(--fail); border-color: var(--fail); color: #fff; }
#search {
  font: inherit; font-size: 13px; flex: 1; min-width: 180px; max-width: 320px; margin-left: auto;
  padding: 5px 12px; border: 1px solid var(--border); border-radius: 6px;
  background: var(--surface-2); color: var(--text);
}
#search:focus { outline: 2px solid var(--accent); outline-offset: -1px; }
.page { max-width: 1080px; margin: 16px auto; padding: 0 24px; }
.panel {
  background: var(--surface); border: 1px solid var(--border); border-radius: 8px;
  padding: 14px 18px; margin-bottom: 16px; box-shadow: var(--shadow);
}
.panel-head { display: flex; align-items: center; justify-content: space-between; gap: 12px; flex-wrap: wrap; }
.hint { font-size: 12px; color: var(--text-dim); margin: 6px 0 10px; }
.spec {
  background: var(--surface); border: 1px solid var(--border); border-radius: 8px;
  margin-bottom: 8px; box-shadow: var(--shadow);
}
.spec > summary {
  display: flex; align-items: center; gap: 10px; padding: 10px 16px;
  cursor: pointer; list-style: none; user-select: none;
}
.spec > summary::-webkit-details-marker { display: none; }
.spec > summary::before {
  content: "▸"; color: var(--text-dim); font-size: 11px;
  transition: transform 0.12s ease; flex: 0 0 auto;
}
.spec[open] > summary::before { transform: rotate(90deg); }
.spec-name { font-weight: 600; font-size: 13.5px; }
.spacer { flex: 1; }
.counts { font-size: 12px; color: var(--text-dim); }
.duration { font-size: 12px; color: var(--text-dim); font-variant-numeric: tabular-nums; }
.status-icon { font-weight: 700; font-size: 13px; flex: 0 0 auto; }
.status-icon.pass { color: var(--pass); }
.status-icon.fail { color: var(--fail); }
.status-icon.skip { color: var(--skip); }
.spec-body { padding: 2px 16px 12px 36px; border-top: 1px solid var(--border); }
.steps { list-style: none; margin: 10px 0; padding: 0; }
.steps li {
  display: flex; align-items: center; gap: 8px; padding: 3px 8px;
  font-size: 13px; border-radius: 5px;
}
.steps li:hover { background: var(--surface-2); }
.step-name { overflow-wrap: anywhere; }
.analysis {
  border: 1px solid var(--border); border-left: 3px solid var(--accent);
  border-radius: 6px; background: var(--surface-2);
  padding: 10px 14px; margin: 10px 0;
}
.analysis.skipped { color: var(--text-dim); font-size: 13px; font-style: italic; border-left-color: var(--border); }
.prediction { display: flex; align-items: center; gap: 12px; flex-wrap: wrap; }
.badge {
  font-size: 11.5px; font-weight: 700; letter-spacing: 0.02em;
  padding: 2px 10px; border-radius: 4px; color: #fff; flex: 0 0 auto;
}
.badge.TEST_DRIFT { background: #b45309; }
.badge.SPEC_CHANGE { background: #1d4ed8; }
.badge.PRODUCT_BUG { background: #b91c1c; }
.badge.UNKNOWN { background: #6b7280; }
.confidence { display: inline-flex; align-items: center; gap: 7px; font-size: 12.5px; font-weight: 600; color: var(--text-dim); }
.confidence-bar {
  display: inline-block; width: 64px; height: 6px; border-radius: 999px;
  background: var(--border); overflow: hidden;
}
.confidence-bar > span { display: block; height: 100%; background: var(--accent); border-radius: 999px; }
.sub { font-size: 11px; background: var(--surface); border: 1px solid var(--border); color: var(--text-dim); padding: 1px 8px; border-radius: 999px; }
.reasoning { font-size: 13px; margin: 9px 0; white-space: pre-wrap; line-height: 1.55; }
.evidence { font-size: 12.5px; color: var(--text-dim); margin: 6px 0; padding-left: 18px; line-height: 1.5; }
.evidence code { background: var(--surface); border: 1px solid var(--border); padding: 0 5px; border-radius: 4px; font-size: 11px; }
.truth {
  display: flex; align-items: center; gap: 10px; flex-wrap: wrap;
  background: var(--surface); border: 1px dashed var(--border); border-radius: 6px;
  padding: 8px 12px; margin-top: 10px; font-size: 12.5px;
}
.truth-title { font-weight: 650; color: var(--text-dim); }
.truth-option {
  display: inline-flex; align-items: center; gap: 5px; cursor: pointer;
  border: 1px solid var(--border); border-radius: 999px; padding: 2px 10px;
}
.truth-option:has(input:checked) { border-color: var(--accent); background: var(--surface-2); font-weight: 650; }
/* Fallback for browsers without :has(): still emphasize the chosen option's text. */
.truth-option input:checked + span { font-weight: 650; }
.note { flex: 1; min-width: 150px; font: inherit; font-size: 12px; padding: 4px 9px; border: 1px solid var(--border); border-radius: 5px; background: var(--surface-2); color: var(--text); }
details.raw, details.drift { margin: 7px 0; font-size: 13px; }
details.raw summary, details.drift summary { cursor: pointer; color: var(--text-dim); }
details.raw pre {
  background: var(--code-bg); color: var(--code-text);
  font-size: 11.5px; line-height: 1.5; padding: 12px 14px; border-radius: 6px;
  overflow-x: auto; white-space: pre-wrap; word-break: break-word; margin: 6px 0;
}
.severity { font-size: 10.5px; font-weight: 700; padding: 0 6px; border-radius: 4px; margin-right: 4px; }
.severity.ERROR { background: var(--fail-bg); color: var(--fail); }
.severity.WARN { background: rgba(212, 167, 44, 0.18); color: var(--skip); }
.severity.OK { background: var(--pass-bg); color: var(--pass); }
.drift ul { padding-left: 18px; font-size: 12.5px; line-height: 1.55; }
table.matrix { border-collapse: collapse; font-size: 12.5px; margin: 10px 16px 10px 0; display: inline-table; vertical-align: top; }
table.matrix th, table.matrix td { border: 1px solid var(--border); padding: 4px 12px; text-align: center; }
table.matrix th { background: var(--surface-2); font-weight: 600; }
table.matrix td { font-variant-numeric: tabular-nums; }
table.matrix td.hit { background: var(--pass-bg); font-weight: 700; }
table.matrix td.miss-nonzero { background: var(--fail-bg); }
.stats { font-size: 13px; }
.stats .big { font-size: 17px; font-weight: 700; }
.measure-actions { display: flex; gap: 14px; align-items: center; font-size: 12.5px; }
.measure-actions button {
  font: inherit; font-size: 12.5px; padding: 4px 13px; cursor: pointer;
  border: 1px solid var(--border); border-radius: 6px; background: var(--surface); color: var(--text);
}
.measure-actions button:hover { background: var(--surface-2); }
.import-label { cursor: pointer; color: var(--text-dim); }
.import-label input { display: none; }
.empty-note { color: var(--text-dim); text-align: center; font-size: 13px; }
`;
|
|
5599
|
+
/**
 * Browser-side script inlined into the HTML run report.
 *
 * It reads the report payload from the `#ccqa-report-data` element and wires
 * up two features: status/search filtering of the `.spec` sections, and the
 * "measurement" panel where a human grades each analyzed failure. Grades are
 * kept in localStorage (keyed by run id, falling back to the creation
 * timestamp), rendered as accuracy / confusion-matrix / per-class metrics,
 * and can be exported to or imported from a JSON file.
 *
 * Everything read back from localStorage or an imported file is treated as
 * untrusted: the state root must be a plain object, entries must be objects,
 * and only known labels are accepted.
 */
const CLIENT_JS = `
(function () {
  var dataEl = document.getElementById('ccqa-report-data');
  if (!dataEl) return;
  var data = JSON.parse(dataEl.textContent);
  var LABELS = ${JSON.stringify(FAILURE_LABELS)};
  var PRED_LABELS = LABELS.concat(['UNKNOWN']);
  var storageKey = 'ccqa-report:' + (data.runId || data.createdAt);

  // ---- filtering ------------------------------------------------------
  var activeFilter = 'all';
  var searchQuery = '';

  function applyFilters() {
    var sections = document.querySelectorAll('.spec');
    var visible = 0;
    sections.forEach(function (el) {
      var statusOk = activeFilter === 'all' || el.getAttribute('data-status') === activeFilter;
      var name = (el.getAttribute('data-case-id') || '').toLowerCase();
      var searchOk = !searchQuery || name.indexOf(searchQuery) >= 0;
      var show = statusOk && searchOk;
      el.style.display = show ? '' : 'none';
      if (show) visible++;
    });
    var note = document.getElementById('no-match');
    if (note) note.hidden = visible > 0;
  }

  var chips = document.querySelectorAll('#filter-chips .chip');
  chips.forEach(function (chip) {
    chip.addEventListener('click', function () {
      activeFilter = chip.getAttribute('data-filter') || 'all';
      chips.forEach(function (c) { c.classList.toggle('active', c === chip); });
      applyFilters();
    });
  });

  var search = document.getElementById('search');
  if (search) {
    search.addEventListener('input', function () {
      searchQuery = search.value.trim().toLowerCase();
      applyFilters();
    });
  }

  // ---- measurement ----------------------------------------------------
  // cases: analyzed failures only — they carry a prediction we can grade.
  var cases = [];
  for (var i = 0; i < data.results.length; i++) {
    var r = data.results[i];
    if (r.status === 'failed' && r.analysis) {
      cases.push({ index: i, feature: r.feature, spec: r.spec, predicted: r.analysis.label });
    }
  }

  function isValidLabel(label) { return LABELS.indexOf(label) >= 0; }

  var state = {};
  try { state = JSON.parse(localStorage.getItem(storageKey) || '{}'); } catch (e) { state = {}; }
  // localStorage is user-controlled: valid JSON such as null, a number or an
  // array would otherwise break every state lookup below.
  if (!state || typeof state !== 'object' || Array.isArray(state)) state = {};

  function save() {
    try { localStorage.setItem(storageKey, JSON.stringify(state)); } catch (e) {}
  }

  function caseKey(c) { return c.feature + '/' + c.spec; }

  // Returns the mutable entry for a key, replacing a missing or non-object
  // value so property writes never land on a primitive.
  function entryFor(key) {
    var entry = state[key];
    if (!entry || typeof entry !== 'object') {
      entry = {};
      state[key] = entry;
    }
    return entry;
  }

  function applyStateToInputs() {
    cases.forEach(function (c) {
      var entry = state[caseKey(c)];
      if (!entry) return;
      // Guard: only known labels may flow into the attribute selector below
      // (localStorage is user-controlled; anything else is dropped).
      if (entry.label && isValidLabel(entry.label)) {
        var radio = document.querySelector('input[name="label--' + c.index + '"][value="' + entry.label + '"]');
        if (radio) radio.checked = true;
      }
      var note = document.querySelector('.note[data-case-index="' + c.index + '"]');
      if (note && entry.note) note.value = entry.note;
    });
  }

  function renderMetrics() {
    var target = document.getElementById('metrics');
    if (!target) return;

    var m = {};
    PRED_LABELS.forEach(function (p) {
      m[p] = {};
      LABELS.forEach(function (a) { m[p][a] = 0; });
    });

    var labeled = 0;
    var correct = 0;
    cases.forEach(function (c) {
      var entry = state[caseKey(c)];
      if (!entry || !entry.label || !isValidLabel(entry.label)) return;
      // A prediction outside the known set is counted in the UNKNOWN row
      // instead of crashing on a missing matrix row.
      var predicted = PRED_LABELS.indexOf(c.predicted) >= 0 ? c.predicted : 'UNKNOWN';
      labeled++;
      m[predicted][entry.label]++;
      if (predicted === entry.label) correct++;
    });

    var html = '';
    html += '<div class="stats"><span class="big">' +
      (labeled === 0 ? '–' : Math.round((correct / labeled) * 100) + '%') +
      '</span> accuracy · ' + labeled + ' labeled / ' + cases.length + ' analyzed failures' +
      (cases.length - labeled > 0 ? ' · <strong>' + (cases.length - labeled) + ' unlabeled</strong>' : '') +
      '</div>';

    html += '<table class="matrix"><thead><tr><th>predicted \\\\ actual</th>';
    LABELS.forEach(function (a) { html += '<th>' + a + '</th>'; });
    html += '</tr></thead><tbody>';
    PRED_LABELS.forEach(function (p) {
      html += '<tr><th>' + p + '</th>';
      LABELS.forEach(function (a) {
        var v = m[p][a];
        var cls = p === a ? 'hit' : (v > 0 ? 'miss-nonzero' : '');
        html += '<td class="' + cls + '">' + v + '</td>';
      });
      html += '</tr>';
    });
    html += '</tbody></table>';

    html += '<table class="matrix"><thead><tr><th>class</th><th>precision</th><th>recall</th><th>F1</th><th>support</th></tr></thead><tbody>';
    LABELS.forEach(function (cls) {
      var tp = m[cls][cls];
      var predictedAs = 0;
      LABELS.forEach(function (a) { predictedAs += m[cls][a]; });
      var actualAs = 0;
      PRED_LABELS.forEach(function (p) { actualAs += m[p][cls]; });
      var precision = predictedAs > 0 ? tp / predictedAs : null;
      var recall = actualAs > 0 ? tp / actualAs : null;
      var f1 = precision !== null && recall !== null && precision + recall > 0
        ? (2 * precision * recall) / (precision + recall) : null;
      html += '<tr><th>' + cls + '</th><td>' + fmt(precision) + '</td><td>' + fmt(recall) +
        '</td><td>' + fmt(f1) + '</td><td>' + actualAs + '</td></tr>';
    });
    html += '</tbody></table>';

    // Only report-defined label names and numbers are interpolated above.
    target.innerHTML = html;
  }

  function fmt(v) { return v === null ? '–' : (Math.round(v * 100) / 100).toFixed(2); }

  function findCaseByIndex(index) {
    for (var i = 0; i < cases.length; i++) {
      if (cases[i].index === index) return cases[i];
    }
    return null;
  }

  document.addEventListener('change', function (e) {
    var t = e.target;
    if (t && t.name && t.name.indexOf('label--') === 0) {
      var index = parseInt(t.name.slice('label--'.length), 10);
      var c = findCaseByIndex(index);
      if (!c) return;
      entryFor(caseKey(c)).label = t.value;
      save();
      renderMetrics();
    }
  });

  document.addEventListener('input', function (e) {
    var t = e.target;
    if (t && t.classList && t.classList.contains('note')) {
      var index = parseInt(t.getAttribute('data-case-index'), 10);
      var c = findCaseByIndex(index);
      if (!c) return;
      entryFor(caseKey(c)).note = t.value;
      save();
    }
  });

  var exportBtn = document.getElementById('export-labels');
  if (exportBtn) {
    exportBtn.addEventListener('click', function () {
      var labels = [];
      cases.forEach(function (c) {
        var entry = state[caseKey(c)];
        // Export exactly what the metrics count: known labels only.
        if (!entry || !entry.label || !isValidLabel(entry.label)) return;
        var item = { feature: c.feature, spec: c.spec, predicted: c.predicted, label: entry.label };
        if (entry.note) item.note = entry.note;
        labels.push(item);
      });
      var payload = {
        schemaVersion: 1,
        runId: data.runId,
        promptVersion: data.promptVersion,
        exportedAt: new Date().toISOString(),
        labels: labels
      };
      var blob = new Blob([JSON.stringify(payload, null, 2)], { type: 'application/json' });
      var url = URL.createObjectURL(blob);
      var a = document.createElement('a');
      a.href = url;
      a.download = 'ccqa-labels-' + String(data.runId || data.createdAt).replace(/[^A-Za-z0-9_-]/g, '_') + '.json';
      a.click();
      // Revoke on a later tick: revoking synchronously can cancel the
      // download in browsers that start it asynchronously.
      setTimeout(function () { URL.revokeObjectURL(url); }, 0);
    });
  }

  var importInput = document.getElementById('import-labels');
  if (importInput) {
    importInput.addEventListener('change', function () {
      var file = importInput.files && importInput.files[0];
      if (!file) return;
      var reader = new FileReader();
      reader.onload = function () {
        try {
          var payload = JSON.parse(String(reader.result));
          var items = payload.labels || [];
          if (!Array.isArray(items)) throw new Error('"labels" must be an array');
          items.forEach(function (item) {
            if (!item || typeof item !== 'object') return;
            var entry = entryFor(item.feature + '/' + item.spec);
            // Imported files are untrusted: accept known labels and string notes only.
            if (item.label && isValidLabel(item.label)) entry.label = item.label;
            if (item.note && typeof item.note === 'string') entry.note = item.note;
          });
          save();
          applyStateToInputs();
          renderMetrics();
        } catch (e) {
          alert('Could not parse labels JSON: ' + e.message);
        }
        // Clear the picker so choosing the same file again fires "change".
        importInput.value = '';
      };
      reader.onerror = function () {
        alert('Could not read labels file');
        importInput.value = '';
      };
      reader.readAsText(file);
    });
  }

  applyStateToInputs();
  renderMetrics();
})();
`;
|
|
4626
5831
|
//#endregion
|
|
4627
5832
|
//#region src/cli/run.ts
|
|
4628
5833
|
// Path of the project-level vitest config (`.ccqa/vitest.config.ts`), resolved to an absolute path once at module load. `resolveVitestConfig` probes it with `access()`.
const USER_VITEST_CONFIG = resolve(".ccqa/vitest.config.ts");
|
|
5834
|
+
// Output directory used for the HTML run report when `--drift-report` is passed without an explicit [dir] value.
const DEFAULT_REPORT_DIR = "ccqa-report";
|
|
4629
5835
|
async function resolveVitestConfig() {
|
|
4630
5836
|
try {
|
|
4631
5837
|
await access(USER_VITEST_CONFIG);
|
|
@@ -4634,7 +5840,7 @@ async function resolveVitestConfig() {
|
|
|
4634
5840
|
return bundledVitestConfigPath();
|
|
4635
5841
|
}
|
|
4636
5842
|
}
|
|
4637
|
-
const runCommand = new Command("run").argument("[target]", "Spec to run: '<feature>/<spec>', '<feature>', or omit for all").description("Run generated agent-browser test scripts. Pass --drift to
|
|
5843
|
+
/**
 * `ccqa run [target]` — runs the generated test scripts and, with
 * `--drift-report`, also writes the HTML run report. The language option is
 * attached by `addLanguageOption`; the action simply awaits `runTests`.
 */
const runCommand = addLanguageOption(
  new Command("run")
    .argument("[target]", "Spec to run: '<feature>/<spec>', '<feature>', or omit for all")
    .description("Run generated agent-browser test scripts. Pass --drift-report to also write a self-contained HTML run report: each failing spec gets a drift audit plus a root-cause call (TEST_DRIFT / SPEC_CHANGE / PRODUCT_BUG), and the report lets a human grade the calls to measure their accuracy. Requires ANTHROPIC_API_KEY or a local Claude login for the analysis part.")
    .option("--drift-report [dir]", `Write an HTML run report with drift analysis of failures (default dir: ${DEFAULT_REPORT_DIR}/)`)
    .option("--drift-base <ref>", "Base ref the source diff is taken against for failure analysis (default: GITHUB_BASE_REF, then origin/main)")
    .option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID. Used by --drift-report only. Overrides CCQA_MODEL.")
).action(async (target, opts) => {
  await runTests(target, opts);
});
|
|
4640
5846
|
async function runTests(target, opts) {
|
|
@@ -4649,6 +5855,7 @@ async function runTests(target, opts) {
|
|
|
4649
5855
|
const summaries = [];
|
|
4650
5856
|
let overallExitCode = 0;
|
|
4651
5857
|
const vitestConfig = await resolveVitestConfig();
|
|
5858
|
+
const captureOutput = Boolean(opts.driftReport);
|
|
4652
5859
|
try {
|
|
4653
5860
|
for (let i = 0; i < specs.length; i++) {
|
|
4654
5861
|
const { featureName, specName } = specs[i];
|
|
@@ -4669,7 +5876,8 @@ async function runTests(target, opts) {
|
|
|
4669
5876
|
"--reporter=json",
|
|
4670
5877
|
`--outputFile.json=${reportFile}`
|
|
4671
5878
|
]);
|
|
4672
|
-
|
|
5879
|
+
const tail = captureOutput ? new TailBuffer(OUTPUT_TAIL_CAP) : null;
|
|
5880
|
+
await Promise.all([streamFiltered(proc.stdout, process.stdout, tail), streamFiltered(proc.stderr, process.stderr, tail)]);
|
|
4673
5881
|
const exitCode = await proc.exited;
|
|
4674
5882
|
if (exitCode !== 0) overallExitCode = exitCode;
|
|
4675
5883
|
const report = await readReport(reportFile);
|
|
@@ -4678,12 +5886,13 @@ async function runTests(target, opts) {
|
|
|
4678
5886
|
specName,
|
|
4679
5887
|
scriptFile,
|
|
4680
5888
|
report,
|
|
4681
|
-
exitCode
|
|
5889
|
+
exitCode,
|
|
5890
|
+
outputTail: tail ? tail.toString() : null
|
|
4682
5891
|
});
|
|
4683
5892
|
blank();
|
|
4684
5893
|
}
|
|
4685
5894
|
printSummary(summaries);
|
|
4686
|
-
|
|
5895
|
+
await maybeWriteDriftReport(summaries, opts);
|
|
4687
5896
|
} finally {
|
|
4688
5897
|
await rm(tmpDir, {
|
|
4689
5898
|
recursive: true,
|
|
@@ -4696,73 +5905,208 @@ function failedSpec(s) {
|
|
|
4696
5905
|
if (s.exitCode !== 0) return true;
|
|
4697
5906
|
return (s.report?.numFailedTests ?? 0) > 0;
|
|
4698
5907
|
}
|
|
4699
|
-
function parseDriftFormat(raw) {
|
|
4700
|
-
const v = raw ?? "text";
|
|
4701
|
-
if (v === "text" || v === "json" || v === "github") return v;
|
|
4702
|
-
error(`invalid --format: ${v} (expected text|json|github)`);
|
|
4703
|
-
process.exit(2);
|
|
4704
|
-
}
|
|
4705
5908
|
/**
|
|
4706
|
-
*
|
|
4707
|
-
*
|
|
4708
|
-
*
|
|
4709
|
-
*
|
|
4710
|
-
*
|
|
4711
|
-
*
|
|
5909
|
+
* Opt-in post-vitest report hook. With `--drift-report`, a self-contained
|
|
5910
|
+
* HTML report is ALWAYS written (a green run is still a useful run summary);
|
|
5911
|
+
* failing specs additionally get a spec↔code drift audit and a three-way
|
|
5912
|
+
* root-cause call with the PR diff as context. The hook never changes the
|
|
5913
|
+
* exit code — the run's outcome is determined by vitest alone — and when
|
|
5914
|
+
* Claude auth is unavailable only the analysis is skipped, not the report.
|
|
4712
5915
|
*/
|
|
4713
|
-
function
|
|
4714
|
-
if (opts.
|
|
4715
|
-
|
|
4716
|
-
|
|
4717
|
-
}
|
|
4718
|
-
/**
|
|
4719
|
-
* Opt-in post-vitest drift hook. With `--drift`, fires only when at least
|
|
4720
|
-
* one spec failed (supplemental signal). With `--drift-strict`, fires
|
|
4721
|
-
* unconditionally so a spec/source divergence is caught even when vitest
|
|
4722
|
-
* passed. Skips silently when auth is unavailable so the run's exit code
|
|
4723
|
-
* is determined by vitest alone.
|
|
4724
|
-
*/
|
|
4725
|
-
async function maybeRunDrift(summaries, opts, currentExitCode) {
|
|
4726
|
-
const candidates = selectDriftTargets(summaries, opts);
|
|
4727
|
-
if (candidates.length === 0) return currentExitCode;
|
|
5916
|
+
async function maybeWriteDriftReport(summaries, opts) {
|
|
5917
|
+
if (!opts.driftReport) return;
|
|
5918
|
+
const outDir = typeof opts.driftReport === "string" ? opts.driftReport : DEFAULT_REPORT_DIR;
|
|
5919
|
+
const cwd = process.cwd();
|
|
4728
5920
|
const auth = driftAuthAvailable();
|
|
4729
|
-
|
|
4730
|
-
|
|
4731
|
-
|
|
5921
|
+
const failed = summaries.filter(failedSpec);
|
|
5922
|
+
if (!auth.ok && failed.length > 0) info(`failure analysis skipped (${auth.reason})`);
|
|
5923
|
+
const baseRef = resolveBaseRef(opts.driftBase);
|
|
5924
|
+
let diff = {
|
|
5925
|
+
ok: false,
|
|
5926
|
+
error: "diff not captured (no failures)"
|
|
5927
|
+
};
|
|
5928
|
+
if (failed.length > 0) {
|
|
5929
|
+
diff = await capturePrDiff(baseRef, cwd);
|
|
5930
|
+
if (!diff.ok) info(`drift-report: source diff unavailable (${diff.error}) — analyzing without diff context`);
|
|
5931
|
+
}
|
|
5932
|
+
const tree = failed.length > 0 ? await listFeatureTree(cwd) : [];
|
|
5933
|
+
const specInfoByKey = new Map(tree.flatMap((f) => f.specs.map((sp) => [`${f.featureName}/${sp.specName}`, sp])));
|
|
5934
|
+
const findSpecInfo = (s) => specInfoByKey.get(`${s.featureName}/${s.specName}`) ?? null;
|
|
5935
|
+
let driftResults = [];
|
|
5936
|
+
if (auth.ok && failed.length > 0) {
|
|
5937
|
+
const targets = failed.map((s) => {
|
|
5938
|
+
const spec = findSpecInfo(s);
|
|
5939
|
+
if (!spec) return null;
|
|
5940
|
+
const t = {
|
|
5941
|
+
featureName: s.featureName,
|
|
5942
|
+
specName: s.specName
|
|
5943
|
+
};
|
|
5944
|
+
if (spec.relatedPaths) t.relatedPaths = spec.relatedPaths;
|
|
5945
|
+
if (spec.includedBlocks) t.includedBlocks = spec.includedBlocks;
|
|
5946
|
+
return t;
|
|
5947
|
+
}).filter((t) => t !== null);
|
|
5948
|
+
if (targets.length > 0) driftResults = await analyzeDrift({
|
|
5949
|
+
targets,
|
|
5950
|
+
cwd,
|
|
5951
|
+
blocks: await loadAvailableBlocks(cwd),
|
|
5952
|
+
concurrency: Math.min(3, targets.length),
|
|
5953
|
+
...opts.model ? { model: opts.model } : {},
|
|
5954
|
+
...opts.language ? { language: opts.language } : {},
|
|
5955
|
+
onSpecStart: (t) => info(`drift audit: ${t.featureName}/${t.specName}`)
|
|
5956
|
+
});
|
|
4732
5957
|
}
|
|
4733
|
-
const
|
|
4734
|
-
|
|
4735
|
-
const
|
|
4736
|
-
const
|
|
4737
|
-
const
|
|
4738
|
-
|
|
4739
|
-
|
|
4740
|
-
|
|
4741
|
-
|
|
5958
|
+
const patchSections = diff.ok && diff.diff.patch.length > 0 ? splitPatchByFile(diff.diff.patch) : null;
|
|
5959
|
+
let printedHeader = false;
|
|
5960
|
+
const results = [];
|
|
5961
|
+
for (const s of summaries) {
|
|
5962
|
+
const assertions = collectAssertions(s);
|
|
5963
|
+
const base = {
|
|
5964
|
+
feature: s.featureName,
|
|
5965
|
+
spec: s.specName,
|
|
5966
|
+
testCounts: s.report ? {
|
|
5967
|
+
total: s.report.numTotalTests,
|
|
5968
|
+
passed: s.report.numPassedTests,
|
|
5969
|
+
failed: s.report.numFailedTests
|
|
5970
|
+
} : null,
|
|
5971
|
+
durationMs: assertions ? assertions.reduce((sum, a) => sum + (a.durationMs ?? 0), 0) : null,
|
|
5972
|
+
assertions
|
|
4742
5973
|
};
|
|
4743
|
-
if (
|
|
4744
|
-
|
|
4745
|
-
|
|
4746
|
-
|
|
4747
|
-
|
|
4748
|
-
|
|
4749
|
-
|
|
4750
|
-
|
|
4751
|
-
|
|
4752
|
-
|
|
4753
|
-
|
|
4754
|
-
|
|
4755
|
-
|
|
4756
|
-
|
|
4757
|
-
|
|
4758
|
-
|
|
5974
|
+
if (!failedSpec(s)) {
|
|
5975
|
+
results.push({
|
|
5976
|
+
...base,
|
|
5977
|
+
status: "passed",
|
|
5978
|
+
analysis: null,
|
|
5979
|
+
analysisSkipped: null,
|
|
5980
|
+
driftIssues: null,
|
|
5981
|
+
failureLogExcerpt: null,
|
|
5982
|
+
diffExcerpt: null,
|
|
5983
|
+
specYaml: null
|
|
5984
|
+
});
|
|
5985
|
+
continue;
|
|
5986
|
+
}
|
|
5987
|
+
const specYaml = await tryReadSpecFile(s.featureName, s.specName, cwd);
|
|
5988
|
+
const relatedPaths = findSpecInfo(s)?.relatedPaths ?? null;
|
|
5989
|
+
const diffExcerpt = patchSections ? scopePatchForSpec(patchSections, relatedPaths) : null;
|
|
5990
|
+
const driftResult = driftResults.find((r) => r.target.featureName === s.featureName && r.target.specName === s.specName);
|
|
5991
|
+
const driftIssues = driftResult?.ok ? driftResult.issues : null;
|
|
5992
|
+
const failureLog = buildFailureLog(s);
|
|
5993
|
+
let analysis = null;
|
|
5994
|
+
let analysisSkipped = null;
|
|
5995
|
+
if (!auth.ok) analysisSkipped = auth.reason;
|
|
5996
|
+
else if (specYaml === null) analysisSkipped = "no spec.yaml found for this spec";
|
|
5997
|
+
else {
|
|
5998
|
+
const script = await readScriptSafe(s.scriptFile);
|
|
5999
|
+
info(`failure analysis: ${s.featureName}/${s.specName}`);
|
|
6000
|
+
const outcome = await analyzeFailure({
|
|
6001
|
+
script,
|
|
6002
|
+
specYaml,
|
|
6003
|
+
failureLog,
|
|
6004
|
+
diffPatch: diffExcerpt,
|
|
6005
|
+
changedFiles: diff.ok ? diff.diff.nameStatus : null,
|
|
6006
|
+
baseRef: diff.ok ? baseRef : null,
|
|
6007
|
+
driftIssues,
|
|
6008
|
+
...opts.language ? { outputLanguage: opts.language } : {}
|
|
6009
|
+
}, {
|
|
6010
|
+
...opts.model ? { model: opts.model } : {},
|
|
6011
|
+
cwd
|
|
6012
|
+
});
|
|
6013
|
+
analysis = outcome.analysis;
|
|
6014
|
+
if (!printedHeader) {
|
|
6015
|
+
process.stdout.write(`\n${C.cyan}${C.bold}──────── failure analysis ────────${C.reset}\n`);
|
|
6016
|
+
printedHeader = true;
|
|
6017
|
+
}
|
|
6018
|
+
const pct = Math.round(outcome.analysis.confidence * 100);
|
|
6019
|
+
const firstLine = outcome.analysis.reasoning.split("\n")[0] ?? "";
|
|
6020
|
+
process.stdout.write(`${C.red}✖${C.reset} ${C.bold}${s.featureName}/${s.specName}${C.reset} → ${C.bold}${outcome.analysis.label}${C.reset} (${pct}%)${firstLine ? ` ${C.dim}${firstLine}${C.reset}` : ""}\n`);
|
|
4759
6021
|
}
|
|
6022
|
+
results.push({
|
|
6023
|
+
...base,
|
|
6024
|
+
status: "failed",
|
|
6025
|
+
analysis,
|
|
6026
|
+
analysisSkipped,
|
|
6027
|
+
driftIssues,
|
|
6028
|
+
failureLogExcerpt: failureLog.length > 0 ? failureLog : null,
|
|
6029
|
+
diffExcerpt,
|
|
6030
|
+
specYaml
|
|
6031
|
+
});
|
|
6032
|
+
}
|
|
6033
|
+
const data = {
|
|
6034
|
+
schemaVersion: 1,
|
|
6035
|
+
createdAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6036
|
+
runId: process.env["GITHUB_RUN_ID"] ?? null,
|
|
6037
|
+
git: {
|
|
6038
|
+
head: diff.ok ? diff.diff.head : null,
|
|
6039
|
+
base: diff.ok ? baseRef : null
|
|
6040
|
+
},
|
|
6041
|
+
model: opts.model ?? null,
|
|
6042
|
+
promptVersion: "2",
|
|
6043
|
+
results
|
|
6044
|
+
};
|
|
6045
|
+
const reportPath = join(outDir, "index.html");
|
|
6046
|
+
await mkdir(outDir, { recursive: true });
|
|
6047
|
+
await writeFile(reportPath, renderRunReport(data), "utf8");
|
|
6048
|
+
info(`run report written to ${reportPath}`);
|
|
6049
|
+
}
|
|
6050
|
+
/**
 * Flatten every assertion in a spec's JSON report into a single list.
 *
 * @param s Spec summary; only `s.report` is read.
 * @returns `null` when the spec has no report, otherwise one
 *   `{ name, status, durationMs }` entry per assertion, in report order.
 *   Any status other than "passed" / "failed" is reported as "skipped", and a
 *   missing duration becomes `null`.
 */
function collectAssertions(s) {
	const report = s.report;
	if (!report) return null;
	const normalizeStatus = (status) => {
		if (status === "passed" || status === "failed") return status;
		return "skipped";
	};
	return report.testResults.flatMap((file) => file.assertionResults.map((assertion) => ({
		name: assertion.fullName,
		status: normalizeStatus(assertion.status),
		durationMs: assertion.duration ?? null
	})));
}
|
|
6060
|
+
/**
 * Assemble the failure log that is handed to the analysis prompt and embedded
 * in the report.
 *
 * With `--reporter=json` vitest writes (almost) nothing to stdout/stderr — the
 * assertion failures live in the JSON report — so the structured
 * failureMessages are listed first and the raw output tail (console logs,
 * agent-browser noise) follows as secondary context.
 *
 * @param s Spec summary; reads `s.report` and `s.outputTail`.
 * @returns The log lines joined with "\n" (an empty string when there is
 *   nothing to report).
 */
function buildFailureLog(s) {
	const files = s.report ? s.report.testResults : [];
	const lines = files.flatMap((file) => file.assertionResults).filter((assertion) => assertion.status === "failed").flatMap((assertion) => [`✖ ${assertion.fullName}`, ...assertion.failureMessages ?? []]);
	const tail = s.outputTail?.trim();
	if (!tail) return lines.join("\n");
	return [
		...lines,
		"--- vitest output (tail) ---",
		tail
	].join("\n");
}
|
|
6081
|
+
/**
 * Best-effort read of a script file as UTF-8 text.
 *
 * @param path Location of the script, passed straight to `readFile`.
 * @returns The file contents, or an empty string when the read fails for any
 *   reason (the failure is deliberately not surfaced).
 */
async function readScriptSafe(path) {
	const onFailure = () => "";
	return Promise.resolve().then(() => readFile(path, "utf8")).catch(onFailure);
}
|
|
6088
|
+
/** Cap on the per-spec output tail kept for the report / analysis prompt. */
|
|
6089
|
+
const OUTPUT_TAIL_CAP = 64 * 1024;
|
|
6090
|
+
/**
 * Keeps the LAST `cap` characters appended. Vitest puts the failure summary
 * at the end of its output, so the tail is the part worth keeping when a
 * noisy spec overflows the cap.
 *
 * Lengths are measured with `String.prototype.length` / `slice`, i.e. in
 * UTF-16 code units.
 */
var TailBuffer = class {
	buf = "";
	cap;
	/**
	 * Set once `append` has discarded text. Without it, a buffer trimmed down
	 * to exactly `cap` characters would look untruncated to `toString`.
	 */
	truncated = false;
	/**
	 * @param cap Maximum number of trailing characters `toString` reports.
	 */
	constructor(cap) {
		this.cap = cap;
	}
	/**
	 * Append a chunk of output.
	 *
	 * @param s Text to add to the end of the buffer.
	 */
	append(s) {
		this.buf += s;
		// Trim lazily: only once the buffer has grown past twice the cap, so
		// the slice is not repeated on every small chunk.
		if (this.buf.length > this.cap * 2) {
			this.buf = this.buf.slice(-this.cap);
			this.truncated = true;
		}
	}
	/**
	 * @returns Everything appended when nothing was dropped and it fits in
	 *   `cap`; otherwise the last `cap` characters prefixed with a
	 *   "[...output truncated...]" marker line.
	 */
	toString() {
		if (!this.truncated && this.buf.length <= this.cap) return this.buf;
		return `[...output truncated...]\n${this.buf.slice(-this.cap)}`;
	}
};
|
|
4766
6110
|
async function readReport(path) {
|
|
4767
6111
|
try {
|
|
4768
6112
|
const raw = await readFile(path, "utf8");
|
|
@@ -4834,7 +6178,7 @@ function formatDuration(ms) {
|
|
|
4834
6178
|
return `${(ms / 1e3).toFixed(2)}s`;
|
|
4835
6179
|
}
|
|
4836
6180
|
const NOISE_LINE_PATTERNS = [/^JSON report written to /];
|
|
4837
|
-
async function streamFiltered(source, sink) {
|
|
6181
|
+
async function streamFiltered(source, sink, capture) {
|
|
4838
6182
|
source.setEncoding("utf8");
|
|
4839
6183
|
let buffer = "";
|
|
4840
6184
|
for await (const chunk of source) {
|
|
@@ -4843,11 +6187,17 @@ async function streamFiltered(source, sink) {
|
|
|
4843
6187
|
while (nl !== -1) {
|
|
4844
6188
|
const line = buffer.slice(0, nl);
|
|
4845
6189
|
buffer = buffer.slice(nl + 1);
|
|
4846
|
-
if (!NOISE_LINE_PATTERNS.some((p) => p.test(line)))
|
|
6190
|
+
if (!NOISE_LINE_PATTERNS.some((p) => p.test(line))) {
|
|
6191
|
+
sink.write(line + "\n");
|
|
6192
|
+
capture?.append(line + "\n");
|
|
6193
|
+
}
|
|
4847
6194
|
nl = buffer.indexOf("\n");
|
|
4848
6195
|
}
|
|
4849
6196
|
}
|
|
4850
|
-
if (buffer.length > 0 && !NOISE_LINE_PATTERNS.some((p) => p.test(buffer)))
|
|
6197
|
+
if (buffer.length > 0 && !NOISE_LINE_PATTERNS.some((p) => p.test(buffer))) {
|
|
6198
|
+
sink.write(buffer);
|
|
6199
|
+
capture?.append(buffer);
|
|
6200
|
+
}
|
|
4851
6201
|
}
|
|
4852
6202
|
async function resolveSpecs(target) {
|
|
4853
6203
|
if (!target) return listAllSpecs();
|
|
@@ -4866,7 +6216,7 @@ async function resolveSpecs(target) {
|
|
|
4866
6216
|
//#endregion
|
|
4867
6217
|
//#region src/cli/draft.ts
|
|
4868
6218
|
const CATEGORY_LABEL = DRAFT_CATEGORY_LABEL;
|
|
4869
|
-
const draftCommand = new Command("draft").argument("[feature/spec]", "Optional spec path (e.g. tasks/create-and-complete). If omitted, Claude proposes one from your intent.").description("Interactively draft and refine a spec.yaml with Claude Code").option("--instruction <text>", "Non-interactive single-shot instruction (skips the interactive loop)").option("--apply", "Auto-apply each generated patch without [y/N] confirmation", false).action(async (specPath, opts) => {
|
|
6219
|
+
const draftCommand = addLanguageOption(new Command("draft").argument("[feature/spec]", "Optional spec path (e.g. tasks/create-and-complete). If omitted, Claude proposes one from your intent.").description("Interactively draft and refine a spec.yaml with Claude Code").option("--instruction <text>", "Non-interactive single-shot instruction (skips the interactive loop)").option("--apply", "Auto-apply each generated patch without [y/N] confirmation", false)).action(async (specPath, opts) => {
|
|
4870
6220
|
await ensureCcqaDir();
|
|
4871
6221
|
let featureName;
|
|
4872
6222
|
let specName;
|
|
@@ -4882,6 +6232,7 @@ const draftCommand = new Command("draft").argument("[feature/spec]", "Optional s
|
|
|
4882
6232
|
});
|
|
4883
6233
|
async function runDraft(featureName, specName, opts, prefilledIntent) {
|
|
4884
6234
|
header("draft", `${featureName}/${specName}`);
|
|
6235
|
+
const ja = useJapanesePrompts(opts.language);
|
|
4885
6236
|
const oneShot = opts.instruction !== void 0;
|
|
4886
6237
|
let useIntentOnce = prefilledIntent !== null && !oneShot;
|
|
4887
6238
|
while (true) {
|
|
@@ -4892,7 +6243,7 @@ async function runDraft(featureName, specName, opts, prefilledIntent) {
|
|
|
4892
6243
|
else if (useIntentOnce && isFirstRun) {
|
|
4893
6244
|
userInput = prefilledIntent ?? "";
|
|
4894
6245
|
useIntentOnce = false;
|
|
4895
|
-
} else userInput = await prompt(isFirstRun ? "What do you want to test? > " : "How would you like to refine? (empty = re-validate) > ");
|
|
6246
|
+
} else userInput = await prompt(isFirstRun ? ja ? "何をテストしたいですか? > " : "What do you want to test? > " : ja ? "どのように修正しますか? (空欄で再検証) > " : "How would you like to refine? (empty = re-validate) > ");
|
|
4896
6247
|
if (isFirstRun && !userInput.trim()) {
|
|
4897
6248
|
error("intent required for the first draft (no spec exists yet)");
|
|
4898
6249
|
process.exit(1);
|
|
@@ -4902,11 +6253,12 @@ async function runDraft(featureName, specName, opts, prefilledIntent) {
|
|
|
4902
6253
|
specName,
|
|
4903
6254
|
existing,
|
|
4904
6255
|
userInput: userInput.trim(),
|
|
4905
|
-
autoApply: opts.apply === true
|
|
6256
|
+
autoApply: opts.apply === true,
|
|
6257
|
+
language: opts.language
|
|
4906
6258
|
});
|
|
4907
6259
|
if (oneShot) process.exit(turnResult.hasError && !turnResult.applied ? 1 : 0);
|
|
4908
6260
|
blank();
|
|
4909
|
-
if (/^y/i.test(await prompt("Are you done with this draft? [y/N] "))) {
|
|
6261
|
+
if (/^y/i.test(await prompt(ja ? "このドラフトは完了ですか? [y/N] " : "Are you done with this draft? [y/N] "))) {
|
|
4910
6262
|
info("draft session complete.");
|
|
4911
6263
|
hint(`run 'ccqa trace ${featureName}/${specName}' to record actions`);
|
|
4912
6264
|
process.exit(0);
|
|
@@ -4914,9 +6266,9 @@ async function runDraft(featureName, specName, opts, prefilledIntent) {
|
|
|
4914
6266
|
}
|
|
4915
6267
|
}
|
|
4916
6268
|
async function runOneTurn(input) {
|
|
4917
|
-
const { featureName, specName, existing, userInput, autoApply } = input;
|
|
6269
|
+
const { featureName, specName, existing, userInput, autoApply, language } = input;
|
|
4918
6270
|
const isFirstRun = existing === null;
|
|
4919
|
-
const systemPrompt = buildDraftSystemPrompt(await loadAvailableBlocks());
|
|
6271
|
+
const systemPrompt = buildDraftSystemPrompt(await loadAvailableBlocks()) + languageDirective(language);
|
|
4920
6272
|
const userPrompt = buildDraftPrompt({
|
|
4921
6273
|
mode: isFirstRun ? "create" : "refine",
|
|
4922
6274
|
existing: existing ?? "",
|
|
@@ -4979,7 +6331,7 @@ async function runOneTurn(input) {
|
|
|
4979
6331
|
info("--- proposed changes ---");
|
|
4980
6332
|
printUnifiedDiff(original, report.patch);
|
|
4981
6333
|
blank();
|
|
4982
|
-
if (!(autoApply ? true : /^y/i.test(await prompt("Apply this patch? [y/N] ")))) {
|
|
6334
|
+
if (!(autoApply ? true : /^y/i.test(await prompt(useJapanesePrompts(language) ? "このパッチを適用しますか? [y/N] " : "Apply this patch? [y/N] ")))) {
|
|
4983
6335
|
info("aborted — no changes applied.");
|
|
4984
6336
|
return {
|
|
4985
6337
|
hasError,
|
|
@@ -5071,8 +6423,9 @@ function writeFinding(issue) {
|
|
|
5071
6423
|
if (issue.detail) process.stdout.write(` └ ${issue.detail.replace(/\n/g, "\n ")}\n`);
|
|
5072
6424
|
}
|
|
5073
6425
|
async function proposeNaming(opts) {
|
|
6426
|
+
const ja = useJapanesePrompts(opts.language);
|
|
5074
6427
|
const oneShot = opts.instruction !== void 0;
|
|
5075
|
-
const intent = oneShot ? opts.instruction ?? "" : await prompt("What do you want to test? > ");
|
|
6428
|
+
const intent = oneShot ? opts.instruction ?? "" : await prompt(ja ? "何をテストしたいですか? > " : "What do you want to test? > ");
|
|
5076
6429
|
if (!intent.trim()) {
|
|
5077
6430
|
error("intent required to propose a feature/spec name");
|
|
5078
6431
|
process.exit(1);
|
|
@@ -5124,13 +6477,13 @@ async function proposeNaming(opts) {
|
|
|
5124
6477
|
naming: final,
|
|
5125
6478
|
intent: intent.trim()
|
|
5126
6479
|
};
|
|
5127
|
-
const answer = await prompt(
|
|
6480
|
+
const answer = await prompt(ja ? "この名前を使いますか? [y/N/edit] > " : "Use this name? [y/N/edit] > ");
|
|
5128
6481
|
if (/^y/i.test(answer)) return {
|
|
5129
6482
|
naming: final,
|
|
5130
6483
|
intent: intent.trim()
|
|
5131
6484
|
};
|
|
5132
6485
|
if (/^e/i.test(answer)) {
|
|
5133
|
-
const manual = await prompt("Enter feature/spec (e.g. tasks/create-and-complete) > ");
|
|
6486
|
+
const manual = await prompt(ja ? "feature/spec を入力 (例 tasks/create-and-complete) > " : "Enter feature/spec (e.g. tasks/create-and-complete) > ");
|
|
5134
6487
|
const parts = manual.split("/");
|
|
5135
6488
|
if (parts.length !== 2 || !parts[0] || !parts[1]) {
|
|
5136
6489
|
error(`invalid spec path: "${manual}". Expected "<feature>/<spec>"`);
|
|
@@ -5230,163 +6583,141 @@ function truncate(s, n) {
|
|
|
5230
6583
|
return s.slice(s.length - n);
|
|
5231
6584
|
}
|
|
5232
6585
|
//#endregion
|
|
5233
|
-
//#region src/drift/
|
|
5234
|
-
const execFileP = promisify(execFile);
|
|
5235
|
-
/**
|
|
5236
|
-
* Resolve the base ref to diff against for `ccqa drift --changed`.
|
|
5237
|
-
* Precedence: explicit override > GITHUB_BASE_REF > origin/main.
|
|
5238
|
-
*/
|
|
5239
|
-
function resolveBaseRef(explicit) {
|
|
5240
|
-
if (explicit && explicit.length > 0) return explicit;
|
|
5241
|
-
const ghBase = process.env["GITHUB_BASE_REF"];
|
|
5242
|
-
if (ghBase && ghBase.length > 0) return ghBase.startsWith("origin/") ? ghBase : `origin/${ghBase}`;
|
|
5243
|
-
return "origin/main";
|
|
5244
|
-
}
|
|
5245
|
-
/**
|
|
5246
|
-
* Run `git diff --name-status base...HEAD` from `cwd` and return one entry per
|
|
5247
|
-
* changed file. Renames are reported under their NEW path with status
|
|
5248
|
-
* "renamed" — the OLD path is dropped because the spec mapping is against the
|
|
5249
|
-
* post-rename layout.
|
|
5250
|
-
*
|
|
5251
|
-
* Paths are re-rooted to be relative to `cwd`, not the git repo root. In a
|
|
5252
|
-
* monorepo where `cwd` is a sub-package (e.g. `apps/foo`), git emits paths
|
|
5253
|
-
* relative to the repo root, but specs declare relatedPaths relative to
|
|
5254
|
-
* their own package. Changes outside `cwd` are dropped so an unrelated PR
|
|
5255
|
-
* can never accidentally scope a sub-package's specs in.
|
|
5256
|
-
*/
|
|
5257
|
-
async function getChangedFiles(base, cwd) {
|
|
5258
|
-
const [{ stdout: rootOut }, { stdout: diffOut }] = await Promise.all([execFileP("git", ["rev-parse", "--show-toplevel"], { cwd }), execFileP("git", [
|
|
5259
|
-
"diff",
|
|
5260
|
-
"--name-status",
|
|
5261
|
-
"-M",
|
|
5262
|
-
`${base}...HEAD`
|
|
5263
|
-
], {
|
|
5264
|
-
cwd,
|
|
5265
|
-
maxBuffer: 32 * 1024 * 1024
|
|
5266
|
-
})]);
|
|
5267
|
-
return rerootChangedFiles(parseGitDiffOutput(diffOut), rootOut.trim(), cwd);
|
|
5268
|
-
}
|
|
6586
|
+
//#region src/drift/format.ts
|
|
5269
6587
|
/**
 * Render drift results as a string. The CLI commands and the `run` failure
 * hook are the only callers; both want the formatted output returned so
 * they can prefix / interleave / pipe it as needed.
 *
 * @param results Drift results to render.
 * @param format "json" or "github"; any other value yields the text layout.
 * @param cwd Working directory, used only by the "github" renderer.
 * @returns The rendered output.
 */
function renderDrift(results, format, cwd) {
	switch (format) {
		case "json": return renderJson(results);
		case "github": return renderGithub(results, cwd);
		default: return renderText(results);
	}
}
|
|
5287
|
-
|
|
6597
|
+
const HEAVY_RULE = "═".repeat(72);
|
|
6598
|
+
/**
 * Render drift results as a human-readable text report.
 *
 * Each spec gets a heading rule followed by either its spec-level error, a
 * single "all passed" line, or its ERROR findings, then WARN findings, then a
 * one-line list of the passed categories. A totals footer closes the report.
 *
 * @param results Drift results to render.
 * @returns The report lines joined with "\n".
 */
function renderText(results) {
	const lines = [];
	for (const result of results) {
		const heading = `══ ${result.target.featureName}/${result.target.specName} `;
		const padding = "═".repeat(Math.max(3, 72 - heading.length));
		lines.push("", heading + padding);
		if (result.error) {
			lines.push(` ERROR ${result.error}`);
			continue;
		}
		const withSeverity = (severity) => result.issues.filter((issue) => issue.severity === severity);
		const errors = withSeverity("ERROR");
		const warnings = withSeverity("WARN");
		const passed = withSeverity("OK");
		const hasFindings = errors.length > 0 || warnings.length > 0;
		if (!hasFindings) {
			const noun = passed.length === 1 ? "check" : "checks";
			lines.push(` ✓ ${passed.length > 0 ? `all ${passed.length} ${noun} passed` : "no issues"}`);
			continue;
		}
		errors.forEach((issue) => {
			appendFinding(lines, "ERROR", issue);
		});
		warnings.forEach((issue) => {
			appendFinding(lines, "WARN", issue);
		});
		if (passed.length > 0) {
			const names = passed.map((issue) => DRAFT_CATEGORY_LABEL[issue.category]).join(", ");
			lines.push("", ` ✓ passed (${passed.length}): ${names}`);
		}
	}
	lines.push("", HEAVY_RULE);
	const totals = summarize(results);
	lines.push(` specs ${results.length} (${totals.errored} errored)`, ` findings ${totals.error} error, ${totals.warn} warn, ${totals.ok} ok`, "");
	return lines.join("\n");
}
|
|
6634
|
+
/**
 * Append the text lines for one finding to `out`.
 *
 * @param out Array of output lines; mutated in place.
 * @param level Severity label printed in front of the category.
 * @param issue Finding to print; reads `category`, `stepId`, `message` and
 *   `detail`.
 */
function appendFinding(out, level, issue) {
	const stepSuffix = issue.stepId ? ` ${issue.stepId}` : "";
	out.push("", ` ${level} ${DRAFT_CATEGORY_LABEL[issue.category]}${stepSuffix}`, ` ${issue.message}`);
	if (!issue.detail) return;
	// Continuation lines of a multi-line detail get the same prefix as the text.
	const indentedDetail = issue.detail.replace(/\n/g, "\n ");
	out.push(` └ ${indentedDetail}`);
}
|
|
6641
|
+
/**
 * Render drift results as pretty-printed JSON (2-space indent) followed by a
 * trailing newline.
 *
 * @param results Drift results to render.
 * @returns `{ "specs": [...] }` serialised as a string. `error` and `detail`
 *   keys are only present when the corresponding value is truthy.
 */
function renderJson(results) {
	const toIssue = (issue) => {
		const entry = {
			severity: issue.severity,
			category: issue.category,
			stepId: issue.stepId,
			message: issue.message
		};
		if (issue.detail) entry.detail = issue.detail;
		return entry;
	};
	const specs = results.map((result) => {
		// Keys are added in output order: feature, spec, ok, [error], issues.
		const entry = {
			feature: result.target.featureName,
			spec: result.target.specName,
			ok: result.ok
		};
		if (result.error) entry.error = result.error;
		entry.issues = result.issues.map((issue) => toIssue(issue));
		return entry;
	});
	return `${JSON.stringify({ specs }, null, 2)}\n`;
}
|
|
6657
|
+
/**
 * Render drift results as GitHub Actions workflow commands
 * (`::error ...::message` / `::warning ...::message`), one per line.
 *
 * Spec-level errors become a single `::error` annotation; OK findings are
 * omitted. Every annotation points at the spec's spec.yaml.
 *
 * @param results Drift results to render.
 * @param cwd Directory containing the `.ccqa` folder.
 * @returns The annotation lines joined with "\n" plus a trailing newline, or
 *   an empty string when there is nothing to annotate.
 */
function renderGithub(results, cwd) {
	const repoRoot = process.env["GITHUB_WORKSPACE"] ?? process.cwd();
	const lines = [];
	for (const r of results) {
		// The path is a command property, so `,` and `:` in it (e.g. a Windows
		// drive prefix when the absolute path is used) must be escaped like
		// the title is; otherwise the property list is mis-parsed.
		const file = escapeGhProp(githubRelPath(cwd, repoRoot, r.target.featureName, r.target.specName));
		if (r.error) {
			lines.push(`::error file=${file}::${escapeGhMessage(r.error)}`);
			continue;
		}
		for (const issue of r.issues) {
			if (issue.severity === "OK") continue;
			const level = issue.severity === "ERROR" ? "error" : "warning";
			const title = `${r.target.featureName}/${r.target.specName} — ${issue.category}${issue.stepId ? ` (${issue.stepId})` : ""}`;
			const body = issue.detail ? `${issue.message}\n${issue.detail}` : issue.message;
			lines.push(`::${level} file=${file},title=${escapeGhProp(title)}::${escapeGhMessage(body)}`);
		}
	}
	return lines.length === 0 ? "" : `${lines.join("\n")}\n`;
}
|
|
5340
|
-
function
|
|
5341
|
-
|
|
6676
|
+
/**
 * Locate a spec's spec.yaml for a GitHub annotation.
 *
 * @param cwd Directory containing the `.ccqa` folder.
 * @param repoRoot Directory the returned path should be relative to.
 * @param featureName Feature directory name.
 * @param specName Test-case directory name.
 * @returns The path relative to `repoRoot`, or the absolute path when the
 *   file lies outside `repoRoot`.
 */
function githubRelPath(cwd, repoRoot, featureName, specName) {
	const abs = resolve(cwd, ".ccqa", "features", featureName, "test-cases", specName, "spec.yaml");
	const rel = relative(repoRoot, abs);
	// Only a leading ".." path SEGMENT means the file is outside repoRoot; a
	// directory whose name merely begins with ".." (e.g. "..pkg") is inside.
	const escapesRoot = rel === ".." || rel.startsWith("../") || rel.startsWith("..\\");
	return escapesRoot ? abs : rel;
}
|
|
5343
|
-
|
|
5344
|
-
|
|
5345
|
-
function compileGlob(pattern) {
|
|
5346
|
-
const cached = REGEX_CACHE.get(pattern);
|
|
5347
|
-
if (cached) return cached;
|
|
5348
|
-
const compiled = globToRegExp(stripLeadingDotSlash(pattern));
|
|
5349
|
-
REGEX_CACHE.set(pattern, compiled);
|
|
5350
|
-
return compiled;
|
|
6681
|
+
/**
 * Percent-encode the characters that may not appear raw in the message part
 * of a GitHub workflow command: `%`, carriage return and line feed.
 *
 * @param s Message text.
 * @returns The encoded text.
 */
function escapeGhMessage(s) {
	const encoded = {
		"%": "%25",
		"\r": "%0D",
		"\n": "%0A"
	};
	return s.replace(/[%\r\n]/g, (ch) => encoded[ch]);
}
|
|
5352
|
-
function
|
|
5353
|
-
|
|
5354
|
-
|
|
5355
|
-
|
|
5356
|
-
|
|
5357
|
-
|
|
5358
|
-
|
|
5359
|
-
|
|
5360
|
-
|
|
5361
|
-
|
|
5362
|
-
if (
|
|
5363
|
-
|
|
5364
|
-
|
|
5365
|
-
continue;
|
|
5366
|
-
}
|
|
5367
|
-
if (pattern[i + 1] !== "*") {
|
|
5368
|
-
re += "[^/]*";
|
|
5369
|
-
i++;
|
|
5370
|
-
continue;
|
|
5371
|
-
}
|
|
5372
|
-
const hasLeadingSlash = re.endsWith("/");
|
|
5373
|
-
const hasTrailingSlash = pattern[i + 2] === "/";
|
|
5374
|
-
if (hasLeadingSlash) re = re.slice(0, -1);
|
|
5375
|
-
if (hasLeadingSlash || hasTrailingSlash) re += "(?:/?.*)?";
|
|
5376
|
-
else re += ".*";
|
|
5377
|
-
i += hasTrailingSlash ? 3 : 2;
|
|
6684
|
+
/**
 * Percent-encode the characters that may not appear raw in a property value
 * of a GitHub workflow command: `%`, carriage return, line feed, `,` and `:`.
 *
 * @param s Property value.
 * @returns The encoded value.
 */
function escapeGhProp(s) {
	const encoded = {
		"%": "%25",
		"\r": "%0D",
		"\n": "%0A",
		",": "%2C",
		":": "%3A"
	};
	return s.replace(/[%\r\n,:]/g, (ch) => encoded[ch]);
}
|
|
6687
|
+
/**
 * Count findings across all drift results.
 *
 * @param results Drift results; reads `error` and `issues` of each.
 * @returns `{ error, warn, ok, errored }` — the number of ERROR findings, of
 *   WARN findings, of all remaining findings, and of results carrying a
 *   spec-level error.
 */
function summarize(results) {
	const totals = {
		error: 0,
		warn: 0,
		ok: 0,
		errored: 0
	};
	for (const result of results) {
		if (result.error) totals.errored += 1;
		for (const { severity } of result.issues) switch (severity) {
			case "ERROR":
				totals.error += 1;
				break;
			case "WARN":
				totals.warn += 1;
				break;
			default: totals.ok += 1;
		}
	}
	return totals;
}
|
|
6705
|
+
//#endregion
|
|
6706
|
+
//#region src/drift/exit-code.ts
|
|
5381
6707
|
/**
 * Map drift results to an exit code. Spec-level errors (Claude call failed)
 * always fail; otherwise ERROR severity always fails, WARN fails only when
 * the threshold is `warn`.
 *
 * @param results Drift results; reads `error` and `issues` of each.
 * @param threshold Severity threshold; only the value "warn" changes the
 *   outcome.
 * @returns 1 when any result fails under the rules above, otherwise 0.
 */
function determineExitCode(results, threshold) {
	const warnFails = threshold === "warn";
	const issueFails = ({ severity }) => severity === "ERROR" || warnFails && severity === "WARN";
	const resultFails = (result) => Boolean(result.error) || result.issues.some(issueFails);
	return results.some(resultFails) ? 1 : 0;
}
|
|
5391
6722
|
//#endregion
|
|
5392
6723
|
//#region src/drift/route-new-files.ts
|
|
@@ -5503,7 +6834,7 @@ Return the spec keys that might be affected by any of the new files. Conservativ
|
|
|
5503
6834
|
//#endregion
|
|
5504
6835
|
//#region src/cli/drift.ts
|
|
5505
6836
|
const DEFAULT_CONCURRENCY = 3;
|
|
5506
|
-
const driftCommand = new Command("drift").argument("[feature/spec]", "Optional spec id. If omitted, every spec under .ccqa/features/ is checked.").description("Check whether each spec.yaml is still in sync with the current codebase (CI-friendly, no patches applied).").option("--format <fmt>", "Output format: text | json | github", "text").option("--severity <level>", "Exit non-zero on this severity or higher: warn | error", "error").option("--concurrency <n>", `Parallel spec checks (default: ${DEFAULT_CONCURRENCY})`).option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID. Overrides CCQA_MODEL.").option("--cwd <path>", "Working directory used as both the .ccqa root and the codebase Claude reads. Useful for monorepos. Defaults to process.cwd().").option("--changed", "Restrict drift checks to specs whose relatedPaths intersect the git diff against --base (or, in CI, $GITHUB_BASE_REF, else origin/main). New files are routed to specs via a single lightweight Claude call.").option("--base <ref>", "Base ref to diff against when --changed is set. Defaults to $GITHUB_BASE_REF (CI) or origin/main.").action(async (specPath, opts) => {
|
|
6837
|
+
const driftCommand = addLanguageOption(new Command("drift").argument("[feature/spec]", "Optional spec id. If omitted, every spec under .ccqa/features/ is checked.").description("Check whether each spec.yaml is still in sync with the current codebase (CI-friendly, no patches applied).").option("--format <fmt>", "Output format: text | json | github", "text").option("--severity <level>", "Exit non-zero on this severity or higher: warn | error", "error").option("--concurrency <n>", `Parallel spec checks (default: ${DEFAULT_CONCURRENCY})`).option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID. Overrides CCQA_MODEL.").option("--cwd <path>", "Working directory used as both the .ccqa root and the codebase Claude reads. Useful for monorepos. Defaults to process.cwd().").option("--changed", "Restrict drift checks to specs whose relatedPaths intersect the git diff against --base (or, in CI, $GITHUB_BASE_REF, else origin/main). New files are routed to specs via a single lightweight Claude call.").option("--base <ref>", "Base ref to diff against when --changed is set. Defaults to $GITHUB_BASE_REF (CI) or origin/main.")).action(async (specPath, opts) => {
|
|
5507
6838
|
const format = parseFormat(opts.format);
|
|
5508
6839
|
const threshold = parseSeverity(opts.severity);
|
|
5509
6840
|
const concurrency = parseConcurrency(opts.concurrency);
|
|
@@ -5538,6 +6869,7 @@ const driftCommand = new Command("drift").argument("[feature/spec]", "Optional s
|
|
|
5538
6869
|
blocks,
|
|
5539
6870
|
concurrency,
|
|
5540
6871
|
...opts.model ? { model: opts.model } : {},
|
|
6872
|
+
...opts.language ? { language: opts.language } : {},
|
|
5541
6873
|
onSpecStart: (t) => {
|
|
5542
6874
|
if (format === "text") info(`checking ${t.featureName}/${t.specName}`);
|
|
5543
6875
|
}
|
|
@@ -5650,6 +6982,446 @@ function parseConcurrency(raw) {
|
|
|
5650
6982
|
return n;
|
|
5651
6983
|
}
|
|
5652
6984
|
//#endregion
|
|
6985
|
+
//#region src/prompts/perspectives.ts
|
|
6986
|
+
/**
|
|
6987
|
+
* Build the system prompt. By default the descriptive fields follow the
|
|
6988
|
+
* spec's own language (Japanese specs → Japanese fields). An explicit
|
|
6989
|
+
* `--language` is applied by the CLI via `languageDirective`, appended to
|
|
6990
|
+
* this prompt, so the language handling lives in one shared place.
|
|
6991
|
+
*/
|
|
6992
|
+
function buildPerspectivesSystemPrompt() {
|
|
6993
|
+
return `You produce a factual inventory of the E2E test coverage that already exists in a ccqa project.
|
|
6994
|
+
|
|
6995
|
+
Think of it as a QA coverage stock-take: for each existing test case, fill in a few short, neutral descriptive fields derived from its steps. Nothing more.
|
|
6996
|
+
|
|
6997
|
+
## Hard boundaries (do NOT cross)
|
|
6998
|
+
|
|
6999
|
+
- Do NOT assign severity, importance, priority, or risk. Whether a failure hurts the customer is a human + PdM decision; you are not authoring that here.
|
|
7000
|
+
- Do NOT do gap analysis. Do NOT list untested areas, missing coverage, or things the code has but the tests lack.
|
|
7001
|
+
- Do NOT evaluate whether the feature is good, complete, or correct.
|
|
7002
|
+
- Do NOT propose new test cases.
|
|
7003
|
+
- Do NOT restate the full step-by-step procedure or the per-step expected results — the spec.yaml is the source of truth for those and the inventory links to it.
|
|
7004
|
+
- Do NOT touch status, relatedPaths, feature names, or spec names — the CLI already fixed those.
|
|
7005
|
+
|
|
7006
|
+
## Fields to write (per spec)
|
|
7007
|
+
|
|
7008
|
+
- \`summary\`: 1–2 sentences, factual and neutral. What the test exercises and what it ultimately asserts, derived from the spec's \`steps\` (\`instruction\` / \`expected\`).
|
|
7009
|
+
- \`startScreen\`: the screen/URL the test first lands on after setup (e.g. "Dashboard (/dashboard)"). Derive from the first non-login \`instruction\`. Omit if genuinely unclear.
|
|
7010
|
+
- \`testCondition\`: the state/precondition the scenario assumes, phrased as a condition (e.g. "Logged in as an admin", "Unauthenticated user"). Omit if none.
|
|
7011
|
+
- \`preconditions\`: array of short setup prerequisites (e.g. which role logs in, required prior state). Derive from \`include: login\` params and the opening steps. Empty/omit if none.
|
|
7012
|
+
|
|
7013
|
+
## How to write
|
|
7014
|
+
|
|
7015
|
+
- Same language as the spec's title (if titles are Japanese, write these fields in Japanese).
|
|
7016
|
+
- Keep each field short. These are index entries, not the test itself.
|
|
7017
|
+
- You may use Read/Grep/Glob sparingly to clarify domain vocabulary, but the steps are the primary source. Do not over-explore.
|
|
7018
|
+
|
|
7019
|
+
## Output contract (STRICT)
|
|
7020
|
+
|
|
7021
|
+
Output exactly ONE fenced \`\`\`json code block, and nothing else outside it. No prose before or after.
|
|
7022
|
+
|
|
7023
|
+
Schema:
|
|
7024
|
+
|
|
7025
|
+
\`\`\`json
|
|
7026
|
+
{
|
|
7027
|
+
"summaries": [
|
|
7028
|
+
{
|
|
7029
|
+
"featureName": "<verbatim from input>",
|
|
7030
|
+
"specName": "<verbatim from input>",
|
|
7031
|
+
"summary": "<1–2 sentence factual description of what this test verifies>",
|
|
7032
|
+
"startScreen": "<opening screen/URL, or omit>",
|
|
7033
|
+
"testCondition": "<assumed state phrased as a condition, or omit>",
|
|
7034
|
+
"preconditions": ["<setup prerequisite>", "..."]
|
|
7035
|
+
}
|
|
7036
|
+
]
|
|
7037
|
+
}
|
|
7038
|
+
\`\`\`
|
|
7039
|
+
|
|
7040
|
+
Return one entry per spec given in the input. Echo featureName and specName verbatim so the CLI can match them. \`startScreen\`, \`testCondition\`, and \`preconditions\` are optional — omit a field (or use an empty array for preconditions) when the spec does not express it.
|
|
7041
|
+
`;
|
|
7042
|
+
}
|
|
7043
|
+
/**
 * Build the user prompt listing the test cases to summarise.
 *
 * Each spec is rendered as a `### feature/spec` section with its title and
 * its spec.yaml body (trailing whitespace trimmed) in a fenced yaml block.
 * When `instruction` is non-blank it is inserted, trimmed, as an
 * "Extra guidance from the user" section ahead of the task description.
 *
 * @param specs Entries providing `featureName`, `specName`, `title` and
 *   `specYaml`.
 * @param instruction Optional free-text hint; may be null/undefined.
 * @returns The assembled prompt text.
 */
function buildPerspectivesPrompt(specs, instruction) {
	return `## Existing test cases to summarise

${specs.map((s) => `### ${s.featureName}/${s.specName}
title: ${s.title}

\`\`\`yaml
${s.specYaml.trimEnd()}
\`\`\`
`).join("\n")}
${instruction?.trim() ? `## Extra guidance from the user\n\n${instruction.trim()}\n\n` : ""}## Task

For each test case above, write a 1–2 sentence factual \`summary\` of what it verifies, derived from its steps. Return one entry per spec in the JSON contract. Do not assign severity, do gap analysis, or invent new cases.
`;
}
|
|
7058
|
+
//#endregion
|
|
7059
|
+
//#region src/cli/perspectives.ts
|
|
7060
|
+
const perspectivesCommand = addLanguageOption(new Command("perspectives").description("Generate/update .ccqa/perspectives.yaml — a factual inventory of existing test coverage (no severity, no gap analysis)").option("--instruction <text>", "Hint to steer how summaries are written").option("--apply", "Auto-apply without [y/N] confirmation", false).option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID")).action(async (opts) => {
|
|
7061
|
+
await runPerspectives(opts);
|
|
7062
|
+
});
|
|
7063
|
+
/**
 * Run the `perspectives` command end to end.
 *
 * Steps, in order: build the skeleton from the feature tree, bail out when
 * it holds no specs, collect notes from the existing perspectives file,
 * ask Claude for summaries, merge and validate the result, show a diff
 * against the existing file, ask for confirmation (unless `--apply`), then
 * write the YAML, the root index Markdown and one Markdown file per feature.
 *
 * Exits the process with code 1 when summaries could not be obtained or the
 * merged object fails `PerspectivesSchema` validation.
 *
 * @param {{language?: string, apply?: boolean, instruction?: string, model?: string}} opts
 * @returns {Promise<void>}
 */
async function runPerspectives(opts) {
	header("perspectives", ".ccqa/perspectives.yaml");
	await ensureCcqaDir();
	const tree = await listFeatureTree();
	const skeleton = await buildSkeleton(tree);
	const specCount = skeleton.reduce((total, feature) => total + feature.specs.length, 0);
	if (specCount === 0) {
		info("no test cases found under .ccqa/features — nothing to inventory.");
		return;
	}
	const previousYaml = await tryReadPerspectives() ?? "";
	const preservedNotes = extractNotes(previousYaml);
	const specBodies = await loadSpecBodies(skeleton);
	meta("language", opts.language ?? "auto");
	info(`Summarising ${specCount} test case(s) across ${skeleton.length} feature(s)...`);
	const summaries = await requestSummaries(specBodies, opts);
	if (summaries === null) process.exit(1);
	const assembled = mergePerspectives(skeleton, summaries, preservedNotes);
	let validated;
	try {
		validated = PerspectivesSchema.parse(assembled);
	} catch (e) {
		error(`refused to write: assembled perspectives failed validation (${e.message})`);
		process.exit(1);
	}
	const nextYaml = stringify(validated, { lineWidth: 0 });
	// Compare without the timestamp line, which differs on every run.
	if (withoutGeneratedAt(previousYaml) === withoutGeneratedAt(nextYaml)) {
		blank();
		info("perspectives already up to date — no changes.");
		return;
	}
	blank();
	info("--- proposed changes (perspectives.yaml) ---");
	printUnifiedDiff(previousYaml, nextYaml);
	blank();
	// Only prompt when --apply was not given.
	let confirmed = opts.apply === true;
	if (!confirmed) {
		const question = useJapanesePrompts(opts.language) ? "perspectives.yaml + .md を書き込みますか? [y/N] " : "Write perspectives.yaml + .md? [y/N] ";
		confirmed = /^y/i.test(await prompt(question));
	}
	if (!confirmed) {
		info("aborted — no changes written.");
		return;
	}
	meta("saved", await savePerspectives(nextYaml));
	const labels = labelsFor(opts.language);
	meta("saved", await savePerspectivesMarkdown(renderIndexMarkdown(validated, labels)));
	for (const feature of validated.features) {
		const featureMarkdown = renderFeatureMarkdown(feature, labels);
		meta("saved", await saveFeaturePerspectivesMarkdown(feature.featureName, featureMarkdown));
	}
}
|
|
7106
|
+
/**
 * Convert the feature tree into skeleton perspectives features.
 *
 * For every tree spec flagged `hasSpecFile`, an entry is produced with the
 * title from `readSpecMeta`, the status from `deriveStatus`, an empty
 * `summary`, and the tree item's `relatedPaths` when present. Features left
 * with no entries are dropped. Specs are sorted by `specName` and features
 * by `featureName`, both via `localeCompare`.
 *
 * @param {Array<{featureName: string, specs: Array<object>}>} tree
 * @returns {Promise<Array<{featureName: string, specs: Array<object>}>>}
 */
async function buildSkeleton(tree) {
	const toEntry = async (featureName, item) => {
		const { title } = await readSpecMeta(featureName, item.specName);
		const status = await deriveStatus(featureName, item.specName);
		const entry = {
			specName: item.specName,
			title,
			summary: "",
			status
		};
		if (item.relatedPaths) entry.relatedPaths = item.relatedPaths;
		return entry;
	};
	const bySpecName = (a, b) => a.specName.localeCompare(b.specName);
	const byFeatureName = (a, b) => a.featureName.localeCompare(b.featureName);
	const features = await Promise.all(tree.map(async (feature) => {
		const candidates = feature.specs.filter((s) => s.hasSpecFile);
		const specs = await Promise.all(candidates.map((s) => toEntry(feature.featureName, s)));
		return {
			featureName: feature.featureName,
			specs
		};
	}));
	return features
		.filter((f) => f.specs.length > 0)
		.map((f) => ({
			featureName: f.featureName,
			// Sort a copy so the array gathered above is left untouched.
			specs: [...f.specs].sort(bySpecName)
		}))
		.sort(byFeatureName);
}
|
|
7135
|
+
/**
 * Collect the human-written notes from an existing perspectives.yaml.
 *
 * Returns a map keyed by `noteKey(featureName, specName)` holding each
 * non-empty `note`. The map is empty when the text is blank, fails YAML
 * parsing, or does not satisfy `PerspectivesSchema`; this function never
 * throws for bad input, so a broken file cannot block regeneration.
 *
 * @param {string} existingRaw Raw text of the current perspectives.yaml.
 * @returns {Map<string, string>}
 */
function extractNotes(existingRaw) {
	const notes = new Map();
	if (existingRaw.trim() === "") return notes;
	let document;
	try {
		document = parse(existingRaw);
	} catch {
		return notes;
	}
	const checked = PerspectivesSchema.safeParse(document);
	if (!checked.success) return notes;
	for (const { featureName, specs } of checked.data.features) {
		for (const { specName, note } of specs) {
			if (note === void 0 || note === "") continue;
			notes.set(noteKey(featureName, specName), note);
		}
	}
	return notes;
}
|
|
7156
|
+
/**
 * Combine the skeleton, Claude's summaries and the preserved notes into the
 * final perspectives object.
 *
 * Summaries are looked up by `noteKey(featureName, specName)`; when several
 * share a key the last one wins. A spec without a matching summary keeps
 * the summary it already has. `startScreen`, `testCondition` and a
 * non-empty `preconditions` are copied only when the summary provides them,
 * and `note` only when the note map has a value for the key.
 *
 * @param {Array<{featureName: string, specs: Array<object>}>} skeleton
 * @param {Array<object>} summaries
 * @param {Map<string, string>} noteMap
 * @returns {{generatedAt: string, features: Array<object>}} `generatedAt` is the current time in ISO form.
 */
function mergePerspectives(skeleton, summaries, noteMap) {
	const summaryByKey = new Map(summaries.map((s) => [noteKey(s.featureName, s.specName), s]));
	const mergeSpec = (featureName, spec) => {
		const key = noteKey(featureName, spec.specName);
		const fromClaude = summaryByKey.get(key);
		const merged = {
			...spec,
			summary: fromClaude?.summary ?? spec.summary
		};
		const { startScreen, testCondition, preconditions } = fromClaude ?? {};
		if (startScreen) merged.startScreen = startScreen;
		if (testCondition) merged.testCondition = testCondition;
		if (preconditions && preconditions.length > 0) merged.preconditions = preconditions;
		const note = noteMap.get(key);
		if (note !== void 0) merged.note = note;
		return merged;
	};
	const features = skeleton.map(({ featureName, specs }) => ({
		featureName,
		specs: specs.map((spec) => mergeSpec(featureName, spec))
	}));
	return {
		generatedAt: new Date().toISOString(),
		features
	};
}
|
|
7186
|
+
/**
 * Remove every line that begins with `generatedAt:` (at column 0) and trim
 * the result, so two serialised perspectives documents can be compared
 * without the timestamp making them always differ.
 *
 * @param {string} yamlText
 * @returns {string}
 */
function withoutGeneratedAt(yamlText) {
	const kept = [];
	for (const line of yamlText.split("\n")) {
		if (!line.startsWith("generatedAt:")) kept.push(line);
	}
	return kept.join("\n").trim();
}
|
|
7194
|
+
/**
 * Compose the `feature/spec` lookup key shared by the note map and the
 * summary map.
 *
 * @param {string} featureName
 * @param {string} specName
 * @returns {string}
 */
function noteKey(featureName, specName) {
	return "".concat(featureName, "/", specName);
}
|
|
7197
|
+
/**
 * Read the display title of a spec.
 *
 * Uses the spec file's `title` when it is a non-empty string. Falls back to
 * the spec name when the file cannot be read (`tryReadSpecFile` yields
 * `null`), when parsing throws, or when no usable title is present.
 *
 * @param {string} featureName
 * @param {string} specName
 * @returns {Promise<{title: string}>}
 */
async function readSpecMeta(featureName, specName) {
	const fallback = { title: specName };
	const raw = await tryReadSpecFile(featureName, specName);
	if (raw === null) return fallback;
	let title;
	try {
		// Also throws (and is caught) when the parsed document is null/undefined.
		title = parse(raw).title;
	} catch {
		return fallback;
	}
	return typeof title === "string" && title.length > 0 ? { title } : fallback;
}
|
|
7206
|
+
/**
 * Derive a spec's status flags from what exists on disk.
 *
 * `traced` is true when `actions.json` in the spec directory can be
 * stat-ed; `generated` is true when `getTestScript` returns something other
 * than `null`.
 *
 * @param {string} featureName
 * @param {string} specName
 * @returns {Promise<{traced: boolean, generated: boolean}>}
 */
async function deriveStatus(featureName, specName) {
	const actionsPath = join(getSpecDir(featureName, specName), "actions.json");
	let traced;
	try {
		await stat(actionsPath);
		traced = true;
	} catch {
		// Any stat failure is treated as "not traced".
		traced = false;
	}
	const script = await getTestScript(featureName, specName);
	return {
		traced,
		generated: script !== null
	};
}
|
|
7212
|
+
/**
 * Load the raw spec YAML for every spec in the skeleton.
 *
 * Produces one record per spec, in skeleton order, with the spec's
 * identity, title and file contents. A spec whose file cannot be read
 * (`tryReadSpecFile` yields `null`/`undefined`) gets an empty `specYaml`.
 *
 * @param {Array<{featureName: string, specs: Array<{specName: string, title: string}>}>} skeleton
 * @returns {Promise<Array<{featureName: string, specName: string, title: string, specYaml: string}>>}
 */
async function loadSpecBodies(skeleton) {
	const readBody = async (featureName, spec) => {
		const raw = await tryReadSpecFile(featureName, spec.specName);
		return {
			featureName,
			specName: spec.specName,
			title: spec.title,
			specYaml: raw ?? ""
		};
	};
	const pending = [];
	for (const feature of skeleton) {
		for (const spec of feature.specs) pending.push(readBody(feature.featureName, spec));
	}
	return await Promise.all(pending);
}
|
|
7223
|
+
/**
 * Ask Claude to summarise the given specs and parse its reply.
 *
 * Streams the request through `invokeClaudeStreaming` with the `Read`,
 * `Grep` and `Glob` tools allowed, counts `tool_use` blocks seen in
 * assistant messages, and writes a tool/duration summary line to stdout
 * once the call finishes.
 *
 * @param {Array<object>} specs Records produced by `loadSpecBodies`.
 * @param {{instruction?: string, language?: string, model?: string}} opts
 * @returns {Promise<Array<object>|null>} Parsed summaries, or `null` (after
 *   logging) when Claude reports an error, returns no JSON block, or the
 *   JSON is rejected by `parseSummaries`.
 */
async function requestSummaries(specs, opts) {
	const toolCounts = {};
	const countToolUses = (msg) => {
		if (msg.type !== "assistant") return;
		for (const block of msg.message.content ?? []) {
			if (block.type !== "tool_use") continue;
			toolCounts[block.name] = (toolCounts[block.name] ?? 0) + 1;
		}
	};
	const startedAt = Date.now();
	const request = {
		prompt: buildPerspectivesPrompt(specs, opts.instruction),
		systemPrompt: buildPerspectivesSystemPrompt() + languageDirective(opts.language),
		allowedTools: [
			"Read",
			"Grep",
			"Glob"
		],
		silenceBashLog: true
	};
	// Only pass a model when the user chose one.
	if (opts.model) request.model = opts.model;
	const { result, isError } = await invokeClaudeStreaming(request, countToolUses);
	process.stdout.write(`${formatToolSummary(toolCounts, Date.now() - startedAt)}\n`);
	if (isError) {
		error("Claude returned an error result");
		return null;
	}
	const json = extractJsonBlock(result);
	if (!json) {
		error("Claude did not return a json block");
		return null;
	}
	return parseSummaries(json);
}
|
|
7252
|
+
/**
 * Parse the `{ summaries: [...] }` JSON contract into plain entries.
 *
 * Logs via `error` and returns `null` when the text is not valid JSON, the
 * payload is not an object, or `summaries` is not an array. Items lacking a
 * string `featureName`, `specName` or `summary` are dropped silently.
 * `startScreen` / `testCondition` are kept only as non-empty strings, and
 * `preconditions` only as a non-empty list of non-empty strings.
 *
 * @param {string} json
 * @returns {Array<object>|null}
 */
function parseSummaries(json) {
	let payload;
	try {
		payload = JSON.parse(json);
	} catch (e) {
		error(`failed to parse summaries JSON: ${e.message}`);
		return null;
	}
	if (payload === null || typeof payload !== "object") {
		error("summaries payload is not an object");
		return null;
	}
	const { summaries } = payload;
	if (!Array.isArray(summaries)) {
		error("summaries payload missing a `summaries` array");
		return null;
	}
	const isText = (v) => typeof v === "string" && v.length > 0;
	return summaries.flatMap((item) => {
		const rec = item ?? {};
		const { featureName, specName, summary } = rec;
		const hasIdentity = typeof featureName === "string" && typeof specName === "string" && typeof summary === "string";
		if (!hasIdentity) return [];
		const entry = {
			featureName,
			specName,
			summary
		};
		if (isText(rec.startScreen)) entry.startScreen = rec.startScreen;
		if (isText(rec.testCondition)) entry.testCondition = rec.testCondition;
		if (Array.isArray(rec.preconditions)) {
			const pre = rec.preconditions.filter((p) => isText(p));
			if (pre.length > 0) entry.preconditions = pre;
		}
		return [entry];
	});
}
|
|
7294
|
+
/**
 * Japanese label set for the generated perspectives Markdown.
 *
 * Frozen because the same object is shared as the default `labels`
 * argument of the renderers; an accidental write would otherwise leak into
 * every later render.
 */
const LABELS_JA = Object.freeze({
	indexTitle: "テスト観点インデックス (perspectives)",
	caseCol: "ケース",
	itemCol: "項目",
	valueCol: "内容",
	summary: "検証内容",
	preconditions: "前提条件",
	startScreen: "開始画面",
	relatedCode: "関連コード"
});
/**
 * English label set for the generated perspectives Markdown. Same keys as
 * `LABELS_JA`; frozen for the same reason.
 */
const LABELS_EN = Object.freeze({
	indexTitle: "Test Perspectives (perspectives)",
	caseCol: "Case",
	itemCol: "Item",
	valueCol: "Value",
	summary: "Verifies",
	preconditions: "Preconditions",
	startScreen: "Start screen",
	relatedCode: "Related code"
});
|
|
7314
|
+
/**
 * Choose the Markdown label set for a `--language` value.
 *
 * English labels are returned only when the trimmed value starts with `en`
 * followed by a word boundary (`en`, `en-US`, ...), case-insensitively.
 * Every other value, including a missing one, yields the Japanese labels.
 *
 * @param {string} [language]
 * @returns {object} `LABELS_EN` or `LABELS_JA`.
 */
function labelsFor(language) {
	const tag = language?.trim() ?? "";
	if (/^en\b/i.test(tag)) return LABELS_EN;
	return LABELS_JA;
}
|
|
7323
|
+
/**
 * Link target for a spec.yaml as seen from the root `.ccqa/perspectives.md`
 * (i.e. relative to the `.ccqa/` directory). Used by the root index.
 *
 * @param {string} featureName
 * @param {string} specName
 * @returns {string} `features/<feature>/test-cases/<spec>/spec.yaml`
 */
function specRelPathFromRoot(featureName, specName) {
	return "features/".concat(featureName, "/test-cases/", specName, "/spec.yaml");
}
|
|
7330
|
+
/**
 * Link target for a feature's detail file as seen from the root
 * `.ccqa/perspectives.md`. The `features/` segment is part of the returned
 * path, so the link points under `.ccqa/features/<feature>/`.
 *
 * @param {string} featureName
 * @returns {string} `features/<feature>/perspectives.md`
 */
function featureDetailRelPathFromRoot(featureName) {
	return "features/".concat(featureName, "/perspectives.md");
}
|
|
7339
|
+
/**
 * Link target for a spec.yaml as seen from its own feature's detail file
 * (`.ccqa/features/<feature>/perspectives.md`), i.e. a path that starts at
 * the sibling `test-cases/` directory.
 *
 * @param {string} specName
 * @returns {string} `test-cases/<spec>/spec.yaml`
 */
function specRelPathFromCategory(specName) {
	return "test-cases/".concat(specName, "/spec.yaml");
}
|
|
7348
|
+
/**
 * Render the root `.ccqa/perspectives.md` index.
 *
 * Output is an H1 with the index title, then for each feature an H2 that
 * links to the feature's detail file, followed by a two-column table with
 * one row per case: the (cell-escaped) title and a link to its spec.yaml.
 * No status, summary, preconditions or note are written here.
 *
 * @param {{features: Array<{featureName: string, specs: Array<{specName: string, title: string}>}>}} perspectives
 * @param {object} [labels] Label set; defaults to the Japanese labels.
 * @returns {string} Markdown text.
 */
function renderIndexMarkdown(perspectives, labels = LABELS_JA) {
	const sections = perspectives.features.flatMap((feature) => {
		const { featureName, specs } = feature;
		const rows = specs.map((spec) => {
			const specLink = specRelPathFromRoot(featureName, spec.specName);
			return `| ${mdCell(spec.title)} | [spec](${specLink}) |`;
		});
		return [
			`## [${featureName}](${featureDetailRelPathFromRoot(featureName)})`,
			"",
			`| ${labels.caseCol} | spec |`,
			"| --- | --- |",
			...rows,
			""
		];
	});
	return [`# ${labels.indexTitle}`, "", ...sections].join("\n");
}
|
|
7376
|
+
/**
 * Render one feature's detail file
 * (`.ccqa/features/<feature>/perspectives.md`).
 *
 * Output is an H1 with the feature name followed by the lines that
 * `renderSpecMarkdown` produces for each spec, in order.
 *
 * @param {{featureName: string, specs: Array<object>}} feature
 * @param {object} [labels] Label set; defaults to the Japanese labels.
 * @returns {string} Markdown text.
 */
function renderFeatureMarkdown(feature, labels = LABELS_JA) {
	const caseTables = feature.specs.flatMap((spec) => renderSpecMarkdown(spec, labels));
	return [`# ${feature.featureName}`, "", ...caseTables].join("\n");
}
|
|
7393
|
+
/**
 * Render one spec as a vertical (item | value) Markdown table for a feature
 * detail file.
 *
 * Rows are emitted in this order, each only when the spec carries the
 * value: summary, preconditions (joined with `<br>`), start screen, the
 * spec link (always), related code paths (joined with `<br>`), and the
 * note. Related-code paths are written as inline code, not links.
 *
 * Every dynamic cell value goes through `mdCell`, including each related
 * path: a glob such as `src/@(a|b)/*.ts` contains a pipe, and an unescaped
 * pipe splits the table row even inside a code span.
 *
 * @param {object} spec Perspectives spec entry (`specName`, `title`, and the
 *   optional `summary`, `preconditions`, `startScreen`, `relatedPaths`, `note`).
 * @param {object} [labels] Label set; defaults to the Japanese labels.
 * @returns {string[]} Markdown lines, ending with an empty line.
 */
function renderSpecMarkdown(spec, labels = LABELS_JA) {
	const lines = [];
	lines.push(`## ${spec.title}`);
	lines.push("");
	lines.push(`| ${labels.itemCol} | ${labels.valueCol} |`);
	lines.push("| --- | --- |");
	if (spec.summary) lines.push(`| ${labels.summary} | ${mdCell(spec.summary)} |`);
	if (spec.preconditions && spec.preconditions.length > 0) lines.push(`| ${labels.preconditions} | ${spec.preconditions.map((p) => mdCell(p)).join("<br>")} |`);
	if (spec.startScreen) lines.push(`| ${labels.startScreen} | ${mdCell(spec.startScreen)} |`);
	const specPath = specRelPathFromCategory(spec.specName);
	lines.push(`| spec | [${specPath}](${specPath}) |`);
	// Escape pipes/newlines in each path so a glob cannot break the row.
	if (spec.relatedPaths && spec.relatedPaths.length > 0) lines.push(`| ${labels.relatedCode} | ${spec.relatedPaths.map((p) => `\`${mdCell(p)}\``).join("<br>")} |`);
	if (spec.note) lines.push(`| 📝 note | ${mdCell(spec.note)} |`);
	lines.push("");
	return lines;
}
|
|
7420
|
+
/**
 * Make a value safe to place inside a single Markdown table cell.
 *
 * Pipes are backslash-escaped so they are not read as column separators,
 * and every line break is collapsed to one space. `\r\n` and a lone `\r`
 * are handled as well as `\n`, because a carriage return left in the text
 * still ends the table row.
 *
 * @param {string} value
 * @returns {string}
 */
function mdCell(value) {
	return value.replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, " ");
}
|
|
7424
|
+
//#endregion
|
|
5653
7425
|
//#region src/cli/index.ts
|
|
5654
7426
|
const packageJsonPath = resolvePackageJson();
|
|
5655
7427
|
const { version } = JSON.parse(readFileSync(packageJsonPath, "utf8"));
|
|
@@ -5667,6 +7439,7 @@ const program = new Command();
|
|
|
5667
7439
|
program.name("ccqa").description("E2E test CLI using Claude Code + agent-browser").version(version);
|
|
5668
7440
|
program.addCommand(draftCommand);
|
|
5669
7441
|
program.addCommand(driftCommand);
|
|
7442
|
+
program.addCommand(perspectivesCommand);
|
|
5670
7443
|
program.addCommand(traceCommand);
|
|
5671
7444
|
program.addCommand(generateCommand);
|
|
5672
7445
|
program.addCommand(runCommand);
|