ccqa 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/bin/ccqa.mjs CHANGED
@@ -9,11 +9,11 @@ import { query } from "@anthropic-ai/claude-agent-sdk";
9
9
  import { ZodError, z } from "zod";
10
10
  import { delimiter, dirname, join, relative, resolve } from "node:path";
11
11
  import { parse, stringify } from "yaml";
12
- import { execFile, spawn } from "node:child_process";
12
+ import { execFile, spawn, spawnSync } from "node:child_process";
13
13
  import { createInterface } from "node:readline";
14
14
  import { homedir, tmpdir } from "node:os";
15
- import { createInterface as createInterface$1 } from "node:readline/promises";
16
15
  import { promisify } from "node:util";
16
+ import { createInterface as createInterface$1 } from "node:readline/promises";
17
17
  //#region src/prompts/trace.ts
18
18
  function generateSessionName() {
19
19
  return `ccqa-trace-${(/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-")}`;
@@ -3378,10 +3378,36 @@ function previewDiff(before, after) {
3378
3378
  return out.join("\n");
3379
3379
  }
3380
3380
  //#endregion
3381
+ //#region src/prompts/format.ts
3382
+ /**
3383
+ * Formatting helpers shared by the Claude prompt builders (diagnose, report).
3384
+ * Centralised so the prompts cannot drift apart on mechanics that must stay
3385
+ * consistent across commands.
3386
+ */
3387
/**
 * Prefix every line with its 1-based number, the form fix suggestions cite.
 *
 * Both LF and CRLF line endings are accepted so a script checked out with
 * Windows line endings does not leak a stray "\r" into every numbered line.
 * Output lines are always joined with "\n".
 *
 * @param {string} script - Full text of the test script.
 * @returns {string} The script with `N: ` prepended to each line.
 */
function numberLines(script) {
  return script
    .split(/\r?\n/)
    .map((line, index) => `${index + 1}: ${line}`)
    .join("\n");
}
3391
+ /**
3392
+ * The "## Output language" prompt section. Empty for "auto" so the prompt
3393
+ * stays byte-identical to the no-flag baseline. `fields` names the
3394
+ * human-readable JSON fields to translate; `verbatimNames` names the
3395
+ * enum-like values that must never be translated.
+ *
+ * @param outputLanguage - "auto", or the language tag interpolated into the section.
+ * @param fields - Pre-formatted field list, inserted inside the parentheses as-is.
+ * @param verbatimNames - Pre-formatted list inserted into the "stay verbatim" sentence as-is.
+ * @returns "" for "auto"; otherwise the section text, which ends with a blank line so
+ * the caller can append the next heading directly after it.
3396
+ */
3397
+ function outputLanguageBlock(outputLanguage, fields, verbatimNames) {
3398
+ if (outputLanguage === "auto") return "";
3399
+ return `## Output language
3400
+
3401
+ Write all human-readable fields (${fields}) in **${outputLanguage}** (BCP-47 tag).
3402
+ Selectors, file paths, identifiers, ${verbatimNames}, JSON keys, and quoted strings stay verbatim regardless of language.
3403
+
3404
+ `;
3405
+ }
3406
+ //#endregion
3381
3407
  //#region src/diagnose/prompt.ts
3382
3408
  function buildDiagnosePrompt(input) {
3383
3409
  const { script, specYaml, actions, failureLog, pageSnapshot, outputLanguage = "auto" } = input;
3384
- const numbered = script.split("\n").map((l, i) => `${i + 1}: ${l}`).join("\n");
3410
+ const numbered = numberLines(script);
3385
3411
  const actionsSummary = actions.map((a, i) => {
3386
3412
  const parts = [`${i + 1}. ${a.command}`];
3387
3413
  if (a.assertType) parts.push(`assertType="${a.assertType}"`);
@@ -3392,12 +3418,7 @@ function buildDiagnosePrompt(input) {
3392
3418
  }).join("\n");
3393
3419
  return `You are diagnosing a failing E2E test. The test was generated from a recorded trace of the original interaction. Compare the failing run against the original spec and recorded actions to determine WHY the test failed and what the right fix is.
3394
3420
 
3395
- ${outputLanguage === "auto" ? "" : `## Output language
3396
-
3397
- Write all human-readable fields (\`reasoning\`, \`reason\`) in **${outputLanguage}** (BCP-47 tag).
3398
- Selectors, file paths, identifiers, code, type names (TIMING_ISSUE, etc.), JSON keys, and quoted strings stay verbatim regardless of language.
3399
-
3400
- `}## You have read-only filesystem tools
3421
+ ${outputLanguageBlock(outputLanguage, "`reasoning`, `reason`", "code, type names (TIMING_ISSUE, etc.)")}## You have read-only filesystem tools
3401
3422
 
3402
3423
  You can call \`Grep\`, \`Glob\`, and \`Read\` against the current repository before producing the JSON.
3403
3424
 
@@ -4630,165 +4651,1187 @@ async function checkSpec(target, opts) {
4630
4651
  };
4631
4652
  }
4632
4653
  //#endregion
4633
- //#region src/drift/format.ts
4654
+ //#region src/drift/affected.ts
4655
+ const execFileP = promisify(execFile);
4634
4656
  /**
4635
- * Render drift results as a string. The CLI commands and the `run` failure
4636
- * hook are the only callers; both want the formatted output returned so
4637
- * they can prefix / interleave / pipe it as needed.
4657
+ * Resolve the base ref to diff against for `ccqa drift --changed`.
4658
+ * Precedence: explicit override > GITHUB_BASE_REF > origin/main.
4638
4659
  */
4639
- function renderDrift(results, format, cwd) {
4640
- if (format === "json") return renderJson(results);
4641
- if (format === "github") return renderGithub(results, cwd);
4642
- return renderText(results);
4660
+ function resolveBaseRef(explicit) { // Returns the ref name to diff against; never returns an empty string.
4661
+ if (explicit && explicit.length > 0) return explicit; // A non-empty explicit override wins and is returned untouched.
4662
+ const ghBase = process.env["GITHUB_BASE_REF"];
4663
+ if (ghBase && ghBase.length > 0) return ghBase.startsWith("origin/") ? ghBase : `origin/${ghBase}`; // Prefix with "origin/" unless the value already has it.
4664
+ return "origin/main"; // Fallback when neither an override nor GITHUB_BASE_REF is set.
4643
4665
  }
4644
- const HEAVY_RULE = "═".repeat(72);
4645
- function renderText(results) {
4666
+ /**
4667
+ * Run `git diff --name-status base...HEAD` from `cwd` and return one entry per
4668
+ * changed file. Renames are reported under their NEW path with status
4669
+ * "renamed" — the OLD path is dropped because the spec mapping is against the
4670
+ * post-rename layout.
4671
+ *
4672
+ * Paths are re-rooted to be relative to `cwd`, not the git repo root. In a
4673
+ * monorepo where `cwd` is a sub-package (e.g. `apps/foo`), git emits paths
4674
+ * relative to the repo root, but specs declare relatedPaths relative to
4675
+ * their own package. Changes outside `cwd` are dropped so an unrelated PR
4676
+ * can never accidentally scope a sub-package's specs in.
+ *
+ * The repo-root lookup (`git rev-parse --show-toplevel`) and the diff are
+ * started together and awaited with Promise.all, so the returned promise
+ * rejects if either git command fails; nothing is caught here.
+ *
+ * @param base - Ref to compare against; diffed as `${base}...HEAD` with `-M`.
+ * @param cwd - Directory git runs in and that the returned paths are relative to.
+ * @returns Promise of `{ path, status }` entries.
4677
+ */
4678
+ async function getChangedFiles(base, cwd) {
4679
+ const [{ stdout: rootOut }, { stdout: diffOut }] = await Promise.all([execFileP("git", ["rev-parse", "--show-toplevel"], { cwd }), execFileP("git", [
4680
+ "diff",
4681
+ "--name-status",
4682
+ "-M",
4683
+ `${base}...HEAD`
4684
+ ], {
4685
+ cwd,
4686
+ maxBuffer: 32 * 1024 * 1024
4687
+ })]);
4688
+ return rerootChangedFiles(parseGitDiffOutput(diffOut), rootOut.trim(), cwd);
4689
+ }
4690
/**
 * Convert paths in `entries` from git-repo-root relative to `cwd` relative,
 * dropping anything outside `cwd`. Exported for unit tests.
 *
 * When `cwd` is the repo root the input array is returned as-is (same
 * reference). Otherwise a new array of shallow-copied entries is returned.
 *
 * @param {Array<{path: string, status: string}>} entries - Changed files with
 *   repo-root-relative paths.
 * @param {string} repoRoot - Git top-level directory.
 * @param {string} cwd - Directory the returned paths should be relative to.
 * @returns {Array<{path: string, status: string}>} Entries located inside `cwd`.
 */
function rerootChangedFiles(entries, repoRoot, cwd) {
  const prefix = relative(repoRoot, cwd);
  if (!prefix) return entries;
  const out = [];
  for (const entry of entries) {
    const rel = relative(prefix, entry.path);
    // An empty result means the entry IS the cwd directory, not a file in it.
    if (rel === "") continue;
    // Only a leading ".." path SEGMENT means "outside cwd". A plain
    // startsWith("..") would also drop in-scope files whose name merely
    // begins with two dots (e.g. "..eslintrc.js").
    if (rel === ".." || rel.startsWith("../") || rel.startsWith("..\\")) continue;
    out.push({
      ...entry,
      path: rel
    });
  }
  return out;
}
4708
/**
 * Parse the output of `git diff --name-status` into `{ path, status }` entries.
 *
 * Each non-blank line is `<code>TAB<path>`, or for renames and copies
 * `<code><score>TAB<old path>TAB<new path>`. Fields are separated by a TAB,
 * not a space, so paths that contain spaces survive intact.
 *
 * Status mapping:
 * - `R*` -> "renamed" (new path), `C*` -> "added" (new path)
 * - `A` -> "added", `D` -> "deleted"
 * - `M`, `T` and any unrecognised code -> "modified"
 *
 * Lines missing the path they need are skipped rather than reported.
 *
 * @param {string} stdout - Raw stdout of `git diff --name-status`.
 * @returns {Array<{path: string, status: string}>} One entry per usable line.
 */
function parseGitDiffOutput(stdout) {
  const out = [];
  for (const line of stdout.split("\n")) {
    if (!line.trim()) continue;
    const [code, firstPath, secondPath] = line.split("\t");
    if (!code) continue;
    const isRename = code.startsWith("R");
    if (isRename || code.startsWith("C")) {
      // Renames/copies carry two paths; only the post-change one matters.
      if (secondPath) out.push({
        path: secondPath,
        status: isRename ? "renamed" : "added"
      });
      continue;
    }
    if (!firstPath) continue;
    let status;
    switch (code[0]) {
      case "A":
        status = "added";
        break;
      case "D":
        status = "deleted";
        break;
      default:
        // "M", "T" and anything git may add later are treated as a modification.
        status = "modified";
    }
    out.push({
      path: firstPath,
      status
    });
  }
  return out;
}
4681
- function appendFinding(out, level, issue) {
4682
- const stepPart = issue.stepId ? ` ${issue.stepId}` : "";
4683
- out.push("");
4684
- out.push(` ${level} ${DRAFT_CATEGORY_LABEL[issue.category]}${stepPart}`);
4685
- out.push(` ${issue.message}`);
4686
- if (issue.detail) out.push(` └ ${issue.detail.replace(/\n/g, "\n ")}`);
4761
+ function stripLeadingDotSlash(s) { // Removes one leading "./" from `s`; any other input is returned unchanged.
4762
+ return s.startsWith("./") ? s.slice(2) : s;
4687
4763
  }
4688
- function renderJson(results) {
4689
- const payload = { specs: results.map((r) => ({
4690
- feature: r.target.featureName,
4691
- spec: r.target.specName,
4692
- ok: r.ok,
4693
- ...r.error ? { error: r.error } : {},
4694
- issues: r.issues.map((i) => ({
4695
- severity: i.severity,
4696
- category: i.category,
4697
- stepId: i.stepId,
4698
- message: i.message,
4699
- ...i.detail ? { detail: i.detail } : {}
4700
- }))
4701
- })) };
4702
- return `${JSON.stringify(payload, null, 2)}\n`;
4764
+ const REGEX_CACHE = /* @__PURE__ */ new Map(); // raw pattern string -> compiled RegExp
4765
+ /** Compiles `pattern` to a RegExp, memoized so repeated `--changed` matches don't re-build. The cache key is the pattern exactly as passed in; the leading "./" is stripped only for compilation. This function never removes cache entries. */
4766
+ function compileGlob(pattern) {
4767
+ const cached = REGEX_CACHE.get(pattern);
4768
+ if (cached) return cached;
4769
+ const compiled = globToRegExp(stripLeadingDotSlash(pattern));
4770
+ REGEX_CACHE.set(pattern, compiled);
4771
+ return compiled;
4703
4772
  }
4704
- function renderGithub(results, cwd) {
4705
- const repoRoot = process.env["GITHUB_WORKSPACE"] ?? process.cwd();
4706
- const lines = [];
4707
- for (const r of results) {
4708
- const file = githubRelPath(cwd, repoRoot, r.target.featureName, r.target.specName);
4709
- if (r.error) {
4710
- lines.push(`::error file=${file}::${escapeGhMessage(r.error)}`);
4773
/**
 * Translate a glob pattern into an anchored RegExp over "/"-separated paths.
 *
 * Supported syntax:
 * - "?" matches exactly one character other than "/".
 * - "*" matches any run of characters other than "/".
 * - "**" as a whole leading or inner segment (followed by "/") matches zero
 *   or more complete directories, so "src/" + "**" + "/x.js" matches both
 *   "src/x.js" and "src/a/b/x.js".
 * - "**" as the final segment (e.g. "src/" + "**") matches the directory
 *   itself and everything beneath it.
 * - "**" anywhere else (glued to other characters) matches any characters,
 *   slashes included.
 * - Every other character matches itself; regex metacharacters are escaped.
 *
 * The path separator next to a globstar is mandatory: "src/" + "**" does not
 * match "srcfoo/a.ts".
 *
 * @param {string} pattern - Glob pattern without a leading "./".
 * @returns {RegExp} Regular expression anchored at both ends.
 */
function globToRegExp(pattern) {
  let re = "^";
  let i = 0;
  while (i < pattern.length) {
    const ch = pattern[i];
    if (ch === "?") {
      re += "[^/]";
      i++;
      continue;
    }
    if (ch !== "*") {
      re += /[.+^${}()|[\]\\]/.test(ch) ? `\\${ch}` : ch;
      i++;
      continue;
    }
    if (pattern[i + 1] !== "*") {
      re += "[^/]*";
      i++;
      continue;
    }
    // Globstar. It only acts on whole directories when it occupies a complete
    // path segment, i.e. it starts the pattern or directly follows a "/".
    const startsSegment = i === 0 || pattern[i - 1] === "/";
    const followedBySlash = pattern[i + 2] === "/";
    const segmentLength = followedBySlash ? 3 : 2;
    const isLastSegment = i + segmentLength >= pattern.length;
    if (startsSegment && isLastSegment) {
      // Trailing globstar: swap the "/" already emitted for an optional
      // "/<anything>" so the directory itself also matches.
      re = i > 0 ? `${re.slice(0, -1)}(?:/.*)?` : `${re}.*`;
      i += segmentLength;
    } else if (startsSegment && followedBySlash) {
      // Zero or more whole directories, each terminated by "/".
      re += "(?:.*/)?";
      i += 3;
    } else {
      re += ".*";
      i += 2;
    }
  }
  return new RegExp(`${re}$`);
}
4734
- function summarize(results) {
4735
- let error = 0;
4736
- let warn = 0;
4737
- let ok = 0;
4738
- let errored = 0;
4739
- for (const r of results) {
4740
- if (r.error) errored++;
4741
- for (const issue of r.issues) if (issue.severity === "ERROR") error++;
4742
- else if (issue.severity === "WARN") warn++;
4743
- else ok++;
4744
- }
4745
- return {
4746
- error,
4747
- warn,
4748
- ok,
4749
- errored
4750
- };
4802
+ /**
4803
+ * Returns true if `changedPath` is covered by any of `relatedPaths`. An empty
4804
+ * `relatedPaths` returns false — callers handle the "unscoped spec" case
4805
+ * separately (treat the spec as always-affected) before calling this.
+ *
+ * One leading "./" is stripped from the changed path here and from each
+ * pattern inside compileGlob, so "./src/a.ts" and "src/a.ts" are equivalent.
+ * Patterns are tried in order and the first match short-circuits.
4806
+ */
4807
+ function isPathAffectedBy(changedPath, relatedPaths) {
4808
+ const stripped = stripLeadingDotSlash(changedPath);
4809
+ for (const pattern of relatedPaths) if (compileGlob(pattern).test(stripped)) return true;
4810
+ return false;
4751
4811
  }
4752
4812
  //#endregion
4753
- //#region src/drift/exit-code.ts
4754
- /**
4755
- * Map drift results to an exit code. Spec-level errors (Claude call failed)
4756
- * always fail; otherwise ERROR severity always fails, WARN fails only when
4757
- * the threshold is `warn`.
4758
- */
4759
- function determineExitCode(results, threshold) {
4760
- for (const r of results) {
4761
- if (r.error) return 1;
4762
- for (const issue of r.issues) {
4763
- if (issue.severity === "ERROR") return 1;
4764
- if (threshold === "warn" && issue.severity === "WARN") return 1;
4765
- }
4766
- }
4767
- return 0;
4768
- }
4769
- //#endregion
4770
- //#region src/drift/auth.ts
4813
+ //#region src/drift/auth.ts
4771
4814
  /**
4772
4815
  * Probe whether the host has any credential the Anthropic SDK can pick up:
4773
4816
  * 1. ANTHROPIC_API_KEY env var (CI / scripted use)
4774
- * 2. ~/.claude/.credentials.json (local Claude Code login)
4817
+ * 2. ~/.claude/.credentials.json (Claude Code login, file-based platforms)
4818
+ * 3. macOS Keychain item "Claude Code-credentials" (Claude Code login on
4819
+ * darwin stores the OAuth credentials in the Keychain, not on disk)
4775
4820
  *
4776
- * `run --drift` is opt-in, so the caller will only consult this after the
4777
- * user has asked for drift. We never throw — auth absence is a normal flow
4778
- * that surfaces as "drift analysis skipped".
4821
+ * Claude-driven hooks are opt-in, so the caller only consults this after the
4822
+ * user has asked for analysis. We never throw — auth absence is a normal flow
4823
+ * that surfaces as "analysis skipped".
4779
4824
  */
4780
4825
  function driftAuthAvailable() { // Returns { ok: true } on the first probe that succeeds, else { ok: false, reason }.
4781
4826
  const key = process.env["ANTHROPIC_API_KEY"];
4782
4827
  if (typeof key === "string" && key.length > 0) return { ok: true }; // Probe 1: non-empty API key in the environment.
4783
4828
  if (existsSync(join(homedir(), ".claude", ".credentials.json"))) return { ok: true }; // Probe 2: the file exists; its contents are not read.
4829
+ if (process.platform === "darwin" && keychainHasClaudeCredentials()) return { ok: true }; // Probe 3: Keychain lookup, attempted on macOS only.
4784
4830
  return {
4785
4831
  ok: false,
4786
4832
  reason: "no ANTHROPIC_API_KEY / claude login"
4787
4833
  };
4788
4834
  }
4835
+ /**
4836
+ * `security find-generic-password` without `-w` only checks the item's
4837
+ * existence (exit 0) — it never reads the secret, so no Keychain unlock
4838
+ * prompt is triggered. Resolved via PATH so tests can stub the binary.
+ *
+ * Runs synchronously with all stdio ignored and a 3000 ms timeout. Returns
+ * true only when the exit status is exactly 0; any other status, or an
+ * exception from spawnSync, yields false.
4839
+ */
4840
+ function keychainHasClaudeCredentials() {
4841
+ try {
4842
+ return spawnSync("security", [
4843
+ "find-generic-password",
4844
+ "-s",
4845
+ "Claude Code-credentials"
4846
+ ], {
4847
+ stdio: "ignore",
4848
+ timeout: 3e3
4849
+ }).status === 0;
4850
+ } catch {
4851
+ return false;
4852
+ }
4853
+ }
4854
+ //#endregion
4855
+ //#region src/report/prompt.ts
4856
+ function buildFailureAnalysisPrompt(input) {
4857
+ const { script, specYaml, failureLog, diffPatch, changedFiles, baseRef, driftIssues, outputLanguage = "auto" } = input;
4858
+ const numbered = numberLines(script);
4859
+ return `You are analyzing a failing E2E regression test right after a source change landed. Your job is a root-cause CALL, not a fix: decide which of three categories explains the failure, using the source diff as your primary context.
4860
+
4861
+ ${outputLanguageBlock(outputLanguage, "`reasoning`, `detail`", "label names (TEST_DRIFT, etc.)")}## The three categories
4862
+
4863
+ The question that separates them: **is the behavior the spec describes still what the product intends?**
4864
+
4865
+ 1. TEST_DRIFT — what the spec verifies is unchanged; only the test code drifted from the source. Typical: a selector/aria-label/placeholder rename, a timing change, an over-tight assertion. The diff shows a change that is invisible to the user's intent but visible to the test.
4866
+ 2. SPEC_CHANGE — the thing being verified itself changed: the UI flow, the layout, the feature's intended behavior. The diff deliberately changes what the spec asserts. You MUST cite the diff hunk (file + what changed) as evidence for this label.
4867
+ 3. PRODUCT_BUG — neither of the above: the failure is not explained by the diff nor by test staleness. The product regressed.
4868
+
4869
+ If the evidence is too weak to choose, answer UNKNOWN — a wrong confident call is worse than an honest UNKNOWN, because humans grade these predictions to measure accuracy.
4870
+
4871
+ ## You have read-only filesystem tools
4872
+
4873
+ You can call \`Grep\`, \`Glob\`, and \`Read\` against the current repository (post-change state) before producing the JSON. Use them to:
4874
+ - confirm a suspected selector rename (grep for \`aria-label=\`, \`placeholder=\`, \`data-testid\`, i18n strings),
4875
+ - read the changed files in full when the truncated patch is not enough,
4876
+ - check whether the element/flow the spec describes still exists in the source.
4877
+
4878
+ You have **up to 12 tool turns**. Do NOT write, edit, run shell commands, or hit the network.
4879
+
4880
+ ## Decision guidance
4881
+
4882
+ - Diff touches only attributes/identifiers the test selects on (labels, testids, class names, timing) while the user-visible flow is intact → TEST_DRIFT.
4883
+ - Diff intentionally removes/reworks the UI or flow that a spec step verifies (component deleted, page restructured, copy redefined, feature flag flipped) → SPEC_CHANGE.
4884
+ - Diff UNINTENTIONALLY breaks behavior the spec still intends — e.g. a refactor that drops a side effect, an inverted condition, a regression hiding inside a cleanup commit — → PRODUCT_BUG, citing the diff hunk as evidence. A product bug is often introduced BY the diff; what separates it from SPEC_CHANGE is intent: does the change read as a deliberate redesign of what the spec verifies, or as collateral damage?
4885
+ - Diff is unrelated to the failing step (or there is no relevant diff) and the test was passing before → lean PRODUCT_BUG; first rule out timing/data flakiness and infrastructure errors (daemon not running, network down, missing credentials) — those read as UNKNOWN with low confidence, not PRODUCT_BUG.
4886
+ - The drift audit findings (when present) flag spec↔code mismatches; an ERROR there usually supports TEST_DRIFT or SPEC_CHANGE over PRODUCT_BUG.
4887
+
4888
+ ## Sub-diagnosis vocabulary
4889
+
4890
+ Alongside the label, report the closest fine-grained mechanic:
4891
+ - SELECTOR_DRIFT, TIMING_ISSUE, OVER_ASSERTION — usually under TEST_DRIFT
4892
+ - DATA_MISSING — missing test data/state; usually UNKNOWN or PRODUCT_BUG depending on cause
4893
+ - NONE — when nothing fits (typical for SPEC_CHANGE and PRODUCT_BUG)
4894
+
4895
+ ## Output
4896
+
4897
+ Your **final** assistant message must start with \`{\` and end with \`}\` — a single JSON object, nothing before or after. No prose preamble, no markdown fences, no tool calls in the same turn.
4898
+
4899
+ {
4900
+ "label": "TEST_DRIFT" | "SPEC_CHANGE" | "PRODUCT_BUG" | "UNKNOWN",
4901
+ "confidence": <0.0-1.0>,
4902
+ "subDiagnosis": "SELECTOR_DRIFT" | "TIMING_ISSUE" | "OVER_ASSERTION" | "DATA_MISSING" | "NONE",
4903
+ "evidence": [
4904
+ { "file": "<file:line or diff hunk reference, omit if log-only>", "detail": "<what this shows>" }
4905
+ ],
4906
+ "reasoning": "<why this label, citing the evidence>"
4907
+ }
4908
+
4909
+ ## Confidence guidance
4910
+
4911
+ - 0.9-1.0: the diff (or a file you read) directly shows the cause
4912
+ - 0.7-0.9: strong indirect evidence
4913
+ - 0.4-0.7: plausible but another category could explain it
4914
+ - < 0.4: answer UNKNOWN instead of guessing
4915
+
4916
+ Evidence rules: TEST_DRIFT and SPEC_CHANGE require at least one concrete \`file\` reference (diff hunk or file:line you actually read). PRODUCT_BUG should explain why the diff does NOT account for the failure.
4917
+
4918
+ ## Test Spec (spec.yaml)
4919
+ ${specYaml}
4920
+
4921
+ ## Test Script (with line numbers)
4922
+ ${numbered}
4923
+
4924
+ ${diffPatch ? `## Source changes since ${baseRef ?? "base"} (git diff, may be truncated)
4925
+
4926
+ ### Changed files (name-status)
4927
+ ${changedFiles ?? "(unavailable)"}
4928
+
4929
+ ### Patch
4930
+ \`\`\`diff
4931
+ ${diffPatch}
4932
+ \`\`\`
4933
+ ` : `## Source changes
4934
+
4935
+ No diff context is available (the base ref could not be resolved, or there are no changes). Classify from the failure log, the spec, and what you can read in the repository — and be correspondingly more conservative: prefer UNKNOWN over a confident SPEC_CHANGE/PRODUCT_BUG call without diff evidence.
4936
+ `}
4937
+ ${driftIssues && driftIssues.length > 0 ? `## Spec↔code drift audit findings
4938
+
4939
+ A separate read-only audit compared the spec against the current source. Treat these as hints, not verdicts:
4940
+
4941
+ ${driftIssues.map((i) => `- [${i.severity}] (${DRAFT_CATEGORY_LABEL[i.category]}${i.stepId ? `, step ${i.stepId}` : ""}) ${i.message}${i.detail ? ` — ${i.detail}` : ""}`).join("\n")}
4942
+ ` : ""}## Failure Log
4943
+ ${failureLog.slice(0, 8e3)}`;
4944
+ }
4945
+ //#endregion
4946
+ //#region src/diagnose/types.ts
4947
+ /**
4948
+ * The concrete (fixable) diagnosis tags as a value, for consumers that need
4949
+ * to enumerate them (e.g. the run report's subDiagnosis vocabulary). The
4950
+ * `satisfies` clause makes renaming a union member without updating this
4951
+ * list a compile error.
+ *
+ * NOTE(review): no `satisfies` clause appears in this bundled output;
+ * presumably it exists only in the TypeScript source (src/diagnose/types.ts)
+ * and is erased at build time — confirm there, since nothing in this file
+ * enforces the list at runtime.
4952
+ */
4953
+ const FIXABLE_DIAGNOSIS_TYPES = [
4954
+ "SELECTOR_DRIFT",
4955
+ "TIMING_ISSUE",
4956
+ "OVER_ASSERTION",
4957
+ "DATA_MISSING"
4958
+ ];
4959
+ //#endregion
4960
+ //#region src/report/schema.ts
4961
+ /**
4962
+ * The three-way root-cause call for a failing spec, framed as drift analysis:
4963
+ * - TEST_DRIFT: what the spec verifies is unchanged; only the test code
4964
+ * drifted from the source (selector rename, timing, ...).
4965
+ * Future iterations may auto-fix these.
4966
+ * - SPEC_CHANGE: the thing being verified itself changed (UI redesign,
4967
+ * spec change). Never auto-fix — a human must re-draft.
4968
+ * - PRODUCT_BUG: neither of the above explains the failure — treat it as
4969
+ * a product regression.
4970
+ *
4971
+ * The stakeholder ask behind this module is measurement-first: the call is
4972
+ * known to be hard, so every prediction is embedded in the HTML report where
4973
+ * a human records the ground truth and the report computes the confusion
4974
+ * matrix client-side. Accuracy may start low; it must be *visible*.
4975
+ */
4976
+ const FAILURE_LABELS = [
4977
+ "TEST_DRIFT",
4978
+ "SPEC_CHANGE",
4979
+ "PRODUCT_BUG"
4980
+ ];
4981
+ const FailureLabelSchema = z.enum(FAILURE_LABELS);
4982
+ /** What the model may answer: the three labels, or UNKNOWN when evidence is weak. */
4983
+ const PREDICTED_LABELS = [...FAILURE_LABELS, "UNKNOWN"];
4984
+ const PredictedLabelSchema = z.enum(PREDICTED_LABELS);
4985
+ const SUB_DIAGNOSES = [...FIXABLE_DIAGNOSIS_TYPES, "NONE"];
4986
+ const FailureEvidenceSchema = z.object({
4987
+ file: z.string().optional(),
4988
+ detail: z.string()
4989
+ });
4990
+ /**
4991
+ * LLM output shape. Deliberately NOT .strict(): the model occasionally adds
4992
+ * keys, and rejecting the whole analysis over an extra field would collapse
4993
+ * a usable prediction into UNKNOWN. Zod's default strips unknown keys.
4994
+ */
4995
+ const FailureAnalysisSchema = z.object({
4996
+ label: PredictedLabelSchema,
4997
+ confidence: z.number().min(0).max(1),
4998
+ subDiagnosis: z.enum(SUB_DIAGNOSES).optional(),
4999
+ evidence: z.array(FailureEvidenceSchema),
5000
+ reasoning: z.string()
5001
+ });
5002
+ const ReportAssertionSchema = z.object({
5003
+ name: z.string(),
5004
+ status: z.enum([
5005
+ "passed",
5006
+ "failed",
5007
+ "skipped"
5008
+ ]),
5009
+ durationMs: z.number().nullable()
5010
+ });
5011
+ const ReportSpecResultSchema = z.object({
5012
+ feature: z.string(),
5013
+ spec: z.string(),
5014
+ status: z.enum(["passed", "failed"]),
5015
+ testCounts: z.object({
5016
+ total: z.number(),
5017
+ passed: z.number(),
5018
+ failed: z.number()
5019
+ }).nullable(),
5020
+ durationMs: z.number().nullable(),
5021
+ assertions: z.array(ReportAssertionSchema).nullable(),
5022
+ analysis: FailureAnalysisSchema.nullable(),
5023
+ analysisSkipped: z.string().nullable(),
5024
+ driftIssues: z.array(DraftIssueSchema).nullable(),
5025
+ failureLogExcerpt: z.string().nullable(),
5026
+ diffExcerpt: z.string().nullable(),
5027
+ specYaml: z.string().nullable()
5028
+ });
5029
+ z.object({
5030
+ schemaVersion: z.literal(1),
5031
+ createdAt: z.string(),
5032
+ runId: z.string().nullable(),
5033
+ git: z.object({
5034
+ head: z.string().nullable(),
5035
+ base: z.string().nullable()
5036
+ }),
5037
+ model: z.string().nullable(),
5038
+ promptVersion: z.string(),
5039
+ results: z.array(ReportSpecResultSchema)
5040
+ });
5041
+ /** Shape of the "export labels" download produced by the report's client-side JS. */
5042
+ const LabelEntrySchema = z.object({
5043
+ feature: z.string(),
5044
+ spec: z.string(),
5045
+ predicted: PredictedLabelSchema,
5046
+ label: FailureLabelSchema,
5047
+ note: z.string().optional()
5048
+ });
5049
+ z.object({
5050
+ schemaVersion: z.literal(1),
5051
+ runId: z.string().nullable(),
5052
+ promptVersion: z.string(),
5053
+ exportedAt: z.string(),
5054
+ labels: z.array(LabelEntrySchema)
5055
+ });
5056
+ //#endregion
5057
+ //#region src/report/analyze.ts
5058
+ /**
5059
+ * Classify one failing spec into TEST_DRIFT / SPEC_CHANGE / PRODUCT_BUG /
5060
+ * UNKNOWN. Same resilience contract as diagnose(): read-only tools, JSON-only
5061
+ * final message, and any parse failure degrades to UNKNOWN with confidence 0
5062
+ * rather than throwing — the report must always render.
+ *
+ * Resolves to `{ analysis, raw, sdkError }`:
+ * - `analysis`: the first JSON candidate from the reply that both parses and
+ *   normalises, otherwise an UNKNOWN placeholder whose reasoning says why.
+ * - `raw`: the model's reply text ("" when there was none).
+ * - `sdkError`: the `isError` flag when the call errored or returned nothing,
+ *   false otherwise.
+ *
+ * `options.model` and `options.cwd` are forwarded only when truthy.
+ *
+ * NOTE(review): a rejection from invokeClaudeStreaming itself is not caught
+ * here — confirm whether it can reject and whether callers handle that.
5063
+ */
5064
+ async function analyzeFailure(input, options = {}) {
5065
+ const { result: raw, isError } = await invokeClaudeStreaming({
5066
+ prompt: buildFailureAnalysisPrompt(input),
5067
+ allowedTools: [
5068
+ "Read",
5069
+ "Grep",
5070
+ "Glob"
5071
+ ],
5072
+ silenceBashLog: true,
5073
+ maxTurns: 12,
5074
+ ...options.model ? { model: options.model } : {},
5075
+ ...options.cwd ? { cwd: options.cwd } : {}
5076
+ }, () => {});
5077
+ if (isError || !raw) return {
5078
+ analysis: unknownAnalysis(isError ? "Claude returned an error result" : "Claude returned no output"),
5079
+ raw: raw ?? "",
5080
+ sdkError: isError
5081
+ };
5082
+ for (const candidate of extractJsonCandidates(raw)) {
5083
+ let parsed;
5084
+ try {
5085
+ parsed = JSON.parse(candidate);
5086
+ } catch {
5087
+ continue;
5088
+ }
5089
+ const normalised = normaliseFailureAnalysis(parsed);
5090
+ if (normalised) return {
5091
+ analysis: normalised,
5092
+ raw,
5093
+ sdkError: false
5094
+ };
5095
+ }
5096
+ return {
5097
+ analysis: unknownAnalysis(`analysis returned no parseable JSON: ${truncate$2(raw, 500)}`),
5098
+ raw,
5099
+ sdkError: false
5100
+ };
5101
+ }
5102
+ function unknownAnalysis(reasoning) { // Builds the placeholder result used when no usable prediction exists; `reasoning` carries the explanation.
5103
+ return {
5104
+ label: "UNKNOWN",
5105
+ confidence: 0,
5106
+ subDiagnosis: "NONE",
5107
+ evidence: [],
5108
+ reasoning
5109
+ };
5110
+ }
5111
+ const LABELS = new Set(PREDICTED_LABELS);
5112
+ const SUB_SET = new Set(SUB_DIAGNOSES);
5113
+ /**
5114
+ * Manual, lenient normalisation (mirrors diagnose's normaliseResult): a
5115
+ * missing/extra field should degrade gracefully, not reject the whole
5116
+ * prediction — only an unrecognisable label makes the candidate unusable.
+ *
+ * Returns null when `parsed` fails the isObject check or its `label` is not a
+ * string contained in LABELS. Otherwise every other field falls back to a default:
+ * - `confidence`: passed through clamp(value, 0, 1) when a number, else 0
+ * - `reasoning`: the string as given, else ""
+ * - `subDiagnosis`: the value when it is in SUB_SET, else "NONE"
+ * - `evidence`: only items that pass isObject and have a string `detail`;
+ *   `file` is kept only when it is a string
5117
+ */
5118
+ function normaliseFailureAnalysis(parsed) {
5119
+ if (!isObject(parsed)) return null;
5120
+ const label = parsed["label"];
5121
+ if (typeof label !== "string" || !LABELS.has(label)) return null;
5122
+ const confidence = typeof parsed["confidence"] === "number" ? clamp(parsed["confidence"], 0, 1) : 0;
5123
+ const reasoning = typeof parsed["reasoning"] === "string" ? parsed["reasoning"] : "";
5124
+ const rawSub = parsed["subDiagnosis"];
5125
+ const subDiagnosis = typeof rawSub === "string" && SUB_SET.has(rawSub) ? rawSub : "NONE";
5126
+ const evidence = [];
5127
+ if (Array.isArray(parsed["evidence"])) for (const item of parsed["evidence"]) {
5128
+ if (!isObject(item)) continue;
5129
+ const detail = typeof item["detail"] === "string" ? item["detail"] : null;
5130
+ if (detail === null) continue;
5131
+ const file = typeof item["file"] === "string" ? item["file"] : void 0;
5132
+ evidence.push(file !== void 0 ? {
5133
+ file,
5134
+ detail
5135
+ } : { detail });
5136
+ }
5137
+ return {
5138
+ label,
5139
+ confidence,
5140
+ subDiagnosis,
5141
+ evidence,
5142
+ reasoning
5143
+ };
5144
+ }
5145
/**
 * Capture the PR diff used as context for failure analysis. `--relative`
 * re-roots paths to `cwd` and drops changes outside it, matching how
 * relatedPaths are declared in a monorepo sub-package.
 *
 * Errors (unknown base ref, not a git repo, ...) are returned, not thrown:
 * the report is still worth generating without diff context. That holds for
 * any rejection value, including ones that are not Error instances.
 *
 * @param {string} base - Ref to compare against; diffed as `${base}...HEAD`.
 * @param {string} cwd - Directory the git commands run in.
 * @returns {Promise<{ok: true, diff: {patch: string, nameStatus: string, head: string}} | {ok: false, error: string}>}
 *   On success the full patch, the trimmed name-status listing and the short
 *   HEAD hash; on failure the first line of the error message, or
 *   "git diff failed" when that line is empty.
 */
async function capturePrDiff(base, cwd) {
  const range = `${base}...HEAD`;
  try {
    // The three git calls are independent, so they run concurrently.
    const [{ stdout: head }, { stdout: patch }, { stdout: nameStatus }] = await Promise.all([
      execFileP("git", ["rev-parse", "--short", "HEAD"], { cwd }),
      execFileP("git", ["diff", "-M", "--relative", range], {
        cwd,
        maxBuffer: 64 * 1024 * 1024
      }),
      execFileP("git", ["diff", "--name-status", "-M", "--relative", range], {
        cwd,
        maxBuffer: 32 * 1024 * 1024
      })
    ]);
    return {
      ok: true,
      diff: {
        patch,
        nameStatus: nameStatus.trim(),
        head: head.trim()
      }
    };
  } catch (e) {
    // Reading `.message` off a non-Error rejection would throw from inside
    // the handler and break the "returned, not thrown" contract.
    const message = e instanceof Error ? e.message : String(e);
    // split() always yields a first element, so the fallback has to be
    // triggered by an empty string rather than by null/undefined.
    return {
      ok: false,
      error: message.split("\n")[0] || "git diff failed"
    };
  }
}
5196
/**
 * Split a unified diff into per-file sections on `diff --git` boundaries.
 * The path is taken from the `b/` side so renames/edits key on the
 * post-change layout — the same side relatedPaths are written against.
 *
 * Every line starting with `diff --git ` opens a new section, including
 * headers git writes in quoted form (paths with non-ASCII bytes, quotes or
 * control characters), so such a file is never folded into its neighbour.
 */
const DIFF_HEADER = /^diff --git a\/(.+) b\/(.+)$/;
const DIFF_GIT_PREFIX = "diff --git ";
/** Quoted `"b/..."` side at the end of a header line. */
const DIFF_QUOTED_B_SIDE = / "b\/((?:[^"\\]|\\.)*)"$/;
/** One backslash escape inside a git-quoted path: octal byte or single char. */
const GIT_QUOTE_ESCAPE = /\\([0-7]{1,3}|[\s\S])/g;
const GIT_QUOTE_SIMPLE = {
	a: 7,
	b: 8,
	t: 9,
	n: 10,
	v: 11,
	f: 12,
	r: 13
};
/** Decode the inside of a git C-style quoted path (octal escapes are raw UTF-8 bytes). */
function unquoteGitPath(quoted) {
	const chunks = [];
	let cursor = 0;
	for (const m of quoted.matchAll(GIT_QUOTE_ESCAPE)) {
		chunks.push(Buffer.from(quoted.slice(cursor, m.index), "utf8"));
		const seq = m[1];
		const byte = /^[0-7]/.test(seq) ? Number.parseInt(seq, 8) : GIT_QUOTE_SIMPLE[seq];
		// Unknown escapes (`\"`, `\\`) stand for the escaped character itself.
		chunks.push(byte === void 0 ? Buffer.from(seq, "utf8") : Buffer.from([byte & 255]));
		cursor = m.index + m[0].length;
	}
	chunks.push(Buffer.from(quoted.slice(cursor), "utf8"));
	return Buffer.concat(chunks).toString("utf8");
}
/** Return the post-change path named by a `diff --git` header, or null for any other line. */
function diffHeaderPath(line) {
	if (!line.startsWith(DIFF_GIT_PREFIX)) return null;
	const quoted = DIFF_QUOTED_B_SIDE.exec(line);
	if (quoted) return unquoteGitPath(quoted[1]);
	const rest = line.slice(DIFF_GIT_PREFIX.length);
	// Non-rename headers are `a/P b/P`; splitting by length keeps paths that
	// themselves contain " b/" intact.
	const half = (rest.length - 5) / 2;
	if (Number.isInteger(half) && half > 0 && rest.startsWith("a/") && rest.slice(2 + half, 5 + half) === " b/" && rest.slice(2, 2 + half) === rest.slice(5 + half)) return rest.slice(5 + half);
	const m = DIFF_HEADER.exec(line);
	if (m) return m[2];
	// Unrecognised layout: still a file boundary, keyed on whatever is there.
	const lastB = rest.lastIndexOf(" b/");
	return lastB >= 0 ? rest.slice(lastB + 3) : rest;
}
/**
 * @param {string} patch - Unified diff text as produced by `git diff`.
 * @returns {{path: string, body: string}[]} One entry per file, in patch order;
 *   `body` starts with the file's `diff --git` line.
 */
function splitPatchByFile(patch) {
	const sections = [];
	let current = null;
	const flush = () => {
		if (current) sections.push({
			path: current.path,
			body: current.lines.join("\n")
		});
		current = null;
	};
	for (const line of patch.split("\n")) {
		const path = diffHeaderPath(line);
		if (path !== null) {
			flush();
			current = {
				path,
				lines: [line]
			};
		} else if (current) current.lines.push(line);
	}
	flush();
	return sections;
}
5226
/**
 * Narrow a patch to the files a spec depends on and cap its size so the
 * analysis prompt stays bounded.
 *
 * When `relatedPaths` is null/empty — or none of the sections match it — the
 * whole patch is kept (still truncated). `patch` may be raw diff text or
 * sections already produced by splitPatchByFile, which lets a caller split
 * once and scope for many specs.
 *
 * @param {string | {path: string, body: string}[]} patch
 * @param {string[] | null | undefined} relatedPaths
 * @param {{perFile?: number, total?: number}} [caps] - Character caps;
 *   defaults are 8192 per file and 49152 overall.
 * @returns {string} The kept sections joined by newlines, with
 *   `[truncated: ...]` markers wherever content was cut or omitted.
 */
function scopePatchForSpec(patch, relatedPaths, caps = {}) {
	const fileCap = caps.perFile ?? 8192;
	const budget = caps.total ?? 49152;
	const everything = typeof patch === "string" ? splitPatchByFile(patch) : patch;
	let chosen = everything;
	if (relatedPaths && relatedPaths.length > 0) {
		const matching = everything.filter((section) => isPathAffectedBy(section.path, relatedPaths));
		// No match at all: fall back to the full patch rather than nothing.
		if (matching.length > 0) chosen = matching;
	}
	const pieces = [];
	let consumed = 0;
	let omitted = 0;
	chosen.forEach(({ path, body }) => {
		if (consumed >= budget) {
			omitted += 1;
			return;
		}
		let text = body;
		if (text.length > fileCap) {
			const overflow = text.length - fileCap;
			text = `${text.slice(0, fileCap)}\n[truncated: ${overflow} more chars of ${path}]`;
		}
		if (consumed + text.length > budget) {
			const room = budget - consumed;
			text = `${text.slice(0, room)}\n[truncated: total patch cap reached]`;
		}
		pieces.push(text);
		consumed += text.length;
	});
	if (omitted > 0) pieces.push(`[truncated: ${omitted} more changed file(s) omitted]`);
	return pieces.join("\n");
}
5257
+ //#endregion
5258
+ //#region src/report/render.ts
5259
/**
 * Render the run report as ONE self-contained HTML file (inline CSS/JS, no
 * network). It is meant to be uploaded as a CI artifact like Playwright's
 * HTML report and opened locally; the layout deliberately mirrors that
 * report's conventions — header stats that double as filters, a search box,
 * collapsible per-spec cards with a step list and durations, automatic
 * light/dark theme.
 *
 * The measurement loop lives client-side: each analyzed failure gets
 * ground-truth radio buttons, and a vanilla-JS block recomputes accuracy /
 * confusion matrix / per-class precision-recall on every change. Labels
 * persist in localStorage and can be exported/imported as JSON
 * (LabelsExportSchema) so the grading work survives the browser session.
 *
 * @param {object} data - Report payload; this function reads `results`,
 *   `createdAt`, `runId`, `git.head`, `git.base` and `promptVersion`, and
 *   embeds the whole object as JSON for the client script.
 * @returns {string} The complete HTML document.
 */
function renderRunReport(data) {
	const failed = data.results.filter((r) => r.status === "failed");
	const analyzed = failed.filter((r) => r.analysis !== null);
	// Count passes explicitly: deriving them as "total - failed" would label
	// any other status as passed, while the chip's filter only matches
	// data-status="passed".
	const passedCount = data.results.filter((r) => r.status === "passed").length;
	const totalDuration = data.results.reduce((sum, r) => sum + (r.durationMs ?? 0), 0);
	// "<" is escaped so the payload cannot terminate the <script> element.
	const dataJson = JSON.stringify(data).replace(/</g, "\\u003c");
	return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>ccqa run report</title>
<style>${CSS}</style>
</head>
<body>
<header>
<div class="header-inner">
<div class="header-top">
<h1>ccqa run report</h1>
<div class="meta">
<span title="generated at">${esc(formatDate(data.createdAt))}</span>
${totalDuration > 0 ? `<span>${formatDuration$1(totalDuration)}</span>` : ""}
${data.runId ? `<span>CI run ${esc(data.runId)}</span>` : ""}
${data.git.head ? `<span><code>${esc(data.git.head)}</code>${data.git.base ? ` vs <code>${esc(data.git.base)}</code>` : ""}</span>` : ""}
<span class="dim">prompt v${esc(data.promptVersion)}</span>
</div>
</div>
<div class="toolbar">
<div class="chips" id="filter-chips">
<button type="button" class="chip active" data-filter="all">All <span class="count">${data.results.length}</span></button>
<button type="button" class="chip chip-pass" data-filter="passed">${passedCount} passed</button>
<button type="button" class="chip chip-fail" data-filter="failed">${failed.length} failed</button>
</div>
<input type="search" id="search" placeholder="Filter by name…" autocomplete="off">
</div>
</div>
</header>

<div class="page">
${analyzed.length > 0 ? metricsPanel() : ""}

<main id="spec-list">
${data.results.map((r, i) => renderResult(r, i)).join("\n")}
</main>
<p class="empty-note" id="no-match" hidden>No specs match the current filter.</p>
</div>

<script type="application/json" id="ccqa-report-data">${dataJson}<\/script>
<script>${CLIENT_JS}<\/script>
</body>
</html>
`;
}
5326
/**
 * Static markup for the "Prediction accuracy" panel: export/import controls,
 * a usage hint, and the empty `#metrics` container.
 *
 * @returns {string} HTML for the panel section.
 */
function metricsPanel() {
	const actions = [
		`<div class="measure-actions">`,
		`<button type="button" id="export-labels">Export labels (JSON)</button>`,
		`<label class="import-label">Import labels<input type="file" id="import-labels" accept="application/json"></label>`,
		`</div>`
	];
	const head = [
		`<div class="panel-head">`,
		`<h2>Prediction accuracy</h2>`,
		...actions,
		`</div>`
	];
	const hint = `<p class="hint">Grade each failed case below with its true cause; the matrix updates live. Labels are saved in this browser (localStorage) — export them to keep or merge.</p>`;
	return [
		`<section class="panel" id="measure-panel">`,
		...head,
		hint,
		`<div id="metrics"></div>`,
		`</section>`
	].join("\n");
}
5339
/**
 * Render one spec result as a collapsible card. Failed specs start expanded
 * and show either the analysis block or the "analysis skipped" note.
 *
 * @param {object} r - Spec result (feature, spec, status, optional
 *   durationMs / testCounts / analysis and raw excerpts).
 * @param {number} index - Position of `r` in the results array; forwarded to
 *   renderAnalysis.
 * @returns {string} HTML for the `<details class="spec">` card.
 */
function renderResult(r, index) {
	const caseId = `${r.feature}/${r.spec}`;
	const isFailed = r.status === "failed";
	let durationHtml = "";
	if (r.durationMs != null && r.durationMs > 0) durationHtml = `<span class="duration">${formatDuration$1(r.durationMs)}</span>`;
	let countsHtml = "";
	if (r.testCounts) countsHtml = `<span class="counts">${r.testCounts.passed}/${r.testCounts.total}</span>`;
	let badgeHtml = "";
	if (isFailed && r.analysis) badgeHtml = `<span class="badge ${r.analysis.label}">${r.analysis.label}</span>`;
	let verdictHtml = "";
	if (isFailed) verdictHtml = r.analysis ? renderAnalysis(r, index) : renderSkipped(r);
	const summaryLines = [
		statusIcon(r.status),
		`<span class="spec-name">${esc(caseId)}</span>`,
		badgeHtml,
		`<span class="spacer"></span>`,
		countsHtml,
		durationHtml
	];
	const bodyLines = [
		renderAssertions(r),
		verdictHtml,
		renderDriftIssues(r),
		collapsible("Failure log", r.failureLogExcerpt),
		collapsible("Source diff (scoped)", r.diffExcerpt, "diff"),
		collapsible("spec.yaml", r.specYaml)
	];
	return [
		`<details class="spec ${r.status}" data-status="${r.status}" data-case-id="${esc(caseId)}"${isFailed ? " open" : ""}>`,
		`<summary>`,
		...summaryLines,
		`</summary>`,
		`<div class="spec-body">`,
		...bodyLines,
		`</div>`,
		`</details>`
	].join("\n");
}
5363
/**
 * Small status glyph for a spec or assertion row.
 *
 * @param {string} status - "passed", "failed", or anything else (rendered as
 *   skipped).
 * @returns {string} A `<span class="status-icon ...">` element.
 */
function statusIcon(status) {
	switch (status) {
		case "passed": return `<span class="status-icon pass" aria-label="passed">✓</span>`;
		case "failed": return `<span class="status-icon fail" aria-label="failed">✕</span>`;
		default: return `<span class="status-icon skip" aria-label="skipped">◌</span>`;
	}
}
5368
/**
 * Render a result's assertions as a step list: icon, name and, when a
 * duration is recorded, its formatted value.
 *
 * @param {object} r - Spec result; `r.assertions` may be missing or empty.
 * @returns {string} A `<ul class="steps">` element, or "" when there are no
 *   assertions.
 */
function renderAssertions(r) {
	const steps = r.assertions;
	if (!steps || steps.length === 0) return "";
	let items = "";
	for (const step of steps) {
		let took = "";
		if (step.durationMs != null) took = `<span class="duration">${formatDuration$1(step.durationMs)}</span>`;
		items += `<li>${statusIcon(step.status)}<span class="step-name">${esc(step.name)}</span><span class="spacer"></span>${took}</li>`;
	}
	return `<ul class="steps">${items}</ul>`;
}
5375
/**
 * Render the analysis block of a failed spec: predicted label, confidence
 * bar, optional sub-diagnosis, reasoning, evidence list, and the
 * ground-truth inputs (one radio per FAILURE_LABELS entry plus a note field).
 *
 * The radio group name and the note's `data-case-index` both carry `index`.
 *
 * @param {object} r - Spec result with a non-null `analysis`.
 * @param {number} index - Position of `r` in the results array.
 * @returns {string} HTML for the `<div class="analysis">` block.
 */
function renderAnalysis(r, index) {
	const analysis = r.analysis;
	const percent = Math.round(analysis.confidence * 100);
	let evidenceHtml = "";
	if (analysis.evidence.length > 0) {
		const rows = analysis.evidence.map((item) => {
			const where = item.file ? `<code>${esc(item.file)}</code> — ` : "";
			return `<li>${where}${esc(item.detail)}</li>`;
		});
		evidenceHtml = `<ul class="evidence">${rows.join("")}</ul>`;
	}
	const hasSub = analysis.subDiagnosis && analysis.subDiagnosis !== "NONE";
	const subHtml = hasSub ? `<span class="sub">${esc(analysis.subDiagnosis)}</span>` : "";
	const truthOptions = FAILURE_LABELS.map((label) => `<label class="truth-option ${label}"><input type="radio" name="label--${index}" value="${label}"><span>${label}</span></label>`).join("\n ");
	return `<div class="analysis">
<div class="prediction">
<span class="badge ${analysis.label}">${analysis.label}</span>
<span class="confidence" title="confidence"><span class="confidence-bar"><span style="width:${percent}%"></span></span>${percent}%</span>
${subHtml}
</div>
<p class="reasoning">${esc(analysis.reasoning)}</p>
${evidenceHtml}
<div class="truth">
<span class="truth-title">True cause</span>
${truthOptions}
<input type="text" class="note" placeholder="note (optional)" data-case-index="${index}">
</div>
</div>`;
}
5394
/**
 * Placeholder shown for a failed spec that has no analysis; appends the
 * skip reason when one is recorded.
 *
 * @param {object} r - Spec result; `r.analysisSkipped` is the optional reason.
 * @returns {string} A `<div class="analysis skipped">` element.
 */
function renderSkipped(r) {
	let reason = "";
	if (r.analysisSkipped) reason = `: ${esc(r.analysisSkipped)}`;
	return `<div class="analysis skipped">analysis skipped${reason}</div>`;
}
5397
/**
 * Render the spec↔code drift findings of a result as a collapsed list. Each
 * item shows severity, the category label from DRAFT_CATEGORY_LABEL, the
 * step id when present, the message, and the optional detail.
 *
 * @param {object} r - Spec result; `r.driftIssues` may be missing or empty.
 * @returns {string} A `<details class="drift">` element, or "" when there is
 *   nothing to show.
 */
function renderDriftIssues(r) {
	const issues = r.driftIssues;
	if (!issues || issues.length === 0) return "";
	let rows = "";
	for (const issue of issues) {
		const step = issue.stepId ? `, step ${esc(issue.stepId)}` : "";
		const extra = issue.detail ? ` — ${esc(issue.detail)}` : "";
		rows += `<li><span class="severity ${issue.severity}">${issue.severity}</span> (${esc(DRAFT_CATEGORY_LABEL[issue.category])}${step}) ${esc(issue.message)}${extra}</li>`;
	}
	return `<details class="drift"><summary>Spec↔code drift audit (${issues.length})</summary><ul>${rows}</ul></details>`;
}
5402
/**
 * Wrap raw text in a collapsed `<details>` with a `<pre>` body.
 *
 * @param {string} title - Summary caption (escaped).
 * @param {string | null | undefined} content - Text to show; falsy content
 *   yields no markup at all.
 * @param {string} [kind] - Extra class added next to `raw`.
 * @returns {string} The element, or "" when `content` is falsy.
 */
function collapsible(title, content, kind = "") {
	return content ? `<details class="raw ${kind}"><summary>${esc(title)}</summary><pre>${esc(content)}</pre></details>` : "";
}
5406
/** HTML entity for each character that is unsafe in text or quoted attributes. */
const ESC_MAP = {
	"&": "&amp;",
	"<": "&lt;",
	">": "&gt;",
	"\"": "&quot;",
	"'": "&#39;"
};
/**
 * Escape a value for insertion into HTML text or a quoted attribute.
 *
 * Non-string input is coerced instead of throwing, so one numeric or
 * missing field cannot abort the whole report render; null/undefined
 * become the empty string.
 *
 * @param {unknown} s - Value to escape.
 * @returns {string} The escaped text.
 */
function esc(s) {
	return String(s ?? "").replace(/[&<>"']/g, (c) => ESC_MAP[c]);
}
5416
/**
 * Format a millisecond duration as "<n>ms", "<n.n>s" or "<m>m <s>s".
 *
 * The unit is chosen from the ROUNDED value, so a duration just under a
 * boundary rolls over to the next unit rather than printing "1000ms",
 * "60.0s" or "1m 60s".
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} Human-readable duration.
 */
function formatDuration$1(ms) {
	const wholeMs = Math.round(ms);
	if (wholeMs < 1e3) return `${wholeMs}ms`;
	const tenths = Math.round(ms / 100);
	if (tenths < 600) return `${(tenths / 10).toFixed(1)}s`;
	const totalSeconds = Math.round(ms / 1e3);
	return `${Math.floor(totalSeconds / 60)}m ${totalSeconds % 60}s`;
}
5421
/**
 * Turn an ISO-8601 UTC timestamp into "YYYY-MM-DD HH:MM:SS UTC".
 *
 * The fractional-seconds part is optional, so "…:56Z" is handled the same
 * way as "…:56.789Z". Strings that do not end in "Z" only get the "T"
 * separator replaced.
 *
 * @param {string} iso - ISO-8601 timestamp.
 * @returns {string} Display form of the timestamp.
 */
function formatDate(iso) {
	return iso.replace("T", " ").replace(/(?:\.\d+)?Z$/, " UTC");
}
5424
+ const CSS = `
5425
+ :root {
5426
+ color-scheme: light dark;
5427
+ --bg: #f4f5f7;
5428
+ --surface: #ffffff;
5429
+ --surface-2: #f8f9fa;
5430
+ --border: #e1e4e8;
5431
+ --text: #1f2328;
5432
+ --text-dim: #656d76;
5433
+ --accent: #1f6feb;
5434
+ --pass: #1a7f37;
5435
+ --pass-bg: #dafbe1;
5436
+ --fail: #cf222e;
5437
+ --fail-bg: #ffebe9;
5438
+ --skip: #9a6700;
5439
+ --code-bg: #0d1117;
5440
+ --code-text: #e6edf3;
5441
+ --shadow: 0 1px 3px rgba(31, 35, 40, 0.06);
5442
+ }
5443
+ @media (prefers-color-scheme: dark) {
5444
+ :root {
5445
+ --bg: #0d1117;
5446
+ --surface: #161b22;
5447
+ --surface-2: #1c2129;
5448
+ --border: #30363d;
5449
+ --text: #e6edf3;
5450
+ --text-dim: #8b949e;
5451
+ --accent: #58a6ff;
5452
+ --pass: #3fb950;
5453
+ --pass-bg: rgba(63, 185, 80, 0.15);
5454
+ --fail: #f85149;
5455
+ --fail-bg: rgba(248, 81, 73, 0.15);
5456
+ --skip: #d29922;
5457
+ --code-bg: #010409;
5458
+ --code-text: #e6edf3;
5459
+ --shadow: none;
5460
+ }
5461
+ }
5462
+ * { box-sizing: border-box; }
5463
+ body {
5464
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Hiragino Sans", "Noto Sans JP", sans-serif;
5465
+ margin: 0; background: var(--bg); color: var(--text); font-size: 14px;
5466
+ }
5467
+ header {
5468
+ position: sticky; top: 0; z-index: 10;
5469
+ background: var(--surface); border-bottom: 1px solid var(--border);
5470
+ }
5471
+ .header-inner { max-width: 1080px; margin: 0 auto; padding: 14px 24px 10px; }
5472
+ .header-top { display: flex; align-items: baseline; gap: 18px; flex-wrap: wrap; }
5473
+ h1 { font-size: 17px; margin: 0; font-weight: 650; }
5474
+ h2 { font-size: 14px; margin: 0; font-weight: 650; }
5475
+ .meta { font-size: 12px; color: var(--text-dim); display: flex; gap: 14px; flex-wrap: wrap; }
5476
+ .meta code { background: var(--surface-2); border: 1px solid var(--border); padding: 0 5px; border-radius: 4px; font-size: 11px; }
5477
+ .dim { color: var(--text-dim); }
5478
+ .toolbar { display: flex; align-items: center; gap: 12px; margin-top: 10px; flex-wrap: wrap; }
5479
+ .chips { display: flex; gap: 6px; }
5480
+ .chip {
5481
+ font: inherit; font-size: 12.5px; font-weight: 600; cursor: pointer;
5482
+ padding: 3px 12px; border-radius: 999px; border: 1px solid var(--border);
5483
+ background: var(--surface); color: var(--text-dim);
5484
+ }
5485
+ .chip .count { opacity: 0.7; }
5486
+ .chip.active { background: var(--text); color: var(--surface); border-color: var(--text); }
5487
+ .chip-pass.active { background: var(--pass); border-color: var(--pass); color: #fff; }
5488
+ .chip-fail.active { background: var(--fail); border-color: var(--fail); color: #fff; }
5489
+ #search {
5490
+ font: inherit; font-size: 13px; flex: 1; min-width: 180px; max-width: 320px; margin-left: auto;
5491
+ padding: 5px 12px; border: 1px solid var(--border); border-radius: 6px;
5492
+ background: var(--surface-2); color: var(--text);
5493
+ }
5494
+ #search:focus { outline: 2px solid var(--accent); outline-offset: -1px; }
5495
+ .page { max-width: 1080px; margin: 16px auto; padding: 0 24px; }
5496
+ .panel {
5497
+ background: var(--surface); border: 1px solid var(--border); border-radius: 8px;
5498
+ padding: 14px 18px; margin-bottom: 16px; box-shadow: var(--shadow);
5499
+ }
5500
+ .panel-head { display: flex; align-items: center; justify-content: space-between; gap: 12px; flex-wrap: wrap; }
5501
+ .hint { font-size: 12px; color: var(--text-dim); margin: 6px 0 10px; }
5502
+ .spec {
5503
+ background: var(--surface); border: 1px solid var(--border); border-radius: 8px;
5504
+ margin-bottom: 8px; box-shadow: var(--shadow);
5505
+ }
5506
+ .spec > summary {
5507
+ display: flex; align-items: center; gap: 10px; padding: 10px 16px;
5508
+ cursor: pointer; list-style: none; user-select: none;
5509
+ }
5510
+ .spec > summary::-webkit-details-marker { display: none; }
5511
+ .spec > summary::before {
5512
+ content: "▸"; color: var(--text-dim); font-size: 11px;
5513
+ transition: transform 0.12s ease; flex: 0 0 auto;
5514
+ }
5515
+ .spec[open] > summary::before { transform: rotate(90deg); }
5516
+ .spec-name { font-weight: 600; font-size: 13.5px; }
5517
+ .spacer { flex: 1; }
5518
+ .counts { font-size: 12px; color: var(--text-dim); }
5519
+ .duration { font-size: 12px; color: var(--text-dim); font-variant-numeric: tabular-nums; }
5520
+ .status-icon { font-weight: 700; font-size: 13px; flex: 0 0 auto; }
5521
+ .status-icon.pass { color: var(--pass); }
5522
+ .status-icon.fail { color: var(--fail); }
5523
+ .status-icon.skip { color: var(--skip); }
5524
+ .spec-body { padding: 2px 16px 12px 36px; border-top: 1px solid var(--border); }
5525
+ .steps { list-style: none; margin: 10px 0; padding: 0; }
5526
+ .steps li {
5527
+ display: flex; align-items: center; gap: 8px; padding: 3px 8px;
5528
+ font-size: 13px; border-radius: 5px;
5529
+ }
5530
+ .steps li:hover { background: var(--surface-2); }
5531
+ .step-name { overflow-wrap: anywhere; }
5532
+ .analysis {
5533
+ border: 1px solid var(--border); border-left: 3px solid var(--accent);
5534
+ border-radius: 6px; background: var(--surface-2);
5535
+ padding: 10px 14px; margin: 10px 0;
5536
+ }
5537
+ .analysis.skipped { color: var(--text-dim); font-size: 13px; font-style: italic; border-left-color: var(--border); }
5538
+ .prediction { display: flex; align-items: center; gap: 12px; flex-wrap: wrap; }
5539
+ .badge {
5540
+ font-size: 11.5px; font-weight: 700; letter-spacing: 0.02em;
5541
+ padding: 2px 10px; border-radius: 4px; color: #fff; flex: 0 0 auto;
5542
+ }
5543
+ .badge.TEST_DRIFT { background: #b45309; }
5544
+ .badge.SPEC_CHANGE { background: #1d4ed8; }
5545
+ .badge.PRODUCT_BUG { background: #b91c1c; }
5546
+ .badge.UNKNOWN { background: #6b7280; }
5547
+ .confidence { display: inline-flex; align-items: center; gap: 7px; font-size: 12.5px; font-weight: 600; color: var(--text-dim); }
5548
+ .confidence-bar {
5549
+ display: inline-block; width: 64px; height: 6px; border-radius: 999px;
5550
+ background: var(--border); overflow: hidden;
5551
+ }
5552
+ .confidence-bar > span { display: block; height: 100%; background: var(--accent); border-radius: 999px; }
5553
+ .sub { font-size: 11px; background: var(--surface); border: 1px solid var(--border); color: var(--text-dim); padding: 1px 8px; border-radius: 999px; }
5554
+ .reasoning { font-size: 13px; margin: 9px 0; white-space: pre-wrap; line-height: 1.55; }
5555
+ .evidence { font-size: 12.5px; color: var(--text-dim); margin: 6px 0; padding-left: 18px; line-height: 1.5; }
5556
+ .evidence code { background: var(--surface); border: 1px solid var(--border); padding: 0 5px; border-radius: 4px; font-size: 11px; }
5557
+ .truth {
5558
+ display: flex; align-items: center; gap: 10px; flex-wrap: wrap;
5559
+ background: var(--surface); border: 1px dashed var(--border); border-radius: 6px;
5560
+ padding: 8px 12px; margin-top: 10px; font-size: 12.5px;
5561
+ }
5562
+ .truth-title { font-weight: 650; color: var(--text-dim); }
5563
+ .truth-option {
5564
+ display: inline-flex; align-items: center; gap: 5px; cursor: pointer;
5565
+ border: 1px solid var(--border); border-radius: 999px; padding: 2px 10px;
5566
+ }
5567
+ .truth-option:has(input:checked) { border-color: var(--accent); background: var(--surface-2); font-weight: 650; }
5568
+ .note { flex: 1; min-width: 150px; font: inherit; font-size: 12px; padding: 4px 9px; border: 1px solid var(--border); border-radius: 5px; background: var(--surface-2); color: var(--text); }
5569
+ details.raw, details.drift { margin: 7px 0; font-size: 13px; }
5570
+ details.raw summary, details.drift summary { cursor: pointer; color: var(--text-dim); }
5571
+ details.raw pre {
5572
+ background: var(--code-bg); color: var(--code-text);
5573
+ font-size: 11.5px; line-height: 1.5; padding: 12px 14px; border-radius: 6px;
5574
+ overflow-x: auto; white-space: pre-wrap; word-break: break-word; margin: 6px 0;
5575
+ }
5576
+ .severity { font-size: 10.5px; font-weight: 700; padding: 0 6px; border-radius: 4px; margin-right: 4px; }
5577
+ .severity.ERROR { background: var(--fail-bg); color: var(--fail); }
5578
+ .severity.WARN { background: rgba(212, 167, 44, 0.18); color: var(--skip); }
5579
+ .severity.OK { background: var(--pass-bg); color: var(--pass); }
5580
+ .drift ul { padding-left: 18px; font-size: 12.5px; line-height: 1.55; }
5581
+ table.matrix { border-collapse: collapse; font-size: 12.5px; margin: 10px 16px 10px 0; display: inline-table; vertical-align: top; }
5582
+ table.matrix th, table.matrix td { border: 1px solid var(--border); padding: 4px 12px; text-align: center; }
5583
+ table.matrix th { background: var(--surface-2); font-weight: 600; }
5584
+ table.matrix td { font-variant-numeric: tabular-nums; }
5585
+ table.matrix td.hit { background: var(--pass-bg); font-weight: 700; }
5586
+ table.matrix td.miss-nonzero { background: var(--fail-bg); }
5587
+ .stats { font-size: 13px; }
5588
+ .stats .big { font-size: 17px; font-weight: 700; }
5589
+ .measure-actions { display: flex; gap: 14px; align-items: center; font-size: 12.5px; }
5590
+ .measure-actions button {
5591
+ font: inherit; font-size: 12.5px; padding: 4px 13px; cursor: pointer;
5592
+ border: 1px solid var(--border); border-radius: 6px; background: var(--surface); color: var(--text);
5593
+ }
5594
+ .measure-actions button:hover { background: var(--surface-2); }
5595
+ .import-label { cursor: pointer; color: var(--text-dim); }
5596
+ .import-label input { display: none; }
5597
+ .empty-note { color: var(--text-dim); text-align: center; font-size: 13px; }
5598
+ `;
5599
/**
 * Browser-side script inlined into the report. It drives the status/search
 * filters and the grading loop: ground-truth labels are kept in
 * localStorage (keyed by run id, else creation time), metrics are
 * recomputed on every change, and labels can be exported/imported as JSON.
 *
 * Anything read back from localStorage or an imported file is treated as
 * untrusted: state must be a plain object and only labels listed in
 * FAILURE_LABELS are accepted.
 */
const CLIENT_JS = `
(function () {
  var dataEl = document.getElementById('ccqa-report-data');
  if (!dataEl) return;
  var data = JSON.parse(dataEl.textContent);
  var LABELS = ${JSON.stringify(FAILURE_LABELS)};
  var PRED_LABELS = LABELS.concat(['UNKNOWN']);
  var runKey = String(data.runId || data.createdAt);
  var storageKey = 'ccqa-report:' + runKey;

  function isKnownLabel(v) { return typeof v === 'string' && LABELS.indexOf(v) >= 0; }

  // ---- filtering ------------------------------------------------------
  var activeFilter = 'all';
  var searchQuery = '';

  function applyFilters() {
    var sections = document.querySelectorAll('.spec');
    var visible = 0;
    sections.forEach(function (el) {
      var statusOk = activeFilter === 'all' || el.getAttribute('data-status') === activeFilter;
      var name = (el.getAttribute('data-case-id') || '').toLowerCase();
      var searchOk = !searchQuery || name.indexOf(searchQuery) >= 0;
      var show = statusOk && searchOk;
      el.style.display = show ? '' : 'none';
      if (show) visible++;
    });
    var note = document.getElementById('no-match');
    if (note) note.hidden = visible > 0;
  }

  var chips = document.querySelectorAll('#filter-chips .chip');
  chips.forEach(function (chip) {
    chip.addEventListener('click', function () {
      activeFilter = chip.getAttribute('data-filter') || 'all';
      chips.forEach(function (c) { c.classList.toggle('active', c === chip); });
      applyFilters();
    });
  });

  var search = document.getElementById('search');
  if (search) {
    search.addEventListener('input', function () {
      searchQuery = search.value.trim().toLowerCase();
      applyFilters();
    });
  }

  // ---- measurement ----------------------------------------------------
  // cases: analyzed failures only — they carry a prediction we can grade.
  var cases = [];
  for (var i = 0; i < data.results.length; i++) {
    var r = data.results[i];
    if (r.status === 'failed' && r.analysis) {
      cases.push({ index: i, feature: r.feature, spec: r.spec, predicted: r.analysis.label });
    }
  }

  var state = {};
  try {
    var stored = JSON.parse(localStorage.getItem(storageKey) || '{}');
    // A stored null, array or scalar would break every state[key] access below.
    if (stored && typeof stored === 'object' && !Array.isArray(stored)) state = stored;
  } catch (e) { state = {}; }

  function save() {
    try { localStorage.setItem(storageKey, JSON.stringify(state)); } catch (e) {}
  }

  function caseKey(c) { return c.feature + '/' + c.spec; }

  function entryFor(key) {
    var entry = state[key];
    if (!entry || typeof entry !== 'object') { entry = {}; state[key] = entry; }
    return entry;
  }

  function applyStateToInputs() {
    cases.forEach(function (c) {
      var entry = state[caseKey(c)];
      if (!entry) return;
      // Guard: only known labels may flow into the attribute selector below
      // (localStorage is user-controlled; anything else is dropped).
      if (isKnownLabel(entry.label)) {
        var radio = document.querySelector('input[name="label--' + c.index + '"][value="' + entry.label + '"]');
        if (radio) radio.checked = true;
      }
      var note = document.querySelector('.note[data-case-index="' + c.index + '"]');
      if (note && entry.note) note.value = entry.note;
    });
  }

  function renderMetrics() {
    var target = document.getElementById('metrics');
    if (!target) return;

    var m = {};
    PRED_LABELS.forEach(function (p) {
      m[p] = {};
      LABELS.forEach(function (a) { m[p][a] = 0; });
    });

    var labeled = 0;
    var correct = 0;
    cases.forEach(function (c) {
      var entry = state[caseKey(c)];
      if (!entry || !isKnownLabel(entry.label)) return;
      // A prediction outside the matrix rows is counted as UNKNOWN instead
      // of throwing and blanking the whole panel.
      var predicted = PRED_LABELS.indexOf(c.predicted) >= 0 ? c.predicted : 'UNKNOWN';
      labeled++;
      m[predicted][entry.label]++;
      if (predicted === entry.label) correct++;
    });

    var html = '';
    html += '<div class="stats"><span class="big">' +
      (labeled === 0 ? '–' : Math.round((correct / labeled) * 100) + '%') +
      '</span> accuracy · ' + labeled + ' labeled / ' + cases.length + ' analyzed failures' +
      (cases.length - labeled > 0 ? ' · <strong>' + (cases.length - labeled) + ' unlabeled</strong>' : '') +
      '</div>';

    html += '<table class="matrix"><thead><tr><th>predicted \\\\ actual</th>';
    LABELS.forEach(function (a) { html += '<th>' + a + '</th>'; });
    html += '</tr></thead><tbody>';
    PRED_LABELS.forEach(function (p) {
      html += '<tr><th>' + p + '</th>';
      LABELS.forEach(function (a) {
        var v = m[p][a];
        var cls = p === a ? 'hit' : (v > 0 ? 'miss-nonzero' : '');
        html += '<td class="' + cls + '">' + v + '</td>';
      });
      html += '</tr>';
    });
    html += '</tbody></table>';

    html += '<table class="matrix"><thead><tr><th>class</th><th>precision</th><th>recall</th><th>F1</th><th>support</th></tr></thead><tbody>';
    LABELS.forEach(function (cls) {
      var tp = m[cls][cls];
      var predictedAs = 0;
      LABELS.forEach(function (a) { predictedAs += m[cls][a]; });
      var actualAs = 0;
      PRED_LABELS.forEach(function (p) { actualAs += m[p][cls]; });
      var precision = predictedAs > 0 ? tp / predictedAs : null;
      var recall = actualAs > 0 ? tp / actualAs : null;
      var f1 = precision !== null && recall !== null && precision + recall > 0
        ? (2 * precision * recall) / (precision + recall) : null;
      html += '<tr><th>' + cls + '</th><td>' + fmt(precision) + '</td><td>' + fmt(recall) +
        '</td><td>' + fmt(f1) + '</td><td>' + actualAs + '</td></tr>';
    });
    html += '</tbody></table>';

    target.innerHTML = html;
  }

  function fmt(v) { return v === null ? '–' : (Math.round(v * 100) / 100).toFixed(2); }

  function findCaseByIndex(index) {
    for (var i = 0; i < cases.length; i++) {
      if (cases[i].index === index) return cases[i];
    }
    return null;
  }

  document.addEventListener('change', function (e) {
    var t = e.target;
    if (t && t.name && t.name.indexOf('label--') === 0) {
      var index = parseInt(t.name.slice('label--'.length), 10);
      var c = findCaseByIndex(index);
      if (!c) return;
      entryFor(caseKey(c)).label = t.value;
      save();
      renderMetrics();
    }
  });

  document.addEventListener('input', function (e) {
    var t = e.target;
    if (t && t.classList && t.classList.contains('note')) {
      var index = parseInt(t.getAttribute('data-case-index'), 10);
      var c = findCaseByIndex(index);
      if (!c) return;
      entryFor(caseKey(c)).note = t.value;
      save();
    }
  });

  var exportBtn = document.getElementById('export-labels');
  if (exportBtn) {
    exportBtn.addEventListener('click', function () {
      var labels = [];
      cases.forEach(function (c) {
        var entry = state[caseKey(c)];
        if (!entry || !isKnownLabel(entry.label)) return;
        var item = { feature: c.feature, spec: c.spec, predicted: c.predicted, label: entry.label };
        if (entry.note) item.note = entry.note;
        labels.push(item);
      });
      var payload = {
        schemaVersion: 1,
        runId: data.runId,
        promptVersion: data.promptVersion,
        exportedAt: new Date().toISOString(),
        labels: labels
      };
      var blob = new Blob([JSON.stringify(payload, null, 2)], { type: 'application/json' });
      var url = URL.createObjectURL(blob);
      var a = document.createElement('a');
      a.href = url;
      a.download = 'ccqa-labels-' + runKey.replace(/[^A-Za-z0-9_-]/g, '_') + '.json';
      a.click();
      // Revoke after the click has been dispatched so the download can start.
      setTimeout(function () { URL.revokeObjectURL(url); }, 0);
    });
  }

  var importInput = document.getElementById('import-labels');
  if (importInput) {
    importInput.addEventListener('change', function () {
      var file = importInput.files && importInput.files[0];
      if (!file) return;
      var reader = new FileReader();
      reader.onload = function () {
        try {
          var payload = JSON.parse(String(reader.result));
          var items = payload && Array.isArray(payload.labels) ? payload.labels : [];
          items.forEach(function (item) {
            if (!item || typeof item.feature !== 'string' || typeof item.spec !== 'string') return;
            var entry = entryFor(item.feature + '/' + item.spec);
            if (isKnownLabel(item.label)) entry.label = item.label;
            if (typeof item.note === 'string' && item.note) entry.note = item.note;
          });
          save();
          applyStateToInputs();
          renderMetrics();
        } catch (e) {
          alert('Could not parse labels JSON: ' + e.message);
        }
        // Clear the selection so picking the same file again fires "change".
        importInput.value = '';
      };
      reader.readAsText(file);
    });
  }

  applyStateToInputs();
  renderMetrics();
})();
`;
4789
5831
  //#endregion
4790
5832
  //#region src/cli/run.ts
4791
5833
  const USER_VITEST_CONFIG = resolve(".ccqa/vitest.config.ts");
5834
+ const DEFAULT_REPORT_DIR = "ccqa-report";
4792
5835
  async function resolveVitestConfig() {
4793
5836
  try {
4794
5837
  await access(USER_VITEST_CONFIG);
@@ -4797,7 +5840,7 @@ async function resolveVitestConfig() {
4797
5840
  return bundledVitestConfigPath();
4798
5841
  }
4799
5842
  }
4800
- const runCommand = addLanguageOption(new Command("run").argument("[target]", "Spec to run: '<feature>/<spec>', '<feature>', or omit for all").description("Run generated agent-browser test scripts. Pass --drift to invoke a Claude-driven drift analysis on each failing spec (skipped silently when no test fails). Requires ANTHROPIC_API_KEY or a local Claude login.").option("--drift", "On vitest failure, run drift analysis on the failing specs").option("--drift-strict", "Treat drift ERROR findings as a run failure (exit 1 even if vitest passed). Implies --drift.").option("--format <fmt>", "Output format for the drift block: text | json | github", "text").option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID. Used by --drift only. Overrides CCQA_MODEL.")).action(async (target, opts) => {
5843
+ const runCommand = addLanguageOption(new Command("run").argument("[target]", "Spec to run: '<feature>/<spec>', '<feature>', or omit for all").description("Run generated agent-browser test scripts. Pass --drift-report to also write a self-contained HTML run report: each failing spec gets a drift audit plus a root-cause call (TEST_DRIFT / SPEC_CHANGE / PRODUCT_BUG), and the report lets a human grade the calls to measure their accuracy. Requires ANTHROPIC_API_KEY or a local Claude login for the analysis part.").option("--drift-report [dir]", `Write an HTML run report with drift analysis of failures (default dir: ${DEFAULT_REPORT_DIR}/)`).option("--drift-base <ref>", "Base ref the source diff is taken against for failure analysis (default: GITHUB_BASE_REF, then origin/main)").option("-m, --model <name>", "Claude model alias ('sonnet'|'opus'|'haiku') or full ID. Used by --drift-report only. Overrides CCQA_MODEL.")).action(async (target, opts) => {
4801
5844
  await runTests(target, opts);
4802
5845
  });
4803
5846
  async function runTests(target, opts) {
@@ -4812,6 +5855,7 @@ async function runTests(target, opts) {
4812
5855
  const summaries = [];
4813
5856
  let overallExitCode = 0;
4814
5857
  const vitestConfig = await resolveVitestConfig();
5858
+ const captureOutput = Boolean(opts.driftReport);
4815
5859
  try {
4816
5860
  for (let i = 0; i < specs.length; i++) {
4817
5861
  const { featureName, specName } = specs[i];
@@ -4832,7 +5876,8 @@ async function runTests(target, opts) {
4832
5876
  "--reporter=json",
4833
5877
  `--outputFile.json=${reportFile}`
4834
5878
  ]);
4835
- await Promise.all([streamFiltered(proc.stdout, process.stdout), streamFiltered(proc.stderr, process.stderr)]);
5879
+ const tail = captureOutput ? new TailBuffer(OUTPUT_TAIL_CAP) : null;
5880
+ await Promise.all([streamFiltered(proc.stdout, process.stdout, tail), streamFiltered(proc.stderr, process.stderr, tail)]);
4836
5881
  const exitCode = await proc.exited;
4837
5882
  if (exitCode !== 0) overallExitCode = exitCode;
4838
5883
  const report = await readReport(reportFile);
@@ -4841,12 +5886,13 @@ async function runTests(target, opts) {
4841
5886
  specName,
4842
5887
  scriptFile,
4843
5888
  report,
4844
- exitCode
5889
+ exitCode,
5890
+ outputTail: tail ? tail.toString() : null
4845
5891
  });
4846
5892
  blank();
4847
5893
  }
4848
5894
  printSummary(summaries);
4849
- overallExitCode = await maybeRunDrift(summaries, opts, overallExitCode);
5895
+ await maybeWriteDriftReport(summaries, opts);
4850
5896
  } finally {
4851
5897
  await rm(tmpDir, {
4852
5898
  recursive: true,
@@ -4859,74 +5905,208 @@ function failedSpec(s) {
4859
5905
  if (s.exitCode !== 0) return true;
4860
5906
  return (s.report?.numFailedTests ?? 0) > 0;
4861
5907
  }
4862
- function parseDriftFormat(raw) {
4863
- const v = raw ?? "text";
4864
- if (v === "text" || v === "json" || v === "github") return v;
4865
- error(`invalid --format: ${v} (expected text|json|github)`);
4866
- process.exit(2);
4867
- }
4868
5908
  /**
4869
- * Choose which specs to drift-check. `--drift` is a fail-supplement: only the
4870
- * specs that failed get a drift analysis (the goal is to *explain* a vitest
4871
- * failure). `--drift-strict` is an audit: even passing specs are checked,
4872
- * because the CI need is "fail loud if the spec lags behind the source",
4873
- * which can absolutely happen while vitest is still green against a stale
4874
- * staging environment.
5909
+ * Opt-in post-vitest report hook. With `--drift-report`, a self-contained
5910
+ * HTML report is ALWAYS written (a green run is still a useful run summary);
5911
+ * failing specs additionally get a spec↔code drift audit and a three-way
5912
+ * root-cause call with the PR diff as context. The hook never changes the
5913
+ * exit code — the run's outcome is determined by vitest alone — and when
5914
+ * Claude auth is unavailable only the analysis is skipped, not the report.
4875
5915
  */
4876
- function selectDriftTargets(summaries, opts) {
4877
- if (opts.driftStrict) return summaries;
4878
- if (opts.drift) return summaries.filter(failedSpec);
4879
- return [];
4880
- }
4881
- /**
4882
- * Opt-in post-vitest drift hook. With `--drift`, fires only when at least
4883
- * one spec failed (supplemental signal). With `--drift-strict`, fires
4884
- * unconditionally so a spec/source divergence is caught even when vitest
4885
- * passed. Skips silently when auth is unavailable so the run's exit code
4886
- * is determined by vitest alone.
4887
- */
4888
- async function maybeRunDrift(summaries, opts, currentExitCode) {
4889
- const candidates = selectDriftTargets(summaries, opts);
4890
- if (candidates.length === 0) return currentExitCode;
5916
+ async function maybeWriteDriftReport(summaries, opts) {
5917
+ if (!opts.driftReport) return;
5918
+ const outDir = typeof opts.driftReport === "string" ? opts.driftReport : DEFAULT_REPORT_DIR;
5919
+ const cwd = process.cwd();
4891
5920
  const auth = driftAuthAvailable();
4892
- if (!auth.ok) {
4893
- info(`drift analysis skipped (${auth.reason})`);
4894
- return currentExitCode;
5921
+ const failed = summaries.filter(failedSpec);
5922
+ if (!auth.ok && failed.length > 0) info(`failure analysis skipped (${auth.reason})`);
5923
+ const baseRef = resolveBaseRef(opts.driftBase);
5924
+ let diff = {
5925
+ ok: false,
5926
+ error: "diff not captured (no failures)"
5927
+ };
5928
+ if (failed.length > 0) {
5929
+ diff = await capturePrDiff(baseRef, cwd);
5930
+ if (!diff.ok) info(`drift-report: source diff unavailable (${diff.error}) — analyzing without diff context`);
5931
+ }
5932
+ const tree = failed.length > 0 ? await listFeatureTree(cwd) : [];
5933
+ const specInfoByKey = new Map(tree.flatMap((f) => f.specs.map((sp) => [`${f.featureName}/${sp.specName}`, sp])));
5934
+ const findSpecInfo = (s) => specInfoByKey.get(`${s.featureName}/${s.specName}`) ?? null;
5935
+ let driftResults = [];
5936
+ if (auth.ok && failed.length > 0) {
5937
+ const targets = failed.map((s) => {
5938
+ const spec = findSpecInfo(s);
5939
+ if (!spec) return null;
5940
+ const t = {
5941
+ featureName: s.featureName,
5942
+ specName: s.specName
5943
+ };
5944
+ if (spec.relatedPaths) t.relatedPaths = spec.relatedPaths;
5945
+ if (spec.includedBlocks) t.includedBlocks = spec.includedBlocks;
5946
+ return t;
5947
+ }).filter((t) => t !== null);
5948
+ if (targets.length > 0) driftResults = await analyzeDrift({
5949
+ targets,
5950
+ cwd,
5951
+ blocks: await loadAvailableBlocks(cwd),
5952
+ concurrency: Math.min(3, targets.length),
5953
+ ...opts.model ? { model: opts.model } : {},
5954
+ ...opts.language ? { language: opts.language } : {},
5955
+ onSpecStart: (t) => info(`drift audit: ${t.featureName}/${t.specName}`)
5956
+ });
4895
5957
  }
4896
- const format = parseDriftFormat(opts.format);
4897
- const cwd = process.cwd();
4898
- const tree = await listFeatureTree(cwd);
4899
- const targets = candidates.map((s) => {
4900
- const spec = tree.find((f) => f.featureName === s.featureName)?.specs.find((sp) => sp.specName === s.specName);
4901
- if (!spec) return null;
4902
- const t = {
4903
- featureName: s.featureName,
4904
- specName: s.specName
5958
+ const patchSections = diff.ok && diff.diff.patch.length > 0 ? splitPatchByFile(diff.diff.patch) : null;
5959
+ let printedHeader = false;
5960
+ const results = [];
5961
+ for (const s of summaries) {
5962
+ const assertions = collectAssertions(s);
5963
+ const base = {
5964
+ feature: s.featureName,
5965
+ spec: s.specName,
5966
+ testCounts: s.report ? {
5967
+ total: s.report.numTotalTests,
5968
+ passed: s.report.numPassedTests,
5969
+ failed: s.report.numFailedTests
5970
+ } : null,
5971
+ durationMs: assertions ? assertions.reduce((sum, a) => sum + (a.durationMs ?? 0), 0) : null,
5972
+ assertions
4905
5973
  };
4906
- if (spec.relatedPaths) t.relatedPaths = spec.relatedPaths;
4907
- if (spec.includedBlocks) t.includedBlocks = spec.includedBlocks;
4908
- return t;
4909
- }).filter((t) => t !== null);
4910
- if (targets.length === 0) {
4911
- info("drift analysis skipped (no spec.yaml found for failing specs)");
4912
- return currentExitCode;
4913
- }
4914
- const results = await analyzeDrift({
4915
- targets,
4916
- cwd,
4917
- blocks: await loadAvailableBlocks(cwd),
4918
- concurrency: Math.min(3, targets.length),
4919
- ...opts.model ? { model: opts.model } : {},
4920
- ...opts.language ? { language: opts.language } : {},
4921
- onSpecStart: (t) => {
4922
- if (format === "text") info(`drift: checking ${t.featureName}/${t.specName}`);
5974
+ if (!failedSpec(s)) {
5975
+ results.push({
5976
+ ...base,
5977
+ status: "passed",
5978
+ analysis: null,
5979
+ analysisSkipped: null,
5980
+ driftIssues: null,
5981
+ failureLogExcerpt: null,
5982
+ diffExcerpt: null,
5983
+ specYaml: null
5984
+ });
5985
+ continue;
5986
+ }
5987
+ const specYaml = await tryReadSpecFile(s.featureName, s.specName, cwd);
5988
+ const relatedPaths = findSpecInfo(s)?.relatedPaths ?? null;
5989
+ const diffExcerpt = patchSections ? scopePatchForSpec(patchSections, relatedPaths) : null;
5990
+ const driftResult = driftResults.find((r) => r.target.featureName === s.featureName && r.target.specName === s.specName);
5991
+ const driftIssues = driftResult?.ok ? driftResult.issues : null;
5992
+ const failureLog = buildFailureLog(s);
5993
+ let analysis = null;
5994
+ let analysisSkipped = null;
5995
+ if (!auth.ok) analysisSkipped = auth.reason;
5996
+ else if (specYaml === null) analysisSkipped = "no spec.yaml found for this spec";
5997
+ else {
5998
+ const script = await readScriptSafe(s.scriptFile);
5999
+ info(`failure analysis: ${s.featureName}/${s.specName}`);
6000
+ const outcome = await analyzeFailure({
6001
+ script,
6002
+ specYaml,
6003
+ failureLog,
6004
+ diffPatch: diffExcerpt,
6005
+ changedFiles: diff.ok ? diff.diff.nameStatus : null,
6006
+ baseRef: diff.ok ? baseRef : null,
6007
+ driftIssues,
6008
+ ...opts.language ? { outputLanguage: opts.language } : {}
6009
+ }, {
6010
+ ...opts.model ? { model: opts.model } : {},
6011
+ cwd
6012
+ });
6013
+ analysis = outcome.analysis;
6014
+ if (!printedHeader) {
6015
+ process.stdout.write(`\n${C.cyan}${C.bold}──────── failure analysis ────────${C.reset}\n`);
6016
+ printedHeader = true;
6017
+ }
6018
+ const pct = Math.round(outcome.analysis.confidence * 100);
6019
+ const firstLine = outcome.analysis.reasoning.split("\n")[0] ?? "";
6020
+ process.stdout.write(`${C.red}✖${C.reset} ${C.bold}${s.featureName}/${s.specName}${C.reset} → ${C.bold}${outcome.analysis.label}${C.reset} (${pct}%)${firstLine ? ` ${C.dim}${firstLine}${C.reset}` : ""}\n`);
4923
6021
  }
6022
+ results.push({
6023
+ ...base,
6024
+ status: "failed",
6025
+ analysis,
6026
+ analysisSkipped,
6027
+ driftIssues,
6028
+ failureLogExcerpt: failureLog.length > 0 ? failureLog : null,
6029
+ diffExcerpt,
6030
+ specYaml
6031
+ });
6032
+ }
6033
+ const data = {
6034
+ schemaVersion: 1,
6035
+ createdAt: (/* @__PURE__ */ new Date()).toISOString(),
6036
+ runId: process.env["GITHUB_RUN_ID"] ?? null,
6037
+ git: {
6038
+ head: diff.ok ? diff.diff.head : null,
6039
+ base: diff.ok ? baseRef : null
6040
+ },
6041
+ model: opts.model ?? null,
6042
+ promptVersion: "2",
6043
+ results
6044
+ };
6045
+ const reportPath = join(outDir, "index.html");
6046
+ await mkdir(outDir, { recursive: true });
6047
+ await writeFile(reportPath, renderRunReport(data), "utf8");
6048
+ info(`run report written to ${reportPath}`);
6049
+ }
6050
+ function collectAssertions(s) {
6051
+ if (!s.report) return null;
6052
+ const out = [];
6053
+ for (const file of s.report.testResults) for (const a of file.assertionResults) out.push({
6054
+ name: a.fullName,
6055
+ status: a.status === "passed" || a.status === "failed" ? a.status : "skipped",
6056
+ durationMs: a.duration ?? null
4924
6057
  });
4925
- if (format === "text") process.stdout.write(`\n${C.cyan}${C.bold}──────── drift analysis ────────${C.reset}\n`);
4926
- process.stdout.write(renderDrift(results, format, cwd));
4927
- if (opts.driftStrict && determineExitCode(results, "error") !== 0) return currentExitCode || 1;
4928
- return currentExitCode;
6058
+ return out;
6059
+ }
6060
+ /**
6061
+ * Compose the failure log fed to the analysis prompt and embedded in the
6062
+ * report. With `--reporter=json` vitest writes (almost) nothing to
6063
+ * stdout/stderr — the assertion failures live in the JSON report — so the
6064
+ * structured failureMessages come first and the raw output tail (console
6065
+ * logs, agent-browser noise) is appended as secondary context.
6066
+ */
6067
+ function buildFailureLog(s) {
6068
+ const parts = [];
6069
+ if (s.report) for (const file of s.report.testResults) for (const a of file.assertionResults) {
6070
+ if (a.status !== "failed") continue;
6071
+ parts.push(`✖ ${a.fullName}`);
6072
+ for (const m of a.failureMessages ?? []) parts.push(m);
6073
+ }
6074
+ const tail = s.outputTail?.trim();
6075
+ if (tail) {
6076
+ parts.push("--- vitest output (tail) ---");
6077
+ parts.push(tail);
6078
+ }
6079
+ return parts.join("\n");
4929
6080
  }
6081
+ async function readScriptSafe(path) {
6082
+ try {
6083
+ return await readFile(path, "utf8");
6084
+ } catch {
6085
+ return "";
6086
+ }
6087
+ }
6088
+ /** Cap on the per-spec output tail kept for the report / analysis prompt. */
6089
+ const OUTPUT_TAIL_CAP = 64 * 1024;
6090
+ /**
6091
+ * Keeps the LAST `cap` characters appended. Vitest puts the failure summary
6092
+ * at the end of its output, so the tail is the part worth keeping when a
6093
+ * noisy spec overflows the cap.
6094
+ */
6095
+ var TailBuffer = class {
6096
+ buf = "";
6097
+ cap;
6098
+ constructor(cap) {
6099
+ this.cap = cap;
6100
+ }
6101
+ append(s) {
6102
+ this.buf += s;
6103
+ if (this.buf.length > this.cap * 2) this.buf = this.buf.slice(-this.cap);
6104
+ }
6105
+ toString() {
6106
+ if (this.buf.length <= this.cap) return this.buf;
6107
+ return `[...output truncated...]\n${this.buf.slice(-this.cap)}`;
6108
+ }
6109
+ };
4930
6110
  async function readReport(path) {
4931
6111
  try {
4932
6112
  const raw = await readFile(path, "utf8");
@@ -4998,7 +6178,7 @@ function formatDuration(ms) {
4998
6178
  return `${(ms / 1e3).toFixed(2)}s`;
4999
6179
  }
5000
6180
  const NOISE_LINE_PATTERNS = [/^JSON report written to /];
5001
- async function streamFiltered(source, sink) {
6181
+ async function streamFiltered(source, sink, capture) {
5002
6182
  source.setEncoding("utf8");
5003
6183
  let buffer = "";
5004
6184
  for await (const chunk of source) {
@@ -5007,11 +6187,17 @@ async function streamFiltered(source, sink) {
5007
6187
  while (nl !== -1) {
5008
6188
  const line = buffer.slice(0, nl);
5009
6189
  buffer = buffer.slice(nl + 1);
5010
- if (!NOISE_LINE_PATTERNS.some((p) => p.test(line))) sink.write(line + "\n");
6190
+ if (!NOISE_LINE_PATTERNS.some((p) => p.test(line))) {
6191
+ sink.write(line + "\n");
6192
+ capture?.append(line + "\n");
6193
+ }
5011
6194
  nl = buffer.indexOf("\n");
5012
6195
  }
5013
6196
  }
5014
- if (buffer.length > 0 && !NOISE_LINE_PATTERNS.some((p) => p.test(buffer))) sink.write(buffer);
6197
+ if (buffer.length > 0 && !NOISE_LINE_PATTERNS.some((p) => p.test(buffer))) {
6198
+ sink.write(buffer);
6199
+ capture?.append(buffer);
6200
+ }
5015
6201
  }
5016
6202
  async function resolveSpecs(target) {
5017
6203
  if (!target) return listAllSpecs();
@@ -5397,163 +6583,141 @@ function truncate(s, n) {
5397
6583
  return s.slice(s.length - n);
5398
6584
  }
5399
6585
  //#endregion
5400
- //#region src/drift/affected.ts
5401
- const execFileP = promisify(execFile);
5402
- /**
5403
- * Resolve the base ref to diff against for `ccqa drift --changed`.
5404
- * Precedence: explicit override > GITHUB_BASE_REF > origin/main.
5405
- */
5406
- function resolveBaseRef(explicit) {
5407
- if (explicit && explicit.length > 0) return explicit;
5408
- const ghBase = process.env["GITHUB_BASE_REF"];
5409
- if (ghBase && ghBase.length > 0) return ghBase.startsWith("origin/") ? ghBase : `origin/${ghBase}`;
5410
- return "origin/main";
5411
- }
5412
- /**
5413
- * Run `git diff --name-status base...HEAD` from `cwd` and return one entry per
5414
- * changed file. Renames are reported under their NEW path with status
5415
- * "renamed" — the OLD path is dropped because the spec mapping is against the
5416
- * post-rename layout.
5417
- *
5418
- * Paths are re-rooted to be relative to `cwd`, not the git repo root. In a
5419
- * monorepo where `cwd` is a sub-package (e.g. `apps/foo`), git emits paths
5420
- * relative to the repo root, but specs declare relatedPaths relative to
5421
- * their own package. Changes outside `cwd` are dropped so an unrelated PR
5422
- * can never accidentally scope a sub-package's specs in.
5423
- */
5424
- async function getChangedFiles(base, cwd) {
5425
- const [{ stdout: rootOut }, { stdout: diffOut }] = await Promise.all([execFileP("git", ["rev-parse", "--show-toplevel"], { cwd }), execFileP("git", [
5426
- "diff",
5427
- "--name-status",
5428
- "-M",
5429
- `${base}...HEAD`
5430
- ], {
5431
- cwd,
5432
- maxBuffer: 32 * 1024 * 1024
5433
- })]);
5434
- return rerootChangedFiles(parseGitDiffOutput(diffOut), rootOut.trim(), cwd);
5435
- }
6586
+ //#region src/drift/format.ts
5436
6587
  /**
5437
- * Convert paths in `entries` from git-repo-root relative to `cwd` relative,
5438
- * dropping anything outside `cwd`. Exported for unit tests.
6588
+ * Render drift results as a string. The CLI commands and the `run` failure
6589
+ * hook are the only callers; both want the formatted output returned so
6590
+ * they can prefix / interleave / pipe it as needed.
5439
6591
  */
5440
- function rerootChangedFiles(entries, repoRoot, cwd) {
5441
- const prefix = relative(repoRoot, cwd);
5442
- if (!prefix) return entries;
5443
- const out = [];
5444
- for (const e of entries) {
5445
- const rel = relative(prefix, e.path);
5446
- if (rel.startsWith("..") || rel === "") continue;
5447
- out.push({
5448
- ...e,
5449
- path: rel
5450
- });
5451
- }
5452
- return out;
6592
+ function renderDrift(results, format, cwd) {
6593
+ if (format === "json") return renderJson(results);
6594
+ if (format === "github") return renderGithub(results, cwd);
6595
+ return renderText(results);
5453
6596
  }
5454
- function parseGitDiffOutput(stdout) {
6597
+ const HEAVY_RULE = "═".repeat(72);
6598
+ function renderText(results) {
5455
6599
  const out = [];
5456
- for (const line of stdout.split("\n")) {
5457
- if (!line.trim()) continue;
5458
- const parts = line.split(" ");
5459
- const code = parts[0];
5460
- if (!code) continue;
5461
- if (code.startsWith("R")) {
5462
- const newPath = parts[2];
5463
- if (newPath) out.push({
5464
- path: newPath,
5465
- status: "renamed"
5466
- });
6600
+ for (const r of results) {
6601
+ out.push("");
6602
+ const heading = `══ ${r.target.featureName}/${r.target.specName} `;
6603
+ const tail = "═".repeat(Math.max(3, 72 - heading.length));
6604
+ out.push(`${heading}${tail}`);
6605
+ if (r.error) {
6606
+ out.push(` ERROR ${r.error}`);
5467
6607
  continue;
5468
6608
  }
5469
- if (code.startsWith("C")) {
5470
- const newPath = parts[2];
5471
- if (newPath) out.push({
5472
- path: newPath,
5473
- status: "added"
5474
- });
6609
+ const errors = r.issues.filter((i) => i.severity === "ERROR");
6610
+ const warnings = r.issues.filter((i) => i.severity === "WARN");
6611
+ const passed = r.issues.filter((i) => i.severity === "OK");
6612
+ if (errors.length === 0 && warnings.length === 0) {
6613
+ const label = passed.length === 1 ? "check" : "checks";
6614
+ const detail = passed.length > 0 ? `all ${passed.length} ${label} passed` : "no issues";
6615
+ out.push(` ✓ ${detail}`);
5475
6616
  continue;
5476
6617
  }
5477
- const path = parts[1];
5478
- if (!path) continue;
5479
- switch (code[0]) {
5480
- case "A":
5481
- out.push({
5482
- path,
5483
- status: "added"
5484
- });
5485
- break;
5486
- case "M":
5487
- case "T":
5488
- out.push({
5489
- path,
5490
- status: "modified"
5491
- });
5492
- break;
5493
- case "D":
5494
- out.push({
5495
- path,
5496
- status: "deleted"
5497
- });
5498
- break;
5499
- default: out.push({
5500
- path,
5501
- status: "modified"
5502
- });
6618
+ for (const issue of errors) appendFinding(out, "ERROR", issue);
6619
+ for (const issue of warnings) appendFinding(out, "WARN", issue);
6620
+ if (passed.length > 0) {
6621
+ const names = passed.map((i) => DRAFT_CATEGORY_LABEL[i.category]).join(", ");
6622
+ out.push("");
6623
+ out.push(` ✓ passed (${passed.length}): ${names}`);
5503
6624
  }
5504
6625
  }
5505
- return out;
6626
+ out.push("");
6627
+ out.push(HEAVY_RULE);
6628
+ const totals = summarize(results);
6629
+ out.push(` specs ${results.length} (${totals.errored} errored)`);
6630
+ out.push(` findings ${totals.error} error, ${totals.warn} warn, ${totals.ok} ok`);
6631
+ out.push("");
6632
+ return out.join("\n");
5506
6633
  }
5507
- function stripLeadingDotSlash(s) {
5508
- return s.startsWith("./") ? s.slice(2) : s;
6634
+ function appendFinding(out, level, issue) {
6635
+ const stepPart = issue.stepId ? ` ${issue.stepId}` : "";
6636
+ out.push("");
6637
+ out.push(` ${level} ${DRAFT_CATEGORY_LABEL[issue.category]}${stepPart}`);
6638
+ out.push(` ${issue.message}`);
6639
+ if (issue.detail) out.push(` └ ${issue.detail.replace(/\n/g, "\n ")}`);
5509
6640
  }
5510
- const REGEX_CACHE = /* @__PURE__ */ new Map();
5511
- /** Compiles `pattern` to a RegExp, memoized so repeated `--changed` matches don't re-build. */
5512
- function compileGlob(pattern) {
5513
- const cached = REGEX_CACHE.get(pattern);
5514
- if (cached) return cached;
5515
- const compiled = globToRegExp(stripLeadingDotSlash(pattern));
5516
- REGEX_CACHE.set(pattern, compiled);
5517
- return compiled;
6641
+ function renderJson(results) {
6642
+ const payload = { specs: results.map((r) => ({
6643
+ feature: r.target.featureName,
6644
+ spec: r.target.specName,
6645
+ ok: r.ok,
6646
+ ...r.error ? { error: r.error } : {},
6647
+ issues: r.issues.map((i) => ({
6648
+ severity: i.severity,
6649
+ category: i.category,
6650
+ stepId: i.stepId,
6651
+ message: i.message,
6652
+ ...i.detail ? { detail: i.detail } : {}
6653
+ }))
6654
+ })) };
6655
+ return `${JSON.stringify(payload, null, 2)}\n`;
5518
6656
  }
5519
- function globToRegExp(pattern) {
5520
- let re = "^";
5521
- let i = 0;
5522
- while (i < pattern.length) {
5523
- const ch = pattern[i];
5524
- if (ch === "?") {
5525
- re += "[^/]";
5526
- i++;
5527
- continue;
5528
- }
5529
- if (ch !== "*") {
5530
- re += /[.+^${}()|[\]\\]/.test(ch) ? "\\" + ch : ch;
5531
- i++;
6657
+ function renderGithub(results, cwd) {
6658
+ const repoRoot = process.env["GITHUB_WORKSPACE"] ?? process.cwd();
6659
+ const lines = [];
6660
+ for (const r of results) {
6661
+ const file = githubRelPath(cwd, repoRoot, r.target.featureName, r.target.specName);
6662
+ if (r.error) {
6663
+ lines.push(`::error file=${file}::${escapeGhMessage(r.error)}`);
5532
6664
  continue;
5533
6665
  }
5534
- if (pattern[i + 1] !== "*") {
5535
- re += "[^/]*";
5536
- i++;
5537
- continue;
6666
+ for (const issue of r.issues) {
6667
+ if (issue.severity === "OK") continue;
6668
+ const level = issue.severity === "ERROR" ? "error" : "warning";
6669
+ const title = `${r.target.featureName}/${r.target.specName} — ${issue.category}${issue.stepId ? ` (${issue.stepId})` : ""}`;
6670
+ const body = issue.detail ? `${issue.message}\n${issue.detail}` : issue.message;
6671
+ lines.push(`::${level} file=${file},title=${escapeGhProp(title)}::${escapeGhMessage(body)}`);
5538
6672
  }
5539
- const hasLeadingSlash = re.endsWith("/");
5540
- const hasTrailingSlash = pattern[i + 2] === "/";
5541
- if (hasLeadingSlash) re = re.slice(0, -1);
5542
- if (hasLeadingSlash || hasTrailingSlash) re += "(?:/?.*)?";
5543
- else re += ".*";
5544
- i += hasTrailingSlash ? 3 : 2;
5545
6673
  }
5546
- return new RegExp(re + "$");
6674
+ return lines.length === 0 ? "" : `${lines.join("\n")}\n`;
6675
+ }
6676
+ function githubRelPath(cwd, repoRoot, featureName, specName) {
6677
+ const abs = resolve(cwd, ".ccqa", "features", featureName, "test-cases", specName, "spec.yaml");
6678
+ const rel = relative(repoRoot, abs);
6679
+ return rel.startsWith("..") ? abs : rel;
6680
+ }
6681
+ function escapeGhMessage(s) {
6682
+ return s.replace(/%/g, "%25").replace(/\r/g, "%0D").replace(/\n/g, "%0A");
6683
+ }
6684
+ function escapeGhProp(s) {
6685
+ return s.replace(/%/g, "%25").replace(/\r/g, "%0D").replace(/\n/g, "%0A").replace(/,/g, "%2C").replace(/:/g, "%3A");
6686
+ }
6687
+ function summarize(results) {
6688
+ let error = 0;
6689
+ let warn = 0;
6690
+ let ok = 0;
6691
+ let errored = 0;
6692
+ for (const r of results) {
6693
+ if (r.error) errored++;
6694
+ for (const issue of r.issues) if (issue.severity === "ERROR") error++;
6695
+ else if (issue.severity === "WARN") warn++;
6696
+ else ok++;
6697
+ }
6698
+ return {
6699
+ error,
6700
+ warn,
6701
+ ok,
6702
+ errored
6703
+ };
5547
6704
  }
6705
+ //#endregion
6706
+ //#region src/drift/exit-code.ts
5548
6707
  /**
5549
- * Returns true if `changedPath` is covered by any of `relatedPaths`. An empty
5550
- * `relatedPaths` returns false — callers handle the "unscoped spec" case
5551
- * separately (treat the spec as always-affected) before calling this.
6708
+ * Map drift results to an exit code. Spec-level errors (Claude call failed)
6709
+ * always fail; otherwise ERROR severity always fails, WARN fails only when
6710
+ * the threshold is `warn`.
5552
6711
  */
5553
- function isPathAffectedBy(changedPath, relatedPaths) {
5554
- const stripped = stripLeadingDotSlash(changedPath);
5555
- for (const pattern of relatedPaths) if (compileGlob(pattern).test(stripped)) return true;
5556
- return false;
6712
+ function determineExitCode(results, threshold) {
6713
+ for (const r of results) {
6714
+ if (r.error) return 1;
6715
+ for (const issue of r.issues) {
6716
+ if (issue.severity === "ERROR") return 1;
6717
+ if (threshold === "warn" && issue.severity === "WARN") return 1;
6718
+ }
6719
+ }
6720
+ return 0;
5557
6721
  }
5558
6722
  //#endregion
5559
6723
  //#region src/drift/route-new-files.ts