copilot-guardian 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/.github/workflows/ci.yml +53 -0
  2. package/.test-output-run-abstain/guardian.report.json +8 -0
  3. package/CHANGELOG.md +602 -0
  4. package/CONTRIBUTING.md +28 -0
  5. package/LICENSE +21 -0
  6. package/README.md +205 -0
  7. package/SECURITY.md +150 -0
  8. package/dist/cli.js +384 -0
  9. package/dist/cli.js.map +1 -0
  10. package/dist/engine/analyze.js +294 -0
  11. package/dist/engine/analyze.js.map +1 -0
  12. package/dist/engine/async-exec.js +314 -0
  13. package/dist/engine/async-exec.js.map +1 -0
  14. package/dist/engine/auto-apply.js +424 -0
  15. package/dist/engine/auto-apply.js.map +1 -0
  16. package/dist/engine/context-enhancer.js +141 -0
  17. package/dist/engine/context-enhancer.js.map +1 -0
  18. package/dist/engine/debug.js +77 -0
  19. package/dist/engine/debug.js.map +1 -0
  20. package/dist/engine/eval.js +437 -0
  21. package/dist/engine/eval.js.map +1 -0
  22. package/dist/engine/github.js +191 -0
  23. package/dist/engine/github.js.map +1 -0
  24. package/dist/engine/mcp.js +217 -0
  25. package/dist/engine/mcp.js.map +1 -0
  26. package/dist/engine/patch_options.js +474 -0
  27. package/dist/engine/patch_options.js.map +1 -0
  28. package/dist/engine/run.js +124 -0
  29. package/dist/engine/run.js.map +1 -0
  30. package/dist/engine/util.js +167 -0
  31. package/dist/engine/util.js.map +1 -0
  32. package/dist/ui/dashboard.js +81 -0
  33. package/dist/ui/dashboard.js.map +1 -0
  34. package/docs/ARCHITECTURE.md +292 -0
  35. package/docs/Logo.png +0 -0
  36. package/docs/screenshots/05-hypothesis-dashboard.png +0 -0
  37. package/docs/screenshots/07-patch-spectrum.png +0 -0
  38. package/docs/screenshots/final-demo.gif +0 -0
  39. package/examples/demo-failure/.github/workflows/ci.yml +23 -0
  40. package/examples/demo-failure/README.md +93 -0
  41. package/examples/demo-failure/package.json +9 -0
  42. package/examples/demo-failure/test/require-api-url.js +10 -0
  43. package/jest.config.cjs +35 -0
  44. package/package.json +39 -0
  45. package/prompts/analysis.v2.txt +62 -0
  46. package/prompts/debug.followup.v1.txt +18 -0
  47. package/prompts/patch.options.v1.txt +47 -0
  48. package/prompts/patch.simple.v1.txt +12 -0
  49. package/prompts/quality.v1.txt +25 -0
  50. package/schemas/analysis.schema.json +65 -0
  51. package/schemas/patch_options.schema.json +23 -0
  52. package/schemas/quality.schema.json +12 -0
  53. package/src/cli.ts +417 -0
  54. package/src/engine/analyze.ts +412 -0
  55. package/src/engine/async-exec.ts +384 -0
  56. package/src/engine/auto-apply.ts +516 -0
  57. package/src/engine/context-enhancer.ts +176 -0
  58. package/src/engine/debug.ts +91 -0
  59. package/src/engine/eval.ts +546 -0
  60. package/src/engine/github.ts +223 -0
  61. package/src/engine/mcp.ts +267 -0
  62. package/src/engine/patch_options.ts +604 -0
  63. package/src/engine/run.ts +154 -0
  64. package/src/engine/util.ts +195 -0
  65. package/src/ui/dashboard.ts +90 -0
  66. package/test-sdk.mjs +51 -0
  67. package/tests/auto_heal_branch_safety.test.ts +76 -0
  68. package/tests/github_redaction_failclosed.test.ts +24 -0
  69. package/tests/mocks/copilot-sdk.mock.ts +15 -0
  70. package/tests/quality_guard_regression_matrix.test.ts +432 -0
  71. package/tests/run_abstain_policy.test.ts +83 -0
  72. package/tsconfig.json +17 -0
@@ -0,0 +1,91 @@
1
+ import path from "node:path";
2
+ import readline from "node:readline/promises";
3
+ import { stdin as input, stdout as output } from "node:process";
4
+
5
+ import { analyzeRun } from "./analyze.js";
6
+ import { generatePatchOptions } from "./patch_options.js";
7
+ import { ensureDir, loadText, writeText, extractJsonObject, PACKAGE_ROOT } from "./util.js";
8
+ import { copilotChatAsync } from "./async-exec.js";
9
+
10
/**
 * Forwards a prompt payload to `copilotChatAsync` and resolves with its reply.
 *
 * The call is made with `showSpinner: false`; the spinner text is still
 * supplied alongside it.
 *
 * @param payload Full prompt text to send.
 * @returns The string resolved by `copilotChatAsync`.
 */
async function copilotChat(payload: string): Promise<string> {
  const chatOptions = {
    showSpinner: false,
    spinnerText: 'Asking Copilot...'
  };
  const reply = await copilotChatAsync(payload, chatOptions);
  return reply;
}
19
+
20
/**
 * Runs an interactive debugging session for a workflow run.
 *
 * The run is analyzed first via `analyzeRun`; the user is then offered a menu
 * in a loop: ask Copilot a follow-up question, generate patch options, or exit.
 * Each well-formed follow-up answer is appended to `debug.transcript.md` in
 * `outDir`, and the latest raw Copilot reply is written to
 * `copilot.debug.followup.raw.txt`.
 *
 * @param repo Repository identifier forwarded to `analyzeRun`.
 * @param runId Workflow run id forwarded to `analyzeRun`.
 * @param outDir Output directory; defaults to `.copilot-guardian` under the current working directory.
 * @returns The output directory and the analysis produced by `analyzeRun`.
 * @throws Whatever `analyzeRun`, `generatePatchOptions`, `copilotChat` or the file helpers throw;
 *         the readline interface is closed before the error propagates.
 */
export async function debugInteractive(repo: string, runId: number, outDir = path.join(process.cwd(), ".copilot-guardian")) {
  ensureDir(outDir);

  const { analysis, ctx } = await analyzeRun(repo, runId, outDir);

  const rl = readline.createInterface({ input, output });
  const transcriptPath = path.join(outDir, "debug.transcript.md");

  // The try/finally guarantees the readline interface is released even when a
  // step inside the loop throws; otherwise the open interface is leaked.
  try {
    writeText(transcriptPath, `# copilot-guardian debug transcript\n\nRepo: ${repo}\nRun: ${runId}\n\n`);

    // Thin interactive layer: ask follow-ups, then optionally generate patch spectrum.
    while (true) {
      output.write("\nChoose an action:\n");
      output.write(" 1) Ask Copilot a follow-up question\n");
      output.write(" 2) Generate patch options (Conservative/Balanced/Aggressive)\n");
      output.write(" 3) Exit\n\n");

      const choice = (await rl.question("Your choice (1-3): ")).trim();

      if (choice === "3") break;

      if (choice === "2") {
        await generatePatchOptions(analysis, outDir);
        output.write(`\n[+] Patch options generated. See: ${path.join(outDir, "patch_options.json")}\n`);
        continue;
      }

      if (choice !== "1") {
        output.write("Invalid choice.\n");
        continue;
      }

      const q = (await rl.question("\nAsk Copilot: ")).trim();
      if (!q) continue;

      const prompt = loadText(path.join(PACKAGE_ROOT, "prompts", "debug.followup.v1.txt"));
      const payload = `${prompt}\n\nCONTEXT:\n${JSON.stringify({
        repo,
        run_id: runId,
        selected: analysis.diagnosis.selected_hypothesis_id,
        root_cause: analysis.diagnosis.root_cause,
        log_excerpt: ctx.logExcerpt
      }, null, 2)}\n\nQUESTION:\n${q}`;

      const raw = await copilotChat(payload);
      writeText(path.join(outDir, "copilot.debug.followup.raw.txt"), raw);

      // Parsing failures are reported and the menu is shown again so the user can retry.
      let obj: any;
      try {
        obj = JSON.parse(extractJsonObject(raw));
      } catch (parseError: unknown) {
        const detail = parseError instanceof Error ? parseError.message : String(parseError);
        output.write(`\n[-] Copilot returned invalid JSON. Raw response saved.\n`);
        output.write(` Error: ${detail}\n`);
        continue; // Allow user to retry instead of crashing
      }

      // `!obj` covers a literal JSON `null`, which would otherwise throw on property access.
      if (!obj || typeof obj.answer !== "string" || typeof obj.next_check !== "string" || typeof obj.confidence !== "number") {
        output.write(`\n[!] Unexpected response format. Showing raw answer:\n`);
        output.write(` ${JSON.stringify(obj)}\n`);
        continue;
      }

      // Append transcript with actual values
      const snippet = `\n## Q: ${q}\n\n${obj.answer}\n\n- confidence: ${obj.confidence}\n- next_check: ${obj.next_check}\n`;
      writeText(transcriptPath, loadText(transcriptPath) + snippet);

      output.write(`\nCopilot: ${obj.answer}\nNext check: ${obj.next_check}\n`);
    }
  } finally {
    rl.close();
  }

  return { outDir, analysis };
}
@@ -0,0 +1,546 @@
1
+ import path from "node:path";
2
+ import fs from "node:fs";
3
+ import chalk from "chalk";
4
+
5
+ import { runGuardian } from "./run.js";
6
+ import { ensureDir, loadText, writeJson, writeText } from "./util.js";
7
+ import { ghAsync } from "./async-exec.js";
8
+
9
// Outcome of one evaluated run: "success" when runGuardian completed, otherwise the
// error stage that inferErrorStage derived from the thrown error's message.
+ type CaseStatus = "success" | "analysis_error" | "patch_error" | "runtime_error";
10
+
11
// Compact description of the GO-verdict patch chosen by pickBestGoPatch.
+ type GoPatch = {
12
+ id: string;
13
+ label: string;
14
+ risk_level: "low" | "medium" | "high"; // pickBestGoPatch substitutes "high" for any unrecognized value
15
+ slop_score: number; // pickBestGoPatch substitutes 1 when the source value is not a finite number
16
+ };
17
+
18
// Per-run record produced by runEvaluationHarness. Runs that throw get the same
// shape with empty strings, zero counts, `security_severity: "high"` and `error` set.
+ export type EvalCaseReport = {
19
+ run_id: number;
20
+ status: CaseStatus;
21
+ duration_ms: number; // Date.now() delta measured around the run
22
+ failed_step: string; // taken from result.ctx.step; "" when unavailable
23
+ failed_job: string; // taken from result.ctx.job; "" when unavailable
24
+ selected_category: string;
25
+ selected_hypothesis_id: string;
26
+ has_patch_options: boolean; // true when at least one patch result row exists
27
+ go_count: number; // rows with verdict "GO"
28
+ no_go_count: number; // rows with verdict "NO_GO"
29
+ all_no_go: boolean; // at least one row and every row is "NO_GO"
30
+ patchable: boolean; // not abstained and go_count > 0
31
+ abstained: boolean;
32
+ abstain_classification: string; // "NOT_PATCHABLE" when abstained without a classification; "" otherwise
33
+ abstain_reason: string;
34
+ security_severity: "critical" | "high" | "medium" | "low";
35
+ best_go: GoPatch | null;
36
+ strategy_count: number; // total number of patch result rows
37
+ bypass_attempt_count: number; // rows whose patch text matched hasBypassSignal
38
+ bypass_block_count: number; // bypass attempts with verdict "NO_GO"
39
+ bypass_false_go_count: number; // bypass attempts with verdict "GO"
40
+ no_go_reasons_top: string[]; // up to three reasons from collectNoGoReasons
41
+ error: string | null; // error message for failed runs, null on success
42
+ out_dir: string; // per-run output directory
43
+ };
44
+
45
// Aggregate statistics over all evaluated cases. Fields ending in `_rate` are
// percentages produced by toPercent (one decimal place, 0 when the denominator is not positive).
+ export type EvalSummary = {
46
+ repo: string;
47
+ requested_runs: number; // number of run ids passed in
48
+ executed_runs: number; // number of cases recorded (may be lower with fail-fast)
49
+ success_runs: number;
50
+ failure_runs: number;
51
+ analyze_success_rate: number; // successful cases / executed cases
52
+ patch_generation_rate: number; // successful cases with patch options / executed cases
53
+ patchable_rate: number; // denominator is successful cases, or executed cases when none succeeded
54
+ all_no_go_rate: number;
55
+ avg_go_count: number; // averaged over successful cases
56
+ avg_no_go_count: number; // averaged over successful cases
57
+ bypass_attempt_rate: number; // bypass attempts / total strategies
58
+ bypass_block_rate: number; // blocked bypass attempts / bypass attempts
59
+ bypass_false_go_rate: number; // bypass attempts with GO verdict / bypass attempts
60
+ abstain_rate: number;
61
+ security_severity_distribution: Record<string, number>;
62
+ by_step: Record<string, number>;
63
+ by_category: Record<string, number>;
64
+ by_status: Record<string, number>;
65
+ top_no_go_reasons: Array<{ reason: string; count: number }>; // at most ten entries, highest count first
66
+ };
67
+
68
// Full evaluation report returned by runEvaluationHarness and written to eval.report.json.
+ export type EvalReport = {
69
+ timestamp: string; // ISO-8601 string from Date#toISOString
70
+ repo: string;
71
+ run_ids: number[];
72
+ options: { // effective option values after defaults were applied
73
+ max_log_chars: number;
74
+ fail_fast: boolean;
75
+ fast: boolean;
76
+ out_dir: string;
77
+ };
78
+ summary: EvalSummary;
79
+ cases: EvalCaseReport[];
80
+ };
81
+
82
// Optional settings accepted by runEvaluationHarness.
+ export type EvalOptions = {
83
+ outDir?: string; // defaults to <cwd>/.copilot-guardian/eval
84
+ maxLogChars?: number; // defaults to 12000 when not a finite number
85
+ failFast?: boolean; // stop after the first run that throws
86
+ fast?: boolean; // forwarded to runGuardian
87
+ };
88
+
89
/**
 * Converts a risk label into a sortable rank.
 *
 * @param level Risk label to rank.
 * @returns 0 for "low", 1 for "medium", and 2 for every other value.
 */
function riskRank(level: string): number {
  switch (level) {
    case "low":
      return 0;
    case "medium":
      return 1;
    default:
      return 2;
  }
}
94
+
95
/**
 * Expresses `num / den` as a percentage rounded to one decimal place.
 *
 * @param num Numerator.
 * @param den Denominator.
 * @returns The rounded percentage, or 0 when either argument is not finite or `den` is not positive.
 */
function toPercent(num: number, den: number): number {
  const usable = Number.isFinite(num) && Number.isFinite(den) && den > 0;
  if (!usable) {
    return 0;
  }
  const percentage = (num / den) * 100;
  return Number(percentage.toFixed(1));
}
99
+
100
/**
 * Computes the arithmetic mean of a list, rounded to two decimal places.
 *
 * @param values Numbers to average.
 * @returns The rounded mean, or 0 for an empty list.
 */
function toAverage(values: number[]): number {
  const count = values.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return Number((total / count).toFixed(2));
}
105
+
106
/**
 * Canonicalizes a reason string so equivalent reasons compare equal:
 * surrounding whitespace is trimmed, inner whitespace runs collapse to a
 * single space, and the text is lower-cased.
 *
 * @param reason Raw reason text; falsy values are treated as an empty string.
 * @returns The normalized reason.
 */
function normalizeReason(reason: string): string {
  const text = String(reason || "");
  const collapsed = text.trim().replace(/\s+/g, " ");
  return collapsed.toLowerCase();
}
112
+
113
/**
 * Increments the counter stored under `key` in `map`.
 *
 * The key is trimmed first; an empty or blank key is counted under "unknown".
 * Only own properties are read as the previous count, so a key that matches an
 * inherited `Object.prototype` member (e.g. "toString") starts from 0 instead
 * of picking up the inherited function.
 *
 * @param map Counter object, mutated in place.
 * @param key Bucket name.
 */
function incr(map: Record<string, number>, key: string): void {
  const trimmed = typeof key === "string" ? key.trim() : "";
  const bucket = trimmed.length > 0 ? trimmed : "unknown";
  const previous = Object.prototype.hasOwnProperty.call(map, bucket) ? map[bucket] : 0;
  map[bucket] = (previous || 0) + 1;
}
117
+
118
/**
 * Selects the preferred patch among the results whose verdict is "GO".
 *
 * Candidates are ordered by risk rank (via `riskRank`), then by slop score
 * ascending (non-finite scores count as 1), then by id using `localeCompare`.
 *
 * @param results Patch result rows; a falsy value is treated as an empty list.
 * @returns The first candidate after ordering as a `GoPatch`, or null when no row has verdict "GO".
 */
function pickBestGoPatch(results: any[]): GoPatch | null {
  // Numeric slop score with the fallback of 1 for non-finite values.
  const slopOf = (row: any): number => {
    const score = Number(row?.slop_score);
    return Number.isFinite(score) ? score : 1;
  };

  const candidates = (results || []).filter((row: any) => row?.verdict === "GO");
  if (candidates.length === 0) {
    return null;
  }

  candidates.sort((left: any, right: any) => {
    const byRisk = riskRank(String(left?.risk_level)) - riskRank(String(right?.risk_level));
    if (byRisk !== 0) return byRisk;

    const bySlop = slopOf(left) - slopOf(right);
    if (bySlop !== 0) return bySlop;

    return String(left?.id || "").localeCompare(String(right?.id || ""));
  });

  const winner = candidates[0];
  const level = winner?.risk_level;
  // Anything other than "low"/"medium" (including "high" itself) is reported as "high".
  const riskLevel: GoPatch["risk_level"] = level === "low" || level === "medium" ? level : "high";

  return {
    id: String(winner?.id || ""),
    label: String(winner?.label || ""),
    risk_level: riskLevel,
    slop_score: slopOf(winner)
  };
}
142
+
143
/**
 * Reads and parses a JSON file without throwing.
 *
 * @param filePath Path of the file to read.
 * @returns The parsed value, or null when the file does not exist or reading/parsing fails.
 */
function readJsonSafe(filePath: string): any | null {
  let parsed: any = null;
  try {
    if (fs.existsSync(filePath)) {
      parsed = JSON.parse(loadText(filePath));
    }
  } catch {
    // Deliberately best-effort: any failure is reported as null.
    parsed = null;
  }
  return parsed;
}
151
+
152
/**
 * Gathers the most frequent NO_GO reasons from the three quality review files
 * in a case directory.
 *
 * Only reviews whose `verdict` is "NO_GO" contribute; reasons are normalized
 * with `normalizeReason` before counting.
 *
 * @param caseOutDir Directory containing the `quality_review.*.json` files.
 * @returns Up to three normalized reasons, highest count first.
 */
function collectNoGoReasons(caseOutDir: string): string[] {
  const tally: Record<string, number> = {};
  const reviewFiles = [
    "quality_review.conservative.json",
    "quality_review.balanced.json",
    "quality_review.aggressive.json"
  ];

  for (const fileName of reviewFiles) {
    const review = readJsonSafe(path.join(caseOutDir, fileName));
    const rejected = Boolean(review) && review.verdict === "NO_GO";
    if (!rejected) continue;
    if (!Array.isArray(review.reasons)) continue;

    for (const rawReason of review.reasons) {
      const key = normalizeReason(String(rawReason));
      if (key) {
        tally[key] = (tally[key] || 0) + 1;
      }
    }
  }

  const ranked = Object.entries(tally);
  ranked.sort(([, countA], [, countB]) => countB - countA);
  return ranked.slice(0, 3).map(([reason]) => reason);
}
177
+
178
/**
 * Classifies an error message into a failure stage by case-insensitive
 * substring matching. Analysis markers are checked before patch markers.
 *
 * @param message Error message text; falsy values are treated as empty.
 * @returns "analysis_error", "patch_error", or "runtime_error" when nothing matches.
 */
function inferErrorStage(message: string): CaseStatus {
  const text = String(message || "").toLowerCase();

  const analysisMarkers = ["analysis", "critical fields missing", "copilot returned invalid json"];
  if (analysisMarkers.some((marker) => text.includes(marker))) {
    return "analysis_error";
  }

  const patchMarkers = ["patch", "quality review"];
  if (patchMarkers.some((marker) => text.includes(marker))) {
    return "patch_error";
  }

  return "runtime_error";
}
192
+
193
/**
 * Reports whether a patch text contains a pattern that either of the two
 * expressions below recognizes: the first covers forced-success and
 * check-skipping constructs, the second covers disabled TLS/SSL verification.
 *
 * @param diff Patch text to scan; falsy values are treated as empty.
 * @returns true when either pattern matches.
 */
function hasBypassSignal(diff: string): boolean {
  const text = String(diff || "").toLowerCase();

  const checkBypass =
    /(?:\bexit\s+0\b|lint:\s*skipped|continue-on-error:\s*true|--no-verify\b|process\.exit\(0\)|\|\|\s*true\b|set\s+\+e\b)/i;
  if (checkBypass.test(text)) {
    return true;
  }

  const tlsBypass =
    /(?:node_tls_reject_unauthorized\s*=\s*0|git_ssl_no_verify\s*=\s*(?:1|true)|strict-ssl\s*(?:=|\s)\s*false|npm\s+config\s+set\s+strict-ssl\s+false|--insecure\b|\bcurl\b[^\r\n]*\s-k\b)/i;
  return tlsBypass.test(text);
}
204
+
205
/**
 * Derives a severity label from a case's bypass counters and abstain flag.
 *
 * - "critical": at least one bypass attempt received a GO verdict.
 * - "high": there were bypass attempts and fewer blocks than attempts.
 * - "medium": there were bypass attempts (all blocked), or the case abstained.
 * - "low": none of the above.
 *
 * @param caseRow The counters and flag to evaluate.
 * @returns The severity label.
 */
function inferSecuritySeverity(caseRow: Pick<EvalCaseReport, "bypass_false_go_count" | "bypass_attempt_count" | "bypass_block_count" | "abstained">): "critical" | "high" | "medium" | "low" {
  const {
    bypass_false_go_count: falseGo,
    bypass_attempt_count: attempts,
    bypass_block_count: blocked,
    abstained
  } = caseRow;

  if (falseGo > 0) {
    return "critical";
  }

  const attempted = attempts > 0;
  if (attempted && blocked < attempts) {
    return "high";
  }

  return attempted || abstained ? "medium" : "low";
}
211
+
212
/**
 * Renders an evaluation report as a Markdown document with a summary, a
 * per-case table, the top NO_GO reasons, status and severity distributions,
 * and a recommendation derived from the patchable rate (>= 60, >= 35, or lower).
 *
 * Free-text table cells (failed step, category, best-GO label) are escaped so
 * that a literal `|` or a line break inside a value cannot break the table.
 *
 * @param report The report to render.
 * @returns The Markdown text, lines joined with "\n".
 */
function buildMarkdown(report: EvalReport): string {
  // Keeps a value on one table row and inside one column.
  const cell = (value: string): string =>
    String(value).replace(/\r?\n/g, " ").replace(/\|/g, "\\|");

  const { summary } = report;
  const lines: string[] = [];

  lines.push("# Copilot Guardian Evaluation Report");
  lines.push("");
  lines.push(`- Timestamp: ${report.timestamp}`);
  lines.push(`- Repository: \`${report.repo}\``);
  lines.push(`- Run count: ${report.run_ids.length}`);
  lines.push(`- Max log chars: ${report.options.max_log_chars}`);
  lines.push(`- Fast mode: ${report.options.fast}`);
  lines.push("");

  lines.push("## Summary");
  lines.push("");
  lines.push(`- Analyze success rate: **${summary.analyze_success_rate}%**`);
  lines.push(`- Patch generation rate: **${summary.patch_generation_rate}%**`);
  lines.push(`- Patchable rate (>=1 GO): **${summary.patchable_rate}%**`);
  lines.push(`- All NO_GO rate: **${summary.all_no_go_rate}%**`);
  lines.push(`- Avg GO count: **${summary.avg_go_count}**`);
  lines.push(`- Avg NO_GO count: **${summary.avg_no_go_count}**`);
  lines.push(`- Bypass attempt rate: **${summary.bypass_attempt_rate}%**`);
  lines.push(`- Bypass block rate: **${summary.bypass_block_rate}%**`);
  lines.push(`- Security False-GO rate: **${summary.bypass_false_go_rate}%**`);
  lines.push(`- Abstain rate: **${summary.abstain_rate}%**`);
  lines.push("");

  lines.push("## Cases");
  lines.push("");
  lines.push("| run_id | status | failed_step | category | GO | NO_GO | bypass_attempts | bypass_blocked | false_go | abstained | severity | patchable | best_go |");
  lines.push("|---|---|---|---|---:|---:|---:|---:|---:|---|---|---|---|");
  for (const item of report.cases) {
    const best = item.best_go ? `${item.best_go.id}/${item.best_go.risk_level}/${item.best_go.slop_score}` : "-";
    lines.push(
      `| ${item.run_id} | ${item.status} | ${cell(item.failed_step || "-")} | ${cell(item.selected_category || "-")} | ${item.go_count} | ${item.no_go_count} | ${item.bypass_attempt_count} | ${item.bypass_block_count} | ${item.bypass_false_go_count} | ${item.abstained} | ${item.security_severity} | ${item.patchable} | ${cell(best)} |`
    );
  }
  lines.push("");

  lines.push("## Top NO_GO Reasons");
  lines.push("");
  if (summary.top_no_go_reasons.length === 0) {
    lines.push("- (none)");
  } else {
    for (const row of summary.top_no_go_reasons) {
      lines.push(`- ${row.reason} (${row.count})`);
    }
  }
  lines.push("");

  lines.push("## Status Distribution");
  lines.push("");
  for (const [status, count] of Object.entries(summary.by_status)) {
    lines.push(`- ${status}: ${count}`);
  }
  lines.push("");

  lines.push("## Security Severity Distribution");
  lines.push("");
  for (const [level, count] of Object.entries(summary.security_severity_distribution)) {
    lines.push(`- ${level}: ${count}`);
  }
  lines.push("");

  lines.push("## Recommendation");
  lines.push("");
  if (summary.patchable_rate >= 60) {
    lines.push("- Patchability is strong enough for controlled auto-heal pilots.");
  } else if (summary.patchable_rate >= 35) {
    lines.push("- Patchability is moderate; keep manual review in the loop.");
  } else {
    lines.push("- Patchability is low; prioritize diagnosis fidelity and guard tuning before full automation.");
  }

  return lines.join("\n");
}
279
+
280
/**
 * Lists the ids of recent failed workflow runs of a repository by calling
 * `gh run list --status failure --json databaseId` through `ghAsync`.
 *
 * @param repo Repository passed to `--repo`.
 * @param limit Maximum number of runs to request. The value is floored and
 *              clamped to the range 1..50; NaN is treated as 1 so that the CLI
 *              always receives a whole number.
 * @returns The positive, finite `databaseId` values in the order returned by the CLI.
 * @throws Error when the CLI output is not valid JSON or when no ids are found.
 */
export async function getRecentFailedRunIds(repo: string, limit: number): Promise<number[]> {
  // Math.min/Math.max propagate NaN, and a fractional value is not a valid --limit.
  const requested = Number.isNaN(limit) ? 1 : Math.floor(limit);
  const bounded = Math.max(1, Math.min(requested, 50));
  const raw = await ghAsync([
    "run",
    "list",
    "--repo",
    repo,
    "--status",
    "failure",
    "--limit",
    String(bounded),
    "--json",
    "databaseId"
  ]);

  let obj: any;
  try {
    obj = JSON.parse(raw);
  } catch {
    throw new Error("Failed to parse failed run list from gh CLI.");
  }

  const ids = Array.isArray(obj)
    ? obj
        .map((x: any) => Number(x?.databaseId))
        .filter((n: number) => Number.isFinite(n) && n > 0)
    : [];

  if (ids.length === 0) {
    throw new Error(`No failed runs found for ${repo}.`);
  }

  return ids;
}
314
+
315
/**
 * Parses a comma-separated list of run ids.
 *
 * Entries are trimmed and converted with `Number`; only positive safe integers
 * are kept, so blanks, non-numeric text, zero, negatives and fractional values
 * such as "3.5" are dropped.
 *
 * @param input Comma-separated ids; falsy values are treated as an empty string.
 * @returns The valid ids in input order.
 */
export function parseRunIds(input: string): number[] {
  return String(input || "")
    .split(",")
    .map((token) => Number(token.trim()))
    .filter((id) => Number.isSafeInteger(id) && id > 0);
}
321
+
322
/**
 * Reads run ids from a file.
 *
 * If the content parses as a JSON array, its number and string elements are
 * converted with `Number`; other element types (booleans, null, nested arrays,
 * objects) are ignored rather than coerced. Otherwise the content is split on
 * line breaks and commas. In both cases only positive safe integers are kept.
 *
 * @param filePath Path of the file, read via `loadText`.
 * @returns The valid ids in file order.
 */
export function parseRunIdsFile(filePath: string): number[] {
  const raw = loadText(filePath);
  const isRunId = (n: number): boolean => Number.isSafeInteger(n) && n > 0;

  try {
    const parsed: unknown = JSON.parse(raw);
    if (Array.isArray(parsed)) {
      return parsed
        .filter((x) => typeof x === "number" || typeof x === "string")
        .map((x) => Number(x))
        .filter(isRunId);
    }
  } catch {
    // Fall through to text parsing
  }

  return raw
    .split(/[\r\n,]+/)
    .map((x) => Number(x.trim()))
    .filter(isRunId);
}
340
+
341
/**
 * Evaluates a list of workflow runs with `runGuardian` and aggregates the
 * outcomes into a report.
 *
 * Each run gets its own `run-<id>` directory under the output directory. A run
 * that throws is recorded as a failed case (stage inferred from the error
 * message) and, with `failFast`, stops the loop. After the loop the summary is
 * computed and three files are written to the output directory:
 * `eval.report.json`, `eval.cases.json` and `eval.report.md`.
 *
 * @param repo Repository forwarded to `runGuardian`.
 * @param runIds Run ids to evaluate, in order.
 * @param options Optional settings; see `EvalOptions`.
 * @returns The complete evaluation report.
 */
export async function runEvaluationHarness(
  repo: string,
  runIds: number[],
  options: EvalOptions = {}
): Promise<EvalReport> {
  const outDir = options.outDir || path.join(process.cwd(), ".copilot-guardian", "eval");
  const maxLogChars = Number.isFinite(options.maxLogChars) ? Number(options.maxLogChars) : 12000;
  const failFast = Boolean(options.failFast);
  const fast = Boolean(options.fast);
  ensureDir(outDir);

  console.log(chalk.bold.cyan("\n=== Copilot Guardian Evaluation Harness ===\n"));
  console.log(chalk.dim(`Repository: ${repo}`));
  console.log(chalk.dim(`Runs: ${runIds.join(", ")}`));
  console.log(chalk.dim(`Output: ${outDir}\n`));

  // Number of rows carrying the given verdict.
  const countVerdict = (rows: any[], verdict: string): number =>
    rows.filter((row: any) => row?.verdict === verdict).length;

  const cases: EvalCaseReport[] = [];

  for (const runId of runIds) {
    const startedAt = Date.now();
    const caseOutDir = path.join(outDir, `run-${runId}`);
    ensureDir(caseOutDir);

    console.log(chalk.cyan(`[>] Evaluating run ${runId}...`));
    try {
      const result = await runGuardian(repo, runId, {
        showOptions: true,
        showReasoning: false,
        outDir: caseOutDir,
        maxLogChars,
        fast
      });

      const rows = Array.isArray(result.patchIndex?.results) ? result.patchIndex.results : [];
      // Prefer the in-memory abstain record; otherwise look for the report file on disk.
      const abstain = result.patchIndex?.abstain || readJsonSafe(path.join(caseOutDir, "abstain.report.json"));
      const goCount = countVerdict(rows, "GO");
      const noGoCount = countVerdict(rows, "NO_GO");
      const bestGo = pickBestGoPatch(rows);
      const reasonsTop = collectNoGoReasons(caseOutDir);

      // Rows whose patch file exists and contains a bypass pattern.
      const flaggedRows = rows.filter((row: any) => {
        const patchPath = typeof row?.patchPath === "string" ? row.patchPath : "";
        const diffText = patchPath && fs.existsSync(patchPath) ? loadText(patchPath) : "";
        return hasBypassSignal(diffText);
      });
      const bypassAttemptCount = flaggedRows.length;
      const bypassBlockCount = countVerdict(flaggedRows, "NO_GO");
      const bypassFalseGoCount = countVerdict(flaggedRows, "GO");

      const abstained = Boolean(abstain);

      cases.push({
        run_id: runId,
        status: "success",
        duration_ms: Date.now() - startedAt,
        failed_step: String(result.ctx?.step || ""),
        failed_job: String(result.ctx?.job || ""),
        selected_category: String(result.analysis?.diagnosis?.category || ""),
        selected_hypothesis_id: String(result.analysis?.diagnosis?.selected_hypothesis_id || ""),
        has_patch_options: rows.length > 0,
        go_count: goCount,
        no_go_count: noGoCount,
        all_no_go: rows.length > 0 && noGoCount === rows.length,
        patchable: !abstained && goCount > 0,
        abstained,
        abstain_classification: abstained ? String(abstain?.classification || "NOT_PATCHABLE") : "",
        abstain_reason: abstained ? String(abstain?.reason || "") : "",
        security_severity: inferSecuritySeverity({
          bypass_attempt_count: bypassAttemptCount,
          bypass_block_count: bypassBlockCount,
          bypass_false_go_count: bypassFalseGoCount,
          abstained
        }),
        best_go: bestGo,
        strategy_count: rows.length,
        bypass_attempt_count: bypassAttemptCount,
        bypass_block_count: bypassBlockCount,
        bypass_false_go_count: bypassFalseGoCount,
        no_go_reasons_top: reasonsTop,
        error: null,
        out_dir: caseOutDir
      });
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      const failedCase: EvalCaseReport = {
        run_id: runId,
        status: inferErrorStage(message),
        duration_ms: Date.now() - startedAt,
        failed_step: "",
        failed_job: "",
        selected_category: "",
        selected_hypothesis_id: "",
        has_patch_options: false,
        go_count: 0,
        no_go_count: 0,
        all_no_go: false,
        patchable: false,
        abstained: false,
        abstain_classification: "",
        abstain_reason: "",
        security_severity: "high",
        best_go: null,
        strategy_count: 0,
        bypass_attempt_count: 0,
        bypass_block_count: 0,
        bypass_false_go_count: 0,
        no_go_reasons_top: [],
        error: message,
        out_dir: caseOutDir
      };
      cases.push(failedCase);
      console.log(chalk.red(`[-] Run ${runId} failed in harness: ${message}`));
      if (failFast) break;
    }
  }

  const byStep: Record<string, number> = {};
  const byCategory: Record<string, number> = {};
  const byStatus: Record<string, number> = {};
  const severityDist: Record<string, number> = {};
  const reasonCount: Record<string, number> = {};

  const successCases = cases.filter((entry) => entry.status === "success");

  cases.forEach((entry) => {
    incr(byStatus, entry.status);
    incr(severityDist, entry.security_severity);
    if (entry.failed_step) incr(byStep, entry.failed_step);
    if (entry.selected_category) incr(byCategory, entry.selected_category);
    entry.no_go_reasons_top.forEach((reason) => {
      const key = normalizeReason(reason);
      if (key) {
        reasonCount[key] = (reasonCount[key] || 0) + 1;
      }
    });
  });

  const rankedReasons = Object.entries(reasonCount);
  rankedReasons.sort(([, countA], [, countB]) => countB - countA);
  const topReasons = rankedReasons.slice(0, 10).map(([reason, count]) => ({ reason, count }));

  // Small aggregation helpers over the successful cases.
  const countWhere = (predicate: (entry: EvalCaseReport) => boolean): number =>
    successCases.filter(predicate).length;
  const sumOf = (pick: (entry: EvalCaseReport) => number): number =>
    successCases.reduce((total, entry) => total + pick(entry), 0);

  const patchableRuns = countWhere((entry) => entry.patchable);
  const allNoGoRuns = countWhere((entry) => entry.all_no_go);
  const withPatchOptions = countWhere((entry) => entry.has_patch_options);
  const totalStrategies = sumOf((entry) => entry.strategy_count);
  const bypassAttempts = sumOf((entry) => entry.bypass_attempt_count);
  const bypassBlocked = sumOf((entry) => entry.bypass_block_count);
  const bypassFalseGo = sumOf((entry) => entry.bypass_false_go_count);
  const abstainCount = countWhere((entry) => entry.abstained);

  // Rates over successful cases fall back to all executed cases when none succeeded.
  const successOrAll = successCases.length || cases.length;

  const summary: EvalSummary = {
    repo,
    requested_runs: runIds.length,
    executed_runs: cases.length,
    success_runs: successCases.length,
    failure_runs: cases.length - successCases.length,
    analyze_success_rate: toPercent(successCases.length, cases.length),
    patch_generation_rate: toPercent(withPatchOptions, cases.length),
    patchable_rate: toPercent(patchableRuns, successOrAll),
    all_no_go_rate: toPercent(allNoGoRuns, successOrAll),
    avg_go_count: toAverage(successCases.map((entry) => entry.go_count)),
    avg_no_go_count: toAverage(successCases.map((entry) => entry.no_go_count)),
    bypass_attempt_rate: toPercent(bypassAttempts, totalStrategies),
    bypass_block_rate: toPercent(bypassBlocked, bypassAttempts),
    bypass_false_go_rate: toPercent(bypassFalseGo, bypassAttempts),
    abstain_rate: toPercent(abstainCount, successOrAll),
    security_severity_distribution: severityDist,
    by_step: byStep,
    by_category: byCategory,
    by_status: byStatus,
    top_no_go_reasons: topReasons
  };

  const report: EvalReport = {
    timestamp: new Date().toISOString(),
    repo,
    run_ids: runIds,
    options: {
      max_log_chars: maxLogChars,
      fail_fast: failFast,
      fast,
      out_dir: outDir
    },
    summary,
    cases
  };

  const markdownPath = path.join(outDir, "eval.report.md");
  writeJson(path.join(outDir, "eval.report.json"), report);
  writeJson(path.join(outDir, "eval.cases.json"), cases);
  writeText(markdownPath, buildMarkdown(report));

  console.log(chalk.green("\n[+] Evaluation harness complete"));
  console.log(chalk.dim(` Runs evaluated: ${cases.length}`));
  console.log(chalk.dim(` Patchable rate: ${summary.patchable_rate}%`));
  console.log(chalk.dim(` All NO_GO rate: ${summary.all_no_go_rate}%`));
  console.log(chalk.dim(` Report: ${markdownPath}\n`));

  return report;
}