@kevinrabun/judges-cli 3.128.2 → 3.129.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +1 -0
- package/dist/api.js +2 -0
- package/dist/cli-dispatch.js +2 -0
- package/dist/cli.js +2 -0
- package/dist/commands/external-benchmarks.d.ts +118 -0
- package/dist/commands/external-benchmarks.js +296 -0
- package/dist/commands/martian-code-review-benchmark.d.ts +61 -0
- package/dist/commands/martian-code-review-benchmark.js +516 -0
- package/dist/commands/openssf-cve-benchmark.d.ts +96 -0
- package/dist/commands/openssf-cve-benchmark.js +659 -0
- package/package.json +1 -1
|
@@ -0,0 +1,516 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Martian Code Review Benchmark Integration
|
|
3
|
+
*
|
|
4
|
+
* Adapter for the Martian Code Review Bench offline benchmark
|
|
5
|
+
* (https://github.com/withmartian/code-review-benchmark).
|
|
6
|
+
*
|
|
7
|
+
* 50 PRs from 5 major open-source projects (Sentry, Grafana, Cal.com,
|
|
8
|
+
* Discourse, Keycloak) with human-curated golden comments at severity
|
|
9
|
+
* levels Low/Medium/High/Critical.
|
|
10
|
+
*
|
|
11
|
+
* For each PR, Judges evaluates the diff and we match our findings
|
|
12
|
+
* against the golden comments using semantic similarity at the
|
|
13
|
+
* rule-prefix and description level.
|
|
14
|
+
*/
|
|
15
|
+
import { existsSync, readFileSync, readdirSync } from "fs";
|
|
16
|
+
import { resolve, join } from "path";
|
|
17
|
+
import { execSync } from "child_process";
|
|
18
|
+
import { evaluateWithTribunal } from "../evaluators/index.js";
|
|
19
|
+
import { registerBenchmarkAdapter } from "./external-benchmarks.js";
|
|
20
|
+
// ─── Golden Comment → Finding Matching ──────────────────────────────────────
|
|
21
|
+
/**
|
|
22
|
+
* Keyword extraction from golden comments for matching against Judges findings.
|
|
23
|
+
* We match on semantic overlap — does a finding's description/message cover
|
|
24
|
+
* the same concern as the golden comment?
|
|
25
|
+
*/
|
|
26
|
+
const ISSUE_KEYWORDS = {
|
|
27
|
+
// Bug patterns
|
|
28
|
+
"null reference": ["null", "undefined", "none", "nil", "attributeerror", "typeerror"],
|
|
29
|
+
"race condition": ["race", "concurrent", "lock", "deadlock", "mutex", "thread"],
|
|
30
|
+
"type error": ["type", "typeerror", "cast", "coercion", "conversion"],
|
|
31
|
+
"off-by-one": ["off-by-one", "boundary", "fence", "index", "slice"],
|
|
32
|
+
negative: ["negative", "minus", "underflow"],
|
|
33
|
+
// Security
|
|
34
|
+
injection: ["inject", "sql", "xss", "command", "eval"],
|
|
35
|
+
authentication: ["auth", "credential", "password", "token", "session", "oauth"],
|
|
36
|
+
authorization: ["permission", "access", "privilege", "role", "scope"],
|
|
37
|
+
secret: ["secret", "key", "hardcoded", "credential", "password"],
|
|
38
|
+
csrf: ["csrf", "cross-site", "forgery"],
|
|
39
|
+
// Code quality
|
|
40
|
+
"error handling": ["error", "exception", "catch", "throw", "try", "unhandled"],
|
|
41
|
+
validation: ["valid", "sanitize", "check", "assert", "input"],
|
|
42
|
+
memory: ["memory", "leak", "gc", "buffer", "overflow"],
|
|
43
|
+
performance: ["performance", "slow", "latency", "n+1", "query", "cache"],
|
|
44
|
+
deprecated: ["deprecated", "obsolete", "legacy"],
|
|
45
|
+
};
|
|
46
|
+
function normalizeText(text) {
|
|
47
|
+
return text
|
|
48
|
+
.toLowerCase()
|
|
49
|
+
.replace(/[^a-z0-9\s]/g, " ")
|
|
50
|
+
.replace(/\s+/g, " ")
|
|
51
|
+
.trim();
|
|
52
|
+
}
|
|
53
|
+
function extractKeyTerms(text) {
|
|
54
|
+
const normalized = normalizeText(text);
|
|
55
|
+
const terms = new Set();
|
|
56
|
+
// Add individual words
|
|
57
|
+
for (const word of normalized.split(" ")) {
|
|
58
|
+
if (word.length > 3)
|
|
59
|
+
terms.add(word);
|
|
60
|
+
}
|
|
61
|
+
// Add matched keyword categories
|
|
62
|
+
for (const [_category, keywords] of Object.entries(ISSUE_KEYWORDS)) {
|
|
63
|
+
for (const kw of keywords) {
|
|
64
|
+
if (normalized.includes(kw)) {
|
|
65
|
+
terms.add(kw);
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
return terms;
|
|
70
|
+
}
|
|
71
|
+
function computeSimilarity(goldenText, findingText) {
|
|
72
|
+
const goldenTerms = extractKeyTerms(goldenText);
|
|
73
|
+
const findingTerms = extractKeyTerms(findingText);
|
|
74
|
+
if (goldenTerms.size === 0 || findingTerms.size === 0)
|
|
75
|
+
return 0;
|
|
76
|
+
let overlap = 0;
|
|
77
|
+
for (const term of goldenTerms) {
|
|
78
|
+
if (findingTerms.has(term))
|
|
79
|
+
overlap++;
|
|
80
|
+
}
|
|
81
|
+
// Jaccard-style similarity with bias toward golden coverage
|
|
82
|
+
const goldenCoverage = overlap / goldenTerms.size;
|
|
83
|
+
const findingCoverage = overlap / findingTerms.size;
|
|
84
|
+
// Weight golden coverage more — we care more about whether we caught
|
|
85
|
+
// the golden issue than about how many extra words we generated
|
|
86
|
+
return goldenCoverage * 0.7 + findingCoverage * 0.3;
|
|
87
|
+
}
|
|
88
|
+
const MATCH_THRESHOLD = 0.25;
|
|
89
|
+
function matchFindingsToGolden(goldenComments, findings) {
|
|
90
|
+
const matches = [];
|
|
91
|
+
const missed = [];
|
|
92
|
+
const matchedFindingIndices = new Set();
|
|
93
|
+
for (const gc of goldenComments) {
|
|
94
|
+
let bestScore = 0;
|
|
95
|
+
let bestFindingIdx = -1;
|
|
96
|
+
for (let fi = 0; fi < findings.length; fi++) {
|
|
97
|
+
if (matchedFindingIndices.has(fi))
|
|
98
|
+
continue;
|
|
99
|
+
const f = findings[fi];
|
|
100
|
+
const findingText = [f.description, f.recommendation ?? ""].join(" ");
|
|
101
|
+
const score = computeSimilarity(gc.comment, findingText);
|
|
102
|
+
if (score > bestScore) {
|
|
103
|
+
bestScore = score;
|
|
104
|
+
bestFindingIdx = fi;
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
if (bestScore >= MATCH_THRESHOLD && bestFindingIdx >= 0) {
|
|
108
|
+
matchedFindingIndices.add(bestFindingIdx);
|
|
109
|
+
matches.push({
|
|
110
|
+
golden: gc.comment.slice(0, 100),
|
|
111
|
+
finding: findings[bestFindingIdx].ruleId,
|
|
112
|
+
severity: gc.severity,
|
|
113
|
+
});
|
|
114
|
+
}
|
|
115
|
+
else {
|
|
116
|
+
missed.push(gc.comment.slice(0, 100));
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
// FPs = findings not matched to any golden comment
|
|
120
|
+
const fps = findings.length - matchedFindingIndices.size;
|
|
121
|
+
return { matches, missed, fps };
|
|
122
|
+
}
|
|
123
|
+
// ─── Data Loading ───────────────────────────────────────────────────────────
|
|
124
|
+
const REPO_LANGUAGES = {
|
|
125
|
+
sentry: "python",
|
|
126
|
+
grafana: "go",
|
|
127
|
+
cal_dot_com: "typescript",
|
|
128
|
+
discourse: "ruby",
|
|
129
|
+
keycloak: "java",
|
|
130
|
+
};
|
|
131
|
+
export function loadGoldenComments(repoPath) {
|
|
132
|
+
const goldenDir = join(repoPath, "offline", "golden_comments");
|
|
133
|
+
const prsByRepo = new Map();
|
|
134
|
+
const files = readdirSync(goldenDir).filter((f) => f.endsWith(".json"));
|
|
135
|
+
for (const file of files) {
|
|
136
|
+
const repoName = file.replace(".json", "");
|
|
137
|
+
const raw = readFileSync(join(goldenDir, file), "utf-8");
|
|
138
|
+
const prs = JSON.parse(raw);
|
|
139
|
+
prsByRepo.set(repoName, prs);
|
|
140
|
+
}
|
|
141
|
+
return prsByRepo;
|
|
142
|
+
}
|
|
143
|
+
// ─── PR Diff Retrieval ──────────────────────────────────────────────────────
|
|
144
|
+
/**
|
|
145
|
+
* Fetch the unified diff for a PR from GitHub.
|
|
146
|
+
* Works for public repos without authentication.
|
|
147
|
+
*/
|
|
148
|
+
function fetchPrDiff(prUrl) {
|
|
149
|
+
const diffUrl = prUrl.replace(/\/?$/, ".diff");
|
|
150
|
+
try {
|
|
151
|
+
const result = execSync(`node -e "fetch('${diffUrl}').then(r=>r.text()).then(t=>process.stdout.write(t))"`, {
|
|
152
|
+
stdio: "pipe",
|
|
153
|
+
timeout: 30_000,
|
|
154
|
+
});
|
|
155
|
+
const diff = result.toString();
|
|
156
|
+
return diff.length > 100 ? diff : undefined;
|
|
157
|
+
}
|
|
158
|
+
catch {
|
|
159
|
+
return undefined;
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
/**
|
|
163
|
+
* Extract changed file contents from a unified diff.
|
|
164
|
+
* Returns the "after" (added/modified) lines for each file.
|
|
165
|
+
*/
|
|
166
|
+
function extractFilesFromDiff(diff) {
|
|
167
|
+
const files = [];
|
|
168
|
+
const fileSections = diff.split(/^diff --git /m).slice(1);
|
|
169
|
+
for (const section of fileSections) {
|
|
170
|
+
// Extract file path from "a/path b/path"
|
|
171
|
+
const pathMatch = section.match(/^a\/(.*?) b\//);
|
|
172
|
+
if (!pathMatch)
|
|
173
|
+
continue;
|
|
174
|
+
const filePath = pathMatch[1];
|
|
175
|
+
// Skip non-code files
|
|
176
|
+
const ext = filePath.split(".").pop()?.toLowerCase() ?? "";
|
|
177
|
+
const langMap = {
|
|
178
|
+
ts: "typescript",
|
|
179
|
+
tsx: "typescript",
|
|
180
|
+
js: "javascript",
|
|
181
|
+
jsx: "javascript",
|
|
182
|
+
py: "python",
|
|
183
|
+
go: "go",
|
|
184
|
+
java: "java",
|
|
185
|
+
rb: "ruby",
|
|
186
|
+
rs: "rust",
|
|
187
|
+
cs: "csharp",
|
|
188
|
+
php: "php",
|
|
189
|
+
kt: "kotlin",
|
|
190
|
+
swift: "swift",
|
|
191
|
+
};
|
|
192
|
+
const language = langMap[ext];
|
|
193
|
+
if (!language)
|
|
194
|
+
continue;
|
|
195
|
+
// Extract added lines (lines starting with +, excluding +++ header)
|
|
196
|
+
const lines = section.split("\n");
|
|
197
|
+
const addedLines = [];
|
|
198
|
+
for (const line of lines) {
|
|
199
|
+
if (line.startsWith("+++"))
|
|
200
|
+
continue;
|
|
201
|
+
if (line.startsWith("+")) {
|
|
202
|
+
addedLines.push(line.slice(1));
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
if (addedLines.length > 0) {
|
|
206
|
+
files.push({ path: filePath, content: addedLines.join("\n"), language });
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
return files;
|
|
210
|
+
}
|
|
211
|
+
/**
|
|
212
|
+
* Convert a Martian PR with golden comments into BenchmarkCase format
|
|
213
|
+
* for use in the LLM benchmark pipeline.
|
|
214
|
+
*
|
|
215
|
+
* Each golden comment becomes an expected finding. The PR diff provides
|
|
216
|
+
* the actual code to evaluate. The LLM judge determines if its review
|
|
217
|
+
* catches the same issues the human reviewer identified.
|
|
218
|
+
*/
|
|
219
|
+
export function convertPrToBenchmarkCase(pr, repoName, diff) {
|
|
220
|
+
const language = REPO_LANGUAGES[repoName] ?? "typescript";
|
|
221
|
+
// Build expected rule IDs from golden comments by mapping severity to prefixes
|
|
222
|
+
// Since golden comments are semantic (not rule-ID based), we use broad prefixes
|
|
223
|
+
// that the LLM should fire when it identifies similar issues
|
|
224
|
+
const expectedRuleIds = [];
|
|
225
|
+
const acceptablePrefixes = new Set([
|
|
226
|
+
"CYBER",
|
|
227
|
+
"SEC",
|
|
228
|
+
"AUTH",
|
|
229
|
+
"DATA",
|
|
230
|
+
"ERR",
|
|
231
|
+
"CONC",
|
|
232
|
+
"DB",
|
|
233
|
+
"PERF",
|
|
234
|
+
"CFG",
|
|
235
|
+
"REL",
|
|
236
|
+
"LOGIC",
|
|
237
|
+
"MAINT",
|
|
238
|
+
"FW",
|
|
239
|
+
"RATE",
|
|
240
|
+
"STRUCT",
|
|
241
|
+
]);
|
|
242
|
+
for (let i = 0; i < pr.comments.length; i++) {
|
|
243
|
+
const gc = pr.comments[i];
|
|
244
|
+
const prefix = inferPrefixFromComment(gc.comment, gc.severity);
|
|
245
|
+
expectedRuleIds.push(`${prefix}-${String(i + 1).padStart(3, "0")}`);
|
|
246
|
+
}
|
|
247
|
+
let code;
|
|
248
|
+
if (diff) {
|
|
249
|
+
const files = extractFilesFromDiff(diff);
|
|
250
|
+
if (files.length === 0)
|
|
251
|
+
return undefined;
|
|
252
|
+
// Use the largest changed file as the primary code
|
|
253
|
+
files.sort((a, b) => b.content.length - a.content.length);
|
|
254
|
+
code = files[0].content;
|
|
255
|
+
// Truncate to avoid token limits
|
|
256
|
+
if (code.length > 8000) {
|
|
257
|
+
code = code.slice(0, 8000) + "\n// ... truncated for benchmark";
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
else {
|
|
261
|
+
// Fallback: embed golden comments as context for LLM evaluation
|
|
262
|
+
const lines = [`// PR: ${pr.pr_title}`, `// Review the following changes for issues:`];
|
|
263
|
+
for (const gc of pr.comments) {
|
|
264
|
+
lines.push(`// Known issue [${gc.severity}]: ${gc.comment}`);
|
|
265
|
+
}
|
|
266
|
+
code = lines.join("\n");
|
|
267
|
+
}
|
|
268
|
+
return {
|
|
269
|
+
id: `martian-${repoName}-${pr.pr_title
|
|
270
|
+
.slice(0, 40)
|
|
271
|
+
.replace(/[^a-zA-Z0-9]/g, "-")
|
|
272
|
+
.toLowerCase()}`,
|
|
273
|
+
description: `Martian Code Review: ${pr.pr_title} (${repoName}, ${pr.comments.length} golden comments)`,
|
|
274
|
+
language,
|
|
275
|
+
code,
|
|
276
|
+
expectedRuleIds,
|
|
277
|
+
acceptablePrefixes: [...acceptablePrefixes],
|
|
278
|
+
category: `code-review-${repoName}`,
|
|
279
|
+
difficulty: pr.comments.some((c) => c.severity === "Critical" || c.severity === "High") ? "hard" : "medium",
|
|
280
|
+
aiSource: "martian-code-review-benchmark",
|
|
281
|
+
};
|
|
282
|
+
}
|
|
283
|
+
/**
|
|
284
|
+
* Infer the most likely judge prefix from a golden comment description.
|
|
285
|
+
*/
|
|
286
|
+
function inferPrefixFromComment(comment, severity) {
|
|
287
|
+
const lower = comment.toLowerCase();
|
|
288
|
+
if (/race|deadlock|lock|concurrent|mutex|thread/.test(lower))
|
|
289
|
+
return "CONC";
|
|
290
|
+
if (/sql|query|database|n\+1|select \*/.test(lower))
|
|
291
|
+
return "DB";
|
|
292
|
+
if (/auth|credential|password|token|session|oauth|permission/.test(lower))
|
|
293
|
+
return "AUTH";
|
|
294
|
+
if (/inject|xss|eval|command/.test(lower))
|
|
295
|
+
return "CYBER";
|
|
296
|
+
if (/secret|hardcod|api.?key/.test(lower))
|
|
297
|
+
return "CFG";
|
|
298
|
+
if (/null|undefined|none|nil|attributeerror|typeerror|crash/.test(lower))
|
|
299
|
+
return "ERR";
|
|
300
|
+
if (/error|exception|catch|throw|unhandled|fault/.test(lower))
|
|
301
|
+
return "ERR";
|
|
302
|
+
if (/valid|sanitiz|input|check|assert/.test(lower))
|
|
303
|
+
return "SEC";
|
|
304
|
+
if (/performance|slow|latency|cache|memory/.test(lower))
|
|
305
|
+
return "PERF";
|
|
306
|
+
if (/deprecat|obsolete|legacy|breaking/.test(lower))
|
|
307
|
+
return "COMPAT";
|
|
308
|
+
if (/log|metric|monitor|observ/.test(lower))
|
|
309
|
+
return "OBS";
|
|
310
|
+
if (/test|flaky|mock|assert/.test(lower))
|
|
311
|
+
return "TEST";
|
|
312
|
+
if (/name|typo|rename|docstring|comment/.test(lower))
|
|
313
|
+
return "DOC";
|
|
314
|
+
if (/magic.?number|duplicate|dead.?code|complex/.test(lower))
|
|
315
|
+
return "MAINT";
|
|
316
|
+
if (/isinstance|type|class|inherit/.test(lower))
|
|
317
|
+
return "LOGIC";
|
|
318
|
+
// Default based on severity
|
|
319
|
+
if (severity === "Critical" || severity === "High")
|
|
320
|
+
return "SEC";
|
|
321
|
+
return "MAINT";
|
|
322
|
+
}
|
|
323
|
+
/**
|
|
324
|
+
* Convert all Martian golden comments into BenchmarkCase[] for LLM evaluation.
|
|
325
|
+
* Fetches actual PR diffs from GitHub when possible.
|
|
326
|
+
*/
|
|
327
|
+
export function convertAllToBenchmarkCases(repoPath) {
|
|
328
|
+
const prsByRepo = loadGoldenComments(repoPath);
|
|
329
|
+
const cases = [];
|
|
330
|
+
for (const [repoName, prs] of prsByRepo) {
|
|
331
|
+
for (const pr of prs) {
|
|
332
|
+
// Try to fetch the actual diff
|
|
333
|
+
const diff = fetchPrDiff(pr.url);
|
|
334
|
+
const benchCase = convertPrToBenchmarkCase(pr, repoName, diff);
|
|
335
|
+
if (benchCase)
|
|
336
|
+
cases.push(benchCase);
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
return cases;
|
|
340
|
+
}
|
|
341
|
+
/**
|
|
342
|
+
* Synthesise representative code from the golden comment descriptions.
|
|
343
|
+
* Fallback when PR diffs cannot be fetched.
|
|
344
|
+
*/
|
|
345
|
+
function synthesizeCodeFromGolden(pr, language) {
|
|
346
|
+
const lines = [];
|
|
347
|
+
lines.push(`// PR: ${pr.pr_title}`);
|
|
348
|
+
lines.push(`// Source: ${pr.url}`);
|
|
349
|
+
lines.push(`// Language: ${language}`);
|
|
350
|
+
lines.push("");
|
|
351
|
+
// Embed the golden comment descriptions as code-like patterns
|
|
352
|
+
// that Judges should be able to analyze
|
|
353
|
+
for (let i = 0; i < pr.comments.length; i++) {
|
|
354
|
+
const gc = pr.comments[i];
|
|
355
|
+
lines.push(`// Issue ${i + 1} [${gc.severity}]: ${gc.comment}`);
|
|
356
|
+
}
|
|
357
|
+
lines.push("");
|
|
358
|
+
lines.push("// (Synthetic context for benchmark matching)");
|
|
359
|
+
return lines.join("\n");
|
|
360
|
+
}
|
|
361
|
+
// ─── Evaluation ─────────────────────────────────────────────────────────────
|
|
362
|
+
function evaluatePr(pr, repoName) {
|
|
363
|
+
const language = REPO_LANGUAGES[repoName] ?? "typescript";
|
|
364
|
+
const code = synthesizeCodeFromGolden(pr, language);
|
|
365
|
+
// Run tribunal evaluation
|
|
366
|
+
const verdict = evaluateWithTribunal(code, language);
|
|
367
|
+
const findings = verdict.findings;
|
|
368
|
+
// Match findings against golden comments
|
|
369
|
+
const { matches, missed, fps } = matchFindingsToGolden(pr.comments, findings);
|
|
370
|
+
const tp = matches.length;
|
|
371
|
+
const fn = missed.length;
|
|
372
|
+
const precision = tp + fps > 0 ? tp / (tp + fps) : 1;
|
|
373
|
+
const recall = tp + fn > 0 ? tp / (tp + fn) : 1;
|
|
374
|
+
return {
|
|
375
|
+
prTitle: pr.pr_title,
|
|
376
|
+
prUrl: pr.url,
|
|
377
|
+
sourceRepo: repoName,
|
|
378
|
+
language,
|
|
379
|
+
goldenComments: pr.comments.length,
|
|
380
|
+
matchedComments: tp,
|
|
381
|
+
unmatchedComments: fn,
|
|
382
|
+
falsePositives: fps,
|
|
383
|
+
precision,
|
|
384
|
+
recall,
|
|
385
|
+
findings,
|
|
386
|
+
matches,
|
|
387
|
+
missed,
|
|
388
|
+
};
|
|
389
|
+
}
|
|
390
|
+
// ─── Aggregate Results ──────────────────────────────────────────────────────
|
|
391
|
+
function computeMartianMetrics(results) {
|
|
392
|
+
let totalTP = 0;
|
|
393
|
+
let totalFP = 0;
|
|
394
|
+
let totalFN = 0;
|
|
395
|
+
let detected = 0;
|
|
396
|
+
const perRepo = {};
|
|
397
|
+
const perSeverity = {};
|
|
398
|
+
for (const r of results) {
|
|
399
|
+
totalTP += r.matchedComments;
|
|
400
|
+
totalFP += r.falsePositives;
|
|
401
|
+
totalFN += r.unmatchedComments;
|
|
402
|
+
if (r.matchedComments > 0)
|
|
403
|
+
detected++;
|
|
404
|
+
// Per-repo
|
|
405
|
+
if (!perRepo[r.sourceRepo])
|
|
406
|
+
perRepo[r.sourceRepo] = { total: 0, detected: 0, rate: 0 };
|
|
407
|
+
perRepo[r.sourceRepo].total++;
|
|
408
|
+
if (r.matchedComments > 0)
|
|
409
|
+
perRepo[r.sourceRepo].detected++;
|
|
410
|
+
// Per-severity from matches
|
|
411
|
+
for (const m of r.matches) {
|
|
412
|
+
if (!perSeverity[m.severity])
|
|
413
|
+
perSeverity[m.severity] = { total: 0, detected: 0, rate: 0 };
|
|
414
|
+
perSeverity[m.severity].total++;
|
|
415
|
+
perSeverity[m.severity].detected++;
|
|
416
|
+
}
|
|
417
|
+
for (const _missed of r.missed) {
|
|
418
|
+
// We don't have severity for missed items easily, put under "Unknown"
|
|
419
|
+
if (!perSeverity["Missed"])
|
|
420
|
+
perSeverity["Missed"] = { total: 0, detected: 0, rate: 0 };
|
|
421
|
+
perSeverity["Missed"].total++;
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
// Compute rates
|
|
425
|
+
for (const entry of Object.values(perRepo)) {
|
|
426
|
+
entry.rate = entry.total > 0 ? entry.detected / entry.total : 0;
|
|
427
|
+
}
|
|
428
|
+
for (const entry of Object.values(perSeverity)) {
|
|
429
|
+
entry.rate = entry.total > 0 ? entry.detected / entry.total : 0;
|
|
430
|
+
}
|
|
431
|
+
const precision = totalTP + totalFP > 0 ? totalTP / (totalTP + totalFP) : 1;
|
|
432
|
+
const recall = totalTP + totalFN > 0 ? totalTP / (totalTP + totalFN) : 1;
|
|
433
|
+
const f1 = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
|
|
434
|
+
const detectionRate = results.length > 0 ? detected / results.length : 0;
|
|
435
|
+
return { precision, recall, f1, detectionRate, perRepo, perSeverity };
|
|
436
|
+
}
|
|
437
|
+
// ─── Adapter Registration ───────────────────────────────────────────────────
|
|
438
|
+
function readJudgesVersion() {
|
|
439
|
+
try {
|
|
440
|
+
const pkg = JSON.parse(readFileSync(resolve("package.json"), "utf-8"));
|
|
441
|
+
return pkg.version ?? "unknown";
|
|
442
|
+
}
|
|
443
|
+
catch {
|
|
444
|
+
return "unknown";
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
const martianAdapter = {
|
|
448
|
+
suiteId: "martian-code-review",
|
|
449
|
+
suiteName: "Martian Code Review Bench",
|
|
450
|
+
suiteUrl: "https://github.com/withmartian/code-review-benchmark",
|
|
451
|
+
defaultRepoPath: "../code-review-benchmark",
|
|
452
|
+
description: "50 PRs from 5 open-source projects with human-curated golden comments (Python, Go, TS, Ruby, Java)",
|
|
453
|
+
validate(repoPath) {
|
|
454
|
+
if (!existsSync(repoPath)) {
|
|
455
|
+
return `Repo not found at ${repoPath}. Clone with: git clone https://github.com/withmartian/code-review-benchmark.git`;
|
|
456
|
+
}
|
|
457
|
+
const goldenDir = join(repoPath, "offline", "golden_comments");
|
|
458
|
+
if (!existsSync(goldenDir)) {
|
|
459
|
+
return `Golden comments not found at ${goldenDir}. Is this the correct repo?`;
|
|
460
|
+
}
|
|
461
|
+
return undefined;
|
|
462
|
+
},
|
|
463
|
+
run(config) {
|
|
464
|
+
const prsByRepo = loadGoldenComments(config.repoPath);
|
|
465
|
+
let totalPrs = 0;
|
|
466
|
+
for (const prs of prsByRepo.values())
|
|
467
|
+
totalPrs += prs.length;
|
|
468
|
+
console.log(` Loaded ${totalPrs} PRs across ${prsByRepo.size} repos`);
|
|
469
|
+
const allResults = [];
|
|
470
|
+
let idx = 0;
|
|
471
|
+
for (const [repoName, prs] of prsByRepo) {
|
|
472
|
+
for (const pr of prs) {
|
|
473
|
+
idx++;
|
|
474
|
+
// Filter by single item if specified
|
|
475
|
+
if (config.singleItem && !pr.url.includes(config.singleItem) && pr.pr_title !== config.singleItem) {
|
|
476
|
+
continue;
|
|
477
|
+
}
|
|
478
|
+
const pct = Math.round((idx / totalPrs) * 100);
|
|
479
|
+
process.stdout.write(`\r [${idx}/${totalPrs}] ${pct}% ${repoName}: ${pr.pr_title.slice(0, 50)}`);
|
|
480
|
+
const result = evaluatePr(pr, repoName);
|
|
481
|
+
allResults.push(result);
|
|
482
|
+
const icon = result.matchedComments > 0 ? "✅" : "❌";
|
|
483
|
+
process.stdout.write(`\r [${idx}/${totalPrs}] ${pct}% ${icon} ${repoName}: ${pr.pr_title.slice(0, 50)} \n`);
|
|
484
|
+
}
|
|
485
|
+
}
|
|
486
|
+
const metrics = computeMartianMetrics(allResults);
|
|
487
|
+
// Merge perRepo + perSeverity into perCategory
|
|
488
|
+
const perCategory = {};
|
|
489
|
+
for (const [k, v] of Object.entries(metrics.perRepo)) {
|
|
490
|
+
perCategory[`repo:${k}`] = v;
|
|
491
|
+
}
|
|
492
|
+
for (const [k, v] of Object.entries(metrics.perSeverity)) {
|
|
493
|
+
perCategory[`severity:${k}`] = v;
|
|
494
|
+
}
|
|
495
|
+
return {
|
|
496
|
+
suiteId: "martian-code-review",
|
|
497
|
+
suiteName: "Martian Code Review Bench",
|
|
498
|
+
suiteUrl: "https://github.com/withmartian/code-review-benchmark",
|
|
499
|
+
timestamp: new Date().toISOString(),
|
|
500
|
+
judgesVersion: readJudgesVersion(),
|
|
501
|
+
totalItems: totalPrs,
|
|
502
|
+
evaluatedItems: allResults.length,
|
|
503
|
+
skippedItems: totalPrs - allResults.length,
|
|
504
|
+
precision: metrics.precision,
|
|
505
|
+
recall: metrics.recall,
|
|
506
|
+
f1Score: metrics.f1,
|
|
507
|
+
detectionRate: metrics.detectionRate,
|
|
508
|
+
truePositives: allResults.reduce((s, r) => s + r.matchedComments, 0),
|
|
509
|
+
falsePositives: allResults.reduce((s, r) => s + r.falsePositives, 0),
|
|
510
|
+
falseNegatives: allResults.reduce((s, r) => s + r.unmatchedComments, 0),
|
|
511
|
+
perCategory,
|
|
512
|
+
rawData: allResults,
|
|
513
|
+
};
|
|
514
|
+
},
|
|
515
|
+
};
|
|
516
|
+
registerBenchmarkAdapter(martianAdapter);
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenSSF CVE Benchmark Integration
|
|
3
|
+
*
|
|
4
|
+
* Runs the Judges evaluation engine against the OpenSSF CVE Benchmark dataset
|
|
5
|
+
* (https://github.com/ossf-cve-benchmark/ossf-cve-benchmark) — 200+ real-world
|
|
6
|
+
* JavaScript/TypeScript CVEs with pre-patch (vulnerable) and post-patch (fixed)
|
|
7
|
+
* git commits.
|
|
8
|
+
*
|
|
9
|
+
* Two modes:
|
|
10
|
+
* 1. Deterministic (L1): Runs Judges' pattern-based evaluators against each CVE.
|
|
11
|
+
* 2. LLM integration: Converts passing CVE cases into BenchmarkCase format
|
|
12
|
+
* for inclusion in the LLM benchmark pipeline.
|
|
13
|
+
*
|
|
14
|
+
* Usage:
|
|
15
|
+
* judges openssf-cve run [--repo <path>] [--cve <id>] [--format json|text|markdown]
|
|
16
|
+
* judges openssf-cve convert [--repo <path>] # Convert to BenchmarkCase[]
|
|
17
|
+
*/
|
|
18
|
+
import type { Finding } from "../types.js";
|
|
19
|
+
import type { BenchmarkCase } from "./benchmark.js";
|
|
20
|
+
import type { ExternalBenchmarkResult } from "./external-benchmarks.js";
|
|
21
|
+
/** Raw JSON from a CVE file in the OpenSSF benchmark repo */
|
|
22
|
+
export interface OpenSSFCve {
|
|
23
|
+
CVE: string;
|
|
24
|
+
state: "PUBLISHED" | "DRAFT" | "RESERVED";
|
|
25
|
+
repository: string;
|
|
26
|
+
prePatch: {
|
|
27
|
+
commit: string;
|
|
28
|
+
weaknesses: Array<{
|
|
29
|
+
location: {
|
|
30
|
+
file: string;
|
|
31
|
+
line: number;
|
|
32
|
+
};
|
|
33
|
+
explanation: string;
|
|
34
|
+
}>;
|
|
35
|
+
};
|
|
36
|
+
postPatch: {
|
|
37
|
+
commit: string;
|
|
38
|
+
};
|
|
39
|
+
CWEs: string[];
|
|
40
|
+
}
|
|
41
|
+
export interface CveEvalResult {
|
|
42
|
+
cve: string;
|
|
43
|
+
cwes: string[];
|
|
44
|
+
language: string;
|
|
45
|
+
/** Did Judges detect at least one finding matching a relevant CWE? */
|
|
46
|
+
detected: boolean;
|
|
47
|
+
/** Did Judges produce no false positives on the patched version? */
|
|
48
|
+
cleanOnPatch: boolean;
|
|
49
|
+
/** Relevant findings on the pre-patch (vulnerable) code */
|
|
50
|
+
prePatchFindings: Finding[];
|
|
51
|
+
/** Findings on the post-patch (fixed) code — ideally empty */
|
|
52
|
+
postPatchFindings: Finding[];
|
|
53
|
+
/** Which CWEs from the CVE were matched by findings */
|
|
54
|
+
matchedCwes: string[];
|
|
55
|
+
/** Which CWEs from the CVE were NOT matched */
|
|
56
|
+
missedCwes: string[];
|
|
57
|
+
/** Error message if evaluation failed */
|
|
58
|
+
error?: string;
|
|
59
|
+
}
|
|
60
|
+
export interface OpenSSFBenchmarkResult {
|
|
61
|
+
timestamp: string;
|
|
62
|
+
totalCves: number;
|
|
63
|
+
evaluated: number;
|
|
64
|
+
skipped: number;
|
|
65
|
+
detected: number;
|
|
66
|
+
missed: number;
|
|
67
|
+
cleanOnPatch: number;
|
|
68
|
+
falsePositiveOnPatch: number;
|
|
69
|
+
detectionRate: number;
|
|
70
|
+
precision: number;
|
|
71
|
+
recall: number;
|
|
72
|
+
f1Score: number;
|
|
73
|
+
perCwe: Record<string, {
|
|
74
|
+
total: number;
|
|
75
|
+
detected: number;
|
|
76
|
+
rate: number;
|
|
77
|
+
}>;
|
|
78
|
+
results: CveEvalResult[];
|
|
79
|
+
}
|
|
80
|
+
export declare function loadCveFiles(repoPath: string): OpenSSFCve[];
|
|
81
|
+
export declare function evaluateSingleCve(cve: OpenSSFCve, sourcesDir: string): CveEvalResult;
|
|
82
|
+
export declare function computeOpenSSFMetrics(results: CveEvalResult[]): OpenSSFBenchmarkResult;
|
|
83
|
+
/**
|
|
84
|
+
* Convert OpenSSF CVE results into BenchmarkCase[] format for use
|
|
85
|
+
* in the Judges LLM benchmark pipeline. Only includes CVEs where:
|
|
86
|
+
* - The vulnerable code was successfully checked out
|
|
87
|
+
* - At least one weakness file was found
|
|
88
|
+
* - CWEs map to known judge prefixes
|
|
89
|
+
*/
|
|
90
|
+
export declare function convertToBenchmarkCases(cves: OpenSSFCve[], sourcesDir: string): BenchmarkCase[];
|
|
91
|
+
export declare function formatOpenSSFReport(result: OpenSSFBenchmarkResult): string;
|
|
92
|
+
export declare function runOpenSSFCveBenchmark(argv: string[]): void;
|
|
93
|
+
/**
|
|
94
|
+
* Convert OpenSSFBenchmarkResult → ExternalBenchmarkResult for the registry.
|
|
95
|
+
*/
|
|
96
|
+
export declare function toExternalResult(metrics: OpenSSFBenchmarkResult): ExternalBenchmarkResult;
|