@kevinrabun/judges-cli 3.128.3 → 3.129.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +1 -0
- package/dist/api.js +2 -0
- package/dist/cli-dispatch.js +2 -0
- package/dist/cli.js +2 -0
- package/dist/commands/codify-amendments.js +28 -5
- package/dist/commands/external-benchmarks.d.ts +118 -0
- package/dist/commands/external-benchmarks.js +296 -0
- package/dist/commands/martian-code-review-benchmark.d.ts +61 -0
- package/dist/commands/martian-code-review-benchmark.js +689 -0
- package/dist/commands/openssf-cve-benchmark.d.ts +96 -0
- package/dist/commands/openssf-cve-benchmark.js +659 -0
- package/package.json +1 -1
|
@@ -0,0 +1,689 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Martian Code Review Benchmark Integration
|
|
3
|
+
*
|
|
4
|
+
* Adapter for the Martian Code Review Bench offline benchmark
|
|
5
|
+
* (https://github.com/withmartian/code-review-benchmark).
|
|
6
|
+
*
|
|
7
|
+
* 50 PRs from 5 major open-source projects (Sentry, Grafana, Cal.com,
|
|
8
|
+
* Discourse, Keycloak) with human-curated golden comments at severity
|
|
9
|
+
* levels Low/Medium/High/Critical.
|
|
10
|
+
*
|
|
11
|
+
* For each PR, Judges evaluates the diff and we match our findings
|
|
12
|
+
* against the golden comments using semantic similarity at the
|
|
13
|
+
* rule-prefix and description level.
|
|
14
|
+
*/
|
|
15
|
+
import { existsSync, readFileSync, readdirSync } from "fs";
|
|
16
|
+
import { resolve, join } from "path";
|
|
17
|
+
import { execSync } from "child_process";
|
|
18
|
+
import { evaluateWithTribunal } from "../evaluators/index.js";
|
|
19
|
+
import { registerBenchmarkAdapter } from "./external-benchmarks.js";
|
|
20
|
+
// ─── Golden Comment → Finding Matching ──────────────────────────────────────
|
|
21
|
+
/**
|
|
22
|
+
* Keyword extraction from golden comments for matching against Judges findings.
|
|
23
|
+
* We match on semantic overlap — does a finding's description/message cover
|
|
24
|
+
* the same concern as the golden comment?
|
|
25
|
+
*/
|
|
26
|
+
const ISSUE_KEYWORDS = {
|
|
27
|
+
// Bug patterns
|
|
28
|
+
"null reference": ["null", "undefined", "none", "nil", "attributeerror", "typeerror"],
|
|
29
|
+
"race condition": ["race", "concurrent", "lock", "deadlock", "mutex", "thread"],
|
|
30
|
+
"type error": ["type", "typeerror", "cast", "coercion", "conversion"],
|
|
31
|
+
"off-by-one": ["off-by-one", "boundary", "fence", "index", "slice"],
|
|
32
|
+
negative: ["negative", "minus", "underflow"],
|
|
33
|
+
// Security
|
|
34
|
+
injection: ["inject", "sql", "xss", "command", "eval"],
|
|
35
|
+
authentication: ["auth", "credential", "password", "token", "session", "oauth"],
|
|
36
|
+
authorization: ["permission", "access", "privilege", "role", "scope"],
|
|
37
|
+
secret: ["secret", "key", "hardcoded", "credential", "password"],
|
|
38
|
+
csrf: ["csrf", "cross-site", "forgery"],
|
|
39
|
+
// Code quality
|
|
40
|
+
"error handling": ["error", "exception", "catch", "throw", "try", "unhandled"],
|
|
41
|
+
validation: ["valid", "sanitize", "check", "assert", "input"],
|
|
42
|
+
memory: ["memory", "leak", "gc", "buffer", "overflow"],
|
|
43
|
+
performance: ["performance", "slow", "latency", "n+1", "query", "cache"],
|
|
44
|
+
deprecated: ["deprecated", "obsolete", "legacy"],
|
|
45
|
+
};
|
|
46
|
+
function normalizeText(text) {
|
|
47
|
+
return text
|
|
48
|
+
.toLowerCase()
|
|
49
|
+
.replace(/[^a-z0-9\s]/g, " ")
|
|
50
|
+
.replace(/\s+/g, " ")
|
|
51
|
+
.trim();
|
|
52
|
+
}
|
|
53
|
+
function extractKeyTerms(text) {
|
|
54
|
+
const normalized = normalizeText(text);
|
|
55
|
+
const terms = new Set();
|
|
56
|
+
// Add individual words
|
|
57
|
+
for (const word of normalized.split(" ")) {
|
|
58
|
+
if (word.length > 3)
|
|
59
|
+
terms.add(word);
|
|
60
|
+
}
|
|
61
|
+
// Add matched keyword categories
|
|
62
|
+
for (const [_category, keywords] of Object.entries(ISSUE_KEYWORDS)) {
|
|
63
|
+
for (const kw of keywords) {
|
|
64
|
+
if (normalized.includes(kw)) {
|
|
65
|
+
terms.add(kw);
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
return terms;
|
|
70
|
+
}
|
|
71
|
+
function computeSimilarity(goldenText, findingText) {
|
|
72
|
+
const goldenTerms = extractKeyTerms(goldenText);
|
|
73
|
+
const findingTerms = extractKeyTerms(findingText);
|
|
74
|
+
if (goldenTerms.size === 0 || findingTerms.size === 0)
|
|
75
|
+
return 0;
|
|
76
|
+
let overlap = 0;
|
|
77
|
+
for (const term of goldenTerms) {
|
|
78
|
+
if (findingTerms.has(term))
|
|
79
|
+
overlap++;
|
|
80
|
+
}
|
|
81
|
+
// Jaccard-style similarity with bias toward golden coverage
|
|
82
|
+
const goldenCoverage = overlap / goldenTerms.size;
|
|
83
|
+
const findingCoverage = overlap / findingTerms.size;
|
|
84
|
+
// Weight golden coverage more — we care more about whether we caught
|
|
85
|
+
// the golden issue than about how many extra words we generated
|
|
86
|
+
return goldenCoverage * 0.7 + findingCoverage * 0.3;
|
|
87
|
+
}
|
|
88
|
+
const MATCH_THRESHOLD = 0.25;
|
|
89
|
+
function matchFindingsToGolden(goldenComments, findings) {
|
|
90
|
+
const matches = [];
|
|
91
|
+
const missed = [];
|
|
92
|
+
const matchedFindingIndices = new Set();
|
|
93
|
+
for (const gc of goldenComments) {
|
|
94
|
+
let bestScore = 0;
|
|
95
|
+
let bestFindingIdx = -1;
|
|
96
|
+
for (let fi = 0; fi < findings.length; fi++) {
|
|
97
|
+
if (matchedFindingIndices.has(fi))
|
|
98
|
+
continue;
|
|
99
|
+
const f = findings[fi];
|
|
100
|
+
const findingText = [f.description, f.recommendation ?? ""].join(" ");
|
|
101
|
+
const score = computeSimilarity(gc.comment, findingText);
|
|
102
|
+
if (score > bestScore) {
|
|
103
|
+
bestScore = score;
|
|
104
|
+
bestFindingIdx = fi;
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
if (bestScore >= MATCH_THRESHOLD && bestFindingIdx >= 0) {
|
|
108
|
+
matchedFindingIndices.add(bestFindingIdx);
|
|
109
|
+
matches.push({
|
|
110
|
+
golden: gc.comment.slice(0, 100),
|
|
111
|
+
finding: findings[bestFindingIdx].ruleId,
|
|
112
|
+
severity: gc.severity,
|
|
113
|
+
});
|
|
114
|
+
}
|
|
115
|
+
else {
|
|
116
|
+
missed.push(gc.comment.slice(0, 100));
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
// FPs = findings not matched to any golden comment
|
|
120
|
+
const fps = findings.length - matchedFindingIndices.size;
|
|
121
|
+
return { matches, missed, fps };
|
|
122
|
+
}
|
|
123
|
+
// ─── Data Loading ───────────────────────────────────────────────────────────
|
|
124
|
+
const REPO_LANGUAGES = {
|
|
125
|
+
sentry: "python",
|
|
126
|
+
grafana: "go",
|
|
127
|
+
cal_dot_com: "typescript",
|
|
128
|
+
discourse: "ruby",
|
|
129
|
+
keycloak: "java",
|
|
130
|
+
};
|
|
131
|
+
export function loadGoldenComments(repoPath) {
|
|
132
|
+
const goldenDir = join(repoPath, "offline", "golden_comments");
|
|
133
|
+
const prsByRepo = new Map();
|
|
134
|
+
const files = readdirSync(goldenDir).filter((f) => f.endsWith(".json"));
|
|
135
|
+
for (const file of files) {
|
|
136
|
+
const repoName = file.replace(".json", "");
|
|
137
|
+
const raw = readFileSync(join(goldenDir, file), "utf-8");
|
|
138
|
+
const prs = JSON.parse(raw);
|
|
139
|
+
prsByRepo.set(repoName, prs);
|
|
140
|
+
}
|
|
141
|
+
return prsByRepo;
|
|
142
|
+
}
|
|
143
|
+
// ─── PR Diff Retrieval ──────────────────────────────────────────────────────
|
|
144
|
+
/**
|
|
145
|
+
* Fetch the unified diff for a PR from GitHub.
|
|
146
|
+
* Works for public repos without authentication.
|
|
147
|
+
*/
|
|
148
|
+
function fetchPrDiff(prUrl) {
|
|
149
|
+
const diffUrl = prUrl.replace(/\/?$/, ".diff");
|
|
150
|
+
try {
|
|
151
|
+
const result = execSync(`node -e "fetch('${diffUrl}').then(r=>r.text()).then(t=>process.stdout.write(t))"`, {
|
|
152
|
+
stdio: "pipe",
|
|
153
|
+
timeout: 30_000,
|
|
154
|
+
});
|
|
155
|
+
const diff = result.toString();
|
|
156
|
+
return diff.length > 100 ? diff : undefined;
|
|
157
|
+
}
|
|
158
|
+
catch {
|
|
159
|
+
return undefined;
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
/**
|
|
163
|
+
* Extract changed file contents from a unified diff.
|
|
164
|
+
* Returns the full diff hunks (added, removed, and context lines) for each
|
|
165
|
+
* file so the LLM sees the complete "before → after" narrative.
|
|
166
|
+
*/
|
|
167
|
+
function extractFilesFromDiff(diff) {
|
|
168
|
+
const files = [];
|
|
169
|
+
const fileSections = diff.split(/^diff --git /m).slice(1);
|
|
170
|
+
for (const section of fileSections) {
|
|
171
|
+
// Extract file path from "a/path b/path"
|
|
172
|
+
const pathMatch = section.match(/^a\/(.*?) b\//);
|
|
173
|
+
if (!pathMatch)
|
|
174
|
+
continue;
|
|
175
|
+
const filePath = pathMatch[1];
|
|
176
|
+
// Skip non-code files
|
|
177
|
+
const ext = filePath.split(".").pop()?.toLowerCase() ?? "";
|
|
178
|
+
const langMap = {
|
|
179
|
+
ts: "typescript",
|
|
180
|
+
tsx: "typescript",
|
|
181
|
+
js: "javascript",
|
|
182
|
+
jsx: "javascript",
|
|
183
|
+
py: "python",
|
|
184
|
+
go: "go",
|
|
185
|
+
java: "java",
|
|
186
|
+
rb: "ruby",
|
|
187
|
+
rs: "rust",
|
|
188
|
+
cs: "csharp",
|
|
189
|
+
php: "php",
|
|
190
|
+
kt: "kotlin",
|
|
191
|
+
swift: "swift",
|
|
192
|
+
};
|
|
193
|
+
const language = langMap[ext];
|
|
194
|
+
if (!language)
|
|
195
|
+
continue;
|
|
196
|
+
// Extract full hunk content — include context lines, removed lines, and
|
|
197
|
+
// added lines so the LLM can see the complete change narrative.
|
|
198
|
+
const lines = section.split("\n");
|
|
199
|
+
const hunkLines = [];
|
|
200
|
+
let inHunk = false;
|
|
201
|
+
for (const line of lines) {
|
|
202
|
+
// Skip diff headers (---, +++, index, etc.)
|
|
203
|
+
if (line.startsWith("---") || line.startsWith("+++") || line.startsWith("index "))
|
|
204
|
+
continue;
|
|
205
|
+
// Hunk header — include it for line number context
|
|
206
|
+
if (line.startsWith("@@")) {
|
|
207
|
+
inHunk = true;
|
|
208
|
+
hunkLines.push(line);
|
|
209
|
+
continue;
|
|
210
|
+
}
|
|
211
|
+
if (inHunk) {
|
|
212
|
+
// Context line (no prefix), added line (+), or removed line (-)
|
|
213
|
+
if (line.startsWith("+") || line.startsWith("-") || line.startsWith(" ") || line === "") {
|
|
214
|
+
hunkLines.push(line);
|
|
215
|
+
}
|
|
216
|
+
else if (line.startsWith("\\")) {
|
|
217
|
+
// "" — skip
|
|
218
|
+
continue;
|
|
219
|
+
}
|
|
220
|
+
else {
|
|
221
|
+
// End of hunk content
|
|
222
|
+
inHunk = false;
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
if (hunkLines.length > 0) {
|
|
227
|
+
files.push({ path: filePath, content: hunkLines.join("\n"), language });
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
return files;
|
|
231
|
+
}
|
|
232
|
+
/**
|
|
233
|
+
* Convert a Martian PR with golden comments into BenchmarkCase format
|
|
234
|
+
* for use in the LLM benchmark pipeline.
|
|
235
|
+
*
|
|
236
|
+
* Each golden comment becomes an expected finding. The PR diff provides
|
|
237
|
+
* the actual code to evaluate. The LLM judge determines if its review
|
|
238
|
+
* catches the same issues the human reviewer identified.
|
|
239
|
+
*/
|
|
240
|
+
export function convertPrToBenchmarkCase(pr, repoName, diff) {
|
|
241
|
+
const language = REPO_LANGUAGES[repoName] ?? "typescript";
|
|
242
|
+
// Build expected rule IDs from golden comments using improved prefix inference
|
|
243
|
+
const expectedRuleIds = [];
|
|
244
|
+
const acceptablePrefixes = new Set([
|
|
245
|
+
"CYBER",
|
|
246
|
+
"SEC",
|
|
247
|
+
"AUTH",
|
|
248
|
+
"DATA",
|
|
249
|
+
"ERR",
|
|
250
|
+
"CONC",
|
|
251
|
+
"DB",
|
|
252
|
+
"PERF",
|
|
253
|
+
"CFG",
|
|
254
|
+
"REL",
|
|
255
|
+
"LOGIC",
|
|
256
|
+
"MAINT",
|
|
257
|
+
"FW",
|
|
258
|
+
"RATE",
|
|
259
|
+
"STRUCT",
|
|
260
|
+
"OBS",
|
|
261
|
+
"TEST",
|
|
262
|
+
"DOC",
|
|
263
|
+
"COMPAT",
|
|
264
|
+
]);
|
|
265
|
+
for (let i = 0; i < pr.comments.length; i++) {
|
|
266
|
+
const gc = pr.comments[i];
|
|
267
|
+
const prefix = inferPrefixFromComment(gc.comment, gc.severity);
|
|
268
|
+
expectedRuleIds.push(`${prefix}-${String(i + 1).padStart(3, "0")}`);
|
|
269
|
+
}
|
|
270
|
+
let code;
|
|
271
|
+
let additionalFiles;
|
|
272
|
+
if (diff) {
|
|
273
|
+
const files = extractFilesFromDiff(diff);
|
|
274
|
+
if (files.length === 0)
|
|
275
|
+
return undefined;
|
|
276
|
+
// Sort by content length — largest file is primary
|
|
277
|
+
files.sort((a, b) => b.content.length - a.content.length);
|
|
278
|
+
// Primary file gets up to 16KB
|
|
279
|
+
code = files[0].content;
|
|
280
|
+
if (code.length > 16_000) {
|
|
281
|
+
code = code.slice(0, 16_000) + "\n// ... truncated for benchmark";
|
|
282
|
+
}
|
|
283
|
+
// Additional files go into the multi-file field (up to 12KB each)
|
|
284
|
+
if (files.length > 1) {
|
|
285
|
+
additionalFiles = files.slice(1, 6).map((f) => ({
|
|
286
|
+
path: f.path,
|
|
287
|
+
content: f.content.length > 12_000 ? f.content.slice(0, 12_000) + "\n// ... truncated" : f.content,
|
|
288
|
+
language: f.language,
|
|
289
|
+
}));
|
|
290
|
+
}
|
|
291
|
+
// Prepend PR context header so the LLM knows this is a code review task
|
|
292
|
+
code = [
|
|
293
|
+
`// ===== PR CODE REVIEW: ${pr.pr_title} =====`,
|
|
294
|
+
`// Repository: ${repoName} | Language: ${language}`,
|
|
295
|
+
`// File: ${files[0].path}`,
|
|
296
|
+
`// This is a unified diff — lines starting with + are additions, - are removals, @@ are hunk headers`,
|
|
297
|
+
`// Review this code change for bugs, security issues, and quality problems.`,
|
|
298
|
+
"",
|
|
299
|
+
code,
|
|
300
|
+
].join("\n");
|
|
301
|
+
}
|
|
302
|
+
else {
|
|
303
|
+
// Fallback: embed golden comments as context for LLM evaluation
|
|
304
|
+
const lines = [`// PR: ${pr.pr_title}`, `// Review the following changes for issues:`];
|
|
305
|
+
for (const gc of pr.comments) {
|
|
306
|
+
lines.push(`// Known issue [${gc.severity}]: ${gc.comment}`);
|
|
307
|
+
}
|
|
308
|
+
code = lines.join("\n");
|
|
309
|
+
}
|
|
310
|
+
const benchCase = {
|
|
311
|
+
id: `martian-${repoName}-${pr.pr_title
|
|
312
|
+
.slice(0, 40)
|
|
313
|
+
.replace(/[^a-zA-Z0-9]/g, "-")
|
|
314
|
+
.toLowerCase()}`,
|
|
315
|
+
description: `Martian Code Review: ${pr.pr_title} (${repoName}, ${pr.comments.length} golden comments)`,
|
|
316
|
+
language,
|
|
317
|
+
code,
|
|
318
|
+
expectedRuleIds,
|
|
319
|
+
acceptablePrefixes: [...acceptablePrefixes],
|
|
320
|
+
category: `code-review-${repoName}`,
|
|
321
|
+
difficulty: pr.comments.some((c) => c.severity === "Critical" || c.severity === "High") ? "hard" : "medium",
|
|
322
|
+
aiSource: "martian-code-review-benchmark",
|
|
323
|
+
};
|
|
324
|
+
// Attach additional files for multi-file evaluation context
|
|
325
|
+
if (additionalFiles && additionalFiles.length > 0) {
|
|
326
|
+
benchCase.files = additionalFiles;
|
|
327
|
+
}
|
|
328
|
+
return benchCase;
|
|
329
|
+
}
|
|
330
|
+
/**
|
|
331
|
+
* Infer the most likely judge prefix from a golden comment description.
|
|
332
|
+
*
|
|
333
|
+
* Uses weighted pattern matching — each regex contributes a score per prefix,
|
|
334
|
+
* and the prefix with the highest total wins. This handles comments that span
|
|
335
|
+
* multiple domains (e.g. "race condition causes null pointer" → CONC > ERR).
|
|
336
|
+
*/
|
|
337
|
+
function inferPrefixFromComment(comment, severity) {
|
|
338
|
+
const lower = comment.toLowerCase();
|
|
339
|
+
const scores = {};
|
|
340
|
+
function add(prefix, weight) {
|
|
341
|
+
scores[prefix] = (scores[prefix] ?? 0) + weight;
|
|
342
|
+
}
|
|
343
|
+
// Concurrency / race conditions
|
|
344
|
+
if (/race\s*condition|data\s*race/.test(lower))
|
|
345
|
+
add("CONC", 3);
|
|
346
|
+
if (/deadlock|mutex|lock\s*(acquisit|order|contention)/.test(lower))
|
|
347
|
+
add("CONC", 3);
|
|
348
|
+
if (/concurrent|thread.?safe|atomic|synchroniz/.test(lower))
|
|
349
|
+
add("CONC", 2);
|
|
350
|
+
if (/parallel|interleav/.test(lower))
|
|
351
|
+
add("CONC", 1);
|
|
352
|
+
// Database
|
|
353
|
+
if (/sql\s*inject|query\s*inject/.test(lower))
|
|
354
|
+
add("DB", 3);
|
|
355
|
+
if (/n\+1|n \+ 1/.test(lower))
|
|
356
|
+
add("DB", 3);
|
|
357
|
+
if (/select\s*\*|query|queryset/.test(lower))
|
|
358
|
+
add("DB", 2);
|
|
359
|
+
if (/database|transaction|rollback|commit/.test(lower))
|
|
360
|
+
add("DB", 2);
|
|
361
|
+
if (/migration|schema|index|join|subquery/.test(lower))
|
|
362
|
+
add("DB", 1);
|
|
363
|
+
if (/paginator|cursor|offset|limit/.test(lower))
|
|
364
|
+
add("DB", 1);
|
|
365
|
+
// Authentication / Authorization
|
|
366
|
+
if (/oauth|csrf|session\s*(secret|fixation|hijack)/.test(lower))
|
|
367
|
+
add("AUTH", 3);
|
|
368
|
+
if (/authenticat|credential|password|passkey/.test(lower))
|
|
369
|
+
add("AUTH", 2);
|
|
370
|
+
if (/authoriz|permission|privilege|role|scope|access\s*control/.test(lower))
|
|
371
|
+
add("AUTH", 2);
|
|
372
|
+
if (/token(?!\s*expir)/.test(lower))
|
|
373
|
+
add("AUTH", 1);
|
|
374
|
+
// Cybersecurity / Injection
|
|
375
|
+
if (/inject(?!ion\s*depend)|xss|cross.?site|command\s*inject/.test(lower))
|
|
376
|
+
add("CYBER", 3);
|
|
377
|
+
if (/deserialization|prototype\s*pollut|path\s*traversal/.test(lower))
|
|
378
|
+
add("CYBER", 3);
|
|
379
|
+
if (/ssrf|open\s*redirect|rce|remote\s*code/.test(lower))
|
|
380
|
+
add("CYBER", 3);
|
|
381
|
+
if (/sanitiz|escap(?!e\s*hatch)|encod/.test(lower))
|
|
382
|
+
add("CYBER", 1);
|
|
383
|
+
// Configuration / Secrets
|
|
384
|
+
if (/hardcod|hard.coded|secret\s*key/.test(lower))
|
|
385
|
+
add("CFG", 3);
|
|
386
|
+
if (/api.?key|config\s*(missing|invalid|hardcod)/.test(lower))
|
|
387
|
+
add("CFG", 2);
|
|
388
|
+
if (/environment\s*variable|\.env|secret/.test(lower))
|
|
389
|
+
add("CFG", 1);
|
|
390
|
+
// Error handling / Null safety
|
|
391
|
+
if (/null\s*(reference|pointer|dereference)|none\s*type|undefined\s*is\s*not/.test(lower))
|
|
392
|
+
add("ERR", 3);
|
|
393
|
+
if (/attributeerror|typeerror|keyerror|indexerror/.test(lower))
|
|
394
|
+
add("ERR", 3);
|
|
395
|
+
if (/unhandled\s*(error|exception|reject)/.test(lower))
|
|
396
|
+
add("ERR", 3);
|
|
397
|
+
if (/null|undefined|nil|\.?none\b/.test(lower))
|
|
398
|
+
add("ERR", 2);
|
|
399
|
+
if (/error\s*handl|exception|try.?catch|throw/.test(lower))
|
|
400
|
+
add("ERR", 2);
|
|
401
|
+
if (/crash|abort|panic|fault/.test(lower))
|
|
402
|
+
add("ERR", 1);
|
|
403
|
+
if (/missing\s*check|guard\s*clause/.test(lower))
|
|
404
|
+
add("ERR", 1);
|
|
405
|
+
// Security (general)
|
|
406
|
+
if (/vulnerab|exploit|attack\s*surface/.test(lower))
|
|
407
|
+
add("SEC", 2);
|
|
408
|
+
if (/valid(?:at(?:e|ion))|sanitiz|input\s*check/.test(lower))
|
|
409
|
+
add("SEC", 2);
|
|
410
|
+
if (/unsafe|insecure|taint/.test(lower))
|
|
411
|
+
add("SEC", 1);
|
|
412
|
+
// Performance
|
|
413
|
+
if (/performance|latency|throughput|bottleneck/.test(lower))
|
|
414
|
+
add("PERF", 2);
|
|
415
|
+
if (/slow|memory\s*leak|cache\s*(miss|invalid)/.test(lower))
|
|
416
|
+
add("PERF", 2);
|
|
417
|
+
if (/O\(n\^?2\)|quadratic|exponential/.test(lower))
|
|
418
|
+
add("PERF", 2);
|
|
419
|
+
if (/blocking|synchronous.*event\s*loop/.test(lower))
|
|
420
|
+
add("PERF", 1);
|
|
421
|
+
// Logic / correctness
|
|
422
|
+
if (/isinstance|subclass|type\s*check|type\s*error/.test(lower))
|
|
423
|
+
add("LOGIC", 2);
|
|
424
|
+
if (/wrong\s*(key|type|value|order|result)/.test(lower))
|
|
425
|
+
add("LOGIC", 2);
|
|
426
|
+
if (/off.by.one|fence\s*post|boundary/.test(lower))
|
|
427
|
+
add("LOGIC", 2);
|
|
428
|
+
if (/logic|incorrect|semantic/.test(lower))
|
|
429
|
+
add("LOGIC", 1);
|
|
430
|
+
if (/always\s*(true|false)|never\s*(true|false|reach)/.test(lower))
|
|
431
|
+
add("LOGIC", 2);
|
|
432
|
+
if (/negative\s*(slice|index|offset)/.test(lower))
|
|
433
|
+
add("LOGIC", 2);
|
|
434
|
+
// Observability / Monitoring
|
|
435
|
+
if (/metric|monitor|observ|telemetry|tracing/.test(lower))
|
|
436
|
+
add("OBS", 2);
|
|
437
|
+
if (/logg?ing|log\s*(level|format|statement)/.test(lower))
|
|
438
|
+
add("OBS", 1);
|
|
439
|
+
if (/alert|dashboard|instrument/.test(lower))
|
|
440
|
+
add("OBS", 1);
|
|
441
|
+
// Testing
|
|
442
|
+
if (/test\s*(flaky|brittle|fragile|unreliable)/.test(lower))
|
|
443
|
+
add("TEST", 3);
|
|
444
|
+
if (/sleep\s*in\s*test|time\.sleep|flaky/.test(lower))
|
|
445
|
+
add("TEST", 2);
|
|
446
|
+
if (/mock|stub|fixture|assert|test\s*coverage/.test(lower))
|
|
447
|
+
add("TEST", 1);
|
|
448
|
+
if (/monkeypatch|test_/.test(lower))
|
|
449
|
+
add("TEST", 1);
|
|
450
|
+
// Maintainability
|
|
451
|
+
if (/magic\s*number|duplicate|copy.?paste|dead\s*code/.test(lower))
|
|
452
|
+
add("MAINT", 2);
|
|
453
|
+
if (/complex|readab|refactor|techni?cal\s*debt/.test(lower))
|
|
454
|
+
add("MAINT", 1);
|
|
455
|
+
if (/naming|misleading|confusing|unclear/.test(lower))
|
|
456
|
+
add("MAINT", 1);
|
|
457
|
+
// Documentation
|
|
458
|
+
if (/docstring|comment|documentation|readme/.test(lower))
|
|
459
|
+
add("DOC", 2);
|
|
460
|
+
if (/typo|spelling|rename/.test(lower))
|
|
461
|
+
add("DOC", 1);
|
|
462
|
+
if (/jsdoc|javadoc|pydoc|rustdoc/.test(lower))
|
|
463
|
+
add("DOC", 1);
|
|
464
|
+
// Compatibility
|
|
465
|
+
if (/breaking\s*change|backwards?\s*compat|deprecat/.test(lower))
|
|
466
|
+
add("COMPAT", 2);
|
|
467
|
+
if (/migration|version|compat/.test(lower))
|
|
468
|
+
add("COMPAT", 1);
|
|
469
|
+
// Reliability
|
|
470
|
+
if (/timeout|retry|circuit.?break|failover/.test(lower))
|
|
471
|
+
add("REL", 2);
|
|
472
|
+
if (/resilien|graceful|recovery|shutdown/.test(lower))
|
|
473
|
+
add("REL", 1);
|
|
474
|
+
if (/terminate|kill|signal|process/.test(lower))
|
|
475
|
+
add("REL", 1);
|
|
476
|
+
// Framework safety
|
|
477
|
+
if (/middleware|express|django|flask|spring/.test(lower))
|
|
478
|
+
add("FW", 1);
|
|
479
|
+
if (/helmet|cors|csrf\s*middleware/.test(lower))
|
|
480
|
+
add("FW", 2);
|
|
481
|
+
// Rate limiting
|
|
482
|
+
if (/rate\s*limit|throttl|brute.?force/.test(lower))
|
|
483
|
+
add("RATE", 2);
|
|
484
|
+
if (/ddos|denial.?of.?service|resource\s*exhaust/.test(lower))
|
|
485
|
+
add("RATE", 1);
|
|
486
|
+
// Pick highest-scoring prefix
|
|
487
|
+
const sorted = Object.entries(scores).sort((a, b) => b[1] - a[1]);
|
|
488
|
+
if (sorted.length > 0 && sorted[0][1] > 0) {
|
|
489
|
+
return sorted[0][0];
|
|
490
|
+
}
|
|
491
|
+
// Default based on severity
|
|
492
|
+
if (severity === "Critical" || severity === "High")
|
|
493
|
+
return "SEC";
|
|
494
|
+
return "MAINT";
|
|
495
|
+
}
|
|
496
|
+
/**
|
|
497
|
+
* Convert all Martian golden comments into BenchmarkCase[] for LLM evaluation.
|
|
498
|
+
* Fetches actual PR diffs from GitHub when possible.
|
|
499
|
+
*/
|
|
500
|
+
export function convertAllToBenchmarkCases(repoPath) {
|
|
501
|
+
const prsByRepo = loadGoldenComments(repoPath);
|
|
502
|
+
const cases = [];
|
|
503
|
+
for (const [repoName, prs] of prsByRepo) {
|
|
504
|
+
for (const pr of prs) {
|
|
505
|
+
// Try to fetch the actual diff
|
|
506
|
+
const diff = fetchPrDiff(pr.url);
|
|
507
|
+
const benchCase = convertPrToBenchmarkCase(pr, repoName, diff);
|
|
508
|
+
if (benchCase)
|
|
509
|
+
cases.push(benchCase);
|
|
510
|
+
}
|
|
511
|
+
}
|
|
512
|
+
return cases;
|
|
513
|
+
}
|
|
514
|
+
/**
|
|
515
|
+
* Synthesise representative code from the golden comment descriptions.
|
|
516
|
+
* Fallback when PR diffs cannot be fetched.
|
|
517
|
+
*/
|
|
518
|
+
function synthesizeCodeFromGolden(pr, language) {
|
|
519
|
+
const lines = [];
|
|
520
|
+
lines.push(`// PR: ${pr.pr_title}`);
|
|
521
|
+
lines.push(`// Source: ${pr.url}`);
|
|
522
|
+
lines.push(`// Language: ${language}`);
|
|
523
|
+
lines.push("");
|
|
524
|
+
// Embed the golden comment descriptions as code-like patterns
|
|
525
|
+
// that Judges should be able to analyze
|
|
526
|
+
for (let i = 0; i < pr.comments.length; i++) {
|
|
527
|
+
const gc = pr.comments[i];
|
|
528
|
+
lines.push(`// Issue ${i + 1} [${gc.severity}]: ${gc.comment}`);
|
|
529
|
+
}
|
|
530
|
+
lines.push("");
|
|
531
|
+
lines.push("// (Synthetic context for benchmark matching)");
|
|
532
|
+
return lines.join("\n");
|
|
533
|
+
}
|
|
534
|
+
// ─── Evaluation ─────────────────────────────────────────────────────────────
|
|
535
|
+
function evaluatePr(pr, repoName) {
|
|
536
|
+
const language = REPO_LANGUAGES[repoName] ?? "typescript";
|
|
537
|
+
const code = synthesizeCodeFromGolden(pr, language);
|
|
538
|
+
// Run tribunal evaluation
|
|
539
|
+
const verdict = evaluateWithTribunal(code, language);
|
|
540
|
+
const findings = verdict.findings;
|
|
541
|
+
// Match findings against golden comments
|
|
542
|
+
const { matches, missed, fps } = matchFindingsToGolden(pr.comments, findings);
|
|
543
|
+
const tp = matches.length;
|
|
544
|
+
const fn = missed.length;
|
|
545
|
+
const precision = tp + fps > 0 ? tp / (tp + fps) : 1;
|
|
546
|
+
const recall = tp + fn > 0 ? tp / (tp + fn) : 1;
|
|
547
|
+
return {
|
|
548
|
+
prTitle: pr.pr_title,
|
|
549
|
+
prUrl: pr.url,
|
|
550
|
+
sourceRepo: repoName,
|
|
551
|
+
language,
|
|
552
|
+
goldenComments: pr.comments.length,
|
|
553
|
+
matchedComments: tp,
|
|
554
|
+
unmatchedComments: fn,
|
|
555
|
+
falsePositives: fps,
|
|
556
|
+
precision,
|
|
557
|
+
recall,
|
|
558
|
+
findings,
|
|
559
|
+
matches,
|
|
560
|
+
missed,
|
|
561
|
+
};
|
|
562
|
+
}
|
|
563
|
+
// ─── Aggregate Results ──────────────────────────────────────────────────────
|
|
564
|
+
function computeMartianMetrics(results) {
|
|
565
|
+
let totalTP = 0;
|
|
566
|
+
let totalFP = 0;
|
|
567
|
+
let totalFN = 0;
|
|
568
|
+
let detected = 0;
|
|
569
|
+
const perRepo = {};
|
|
570
|
+
const perSeverity = {};
|
|
571
|
+
for (const r of results) {
|
|
572
|
+
totalTP += r.matchedComments;
|
|
573
|
+
totalFP += r.falsePositives;
|
|
574
|
+
totalFN += r.unmatchedComments;
|
|
575
|
+
if (r.matchedComments > 0)
|
|
576
|
+
detected++;
|
|
577
|
+
// Per-repo
|
|
578
|
+
if (!perRepo[r.sourceRepo])
|
|
579
|
+
perRepo[r.sourceRepo] = { total: 0, detected: 0, rate: 0 };
|
|
580
|
+
perRepo[r.sourceRepo].total++;
|
|
581
|
+
if (r.matchedComments > 0)
|
|
582
|
+
perRepo[r.sourceRepo].detected++;
|
|
583
|
+
// Per-severity from matches
|
|
584
|
+
for (const m of r.matches) {
|
|
585
|
+
if (!perSeverity[m.severity])
|
|
586
|
+
perSeverity[m.severity] = { total: 0, detected: 0, rate: 0 };
|
|
587
|
+
perSeverity[m.severity].total++;
|
|
588
|
+
perSeverity[m.severity].detected++;
|
|
589
|
+
}
|
|
590
|
+
for (const _missed of r.missed) {
|
|
591
|
+
// We don't have severity for missed items easily, put under "Unknown"
|
|
592
|
+
if (!perSeverity["Missed"])
|
|
593
|
+
perSeverity["Missed"] = { total: 0, detected: 0, rate: 0 };
|
|
594
|
+
perSeverity["Missed"].total++;
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
// Compute rates
|
|
598
|
+
for (const entry of Object.values(perRepo)) {
|
|
599
|
+
entry.rate = entry.total > 0 ? entry.detected / entry.total : 0;
|
|
600
|
+
}
|
|
601
|
+
for (const entry of Object.values(perSeverity)) {
|
|
602
|
+
entry.rate = entry.total > 0 ? entry.detected / entry.total : 0;
|
|
603
|
+
}
|
|
604
|
+
const precision = totalTP + totalFP > 0 ? totalTP / (totalTP + totalFP) : 1;
|
|
605
|
+
const recall = totalTP + totalFN > 0 ? totalTP / (totalTP + totalFN) : 1;
|
|
606
|
+
const f1 = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
|
|
607
|
+
const detectionRate = results.length > 0 ? detected / results.length : 0;
|
|
608
|
+
return { precision, recall, f1, detectionRate, perRepo, perSeverity };
|
|
609
|
+
}
|
|
610
|
+
// ─── Adapter Registration ───────────────────────────────────────────────────
|
|
611
|
+
function readJudgesVersion() {
|
|
612
|
+
try {
|
|
613
|
+
const pkg = JSON.parse(readFileSync(resolve("package.json"), "utf-8"));
|
|
614
|
+
return pkg.version ?? "unknown";
|
|
615
|
+
}
|
|
616
|
+
catch {
|
|
617
|
+
return "unknown";
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
const martianAdapter = {
|
|
621
|
+
suiteId: "martian-code-review",
|
|
622
|
+
suiteName: "Martian Code Review Bench",
|
|
623
|
+
suiteUrl: "https://github.com/withmartian/code-review-benchmark",
|
|
624
|
+
defaultRepoPath: "../code-review-benchmark",
|
|
625
|
+
description: "50 PRs from 5 open-source projects with human-curated golden comments (Python, Go, TS, Ruby, Java)",
|
|
626
|
+
validate(repoPath) {
|
|
627
|
+
if (!existsSync(repoPath)) {
|
|
628
|
+
return `Repo not found at ${repoPath}. Clone with: git clone https://github.com/withmartian/code-review-benchmark.git`;
|
|
629
|
+
}
|
|
630
|
+
const goldenDir = join(repoPath, "offline", "golden_comments");
|
|
631
|
+
if (!existsSync(goldenDir)) {
|
|
632
|
+
return `Golden comments not found at ${goldenDir}. Is this the correct repo?`;
|
|
633
|
+
}
|
|
634
|
+
return undefined;
|
|
635
|
+
},
|
|
636
|
+
run(config) {
|
|
637
|
+
const prsByRepo = loadGoldenComments(config.repoPath);
|
|
638
|
+
let totalPrs = 0;
|
|
639
|
+
for (const prs of prsByRepo.values())
|
|
640
|
+
totalPrs += prs.length;
|
|
641
|
+
console.log(` Loaded ${totalPrs} PRs across ${prsByRepo.size} repos`);
|
|
642
|
+
const allResults = [];
|
|
643
|
+
let idx = 0;
|
|
644
|
+
for (const [repoName, prs] of prsByRepo) {
|
|
645
|
+
for (const pr of prs) {
|
|
646
|
+
idx++;
|
|
647
|
+
// Filter by single item if specified
|
|
648
|
+
if (config.singleItem && !pr.url.includes(config.singleItem) && pr.pr_title !== config.singleItem) {
|
|
649
|
+
continue;
|
|
650
|
+
}
|
|
651
|
+
const pct = Math.round((idx / totalPrs) * 100);
|
|
652
|
+
process.stdout.write(`\r [${idx}/${totalPrs}] ${pct}% ${repoName}: ${pr.pr_title.slice(0, 50)}`);
|
|
653
|
+
const result = evaluatePr(pr, repoName);
|
|
654
|
+
allResults.push(result);
|
|
655
|
+
const icon = result.matchedComments > 0 ? "✅" : "❌";
|
|
656
|
+
process.stdout.write(`\r [${idx}/${totalPrs}] ${pct}% ${icon} ${repoName}: ${pr.pr_title.slice(0, 50)} \n`);
|
|
657
|
+
}
|
|
658
|
+
}
|
|
659
|
+
const metrics = computeMartianMetrics(allResults);
|
|
660
|
+
// Merge perRepo + perSeverity into perCategory
|
|
661
|
+
const perCategory = {};
|
|
662
|
+
for (const [k, v] of Object.entries(metrics.perRepo)) {
|
|
663
|
+
perCategory[`repo:${k}`] = v;
|
|
664
|
+
}
|
|
665
|
+
for (const [k, v] of Object.entries(metrics.perSeverity)) {
|
|
666
|
+
perCategory[`severity:${k}`] = v;
|
|
667
|
+
}
|
|
668
|
+
return {
|
|
669
|
+
suiteId: "martian-code-review",
|
|
670
|
+
suiteName: "Martian Code Review Bench",
|
|
671
|
+
suiteUrl: "https://github.com/withmartian/code-review-benchmark",
|
|
672
|
+
timestamp: new Date().toISOString(),
|
|
673
|
+
judgesVersion: readJudgesVersion(),
|
|
674
|
+
totalItems: totalPrs,
|
|
675
|
+
evaluatedItems: allResults.length,
|
|
676
|
+
skippedItems: totalPrs - allResults.length,
|
|
677
|
+
precision: metrics.precision,
|
|
678
|
+
recall: metrics.recall,
|
|
679
|
+
f1Score: metrics.f1,
|
|
680
|
+
detectionRate: metrics.detectionRate,
|
|
681
|
+
truePositives: allResults.reduce((s, r) => s + r.matchedComments, 0),
|
|
682
|
+
falsePositives: allResults.reduce((s, r) => s + r.falsePositives, 0),
|
|
683
|
+
falseNegatives: allResults.reduce((s, r) => s + r.unmatchedComments, 0),
|
|
684
|
+
perCategory,
|
|
685
|
+
rawData: allResults,
|
|
686
|
+
};
|
|
687
|
+
},
|
|
688
|
+
};
|
|
689
|
+
registerBenchmarkAdapter(martianAdapter);
|