@kevinrabun/judges-cli 3.129.0 → 3.129.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -47,17 +47,37 @@ function loadAmendments(filePath) {
|
|
|
47
47
|
const store = JSON.parse(readFileSync(resolve(filePath), "utf8"));
|
|
48
48
|
return store.amendments;
|
|
49
49
|
}
|
|
50
|
-
|
|
51
|
-
const appdata = process.env.APPDATA || process.env.HOME;
|
|
52
|
-
if (!appdata)
|
|
53
|
-
throw new Error("Cannot determine global storage path. Use --file to specify.");
|
|
54
|
-
const globalPath = join(appdata, "Code", "User", "globalStorage", "kevinrabun.judges-panel", "llm-benchmark-amendments.json");
|
|
50
|
+
const globalPath = getAmendmentStorePath();
|
|
55
51
|
if (!existsSync(globalPath)) {
|
|
56
52
|
throw new Error(`No amendments found at ${globalPath}. Run an LLM benchmark first, or use --file.`);
|
|
57
53
|
}
|
|
58
54
|
const store = JSON.parse(readFileSync(globalPath, "utf8"));
|
|
59
55
|
return store.amendments;
|
|
60
56
|
}
|
|
57
|
+
/**
|
|
58
|
+
* Resolve the path to the VS Code global storage amendment file.
|
|
59
|
+
*/
|
|
60
|
+
function getAmendmentStorePath(filePath) {
|
|
61
|
+
if (filePath)
|
|
62
|
+
return resolve(filePath);
|
|
63
|
+
const appdata = process.env.APPDATA || process.env.HOME;
|
|
64
|
+
if (!appdata)
|
|
65
|
+
throw new Error("Cannot determine global storage path. Use --file to specify.");
|
|
66
|
+
return join(appdata, "Code", "User", "globalStorage", "kevinrabun.judges-panel", "llm-benchmark-amendments.json");
|
|
67
|
+
}
|
|
68
|
+
/**
|
|
69
|
+
* Clear the amendment store after codification to prevent double-application.
|
|
70
|
+
* Codified amendments live in the .judge.md files; keeping the runtime store
|
|
71
|
+
* causes them to be injected twice into LLM benchmark prompts.
|
|
72
|
+
*/
|
|
73
|
+
function clearAmendmentStore(filePath) {
|
|
74
|
+
const storePath = getAmendmentStorePath(filePath);
|
|
75
|
+
if (existsSync(storePath)) {
|
|
76
|
+
const emptyStore = { amendments: [], version: 1, history: [] };
|
|
77
|
+
writeFileSync(storePath, JSON.stringify(emptyStore, null, 2), "utf8");
|
|
78
|
+
console.log(` 🧹 Cleared amendment store at ${storePath}`);
|
|
79
|
+
}
|
|
80
|
+
}
|
|
61
81
|
/**
|
|
62
82
|
* Codify a single amendment into a judge's .judge.md file by appending
|
|
63
83
|
* to the FALSE POSITIVE AVOIDANCE section (or creating one if missing).
|
|
@@ -152,6 +172,9 @@ export function runCodifyAmendments(argv) {
|
|
|
152
172
|
console.log("");
|
|
153
173
|
console.log(` ${dryRun ? "Would codify" : "Codified"} ${codified}/${amendments.length} amendment(s) into agent files.`);
|
|
154
174
|
if (!dryRun && codified > 0) {
|
|
175
|
+
// Clear the amendment store so codified amendments aren't double-applied
|
|
176
|
+
// at runtime during the next LLM benchmark run
|
|
177
|
+
clearAmendmentStore(filePath);
|
|
155
178
|
console.log(" Next steps:");
|
|
156
179
|
console.log(" 1. npm run generate:agents:force โ sync .ts files from .judge.md");
|
|
157
180
|
console.log(" 2. npm run build โ rebuild");
|
|
@@ -161,7 +161,8 @@ function fetchPrDiff(prUrl) {
|
|
|
161
161
|
}
|
|
162
162
|
/**
|
|
163
163
|
* Extract changed file contents from a unified diff.
|
|
164
|
-
* Returns the
|
|
164
|
+
* Returns the full diff hunks (added, removed, and context lines) for each
|
|
165
|
+
* file so the LLM sees the complete "before → after" narrative.
|
|
165
166
|
*/
|
|
166
167
|
function extractFilesFromDiff(diff) {
|
|
167
168
|
const files = [];
|
|
@@ -192,18 +193,38 @@ function extractFilesFromDiff(diff) {
|
|
|
192
193
|
const language = langMap[ext];
|
|
193
194
|
if (!language)
|
|
194
195
|
continue;
|
|
195
|
-
// Extract
|
|
196
|
+
// Extract full hunk content — include context lines, removed lines, and
|
|
197
|
+
// added lines so the LLM can see the complete change narrative.
|
|
196
198
|
const lines = section.split("\n");
|
|
197
|
-
const
|
|
199
|
+
const hunkLines = [];
|
|
200
|
+
let inHunk = false;
|
|
198
201
|
for (const line of lines) {
|
|
199
|
-
|
|
202
|
+
// Skip diff headers (---, +++, index, etc.)
|
|
203
|
+
if (line.startsWith("---") || line.startsWith("+++") || line.startsWith("index "))
|
|
204
|
+
continue;
|
|
205
|
+
// Hunk header — include it for line number context
|
|
206
|
+
if (line.startsWith("@@")) {
|
|
207
|
+
inHunk = true;
|
|
208
|
+
hunkLines.push(line);
|
|
200
209
|
continue;
|
|
201
|
-
|
|
202
|
-
|
|
210
|
+
}
|
|
211
|
+
if (inHunk) {
|
|
212
|
+
// Context line (no prefix), added line (+), or removed line (-)
|
|
213
|
+
if (line.startsWith("+") || line.startsWith("-") || line.startsWith(" ") || line === "") {
|
|
214
|
+
hunkLines.push(line);
|
|
215
|
+
}
|
|
216
|
+
else if (line.startsWith("\\")) {
|
|
217
|
+
// "\ No newline at end of file" marker — skip
|
|
218
|
+
continue;
|
|
219
|
+
}
|
|
220
|
+
else {
|
|
221
|
+
// End of hunk content
|
|
222
|
+
inHunk = false;
|
|
223
|
+
}
|
|
203
224
|
}
|
|
204
225
|
}
|
|
205
|
-
if (
|
|
206
|
-
files.push({ path: filePath, content:
|
|
226
|
+
if (hunkLines.length > 0) {
|
|
227
|
+
files.push({ path: filePath, content: hunkLines.join("\n"), language });
|
|
207
228
|
}
|
|
208
229
|
}
|
|
209
230
|
return files;
|
|
@@ -218,9 +239,7 @@ function extractFilesFromDiff(diff) {
|
|
|
218
239
|
*/
|
|
219
240
|
export function convertPrToBenchmarkCase(pr, repoName, diff) {
|
|
220
241
|
const language = REPO_LANGUAGES[repoName] ?? "typescript";
|
|
221
|
-
// Build expected rule IDs from golden comments
|
|
222
|
-
// Since golden comments are semantic (not rule-ID based), we use broad prefixes
|
|
223
|
-
// that the LLM should fire when it identifies similar issues
|
|
242
|
+
// Build expected rule IDs from golden comments using improved prefix inference
|
|
224
243
|
const expectedRuleIds = [];
|
|
225
244
|
const acceptablePrefixes = new Set([
|
|
226
245
|
"CYBER",
|
|
@@ -238,6 +257,10 @@ export function convertPrToBenchmarkCase(pr, repoName, diff) {
|
|
|
238
257
|
"FW",
|
|
239
258
|
"RATE",
|
|
240
259
|
"STRUCT",
|
|
260
|
+
"OBS",
|
|
261
|
+
"TEST",
|
|
262
|
+
"DOC",
|
|
263
|
+
"COMPAT",
|
|
241
264
|
]);
|
|
242
265
|
for (let i = 0; i < pr.comments.length; i++) {
|
|
243
266
|
const gc = pr.comments[i];
|
|
@@ -245,17 +268,36 @@ export function convertPrToBenchmarkCase(pr, repoName, diff) {
|
|
|
245
268
|
expectedRuleIds.push(`${prefix}-${String(i + 1).padStart(3, "0")}`);
|
|
246
269
|
}
|
|
247
270
|
let code;
|
|
271
|
+
let additionalFiles;
|
|
248
272
|
if (diff) {
|
|
249
273
|
const files = extractFilesFromDiff(diff);
|
|
250
274
|
if (files.length === 0)
|
|
251
275
|
return undefined;
|
|
252
|
-
//
|
|
276
|
+
// Sort by content length — largest file is primary
|
|
253
277
|
files.sort((a, b) => b.content.length - a.content.length);
|
|
278
|
+
// Primary file gets up to 16KB
|
|
254
279
|
code = files[0].content;
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
280
|
+
if (code.length > 16_000) {
|
|
281
|
+
code = code.slice(0, 16_000) + "\n// ... truncated for benchmark";
|
|
282
|
+
}
|
|
283
|
+
// Additional files go into the multi-file field (up to 12KB each)
|
|
284
|
+
if (files.length > 1) {
|
|
285
|
+
additionalFiles = files.slice(1, 6).map((f) => ({
|
|
286
|
+
path: f.path,
|
|
287
|
+
content: f.content.length > 12_000 ? f.content.slice(0, 12_000) + "\n// ... truncated" : f.content,
|
|
288
|
+
language: f.language,
|
|
289
|
+
}));
|
|
258
290
|
}
|
|
291
|
+
// Prepend PR context header so the LLM knows this is a code review task
|
|
292
|
+
code = [
|
|
293
|
+
`// ===== PR CODE REVIEW: ${pr.pr_title} =====`,
|
|
294
|
+
`// Repository: ${repoName} | Language: ${language}`,
|
|
295
|
+
`// File: ${files[0].path}`,
|
|
296
|
+
`// This is a unified diff — lines starting with + are additions, - are removals, @@ are hunk headers`,
|
|
297
|
+
`// Review this code change for bugs, security issues, and quality problems.`,
|
|
298
|
+
"",
|
|
299
|
+
code,
|
|
300
|
+
].join("\n");
|
|
259
301
|
}
|
|
260
302
|
else {
|
|
261
303
|
// Fallback: embed golden comments as context for LLM evaluation
|
|
@@ -265,7 +307,7 @@ export function convertPrToBenchmarkCase(pr, repoName, diff) {
|
|
|
265
307
|
}
|
|
266
308
|
code = lines.join("\n");
|
|
267
309
|
}
|
|
268
|
-
|
|
310
|
+
const benchCase = {
|
|
269
311
|
id: `martian-${repoName}-${pr.pr_title
|
|
270
312
|
.slice(0, 40)
|
|
271
313
|
.replace(/[^a-zA-Z0-9]/g, "-")
|
|
@@ -279,42 +321,173 @@ export function convertPrToBenchmarkCase(pr, repoName, diff) {
|
|
|
279
321
|
difficulty: pr.comments.some((c) => c.severity === "Critical" || c.severity === "High") ? "hard" : "medium",
|
|
280
322
|
aiSource: "martian-code-review-benchmark",
|
|
281
323
|
};
|
|
324
|
+
// Attach additional files for multi-file evaluation context
|
|
325
|
+
if (additionalFiles && additionalFiles.length > 0) {
|
|
326
|
+
benchCase.files = additionalFiles;
|
|
327
|
+
}
|
|
328
|
+
return benchCase;
|
|
282
329
|
}
|
|
283
330
|
/**
|
|
284
331
|
* Infer the most likely judge prefix from a golden comment description.
|
|
332
|
+
*
|
|
333
|
+
* Uses weighted pattern matching — each regex contributes a score per prefix,
|
|
334
|
+
* and the prefix with the highest total wins. This handles comments that span
|
|
335
|
+
* multiple domains (e.g. "race condition causes null pointer" → CONC > ERR).
|
|
285
336
|
*/
|
|
286
337
|
function inferPrefixFromComment(comment, severity) {
|
|
287
338
|
const lower = comment.toLowerCase();
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
if (/
|
|
303
|
-
|
|
304
|
-
if (/
|
|
305
|
-
|
|
306
|
-
if (/
|
|
307
|
-
|
|
308
|
-
if (/
|
|
309
|
-
|
|
310
|
-
if (/
|
|
311
|
-
|
|
312
|
-
if (/
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
339
|
+
const scores = {};
|
|
340
|
+
function add(prefix, weight) {
|
|
341
|
+
scores[prefix] = (scores[prefix] ?? 0) + weight;
|
|
342
|
+
}
|
|
343
|
+
// Concurrency / race conditions
|
|
344
|
+
if (/race\s*condition|data\s*race/.test(lower))
|
|
345
|
+
add("CONC", 3);
|
|
346
|
+
if (/deadlock|mutex|lock\s*(acquisit|order|contention)/.test(lower))
|
|
347
|
+
add("CONC", 3);
|
|
348
|
+
if (/concurrent|thread.?safe|atomic|synchroniz/.test(lower))
|
|
349
|
+
add("CONC", 2);
|
|
350
|
+
if (/parallel|interleav/.test(lower))
|
|
351
|
+
add("CONC", 1);
|
|
352
|
+
// Database
|
|
353
|
+
if (/sql\s*inject|query\s*inject/.test(lower))
|
|
354
|
+
add("DB", 3);
|
|
355
|
+
if (/n\+1|n \+ 1/.test(lower))
|
|
356
|
+
add("DB", 3);
|
|
357
|
+
if (/select\s*\*|query|queryset/.test(lower))
|
|
358
|
+
add("DB", 2);
|
|
359
|
+
if (/database|transaction|rollback|commit/.test(lower))
|
|
360
|
+
add("DB", 2);
|
|
361
|
+
if (/migration|schema|index|join|subquery/.test(lower))
|
|
362
|
+
add("DB", 1);
|
|
363
|
+
if (/paginator|cursor|offset|limit/.test(lower))
|
|
364
|
+
add("DB", 1);
|
|
365
|
+
// Authentication / Authorization
|
|
366
|
+
if (/oauth|csrf|session\s*(secret|fixation|hijack)/.test(lower))
|
|
367
|
+
add("AUTH", 3);
|
|
368
|
+
if (/authenticat|credential|password|passkey/.test(lower))
|
|
369
|
+
add("AUTH", 2);
|
|
370
|
+
if (/authoriz|permission|privilege|role|scope|access\s*control/.test(lower))
|
|
371
|
+
add("AUTH", 2);
|
|
372
|
+
if (/token(?!\s*expir)/.test(lower))
|
|
373
|
+
add("AUTH", 1);
|
|
374
|
+
// Cybersecurity / Injection
|
|
375
|
+
if (/inject(?!ion\s*depend)|xss|cross.?site|command\s*inject/.test(lower))
|
|
376
|
+
add("CYBER", 3);
|
|
377
|
+
if (/deserialization|prototype\s*pollut|path\s*traversal/.test(lower))
|
|
378
|
+
add("CYBER", 3);
|
|
379
|
+
if (/ssrf|open\s*redirect|rce|remote\s*code/.test(lower))
|
|
380
|
+
add("CYBER", 3);
|
|
381
|
+
if (/sanitiz|escap(?!e\s*hatch)|encod/.test(lower))
|
|
382
|
+
add("CYBER", 1);
|
|
383
|
+
// Configuration / Secrets
|
|
384
|
+
if (/hardcod|hard.coded|secret\s*key/.test(lower))
|
|
385
|
+
add("CFG", 3);
|
|
386
|
+
if (/api.?key|config\s*(missing|invalid|hardcod)/.test(lower))
|
|
387
|
+
add("CFG", 2);
|
|
388
|
+
if (/environment\s*variable|\.env|secret/.test(lower))
|
|
389
|
+
add("CFG", 1);
|
|
390
|
+
// Error handling / Null safety
|
|
391
|
+
if (/null\s*(reference|pointer|dereference)|none\s*type|undefined\s*is\s*not/.test(lower))
|
|
392
|
+
add("ERR", 3);
|
|
393
|
+
if (/attributeerror|typeerror|keyerror|indexerror/.test(lower))
|
|
394
|
+
add("ERR", 3);
|
|
395
|
+
if (/unhandled\s*(error|exception|reject)/.test(lower))
|
|
396
|
+
add("ERR", 3);
|
|
397
|
+
if (/null|undefined|nil|\.?none\b/.test(lower))
|
|
398
|
+
add("ERR", 2);
|
|
399
|
+
if (/error\s*handl|exception|try.?catch|throw/.test(lower))
|
|
400
|
+
add("ERR", 2);
|
|
401
|
+
if (/crash|abort|panic|fault/.test(lower))
|
|
402
|
+
add("ERR", 1);
|
|
403
|
+
if (/missing\s*check|guard\s*clause/.test(lower))
|
|
404
|
+
add("ERR", 1);
|
|
405
|
+
// Security (general)
|
|
406
|
+
if (/vulnerab|exploit|attack\s*surface/.test(lower))
|
|
407
|
+
add("SEC", 2);
|
|
408
|
+
if (/valid(?:at(?:e|ion))|sanitiz|input\s*check/.test(lower))
|
|
409
|
+
add("SEC", 2);
|
|
410
|
+
if (/unsafe|insecure|taint/.test(lower))
|
|
411
|
+
add("SEC", 1);
|
|
412
|
+
// Performance
|
|
413
|
+
if (/performance|latency|throughput|bottleneck/.test(lower))
|
|
414
|
+
add("PERF", 2);
|
|
415
|
+
if (/slow|memory\s*leak|cache\s*(miss|invalid)/.test(lower))
|
|
416
|
+
add("PERF", 2);
|
|
417
|
+
if (/O\(n\^?2\)|quadratic|exponential/.test(lower))
|
|
418
|
+
add("PERF", 2);
|
|
419
|
+
if (/blocking|synchronous.*event\s*loop/.test(lower))
|
|
420
|
+
add("PERF", 1);
|
|
421
|
+
// Logic / correctness
|
|
422
|
+
if (/isinstance|subclass|type\s*check|type\s*error/.test(lower))
|
|
423
|
+
add("LOGIC", 2);
|
|
424
|
+
if (/wrong\s*(key|type|value|order|result)/.test(lower))
|
|
425
|
+
add("LOGIC", 2);
|
|
426
|
+
if (/off.by.one|fence\s*post|boundary/.test(lower))
|
|
427
|
+
add("LOGIC", 2);
|
|
428
|
+
if (/logic|incorrect|semantic/.test(lower))
|
|
429
|
+
add("LOGIC", 1);
|
|
430
|
+
if (/always\s*(true|false)|never\s*(true|false|reach)/.test(lower))
|
|
431
|
+
add("LOGIC", 2);
|
|
432
|
+
if (/negative\s*(slice|index|offset)/.test(lower))
|
|
433
|
+
add("LOGIC", 2);
|
|
434
|
+
// Observability / Monitoring
|
|
435
|
+
if (/metric|monitor|observ|telemetry|tracing/.test(lower))
|
|
436
|
+
add("OBS", 2);
|
|
437
|
+
if (/logg?ing|log\s*(level|format|statement)/.test(lower))
|
|
438
|
+
add("OBS", 1);
|
|
439
|
+
if (/alert|dashboard|instrument/.test(lower))
|
|
440
|
+
add("OBS", 1);
|
|
441
|
+
// Testing
|
|
442
|
+
if (/test\s*(flaky|brittle|fragile|unreliable)/.test(lower))
|
|
443
|
+
add("TEST", 3);
|
|
444
|
+
if (/sleep\s*in\s*test|time\.sleep|flaky/.test(lower))
|
|
445
|
+
add("TEST", 2);
|
|
446
|
+
if (/mock|stub|fixture|assert|test\s*coverage/.test(lower))
|
|
447
|
+
add("TEST", 1);
|
|
448
|
+
if (/monkeypatch|test_/.test(lower))
|
|
449
|
+
add("TEST", 1);
|
|
450
|
+
// Maintainability
|
|
451
|
+
if (/magic\s*number|duplicate|copy.?paste|dead\s*code/.test(lower))
|
|
452
|
+
add("MAINT", 2);
|
|
453
|
+
if (/complex|readab|refactor|techni?cal\s*debt/.test(lower))
|
|
454
|
+
add("MAINT", 1);
|
|
455
|
+
if (/naming|misleading|confusing|unclear/.test(lower))
|
|
456
|
+
add("MAINT", 1);
|
|
457
|
+
// Documentation
|
|
458
|
+
if (/docstring|comment|documentation|readme/.test(lower))
|
|
459
|
+
add("DOC", 2);
|
|
460
|
+
if (/typo|spelling|rename/.test(lower))
|
|
461
|
+
add("DOC", 1);
|
|
462
|
+
if (/jsdoc|javadoc|pydoc|rustdoc/.test(lower))
|
|
463
|
+
add("DOC", 1);
|
|
464
|
+
// Compatibility
|
|
465
|
+
if (/breaking\s*change|backwards?\s*compat|deprecat/.test(lower))
|
|
466
|
+
add("COMPAT", 2);
|
|
467
|
+
if (/migration|version|compat/.test(lower))
|
|
468
|
+
add("COMPAT", 1);
|
|
469
|
+
// Reliability
|
|
470
|
+
if (/timeout|retry|circuit.?break|failover/.test(lower))
|
|
471
|
+
add("REL", 2);
|
|
472
|
+
if (/resilien|graceful|recovery|shutdown/.test(lower))
|
|
473
|
+
add("REL", 1);
|
|
474
|
+
if (/terminate|kill|signal|process/.test(lower))
|
|
475
|
+
add("REL", 1);
|
|
476
|
+
// Framework safety
|
|
477
|
+
if (/middleware|express|django|flask|spring/.test(lower))
|
|
478
|
+
add("FW", 1);
|
|
479
|
+
if (/helmet|cors|csrf\s*middleware/.test(lower))
|
|
480
|
+
add("FW", 2);
|
|
481
|
+
// Rate limiting
|
|
482
|
+
if (/rate\s*limit|throttl|brute.?force/.test(lower))
|
|
483
|
+
add("RATE", 2);
|
|
484
|
+
if (/ddos|denial.?of.?service|resource\s*exhaust/.test(lower))
|
|
485
|
+
add("RATE", 1);
|
|
486
|
+
// Pick highest-scoring prefix
|
|
487
|
+
const sorted = Object.entries(scores).sort((a, b) => b[1] - a[1]);
|
|
488
|
+
if (sorted.length > 0 && sorted[0][1] > 0) {
|
|
489
|
+
return sorted[0][0];
|
|
490
|
+
}
|
|
318
491
|
// Default based on severity
|
|
319
492
|
if (severity === "Critical" || severity === "High")
|
|
320
493
|
return "SEC";
|