@kevinrabun/judges-cli 3.129.0 → 3.129.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -47,17 +47,37 @@ function loadAmendments(filePath) {
47
47
  const store = JSON.parse(readFileSync(resolve(filePath), "utf8"));
48
48
  return store.amendments;
49
49
  }
50
- // Try VS Code global storage
51
- const appdata = process.env.APPDATA || process.env.HOME;
52
- if (!appdata)
53
- throw new Error("Cannot determine global storage path. Use --file to specify.");
54
- const globalPath = join(appdata, "Code", "User", "globalStorage", "kevinrabun.judges-panel", "llm-benchmark-amendments.json");
50
+ const globalPath = getAmendmentStorePath();
55
51
  if (!existsSync(globalPath)) {
56
52
  throw new Error(`No amendments found at ${globalPath}. Run an LLM benchmark first, or use --file.`);
57
53
  }
58
54
  const store = JSON.parse(readFileSync(globalPath, "utf8"));
59
55
  return store.amendments;
60
56
  }
57
+ /**
58
+ * Resolve the path to the VS Code global storage amendment file.
59
+ */
60
+ function getAmendmentStorePath(filePath) {
61
+ if (filePath)
62
+ return resolve(filePath);
63
+ const appdata = process.env.APPDATA || process.env.HOME;
64
+ if (!appdata)
65
+ throw new Error("Cannot determine global storage path. Use --file to specify.");
66
+ return join(appdata, "Code", "User", "globalStorage", "kevinrabun.judges-panel", "llm-benchmark-amendments.json");
67
+ }
68
+ /**
69
+ * Clear the amendment store after codification to prevent double-application.
70
+ * Codified amendments live in the .judge.md files; keeping the runtime store
71
+ * causes them to be injected twice into LLM benchmark prompts.
72
+ */
73
+ function clearAmendmentStore(filePath) {
74
+ const storePath = getAmendmentStorePath(filePath);
75
+ if (existsSync(storePath)) {
76
+ const emptyStore = { amendments: [], version: 1, history: [] };
77
+ writeFileSync(storePath, JSON.stringify(emptyStore, null, 2), "utf8");
78
+ console.log(` ๐Ÿงน Cleared amendment store at ${storePath}`);
79
+ }
80
+ }
61
81
  /**
62
82
  * Codify a single amendment into a judge's .judge.md file by appending
63
83
  * to the FALSE POSITIVE AVOIDANCE section (or creating one if missing).
@@ -152,6 +172,9 @@ export function runCodifyAmendments(argv) {
152
172
  console.log("");
153
173
  console.log(` ${dryRun ? "Would codify" : "Codified"} ${codified}/${amendments.length} amendment(s) into agent files.`);
154
174
  if (!dryRun && codified > 0) {
175
+ // Clear the amendment store so codified amendments aren't double-applied
176
+ // at runtime during the next LLM benchmark run
177
+ clearAmendmentStore(filePath);
155
178
  console.log(" Next steps:");
156
179
  console.log(" 1. npm run generate:agents:force โ€” sync .ts files from .judge.md");
157
180
  console.log(" 2. npm run build โ€” rebuild");
@@ -161,7 +161,8 @@ function fetchPrDiff(prUrl) {
161
161
  }
162
162
  /**
163
163
  * Extract changed file contents from a unified diff.
164
- * Returns the "after" (added/modified) lines for each file.
164
+ * Returns the full diff hunks (added, removed, and context lines) for each
165
+ * file so the LLM sees the complete "before → after" narrative.
165
166
  */
166
167
  function extractFilesFromDiff(diff) {
167
168
  const files = [];
@@ -192,18 +193,38 @@ function extractFilesFromDiff(diff) {
192
193
  const language = langMap[ext];
193
194
  if (!language)
194
195
  continue;
195
- // Extract added lines (lines starting with +, excluding +++ header)
196
+ // Extract full hunk content — include context lines, removed lines, and
197
+ // added lines so the LLM can see the complete change narrative.
196
198
  const lines = section.split("\n");
197
- const addedLines = [];
199
+ const hunkLines = [];
200
+ let inHunk = false;
198
201
  for (const line of lines) {
199
- if (line.startsWith("+++"))
202
+ // Skip diff headers (---, +++, index, etc.)
203
+ if (line.startsWith("---") || line.startsWith("+++") || line.startsWith("index "))
204
+ continue;
205
+ // Hunk header — include it for line number context
206
+ if (line.startsWith("@@")) {
207
+ inHunk = true;
208
+ hunkLines.push(line);
200
209
  continue;
201
- if (line.startsWith("+")) {
202
- addedLines.push(line.slice(1));
210
+ }
211
+ if (inHunk) {
212
+ // Context line (no prefix), added line (+), or removed line (-)
213
+ if (line.startsWith("+") || line.startsWith("-") || line.startsWith(" ") || line === "") {
214
+ hunkLines.push(line);
215
+ }
216
+ else if (line.startsWith("\\")) {
217
+ // "\ No newline at end of file" — skip
218
+ continue;
219
+ }
220
+ else {
221
+ // End of hunk content
222
+ inHunk = false;
223
+ }
203
224
  }
204
225
  }
205
- if (addedLines.length > 0) {
206
- files.push({ path: filePath, content: addedLines.join("\n"), language });
226
+ if (hunkLines.length > 0) {
227
+ files.push({ path: filePath, content: hunkLines.join("\n"), language });
207
228
  }
208
229
  }
209
230
  return files;
@@ -218,9 +239,7 @@ function extractFilesFromDiff(diff) {
218
239
  */
219
240
  export function convertPrToBenchmarkCase(pr, repoName, diff) {
220
241
  const language = REPO_LANGUAGES[repoName] ?? "typescript";
221
- // Build expected rule IDs from golden comments by mapping severity to prefixes
222
- // Since golden comments are semantic (not rule-ID based), we use broad prefixes
223
- // that the LLM should fire when it identifies similar issues
242
+ // Build expected rule IDs from golden comments using improved prefix inference
224
243
  const expectedRuleIds = [];
225
244
  const acceptablePrefixes = new Set([
226
245
  "CYBER",
@@ -238,6 +257,10 @@ export function convertPrToBenchmarkCase(pr, repoName, diff) {
238
257
  "FW",
239
258
  "RATE",
240
259
  "STRUCT",
260
+ "OBS",
261
+ "TEST",
262
+ "DOC",
263
+ "COMPAT",
241
264
  ]);
242
265
  for (let i = 0; i < pr.comments.length; i++) {
243
266
  const gc = pr.comments[i];
@@ -245,17 +268,36 @@ export function convertPrToBenchmarkCase(pr, repoName, diff) {
245
268
  expectedRuleIds.push(`${prefix}-${String(i + 1).padStart(3, "0")}`);
246
269
  }
247
270
  let code;
271
+ let additionalFiles;
248
272
  if (diff) {
249
273
  const files = extractFilesFromDiff(diff);
250
274
  if (files.length === 0)
251
275
  return undefined;
252
- // Use the largest changed file as the primary code
276
+ // Sort by content length — largest file is primary
253
277
  files.sort((a, b) => b.content.length - a.content.length);
278
+ // Primary file gets up to 16KB
254
279
  code = files[0].content;
255
- // Truncate to avoid token limits
256
- if (code.length > 8000) {
257
- code = code.slice(0, 8000) + "\n// ... truncated for benchmark";
280
+ if (code.length > 16_000) {
281
+ code = code.slice(0, 16_000) + "\n// ... truncated for benchmark";
282
+ }
283
+ // Additional files go into the multi-file field (up to 12KB each)
284
+ if (files.length > 1) {
285
+ additionalFiles = files.slice(1, 6).map((f) => ({
286
+ path: f.path,
287
+ content: f.content.length > 12_000 ? f.content.slice(0, 12_000) + "\n// ... truncated" : f.content,
288
+ language: f.language,
289
+ }));
258
290
  }
291
+ // Prepend PR context header so the LLM knows this is a code review task
292
+ code = [
293
+ `// ===== PR CODE REVIEW: ${pr.pr_title} =====`,
294
+ `// Repository: ${repoName} | Language: ${language}`,
295
+ `// File: ${files[0].path}`,
296
+ `// This is a unified diff โ€” lines starting with + are additions, - are removals, @@ are hunk headers`,
297
+ `// Review this code change for bugs, security issues, and quality problems.`,
298
+ "",
299
+ code,
300
+ ].join("\n");
259
301
  }
260
302
  else {
261
303
  // Fallback: embed golden comments as context for LLM evaluation
@@ -265,7 +307,7 @@ export function convertPrToBenchmarkCase(pr, repoName, diff) {
265
307
  }
266
308
  code = lines.join("\n");
267
309
  }
268
- return {
310
+ const benchCase = {
269
311
  id: `martian-${repoName}-${pr.pr_title
270
312
  .slice(0, 40)
271
313
  .replace(/[^a-zA-Z0-9]/g, "-")
@@ -279,42 +321,173 @@ export function convertPrToBenchmarkCase(pr, repoName, diff) {
279
321
  difficulty: pr.comments.some((c) => c.severity === "Critical" || c.severity === "High") ? "hard" : "medium",
280
322
  aiSource: "martian-code-review-benchmark",
281
323
  };
324
+ // Attach additional files for multi-file evaluation context
325
+ if (additionalFiles && additionalFiles.length > 0) {
326
+ benchCase.files = additionalFiles;
327
+ }
328
+ return benchCase;
282
329
  }
283
330
  /**
284
331
  * Infer the most likely judge prefix from a golden comment description.
332
+ *
333
+ * Uses weighted pattern matching — each regex contributes a score per prefix,
334
+ * and the prefix with the highest total wins. This handles comments that span
335
+ * multiple domains (e.g. "race condition causes null pointer" → CONC > ERR).
285
336
  */
286
337
  function inferPrefixFromComment(comment, severity) {
287
338
  const lower = comment.toLowerCase();
288
- if (/race|deadlock|lock|concurrent|mutex|thread/.test(lower))
289
- return "CONC";
290
- if (/sql|query|database|n\+1|select \*/.test(lower))
291
- return "DB";
292
- if (/auth|credential|password|token|session|oauth|permission/.test(lower))
293
- return "AUTH";
294
- if (/inject|xss|eval|command/.test(lower))
295
- return "CYBER";
296
- if (/secret|hardcod|api.?key/.test(lower))
297
- return "CFG";
298
- if (/null|undefined|none|nil|attributeerror|typeerror|crash/.test(lower))
299
- return "ERR";
300
- if (/error|exception|catch|throw|unhandled|fault/.test(lower))
301
- return "ERR";
302
- if (/valid|sanitiz|input|check|assert/.test(lower))
303
- return "SEC";
304
- if (/performance|slow|latency|cache|memory/.test(lower))
305
- return "PERF";
306
- if (/deprecat|obsolete|legacy|breaking/.test(lower))
307
- return "COMPAT";
308
- if (/log|metric|monitor|observ/.test(lower))
309
- return "OBS";
310
- if (/test|flaky|mock|assert/.test(lower))
311
- return "TEST";
312
- if (/name|typo|rename|docstring|comment/.test(lower))
313
- return "DOC";
314
- if (/magic.?number|duplicate|dead.?code|complex/.test(lower))
315
- return "MAINT";
316
- if (/isinstance|type|class|inherit/.test(lower))
317
- return "LOGIC";
339
+ const scores = {};
340
+ function add(prefix, weight) {
341
+ scores[prefix] = (scores[prefix] ?? 0) + weight;
342
+ }
343
+ // Concurrency / race conditions
344
+ if (/race\s*condition|data\s*race/.test(lower))
345
+ add("CONC", 3);
346
+ if (/deadlock|mutex|lock\s*(acquisit|order|contention)/.test(lower))
347
+ add("CONC", 3);
348
+ if (/concurrent|thread.?safe|atomic|synchroniz/.test(lower))
349
+ add("CONC", 2);
350
+ if (/parallel|interleav/.test(lower))
351
+ add("CONC", 1);
352
+ // Database
353
+ if (/sql\s*inject|query\s*inject/.test(lower))
354
+ add("DB", 3);
355
+ if (/n\+1|n \+ 1/.test(lower))
356
+ add("DB", 3);
357
+ if (/select\s*\*|query|queryset/.test(lower))
358
+ add("DB", 2);
359
+ if (/database|transaction|rollback|commit/.test(lower))
360
+ add("DB", 2);
361
+ if (/migration|schema|index|join|subquery/.test(lower))
362
+ add("DB", 1);
363
+ if (/paginator|cursor|offset|limit/.test(lower))
364
+ add("DB", 1);
365
+ // Authentication / Authorization
366
+ if (/oauth|csrf|session\s*(secret|fixation|hijack)/.test(lower))
367
+ add("AUTH", 3);
368
+ if (/authenticat|credential|password|passkey/.test(lower))
369
+ add("AUTH", 2);
370
+ if (/authoriz|permission|privilege|role|scope|access\s*control/.test(lower))
371
+ add("AUTH", 2);
372
+ if (/token(?!\s*expir)/.test(lower))
373
+ add("AUTH", 1);
374
+ // Cybersecurity / Injection
375
+ if (/inject(?!ion\s*depend)|xss|cross.?site|command\s*inject/.test(lower))
376
+ add("CYBER", 3);
377
+ if (/deserialization|prototype\s*pollut|path\s*traversal/.test(lower))
378
+ add("CYBER", 3);
379
+ if (/ssrf|open\s*redirect|rce|remote\s*code/.test(lower))
380
+ add("CYBER", 3);
381
+ if (/sanitiz|escap(?!e\s*hatch)|encod/.test(lower))
382
+ add("CYBER", 1);
383
+ // Configuration / Secrets
384
+ if (/hardcod|hard.coded|secret\s*key/.test(lower))
385
+ add("CFG", 3);
386
+ if (/api.?key|config\s*(missing|invalid|hardcod)/.test(lower))
387
+ add("CFG", 2);
388
+ if (/environment\s*variable|\.env|secret/.test(lower))
389
+ add("CFG", 1);
390
+ // Error handling / Null safety
391
+ if (/null\s*(reference|pointer|dereference)|none\s*type|undefined\s*is\s*not/.test(lower))
392
+ add("ERR", 3);
393
+ if (/attributeerror|typeerror|keyerror|indexerror/.test(lower))
394
+ add("ERR", 3);
395
+ if (/unhandled\s*(error|exception|reject)/.test(lower))
396
+ add("ERR", 3);
397
+ if (/null|undefined|nil|\.?none\b/.test(lower))
398
+ add("ERR", 2);
399
+ if (/error\s*handl|exception|try.?catch|throw/.test(lower))
400
+ add("ERR", 2);
401
+ if (/crash|abort|panic|fault/.test(lower))
402
+ add("ERR", 1);
403
+ if (/missing\s*check|guard\s*clause/.test(lower))
404
+ add("ERR", 1);
405
+ // Security (general)
406
+ if (/vulnerab|exploit|attack\s*surface/.test(lower))
407
+ add("SEC", 2);
408
+ if (/valid(?:at(?:e|ion))|sanitiz|input\s*check/.test(lower))
409
+ add("SEC", 2);
410
+ if (/unsafe|insecure|taint/.test(lower))
411
+ add("SEC", 1);
412
+ // Performance
413
+ if (/performance|latency|throughput|bottleneck/.test(lower))
414
+ add("PERF", 2);
415
+ if (/slow|memory\s*leak|cache\s*(miss|invalid)/.test(lower))
416
+ add("PERF", 2);
417
+ if (/O\(n\^?2\)|quadratic|exponential/.test(lower))
418
+ add("PERF", 2);
419
+ if (/blocking|synchronous.*event\s*loop/.test(lower))
420
+ add("PERF", 1);
421
+ // Logic / correctness
422
+ if (/isinstance|subclass|type\s*check|type\s*error/.test(lower))
423
+ add("LOGIC", 2);
424
+ if (/wrong\s*(key|type|value|order|result)/.test(lower))
425
+ add("LOGIC", 2);
426
+ if (/off.by.one|fence\s*post|boundary/.test(lower))
427
+ add("LOGIC", 2);
428
+ if (/logic|incorrect|semantic/.test(lower))
429
+ add("LOGIC", 1);
430
+ if (/always\s*(true|false)|never\s*(true|false|reach)/.test(lower))
431
+ add("LOGIC", 2);
432
+ if (/negative\s*(slice|index|offset)/.test(lower))
433
+ add("LOGIC", 2);
434
+ // Observability / Monitoring
435
+ if (/metric|monitor|observ|telemetry|tracing/.test(lower))
436
+ add("OBS", 2);
437
+ if (/logg?ing|log\s*(level|format|statement)/.test(lower))
438
+ add("OBS", 1);
439
+ if (/alert|dashboard|instrument/.test(lower))
440
+ add("OBS", 1);
441
+ // Testing
442
+ if (/test\s*(flaky|brittle|fragile|unreliable)/.test(lower))
443
+ add("TEST", 3);
444
+ if (/sleep\s*in\s*test|time\.sleep|flaky/.test(lower))
445
+ add("TEST", 2);
446
+ if (/mock|stub|fixture|assert|test\s*coverage/.test(lower))
447
+ add("TEST", 1);
448
+ if (/monkeypatch|test_/.test(lower))
449
+ add("TEST", 1);
450
+ // Maintainability
451
+ if (/magic\s*number|duplicate|copy.?paste|dead\s*code/.test(lower))
452
+ add("MAINT", 2);
453
+ if (/complex|readab|refactor|techni?cal\s*debt/.test(lower))
454
+ add("MAINT", 1);
455
+ if (/naming|misleading|confusing|unclear/.test(lower))
456
+ add("MAINT", 1);
457
+ // Documentation
458
+ if (/docstring|comment|documentation|readme/.test(lower))
459
+ add("DOC", 2);
460
+ if (/typo|spelling|rename/.test(lower))
461
+ add("DOC", 1);
462
+ if (/jsdoc|javadoc|pydoc|rustdoc/.test(lower))
463
+ add("DOC", 1);
464
+ // Compatibility
465
+ if (/breaking\s*change|backwards?\s*compat|deprecat/.test(lower))
466
+ add("COMPAT", 2);
467
+ if (/migration|version|compat/.test(lower))
468
+ add("COMPAT", 1);
469
+ // Reliability
470
+ if (/timeout|retry|circuit.?break|failover/.test(lower))
471
+ add("REL", 2);
472
+ if (/resilien|graceful|recovery|shutdown/.test(lower))
473
+ add("REL", 1);
474
+ if (/terminate|kill|signal|process/.test(lower))
475
+ add("REL", 1);
476
+ // Framework safety
477
+ if (/middleware|express|django|flask|spring/.test(lower))
478
+ add("FW", 1);
479
+ if (/helmet|cors|csrf\s*middleware/.test(lower))
480
+ add("FW", 2);
481
+ // Rate limiting
482
+ if (/rate\s*limit|throttl|brute.?force/.test(lower))
483
+ add("RATE", 2);
484
+ if (/ddos|denial.?of.?service|resource\s*exhaust/.test(lower))
485
+ add("RATE", 1);
486
+ // Pick highest-scoring prefix
487
+ const sorted = Object.entries(scores).sort((a, b) => b[1] - a[1]);
488
+ if (sorted.length > 0 && sorted[0][1] > 0) {
489
+ return sorted[0][0];
490
+ }
318
491
  // Default based on severity
319
492
  if (severity === "Critical" || severity === "High")
320
493
  return "SEC";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kevinrabun/judges-cli",
3
- "version": "3.129.0",
3
+ "version": "3.129.2",
4
4
  "description": "CLI wrapper for the Judges code review toolkit.",
5
5
  "type": "module",
6
6
  "main": "dist/cli.js",