npm - @kevinrabun/judges - Versions diffs - 3.129.3 → 3.129.4 - Mend

@kevinrabun/judges 3.129.3 → 3.129.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/commands/llm-benchmark.d.ts +8 -1
package/dist/commands/llm-benchmark.js +24 -3
package/package.json +1 -1
package/server.json +2 -2

package/dist/commands/llm-benchmark.d.ts CHANGED Viewed

@@ -114,7 +114,14 @@ export declare function extractValidatedLlmFindings(response: string, prefixes?:
  * Uses condensed criteria (adversarial mandate stripped) plus shared mandates,
  * mirroring the tribunal architecture for consistency and better precision.
  */
-export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[]): string;
+/**
+ * Review-mode directive injected into per-judge prompts for external code-review
+ * benchmarks. Overrides the precision/suppression mandates to match the code-review
+ * task: flag design concerns, behavior regressions, and subtle issues that a senior
+ * human reviewer would catch — not just CVE-class vulnerabilities.
+ */
+export declare const CODE_REVIEW_MODE_DIRECTIVE = "CODE REVIEW MODE (this is a pull request diff review \u2014 OVERRIDES the SINGLE-FILE LIMITATION and adjusts the PRECISION MANDATE):\n- You are reviewing a pull request diff, not an isolated code snippet. The code shows WHAT CHANGED in a real codebase.\n- Lines starting with `-` were REMOVED. Lines starting with `+` were ADDED. Lines starting with `@@` are hunk headers with line numbers. Unchanged lines provide context.\n- BEHAVIOR REGRESSIONS: Flag any removed validation, removed error handling, removed security checks, weakened conditions, or relaxed constraints. If a check existed before and was removed or weakened, that IS a finding.\n- DESIGN CONCERNS: Flag cache invalidation gaps, missing transaction boundaries, race conditions in concurrent access, incorrect type checks (e.g. isinstance against wrong base class), and architectural issues visible in the diff.\n- ABSENCE IS VALID: Unlike single-file review, this is a complete change set. Missing error handling for new code paths, missing validation on new inputs, and missing tests for new functionality ARE valid findings.\n- LOWER CONFIDENCE THRESHOLD: Report findings at \u226560% confidence (not the usual \u226580%). Code review catches potential problems early \u2014 a \"worth investigating\" concern is valuable even if not certain.\n- SEVERITY CALIBRATION: In code review, a \"Medium\" finding means \"a senior reviewer would comment on this.\" A \"High\" finding means \"this should be fixed before merge.\" A \"Critical\" finding means \"this will cause a production incident.\"\n- DO NOT SUPPRESS: The FINAL GATE and CLEAN CODE GATE do not apply in review mode. Well-structured code can still have review-worthy concerns (e.g., a well-written function that introduces a subtle race condition).\n- REPORT AT LEAST ONE FINDING if you identify ANY concern within your domain \u2014 even if it's only medium severity. An empty report should mean \"I genuinely found nothing in my domain,\" not \"I suppressed everything below high confidence.\"";
+export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[], reviewMode?: boolean): string;
 /**
  * Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
  * When `judges` is provided, uses that filtered list instead of all tribunal judges.

package/dist/commands/llm-benchmark.js CHANGED Viewed

@@ -201,7 +201,23 @@ export function extractValidatedLlmFindings(response, prefixes) {
  * Uses condensed criteria (adversarial mandate stripped) plus shared mandates,
  * mirroring the tribunal architecture for consistency and better precision.
  */
-export function constructPerJudgePrompt(judge, code, language, contextSnippets = [], amendments) {
+/**
+ * Review-mode directive injected into per-judge prompts for external code-review
+ * benchmarks. Overrides the precision/suppression mandates to match the code-review
+ * task: flag design concerns, behavior regressions, and subtle issues that a senior
+ * human reviewer would catch — not just CVE-class vulnerabilities.
+ */
+export const CODE_REVIEW_MODE_DIRECTIVE = `CODE REVIEW MODE (this is a pull request diff review — OVERRIDES the SINGLE-FILE LIMITATION and adjusts the PRECISION MANDATE):
+- You are reviewing a pull request diff, not an isolated code snippet. The code shows WHAT CHANGED in a real codebase.
+- Lines starting with \`-\` were REMOVED. Lines starting with \`+\` were ADDED. Lines starting with \`@@\` are hunk headers with line numbers. Unchanged lines provide context.
+- BEHAVIOR REGRESSIONS: Flag any removed validation, removed error handling, removed security checks, weakened conditions, or relaxed constraints. If a check existed before and was removed or weakened, that IS a finding.
+- DESIGN CONCERNS: Flag cache invalidation gaps, missing transaction boundaries, race conditions in concurrent access, incorrect type checks (e.g. isinstance against wrong base class), and architectural issues visible in the diff.
+- ABSENCE IS VALID: Unlike single-file review, this is a complete change set. Missing error handling for new code paths, missing validation on new inputs, and missing tests for new functionality ARE valid findings.
+- LOWER CONFIDENCE THRESHOLD: Report findings at ≥60% confidence (not the usual ≥80%). Code review catches potential problems early — a "worth investigating" concern is valuable even if not certain.
+- SEVERITY CALIBRATION: In code review, a "Medium" finding means "a senior reviewer would comment on this." A "High" finding means "this should be fixed before merge." A "Critical" finding means "this will cause a production incident."
+- DO NOT SUPPRESS: The FINAL GATE and CLEAN CODE GATE do not apply in review mode. Well-structured code can still have review-worthy concerns (e.g., a well-written function that introduces a subtle race condition).
+- REPORT AT LEAST ONE FINDING if you identify ANY concern within your domain — even if it's only medium severity. An empty report should mean "I genuinely found nothing in my domain," not "I suppressed everything below high confidence."`;
+export function constructPerJudgePrompt(judge, code, language, contextSnippets = [], amendments, reviewMode = false) {
     const persona = judge.systemPrompt.substring(0, judge.systemPrompt.indexOf("\n\n"));
     const criteria = getCondensedCriteria(judge.systemPrompt);
     const contextSection = contextSnippets.length
@@ -210,14 +226,19 @@ export function constructPerJudgePrompt(judge, code, language, contextSnippets =
     // Filter amendments to only those relevant to this judge
     const relevantAmendments = (amendments ?? []).filter((a) => a.judgePrefix === judge.rulePrefix);
     const amendmentSection = formatAmendmentSection(relevantAmendments);
+    const reviewModeSection = reviewMode ? `${CODE_REVIEW_MODE_DIRECTIVE}\n\n` : "";
+    const evaluationTask = reviewMode
+        ? `Please review the following ${language} pull request diff for issues within your domain:\n\n\`\`\`${language}\n${code}\n\`\`\``
+        : `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\``;
     return (`${persona}\n\n` +
         `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
         `${PRECISION_MANDATE}\n\n` +
+        reviewModeSection +
         (amendmentSection ? `${amendmentSection}\n` : "") +
         contextSection +
         `${criteria}\n\n` +
-        `${CLEAN_CODE_GATE}\n\n` +
-        `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
+        (reviewMode ? "" : `${CLEAN_CODE_GATE}\n\n`) +
+        evaluationTask +
         `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`);
 }
 /**

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@kevinrabun/judges",
-  "version": "3.129.3",
+  "version": "3.129.4",
   "description": "45 specialized judges that evaluate AI-generated code for security, cost, and quality.",
   "mcpName": "io.github.KevinRabun/judges",
   "type": "module",

package/server.json CHANGED Viewed

@@ -16,12 +16,12 @@
       "mimeType": "image/png"
     }
   ],
-  "version": "3.129.3",
+  "version": "3.129.4",
   "packages": [
     {
       "registryType": "npm",
       "identifier": "@kevinrabun/judges",
-      "version": "3.129.3",
+      "version": "3.129.4",
       "transport": {
         "type": "stdio"
       }