@kevinrabun/judges 3.129.3 → 3.129.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -114,7 +114,14 @@ export declare function extractValidatedLlmFindings(response: string, prefixes?:
114
114
  * Uses condensed criteria (adversarial mandate stripped) plus shared mandates,
115
115
  * mirroring the tribunal architecture for consistency and better precision.
116
116
  */
117
- export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[]): string;
117
+ /**
118
+ * Review-mode directive injected into per-judge prompts for external code-review
119
+ * benchmarks. Overrides the precision/suppression mandates to match the code-review
120
+ * task: flag design concerns, behavior regressions, and subtle issues that a senior
121
+ * human reviewer would catch — not just CVE-class vulnerabilities.
122
+ */
123
+ export declare const CODE_REVIEW_MODE_DIRECTIVE = "CODE REVIEW MODE (this is a pull request diff review \u2014 OVERRIDES the SINGLE-FILE LIMITATION and adjusts the PRECISION MANDATE):\n- You are reviewing a pull request diff, not an isolated code snippet. The code shows WHAT CHANGED in a real codebase.\n- Lines starting with `-` were REMOVED. Lines starting with `+` were ADDED. Lines starting with `@@` are hunk headers with line numbers. Unchanged lines provide context.\n- BEHAVIOR REGRESSIONS: Flag any removed validation, removed error handling, removed security checks, weakened conditions, or relaxed constraints. If a check existed before and was removed or weakened, that IS a finding.\n- DESIGN CONCERNS: Flag cache invalidation gaps, missing transaction boundaries, race conditions in concurrent access, incorrect type checks (e.g. isinstance against wrong base class), and architectural issues visible in the diff.\n- ABSENCE IS VALID: Unlike single-file review, this is a complete change set. Missing error handling for new code paths, missing validation on new inputs, and missing tests for new functionality ARE valid findings.\n- LOWER CONFIDENCE THRESHOLD: Report findings at \u226560% confidence (not the usual \u226580%). Code review catches potential problems early \u2014 a \"worth investigating\" concern is valuable even if not certain.\n- SEVERITY CALIBRATION: In code review, a \"Medium\" finding means \"a senior reviewer would comment on this.\" A \"High\" finding means \"this should be fixed before merge.\" A \"Critical\" finding means \"this will cause a production incident.\"\n- DO NOT SUPPRESS: The FINAL GATE and CLEAN CODE GATE do not apply in review mode. Well-structured code can still have review-worthy concerns (e.g., a well-written function that introduces a subtle race condition).\n- REPORT AT LEAST ONE FINDING if you identify ANY concern within your domain \u2014 even if it's only medium severity. An empty report should mean \"I genuinely found nothing in my domain,\" not \"I suppressed everything below high confidence.\"";
124
+ export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[], reviewMode?: boolean): string;
118
125
  /**
119
126
  * Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
120
127
  * When `judges` is provided, uses that filtered list instead of all tribunal judges.
@@ -201,7 +201,23 @@ export function extractValidatedLlmFindings(response, prefixes) {
201
201
  * Uses condensed criteria (adversarial mandate stripped) plus shared mandates,
202
202
  * mirroring the tribunal architecture for consistency and better precision.
203
203
  */
204
- export function constructPerJudgePrompt(judge, code, language, contextSnippets = [], amendments) {
204
+ /**
205
+ * Review-mode directive injected into per-judge prompts for external code-review
206
+ * benchmarks. Overrides the precision/suppression mandates to match the code-review
207
+ * task: flag design concerns, behavior regressions, and subtle issues that a senior
208
+ * human reviewer would catch — not just CVE-class vulnerabilities.
209
+ */
210
+ export const CODE_REVIEW_MODE_DIRECTIVE = `CODE REVIEW MODE (this is a pull request diff review — OVERRIDES the SINGLE-FILE LIMITATION and adjusts the PRECISION MANDATE):
211
+ - You are reviewing a pull request diff, not an isolated code snippet. The code shows WHAT CHANGED in a real codebase.
212
+ - Lines starting with \`-\` were REMOVED. Lines starting with \`+\` were ADDED. Lines starting with \`@@\` are hunk headers with line numbers. Unchanged lines provide context.
213
+ - BEHAVIOR REGRESSIONS: Flag any removed validation, removed error handling, removed security checks, weakened conditions, or relaxed constraints. If a check existed before and was removed or weakened, that IS a finding.
214
+ - DESIGN CONCERNS: Flag cache invalidation gaps, missing transaction boundaries, race conditions in concurrent access, incorrect type checks (e.g. isinstance against wrong base class), and architectural issues visible in the diff.
215
+ - ABSENCE IS VALID: Unlike single-file review, this is a complete change set. Missing error handling for new code paths, missing validation on new inputs, and missing tests for new functionality ARE valid findings.
216
+ - LOWER CONFIDENCE THRESHOLD: Report findings at ≥60% confidence (not the usual ≥80%). Code review catches potential problems early — a "worth investigating" concern is valuable even if not certain.
217
+ - SEVERITY CALIBRATION: In code review, a "Medium" finding means "a senior reviewer would comment on this." A "High" finding means "this should be fixed before merge." A "Critical" finding means "this will cause a production incident."
218
+ - DO NOT SUPPRESS: The FINAL GATE and CLEAN CODE GATE do not apply in review mode. Well-structured code can still have review-worthy concerns (e.g., a well-written function that introduces a subtle race condition).
219
+ - REPORT AT LEAST ONE FINDING if you identify ANY concern within your domain — even if it's only medium severity. An empty report should mean "I genuinely found nothing in my domain," not "I suppressed everything below high confidence."`;
220
+ export function constructPerJudgePrompt(judge, code, language, contextSnippets = [], amendments, reviewMode = false) {
205
221
  const persona = judge.systemPrompt.substring(0, judge.systemPrompt.indexOf("\n\n"));
206
222
  const criteria = getCondensedCriteria(judge.systemPrompt);
207
223
  const contextSection = contextSnippets.length
@@ -210,14 +226,19 @@ export function constructPerJudgePrompt(judge, code, language, contextSnippets =
210
226
  // Filter amendments to only those relevant to this judge
211
227
  const relevantAmendments = (amendments ?? []).filter((a) => a.judgePrefix === judge.rulePrefix);
212
228
  const amendmentSection = formatAmendmentSection(relevantAmendments);
229
+ const reviewModeSection = reviewMode ? `${CODE_REVIEW_MODE_DIRECTIVE}\n\n` : "";
230
+ const evaluationTask = reviewMode
231
+ ? `Please review the following ${language} pull request diff for issues within your domain:\n\n\`\`\`${language}\n${code}\n\`\`\``
232
+ : `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\``;
213
233
  return (`${persona}\n\n` +
214
234
  `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
215
235
  `${PRECISION_MANDATE}\n\n` +
236
+ reviewModeSection +
216
237
  (amendmentSection ? `${amendmentSection}\n` : "") +
217
238
  contextSection +
218
239
  `${criteria}\n\n` +
219
- `${CLEAN_CODE_GATE}\n\n` +
220
- `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
240
+ (reviewMode ? "" : `${CLEAN_CODE_GATE}\n\n`) +
241
+ evaluationTask +
221
242
  `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`);
222
243
  }
223
244
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kevinrabun/judges",
3
- "version": "3.129.3",
3
+ "version": "3.129.5",
4
4
  "description": "45 specialized judges that evaluate AI-generated code for security, cost, and quality.",
5
5
  "mcpName": "io.github.KevinRabun/judges",
6
6
  "type": "module",
package/server.json CHANGED
@@ -16,12 +16,12 @@
16
16
  "mimeType": "image/png"
17
17
  }
18
18
  ],
19
- "version": "3.129.3",
19
+ "version": "3.129.5",
20
20
  "packages": [
21
21
  {
22
22
  "registryType": "npm",
23
23
  "identifier": "@kevinrabun/judges",
24
- "version": "3.129.3",
24
+ "version": "3.129.5",
25
25
  "transport": {
26
26
  "type": "stdio"
27
27
  }