@kevinrabun/judges 3.129.3 → 3.129.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -114,7 +114,14 @@ export declare function extractValidatedLlmFindings(response: string, prefixes?:
|
|
|
114
114
|
* Uses condensed criteria (adversarial mandate stripped) plus shared mandates,
|
|
115
115
|
* mirroring the tribunal architecture for consistency and better precision.
|
|
116
116
|
*/
|
|
117
|
-
|
|
117
|
+
/**
|
|
118
|
+
* Review-mode directive injected into per-judge prompts for external code-review
|
|
119
|
+
* benchmarks. Overrides the precision/suppression mandates to match the code-review
|
|
120
|
+
* task: flag design concerns, behavior regressions, and subtle issues that a senior
|
|
121
|
+
* human reviewer would catch — not just CVE-class vulnerabilities.
|
|
122
|
+
*/
|
|
123
|
+
export declare const CODE_REVIEW_MODE_DIRECTIVE = "CODE REVIEW MODE (this is a pull request diff review \u2014 OVERRIDES the SINGLE-FILE LIMITATION and adjusts the PRECISION MANDATE):\n- You are reviewing a pull request diff, not an isolated code snippet. The code shows WHAT CHANGED in a real codebase.\n- Lines starting with `-` were REMOVED. Lines starting with `+` were ADDED. Lines starting with `@@` are hunk headers with line numbers. Unchanged lines provide context.\n- BEHAVIOR REGRESSIONS: Flag any removed validation, removed error handling, removed security checks, weakened conditions, or relaxed constraints. If a check existed before and was removed or weakened, that IS a finding.\n- DESIGN CONCERNS: Flag cache invalidation gaps, missing transaction boundaries, race conditions in concurrent access, incorrect type checks (e.g. isinstance against wrong base class), and architectural issues visible in the diff.\n- ABSENCE IS VALID: Unlike single-file review, this is a complete change set. Missing error handling for new code paths, missing validation on new inputs, and missing tests for new functionality ARE valid findings.\n- LOWER CONFIDENCE THRESHOLD: Report findings at \u226560% confidence (not the usual \u226580%). Code review catches potential problems early \u2014 a \"worth investigating\" concern is valuable even if not certain.\n- SEVERITY CALIBRATION: In code review, a \"Medium\" finding means \"a senior reviewer would comment on this.\" A \"High\" finding means \"this should be fixed before merge.\" A \"Critical\" finding means \"this will cause a production incident.\"\n- DO NOT SUPPRESS: The FINAL GATE and CLEAN CODE GATE do not apply in review mode. Well-structured code can still have review-worthy concerns (e.g., a well-written function that introduces a subtle race condition).\n- REPORT AT LEAST ONE FINDING if you identify ANY concern within your domain \u2014 even if it's only medium severity. An empty report should mean \"I genuinely found nothing in my domain,\" not \"I suppressed everything below high confidence.\"";
|
|
124
|
+
export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[], reviewMode?: boolean): string;
|
|
118
125
|
/**
|
|
119
126
|
* Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
|
|
120
127
|
* When `judges` is provided, uses that filtered list instead of all tribunal judges.
|
|
@@ -201,7 +201,23 @@ export function extractValidatedLlmFindings(response, prefixes) {
|
|
|
201
201
|
* Uses condensed criteria (adversarial mandate stripped) plus shared mandates,
|
|
202
202
|
* mirroring the tribunal architecture for consistency and better precision.
|
|
203
203
|
*/
|
|
204
|
-
|
|
204
|
+
/**
|
|
205
|
+
* Review-mode directive injected into per-judge prompts for external code-review
|
|
206
|
+
* benchmarks. Overrides the precision/suppression mandates to match the code-review
|
|
207
|
+
* task: flag design concerns, behavior regressions, and subtle issues that a senior
|
|
208
|
+
* human reviewer would catch — not just CVE-class vulnerabilities.
|
|
209
|
+
*/
|
|
210
|
+
export const CODE_REVIEW_MODE_DIRECTIVE = `CODE REVIEW MODE (this is a pull request diff review — OVERRIDES the SINGLE-FILE LIMITATION and adjusts the PRECISION MANDATE):
|
|
211
|
+
- You are reviewing a pull request diff, not an isolated code snippet. The code shows WHAT CHANGED in a real codebase.
|
|
212
|
+
- Lines starting with \`-\` were REMOVED. Lines starting with \`+\` were ADDED. Lines starting with \`@@\` are hunk headers with line numbers. Unchanged lines provide context.
|
|
213
|
+
- BEHAVIOR REGRESSIONS: Flag any removed validation, removed error handling, removed security checks, weakened conditions, or relaxed constraints. If a check existed before and was removed or weakened, that IS a finding.
|
|
214
|
+
- DESIGN CONCERNS: Flag cache invalidation gaps, missing transaction boundaries, race conditions in concurrent access, incorrect type checks (e.g. isinstance against wrong base class), and architectural issues visible in the diff.
|
|
215
|
+
- ABSENCE IS VALID: Unlike single-file review, this is a complete change set. Missing error handling for new code paths, missing validation on new inputs, and missing tests for new functionality ARE valid findings.
|
|
216
|
+
- LOWER CONFIDENCE THRESHOLD: Report findings at ≥60% confidence (not the usual ≥80%). Code review catches potential problems early — a "worth investigating" concern is valuable even if not certain.
|
|
217
|
+
- SEVERITY CALIBRATION: In code review, a "Medium" finding means "a senior reviewer would comment on this." A "High" finding means "this should be fixed before merge." A "Critical" finding means "this will cause a production incident."
|
|
218
|
+
- DO NOT SUPPRESS: The FINAL GATE and CLEAN CODE GATE do not apply in review mode. Well-structured code can still have review-worthy concerns (e.g., a well-written function that introduces a subtle race condition).
|
|
219
|
+
- REPORT AT LEAST ONE FINDING if you identify ANY concern within your domain — even if it's only medium severity. An empty report should mean "I genuinely found nothing in my domain," not "I suppressed everything below high confidence."`;
|
|
220
|
+
export function constructPerJudgePrompt(judge, code, language, contextSnippets = [], amendments, reviewMode = false) {
|
|
205
221
|
const persona = judge.systemPrompt.substring(0, judge.systemPrompt.indexOf("\n\n"));
|
|
206
222
|
const criteria = getCondensedCriteria(judge.systemPrompt);
|
|
207
223
|
const contextSection = contextSnippets.length
|
|
@@ -210,14 +226,19 @@ export function constructPerJudgePrompt(judge, code, language, contextSnippets =
|
|
|
210
226
|
// Filter amendments to only those relevant to this judge
|
|
211
227
|
const relevantAmendments = (amendments ?? []).filter((a) => a.judgePrefix === judge.rulePrefix);
|
|
212
228
|
const amendmentSection = formatAmendmentSection(relevantAmendments);
|
|
229
|
+
const reviewModeSection = reviewMode ? `${CODE_REVIEW_MODE_DIRECTIVE}\n\n` : "";
|
|
230
|
+
const evaluationTask = reviewMode
|
|
231
|
+
? `Please review the following ${language} pull request diff for issues within your domain:\n\n\`\`\`${language}\n${code}\n\`\`\``
|
|
232
|
+
: `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\``;
|
|
213
233
|
return (`${persona}\n\n` +
|
|
214
234
|
`${SHARED_ADVERSARIAL_MANDATE}\n\n` +
|
|
215
235
|
`${PRECISION_MANDATE}\n\n` +
|
|
236
|
+
reviewModeSection +
|
|
216
237
|
(amendmentSection ? `${amendmentSection}\n` : "") +
|
|
217
238
|
contextSection +
|
|
218
239
|
`${criteria}\n\n` +
|
|
219
|
-
`${CLEAN_CODE_GATE}\n\n` +
|
|
220
|
-
|
|
240
|
+
(reviewMode ? "" : `${CLEAN_CODE_GATE}\n\n`) +
|
|
241
|
+
evaluationTask +
|
|
221
242
|
`\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`);
|
|
222
243
|
}
|
|
223
244
|
/**
|
package/package.json
CHANGED
package/server.json
CHANGED
|
@@ -16,12 +16,12 @@
|
|
|
16
16
|
"mimeType": "image/png"
|
|
17
17
|
}
|
|
18
18
|
],
|
|
19
|
-
"version": "3.129.3",
|
|
19
|
+
"version": "3.129.4",
|
|
20
20
|
"packages": [
|
|
21
21
|
{
|
|
22
22
|
"registryType": "npm",
|
|
23
23
|
"identifier": "@kevinrabun/judges",
|
|
24
|
-
"version": "3.129.3",
|
|
24
|
+
"version": "3.129.4",
|
|
25
25
|
"transport": {
|
|
26
26
|
"type": "stdio"
|
|
27
27
|
}
|