@kevinrabun/judges 3.113.0 → 3.115.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -0
- package/agents/accessibility.judge.md +37 -0
- package/agents/agent-instructions.judge.md +37 -0
- package/agents/ai-code-safety.judge.md +48 -0
- package/agents/api-contract.judge.md +30 -0
- package/agents/api-design.judge.md +39 -0
- package/agents/authentication.judge.md +37 -0
- package/agents/backwards-compatibility.judge.md +37 -0
- package/agents/caching.judge.md +37 -0
- package/agents/ci-cd.judge.md +37 -0
- package/agents/cloud-readiness.judge.md +37 -0
- package/agents/code-structure.judge.md +48 -0
- package/agents/compliance.judge.md +40 -0
- package/agents/concurrency.judge.md +39 -0
- package/agents/configuration-management.judge.md +37 -0
- package/agents/cost-effectiveness.judge.md +40 -0
- package/agents/cybersecurity.judge.md +36 -0
- package/agents/data-security.judge.md +34 -0
- package/agents/data-sovereignty.judge.md +58 -0
- package/agents/database.judge.md +41 -0
- package/agents/dependency-health.judge.md +39 -0
- package/agents/documentation.judge.md +39 -0
- package/agents/error-handling.judge.md +37 -0
- package/agents/ethics-bias.judge.md +39 -0
- package/agents/false-positive-review.judge.md +73 -0
- package/agents/framework-safety.judge.md +40 -0
- package/agents/hallucination-detection.judge.md +33 -0
- package/agents/iac-security.judge.md +38 -0
- package/agents/intent-alignment.judge.md +31 -0
- package/agents/internationalization.judge.md +42 -0
- package/agents/logging-privacy.judge.md +37 -0
- package/agents/logic-review.judge.md +34 -0
- package/agents/maintainability.judge.md +37 -0
- package/agents/model-fingerprint.judge.md +31 -0
- package/agents/multi-turn-coherence.judge.md +29 -0
- package/agents/observability.judge.md +37 -0
- package/agents/over-engineering.judge.md +48 -0
- package/agents/performance.judge.md +44 -0
- package/agents/portability.judge.md +37 -0
- package/agents/rate-limiting.judge.md +37 -0
- package/agents/reliability.judge.md +39 -0
- package/agents/scalability.judge.md +41 -0
- package/agents/security.judge.md +31 -0
- package/agents/software-practices.judge.md +44 -0
- package/agents/testing.judge.md +39 -0
- package/agents/ux.judge.md +37 -0
- package/dist/api.d.ts +9 -1
- package/dist/api.js +9 -1
- package/dist/commands/fix.d.ts +10 -0
- package/dist/commands/fix.js +52 -0
- package/dist/commands/llm-benchmark.d.ts +13 -4
- package/dist/commands/llm-benchmark.js +39 -8
- package/dist/commands/review.d.ts +51 -1
- package/dist/commands/review.js +213 -7
- package/dist/evaluators/index.js +61 -35
- package/dist/github-app.d.ts +35 -0
- package/dist/github-app.js +125 -4
- package/dist/judges/index.d.ts +23 -61
- package/dist/judges/index.js +49 -63
- package/dist/patches/apply.d.ts +15 -0
- package/dist/patches/apply.js +37 -0
- package/dist/tools/prompts.d.ts +2 -2
- package/dist/tools/prompts.js +21 -10
- package/docs/skills.md +7 -0
- package/package.json +18 -3
- package/packages/judges-cli/README.md +24 -0
- package/packages/judges-cli/bin/judges.js +8 -0
- package/scripts/generate-agents-from-judges.ts +111 -0
- package/scripts/generate-skills-docs.ts +26 -0
- package/scripts/validate-agents.ts +104 -0
- package/server.json +2 -2
- package/skills/ai-code-review.skill.md +57 -0
- package/skills/release-gate.skill.md +27 -0
- package/skills/security-review.skill.md +32 -0
- package/src/agent-loader.ts +324 -0
- package/src/skill-loader.ts +199 -0
package/dist/github-app.js
CHANGED
|
@@ -24,6 +24,11 @@ import { readFileSync, existsSync } from "fs";
|
|
|
24
24
|
import { createServer } from "http";
|
|
25
25
|
import { evaluateWithTribunal } from "./evaluators/index.js";
|
|
26
26
|
import { evaluateProject } from "./evaluators/project.js";
|
|
27
|
+
import { extractValidatedLlmFindings, getValidRulePrefixes, constructTribunalPrompt, } from "./commands/llm-benchmark.js";
|
|
28
|
+
import { buildContextSnippets } from "./context/context-snippets.js";
|
|
29
|
+
// Test override hooks (exported for tsx/node:test to avoid esbuild inlining)
|
|
30
|
+
export let evaluateWithTribunalImpl = evaluateWithTribunal;
|
|
31
|
+
export let evaluateProjectImpl = evaluateProject;
|
|
27
32
|
// ─── Language Detection ─────────────────────────────────────────────────────
|
|
28
33
|
export const EXT_TO_LANG = {
|
|
29
34
|
".ts": "typescript",
|
|
@@ -71,7 +76,12 @@ export function generateJwt(appId, privateKey) {
|
|
|
71
76
|
return `${signingInput}.${signature}`;
|
|
72
77
|
}
|
|
73
78
|
// ─── GitHub API Helper ──────────────────────────────────────────────────────
|
|
79
|
+
// Test hook for API injection
|
|
80
|
+
let ghApiImpl;
|
|
74
81
|
async function ghApi(method, path, token, body) {
|
|
82
|
+
if (ghApiImpl) {
|
|
83
|
+
return ghApiImpl(method, path, token, body);
|
|
84
|
+
}
|
|
75
85
|
const { default: https } = await import("https");
|
|
76
86
|
const payload = body ? JSON.stringify(body) : "";
|
|
77
87
|
return new Promise((resolve, reject) => {
|
|
@@ -105,8 +115,50 @@ async function ghApi(method, path, token, body) {
|
|
|
105
115
|
req.end();
|
|
106
116
|
});
|
|
107
117
|
}
|
|
118
|
+
export function __setGhApiImplForTest(fn) {
|
|
119
|
+
ghApiImpl = fn;
|
|
120
|
+
}
|
|
121
|
+
/**
 * Send a single-turn chat completion request to an OpenAI-compatible endpoint
 * and return the text of the first choice.
 *
 * @param {string} prompt - Text sent as the sole `user` message.
 * @param {object} opts
 * @param {string} opts.apiKey - Bearer token placed in the Authorization header.
 * @param {string} opts.model - Model identifier forwarded in the request body.
 * @param {string} [opts.baseUrl] - Full endpoint URL; defaults to the public
 *   OpenAI chat completions URL when falsy.
 * @param {number} [opts.maxTokens=800] - Value sent as `max_tokens`.
 * @param {number} [opts.timeoutMs=60000] - Abort the request after this many
 *   milliseconds (only when the runtime provides `AbortSignal.timeout`).
 * @returns {Promise<string>} Trimmed, non-empty message content.
 * @throws {Error} When fetch is unavailable, the response status is not OK, or
 *   the response carries no usable text content.
 */
async function callOpenAiChat(prompt, opts) {
    // Node 18+ provides global fetch
    const fetchImpl = globalThis.fetch;
    if (!fetchImpl) {
        throw new Error("fetch() not available. Run on Node 18+ or polyfill fetch.");
    }
    const url = opts.baseUrl || "https://api.openai.com/v1/chat/completions";
    // Bound the request so a stalled endpoint cannot hang the caller forever.
    const timeoutMs = opts.timeoutMs ?? 60000;
    const signal = typeof AbortSignal !== "undefined" && typeof AbortSignal.timeout === "function"
        ? AbortSignal.timeout(timeoutMs)
        : undefined;
    const res = await fetchImpl(url, {
        method: "POST",
        headers: {
            Authorization: `Bearer ${opts.apiKey}`,
            "Content-Type": "application/json",
        },
        body: JSON.stringify({
            model: opts.model,
            max_tokens: opts.maxTokens ?? 800,
            temperature: 0,
            messages: [{ role: "user", content: prompt }],
        }),
        signal,
    });
    if (!res.ok) {
        // The error message may be surfaced to users, so keep the echoed body
        // bounded and never let a body-read failure hide the HTTP status.
        const maxDetailChars = 500;
        let detail = "";
        try {
            detail = await res.text();
        }
        catch {
            // Body unreadable; the status line below is still reported.
        }
        if (detail.length > maxDetailChars) {
            detail = `${detail.slice(0, maxDetailChars)}…`;
        }
        throw new Error(`LLM request failed: ${res.status} ${res.statusText} ${detail}`);
    }
    const json = await res.json();
    const content = json?.choices?.[0]?.message?.content;
    // Only plain-string content is usable; anything else (null, content-part
    // arrays, whitespace only) is reported as missing instead of crashing on trim().
    const text = typeof content === "string" ? content.trim() : "";
    if (!text) {
        throw new Error("LLM response missing content");
    }
    return text;
}
|
|
151
|
+
// Test hook
|
|
152
|
+
let callOpenAiChatImpl = callOpenAiChat;
|
|
153
|
+
export function __setCallOpenAiChatImplForTest(fn) {
|
|
154
|
+
callOpenAiChatImpl = fn;
|
|
155
|
+
}
|
|
108
156
|
// ─── Installation Token ─────────────────────────────────────────────────────
|
|
157
|
+
// Test hook
|
|
158
|
+
let getInstallationTokenImpl;
|
|
109
159
|
async function getInstallationToken(appId, privateKey, installationId) {
|
|
160
|
+
if (getInstallationTokenImpl)
|
|
161
|
+
return getInstallationTokenImpl(appId, privateKey, installationId);
|
|
110
162
|
const jwt = generateJwt(appId, privateKey);
|
|
111
163
|
const res = await ghApi("POST", `/app/installations/${installationId}/access_tokens`, jwt);
|
|
112
164
|
const data = res.data;
|
|
@@ -136,7 +188,8 @@ export function parsePatchToHunk(filePath, patch) {
|
|
|
136
188
|
const changedLineNumbers = [];
|
|
137
189
|
let newLineNum = 0;
|
|
138
190
|
for (const line of lines) {
|
|
139
|
-
|
|
191
|
+
// Hunk header: @@ -10,5 +20,8 @@ (some tools omit trailing space/@@)
|
|
192
|
+
const hunkMatch = line.match(/^@@\s*-\d+(?:,\d+)?\s+\+(\d+)(?:,\d+)?\s*@@?/);
|
|
140
193
|
if (hunkMatch) {
|
|
141
194
|
newLineNum = parseInt(hunkMatch[1], 10) - 1;
|
|
142
195
|
continue;
|
|
@@ -206,7 +259,9 @@ async function reviewPullRequest(payload, token, config) {
|
|
|
206
259
|
if (!hunk.newContent.trim())
|
|
207
260
|
continue;
|
|
208
261
|
try {
|
|
209
|
-
|
|
262
|
+
// indirection to allow test overrides even when bundlers inline imports
|
|
263
|
+
const evalFn = getEvaluateWithTribunalImpl();
|
|
264
|
+
const verdict = evalFn(hunk.newContent, lang, undefined, {
|
|
210
265
|
filePath: file.filename,
|
|
211
266
|
includeAstFindings: true,
|
|
212
267
|
});
|
|
@@ -243,8 +298,8 @@ async function reviewPullRequest(payload, token, config) {
|
|
|
243
298
|
}
|
|
244
299
|
if (projectFiles.length >= 2) {
|
|
245
300
|
try {
|
|
246
|
-
const runner = { evaluateWithTribunal };
|
|
247
|
-
const projectVerdict =
|
|
301
|
+
const runner = { evaluateWithTribunal: evaluateWithTribunalImpl };
|
|
302
|
+
const projectVerdict = evaluateProjectImpl(runner, projectFiles);
|
|
248
303
|
for (const f of projectVerdict.architecturalFindings ?? []) {
|
|
249
304
|
if (!meetsSeverityThreshold(f.severity, minSeverity))
|
|
250
305
|
continue;
|
|
@@ -255,6 +310,43 @@ async function reviewPullRequest(payload, token, config) {
|
|
|
255
310
|
// Cross-file failure should not block the review
|
|
256
311
|
}
|
|
257
312
|
}
|
|
313
|
+
// 2c. Optional Layer 2 (LLM) augmentation — append summary comment
|
|
314
|
+
let llmSummary;
|
|
315
|
+
try {
|
|
316
|
+
if (process.env.OPENAI_API_KEY && config.llmDeepReview !== false) {
|
|
317
|
+
const codeBlobs = [];
|
|
318
|
+
const snippetsForRag = [];
|
|
319
|
+
for (const file of prFiles) {
|
|
320
|
+
if (!file.patch)
|
|
321
|
+
continue;
|
|
322
|
+
const hunk = parsePatchToHunk(file.filename, file.patch);
|
|
323
|
+
codeBlobs.push(`// FILE: ${file.filename}\n${hunk.newContent}`);
|
|
324
|
+
snippetsForRag.push(hunk.newContent);
|
|
325
|
+
}
|
|
326
|
+
const combinedCode = codeBlobs.join("\n\n");
|
|
327
|
+
const ragSnippets = await buildContextSnippets(snippetsForRag.join("\n\n"), {
|
|
328
|
+
maxSnippets: 4,
|
|
329
|
+
chunkSize: 1500,
|
|
330
|
+
});
|
|
331
|
+
const contextText = ragSnippets.map((s) => s.snippet);
|
|
332
|
+
const tribunalPrompt = constructTribunalPrompt(combinedCode, "mixed", contextText);
|
|
333
|
+
const content = await callOpenAiChatImpl(tribunalPrompt, {
|
|
334
|
+
apiKey: process.env.OPENAI_API_KEY,
|
|
335
|
+
model: process.env.OPENAI_MODEL || "gpt-4o",
|
|
336
|
+
baseUrl: process.env.OPENAI_BASE_URL,
|
|
337
|
+
maxTokens: 800,
|
|
338
|
+
});
|
|
339
|
+
const validation = extractValidatedLlmFindings(content, getValidRulePrefixes());
|
|
340
|
+
const warnings = validation.errors?.length ? `\n\n⚠️ Validation warnings: ${validation.errors.join("; ")}` : "";
|
|
341
|
+
llmSummary =
|
|
342
|
+
`### 🤖 LLM Deep Review (model: ${process.env.OPENAI_MODEL || "gpt-4o"})\n` +
|
|
343
|
+
(validation.ruleIds.length ? `Detected rule IDs: ${validation.ruleIds.join(", ")}` : "No rule IDs detected.") +
|
|
344
|
+
`\n\n${content}${warnings}`;
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
catch (err) {
|
|
348
|
+
llmSummary = `⚠️ LLM deep review failed: ${String(err.message ?? err)}`;
|
|
349
|
+
}
|
|
258
350
|
// 3. Build review comments
|
|
259
351
|
const comments = [];
|
|
260
352
|
const seen = new Set();
|
|
@@ -338,6 +430,9 @@ async function reviewPullRequest(payload, token, config) {
|
|
|
338
430
|
if (allFindings.length === 0) {
|
|
339
431
|
summaryLines.push("✅ No findings — code looks good!");
|
|
340
432
|
}
|
|
433
|
+
if (typeof llmSummary === "string") {
|
|
434
|
+
summaryLines.push("", llmSummary);
|
|
435
|
+
}
|
|
341
436
|
const reviewBody = summaryLines.join("\n");
|
|
342
437
|
if (comments.length > 0) {
|
|
343
438
|
const reviewRes = await ghApi("POST", `/repos/${repoFullName}/pulls/${prNumber}/reviews`, token, {
|
|
@@ -534,8 +629,34 @@ export function loadAppConfig() {
|
|
|
534
629
|
autoApprove: process.env.JUDGES_AUTO_APPROVE === "true",
|
|
535
630
|
diffOnly: process.env.JUDGES_DIFF_ONLY !== "false",
|
|
536
631
|
configPath: process.env.JUDGES_CONFIG_PATH,
|
|
632
|
+
llmDeepReview: process.env.JUDGES_LLM_DEEP_REVIEW !== "false", // default on if key exists
|
|
537
633
|
};
|
|
538
634
|
}
|
|
635
|
+
// Test hooks (non-public)
|
|
636
|
+
export function __setEvaluateWithTribunalForTest(fn) {
|
|
637
|
+
evaluateWithTribunalImpl = fn ?? evaluateWithTribunal;
|
|
638
|
+
}
|
|
639
|
+
export function __setEvaluateProjectForTest(fn) {
|
|
640
|
+
evaluateProjectImpl = fn ?? evaluateProject;
|
|
641
|
+
}
|
|
642
|
+
export function getEvaluateWithTribunalImpl() {
|
|
643
|
+
return evaluateWithTribunalImpl;
|
|
644
|
+
}
|
|
645
|
+
export function __getEvaluateWithTribunalImplForTest() {
|
|
646
|
+
return evaluateWithTribunalImpl;
|
|
647
|
+
}
|
|
648
|
+
/**
 * Bundle of test-only hooks and internals, exposed as a single object so test
 * code can reach them through one export.
 */
export const __test = {
    __setCallOpenAiChatImplForTest,
    // Despite the "get" prefix this installs an override for
    // getInstallationToken. The key is kept for backward compatibility;
    // prefer __setInstallationTokenImplForTest below.
    __getInstallationTokenForTest: (fn) => {
        getInstallationTokenImpl = fn;
    },
    // Correctly named alias of the setter above.
    __setInstallationTokenImplForTest: (fn) => {
        getInstallationTokenImpl = fn;
    },
    __setGhApiImplForTest,
    __setEvaluateWithTribunalForTest,
    __setEvaluateProjectForTest,
    __getEvaluateWithTribunalImplForTest,
    parsePatchToHunk,
    reviewPullRequest,
};
|
|
539
660
|
// ─── Standalone HTTP Server ─────────────────────────────────────────────────
|
|
540
661
|
/**
|
|
541
662
|
* Start a standalone HTTP server that listens for GitHub webhooks.
|
package/dist/judges/index.d.ts
CHANGED
|
@@ -1,69 +1,31 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Judge
|
|
3
|
-
* unified JudgeRegistry. Each judge file imports its own evaluator and
|
|
4
|
-
* calls `defaultRegistry.register()`, so this file just needs to import
|
|
5
|
-
* each module for its side effects.
|
|
2
|
+
* Judge registry bootstrap (agent-native).
|
|
6
3
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
4
|
+
* Judges are now sourced from `.judge.md` files in the `agents/` folder (legacy
|
|
5
|
+
* `.agent.md` still supported). Each agent frontmatter references an evaluator
|
|
6
|
+
* script (in `src/evaluators/`), and the agent loader registers them with the
|
|
7
|
+
* unified `JudgeRegistry`.
|
|
8
|
+
*
|
|
9
|
+
* Legacy side-effect imports have been removed. If you need to add a judge, add
|
|
10
|
+
* an agent file and (optionally) an evaluator script, then run:
|
|
11
|
+
* - `npm run generate:agents` (to sync)
|
|
12
|
+
* - `npm run validate:agents`
|
|
11
13
|
*/
|
|
12
14
|
import type { JudgeDefinition } from "../types.js";
|
|
13
|
-
import "./data-security.js";
|
|
14
|
-
import "./cybersecurity.js";
|
|
15
|
-
import "./cost-effectiveness.js";
|
|
16
|
-
import "./scalability.js";
|
|
17
|
-
import "./cloud-readiness.js";
|
|
18
|
-
import "./software-practices.js";
|
|
19
|
-
import "./accessibility.js";
|
|
20
|
-
import "./api-design.js";
|
|
21
|
-
import "./reliability.js";
|
|
22
|
-
import "./observability.js";
|
|
23
|
-
import "./performance.js";
|
|
24
|
-
import "./compliance.js";
|
|
25
|
-
import "./data-sovereignty.js";
|
|
26
|
-
import "./testing.js";
|
|
27
|
-
import "./documentation.js";
|
|
28
|
-
import "./internationalization.js";
|
|
29
|
-
import "./dependency-health.js";
|
|
30
|
-
import "./concurrency.js";
|
|
31
|
-
import "./ethics-bias.js";
|
|
32
|
-
import "./maintainability.js";
|
|
33
|
-
import "./error-handling.js";
|
|
34
|
-
import "./authentication.js";
|
|
35
|
-
import "./database.js";
|
|
36
|
-
import "./caching.js";
|
|
37
|
-
import "./configuration-management.js";
|
|
38
|
-
import "./backwards-compatibility.js";
|
|
39
|
-
import "./portability.js";
|
|
40
|
-
import "./ux.js";
|
|
41
|
-
import "./logging-privacy.js";
|
|
42
|
-
import "./rate-limiting.js";
|
|
43
|
-
import "./ci-cd.js";
|
|
44
|
-
import "./code-structure.js";
|
|
45
|
-
import "./agent-instructions.js";
|
|
46
|
-
import "./ai-code-safety.js";
|
|
47
|
-
import "./framework-safety.js";
|
|
48
|
-
import "./iac-security.js";
|
|
49
|
-
import "./security.js";
|
|
50
|
-
import "./hallucination-detection.js";
|
|
51
|
-
import "./intent-alignment.js";
|
|
52
|
-
import "./api-contract.js";
|
|
53
|
-
import "./multi-turn-coherence.js";
|
|
54
|
-
import "./model-fingerprint.js";
|
|
55
|
-
import "./over-engineering.js";
|
|
56
|
-
import "./logic-review.js";
|
|
57
|
-
import "./false-positive-review.js";
|
|
58
15
|
/**
|
|
59
|
-
*
|
|
60
|
-
*
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
*
|
|
65
|
-
*
|
|
66
|
-
*
|
|
16
|
+
* Load judges (agent-native). Loads agents from the default `agents/` folder
|
|
17
|
+
* and returns the current registry snapshot.
|
|
18
|
+
*/
|
|
19
|
+
export declare function loadJudges(): Promise<JudgeDefinition[]>;
|
|
20
|
+
/**
|
|
21
|
+
* Load agent-based judges from a directory of `.judge.md` files (legacy
|
|
22
|
+
* `.agent.md` supported). This enables hybrid operation where file-based
|
|
23
|
+
* agents can augment or replace built-in judges. If a judge is already
|
|
24
|
+
* registered, it is skipped.
|
|
25
|
+
*/
|
|
26
|
+
export declare function loadAgentJudges(dir?: string): number;
|
|
27
|
+
/**
|
|
28
|
+
* Snapshot of the currently registered judges. (Agent-native)
|
|
67
29
|
*/
|
|
68
30
|
export declare const JUDGES: JudgeDefinition[];
|
|
69
31
|
/**
|
package/dist/judges/index.js
CHANGED
|
@@ -1,73 +1,59 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Judge
|
|
3
|
-
* unified JudgeRegistry. Each judge file imports its own evaluator and
|
|
4
|
-
* calls `defaultRegistry.register()`, so this file just needs to import
|
|
5
|
-
* each module for its side effects.
|
|
2
|
+
* Judge registry bootstrap (agent-native).
|
|
6
3
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
4
|
+
* Judges are now sourced from `.judge.md` files in the `agents/` folder (legacy
|
|
5
|
+
* `.agent.md` still supported). Each agent frontmatter references an evaluator
|
|
6
|
+
* script (in `src/evaluators/`), and the agent loader registers them with the
|
|
7
|
+
* unified `JudgeRegistry`.
|
|
8
|
+
*
|
|
9
|
+
* Legacy side-effect imports have been removed. If you need to add a judge, add
|
|
10
|
+
* an agent file and (optionally) an evaluator script, then run:
|
|
11
|
+
* - `npm run generate:agents` (to sync)
|
|
12
|
+
* - `npm run validate:agents`
|
|
11
13
|
*/
|
|
12
14
|
import { defaultRegistry } from "../judge-registry.js";
|
|
13
|
-
|
|
14
|
-
import "
|
|
15
|
-
import "
|
|
16
|
-
import
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
import "./ai-code-safety.js";
|
|
48
|
-
import "./framework-safety.js";
|
|
49
|
-
import "./iac-security.js";
|
|
50
|
-
import "./security.js";
|
|
51
|
-
import "./hallucination-detection.js";
|
|
52
|
-
import "./intent-alignment.js";
|
|
53
|
-
import "./api-contract.js";
|
|
54
|
-
import "./multi-turn-coherence.js";
|
|
55
|
-
import "./model-fingerprint.js";
|
|
56
|
-
import "./over-engineering.js";
|
|
57
|
-
import "./logic-review.js";
|
|
58
|
-
import "./false-positive-review.js";
|
|
15
|
+
import { loadAndRegisterAgents } from "../agent-loader.js";
|
|
16
|
+
import { resolve, dirname } from "node:path";
|
|
17
|
+
import { fileURLToPath } from "node:url";
|
|
18
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
19
|
+
const __dirname = dirname(__filename);
|
|
20
|
+
let agentsLoaded = false;
|
|
21
|
+
function loadDefaultAgents() {
|
|
22
|
+
if (agentsLoaded)
|
|
23
|
+
return;
|
|
24
|
+
const agentsDir = resolve(__dirname, "..", "..", "agents");
|
|
25
|
+
loadAndRegisterAgents(agentsDir, defaultRegistry);
|
|
26
|
+
agentsLoaded = true;
|
|
27
|
+
}
|
|
28
|
+
// ─── Optional Agent Loader Integration ──────────────────────────────────────
|
|
29
|
+
/**
|
|
30
|
+
* Load judges (agent-native). Loads agents from the default `agents/` folder
|
|
31
|
+
* and returns the current registry snapshot.
|
|
32
|
+
*/
|
|
33
|
+
export async function loadJudges() {
|
|
34
|
+
loadDefaultAgents();
|
|
35
|
+
return defaultRegistry.getJudges();
|
|
36
|
+
}
|
|
37
|
+
/**
|
|
38
|
+
* Load agent-based judges from a directory of `.judge.md` files (legacy
|
|
39
|
+
* `.agent.md` supported). This enables hybrid operation where file-based
|
|
40
|
+
* agents can augment or replace built-in judges. If a judge is already
|
|
41
|
+
* registered, it is skipped.
|
|
42
|
+
*/
|
|
43
|
+
/**
 * Load agent judge files from `dir` and register them with the default registry.
 *
 * Every call performs the load; the `agentsLoaded` flag only guards the lazy
 * default-directory bootstrap, so it is updated here only when the directory
 * loaded was the default one. Loading a custom directory therefore never marks
 * the default agents as loaded.
 *
 * @param {string} [dir] - Directory to load from; defaults to the package's
 *   `agents/` folder resolved relative to this module.
 * @returns {number} The value returned by `loadAndRegisterAgents` for this call.
 */
export function loadAgentJudges(dir = resolve(__dirname, "..", "..", "agents")) {
    const defaultAgentsDir = resolve(__dirname, "..", "..", "agents");
    const count = loadAndRegisterAgents(dir, defaultRegistry);
    if (resolve(dir) === defaultAgentsDir) {
        agentsLoaded = true;
    }
    return count;
}
|
|
59
49
|
// ─── Re-exports backed by the registry ──────────────────────────────────────
|
|
60
50
|
/**
|
|
61
|
-
*
|
|
62
|
-
*
|
|
63
|
-
* Each judge is a specialized evaluator with deep expertise in a single domain.
|
|
64
|
-
* They operate independently and produce structured findings with
|
|
65
|
-
* severity-rated, actionable recommendations.
|
|
66
|
-
*
|
|
67
|
-
* Note: this snapshot is taken at module-load time, after all built-in judges
|
|
68
|
-
* have self-registered via the side-effect imports above.
|
|
51
|
+
* Snapshot of the currently registered judges. (Agent-native)
|
|
69
52
|
*/
|
|
70
|
-
export const JUDGES =
|
|
53
|
+
export const JUDGES = (() => {
|
|
54
|
+
loadDefaultAgents();
|
|
55
|
+
return defaultRegistry.getJudges();
|
|
56
|
+
})();
|
|
71
57
|
/**
|
|
72
58
|
* Look up a judge by ID.
|
|
73
59
|
*/
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { Finding } from "../types.js";
|
|
2
|
+
export interface ApplyPatchOptions {
|
|
3
|
+
dryRun?: boolean;
|
|
4
|
+
cwd?: string;
|
|
5
|
+
}
|
|
6
|
+
export interface ApplyPatchResult {
|
|
7
|
+
applied: number;
|
|
8
|
+
skipped: number;
|
|
9
|
+
errors: string[];
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* Minimal safe apply pipeline. For now, uses `git apply --3way` when available.
|
|
13
|
+
* This is intentionally conservative; if anything fails, it records an error and continues.
|
|
14
|
+
*/
|
|
15
|
+
export declare function applyPatchesFromFindings(findings: Finding[], opts?: ApplyPatchOptions): ApplyPatchResult;
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { execSync } from "node:child_process";
|
|
2
|
+
/**
|
|
3
|
+
* Minimal safe apply pipeline. For now, uses `git apply --3way` when available.
|
|
4
|
+
* This is intentionally conservative; if anything fails, it records an error and continues.
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Apply the patch carried by each finding to the working tree using
 * `git apply --3way`, reading the patch from stdin.
 *
 * A finding contributes patch text from `patch.newText`, falling back to
 * `suggestedFix`. Findings without usable (non-empty string) patch text are
 * skipped. Failures are recorded and processing continues with the next finding.
 *
 * @param {Array<object>} findings - Findings to process.
 * @param {object} [opts]
 * @param {boolean} [opts.dryRun] - When true, nothing is executed; every
 *   finding with usable patch text is counted as applied.
 * @param {string} [opts.cwd] - Directory in which git runs; defaults to
 *   `process.cwd()`.
 * @returns {{ applied: number, skipped: number, errors: string[] }} Counters
 *   plus one message per failed application.
 */
export function applyPatchesFromFindings(findings, opts = {}) {
    const errors = [];
    let applied = 0;
    let skipped = 0;
    const cwd = opts.cwd ?? process.cwd();
    for (const f of findings) {
        const patchText = f.patch?.newText ?? f.suggestedFix;
        // Only non-empty strings can be fed to git; anything else is skipped
        // up front so dry-run and real runs agree on what is applicable.
        if (typeof patchText !== "string" || patchText.length === 0) {
            skipped++;
            continue;
        }
        if (opts.dryRun) {
            // simulate success
            applied++;
            continue;
        }
        try {
            const filePath = f.filePath ?? f._file ?? "file";
            const patchWithHeader = patchText.startsWith("diff --git")
                ? patchText
                : `diff --git a/${filePath} b/${filePath}\n${patchText}`;
            // git rejects a patch whose last line is unterminated.
            const input = patchWithHeader.endsWith("\n") ? patchWithHeader : `${patchWithHeader}\n`;
            execSync("git apply --3way -", { cwd, input, stdio: "pipe" });
            applied++;
        }
        catch (err) {
            errors.push(String(err?.message ?? err));
            skipped++;
        }
    }
    return { applied, skipped, errors };
}
|
package/dist/tools/prompts.d.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
2
2
|
/** Adversarial evaluation stance — shared across all judges. */
|
|
3
|
-
export declare const SHARED_ADVERSARIAL_MANDATE = "ADVERSARIAL MANDATE (applies to ALL judges):\n-
|
|
3
|
+
export declare const SHARED_ADVERSARIAL_MANDATE = "ADVERSARIAL MANDATE (applies to ALL judges):\n- Examine the code critically and look for genuine issues. Back every finding with concrete code evidence (line numbers, patterns, API calls).\n- Report only real problems, risks, and deficiencies that exist in the actual code.\n- If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.\n- If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code.";
|
|
4
4
|
/** Precision override — ensures evidence-based findings. */
|
|
5
|
-
export declare const PRECISION_MANDATE = "PRECISION MANDATE (
|
|
5
|
+
export declare const PRECISION_MANDATE = "PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):\n- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded \u2014 no exceptions.\n- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.\n- Speculative, hypothetical, or \"just in case\" findings erode developer trust. Only flag issues you are confident exist in the actual code.\n- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.\n- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.\n- Clean, well-structured code exists. Acknowledge it by not forcing false issues.\n- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.\n- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.\n- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (\u226580%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.\n- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. 
When uncertain, silence is better than a questionable finding.";
|
|
6
6
|
/**
|
|
7
7
|
* Extract only the unique evaluation criteria from a judge's systemPrompt,
|
|
8
8
|
* stripping the persona introduction line, the ADVERSARIAL MANDATE block,
|
package/dist/tools/prompts.js
CHANGED
|
@@ -17,18 +17,22 @@ import { JUDGES } from "../judges/index.js";
|
|
|
17
17
|
// ──────────────────────────────────────────────────────────────────────────────
|
|
18
18
|
/** Adversarial evaluation stance — shared across all judges. */
|
|
19
19
|
export const SHARED_ADVERSARIAL_MANDATE = `ADVERSARIAL MANDATE (applies to ALL judges):
|
|
20
|
-
-
|
|
21
|
-
-
|
|
20
|
+
- Examine the code critically and look for genuine issues. Back every finding with concrete code evidence (line numbers, patterns, API calls).
|
|
21
|
+
- Report only real problems, risks, and deficiencies that exist in the actual code.
|
|
22
22
|
- If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
|
|
23
|
-
- If no concrete issues are found after thorough analysis, report
|
|
23
|
+
- If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code.`;
|
|
24
24
|
/** Precision override — ensures evidence-based findings. */
|
|
25
|
-
export const PRECISION_MANDATE = `PRECISION MANDATE (
|
|
26
|
-
- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence
|
|
25
|
+
export const PRECISION_MANDATE = `PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):
|
|
26
|
+
- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded — no exceptions.
|
|
27
27
|
- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.
|
|
28
28
|
- Speculative, hypothetical, or "just in case" findings erode developer trust. Only flag issues you are confident exist in the actual code.
|
|
29
29
|
- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.
|
|
30
30
|
- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.
|
|
31
|
-
- Clean, well-structured code exists. Acknowledge it by not forcing false issues
|
|
31
|
+
- Clean, well-structured code exists. Acknowledge it by not forcing false issues.
|
|
32
|
+
- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.
|
|
33
|
+
- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.
|
|
34
|
+
- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (≥80%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.
|
|
35
|
+
- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.`;
|
|
32
36
|
// ─── Criteria Extraction ─────────────────────────────────────────────────────
|
|
33
37
|
/**
|
|
34
38
|
* Extract only the unique evaluation criteria from a judge's systemPrompt,
|
|
@@ -73,18 +77,24 @@ export function getCondensedCriteria(systemPrompt) {
|
|
|
73
77
|
*/
|
|
74
78
|
export function registerPrompts(server) {
|
|
75
79
|
// ── Per-judge prompts ──────────────────────────────────────────────────
|
|
76
|
-
// Each prompt
|
|
77
|
-
//
|
|
80
|
+
// Each prompt uses condensed criteria (adversarial mandate stripped) plus
|
|
81
|
+
// the shared mandates, mirroring the tribunal architecture for consistency
|
|
82
|
+
// and better precision on clean code.
|
|
78
83
|
for (const judge of JUDGES) {
|
|
79
84
|
server.prompt(`judge-${judge.id}`, `Use the ${judge.name} persona to perform a deep ${judge.domain} review of code. This prompt provides the judge's expert criteria for LLM-powered analysis that goes beyond pattern matching.`, {
|
|
80
85
|
code: z.string().describe("The source code to evaluate"),
|
|
81
86
|
language: z.string().describe("The programming language"),
|
|
82
87
|
context: z.string().optional().describe("Additional context about the code"),
|
|
83
88
|
}, async ({ code, language, context }) => {
|
|
84
|
-
const
|
|
89
|
+
const persona = judge.systemPrompt.substring(0, judge.systemPrompt.indexOf("\n\n"));
|
|
90
|
+
const criteria = getCondensedCriteria(judge.systemPrompt);
|
|
91
|
+
const userMessage = `${persona}\n\n` +
|
|
92
|
+
`${SHARED_ADVERSARIAL_MANDATE}\n\n` +
|
|
93
|
+
`${PRECISION_MANDATE}\n\n` +
|
|
94
|
+
`${criteria}\n\n` +
|
|
85
95
|
`Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
|
|
86
96
|
(context ? `\n\nAdditional context: ${context}` : "") +
|
|
87
|
-
`\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. End with an overall score (0-100) and verdict (pass/warning/fail).`;
|
|
97
|
+
`\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`;
|
|
88
98
|
return {
|
|
89
99
|
messages: [
|
|
90
100
|
{
|
|
@@ -118,6 +128,7 @@ export function registerPrompts(server) {
|
|
|
118
128
|
`2. Verdict (PASS / WARNING / FAIL)\n` +
|
|
119
129
|
`3. Score (0-100)\n` +
|
|
120
130
|
`4. Specific findings with rule IDs (using each judge's rule prefix), severity, and recommendations\n\n` +
|
|
131
|
+
`For judges where no issues meet the confidence threshold, report a PASS verdict with zero findings.\n\n` +
|
|
121
132
|
`Then provide an OVERALL TRIBUNAL VERDICT that synthesizes all judges' input.\n\n` +
|
|
122
133
|
`## The Judges\n\n${judgeInstructions}\n\n` +
|
|
123
134
|
`## Code to Evaluate\n\n\`\`\`${language}\n${code}\n\`\`\`` +
|
package/docs/skills.md
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
# Skills Catalog
|
|
2
|
+
|
|
3
|
+
| ID | Name | Description | Tags | Agents |
|
|
4
|
+
| --- | --- | --- | --- | --- |
|
|
5
|
+
| security-review | Security Review Skill | "Security-focused review for production readiness, covering AppSec, DataSec, AuthZ, and IaC." | security, appsec, datasec | cybersecurity, data-security, authentication, logging-privacy, api-contract, database, iac-security, framework-safety, dependency-health, configuration-management, rate-limiting, compliance, data-sovereignty, security, ai-code-safety, false-positive-review |
|
|
6
|
+
| release-gate | Release Gate Skill | "Pre-deploy release gate combining reliability, observability, CI/CD, and security checks." | release, sre, reliability, deployment | reliability, observability, performance, ci-cd, testing, cloud-readiness, cost-effectiveness, security, data-security, cybersecurity, false-positive-review |
|
|
7
|
+
| ai-code-review | AI Code Review Skill | "Full-spectrum AI-generated code review using the Judges Panel, tuned for minimizing false positives and focusing on AI-specific failure modes." | ai-code, code-review, tribunal | ai-code-safety, hallucination-detection, logic-review, over-engineering, code-structure, maintainability, performance, reliability, cybersecurity, data-security, authentication, api-design, api-contract, database, caching, observability, logging-privacy, configuration-management, dependency-health, framework-safety, testing, ci-cd, intent-alignment, multi-turn-coherence, model-fingerprint, agent-instructions, cloud-readiness, cost-effectiveness, ethics-bias, accessibility, internationalization, data-sovereignty, iac-security, rate-limiting, portability, ux, backwards-compatibility, security, false-positive-review |
|
package/package.json
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@kevinrabun/judges",
|
|
3
|
-
"version": "3.
|
|
3
|
+
"version": "3.115.0",
|
|
4
4
|
"description": "45 specialized judges that evaluate AI-generated code for security, cost, and quality.",
|
|
5
5
|
"mcpName": "io.github.KevinRabun/judges",
|
|
6
6
|
"type": "module",
|
|
7
7
|
"main": "dist/index.js",
|
|
8
|
+
"bin": {
|
|
9
|
+
"judges": "packages/judges-cli/bin/judges.js"
|
|
10
|
+
},
|
|
8
11
|
"exports": {
|
|
9
12
|
".": {
|
|
10
13
|
"import": "./dist/api.js",
|
|
@@ -78,6 +81,14 @@
|
|
|
78
81
|
"dist/fingerprint.d.ts",
|
|
79
82
|
"dist/fix-history.js",
|
|
80
83
|
"dist/fix-history.d.ts",
|
|
84
|
+
"agents/**/*.judge.md",
|
|
85
|
+
"skills/**/*.skill.md",
|
|
86
|
+
"docs/skills.md",
|
|
87
|
+
"scripts/generate-agents-from-judges.ts",
|
|
88
|
+
"scripts/validate-agents.ts",
|
|
89
|
+
"scripts/generate-skills-docs.ts",
|
|
90
|
+
"src/agent-loader.ts",
|
|
91
|
+
"src/skill-loader.ts",
|
|
81
92
|
"dist/github-app.js",
|
|
82
93
|
"dist/github-app.d.ts",
|
|
83
94
|
"dist/index.js",
|
|
@@ -156,7 +167,7 @@
|
|
|
156
167
|
"test": "npx tsx --test \"tests/**/*.test.ts\"",
|
|
157
168
|
"test:coverage": "node scripts/run-tests-with-coverage.mjs",
|
|
158
169
|
"self-eval": "npx tsx scripts/self-eval.ts",
|
|
159
|
-
"check": "tsc --noEmit && eslint src/ tests/ && npx tsx scripts/self-eval.ts",
|
|
170
|
+
"check": "tsc --noEmit && eslint src/ tests/ && npx tsx scripts/self-eval.ts && npm run validate:agents && npm run check:agents",
|
|
160
171
|
"lint": "eslint src/ tests/",
|
|
161
172
|
"lint:fix": "eslint src/ tests/ --fix",
|
|
162
173
|
"format": "prettier --write \"src/**/*.ts\" \"tests/**/*.ts\"",
|
|
@@ -165,8 +176,12 @@
|
|
|
165
176
|
"report:public-repo": "npx tsx scripts/generate-public-repo-report.ts",
|
|
166
177
|
"report:quickstart": "npx tsx scripts/generate-public-repo-report.ts --quickStart",
|
|
167
178
|
"automation:daily-popular": "npx tsx scripts/daily-popular-repo-autofix.ts",
|
|
168
|
-
"benchmark:llm": "npx tsx scripts/run-llm-benchmark.ts",
|
|
169
179
|
"sync-docs": "npx tsx scripts/sync-docs.ts",
|
|
180
|
+
"generate:agents": "npx tsx scripts/generate-agents-from-judges.ts",
|
|
181
|
+
"generate:agents:force": "npx tsx scripts/generate-agents-from-judges.ts --force",
|
|
182
|
+
"validate:agents": "npx tsx scripts/validate-agents.ts",
|
|
183
|
+
"docs:skills": "npx tsx scripts/generate-skills-docs.ts",
|
|
184
|
+
"check:agents": "npx tsx scripts/check-agents.ts",
|
|
170
185
|
"prepublishOnly": "npm run build",
|
|
171
186
|
"prepare": "husky"
|
|
172
187
|
},
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# @kevinrabun/judges-cli
|
|
2
|
+
|
|
3
|
+
Standalone CLI package for Judges.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install -g @kevinrabun/judges-cli
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
judges eval src/app.ts
|
|
15
|
+
judges list
|
|
16
|
+
judges hook install
|
|
17
|
+
|
|
18
|
+
# Agentic skills
|
|
19
|
+
judges skill ai-code-review --file src/app.ts
|
|
20
|
+
judges skill security-review --file src/api.ts --format json
|
|
21
|
+
judges skills # list available skills
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Use `@kevinrabun/judges` when you need the MCP server or programmatic API.
|