@kevinrabun/judges 3.113.0 → 3.115.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/README.md +9 -0
  2. package/agents/accessibility.judge.md +37 -0
  3. package/agents/agent-instructions.judge.md +37 -0
  4. package/agents/ai-code-safety.judge.md +48 -0
  5. package/agents/api-contract.judge.md +30 -0
  6. package/agents/api-design.judge.md +39 -0
  7. package/agents/authentication.judge.md +37 -0
  8. package/agents/backwards-compatibility.judge.md +37 -0
  9. package/agents/caching.judge.md +37 -0
  10. package/agents/ci-cd.judge.md +37 -0
  11. package/agents/cloud-readiness.judge.md +37 -0
  12. package/agents/code-structure.judge.md +48 -0
  13. package/agents/compliance.judge.md +40 -0
  14. package/agents/concurrency.judge.md +39 -0
  15. package/agents/configuration-management.judge.md +37 -0
  16. package/agents/cost-effectiveness.judge.md +40 -0
  17. package/agents/cybersecurity.judge.md +36 -0
  18. package/agents/data-security.judge.md +34 -0
  19. package/agents/data-sovereignty.judge.md +58 -0
  20. package/agents/database.judge.md +41 -0
  21. package/agents/dependency-health.judge.md +39 -0
  22. package/agents/documentation.judge.md +39 -0
  23. package/agents/error-handling.judge.md +37 -0
  24. package/agents/ethics-bias.judge.md +39 -0
  25. package/agents/false-positive-review.judge.md +73 -0
  26. package/agents/framework-safety.judge.md +40 -0
  27. package/agents/hallucination-detection.judge.md +33 -0
  28. package/agents/iac-security.judge.md +38 -0
  29. package/agents/intent-alignment.judge.md +31 -0
  30. package/agents/internationalization.judge.md +42 -0
  31. package/agents/logging-privacy.judge.md +37 -0
  32. package/agents/logic-review.judge.md +34 -0
  33. package/agents/maintainability.judge.md +37 -0
  34. package/agents/model-fingerprint.judge.md +31 -0
  35. package/agents/multi-turn-coherence.judge.md +29 -0
  36. package/agents/observability.judge.md +37 -0
  37. package/agents/over-engineering.judge.md +48 -0
  38. package/agents/performance.judge.md +44 -0
  39. package/agents/portability.judge.md +37 -0
  40. package/agents/rate-limiting.judge.md +37 -0
  41. package/agents/reliability.judge.md +39 -0
  42. package/agents/scalability.judge.md +41 -0
  43. package/agents/security.judge.md +31 -0
  44. package/agents/software-practices.judge.md +44 -0
  45. package/agents/testing.judge.md +39 -0
  46. package/agents/ux.judge.md +37 -0
  47. package/dist/api.d.ts +9 -1
  48. package/dist/api.js +9 -1
  49. package/dist/commands/fix.d.ts +10 -0
  50. package/dist/commands/fix.js +52 -0
  51. package/dist/commands/llm-benchmark.d.ts +13 -4
  52. package/dist/commands/llm-benchmark.js +39 -8
  53. package/dist/commands/review.d.ts +51 -1
  54. package/dist/commands/review.js +213 -7
  55. package/dist/evaluators/index.js +61 -35
  56. package/dist/github-app.d.ts +35 -0
  57. package/dist/github-app.js +125 -4
  58. package/dist/judges/index.d.ts +23 -61
  59. package/dist/judges/index.js +49 -63
  60. package/dist/patches/apply.d.ts +15 -0
  61. package/dist/patches/apply.js +37 -0
  62. package/dist/tools/prompts.d.ts +2 -2
  63. package/dist/tools/prompts.js +21 -10
  64. package/docs/skills.md +7 -0
  65. package/package.json +18 -3
  66. package/packages/judges-cli/README.md +24 -0
  67. package/packages/judges-cli/bin/judges.js +8 -0
  68. package/scripts/generate-agents-from-judges.ts +111 -0
  69. package/scripts/generate-skills-docs.ts +26 -0
  70. package/scripts/validate-agents.ts +104 -0
  71. package/server.json +2 -2
  72. package/skills/ai-code-review.skill.md +57 -0
  73. package/skills/release-gate.skill.md +27 -0
  74. package/skills/security-review.skill.md +32 -0
  75. package/src/agent-loader.ts +324 -0
  76. package/src/skill-loader.ts +199 -0
package/dist/github-app.js CHANGED
@@ -24,6 +24,11 @@ import { readFileSync, existsSync } from "fs";
 import { createServer } from "http";
 import { evaluateWithTribunal } from "./evaluators/index.js";
 import { evaluateProject } from "./evaluators/project.js";
+import { extractValidatedLlmFindings, getValidRulePrefixes, constructTribunalPrompt, } from "./commands/llm-benchmark.js";
+import { buildContextSnippets } from "./context/context-snippets.js";
+// Test override hooks (exported for tsx/node:test to avoid esbuild inlining)
+export let evaluateWithTribunalImpl = evaluateWithTribunal;
+export let evaluateProjectImpl = evaluateProject;
 // ─── Language Detection ─────────────────────────────────────────────────────
 export const EXT_TO_LANG = {
     ".ts": "typescript",
@@ -71,7 +76,12 @@ export function generateJwt(appId, privateKey) {
     return `${signingInput}.${signature}`;
 }
 // ─── GitHub API Helper ──────────────────────────────────────────────────────
+// Test hook for API injection
+let ghApiImpl;
 async function ghApi(method, path, token, body) {
+    if (ghApiImpl) {
+        return ghApiImpl(method, path, token, body);
+    }
     const { default: https } = await import("https");
     const payload = body ? JSON.stringify(body) : "";
     return new Promise((resolve, reject) => {
@@ -105,8 +115,50 @@ async function ghApi(method, path, token, body) {
         req.end();
     });
 }
+export function __setGhApiImplForTest(fn) {
+    ghApiImpl = fn;
+}
+async function callOpenAiChat(prompt, opts) {
+    // Node 18+ provides global fetch
+    const fetchImpl = globalThis.fetch;
+    if (!fetchImpl)
+        throw new Error("fetch() not available. Run on Node 18+ or polyfill fetch.");
+    const url = opts.baseUrl || "https://api.openai.com/v1/chat/completions";
+    const res = await fetchImpl(url, {
+        method: "POST",
+        headers: {
+            Authorization: `Bearer ${opts.apiKey}`,
+            "Content-Type": "application/json",
+        },
+        body: JSON.stringify({
+            model: opts.model,
+            max_tokens: opts.maxTokens ?? 800,
+            temperature: 0,
+            messages: [{ role: "user", content: prompt }],
+        }),
+    });
+    if (!res.ok) {
+        const text = await res.text();
+        throw new Error(`LLM request failed: ${res.status} ${res.statusText} ${text}`);
+    }
+    const json = (await res.json());
+    const content = json.choices?.[0]
+        ?.message?.content;
+    if (!content)
+        throw new Error("LLM response missing content");
+    return content.trim();
+}
+// Test hook
+let callOpenAiChatImpl = callOpenAiChat;
+export function __setCallOpenAiChatImplForTest(fn) {
+    callOpenAiChatImpl = fn;
+}
 // ─── Installation Token ─────────────────────────────────────────────────────
+// Test hook
+let getInstallationTokenImpl;
 async function getInstallationToken(appId, privateKey, installationId) {
+    if (getInstallationTokenImpl)
+        return getInstallationTokenImpl(appId, privateKey, installationId);
     const jwt = generateJwt(appId, privateKey);
     const res = await ghApi("POST", `/app/installations/${installationId}/access_tokens`, jwt);
     const data = res.data;
@@ -136,7 +188,8 @@ export function parsePatchToHunk(filePath, patch) {
     const changedLineNumbers = [];
     let newLineNum = 0;
     for (const line of lines) {
-        const hunkMatch = line.match(/^@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? @@/);
+        // Hunk header: @@ -10,5 +20,8 @@ (some tools omit trailing space/@@)
+        const hunkMatch = line.match(/^@@\s*-\d+(?:,\d+)?\s+\+(\d+)(?:,\d+)?\s*@@?/);
        if (hunkMatch) {
            newLineNum = parseInt(hunkMatch[1], 10) - 1;
            continue;
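The relaxed hunk-header regex above is easiest to see against concrete inputs. A quick sketch; the second and third variants are hypothetical tool quirks that the old pattern rejected:

```ts
// Capture group 1 is the new-file start line of the hunk.
const hunkHeader = /^@@\s*-\d+(?:,\d+)?\s+\+(\d+)(?:,\d+)?\s*@@?/;

const headers = [
    "@@ -10,5 +20,8 @@", // canonical form: matched by both old and new regex
    "@@-10,5 +20,8 @@",  // no space after "@@": new regex only
    "@@ -10 +20 @",      // single trailing "@": new regex only
];
for (const h of headers) {
    console.log(h, "→ new-file start line:", h.match(hunkHeader)?.[1]); // "20" each time
}
```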
@@ -206,7 +259,9 @@ async function reviewPullRequest(payload, token, config) {
         if (!hunk.newContent.trim())
             continue;
         try {
-            const verdict = evaluateWithTribunal(hunk.newContent, lang, undefined, {
+            // indirection to allow test overrides even when bundlers inline imports
+            const evalFn = getEvaluateWithTribunalImpl();
+            const verdict = evalFn(hunk.newContent, lang, undefined, {
                 filePath: file.filename,
                 includeAstFindings: true,
             });
@@ -243,8 +298,8 @@ async function reviewPullRequest(payload, token, config) {
     }
     if (projectFiles.length >= 2) {
         try {
-            const runner = { evaluateWithTribunal };
-            const projectVerdict = evaluateProject(runner, projectFiles);
+            const runner = { evaluateWithTribunal: evaluateWithTribunalImpl };
+            const projectVerdict = evaluateProjectImpl(runner, projectFiles);
             for (const f of projectVerdict.architecturalFindings ?? []) {
                 if (!meetsSeverityThreshold(f.severity, minSeverity))
                     continue;
@@ -255,6 +310,43 @@ async function reviewPullRequest(payload, token, config) {
             // Cross-file failure should not block the review
         }
     }
+    // 2c. Optional Layer 2 (LLM) augmentation — append summary comment
+    let llmSummary;
+    try {
+        if (process.env.OPENAI_API_KEY && config.llmDeepReview !== false) {
+            const codeBlobs = [];
+            const snippetsForRag = [];
+            for (const file of prFiles) {
+                if (!file.patch)
+                    continue;
+                const hunk = parsePatchToHunk(file.filename, file.patch);
+                codeBlobs.push(`// FILE: ${file.filename}\n${hunk.newContent}`);
+                snippetsForRag.push(hunk.newContent);
+            }
+            const combinedCode = codeBlobs.join("\n\n");
+            const ragSnippets = await buildContextSnippets(snippetsForRag.join("\n\n"), {
+                maxSnippets: 4,
+                chunkSize: 1500,
+            });
+            const contextText = ragSnippets.map((s) => s.snippet);
+            const tribunalPrompt = constructTribunalPrompt(combinedCode, "mixed", contextText);
+            const content = await callOpenAiChatImpl(tribunalPrompt, {
+                apiKey: process.env.OPENAI_API_KEY,
+                model: process.env.OPENAI_MODEL || "gpt-4o",
+                baseUrl: process.env.OPENAI_BASE_URL,
+                maxTokens: 800,
+            });
+            const validation = extractValidatedLlmFindings(content, getValidRulePrefixes());
+            const warnings = validation.errors?.length ? `\n\n⚠️ Validation warnings: ${validation.errors.join("; ")}` : "";
+            llmSummary =
+                `### 🤖 LLM Deep Review (model: ${process.env.OPENAI_MODEL || "gpt-4o"})\n` +
+                    (validation.ruleIds.length ? `Detected rule IDs: ${validation.ruleIds.join(", ")}` : "No rule IDs detected.") +
+                    `\n\n${content}${warnings}`;
+        }
+    }
+    catch (err) {
+        llmSummary = `⚠️ LLM deep review failed: ${String(err.message ?? err)}`;
+    }
     // 3. Build review comments
     const comments = [];
     const seen = new Set();
@@ -338,6 +430,9 @@ async function reviewPullRequest(payload, token, config) {
     if (allFindings.length === 0) {
         summaryLines.push("✅ No findings — code looks good!");
     }
+    if (typeof llmSummary === "string") {
+        summaryLines.push("", llmSummary);
+    }
     const reviewBody = summaryLines.join("\n");
     if (comments.length > 0) {
         const reviewRes = await ghApi("POST", `/repos/${repoFullName}/pulls/${prNumber}/reviews`, token, {
@@ -534,8 +629,34 @@ export function loadAppConfig() {
         autoApprove: process.env.JUDGES_AUTO_APPROVE === "true",
         diffOnly: process.env.JUDGES_DIFF_ONLY !== "false",
         configPath: process.env.JUDGES_CONFIG_PATH,
+        llmDeepReview: process.env.JUDGES_LLM_DEEP_REVIEW !== "false", // default on if key exists
     };
 }
+// Test hooks (non-public)
+export function __setEvaluateWithTribunalForTest(fn) {
+    evaluateWithTribunalImpl = fn ?? evaluateWithTribunal;
+}
+export function __setEvaluateProjectForTest(fn) {
+    evaluateProjectImpl = fn ?? evaluateProject;
+}
+export function getEvaluateWithTribunalImpl() {
+    return evaluateWithTribunalImpl;
+}
+export function __getEvaluateWithTribunalImplForTest() {
+    return evaluateWithTribunalImpl;
+}
+export const __test = {
+    __setCallOpenAiChatImplForTest,
+    __getInstallationTokenForTest: (fn) => {
+        getInstallationTokenImpl = fn;
+    },
+    __setGhApiImplForTest,
+    __setEvaluateWithTribunalForTest,
+    __setEvaluateProjectForTest,
+    __getEvaluateWithTribunalImplForTest,
+    parsePatchToHunk,
+    reviewPullRequest,
+};
 // ─── Standalone HTTP Server ─────────────────────────────────────────────────
 /**
  * Start a standalone HTTP server that listens for GitHub webhooks.
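A minimal sketch of how a consumer test might use the new `__test` hooks to keep the module off the network; the deep-import path and the sample patch text are assumptions, not part of this diff:

```ts
import { test } from "node:test";
import assert from "node:assert/strict";
// Assumed import path; adjust to however dist/github-app.js is resolved in your setup.
import { __test } from "@kevinrabun/judges/dist/github-app.js";

test("stubs GitHub API and LLM calls via __test hooks", () => {
    // With these stubs in place, __test.reviewPullRequest could be driven with a
    // synthetic webhook payload (payload shape is not shown in this diff).
    __test.__setGhApiImplForTest(async () => ({ data: {} }));
    __test.__setCallOpenAiChatImplForTest(async () => "No rule IDs detected.");

    // parsePatchToHunk is also exposed for direct unit testing.
    const hunk = __test.parsePatchToHunk("src/app.ts", "@@ -1,1 +1,2 @@\n context\n+added");
    // The new-side content is expected to contain the added line.
    assert.ok(hunk.newContent.includes("added"));
});
```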
package/dist/judges/index.d.ts CHANGED
@@ -1,69 +1,31 @@
 /**
- * Judge barrel side-effect imports trigger self-registration with the
- * unified JudgeRegistry. Each judge file imports its own evaluator and
- * calls `defaultRegistry.register()`, so this file just needs to import
- * each module for its side effects.
+ * Judge registry bootstrap (agent-native).
  *
- * To add a new built-in judge:
- * 1. Create `src/judges/my-judge.ts` (with self-registration)
- * 2. Create `src/evaluators/my-judge.ts` (analyzer)
- * 3. Add a side-effect import here: `import "./my-judge.js";`
+ * Judges are now sourced from `.judge.md` files in the `agents/` folder (legacy
+ * `.agent.md` still supported). Each agent frontmatter references an evaluator
+ * script (in `src/evaluators/`), and the agent loader registers them with the
+ * unified `JudgeRegistry`.
+ *
+ * Legacy side-effect imports have been removed. If you need to add a judge, add
+ * an agent file and (optionally) an evaluator script, then run:
+ * - `npm run generate:agents` (to sync)
+ * - `npm run validate:agents`
 */
 import type { JudgeDefinition } from "../types.js";
-import "./data-security.js";
-import "./cybersecurity.js";
-import "./cost-effectiveness.js";
-import "./scalability.js";
-import "./cloud-readiness.js";
-import "./software-practices.js";
-import "./accessibility.js";
-import "./api-design.js";
-import "./reliability.js";
-import "./observability.js";
-import "./performance.js";
-import "./compliance.js";
-import "./data-sovereignty.js";
-import "./testing.js";
-import "./documentation.js";
-import "./internationalization.js";
-import "./dependency-health.js";
-import "./concurrency.js";
-import "./ethics-bias.js";
-import "./maintainability.js";
-import "./error-handling.js";
-import "./authentication.js";
-import "./database.js";
-import "./caching.js";
-import "./configuration-management.js";
-import "./backwards-compatibility.js";
-import "./portability.js";
-import "./ux.js";
-import "./logging-privacy.js";
-import "./rate-limiting.js";
-import "./ci-cd.js";
-import "./code-structure.js";
-import "./agent-instructions.js";
-import "./ai-code-safety.js";
-import "./framework-safety.js";
-import "./iac-security.js";
-import "./security.js";
-import "./hallucination-detection.js";
-import "./intent-alignment.js";
-import "./api-contract.js";
-import "./multi-turn-coherence.js";
-import "./model-fingerprint.js";
-import "./over-engineering.js";
-import "./logic-review.js";
-import "./false-positive-review.js";
 /**
- * The panel of judges that comprise the Judges Panel.
- *
- * Each judge is a specialized evaluator with deep expertise in a single domain.
- * They operate independently and produce structured findings with
- * severity-rated, actionable recommendations.
- *
- * Note: this snapshot is taken at module-load time, after all built-in judges
- * have self-registered via the side-effect imports above.
+ * Load judges (agent-native). Loads agents from the default `agents/` folder
+ * and returns the current registry snapshot.
+ */
+export declare function loadJudges(): Promise<JudgeDefinition[]>;
+/**
+ * Load agent-based judges from a directory of `.judge.md` files (legacy
+ * `.agent.md` supported). This enables hybrid operation where file-based
+ * agents can augment or replace built-in judges. If a judge is already
+ * registered, it is skipped.
+ */
+export declare function loadAgentJudges(dir?: string): number;
+/**
+ * Snapshot of the currently registered judges. (Agent-native)
 */
 export declare const JUDGES: JudgeDefinition[];
 /**
package/dist/judges/index.js CHANGED
@@ -1,73 +1,59 @@
 /**
- * Judge barrel side-effect imports trigger self-registration with the
- * unified JudgeRegistry. Each judge file imports its own evaluator and
- * calls `defaultRegistry.register()`, so this file just needs to import
- * each module for its side effects.
+ * Judge registry bootstrap (agent-native).
  *
- * To add a new built-in judge:
- * 1. Create `src/judges/my-judge.ts` (with self-registration)
- * 2. Create `src/evaluators/my-judge.ts` (analyzer)
- * 3. Add a side-effect import here: `import "./my-judge.js";`
+ * Judges are now sourced from `.judge.md` files in the `agents/` folder (legacy
+ * `.agent.md` still supported). Each agent frontmatter references an evaluator
+ * script (in `src/evaluators/`), and the agent loader registers them with the
+ * unified `JudgeRegistry`.
+ *
+ * Legacy side-effect imports have been removed. If you need to add a judge, add
+ * an agent file and (optionally) an evaluator script, then run:
+ * - `npm run generate:agents` (to sync)
+ * - `npm run validate:agents`
 */
 import { defaultRegistry } from "../judge-registry.js";
-// ─── Side-effect imports each judge self-registers on import ───────────────
-import "./data-security.js";
-import "./cybersecurity.js";
-import "./cost-effectiveness.js";
-import "./scalability.js";
-import "./cloud-readiness.js";
-import "./software-practices.js";
-import "./accessibility.js";
-import "./api-design.js";
-import "./reliability.js";
-import "./observability.js";
-import "./performance.js";
-import "./compliance.js";
-import "./data-sovereignty.js";
-import "./testing.js";
-import "./documentation.js";
-import "./internationalization.js";
-import "./dependency-health.js";
-import "./concurrency.js";
-import "./ethics-bias.js";
-import "./maintainability.js";
-import "./error-handling.js";
-import "./authentication.js";
-import "./database.js";
-import "./caching.js";
-import "./configuration-management.js";
-import "./backwards-compatibility.js";
-import "./portability.js";
-import "./ux.js";
-import "./logging-privacy.js";
-import "./rate-limiting.js";
-import "./ci-cd.js";
-import "./code-structure.js";
-import "./agent-instructions.js";
-import "./ai-code-safety.js";
-import "./framework-safety.js";
-import "./iac-security.js";
-import "./security.js";
-import "./hallucination-detection.js";
-import "./intent-alignment.js";
-import "./api-contract.js";
-import "./multi-turn-coherence.js";
-import "./model-fingerprint.js";
-import "./over-engineering.js";
-import "./logic-review.js";
-import "./false-positive-review.js";
+import { loadAndRegisterAgents } from "../agent-loader.js";
+import { resolve, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+let agentsLoaded = false;
+function loadDefaultAgents() {
+    if (agentsLoaded)
+        return;
+    const agentsDir = resolve(__dirname, "..", "..", "agents");
+    loadAndRegisterAgents(agentsDir, defaultRegistry);
+    agentsLoaded = true;
+}
+// ─── Optional Agent Loader Integration ──────────────────────────────────────
+/**
+ * Load judges (agent-native). Loads agents from the default `agents/` folder
+ * and returns the current registry snapshot.
+ */
+export async function loadJudges() {
+    loadDefaultAgents();
+    return defaultRegistry.getJudges();
+}
+/**
+ * Load agent-based judges from a directory of `.judge.md` files (legacy
+ * `.agent.md` supported). This enables hybrid operation where file-based
+ * agents can augment or replace built-in judges. If a judge is already
+ * registered, it is skipped.
+ */
+export function loadAgentJudges(dir = resolve(__dirname, "..", "..", "agents")) {
+    agentsLoaded = false; // allow re-run to pick up new agents if dir changes
+    const count = loadAndRegisterAgents(dir, defaultRegistry);
+    agentsLoaded = true;
+    return count;
+}
 // ─── Re-exports backed by the registry ──────────────────────────────────────
 /**
- * The panel of judges that comprise the Judges Panel.
- *
- * Each judge is a specialized evaluator with deep expertise in a single domain.
- * They operate independently and produce structured findings with
- * severity-rated, actionable recommendations.
- *
- * Note: this snapshot is taken at module-load time, after all built-in judges
- * have self-registered via the side-effect imports above.
+ * Snapshot of the currently registered judges. (Agent-native)
  */
-export const JUDGES = defaultRegistry.getJudges();
+export const JUDGES = (() => {
+    loadDefaultAgents();
+    return defaultRegistry.getJudges();
+})();
 /**
  * Look up a judge by ID.
  */
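For readers consuming the new loader API, a minimal usage sketch; the deep-import path and the local agents directory are assumptions:

```ts
// Assumed deep-import path into the published package.
import { loadJudges, loadAgentJudges } from "@kevinrabun/judges/dist/judges/index.js";

// Registers the bundled agents/ folder once and returns the registry snapshot.
const judges = await loadJudges();
console.log(`${judges.length} judges registered`);

// Layer in project-local .judge.md files; already-registered judge IDs are skipped.
const added = loadAgentJudges("./my-agents");
console.log(`${added} additional agent judges loaded`);
```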
package/dist/patches/apply.d.ts ADDED
@@ -0,0 +1,15 @@
+import type { Finding } from "../types.js";
+export interface ApplyPatchOptions {
+    dryRun?: boolean;
+    cwd?: string;
+}
+export interface ApplyPatchResult {
+    applied: number;
+    skipped: number;
+    errors: string[];
+}
+/**
+ * Minimal safe apply pipeline. For now, uses `git apply --3way` when available.
+ * This is intentionally conservative; if anything fails, it records an error and continues.
+ */
+export declare function applyPatchesFromFindings(findings: Finding[], opts?: ApplyPatchOptions): ApplyPatchResult;
package/dist/patches/apply.js ADDED
@@ -0,0 +1,37 @@
+import { execSync } from "node:child_process";
+/**
+ * Minimal safe apply pipeline. For now, uses `git apply --3way` when available.
+ * This is intentionally conservative; if anything fails, it records an error and continues.
+ */
+export function applyPatchesFromFindings(findings, opts = {}) {
+    const errors = [];
+    let applied = 0;
+    let skipped = 0;
+    const cwd = opts.cwd ?? process.cwd();
+    for (const f of findings) {
+        const patchText = f.patch?.newText ?? f.suggestedFix;
+        if (!patchText) {
+            skipped++;
+            continue;
+        }
+        try {
+            if (opts.dryRun) {
+                // simulate success
+                applied++;
+                continue;
+            }
+            const fileLike = f;
+            const filePath = fileLike.filePath ?? fileLike._file ?? "file";
+            const patchWithHeader = patchText.startsWith("diff --git")
+                ? patchText
+                : `diff --git a/${filePath} b/${filePath}\n${patchText}`;
+            execSync("git apply --3way -", { cwd, input: patchWithHeader, stdio: "pipe" });
+            applied++;
+        }
+        catch (err) {
+            errors.push(String(err.message ?? err));
+            skipped++;
+        }
+    }
+    return { applied, skipped, errors };
+}
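A small usage sketch; the import paths are assumed, and the abbreviated finding shape is illustrative only (with `dryRun` the patch text never reaches `git apply`):

```ts
import { applyPatchesFromFindings } from "@kevinrabun/judges/dist/patches/apply.js";
import type { Finding } from "@kevinrabun/judges/dist/types.js";

// Partial finding for illustration; real findings come from the evaluators.
const findings = [
    {
        filePath: "src/app.ts",
        suggestedFix: "@@ -1 +1 @@\n-var x = 1;\n+const x = 1;\n",
    },
] as unknown as Finding[];

// dryRun counts the patch as applied without shelling out to git.
const { applied, skipped, errors } = applyPatchesFromFindings(findings, { dryRun: true });
console.log({ applied, skipped, errors }); // { applied: 1, skipped: 0, errors: [] }
```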
package/dist/tools/prompts.d.ts CHANGED
@@ -1,8 +1,8 @@
 import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
 /** Adversarial evaluation stance — shared across all judges. */
-export declare const SHARED_ADVERSARIAL_MANDATE = "ADVERSARIAL MANDATE (applies to ALL judges):\n- Your role is adversarial: assume the code has problems and actively hunt for them. Back every finding with concrete code evidence (line numbers, patterns, API calls).\n- Never praise or compliment the code. Report only problems, risks, and deficiencies.\n- If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.\n- If no concrete issues are found after thorough analysis, report zero findings. Do not pad the report with speculative issues.";
+export declare const SHARED_ADVERSARIAL_MANDATE = "ADVERSARIAL MANDATE (applies to ALL judges):\n- Examine the code critically and look for genuine issues. Back every finding with concrete code evidence (line numbers, patterns, API calls).\n- Report only real problems, risks, and deficiencies that exist in the actual code.\n- If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.\n- If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code.";
 /** Precision override — ensures evidence-based findings. */
-export declare const PRECISION_MANDATE = "PRECISION MANDATE (overrides adversarial stance when in conflict):\n- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence must be discarded.\n- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.\n- Speculative, hypothetical, or \"just in case\" findings erode developer trust. Only flag issues you are confident exist in the actual code.\n- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.\n- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.\n- Clean, well-structured code exists. Acknowledge it by not forcing false issues.";
+export declare const PRECISION_MANDATE = "PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):\n- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded \u2014 no exceptions.\n- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.\n- Speculative, hypothetical, or \"just in case\" findings erode developer trust. Only flag issues you are confident exist in the actual code.\n- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.\n- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.\n- Clean, well-structured code exists. Acknowledge it by not forcing false issues.\n- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.\n- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.\n- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (\u226580%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.\n- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.";
 /**
  * Extract only the unique evaluation criteria from a judge's systemPrompt,
  * stripping the persona introduction line, the ADVERSARIAL MANDATE block,
package/dist/tools/prompts.js CHANGED
@@ -17,18 +17,22 @@ import { JUDGES } from "../judges/index.js";
 // ──────────────────────────────────────────────────────────────────────────────
 /** Adversarial evaluation stance — shared across all judges. */
 export const SHARED_ADVERSARIAL_MANDATE = `ADVERSARIAL MANDATE (applies to ALL judges):
-- Your role is adversarial: assume the code has problems and actively hunt for them. Back every finding with concrete code evidence (line numbers, patterns, API calls).
-- Never praise or compliment the code. Report only problems, risks, and deficiencies.
+- Examine the code critically and look for genuine issues. Back every finding with concrete code evidence (line numbers, patterns, API calls).
+- Report only real problems, risks, and deficiencies that exist in the actual code.
 - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
-- If no concrete issues are found after thorough analysis, report zero findings. Do not pad the report with speculative issues.`;
+- If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code.`;
 /** Precision override — ensures evidence-based findings. */
-export const PRECISION_MANDATE = `PRECISION MANDATE (overrides adversarial stance when in conflict):
-- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence must be discarded.
+export const PRECISION_MANDATE = `PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):
+- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded — no exceptions.
 - Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.
 - Speculative, hypothetical, or "just in case" findings erode developer trust. Only flag issues you are confident exist in the actual code.
 - Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.
 - If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.
-- Clean, well-structured code exists. Acknowledge it by not forcing false issues.`;
+- Clean, well-structured code exists. Acknowledge it by not forcing false issues.
+- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.
+- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.
+- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (≥80%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.
+- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.`;
 // ─── Criteria Extraction ─────────────────────────────────────────────────────
 /**
  * Extract only the unique evaluation criteria from a judge's systemPrompt,
@@ -73,18 +77,24 @@ export function getCondensedCriteria(systemPrompt) {
  */
 export function registerPrompts(server) {
     // ── Per-judge prompts ──────────────────────────────────────────────────
-    // Each prompt includes the judge's full systemPrompt + precision mandate
-    // so the LLM has complete evaluation criteria for single-judge reviews.
+    // Each prompt uses condensed criteria (adversarial mandate stripped) plus
+    // the shared mandates, mirroring the tribunal architecture for consistency
+    // and better precision on clean code.
     for (const judge of JUDGES) {
         server.prompt(`judge-${judge.id}`, `Use the ${judge.name} persona to perform a deep ${judge.domain} review of code. This prompt provides the judge's expert criteria for LLM-powered analysis that goes beyond pattern matching.`, {
             code: z.string().describe("The source code to evaluate"),
             language: z.string().describe("The programming language"),
             context: z.string().optional().describe("Additional context about the code"),
         }, async ({ code, language, context }) => {
-            const userMessage = `${judge.systemPrompt}\n\n${PRECISION_MANDATE}\n\n` +
+            const persona = judge.systemPrompt.substring(0, judge.systemPrompt.indexOf("\n\n"));
+            const criteria = getCondensedCriteria(judge.systemPrompt);
+            const userMessage = `${persona}\n\n` +
+                `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
+                `${PRECISION_MANDATE}\n\n` +
+                `${criteria}\n\n` +
                `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
                (context ? `\n\nAdditional context: ${context}` : "") +
-                `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. End with an overall score (0-100) and verdict (pass/warning/fail).`;
+                `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`;
             return {
                 messages: [
                     {
@@ -118,6 +128,7 @@ export function registerPrompts(server) {
         `2. Verdict (PASS / WARNING / FAIL)\n` +
         `3. Score (0-100)\n` +
         `4. Specific findings with rule IDs (using each judge's rule prefix), severity, and recommendations\n\n` +
+        `For judges where no issues meet the confidence threshold, report a PASS verdict with zero findings.\n\n` +
        `Then provide an OVERALL TRIBUNAL VERDICT that synthesizes all judges' input.\n\n` +
        `## The Judges\n\n${judgeInstructions}\n\n` +
        `## Code to Evaluate\n\n\`\`\`${language}\n${code}\n\`\`\`` +
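In plain terms, each per-judge prompt is now assembled from four blocks instead of the full systemPrompt. A self-contained schematic of the layout, with the mandate texts trimmed and `getCondensedCriteria` stubbed for illustration:

```ts
// Schematic only: mirrors the assembly in the diff above, not new API surface.
const SHARED_ADVERSARIAL_MANDATE = "ADVERSARIAL MANDATE (applies to ALL judges): ...";
const PRECISION_MANDATE = "PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict): ...";
const judge = {
    systemPrompt: "You are the Performance Judge.\n\nCriteria:\n- Flag O(n^2) loops over request data.",
};
// The real getCondensedCriteria also strips mandate blocks; stubbed here.
const getCondensedCriteria = (p: string) => p.slice(p.indexOf("\n\n") + 2);

const persona = judge.systemPrompt.substring(0, judge.systemPrompt.indexOf("\n\n"));
const userMessage = [
    persona,                                  // first paragraph of judge.systemPrompt
    SHARED_ADVERSARIAL_MANDATE,               // shared stance
    PRECISION_MANDATE,                        // overrides the stance on conflict
    getCondensedCriteria(judge.systemPrompt), // judge-specific criteria only
].join("\n\n");
console.log(userMessage); // the code-to-evaluate block is appended after this
```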
package/docs/skills.md ADDED
@@ -0,0 +1,7 @@
+# Skills Catalog
+
+| ID | Name | Description | Tags | Agents |
+| --- | --- | --- | --- | --- |
+| security-review | Security Review Skill | "Security-focused review for production readiness,covering AppSec,DataSec,AuthZ,and IaC." | security, appsec, datasec | cybersecurity, data-security, authentication, logging-privacy, api-contract, database, iac-security, framework-safety, dependency-health, configuration-management, rate-limiting, compliance, data-sovereignty, security, ai-code-safety, false-positive-review |
+| release-gate | Release Gate Skill | "Pre-deploy release gate combining reliability,observability,CI/CD,and security checks." | release, sre, reliability, deployment | reliability, observability, performance, ci-cd, testing, cloud-readiness, cost-effectiveness, security, data-security, cybersecurity, false-positive-review |
+| ai-code-review | AI Code Review Skill | "Full-spectrum AI-generated code review using the Judges Panel,tuned for minimizing false positives and focusing on AI-specific failure modes." | ai-code, code-review, tribunal | ai-code-safety, hallucination-detection, logic-review, over-engineering, code-structure, maintainability, performance, reliability, cybersecurity, data-security, authentication, api-design, api-contract, database, caching, observability, logging-privacy, configuration-management, dependency-health, framework-safety, testing, ci-cd, intent-alignment, multi-turn-coherence, model-fingerprint, agent-instructions, cloud-readiness, cost-effectiveness, ethics-bias, accessibility, internationalization, data-sovereignty, iac-security, rate-limiting, portability, ux, backwards-compatibility, security, false-positive-review |
package/package.json CHANGED
@@ -1,10 +1,13 @@
 {
   "name": "@kevinrabun/judges",
-  "version": "3.113.0",
+  "version": "3.115.0",
   "description": "45 specialized judges that evaluate AI-generated code for security, cost, and quality.",
   "mcpName": "io.github.KevinRabun/judges",
   "type": "module",
   "main": "dist/index.js",
+  "bin": {
+    "judges": "packages/judges-cli/bin/judges.js"
+  },
   "exports": {
     ".": {
       "import": "./dist/api.js",
@@ -78,6 +81,14 @@
     "dist/fingerprint.d.ts",
     "dist/fix-history.js",
     "dist/fix-history.d.ts",
+    "agents/**/*.judge.md",
+    "skills/**/*.skill.md",
+    "docs/skills.md",
+    "scripts/generate-agents-from-judges.ts",
+    "scripts/validate-agents.ts",
+    "scripts/generate-skills-docs.ts",
+    "src/agent-loader.ts",
+    "src/skill-loader.ts",
     "dist/github-app.js",
     "dist/github-app.d.ts",
     "dist/index.js",
@@ -156,7 +167,7 @@
     "test": "npx tsx --test \"tests/**/*.test.ts\"",
     "test:coverage": "node scripts/run-tests-with-coverage.mjs",
     "self-eval": "npx tsx scripts/self-eval.ts",
-    "check": "tsc --noEmit && eslint src/ tests/ && npx tsx scripts/self-eval.ts",
+    "check": "tsc --noEmit && eslint src/ tests/ && npx tsx scripts/self-eval.ts && npm run validate:agents && npm run check:agents",
     "lint": "eslint src/ tests/",
     "lint:fix": "eslint src/ tests/ --fix",
     "format": "prettier --write \"src/**/*.ts\" \"tests/**/*.ts\"",
@@ -165,8 +176,12 @@
     "report:public-repo": "npx tsx scripts/generate-public-repo-report.ts",
     "report:quickstart": "npx tsx scripts/generate-public-repo-report.ts --quickStart",
     "automation:daily-popular": "npx tsx scripts/daily-popular-repo-autofix.ts",
-    "benchmark:llm": "npx tsx scripts/run-llm-benchmark.ts",
     "sync-docs": "npx tsx scripts/sync-docs.ts",
+    "generate:agents": "npx tsx scripts/generate-agents-from-judges.ts",
+    "generate:agents:force": "npx tsx scripts/generate-agents-from-judges.ts --force",
+    "validate:agents": "npx tsx scripts/validate-agents.ts",
+    "docs:skills": "npx tsx scripts/generate-skills-docs.ts",
+    "check:agents": "npx tsx scripts/check-agents.ts",
     "prepublishOnly": "npm run build",
     "prepare": "husky"
   },
package/packages/judges-cli/README.md ADDED
@@ -0,0 +1,24 @@
+# @kevinrabun/judges-cli
+
+Standalone CLI package for Judges.
+
+## Install
+
+```bash
+npm install -g @kevinrabun/judges-cli
+```
+
+## Usage
+
+```bash
+judges eval src/app.ts
+judges list
+judges hook install
+
+# Agentic skills
+judges skill ai-code-review --file src/app.ts
+judges skill security-review --file src/api.ts --format json
+judges skills # list available skills
+```
+
+Use `@kevinrabun/judges` when you need the MCP server or programmatic API.