@kevinrabun/judges 3.113.0 → 3.115.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/README.md +9 -0
  2. package/agents/accessibility.judge.md +37 -0
  3. package/agents/agent-instructions.judge.md +37 -0
  4. package/agents/ai-code-safety.judge.md +48 -0
  5. package/agents/api-contract.judge.md +30 -0
  6. package/agents/api-design.judge.md +39 -0
  7. package/agents/authentication.judge.md +37 -0
  8. package/agents/backwards-compatibility.judge.md +37 -0
  9. package/agents/caching.judge.md +37 -0
  10. package/agents/ci-cd.judge.md +37 -0
  11. package/agents/cloud-readiness.judge.md +37 -0
  12. package/agents/code-structure.judge.md +48 -0
  13. package/agents/compliance.judge.md +40 -0
  14. package/agents/concurrency.judge.md +39 -0
  15. package/agents/configuration-management.judge.md +37 -0
  16. package/agents/cost-effectiveness.judge.md +40 -0
  17. package/agents/cybersecurity.judge.md +36 -0
  18. package/agents/data-security.judge.md +34 -0
  19. package/agents/data-sovereignty.judge.md +58 -0
  20. package/agents/database.judge.md +41 -0
  21. package/agents/dependency-health.judge.md +39 -0
  22. package/agents/documentation.judge.md +39 -0
  23. package/agents/error-handling.judge.md +37 -0
  24. package/agents/ethics-bias.judge.md +39 -0
  25. package/agents/false-positive-review.judge.md +73 -0
  26. package/agents/framework-safety.judge.md +40 -0
  27. package/agents/hallucination-detection.judge.md +33 -0
  28. package/agents/iac-security.judge.md +38 -0
  29. package/agents/intent-alignment.judge.md +31 -0
  30. package/agents/internationalization.judge.md +42 -0
  31. package/agents/logging-privacy.judge.md +37 -0
  32. package/agents/logic-review.judge.md +34 -0
  33. package/agents/maintainability.judge.md +37 -0
  34. package/agents/model-fingerprint.judge.md +31 -0
  35. package/agents/multi-turn-coherence.judge.md +29 -0
  36. package/agents/observability.judge.md +37 -0
  37. package/agents/over-engineering.judge.md +48 -0
  38. package/agents/performance.judge.md +44 -0
  39. package/agents/portability.judge.md +37 -0
  40. package/agents/rate-limiting.judge.md +37 -0
  41. package/agents/reliability.judge.md +39 -0
  42. package/agents/scalability.judge.md +41 -0
  43. package/agents/security.judge.md +31 -0
  44. package/agents/software-practices.judge.md +44 -0
  45. package/agents/testing.judge.md +39 -0
  46. package/agents/ux.judge.md +37 -0
  47. package/dist/api.d.ts +9 -1
  48. package/dist/api.js +9 -1
  49. package/dist/commands/fix.d.ts +10 -0
  50. package/dist/commands/fix.js +52 -0
  51. package/dist/commands/llm-benchmark.d.ts +13 -4
  52. package/dist/commands/llm-benchmark.js +39 -8
  53. package/dist/commands/review.d.ts +51 -1
  54. package/dist/commands/review.js +213 -7
  55. package/dist/evaluators/index.js +61 -35
  56. package/dist/github-app.d.ts +35 -0
  57. package/dist/github-app.js +125 -4
  58. package/dist/judges/index.d.ts +23 -61
  59. package/dist/judges/index.js +49 -63
  60. package/dist/patches/apply.d.ts +15 -0
  61. package/dist/patches/apply.js +37 -0
  62. package/dist/tools/prompts.d.ts +2 -2
  63. package/dist/tools/prompts.js +21 -10
  64. package/docs/skills.md +7 -0
  65. package/package.json +18 -3
  66. package/packages/judges-cli/README.md +24 -0
  67. package/packages/judges-cli/bin/judges.js +8 -0
  68. package/scripts/generate-agents-from-judges.ts +111 -0
  69. package/scripts/generate-skills-docs.ts +26 -0
  70. package/scripts/validate-agents.ts +104 -0
  71. package/server.json +2 -2
  72. package/skills/ai-code-review.skill.md +57 -0
  73. package/skills/release-gate.skill.md +27 -0
  74. package/skills/security-review.skill.md +32 -0
  75. package/src/agent-loader.ts +324 -0
  76. package/src/skill-loader.ts +199 -0
package/dist/github-app.js CHANGED
@@ -24,6 +24,11 @@ import { readFileSync, existsSync } from "fs";
 import { createServer } from "http";
 import { evaluateWithTribunal } from "./evaluators/index.js";
 import { evaluateProject } from "./evaluators/project.js";
+import { extractValidatedLlmFindings, getValidRulePrefixes, constructTribunalPrompt, } from "./commands/llm-benchmark.js";
+import { buildContextSnippets } from "./context/context-snippets.js";
+// Test override hooks (exported for tsx/node:test to avoid esbuild inlining)
+export let evaluateWithTribunalImpl = evaluateWithTribunal;
+export let evaluateProjectImpl = evaluateProject;
 // ─── Language Detection ─────────────────────────────────────────────────────
 export const EXT_TO_LANG = {
     ".ts": "typescript",
@@ -71,7 +76,12 @@ export function generateJwt(appId, privateKey) {
     return `${signingInput}.${signature}`;
 }
 // ─── GitHub API Helper ──────────────────────────────────────────────────────
+// Test hook for API injection
+let ghApiImpl;
 async function ghApi(method, path, token, body) {
+    if (ghApiImpl) {
+        return ghApiImpl(method, path, token, body);
+    }
     const { default: https } = await import("https");
     const payload = body ? JSON.stringify(body) : "";
     return new Promise((resolve, reject) => {
@@ -105,8 +115,50 @@ async function ghApi(method, path, token, body) {
         req.end();
     });
 }
+export function __setGhApiImplForTest(fn) {
+    ghApiImpl = fn;
+}
+async function callOpenAiChat(prompt, opts) {
+    // Node 18+ provides global fetch
+    const fetchImpl = globalThis.fetch;
+    if (!fetchImpl)
+        throw new Error("fetch() not available. Run on Node 18+ or polyfill fetch.");
+    const url = opts.baseUrl || "https://api.openai.com/v1/chat/completions";
+    const res = await fetchImpl(url, {
+        method: "POST",
+        headers: {
+            Authorization: `Bearer ${opts.apiKey}`,
+            "Content-Type": "application/json",
+        },
+        body: JSON.stringify({
+            model: opts.model,
+            max_tokens: opts.maxTokens ?? 800,
+            temperature: 0,
+            messages: [{ role: "user", content: prompt }],
+        }),
+    });
+    if (!res.ok) {
+        const text = await res.text();
+        throw new Error(`LLM request failed: ${res.status} ${res.statusText} ${text}`);
+    }
+    const json = (await res.json());
+    const content = json.choices?.[0]
+        ?.message?.content;
+    if (!content)
+        throw new Error("LLM response missing content");
+    return content.trim();
+}
+// Test hook
+let callOpenAiChatImpl = callOpenAiChat;
+export function __setCallOpenAiChatImplForTest(fn) {
+    callOpenAiChatImpl = fn;
+}
 // ─── Installation Token ─────────────────────────────────────────────────────
+// Test hook
+let getInstallationTokenImpl;
 async function getInstallationToken(appId, privateKey, installationId) {
+    if (getInstallationTokenImpl)
+        return getInstallationTokenImpl(appId, privateKey, installationId);
     const jwt = generateJwt(appId, privateKey);
     const res = await ghApi("POST", `/app/installations/${installationId}/access_tokens`, jwt);
     const data = res.data;
@@ -136,7 +188,8 @@ export function parsePatchToHunk(filePath, patch) {
     const changedLineNumbers = [];
     let newLineNum = 0;
     for (const line of lines) {
-        const hunkMatch = line.match(/^@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? @@/);
+        // Hunk header: @@ -10,5 +20,8 @@ (some tools omit trailing space/@@)
+        const hunkMatch = line.match(/^@@\s*-\d+(?:,\d+)?\s+\+(\d+)(?:,\d+)?\s*@@?/);
        if (hunkMatch) {
            newLineNum = parseInt(hunkMatch[1], 10) - 1;
            continue;
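The relaxed hunk-header regex above is easiest to see against concrete inputs. A quick sketch; the second and third variants are hypothetical tool quirks that the old pattern rejected:

```ts
// Capture group 1 is the new-file start line of the hunk.
const hunkHeader = /^@@\s*-\d+(?:,\d+)?\s+\+(\d+)(?:,\d+)?\s*@@?/;

const headers = [
    "@@ -10,5 +20,8 @@", // canonical form: matched by both old and new regex
    "@@-10,5 +20,8 @@",  // no space after "@@": new regex only
    "@@ -10 +20 @",      // single trailing "@": new regex only
];
for (const h of headers) {
    console.log(h, "→ new-file start line:", h.match(hunkHeader)?.[1]); // "20" each time
}
```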
@@ -206,7 +259,9 @@ async function reviewPullRequest(payload, token, config) {
         if (!hunk.newContent.trim())
             continue;
         try {
-            const verdict = evaluateWithTribunal(hunk.newContent, lang, undefined, {
+            // indirection to allow test overrides even when bundlers inline imports
+            const evalFn = getEvaluateWithTribunalImpl();
+            const verdict = evalFn(hunk.newContent, lang, undefined, {
                 filePath: file.filename,
                 includeAstFindings: true,
             });
@@ -243,8 +298,8 @@ async function reviewPullRequest(payload, token, config) {
     }
     if (projectFiles.length >= 2) {
         try {
-            const runner = { evaluateWithTribunal };
-            const projectVerdict = evaluateProject(runner, projectFiles);
+            const runner = { evaluateWithTribunal: evaluateWithTribunalImpl };
+            const projectVerdict = evaluateProjectImpl(runner, projectFiles);
             for (const f of projectVerdict.architecturalFindings ?? []) {
                 if (!meetsSeverityThreshold(f.severity, minSeverity))
                     continue;
@@ -255,6 +310,43 @@ async function reviewPullRequest(payload, token, config) {
             // Cross-file failure should not block the review
         }
     }
+    // 2c. Optional Layer 2 (LLM) augmentation — append summary comment
+    let llmSummary;
+    try {
+        if (process.env.OPENAI_API_KEY && config.llmDeepReview !== false) {
+            const codeBlobs = [];
+            const snippetsForRag = [];
+            for (const file of prFiles) {
+                if (!file.patch)
+                    continue;
+                const hunk = parsePatchToHunk(file.filename, file.patch);
+                codeBlobs.push(`// FILE: ${file.filename}\n${hunk.newContent}`);
+                snippetsForRag.push(hunk.newContent);
+            }
+            const combinedCode = codeBlobs.join("\n\n");
+            const ragSnippets = await buildContextSnippets(snippetsForRag.join("\n\n"), {
+                maxSnippets: 4,
+                chunkSize: 1500,
+            });
+            const contextText = ragSnippets.map((s) => s.snippet);
+            const tribunalPrompt = constructTribunalPrompt(combinedCode, "mixed", contextText);
+            const content = await callOpenAiChatImpl(tribunalPrompt, {
+                apiKey: process.env.OPENAI_API_KEY,
+                model: process.env.OPENAI_MODEL || "gpt-4o",
+                baseUrl: process.env.OPENAI_BASE_URL,
+                maxTokens: 800,
+            });
+            const validation = extractValidatedLlmFindings(content, getValidRulePrefixes());
+            const warnings = validation.errors?.length ? `\n\n⚠️ Validation warnings: ${validation.errors.join("; ")}` : "";
+            llmSummary =
+                `### 🤖 LLM Deep Review (model: ${process.env.OPENAI_MODEL || "gpt-4o"})\n` +
+                    (validation.ruleIds.length ? `Detected rule IDs: ${validation.ruleIds.join(", ")}` : "No rule IDs detected.") +
+                    `\n\n${content}${warnings}`;
+        }
+    }
+    catch (err) {
+        llmSummary = `⚠️ LLM deep review failed: ${String(err.message ?? err)}`;
+    }
     // 3. Build review comments
     const comments = [];
     const seen = new Set();
@@ -338,6 +430,9 @@ async function reviewPullRequest(payload, token, config) {
     if (allFindings.length === 0) {
         summaryLines.push("✅ No findings — code looks good!");
     }
+    if (typeof llmSummary === "string") {
+        summaryLines.push("", llmSummary);
+    }
     const reviewBody = summaryLines.join("\n");
     if (comments.length > 0) {
         const reviewRes = await ghApi("POST", `/repos/${repoFullName}/pulls/${prNumber}/reviews`, token, {
@@ -534,8 +629,34 @@ export function loadAppConfig() {
         autoApprove: process.env.JUDGES_AUTO_APPROVE === "true",
         diffOnly: process.env.JUDGES_DIFF_ONLY !== "false",
         configPath: process.env.JUDGES_CONFIG_PATH,
+        llmDeepReview: process.env.JUDGES_LLM_DEEP_REVIEW !== "false", // default on if key exists
     };
 }
+// Test hooks (non-public)
+export function __setEvaluateWithTribunalForTest(fn) {
+    evaluateWithTribunalImpl = fn ?? evaluateWithTribunal;
+}
+export function __setEvaluateProjectForTest(fn) {
+    evaluateProjectImpl = fn ?? evaluateProject;
+}
+export function getEvaluateWithTribunalImpl() {
+    return evaluateWithTribunalImpl;
+}
+export function __getEvaluateWithTribunalImplForTest() {
+    return evaluateWithTribunalImpl;
+}
+export const __test = {
+    __setCallOpenAiChatImplForTest,
+    __getInstallationTokenForTest: (fn) => {
+        getInstallationTokenImpl = fn;
+    },
+    __setGhApiImplForTest,
+    __setEvaluateWithTribunalForTest,
+    __setEvaluateProjectForTest,
+    __getEvaluateWithTribunalImplForTest,
+    parsePatchToHunk,
+    reviewPullRequest,
+};
 // ─── Standalone HTTP Server ─────────────────────────────────────────────────
 /**
  * Start a standalone HTTP server that listens for GitHub webhooks.
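A minimal sketch of how a consumer test might use the new `__test` hooks to keep the module off the network; the deep-import path and the sample patch text are assumptions, not part of this diff:

```ts
import { test } from "node:test";
import assert from "node:assert/strict";
// Assumed import path; adjust to however dist/github-app.js is resolved in your setup.
import { __test } from "@kevinrabun/judges/dist/github-app.js";

test("stubs GitHub API and LLM calls via __test hooks", () => {
    // With these stubs in place, __test.reviewPullRequest could be driven with a
    // synthetic webhook payload (payload shape is not shown in this diff).
    __test.__setGhApiImplForTest(async () => ({ data: {} }));
    __test.__setCallOpenAiChatImplForTest(async () => "No rule IDs detected.");

    // parsePatchToHunk is also exposed for direct unit testing.
    const hunk = __test.parsePatchToHunk("src/app.ts", "@@ -1,1 +1,2 @@\n context\n+added");
    // The new-side content is expected to contain the added line.
    assert.ok(hunk.newContent.includes("added"));
});
```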
package/dist/judges/index.d.ts CHANGED
@@ -1,69 +1,31 @@
 /**
- * Judge barrel side-effect imports trigger self-registration with the
- * unified JudgeRegistry. Each judge file imports its own evaluator and
- * calls `defaultRegistry.register()`, so this file just needs to import
- * each module for its side effects.
+ * Judge registry bootstrap (agent-native).
  *
- * To add a new built-in judge:
- * 1. Create `src/judges/my-judge.ts` (with self-registration)
- * 2. Create `src/evaluators/my-judge.ts` (analyzer)
- * 3. Add a side-effect import here: `import "./my-judge.js";`
+ * Judges are now sourced from `.judge.md` files in the `agents/` folder (legacy
+ * `.agent.md` still supported). Each agent frontmatter references an evaluator
+ * script (in `src/evaluators/`), and the agent loader registers them with the
+ * unified `JudgeRegistry`.
+ *
+ * Legacy side-effect imports have been removed. If you need to add a judge, add
+ * an agent file and (optionally) an evaluator script, then run:
+ * - `npm run generate:agents` (to sync)
+ * - `npm run validate:agents`
 */
 import type { JudgeDefinition } from "../types.js";
-import "./data-security.js";
-import "./cybersecurity.js";
-import "./cost-effectiveness.js";
-import "./scalability.js";
-import "./cloud-readiness.js";
-import "./software-practices.js";
-import "./accessibility.js";
-import "./api-design.js";
-import "./reliability.js";
-import "./observability.js";
-import "./performance.js";
-import "./compliance.js";
-import "./data-sovereignty.js";
-import "./testing.js";
-import "./documentation.js";
-import "./internationalization.js";
-import "./dependency-health.js";
-import "./concurrency.js";
-import "./ethics-bias.js";
-import "./maintainability.js";
-import "./error-handling.js";
-import "./authentication.js";
-import "./database.js";
-import "./caching.js";
-import "./configuration-management.js";
-import "./backwards-compatibility.js";
-import "./portability.js";
-import "./ux.js";
-import "./logging-privacy.js";
-import "./rate-limiting.js";
-import "./ci-cd.js";
-import "./code-structure.js";
-import "./agent-instructions.js";
-import "./ai-code-safety.js";
-import "./framework-safety.js";
-import "./iac-security.js";
-import "./security.js";
-import "./hallucination-detection.js";
-import "./intent-alignment.js";
-import "./api-contract.js";
-import "./multi-turn-coherence.js";
-import "./model-fingerprint.js";
-import "./over-engineering.js";
-import "./logic-review.js";
-import "./false-positive-review.js";
 /**
- * The panel of judges that comprise the Judges Panel.
- *
- * Each judge is a specialized evaluator with deep expertise in a single domain.
- * They operate independently and produce structured findings with
- * severity-rated, actionable recommendations.
- *
- * Note: this snapshot is taken at module-load time, after all built-in judges
- * have self-registered via the side-effect imports above.
+ * Load judges (agent-native). Loads agents from the default `agents/` folder
+ * and returns the current registry snapshot.
+ */
+export declare function loadJudges(): Promise<JudgeDefinition[]>;
+/**
+ * Load agent-based judges from a directory of `.judge.md` files (legacy
+ * `.agent.md` supported). This enables hybrid operation where file-based
+ * agents can augment or replace built-in judges. If a judge is already
+ * registered, it is skipped.
+ */
+export declare function loadAgentJudges(dir?: string): number;
+/**
+ * Snapshot of the currently registered judges. (Agent-native)
 */
 export declare const JUDGES: JudgeDefinition[];
 /**
package/dist/judges/index.js CHANGED
@@ -1,73 +1,59 @@
 /**
- * Judge barrel side-effect imports trigger self-registration with the
- * unified JudgeRegistry. Each judge file imports its own evaluator and
- * calls `defaultRegistry.register()`, so this file just needs to import
- * each module for its side effects.
+ * Judge registry bootstrap (agent-native).
  *
- * To add a new built-in judge:
- * 1. Create `src/judges/my-judge.ts` (with self-registration)
- * 2. Create `src/evaluators/my-judge.ts` (analyzer)
- * 3. Add a side-effect import here: `import "./my-judge.js";`
+ * Judges are now sourced from `.judge.md` files in the `agents/` folder (legacy
+ * `.agent.md` still supported). Each agent frontmatter references an evaluator
+ * script (in `src/evaluators/`), and the agent loader registers them with the
+ * unified `JudgeRegistry`.
+ *
+ * Legacy side-effect imports have been removed. If you need to add a judge, add
+ * an agent file and (optionally) an evaluator script, then run:
+ * - `npm run generate:agents` (to sync)
+ * - `npm run validate:agents`
 */
 import { defaultRegistry } from "../judge-registry.js";
-// ─── Side-effect imports each judge self-registers on import ───────────────
-import "./data-security.js";
-import "./cybersecurity.js";
-import "./cost-effectiveness.js";
-import "./scalability.js";
-import "./cloud-readiness.js";
-import "./software-practices.js";
-import "./accessibility.js";
-import "./api-design.js";
-import "./reliability.js";
-import "./observability.js";
-import "./performance.js";
-import "./compliance.js";
-import "./data-sovereignty.js";
-import "./testing.js";
-import "./documentation.js";
-import "./internationalization.js";
-import "./dependency-health.js";
-import "./concurrency.js";
-import "./ethics-bias.js";
-import "./maintainability.js";
-import "./error-handling.js";
-import "./authentication.js";
-import "./database.js";
-import "./caching.js";
-import "./configuration-management.js";
-import "./backwards-compatibility.js";
-import "./portability.js";
-import "./ux.js";
-import "./logging-privacy.js";
-import "./rate-limiting.js";
-import "./ci-cd.js";
-import "./code-structure.js";
-import "./agent-instructions.js";
-import "./ai-code-safety.js";
-import "./framework-safety.js";
-import "./iac-security.js";
-import "./security.js";
-import "./hallucination-detection.js";
-import "./intent-alignment.js";
-import "./api-contract.js";
-import "./multi-turn-coherence.js";
-import "./model-fingerprint.js";
-import "./over-engineering.js";
-import "./logic-review.js";
-import "./false-positive-review.js";
+import { loadAndRegisterAgents } from "../agent-loader.js";
+import { resolve, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+let agentsLoaded = false;
+function loadDefaultAgents() {
+    if (agentsLoaded)
+        return;
+    const agentsDir = resolve(__dirname, "..", "..", "agents");
+    loadAndRegisterAgents(agentsDir, defaultRegistry);
+    agentsLoaded = true;
+}
+// ─── Optional Agent Loader Integration ──────────────────────────────────────
+/**
+ * Load judges (agent-native). Loads agents from the default `agents/` folder
+ * and returns the current registry snapshot.
+ */
+export async function loadJudges() {
+    loadDefaultAgents();
+    return defaultRegistry.getJudges();
+}
+/**
+ * Load agent-based judges from a directory of `.judge.md` files (legacy
+ * `.agent.md` supported). This enables hybrid operation where file-based
+ * agents can augment or replace built-in judges. If a judge is already
+ * registered, it is skipped.
+ */
+export function loadAgentJudges(dir = resolve(__dirname, "..", "..", "agents")) {
+    agentsLoaded = false; // allow re-run to pick up new agents if dir changes
+    const count = loadAndRegisterAgents(dir, defaultRegistry);
+    agentsLoaded = true;
+    return count;
+}
 // ─── Re-exports backed by the registry ──────────────────────────────────────
 /**
- * The panel of judges that comprise the Judges Panel.
- *
- * Each judge is a specialized evaluator with deep expertise in a single domain.
- * They operate independently and produce structured findings with
- * severity-rated, actionable recommendations.
- *
- * Note: this snapshot is taken at module-load time, after all built-in judges
- * have self-registered via the side-effect imports above.
+ * Snapshot of the currently registered judges. (Agent-native)
  */
-export const JUDGES = defaultRegistry.getJudges();
+export const JUDGES = (() => {
+    loadDefaultAgents();
+    return defaultRegistry.getJudges();
+})();
 /**
  * Look up a judge by ID.
  */
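For readers consuming the new loader API, a minimal usage sketch; the deep-import path and the local agents directory are assumptions:

```ts
// Assumed deep-import path into the published package.
import { loadJudges, loadAgentJudges } from "@kevinrabun/judges/dist/judges/index.js";

// Registers the bundled agents/ folder once and returns the registry snapshot.
const judges = await loadJudges();
console.log(`${judges.length} judges registered`);

// Layer in project-local .judge.md files; already-registered judge IDs are skipped.
const added = loadAgentJudges("./my-agents");
console.log(`${added} additional agent judges loaded`);
```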
package/dist/patches/apply.d.ts ADDED
@@ -0,0 +1,15 @@
+import type { Finding } from "../types.js";
+export interface ApplyPatchOptions {
+    dryRun?: boolean;
+    cwd?: string;
+}
+export interface ApplyPatchResult {
+    applied: number;
+    skipped: number;
+    errors: string[];
+}
+/**
+ * Minimal safe apply pipeline. For now, uses `git apply --3way` when available.
+ * This is intentionally conservative; if anything fails, it records an error and continues.
+ */
+export declare function applyPatchesFromFindings(findings: Finding[], opts?: ApplyPatchOptions): ApplyPatchResult;
package/dist/patches/apply.js ADDED
@@ -0,0 +1,37 @@
+import { execSync } from "node:child_process";
+/**
+ * Minimal safe apply pipeline. For now, uses `git apply --3way` when available.
+ * This is intentionally conservative; if anything fails, it records an error and continues.
+ */
+export function applyPatchesFromFindings(findings, opts = {}) {
+    const errors = [];
+    let applied = 0;
+    let skipped = 0;
+    const cwd = opts.cwd ?? process.cwd();
+    for (const f of findings) {
+        const patchText = f.patch?.newText ?? f.suggestedFix;
+        if (!patchText) {
+            skipped++;
+            continue;
+        }
+        try {
+            if (opts.dryRun) {
+                // simulate success
+                applied++;
+                continue;
+            }
+            const fileLike = f;
+            const filePath = fileLike.filePath ?? fileLike._file ?? "file";
+            const patchWithHeader = patchText.startsWith("diff --git")
+                ? patchText
+                : `diff --git a/${filePath} b/${filePath}\n${patchText}`;
+            execSync("git apply --3way -", { cwd, input: patchWithHeader, stdio: "pipe" });
+            applied++;
+        }
+        catch (err) {
+            errors.push(String(err.message ?? err));
+            skipped++;
+        }
+    }
+    return { applied, skipped, errors };
+}
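A small usage sketch; the import paths are assumed, and the abbreviated finding shape is illustrative only (with `dryRun` the patch text never reaches `git apply`):

```ts
import { applyPatchesFromFindings } from "@kevinrabun/judges/dist/patches/apply.js";
import type { Finding } from "@kevinrabun/judges/dist/types.js";

// Partial finding for illustration; real findings come from the evaluators.
const findings = [
    {
        filePath: "src/app.ts",
        suggestedFix: "@@ -1 +1 @@\n-var x = 1;\n+const x = 1;\n",
    },
] as unknown as Finding[];

// dryRun counts the patch as applied without shelling out to git.
const { applied, skipped, errors } = applyPatchesFromFindings(findings, { dryRun: true });
console.log({ applied, skipped, errors }); // { applied: 1, skipped: 0, errors: [] }
```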
package/dist/tools/prompts.d.ts CHANGED
@@ -1,8 +1,8 @@
 import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
 /** Adversarial evaluation stance — shared across all judges. */
-export declare const SHARED_ADVERSARIAL_MANDATE = "ADVERSARIAL MANDATE (applies to ALL judges):\n- Your role is adversarial: assume the code has problems and actively hunt for them. Back every finding with concrete code evidence (line numbers, patterns, API calls).\n- Never praise or compliment the code. Report only problems, risks, and deficiencies.\n- If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.\n- If no concrete issues are found after thorough analysis, report zero findings. Do not pad the report with speculative issues.";
+export declare const SHARED_ADVERSARIAL_MANDATE = "ADVERSARIAL MANDATE (applies to ALL judges):\n- Examine the code critically and look for genuine issues. Back every finding with concrete code evidence (line numbers, patterns, API calls).\n- Report only real problems, risks, and deficiencies that exist in the actual code.\n- If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.\n- If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code.";
 /** Precision override — ensures evidence-based findings. */
-export declare const PRECISION_MANDATE = "PRECISION MANDATE (overrides adversarial stance when in conflict):\n- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence must be discarded.\n- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.\n- Speculative, hypothetical, or \"just in case\" findings erode developer trust. Only flag issues you are confident exist in the actual code.\n- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.\n- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.\n- Clean, well-structured code exists. Acknowledge it by not forcing false issues.";
+export declare const PRECISION_MANDATE = "PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):\n- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded \u2014 no exceptions.\n- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.\n- Speculative, hypothetical, or \"just in case\" findings erode developer trust. Only flag issues you are confident exist in the actual code.\n- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.\n- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.\n- Clean, well-structured code exists. Acknowledge it by not forcing false issues.\n- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.\n- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.\n- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (\u226580%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.\n- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.";
 /**
  * Extract only the unique evaluation criteria from a judge's systemPrompt,
  * stripping the persona introduction line, the ADVERSARIAL MANDATE block,
package/dist/tools/prompts.js CHANGED
@@ -17,18 +17,22 @@ import { JUDGES } from "../judges/index.js";
 // ──────────────────────────────────────────────────────────────────────────────
 /** Adversarial evaluation stance — shared across all judges. */
 export const SHARED_ADVERSARIAL_MANDATE = `ADVERSARIAL MANDATE (applies to ALL judges):
-- Your role is adversarial: assume the code has problems and actively hunt for them. Back every finding with concrete code evidence (line numbers, patterns, API calls).
-- Never praise or compliment the code. Report only problems, risks, and deficiencies.
+- Examine the code critically and look for genuine issues. Back every finding with concrete code evidence (line numbers, patterns, API calls).
+- Report only real problems, risks, and deficiencies that exist in the actual code.
 - If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
-- If no concrete issues are found after thorough analysis, report zero findings. Do not pad the report with speculative issues.`;
+- If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code.`;
 /** Precision override — ensures evidence-based findings. */
-export const PRECISION_MANDATE = `PRECISION MANDATE (overrides adversarial stance when in conflict):
-- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence must be discarded.
+export const PRECISION_MANDATE = `PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):
+- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded — no exceptions.
 - Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.
 - Speculative, hypothetical, or "just in case" findings erode developer trust. Only flag issues you are confident exist in the actual code.
 - Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.
 - If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.
-- Clean, well-structured code exists. Acknowledge it by not forcing false issues.`;
+- Clean, well-structured code exists. Acknowledge it by not forcing false issues.
+- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.
+- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.
+- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (≥80%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.
+- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.`;
 // ─── Criteria Extraction ─────────────────────────────────────────────────────
 /**
  * Extract only the unique evaluation criteria from a judge's systemPrompt,
@@ -73,18 +77,24 @@ export function getCondensedCriteria(systemPrompt) {
  */
 export function registerPrompts(server) {
     // ── Per-judge prompts ──────────────────────────────────────────────────
-    // Each prompt includes the judge's full systemPrompt + precision mandate
-    // so the LLM has complete evaluation criteria for single-judge reviews.
+    // Each prompt uses condensed criteria (adversarial mandate stripped) plus
+    // the shared mandates, mirroring the tribunal architecture for consistency
+    // and better precision on clean code.
     for (const judge of JUDGES) {
         server.prompt(`judge-${judge.id}`, `Use the ${judge.name} persona to perform a deep ${judge.domain} review of code. This prompt provides the judge's expert criteria for LLM-powered analysis that goes beyond pattern matching.`, {
             code: z.string().describe("The source code to evaluate"),
             language: z.string().describe("The programming language"),
             context: z.string().optional().describe("Additional context about the code"),
         }, async ({ code, language, context }) => {
-            const userMessage = `${judge.systemPrompt}\n\n${PRECISION_MANDATE}\n\n` +
+            const persona = judge.systemPrompt.substring(0, judge.systemPrompt.indexOf("\n\n"));
+            const criteria = getCondensedCriteria(judge.systemPrompt);
+            const userMessage = `${persona}\n\n` +
+                `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
+                `${PRECISION_MANDATE}\n\n` +
+                `${criteria}\n\n` +
                `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
                (context ? `\n\nAdditional context: ${context}` : "") +
-                `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. End with an overall score (0-100) and verdict (pass/warning/fail).`;
+                `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`;
             return {
                 messages: [
                     {
@@ -118,6 +128,7 @@ export function registerPrompts(server) {
         `2. Verdict (PASS / WARNING / FAIL)\n` +
         `3. Score (0-100)\n` +
         `4. Specific findings with rule IDs (using each judge's rule prefix), severity, and recommendations\n\n` +
+        `For judges where no issues meet the confidence threshold, report a PASS verdict with zero findings.\n\n` +
        `Then provide an OVERALL TRIBUNAL VERDICT that synthesizes all judges' input.\n\n` +
        `## The Judges\n\n${judgeInstructions}\n\n` +
        `## Code to Evaluate\n\n\`\`\`${language}\n${code}\n\`\`\`` +
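In plain terms, each per-judge prompt is now assembled from four blocks instead of the full systemPrompt. A self-contained schematic of the layout, with the mandate texts trimmed and `getCondensedCriteria` stubbed for illustration:

```ts
// Schematic only: mirrors the assembly in the diff above, not new API surface.
const SHARED_ADVERSARIAL_MANDATE = "ADVERSARIAL MANDATE (applies to ALL judges): ...";
const PRECISION_MANDATE = "PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict): ...";
const judge = {
    systemPrompt: "You are the Performance Judge.\n\nCriteria:\n- Flag O(n^2) loops over request data.",
};
// The real getCondensedCriteria also strips mandate blocks; stubbed here.
const getCondensedCriteria = (p: string) => p.slice(p.indexOf("\n\n") + 2);

const persona = judge.systemPrompt.substring(0, judge.systemPrompt.indexOf("\n\n"));
const userMessage = [
    persona,                                  // first paragraph of judge.systemPrompt
    SHARED_ADVERSARIAL_MANDATE,               // shared stance
    PRECISION_MANDATE,                        // overrides the stance on conflict
    getCondensedCriteria(judge.systemPrompt), // judge-specific criteria only
].join("\n\n");
console.log(userMessage); // the code-to-evaluate block is appended after this
```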
package/docs/skills.md ADDED
@@ -0,0 +1,7 @@
+# Skills Catalog
+
+| ID | Name | Description | Tags | Agents |
+| --- | --- | --- | --- | --- |
+| security-review | Security Review Skill | "Security-focused review for production readiness,covering AppSec,DataSec,AuthZ,and IaC." | security, appsec, datasec | cybersecurity, data-security, authentication, logging-privacy, api-contract, database, iac-security, framework-safety, dependency-health, configuration-management, rate-limiting, compliance, data-sovereignty, security, ai-code-safety, false-positive-review |
+| release-gate | Release Gate Skill | "Pre-deploy release gate combining reliability,observability,CI/CD,and security checks." | release, sre, reliability, deployment | reliability, observability, performance, ci-cd, testing, cloud-readiness, cost-effectiveness, security, data-security, cybersecurity, false-positive-review |
+| ai-code-review | AI Code Review Skill | "Full-spectrum AI-generated code review using the Judges Panel,tuned for minimizing false positives and focusing on AI-specific failure modes." | ai-code, code-review, tribunal | ai-code-safety, hallucination-detection, logic-review, over-engineering, code-structure, maintainability, performance, reliability, cybersecurity, data-security, authentication, api-design, api-contract, database, caching, observability, logging-privacy, configuration-management, dependency-health, framework-safety, testing, ci-cd, intent-alignment, multi-turn-coherence, model-fingerprint, agent-instructions, cloud-readiness, cost-effectiveness, ethics-bias, accessibility, internationalization, data-sovereignty, iac-security, rate-limiting, portability, ux, backwards-compatibility, security, false-positive-review |
package/package.json CHANGED
@@ -1,10 +1,13 @@
 {
   "name": "@kevinrabun/judges",
-  "version": "3.113.0",
+  "version": "3.115.0",
   "description": "45 specialized judges that evaluate AI-generated code for security, cost, and quality.",
   "mcpName": "io.github.KevinRabun/judges",
   "type": "module",
   "main": "dist/index.js",
+  "bin": {
+    "judges": "packages/judges-cli/bin/judges.js"
+  },
   "exports": {
     ".": {
       "import": "./dist/api.js",
@@ -78,6 +81,14 @@
     "dist/fingerprint.d.ts",
     "dist/fix-history.js",
     "dist/fix-history.d.ts",
+    "agents/**/*.judge.md",
+    "skills/**/*.skill.md",
+    "docs/skills.md",
+    "scripts/generate-agents-from-judges.ts",
+    "scripts/validate-agents.ts",
+    "scripts/generate-skills-docs.ts",
+    "src/agent-loader.ts",
+    "src/skill-loader.ts",
     "dist/github-app.js",
     "dist/github-app.d.ts",
     "dist/index.js",
@@ -156,7 +167,7 @@
     "test": "npx tsx --test \"tests/**/*.test.ts\"",
     "test:coverage": "node scripts/run-tests-with-coverage.mjs",
     "self-eval": "npx tsx scripts/self-eval.ts",
-    "check": "tsc --noEmit && eslint src/ tests/ && npx tsx scripts/self-eval.ts",
+    "check": "tsc --noEmit && eslint src/ tests/ && npx tsx scripts/self-eval.ts && npm run validate:agents && npm run check:agents",
     "lint": "eslint src/ tests/",
     "lint:fix": "eslint src/ tests/ --fix",
     "format": "prettier --write \"src/**/*.ts\" \"tests/**/*.ts\"",
@@ -165,8 +176,12 @@
     "report:public-repo": "npx tsx scripts/generate-public-repo-report.ts",
     "report:quickstart": "npx tsx scripts/generate-public-repo-report.ts --quickStart",
     "automation:daily-popular": "npx tsx scripts/daily-popular-repo-autofix.ts",
-    "benchmark:llm": "npx tsx scripts/run-llm-benchmark.ts",
     "sync-docs": "npx tsx scripts/sync-docs.ts",
+    "generate:agents": "npx tsx scripts/generate-agents-from-judges.ts",
+    "generate:agents:force": "npx tsx scripts/generate-agents-from-judges.ts --force",
+    "validate:agents": "npx tsx scripts/validate-agents.ts",
+    "docs:skills": "npx tsx scripts/generate-skills-docs.ts",
+    "check:agents": "npx tsx scripts/check-agents.ts",
     "prepublishOnly": "npm run build",
     "prepare": "husky"
   },
package/packages/judges-cli/README.md ADDED
@@ -0,0 +1,24 @@
+# @kevinrabun/judges-cli
+
+Standalone CLI package for Judges.
+
+## Install
+
+```bash
+npm install -g @kevinrabun/judges-cli
+```
+
+## Usage
+
+```bash
+judges eval src/app.ts
+judges list
+judges hook install
+
+# Agentic skills
+judges skill ai-code-review --file src/app.ts
+judges skill security-review --file src/api.ts --format json
+judges skills # list available skills
+```
+
+Use `@kevinrabun/judges` when you need the MCP server or programmatic API.