npm - @kevinrabun/judges - Versions diffs - 3.121.0 → 3.123.0 - Mend

@kevinrabun/judges 3.121.0 → 3.123.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

package/README.md +10 -9
package/agents/authentication.judge.md +17 -0
package/agents/cybersecurity.judge.md +18 -0
package/agents/error-handling.judge.md +9 -0
package/agents/rate-limiting.judge.md +9 -0
package/agents/security.judge.md +18 -0
package/dist/api.d.ts +1 -2
package/dist/api.js +1 -3
package/dist/commands/benchmark-advanced.js +1 -1
package/dist/commands/benchmark-ai-agents.js +3 -3
package/dist/commands/benchmark-infrastructure.js +1 -1
package/dist/commands/benchmark-quality-ops.js +1 -1
package/dist/commands/benchmark.d.ts +7 -0
package/dist/commands/benchmark.js +7 -7
package/dist/commands/llm-benchmark.d.ts +22 -2
package/dist/commands/llm-benchmark.js +184 -23
package/dist/config.js +1 -1
package/dist/judges/authentication.js +17 -0
package/dist/judges/cybersecurity.js +18 -0
package/dist/judges/error-handling.js +9 -0
package/dist/judges/rate-limiting.js +9 -0
package/dist/judges/security.js +18 -0
package/dist/probabilistic/llm-response-validator.js +2 -2
package/dist/reports/public-repo-report.js +9 -1
package/dist/skill-loader.js +9 -6
package/dist/tools/prompts.d.ts +3 -2
package/dist/tools/prompts.js +25 -51
package/package.json +2 -2
package/server.json +13 -4
package/src/skill-loader.ts +9 -6

package/README.md CHANGED Viewed

@@ -15,7 +15,7 @@ An MCP (Model Context Protocol) server that provides a panel of **45 specialized
 [![npm](https://img.shields.io/npm/v/@kevinrabun/judges)](https://www.npmjs.com/package/@kevinrabun/judges)
 [![npm downloads](https://img.shields.io/npm/dw/@kevinrabun/judges)](https://www.npmjs.com/package/@kevinrabun/judges)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-[![Tests](https://img.shields.io/badge/tests-2481-brightgreen)](https://github.com/KevinRabun/judges/actions)
+[![Tests](https://img.shields.io/badge/tests-2482-brightgreen)](https://github.com/KevinRabun/judges/actions)
 > 🔰 **Packages**
 > - **CLI**: `@kevinrabun/judges-cli` → binary `judges` (use `npx @kevinrabun/judges-cli eval --file app.ts`).
@@ -731,6 +731,8 @@ Use `--preset` to apply pre-configured evaluation settings:
 | `healthtech` | Healthcare — HIPAA compliance, data sovereignty, encryption, audit trails |
 | `saas` | Multi-tenant SaaS — tenant isolation, rate limiting, scalability |
 | `government` | Government/public sector — compliance, sovereignty, authentication |
+| `open-source` | Open-source projects — documentation, backwards compatibility, security, dependency health |
+| `ai-review` | AI-generated code review — hallucination detection, security, authentication, correctness |
 ```bash
 judges eval --preset security-only src/api.ts
@@ -833,7 +835,7 @@ The tribunal operates in three layers:
 2. **AST-Based Structural Analysis** — The Code Structure judge (`STRUCT-*` rules) uses real Abstract Syntax Tree parsing to measure cyclomatic complexity, nesting depth, function length, parameter count, dead code, and type safety with precision that regex cannot achieve. All supported languages — **TypeScript, JavaScript, Python, Rust, Go, Java, C#, and C++** — are parsed via **tree-sitter WASM grammars** (real syntax trees compiled to WebAssembly, in-process, zero native dependencies). A scope-tracking structural parser is kept as a fallback when WASM grammars are unavailable. No external AST server required.
-3. **LLM-Powered Deep Analysis (Prompts)** — The server exposes MCP prompts (e.g., `judge-data-security`, `full-tribunal`) that provide each judge's expert persona as a system prompt. When used by an LLM-based client (Copilot, Claude, Cursor, etc.), the host LLM performs deeper, context-aware probabilistic analysis beyond what static patterns can detect. This is where the `systemPrompt` on each judge comes alive — Judges itself makes no LLM calls, but it provides the expert criteria so your AI assistant can act as 45 specialized reviewers.
+3. **LLM-Powered Deep Analysis (Prompts)** — The server exposes MCP prompts (e.g., `judge-data-security`, `judge-cybersecurity`) that provide each judge's expert persona as a system prompt. When used by an LLM-based client (Copilot, Claude, Cursor, etc.), the host LLM performs deeper, context-aware probabilistic analysis beyond what static patterns can detect. This is where the `systemPrompt` on each judge comes alive — Judges itself makes no LLM calls, but it provides the expert criteria so your AI assistant can act as 45 specialized reviewers.
 ---
@@ -877,7 +879,7 @@ When your AI coding assistant connects to multiple MCP servers, each one contrib
   │   Judges     │  │  CVE / │  │ Linter │
   │   Panel      │  │  SBOM  │  │ Server │
   │ ─────────────│  └────────┘  └────────┘
-  │ 36 Heuristic │   Vuln DB     Style &
+  │ 44 Heuristic │   Vuln DB     Style &
   │   judges     │   scanning    correctness
   │ + AST judge  │
   └──────────────┘
@@ -1130,7 +1132,7 @@ Re-run the tribunal with **prior findings as context** for iterative refinement.
 #### Judge IDs
-`data-security` · `cybersecurity` · `cost-effectiveness` · `scalability` · `cloud-readiness` · `software-practices` · `accessibility` · `api-design` · `reliability` · `observability` · `performance` · `compliance` · `data-sovereignty` · `testing` · `documentation` · `internationalization` · `dependency-health` · `concurrency` · `ethics-bias` · `maintainability` · `error-handling` · `authentication` · `database` · `caching` · `configuration-management` · `backwards-compatibility` · `portability` · `ux` · `logging-privacy` · `rate-limiting` · `ci-cd` · `code-structure` · `agent-instructions` · `ai-code-safety` · `framework-safety` · `iac-security` · `false-positive-review`
+`data-security` · `cybersecurity` · `security` · `cost-effectiveness` · `scalability` · `cloud-readiness` · `software-practices` · `accessibility` · `api-design` · `api-contract` · `reliability` · `observability` · `performance` · `compliance` · `data-sovereignty` · `testing` · `documentation` · `internationalization` · `dependency-health` · `concurrency` · `ethics-bias` · `maintainability` · `error-handling` · `authentication` · `database` · `caching` · `configuration-management` · `backwards-compatibility` · `portability` · `ux` · `logging-privacy` · `rate-limiting` · `ci-cd` · `code-structure` · `agent-instructions` · `ai-code-safety` · `framework-safety` · `iac-security` · `hallucination-detection` · `intent-alignment` · `multi-turn-coherence` · `model-fingerprint` · `over-engineering` · `logic-review` · `false-positive-review`
 ---
@@ -1186,7 +1188,6 @@ Each judge has a corresponding prompt for LLM-powered deep analysis:
 | `judge-over-engineering` | Deep review of unnecessary abstractions, wrapper-mania, premature generalization |
 | `judge-logic-review` | Deep review of logic correctness, semantic mismatches, and dead code in AI-generated code |
 | `judge-false-positive-review` | Meta-judge review of pattern-based findings for false positive detection and accuracy |
-| `full-tribunal` | all 45 judges in a single prompt |
 <!-- PROMPTS_TABLE_END -->
 ---
@@ -1216,7 +1217,7 @@ Create a `.judgesrc.json` (or `.judgesrc`) file in your project root to customiz
 | Field | Type | Default | Description |
 |-------|------|---------|-------------|
 | `$schema` | `string` | — | JSON Schema URL for IDE validation |
-| `preset` | `string` | — | Named preset (see [Named Presets](#named-presets) for all 18 options) |
+| `preset` | `string` | — | Named preset (see [Named Presets](#named-presets) for all 22 options) |
 | `minSeverity` | `string` | `"info"` | Minimum severity to report: `critical` · `high` · `medium` · `low` · `info` |
 | `disabledRules` | `string[]` | `[]` | Rule IDs or prefix wildcards to suppress (e.g. `"COST-*"`, `"SEC-003"`) |
 | `disabledJudges` | `string[]` | `[]` | Judge IDs to skip entirely (e.g. `"cost-effectiveness"`) |
@@ -1344,7 +1345,7 @@ judges/
 │   ├── evaluators/           # Analysis engine for each judge
 │   │   ├── index.ts          # evaluateWithJudge(), evaluateWithTribunal(), evaluateProject(), etc.
 │   │   ├── shared.ts         # Scoring, verdict logic, markdown formatters
-│   │   └── *.ts              # One analyzer per judge (39 files)
+│   │   └── *.ts              # One analyzer per judge (45 files)
 │   ├── formatters/           # Output formatters
 │   │   ├── sarif.ts              # SARIF 2.1.0 output
 │   │   ├── html.ts               # Self-contained HTML report (dark/light theme, filters)
@@ -1371,12 +1372,12 @@ judges/
 │   │   └── config-share.ts       # Shareable team/org configuration
 │   ├── presets.ts            # Named evaluation presets (strict, lenient, security-only, …)
 │   ├── patches/
-│   │   └── index.ts              # 53 deterministic auto-fix patch rules
+│   │   └── index.ts              # 201 deterministic auto-fix patch rules
 │   ├── tools/                # MCP tool registrations
 │   │   ├── register.ts           # Tool registration orchestrator
 │   │   ├── register-evaluation.ts    # Evaluation tools (evaluate_code, etc.)
 │   │   ├── register-workflow.ts      # Workflow tools (app builder, reports, etc.)
-│   │   ├── prompts.ts            # MCP prompt registrations (per-judge + full-tribunal)
+│   │   ├── prompts.ts            # MCP prompt registrations (per-judge prompts)
 │   │   └── schemas.ts            # Zod schemas for tool parameters
 │   ├── reports/
 │   │   └── public-repo-report.ts   # Public repo clone + full tribunal report generation

package/agents/authentication.judge.md CHANGED Viewed

@@ -30,6 +30,23 @@ RULES FOR YOUR EVALUATION:
 - Flag any endpoint that accepts user input without verifying the caller's identity and permissions.
 - Score from 0-100 where 100 means robust auth implementation.
+CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
+- Authentication middleware protects all routes that handle user data or state changes.
+- Passwords are hashed with bcrypt, scrypt, or argon2 — not stored in plaintext or weak hashes.
+- JWTs are verified with explicit algorithm restrictions, expiration, and issuer/audience checks.
+- Sessions use secure, httpOnly, sameSite cookies with proper expiration and rotation.
+- OAuth/OIDC flows use PKCE, validate state parameters, and allowlist redirect URIs.
+- API keys are transmitted in headers (not query params) and scoped to minimum permissions.
+If the code meets these criteria, authentication is implemented correctly. Do NOT manufacture findings.
+DOMAIN BOUNDARY (defer these to other judges):
+- Injection attacks and XSS exploit paths → defer to CYBER judge.
+- General security posture and cryptographic practices → defer to SEC judge.
+- Rate limiting on login endpoints → defer to RATE judge (unless auth logic itself is broken).
+- Error handling in auth flows → defer to ERR judge.
+- Data privacy in auth tokens/logs → defer to DATA/LOGPRIV judges.
+Only flag issues within YOUR domain: authentication middleware gaps, credential handling, token security, session management, authorization checks, OAuth/OIDC implementation, privilege escalation.
 FALSE POSITIVE AVOIDANCE:
 - Do NOT flag code that uses established authentication libraries (passport, next-auth, Spring Security, etc.) following their documented patterns.
 - JWT verification with explicit algorithm restrictions and proper expiration checks is correct implementation, not a vulnerability.

package/agents/cybersecurity.judge.md CHANGED Viewed

@@ -29,6 +29,24 @@ RULES FOR YOUR EVALUATION:
 - Reference OWASP, CWE IDs, and CVE IDs where applicable.
 - Score from 0-100 where 100 means no exploitable vulnerabilities found.
+CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
+- Input validation and sanitization are applied to user-controlled data before use in queries, commands, or output.
+- Passwords are hashed with bcrypt, scrypt, or argon2 — not MD5/SHA1.
+- Database queries use parameterized statements or an ORM with proper escaping.
+- Security middleware is present (helmet, CORS, CSRF tokens) for web applications.
+- Secrets are loaded from environment variables or a secrets manager, not hardcoded.
+- Dependencies are imported from standard registries with version pinning.
+- Error responses do not leak stack traces or internal details to clients.
+If the code meets these criteria, it is implementing security correctly. Do NOT manufacture findings.
+DOMAIN BOUNDARY (defer these to other judges):
+- Rate limiting, throttling, and abuse prevention → defer to RATE judge.
+- Authentication flows, session management, OAuth/OIDC → defer to AUTH judge.
+- General security posture, defense-in-depth patterns → defer to SEC judge.
+- Error handling completeness and error propagation → defer to ERR judge.
+- Data privacy, PII handling, logging of sensitive data → defer to DATA/LOGPRIV judges.
+Only flag issues within YOUR domain: injection attacks, XSS, CSRF/SSRF, dependency CVEs, cryptographic weaknesses, OWASP Top 10 violations with concrete exploit paths.
 FALSE POSITIVE AVOIDANCE:
 - Do NOT flag established security library usage (helmet, cors, bcrypt, argon2, parameterized queries) as security issues — these ARE the correct patterns.
 - Code that properly validates input, uses HTTPS, and parameterizes queries is implementing security correctly.

package/agents/error-handling.judge.md CHANGED Viewed

@@ -30,6 +30,15 @@ RULES FOR YOUR EVALUATION:
 - Flag any code path that could throw without a handler in scope.
 - Score from 0-100 where 100 means robust error handling.
+CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
+- Try-catch blocks wrap code paths that can throw, with meaningful handling (log, re-throw, or recover).
+- Async operations use try-catch or .catch() to handle rejections.
+- Error responses return consistent structures with appropriate HTTP status codes.
+- Resources (connections, file handles, streams) are cleaned up in finally blocks or using disposal patterns.
+- Framework error middleware or global handlers are present (Express error middleware, Spring @ExceptionHandler, etc.).
+- Stack traces and internal details are not exposed to end users in error responses.
+If the code meets these criteria, error handling is implemented correctly. Do NOT manufacture findings.
 FALSE POSITIVE AVOIDANCE:
 - Do NOT flag error handling in code that delegates error handling to a framework (Express middleware, Spring @ExceptionHandler, etc.).
 - Try-catch with logging and re-throw is a valid error handling pattern, not a deficiency.

package/agents/rate-limiting.judge.md CHANGED Viewed

@@ -30,6 +30,15 @@ RULES FOR YOUR EVALUATION:
 - Consider both inbound (protecting your service) and outbound (respecting others') rate limits.
 - Score from 0-100 where 100 means comprehensive rate limiting.
+CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
+- Rate limiting middleware is applied to public-facing API endpoints (express-rate-limit, API gateway config, etc.).
+- Request body size limits are configured (bodyParser limits, multer limits, etc.).
+- List/query endpoints have pagination with enforced maximum page sizes.
+- External API calls use bounded retries with exponential backoff and jitter.
+- Connection pools and concurrent request limits are bounded.
+If the code meets these criteria, rate limiting is implemented correctly. Do NOT manufacture findings.
+IMPORTANT: CLI tools, data scripts, utility libraries, batch processors, and internal services do NOT need rate limiting. If the code is not a public-facing API or web server, report ZERO findings.
 FALSE POSITIVE AVOIDANCE:
 - Only flag rate-limiting issues in code that accepts external requests (APIs, WebSocket servers, public endpoints).
 - Do NOT flag internal services, batch processors, CLI tools, or cron jobs for missing rate limiting.

package/agents/security.judge.md CHANGED Viewed

@@ -30,6 +30,24 @@ RULES FOR YOUR EVALUATION:
 - Reference CWE IDs where applicable.
 - Score from 0-100 where 100 means excellent security posture.
+CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
+- Security middleware is configured (helmet, CORS, CSRF protection) for web applications.
+- User input is validated before use in data flows (queries, file ops, HTTP requests).
+- Cryptographic operations use modern algorithms (AES-256, SHA-256+, bcrypt/argon2).
+- Secrets are sourced from environment variables or a secrets manager, not hardcoded.
+- Deserialization of untrusted data uses safe mechanisms (JSON.parse, not pickle/eval).
+- JWT/token verification includes algorithm restrictions and expiration checks.
+- No user-controlled URLs are used in redirects without validation.
+If the code meets these criteria, it has a strong security posture. Do NOT manufacture findings.
+DOMAIN BOUNDARY (defer these to other judges):
+- Injection attacks (SQL, XSS, command injection) with exploit paths → defer to CYBER judge.
+- Authentication flows, credential storage, session management → defer to AUTH judge.
+- Rate limiting and abuse prevention → defer to RATE judge.
+- Error handling patterns and error propagation → defer to ERR judge.
+- Infrastructure-as-code security → defer to IAC judge.
+Only flag issues within YOUR domain: insecure data flows, weak cryptography, missing security controls, unsafe deserialization, XML security, secret management, mass assignment, redirect validation.
 FALSE POSITIVE AVOIDANCE:
 - Do NOT flag code that uses established security libraries correctly (helmet, bcrypt, argon2, parameterized queries, CSRF tokens, rate limiters, proper TLS configuration).
 - Do NOT flag security controls in non-application code (CI/CD configs, IaC templates, documentation examples) unless they contain actual secrets or credentials.

package/dist/api.d.ts CHANGED Viewed

@@ -70,10 +70,9 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
 export type { ToolProfile, ToolCapability, ComparisonResult } from "./comparison.js";
 export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
 export type { BenchmarkCase, BenchmarkResult, BenchmarkGateOptions, BenchmarkGateResult, L2CoverageAnalysis, L2JudgeCoverage, L2CategoryCoverage, } from "./commands/benchmark.js";
-export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
+export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
 export type { LlmBenchmarkSnapshot, LlmCaseResult } from "./commands/llm-benchmark.js";
 export type { LlmFinding, ValidationResult } from "./probabilistic/llm-response-validator.js";
-export { optimizeBenchmark, formatAmendmentSection, createEmptyStore, mergeAmendments, } from "./commands/llm-benchmark-optimizer.js";
 export type { PromptAmendment, OptimizerInsight, OptimizationResult, AmendmentStore, } from "./commands/llm-benchmark-optimizer.js";
 export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
 export { buildContextSnippets } from "./context/context-snippets.js";

package/dist/api.js CHANGED Viewed

@@ -80,9 +80,7 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
 // ─── Benchmark Gate ──────────────────────────────────────────────────────────
 export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
 // ─── LLM Benchmark ──────────────────────────────────────────────────────────
-export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
-// ─── LLM Benchmark Optimizer (Self-Teaching) ────────────────────────────────
-export { optimizeBenchmark, formatAmendmentSection, createEmptyStore, mergeAmendments, } from "./commands/llm-benchmark-optimizer.js";
+export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
 // Review autopilot (GitHub App / scripts)
 export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
 export { buildContextSnippets } from "./context/context-snippets.js";

package/dist/commands/benchmark-advanced.js CHANGED Viewed

@@ -229,7 +229,7 @@ def load_config(path):
     public void authenticateUser(String u, String p) { /* 40 lines */ }
     public void scheduleTask(Task t) { /* 20 lines */ }
 }`,
-        expectedRuleIds: ["DATA-001", "COMP-001", "SOV-001", "DOC-001"],
+        expectedRuleIds: ["STRUCT-001", "MAINT-001", "DOC-001"],
         category: "code-structure",
         difficulty: "medium",
     },

package/dist/commands/benchmark-ai-agents.js CHANGED Viewed

@@ -27,7 +27,7 @@ app.post("/api/users", validateSchema(userSchema), async (req, res) => {
   await user.save();
   res.json({ id: user.id });
 });`,
-        expectedRuleIds: ["SCALE-001", "PERF-001", "COST-001", "API-001", "COMP-001"],
+        expectedRuleIds: ["HALLU-001"],
         category: "hallucination",
         difficulty: "medium",
     },
@@ -57,7 +57,7 @@ app.post("/api/users", validateSchema(userSchema), async (req, res) => {
   return { formatted, config, serialized };
 }`,
-        expectedRuleIds: ["CYBER-001", "CONC-001", "CACHE-001", "SEC-001"],
+        expectedRuleIds: ["HALLU-001"],
         category: "hallucination",
         difficulty: "easy",
     },
@@ -1018,7 +1018,7 @@ def delete_user(request):
   async auditLog(action: string) { /* 30 lines */ }
   // 2000+ lines, 50+ methods, handles everything
 }`,
-        expectedRuleIds: ["SOV-001"],
+        expectedRuleIds: ["MAINT-001"],
         category: "software-development",
         difficulty: "medium",
     },

package/dist/commands/benchmark-infrastructure.js CHANGED Viewed

@@ -205,7 +205,7 @@ spec:
     }
   ]
 }`,
-        expectedRuleIds: ["DEPS-001"],
+        expectedRuleIds: ["IAC-001"],
         category: "cloud",
         difficulty: "easy",
     },

package/dist/commands/benchmark-quality-ops.js CHANGED Viewed

@@ -1201,7 +1201,7 @@ jobs:
             -H "Authorization: Bearer \${{ secrets.DEPLOY_TOKEN }}" \\
             -d '{"sha": "\${{ github.sha }}"}'
       - run: echo "$\{{ secrets.AWS_SECRET_KEY }}" > /tmp/key`,
-        expectedRuleIds: ["CLOUD-001", "PORTA-001"],
+        expectedRuleIds: ["SEC-001"],
         category: "cicd",
         difficulty: "medium",
     },

package/dist/commands/benchmark.d.ts CHANGED Viewed

@@ -25,6 +25,13 @@ export interface BenchmarkCase {
     expectedRuleIds: string[];
     /** Rule IDs that should NOT be detected (known false positives) */
     unexpectedRuleIds?: string[];
+    /**
+     * Acceptable rule prefixes: findings from these judge domains are
+     * domain-relevant and should NOT count as false positives even if not
+     * in expectedRuleIds. For example, a SQL-injection case may acceptably
+     * also trigger AUTH or SEC findings.
+     */
+    acceptablePrefixes?: string[];
     /** Category of vulnerability (e.g. "injection", "auth", "xss") */
     category: string;
     /** Difficulty level */

package/dist/commands/benchmark.js CHANGED Viewed

@@ -844,7 +844,7 @@ function getErrorMessage(code: number): string {
     "bower": "^1.8.0"
   }
 }`,
-        expectedRuleIds: ["DEPS-001", "SUPPLY-001"],
+        expectedRuleIds: ["DEPS-001"],
         category: "dependency-health",
         difficulty: "easy",
     },
@@ -2337,13 +2337,13 @@ export function runBenchmarkSuite(cases, judgeId) {
         cat.truePositives += caseTP;
         cat.falseNegatives += caseFN;
         cat.falsePositives += caseFP;
-        // Per-judge accumulators
-        // Only count detections on clean cases (expectedRuleIds empty) as FP.
-        // Dirty-case "extra" detections are legitimate secondary findings and
-        // should not inflate per-judge false-positive rates.
-        const isCleanCase = tc.expectedRuleIds.length === 0;
+        // Per-judge accumulators (deduplicate by prefix per case to match case-level FP counting)
+        const seenPrefixes = new Set();
         for (const ruleId of foundRuleIds) {
             const prefix = ruleId.split("-")[0];
+            if (seenPrefixes.has(prefix))
+                continue;
+            seenPrefixes.add(prefix);
             if (!perJudge[prefix]) {
                 perJudge[prefix] = {
                     judgeId: prefix,
@@ -2361,7 +2361,7 @@ export function runBenchmarkSuite(cases, judgeId) {
             if (expectedPrefixes.has(prefix)) {
                 jb.truePositives++;
             }
-            else if (isCleanCase) {
+            else {
                 jb.falsePositives++;
             }
         }

package/dist/commands/llm-benchmark.d.ts CHANGED Viewed

@@ -17,6 +17,19 @@ import type { JudgeDefinition } from "../types.js";
 import type { BenchmarkCase, CategoryResult, JudgeBenchmarkResult, DifficultyResult } from "./benchmark.js";
 import type { PromptAmendment } from "./llm-benchmark-optimizer.js";
 export declare const TRIBUNAL_JUDGES: JudgeDefinition[];
+/**
+ * Get acceptable prefixes for a benchmark case. Uses the case's explicit
+ * acceptablePrefixes if defined, otherwise falls back to the category map.
+ * Expected prefixes are always included (they're TPs, not FPs).
+ */
+export declare function getAcceptablePrefixes(tc: BenchmarkCase): Set<string>;
+/**
+ * Select a focused subset of tribunal judges relevant to a benchmark case's
+ * category. Returns core judges + category-specific judges, typically 8-15
+ * instead of the full 35. Returns undefined if no routing is possible
+ * (unknown category), signalling the caller to use all tribunal judges.
+ */
+export declare function selectJudgesForCategory(category: string): JudgeDefinition[] | undefined;
 export interface LlmBenchmarkSnapshot {
     /** Timestamp of this LLM benchmark run */
     timestamp: string;
@@ -48,6 +61,8 @@ export interface LlmBenchmarkSnapshot {
     recall: number;
     /** F1 Score */
     f1Score: number;
+    /** Severity-weighted F1 — penalizes critical/high FPs more heavily */
+    weightedF1Score?: number;
     /** Detection rate: cases detected / total cases */
     detectionRate: number;
     /** Per-category breakdown */
@@ -102,8 +117,9 @@ export declare function extractValidatedLlmFindings(response: string, prefixes?:
 export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[]): string;
 /**
  * Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
+ * When `judges` is provided, uses that filtered list instead of all tribunal judges.
  */
-export declare function constructTribunalPrompt(code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[]): string;
+export declare function constructTribunalPrompt(code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[], judges?: JudgeDefinition[]): string;
 /**
  * Select a stratified sample of benchmark cases, ensuring representation
  * across categories, difficulties, and both clean/dirty cases.
@@ -112,8 +128,12 @@ export declare function selectStratifiedSample(cases: BenchmarkCase[], targetSiz
 /**
  * Score a single LLM benchmark case using prefix-based matching.
  * Returns a fully populated LlmCaseResult.
+ *
+ * @param topKPrefixes - If set, only keep the first `topKPrefixes` unique
+ *   detected prefixes (in the order they appear in the LLM response).
+ *   This prevents verbose tribunal output from inflating FP counts.
  */
-export declare function scoreLlmCase(tc: BenchmarkCase, detectedRuleIds: string[], rawResponse: string, tokensUsed?: number): LlmCaseResult;
+export declare function scoreLlmCase(tc: BenchmarkCase, detectedRuleIds: string[], rawResponse: string, tokensUsed?: number, topKPrefixes?: number): LlmCaseResult;
 /**
  * Compute aggregate metrics for an LLM benchmark snapshot from raw case results.
  * Uses the same prefix-based matching methodology as the L1 benchmark.

package/dist/commands/llm-benchmark.js CHANGED Viewed

@@ -14,7 +14,7 @@
  * `scripts/run-llm-benchmark.ts` has been removed.
  */
 import { JUDGES } from "../judges/index.js";
-import { getCondensedCriteria, SHARED_ADVERSARIAL_MANDATE, PRECISION_MANDATE } from "../tools/prompts.js";
+import { getCondensedCriteria, SHARED_ADVERSARIAL_MANDATE, PRECISION_MANDATE, CLEAN_CODE_GATE, } from "../tools/prompts.js";
 import { extractAndValidateLlmFindings, mergeFindings } from "../probabilistic/llm-response-validator.js";
 import { formatAmendmentSection } from "./llm-benchmark-optimizer.js";
 // ─── Tribunal Judge Filtering ───────────────────────────────────────────────
@@ -22,6 +22,89 @@ import { formatAmendmentSection } from "./llm-benchmark-optimizer.js";
 // near-100% false positives in single-pass tribunal mode and are excluded.
 const TRIBUNAL_EXCLUDED_PREFIXES = new Set(["INTENT", "COH", "MFPR", "FPR", "OVER"]);
 export const TRIBUNAL_JUDGES = JUDGES.filter((j) => !TRIBUNAL_EXCLUDED_PREFIXES.has(j.rulePrefix));
+// ─── Category → Acceptable Prefixes Mapping ────────────────────────────────
+// For each benchmark case category, these judge prefixes are domain-relevant
+// and findings from them should NOT count as false positives even when not
+// in expectedRuleIds. This prevents legitimate cross-domain observations
+// from inflating the FP metric.
+// ─────────────────────────────────────────────────────────────────────────────
+const CATEGORY_ACCEPTABLE_PREFIXES = {
+    injection: ["SEC", "CYBER", "DB", "DATA", "ERR", "FW", "LOGIC"],
+    xss: ["SEC", "CYBER", "FW", "DATA", "ERR", "LOGIC"],
+    auth: ["AUTH", "SEC", "CYBER", "DATA", "CFG", "ERR", "LOGIC"],
+    "rate-limiting": ["RATE", "PERF", "SCALE", "REL", "SEC", "ERR"],
+    "error-handling": ["ERR", "REL", "OBS", "LOGIC", "MAINT", "STRUCT"],
+    "data-security": ["DATA", "SEC", "CYBER", "LOGPRIV", "SOV", "CFG", "ERR"],
+    security: ["SEC", "CYBER", "AUTH", "DATA", "FW", "CFG", "ERR", "LOGIC"],
+    concurrency: ["CONC", "PERF", "REL", "LOGIC", "ERR", "MAINT"],
+    performance: ["PERF", "SCALE", "CACHE", "DB", "CONC", "LOGIC", "MAINT"],
+    database: ["DB", "SEC", "DATA", "PERF", "ERR", "LOGIC"],
+    "api-design": ["API", "ERR", "AUTH", "SEC", "STRUCT", "LOGIC", "MAINT"],
+    observability: ["OBS", "LOGPRIV", "REL", "ERR", "CFG", "MAINT"],
+    reliability: ["REL", "ERR", "CONC", "PERF", "OBS", "LOGIC"],
+    scalability: ["SCALE", "PERF", "CACHE", "CLOUD", "CONC", "STRUCT"],
+    "cloud-readiness": ["CLOUD", "CFG", "CICD", "SCALE", "PORTA", "SEC"],
+    configuration: ["CFG", "SEC", "DATA", "CLOUD", "ERR"],
+    maintainability: ["MAINT", "STRUCT", "SWDEV", "DOC", "LOGIC", "ERR"],
+    "code-structure": ["STRUCT", "MAINT", "SWDEV", "LOGIC", "ERR"],
+    documentation: ["DOC", "MAINT", "SWDEV", "STRUCT"],
+    testing: ["TEST", "SWDEV", "LOGIC", "ERR", "MAINT"],
+    "cost-effectiveness": ["COST", "CLOUD", "SCALE", "PERF", "IAC"],
+    compliance: ["COMP", "DATA", "SOV", "LOGPRIV", "SEC", "CYBER"],
+    accessibility: ["A11Y", "UX", "I18N", "STRUCT", "LOGIC"],
+    internationalization: ["I18N", "A11Y", "UX", "STRUCT"],
+    "dependency-health": ["DEPS", "SEC", "COMPAT", "MAINT"],
+    "logging-privacy": ["LOGPRIV", "DATA", "OBS", "SEC", "ERR"],
+    "backwards-compatibility": ["COMPAT", "API", "STRUCT", "LOGIC"],
+    caching: ["CACHE", "PERF", "SCALE", "REL", "LOGIC"],
+    "ethics-bias": ["ETHICS", "DATA", "COMP", "SEC"],
+    portability: ["PORTA", "CLOUD", "STRUCT", "CFG"],
+    "ci-cd": ["CICD", "SEC", "CFG", "CLOUD", "TEST"],
+    "iac-security": ["IAC", "SEC", "CYBER", "CFG", "CLOUD"],
+    cloud: ["CLOUD", "IAC", "SEC", "CYBER", "CFG", "SCALE"],
+    ethics: ["ETHICS", "A11Y", "UX", "DATA", "COMP"],
+    "framework-safety": ["FW", "SEC", "CYBER", "ERR", "LOGIC"],
+    "framework-security": ["FW", "SEC", "CYBER", "AUTH", "ERR", "API", "COMP", "OBS", "COMPAT", "CONC", "DOC"],
+    "agent-instructions": ["AGENT", "SEC", "CYBER", "AICS", "ERR", "LOGIC"],
+    cicd: ["CICD", "SEC", "CFG", "CLOUD", "TEST", "PORTA"],
+    ux: ["UX", "ERR", "SEC", "A11Y", "I18N", "LOGIC"],
+    "software-practices": ["SWDEV", "MAINT", "STRUCT", "DOC", "LOGIC", "ERR"],
+    "software-development": ["SWDEV", "MAINT", "STRUCT", "DOC", "LOGIC", "ERR"],
+    "code-quality": ["MAINT", "API", "STRUCT", "SWDEV", "LOGIC", "ERR"],
+    "supply-chain": ["DEPS", "SEC", "COMPAT", "MAINT"],
+    "ai-security": ["AICS", "SEC", "CYBER", "DATA", "ERR", "LOGIC"],
+    clean: [], // Clean code — no acceptable prefixes, all findings are FPs
+};
+/**
+ * Get acceptable prefixes for a benchmark case. Uses the case's explicit
+ * acceptablePrefixes if defined, otherwise falls back to the category map.
+ * Expected prefixes are always included (they're TPs, not FPs).
+ */
+export function getAcceptablePrefixes(tc) {
+    const explicit = tc.acceptablePrefixes;
+    const fromCategory = CATEGORY_ACCEPTABLE_PREFIXES[tc.category] ?? [];
+    const combined = new Set([...tc.expectedRuleIds.map((r) => r.split("-")[0]), ...(explicit ?? fromCategory)]);
+    return combined;
+}
+// ─── Core Judges (always included in routed tribunals) ──────────────────────
+// These judges provide universal code quality signals and should always be
+// part of the tribunal regardless of category.
+const CORE_JUDGE_PREFIXES = new Set(["SEC", "ERR", "LOGIC", "STRUCT", "MAINT"]);
+/**
+ * Select a focused subset of tribunal judges for a benchmark case's category:
+ * core judges + category-specific judges (typically 8-15 instead of the full 35).
+ * Returns undefined when the category is unknown or has no acceptable prefixes, or
+ * when the subset is not under 60% of all judges; callers then use the full tribunal.
+ */
+export function selectJudgesForCategory(category) {
+    const acceptable = CATEGORY_ACCEPTABLE_PREFIXES[category];
+    if (!acceptable || acceptable.length === 0)
+        return undefined;
+    const targetPrefixes = new Set([...CORE_JUDGE_PREFIXES, ...acceptable]);
+    const selected = TRIBUNAL_JUDGES.filter((j) => targetPrefixes.has(j.rulePrefix));
+    // Only route if we meaningfully reduced the set (more than 40% fewer judges)
+    return selected.length < TRIBUNAL_JUDGES.length * 0.6 ? selected : undefined;
+}
 // ─── Rule ID Parsing ────────────────────────────────────────────────────────
 /**
  * Extract unique rule IDs from LLM response text.
@@ -41,7 +124,7 @@ export function getTribunalValidPrefixes() {
 }
 export function parseLlmRuleIds(response) {
     const validPrefixes = getValidRulePrefixes();
-    const pattern = /\b([A-Z]{2,})-(\d{3})\b/g;
+    const pattern = /\b([A-Z][A-Z0-9]+)-(\d{1,3})\b/g;
     const found = new Set();
     let match;
     while ((match = pattern.exec(response)) !== null) {
@@ -85,35 +168,55 @@ export function constructPerJudgePrompt(judge, code, language, contextSnippets =
         (amendmentSection ? `${amendmentSection}\n` : "") +
         contextSection +
         `${criteria}\n\n` +
+        `${CLEAN_CODE_GATE}\n\n` +
         `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
         `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`);
 }
 /**
  * Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
+ * When `judges` is provided, uses that filtered list instead of all tribunal judges.
  */
-export function constructTribunalPrompt(code, language, contextSnippets = [], amendments) {
-    const judgeInstructions = TRIBUNAL_JUDGES.map((j) => `### ${j.name} — ${j.domain}\n**Rule prefix:** \`${j.rulePrefix}-\`\n\n${getCondensedCriteria(j.systemPrompt)}`).join("\n\n---\n\n");
+export function constructTribunalPrompt(code, language, contextSnippets = [], amendments, judges) {
+    const activeJudges = judges ?? TRIBUNAL_JUDGES;
+    const judgeInstructions = activeJudges
+        .map((j) => `### ${j.name} — ${j.domain}\n**Rule prefix:** \`${j.rulePrefix}-\`\n\n${getCondensedCriteria(j.systemPrompt)}`)
+        .join("\n\n---\n\n");
     const contextSection = contextSnippets.length
         ? `## Repository Context\n\n${contextSnippets.map((s) => `- ${s.replace(/\n/g, " ")}`).join("\n")}\n\n`
         : "";
     const amendmentSection = formatAmendmentSection(amendments ?? []);
-    return (`You are the Judges Panel — a panel of ${TRIBUNAL_JUDGES.length} expert judges who independently evaluate code for quality, security, and operational readiness.\n\n` +
+    return (`You are the Judges Panel — a panel of ${activeJudges.length} expert judges who independently evaluate code for quality, security, and operational readiness.\n\n` +
         `## Universal Evaluation Directives\n\n` +
         `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
         `${PRECISION_MANDATE}\n\n` +
+        `${CLEAN_CODE_GATE}\n\n` +
         `DOMAIN SCOPE DIRECTIVE (applies to ALL judges):\n` +
         `- Each judge MUST only report findings within their stated domain expertise.\n` +
         `- A CI/CD judge should NOT report authentication findings. An ethics judge should NOT report performance findings.\n` +
         `- If code falls entirely outside your domain (e.g., a YAML CI workflow being evaluated by the Database judge), report ZERO findings for that judge.\n` +
-        `- Cross-domain observations should ONLY be reported by the judge whose domain they fall under.\n\n` +
+        `- Cross-domain observations should ONLY be reported by the judge whose domain they fall under.\n` +
+        `- HARD LIMIT: Each judge may report AT MOST 2 findings. If a judge has more than 2 potential findings, keep only the 2 highest-severity, highest-confidence ones and discard the rest.\n\n` +
         (amendmentSection ? `${amendmentSection}\n` : "") +
         contextSection +
         `## Evaluation Instructions\n\n` +
-        `Evaluate the following ${language} code from the perspective of ALL ${TRIBUNAL_JUDGES.length} judges below. For each judge, provide:\n` +
-        `1. Judge name and domain\n` +
-        `2. Verdict (PASS / WARNING / FAIL)\n` +
-        `3. Score (0-100)\n` +
-        `4. Specific findings with rule IDs (using each judge's rule prefix), severity, and recommendations\n\n` +
+        `Evaluate the following ${language} code from the perspective of ALL ${activeJudges.length} judges below.\n\n` +
+        `### Output Format — Tiered Findings\n` +
+        `Organize ALL findings into three tiers:\n\n` +
+        `**🔴 MUST FIX** (critical/high severity — blocks merge):\n` +
+        `These are real bugs, security vulnerabilities, data loss risks, or correctness issues. ` +
+        `Report at most 5 findings here. Each must have concrete code evidence.\n\n` +
+        `**🟡 WORTH REVIEWING** (medium severity — warrants discussion):\n` +
+        `Design flaws, maintainability concerns, or reliability risks that a senior reviewer would flag. ` +
+        `Only include findings with specific code evidence.\n\n` +
+        `**🟢 INFORMATIONAL** (low/info severity — optional improvements):\n` +
+        `Minor style or optimization suggestions. Limit to the most impactful 3. Omit if none are genuinely useful.\n\n` +
+        `For each finding, provide:\n` +
+        `1. Rule ID (using the judge's prefix)\n` +
+        `2. Severity (critical/high/medium/low/info)\n` +
+        `3. Confidence (0-100%): How certain are you this is a real issue? Only include findings ≥80%.\n` +
+        `4. Judge name and domain\n` +
+        `5. Specific code evidence (line numbers, patterns)\n` +
+        `6. Description and recommendation\n\n` +
         `For judges where no issues meet the confidence threshold, report a PASS verdict with zero findings.\n\n` +
         `Then provide an OVERALL TRIBUNAL VERDICT that synthesizes all judges' input.\n\n` +
         `## The Judges\n\n${judgeInstructions}\n\n` +
@@ -181,8 +284,24 @@ export function selectStratifiedSample(cases, targetSize) {
 /**
  * Score a single LLM benchmark case using prefix-based matching.
  * Returns a fully populated LlmCaseResult.
+ *
+ * @param topKPrefixes - If set to a positive number, only rule IDs belonging to
+ *   the first `topKPrefixes` unique detected prefixes (in the order they appear
+ *   in the LLM response) are kept; values of 0 or less disable the cap.
  */
-export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
+export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed, topKPrefixes) {
+    // ── Optional top-K prefix cap ─────────────────────────────────────────
+    let filteredDetected = detectedRuleIds;
+    if (topKPrefixes !== undefined && topKPrefixes > 0) {
+        const seenPrefixes = new Set();
+        filteredDetected = detectedRuleIds.filter((id) => {
+            const prefix = id.split("-")[0];
+            if (seenPrefixes.size >= topKPrefixes && !seenPrefixes.has(prefix))
+                return false;
+            seenPrefixes.add(prefix);
+            return true;
+        });
+    }
     // ── Prefix-level FP deduplication ─────────────────────────────────────
     // TPs are counted per-expected-rule using prefix matching: a single
     // detected CYBER-xxx satisfies all expected CYBER-yyy rules.
@@ -191,7 +310,7 @@ export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
     // generates for that prefix.  This prevents verbose LLM output from
     // inflating the FP metric (e.g. CYBER-001…005 on clean code = 1 FP,
     // not 5).
-    const detectedPrefixes = new Set(detectedRuleIds.map((r) => r.split("-")[0]));
+    const detectedPrefixes = new Set(filteredDetected.map((r) => r.split("-")[0]));
     const matchedExpected = tc.expectedRuleIds.filter((expected) => {
         const prefix = expected.split("-")[0];
         return detectedPrefixes.has(prefix);
@@ -203,19 +322,19 @@ export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
     // For clean cases (no expected findings), ALL detections are false positives.
     // For dirty cases with unexpectedRuleIds, FPs are detections matching those prefixes.
     // For dirty cases WITHOUT unexpectedRuleIds, FPs are detections whose prefix
-    // doesn't match any expected prefix (prevents silent over-reporting).
+    // doesn't match any expected or acceptable prefix.
     const isCleanCase = tc.expectedRuleIds.length === 0;
-    const expectedPrefixes = new Set(tc.expectedRuleIds.map((r) => r.split("-")[0]));
+    const acceptablePrefixes = getAcceptablePrefixes(tc);
     const falsePositiveIdsRaw = isCleanCase
-        ? detectedRuleIds
+        ? filteredDetected
         : tc.unexpectedRuleIds
-            ? detectedRuleIds.filter((found) => {
+            ? filteredDetected.filter((found) => {
                 const prefix = found.split("-")[0];
                 return tc.unexpectedRuleIds.some((u) => u.split("-")[0] === prefix);
             })
-            : detectedRuleIds.filter((found) => {
+            : filteredDetected.filter((found) => {
                 const prefix = found.split("-")[0];
-                return !expectedPrefixes.has(prefix);
+                return !acceptablePrefixes.has(prefix);
             });
     // Deduplicate FPs by prefix — keep one representative rule ID per prefix
     const fpPrefixSeen = new Set();
@@ -233,7 +352,7 @@ export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
         difficulty: tc.difficulty,
         passed: casePassed,
         expectedRuleIds: tc.expectedRuleIds,
-        detectedRuleIds,
+        detectedRuleIds: filteredDetected,
         missedRuleIds: missedExpected,
         falsePositiveRuleIds: falsePositiveIds,
         rawResponse,
@@ -289,11 +408,16 @@ export function computeLlmMetrics(rawCases, version, model, provider, promptMode
         cat.truePositives += caseTP;
         cat.falseNegatives += caseFN;
         cat.falsePositives += caseFP;
-        // Per-judge
+        // Per-judge (deduplicate by prefix per case to match case-level FP counting)
+        // Use pre-computed falsePositiveRuleIds to stay consistent with scoreLlmCase
+        const fpPrefixes = new Set(c.falsePositiveRuleIds.map((r) => r.split("-")[0]));
         const expectedPrefixes = new Set(c.expectedRuleIds.map((r) => r.split("-")[0]));
-        const isCleanCase = c.expectedRuleIds.length === 0;
+        const seenPrefixes = new Set();
         for (const ruleId of c.detectedRuleIds) {
             const prefix = ruleId.split("-")[0];
+            if (seenPrefixes.has(prefix))
+                continue;
+            seenPrefixes.add(prefix);
             if (!perJudge[prefix]) {
                 perJudge[prefix] = {
                     judgeId: prefix,
@@ -311,15 +435,48 @@ export function computeLlmMetrics(rawCases, version, model, provider, promptMode
             if (expectedPrefixes.has(prefix)) {
                 jb.truePositives++;
             }
-            else if (isCleanCase) {
+            else if (fpPrefixes.has(prefix)) {
                 jb.falsePositives++;
             }
+            // Acceptable (non-expected, non-FP) detections are silently ignored
         }
     }
     // Compute aggregate metrics
     const precision = totalTP + totalFP > 0 ? totalTP / (totalTP + totalFP) : 1;
     const recall = totalTP + totalFN > 0 ? totalTP / (totalTP + totalFN) : 1;
     const f1Score = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
+    // Severity-weighted F1 — re-extract findings from raw responses to get
+    // severity info, then weight FPs: critical/high=3x, medium=1x, low/info=0.3x
+    const SEVERITY_WEIGHTS = {
+        critical: 3,
+        high: 3,
+        medium: 1,
+        low: 0.3,
+        info: 0.3,
+    };
+    let weightedFP = 0;
+    const tribunalPrefixes = getTribunalValidPrefixes();
+    for (const c of rawCases) {
+        if (c.falsePositiveRuleIds.length === 0)
+            continue;
+        const fpSet = new Set(c.falsePositiveRuleIds.map((r) => r.split("-")[0]));
+        const validation = extractValidatedLlmFindings(c.rawResponse, tribunalPrefixes);
+        // Map finding ruleId prefix → max severity weight
+        const prefixMaxWeight = new Map();
+        for (const f of validation.findings) {
+            const prefix = f.ruleId.split("-")[0];
+            if (!fpSet.has(prefix))
+                continue;
+            const weight = SEVERITY_WEIGHTS[f.severity] ?? 1;
+            prefixMaxWeight.set(prefix, Math.max(prefixMaxWeight.get(prefix) ?? 0, weight));
+        }
+        // Sum weights for FP prefixes (use weight=1 default if severity unknown)
+        for (const prefix of fpSet) {
+            weightedFP += prefixMaxWeight.get(prefix) ?? 1;
+        }
+    }
+    const weightedPrecision = totalTP + weightedFP > 0 ? totalTP / (totalTP + weightedFP) : 1;
+    const weightedF1Score = weightedPrecision + recall > 0 ? (2 * weightedPrecision * recall) / (weightedPrecision + recall) : 0;
     // Compute per-difficulty rates
     for (const d of Object.values(perDifficulty)) {
         d.detectionRate = d.total > 0 ? d.detected / d.total : 0;
@@ -356,6 +513,7 @@ export function computeLlmMetrics(rawCases, version, model, provider, promptMode
         precision,
         recall,
         f1Score,
+        weightedF1Score,
         detectionRate: rawCases.length > 0 ? totalDetected / rawCases.length : 0,
         perCategory,
         perJudge,
@@ -394,6 +552,9 @@ export function formatLlmSnapshotMarkdown(snapshot) {
     lines.push(`| Precision | ${pct(snapshot.precision)} |`);
     lines.push(`| Recall | ${pct(snapshot.recall)} |`);
     lines.push(`| F1 Score | ${pct(snapshot.f1Score)} |`);
+    if (snapshot.weightedF1Score !== null && snapshot.weightedF1Score !== undefined) {
+        lines.push(`| Weighted F1 | ${pct(snapshot.weightedF1Score)} |`);
+    }
     lines.push(`| True Positives | ${snapshot.truePositives} |`);
     lines.push(`| False Negatives | ${snapshot.falseNegatives} |`);
     lines.push(`| False Positives | ${snapshot.falsePositives} |`);

package/dist/config.js CHANGED Viewed

@@ -12,7 +12,7 @@ import { matchGlobPath } from "./tools/command-safety.js";
 export function expandEnvPlaceholders(content) {
     if (!content)
         return content;
-    return content.replace(/\$\{([^}]+)\}/g, (_match, varName) => {
+    return content.replace(/\$\{([^}]{1,100})\}/g, (_match, varName) => {
         const envVal = process.env[varName];
         return envVal !== undefined ? envVal : "";
     });

package/dist/judges/authentication.js CHANGED Viewed

@@ -29,6 +29,23 @@ RULES FOR YOUR EVALUATION:
 - Flag any endpoint that accepts user input without verifying the caller's identity and permissions.
 - Score from 0-100 where 100 means robust auth implementation.
+CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
+- Authentication middleware protects all routes that handle user data or state changes.
+- Passwords are hashed with bcrypt, scrypt, or argon2 — not stored in plaintext or weak hashes.
+- JWTs are verified with explicit algorithm restrictions, expiration, and issuer/audience checks.
+- Sessions use secure, httpOnly, sameSite cookies with proper expiration and rotation.
+- OAuth/OIDC flows use PKCE, validate state parameters, and allowlist redirect URIs.
+- API keys are transmitted in headers (not query params) and scoped to minimum permissions.
+If the code meets these criteria, authentication is implemented correctly. Do NOT manufacture findings.
+DOMAIN BOUNDARY (defer these to other judges):
+- Injection attacks and XSS exploit paths → defer to CYBER judge.
+- General security posture and cryptographic practices → defer to SEC judge.
+- Rate limiting on login endpoints → defer to RATE judge (unless auth logic itself is broken).
+- Error handling in auth flows → defer to ERR judge.
+- Data privacy in auth tokens/logs → defer to DATA/LOGPRIV judges.
+Only flag issues within YOUR domain: authentication middleware gaps, credential handling, token security, session management, authorization checks, OAuth/OIDC implementation, privilege escalation.
 FALSE POSITIVE AVOIDANCE:
 - Do NOT flag code that uses established authentication libraries (passport, next-auth, Spring Security, etc.) following their documented patterns.
 - JWT verification with explicit algorithm restrictions and proper expiration checks is correct implementation, not a vulnerability.

package/dist/judges/cybersecurity.js CHANGED Viewed

@@ -28,6 +28,24 @@ RULES FOR YOUR EVALUATION:
 - Reference OWASP, CWE IDs, and CVE IDs where applicable.
 - Score from 0-100 where 100 means no exploitable vulnerabilities found.
+CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
+- Input validation and sanitization are applied to user-controlled data before use in queries, commands, or output.
+- Passwords are hashed with bcrypt, scrypt, or argon2 — not MD5/SHA1.
+- Database queries use parameterized statements or an ORM with proper escaping.
+- Security middleware is present (helmet, CORS, CSRF tokens) for web applications.
+- Secrets are loaded from environment variables or a secrets manager, not hardcoded.
+- Dependencies are imported from standard registries with version pinning.
+- Error responses do not leak stack traces or internal details to clients.
+If the code meets these criteria, it is implementing security correctly. Do NOT manufacture findings.
+DOMAIN BOUNDARY (defer these to other judges):
+- Rate limiting, throttling, and abuse prevention → defer to RATE judge.
+- Authentication flows, session management, OAuth/OIDC → defer to AUTH judge.
+- General security posture, defense-in-depth patterns → defer to SEC judge.
+- Error handling completeness and error propagation → defer to ERR judge.
+- Data privacy, PII handling, logging of sensitive data → defer to DATA/LOGPRIV judges.
+Only flag issues within YOUR domain: injection attacks, XSS, CSRF/SSRF, dependency CVEs, cryptographic weaknesses, OWASP Top 10 violations with concrete exploit paths.
 FALSE POSITIVE AVOIDANCE:
 - Do NOT flag established security library usage (helmet, cors, bcrypt, argon2, parameterized queries) as security issues — these ARE the correct patterns.
 - Code that properly validates input, uses HTTPS, and parameterizes queries is implementing security correctly.

package/dist/judges/error-handling.js CHANGED Viewed

@@ -29,6 +29,15 @@ RULES FOR YOUR EVALUATION:
 - Flag any code path that could throw without a handler in scope.
 - Score from 0-100 where 100 means robust error handling.
+CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
+- Try-catch blocks wrap code paths that can throw, with meaningful handling (log, re-throw, or recover).
+- Async operations use try-catch or .catch() to handle rejections.
+- Error responses return consistent structures with appropriate HTTP status codes.
+- Resources (connections, file handles, streams) are cleaned up in finally blocks or using disposal patterns.
+- Framework error middleware or global handlers are present (Express error middleware, Spring @ExceptionHandler, etc.).
+- Stack traces and internal details are not exposed to end users in error responses.
+If the code meets these criteria, error handling is implemented correctly. Do NOT manufacture findings.
 FALSE POSITIVE AVOIDANCE:
 - Do NOT flag error handling in code that delegates error handling to a framework (Express middleware, Spring @ExceptionHandler, etc.).
 - Try-catch with logging and re-throw is a valid error handling pattern, not a deficiency.

package/dist/judges/rate-limiting.js CHANGED Viewed

@@ -29,6 +29,15 @@ RULES FOR YOUR EVALUATION:
 - Consider both inbound (protecting your service) and outbound (respecting others') rate limits.
 - Score from 0-100 where 100 means comprehensive rate limiting.
+CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
+- Rate limiting middleware is applied to public-facing API endpoints (express-rate-limit, API gateway config, etc.).
+- Request body size limits are configured (bodyParser limits, multer limits, etc.).
+- List/query endpoints have pagination with enforced maximum page sizes.
+- External API calls use bounded retries with exponential backoff and jitter.
+- Connection pools and concurrent request limits are bounded.
+If the code meets these criteria, rate limiting is implemented correctly. Do NOT manufacture findings.
+IMPORTANT: CLI tools, data scripts, utility libraries, batch processors, and internal services do NOT need rate limiting. If the code is not a public-facing API or web server, report ZERO findings.
 FALSE POSITIVE AVOIDANCE:
 - Only flag rate-limiting issues in code that accepts external requests (APIs, WebSocket servers, public endpoints).
 - Do NOT flag internal services, batch processors, CLI tools, or cron jobs for missing rate limiting.

package/dist/judges/security.js CHANGED Viewed

@@ -29,6 +29,24 @@ RULES FOR YOUR EVALUATION:
 - Reference CWE IDs where applicable.
 - Score from 0-100 where 100 means excellent security posture.
+CLEAN CODE RECOGNITION (if ALL of the following are true, report ZERO findings):
+- Security middleware is configured (helmet, CORS, CSRF protection) for web applications.
+- User input is validated before use in data flows (queries, file ops, HTTP requests).
+- Cryptographic operations use modern algorithms (AES-256, SHA-256+, bcrypt/argon2).
+- Secrets are sourced from environment variables or a secrets manager, not hardcoded.
+- Deserialization of untrusted data uses safe mechanisms (JSON.parse, not pickle/eval).
+- JWT/token verification includes algorithm restrictions and expiration checks.
+- No user-controlled URLs are used in redirects without validation.
+If the code meets these criteria, it has a strong security posture. Do NOT manufacture findings.
+DOMAIN BOUNDARY (defer these to other judges):
+- Injection attacks (SQL, XSS, command injection) with exploit paths → defer to CYBER judge.
+- Authentication flows, credential storage, session management → defer to AUTH judge.
+- Rate limiting and abuse prevention → defer to RATE judge.
+- Error handling patterns and error propagation → defer to ERR judge.
+- Infrastructure-as-code security → defer to IAC judge.
+Only flag issues within YOUR domain: insecure data flows, weak cryptography, missing security controls, unsafe deserialization, XML security, secret management, mass assignment, redirect validation.
 FALSE POSITIVE AVOIDANCE:
 - Do NOT flag code that uses established security libraries correctly (helmet, bcrypt, argon2, parameterized queries, CSRF tokens, rate limiters, proper TLS configuration).
 - Do NOT flag security controls in non-application code (CI/CD configs, IaC templates, documentation examples) unless they contain actual secrets or credentials.

package/dist/probabilistic/llm-response-validator.js CHANGED Viewed

@@ -5,7 +5,7 @@ const SEVERITY_SET = new Set(["critical", "high", "medium", "low", "info"]);
  * Attempt to parse a JSON payload embedded in LLM output. Supports fenced code blocks and raw JSON.
  */
 function parseJsonBlock(text) {
-    const fenceMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/i);
+    const fenceMatch = text.match(/```(?:json)?[ \t]*\n([\s\S]*?)\n[ \t]*```/i) ?? text.match(/```(?:json)?[ \t]*([\s\S]*?)```/i);
     if (fenceMatch) {
         try {
             return JSON.parse(fenceMatch[1]);
@@ -25,7 +25,7 @@ function normalizeRuleId(id) {
     return id.trim().toUpperCase();
 }
 function isValidRuleId(id, validPrefixes) {
-    const match = id.match(/^([A-Z]{2,})-\d{3}$/);
+    const match = id.match(/^([A-Z][A-Z0-9]+)-\d{1,3}$/);
     if (!match)
         return false;
     return validPrefixes.has(match[1]);

package/dist/reports/public-repo-report.js CHANGED Viewed

@@ -215,7 +215,15 @@ function countBySeverity(findings) {
 function compileExcludeRegexes(patterns) {
     if (!patterns || patterns.length === 0)
         return [];
-    return patterns.map((pattern) => new RegExp(pattern, "i"));
+    return patterns.map((pattern) => {
+        try {
+            return new RegExp(pattern, "i");
+        }
+        catch {
+            // Invalid regex from user input — treat as literal string match
+            return new RegExp(pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "i");
+        }
+    });
 }
 function isLikelyNonProductionPath(path) {
     return /(^|\/)(test|tests|__tests__|spec|specs|e2e)(\/|\.|$)|\.(?:test|tests|spec|specs|e2e)\.[^/]+$|mock|fixture|fixtures|(^|\/)docs(-|\/)i18n(\/|$)|(^|\/)docs(\/|$)/i.test(path);

package/dist/skill-loader.js CHANGED Viewed

@@ -25,7 +25,7 @@ export function parseSkillFrontmatter(raw) {
             i++;
             continue;
         }
-        const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$/);
+        const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)[ \t]*:[ \t]*(.*)$/);
         if (!kv) {
             i++;
             continue;
@@ -64,9 +64,10 @@ export function parseSkillFrontmatter(raw) {
         if (typeof value === "string" && ((value.startsWith("[") && value.endsWith("]")) || value.includes(","))) {
             // simple array parsing: split on comma
             const normalized = value
-                .replace(/^\s*\[/, "")
-                .replace(/\]\s*$/, "")
-                .split(/\s*,\s*/)
+                .replace(/^[ \t]*\[/, "")
+                .replace(/\][ \t]*$/, "")
+                .split(",")
+                .map((s) => s.trim())
                 .filter(Boolean);
             value = normalized;
         }
@@ -93,13 +94,15 @@ export function validateSkillFrontmatter(meta, sourcePath) {
         agents: Array.isArray(meta.agents)
             ? meta.agents
             : String(meta.agents ?? "")
-                .split(/\s*,\s*/)
+                .split(",")
+                .map((s) => s.trim())
                 .filter(Boolean),
         tags: Array.isArray(meta.tags)
             ? meta.tags
             : meta.tags
                 ? String(meta.tags)
-                    .split(/\s*,\s*/)
+                    .split(",")
+                    .map((s) => s.trim())
                     .filter(Boolean)
                 : undefined,
         priority: meta.priority ? Number(meta.priority) : 10,

package/dist/tools/prompts.d.ts CHANGED Viewed

@@ -2,7 +2,9 @@ import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
 /** Adversarial evaluation stance — shared across all judges. */
 export declare const SHARED_ADVERSARIAL_MANDATE = "ADVERSARIAL MANDATE (applies to ALL judges):\n- Examine the code critically and look for genuine issues. Back every finding with concrete code evidence (line numbers, patterns, API calls).\n- Report only real problems, risks, and deficiencies that exist in the actual code.\n- If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.\n- If no concrete issues are found after thorough analysis, report ZERO findings. An empty findings list is the correct output for well-written code.";
 /** Precision override — ensures evidence-based findings. */
-export declare const PRECISION_MANDATE = "PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):\n- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded \u2014 no exceptions.\n- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.\n- Speculative, hypothetical, or \"just in case\" findings erode developer trust. Only flag issues you are confident exist in the actual code.\n- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.\n- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.\n- Clean, well-structured code exists. Acknowledge it by not forcing false issues.\n- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.\n- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.\n- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (\u226580%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.\n- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.";
+export declare const PRECISION_MANDATE = "PRECISION MANDATE (this section OVERRIDES the adversarial mandate whenever they conflict):\n- Every finding MUST cite specific code evidence: exact line numbers, API calls, variable names, or patterns. Findings without concrete evidence MUST be discarded \u2014 no exceptions.\n- Do NOT flag the absence of a feature or pattern unless you can identify the specific code location where it SHOULD have been implemented and explain WHY it is required for THIS code.\n- Speculative, hypothetical, or \"just in case\" findings erode developer trust. Only flag issues you are confident exist in the actual code.\n- Prefer fewer, high-confidence findings over many uncertain ones. Quality of findings matters more than quantity.\n- If the code is genuinely well-written with no real issues, reporting ZERO findings is the correct and expected behavior. Do not manufacture findings to avoid an empty report.\n- Clean, well-structured code exists. Acknowledge it by not forcing false issues.\n- RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.\n- SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.\n- CONFIDENCE THRESHOLD: Only report findings where you are highly confident (\u226580%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.\n- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.\n\nCOMMON FALSE POSITIVE PATTERNS (do NOT report these):\n- ERR: Do not flag error handling as inadequate when try/catch blocks, validation, or error middleware are present. Missing error handling in a utility function that is clearly called within a guarded context is NOT a finding.\n- LOGIC: Do not flag logic issues for standard patterns (early returns, guard clauses, switch/case with default). Only flag logic errors when you can demonstrate a concrete input that produces an incorrect output.\n- MAINT: Do not flag maintainability concerns for code that follows the language's established idioms. Complexity or length alone is NOT a finding unless it introduces a concrete maintenance burden.\n- SEC: Do not flag security issues when established security libraries (helmet, cors, bcrypt, parameterized queries) are correctly used. \"Could be stronger\" is NOT a vulnerability.\n- STRUCT: Do not flag code structure preferences (file organization, naming conventions) unless they create a concrete deficiency like circular dependencies or unreachable code.";
+/** Clean code gate — explicit instructions when code quality is high. */
+export declare const CLEAN_CODE_GATE = "CLEAN CODE GATE (applies AFTER individual judge evaluation):\n- Before reporting findings, assess the OVERALL quality of the code. If the code follows established conventions, uses appropriate patterns, handles errors, and has no concrete vulnerabilities or deficiencies, the expected output is ZERO findings across ALL judges.\n- Do NOT report stylistic preferences, alternative approaches, or \"nice to have\" improvements as findings. These are opinions, not defects.\n- Do NOT report findings about missing functionality that is likely in other files (tests, configs, middleware, error handlers, logging setup).\n- Do NOT report theoretical risks that require assumptions about the runtime environment, deployment configuration, or code outside the provided snippet.\n- SELF-CHECK before finalizing: For each finding, ask \"Would a senior engineer reviewing this code in a PR agree this must be fixed before merging?\" If the answer is not a clear YES, discard the finding.\n- The goal is to match what a thoughtful, experienced human reviewer would flag \u2014 not to demonstrate comprehensive knowledge of every possible concern.";
 /**
  * Extract only the unique evaluation criteria from a judge's systemPrompt,
  * stripping the persona introduction line, the ADVERSARIAL MANDATE block,
@@ -21,6 +23,5 @@ export declare function getCondensedCriteria(systemPrompt: string): string;
 /**
  * Register all MCP prompts on the given server:
  *  - One per-judge prompt (`judge-{id}`) for single-persona deep reviews
- *  - A `full-tribunal` prompt that convenes all judges at once
  */
 export declare function registerPrompts(server: McpServer): void;

package/dist/tools/prompts.js CHANGED Viewed

@@ -2,18 +2,15 @@
 // Expose judge system prompts as MCP prompts so LLM-based clients can use
 // them for deeper, AI-powered analysis beyond pattern matching.
 //
-// Token-optimised: shared behavioural directives (adversarial mandate,
-// precision mandate) are stated ONCE in the tribunal preamble instead of
-// being duplicated across all 44 judges. Per-judge sections include only
-// the unique evaluation criteria, domain-specific rules, and FP-avoidance
-// guidance. This reduces the tribunal prompt by ~40 000 chars (~10 000
-// tokens) without removing any evaluation criteria.
+// Each per-judge prompt includes shared behavioural directives (adversarial
+// mandate, precision mandate, clean-code gate) plus the judge's unique
+// evaluation criteria, domain-specific rules, and FP-avoidance guidance.
 // ──────────────────────────────────────────────────────────────────────────────
 import { z } from "zod";
 import { JUDGES } from "../judges/index.js";
-// ─── Shared Behavioural Directives ───────────────────────────────────────────
-// Stated ONCE in the tribunal preamble so every judge benefits without
-// repeating the text 39 times.
+// ─── Shared Behavioural Directives & Gates ──────────────────────────────────
+// Included in every per-judge prompt to ensure consistent evaluation
+// behaviour across all judges.
 // ──────────────────────────────────────────────────────────────────────────────
 /** Adversarial evaluation stance — shared across all judges. */
 export const SHARED_ADVERSARIAL_MANDATE = `ADVERSARIAL MANDATE (applies to ALL judges):
@@ -32,7 +29,22 @@ export const PRECISION_MANDATE = `PRECISION MANDATE (this section OVERRIDES the
 - RECOGNIZE SECURE PATTERNS: Code using established security libraries and patterns (e.g. helmet, bcrypt/argon2, parameterized queries, input validation, CSRF tokens, rate limiters, proper TLS) is correctly implementing security. Do NOT flag these as insufficient or suggest alternatives unless a concrete vulnerability exists.
 - SCOPE LIMITATION: Only evaluate code that is actually present. Do NOT flag missing features, tests, logging, documentation, error handling, or infrastructure that may exist in other files. Evaluate what IS provided, not what COULD be elsewhere.
 - CONFIDENCE THRESHOLD: Only report findings where you are highly confident (≥80%) that a real, exploitable issue or concrete deficiency exists in the provided code. When in doubt, do NOT report.
-- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.`;
+- FALSE POSITIVE COST: A false positive is MORE harmful than a missed finding. False positives erode developer trust and cause real issues to be ignored. When uncertain, silence is better than a questionable finding.
+
+COMMON FALSE POSITIVE PATTERNS (do NOT report these):
+- ERR: Do not flag error handling as inadequate when try/catch blocks, validation, or error middleware are present. Missing error handling in a utility function that is clearly called within a guarded context is NOT a finding.
+- LOGIC: Do not flag logic issues for standard patterns (early returns, guard clauses, switch/case with default). Only flag logic errors when you can demonstrate a concrete input that produces an incorrect output.
+- MAINT: Do not flag maintainability concerns for code that follows the language's established idioms. Complexity or length alone is NOT a finding unless it introduces a concrete maintenance burden.
+- SEC: Do not flag security issues when established security libraries (helmet, cors, bcrypt, parameterized queries) are correctly used. "Could be stronger" is NOT a vulnerability.
+- STRUCT: Do not flag code structure preferences (file organization, naming conventions) unless they create a concrete deficiency like circular dependencies or unreachable code.`;
+/** Clean code gate — explicit instructions when code quality is high. */
+export const CLEAN_CODE_GATE = `CLEAN CODE GATE (applies AFTER individual judge evaluation):
+- Before reporting findings, assess the OVERALL quality of the code. If the code follows established conventions, uses appropriate patterns, handles errors, and has no concrete vulnerabilities or deficiencies, the expected output is ZERO findings across ALL judges.
+- Do NOT report stylistic preferences, alternative approaches, or "nice to have" improvements as findings. These are opinions, not defects.
+- Do NOT report findings about missing functionality that is likely in other files (tests, configs, middleware, error handlers, logging setup).
+- Do NOT report theoretical risks that require assumptions about the runtime environment, deployment configuration, or code outside the provided snippet.
+- SELF-CHECK before finalizing: For each finding, ask "Would a senior engineer reviewing this code in a PR agree this must be fixed before merging?" If the answer is not a clear YES, discard the finding.
+- The goal is to match what a thoughtful, experienced human reviewer would flag — not to demonstrate comprehensive knowledge of every possible concern.`;
 // ─── Criteria Extraction ─────────────────────────────────────────────────────
 /**
  * Extract only the unique evaluation criteria from a judge's systemPrompt,
@@ -73,13 +85,11 @@ export function getCondensedCriteria(systemPrompt) {
 /**
  * Register all MCP prompts on the given server:
  *  - One per-judge prompt (`judge-{id}`) for single-persona deep reviews
- *  - A `full-tribunal` prompt that convenes all judges at once
  */
 export function registerPrompts(server) {
     // ── Per-judge prompts ──────────────────────────────────────────────────
-    // Each prompt uses condensed criteria (adversarial mandate stripped) plus
-    // the shared mandates, mirroring the tribunal architecture for consistency
-    // and better precision on clean code.
+    // Each prompt uses condensed criteria plus the shared mandates for
+    // better precision on clean code.
     for (const judge of JUDGES) {
         server.prompt(`judge-${judge.id}`, `Use the ${judge.name} persona to perform a deep ${judge.domain} review of code. This prompt provides the judge's expert criteria for LLM-powered analysis that goes beyond pattern matching.`, {
             code: z.string().describe("The source code to evaluate"),
@@ -92,6 +102,7 @@ export function registerPrompts(server) {
                 `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
                 `${PRECISION_MANDATE}\n\n` +
                 `${criteria}\n\n` +
+                `${CLEAN_CODE_GATE}\n\n` +
                 `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
                 (context ? `\n\nAdditional context: ${context}` : "") +
                 `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`;
@@ -108,41 +119,4 @@ export function registerPrompts(server) {
             };
         });
     }
-    // ── Full tribunal prompt (token-optimised) ─────────────────────────────
-    // Shared directives (adversarial mandate, precision mandate) are stated
-    // ONCE in the preamble. Each judge section includes only its unique
-    // evaluation criteria, domain-specific rules, and FP-avoidance guidance.
-    server.prompt("full-tribunal", `Convene the full Judges Panel — all ${JUDGES.length} judges evaluate the code in their respective domains and produce a combined verdict.`, {
-        code: z.string().describe("The source code to evaluate"),
-        language: z.string().describe("The programming language"),
-        context: z.string().optional().describe("Additional context about the code"),
-    }, async ({ code, language, context }) => {
-        const judgeInstructions = JUDGES.map((j) => `### ${j.name} — ${j.domain}\n**Rule prefix:** \`${j.rulePrefix}-\`\n\n${getCondensedCriteria(j.systemPrompt)}`).join("\n\n---\n\n");
-        const userMessage = `You are the Judges Panel — a panel of ${JUDGES.length} expert judges who independently evaluate code for quality, security, and operational readiness.\n\n` +
-            `## Universal Evaluation Directives\n\n` +
-            `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
-            `${PRECISION_MANDATE}\n\n` +
-            `## Evaluation Instructions\n\n` +
-            `Evaluate the following ${language} code from the perspective of ALL ${JUDGES.length} judges below. For each judge, provide:\n` +
-            `1. Judge name and domain\n` +
-            `2. Verdict (PASS / WARNING / FAIL)\n` +
-            `3. Score (0-100)\n` +
-            `4. Specific findings with rule IDs (using each judge's rule prefix), severity, and recommendations\n\n` +
-            `For judges where no issues meet the confidence threshold, report a PASS verdict with zero findings.\n\n` +
-            `Then provide an OVERALL TRIBUNAL VERDICT that synthesizes all judges' input.\n\n` +
-            `## The Judges\n\n${judgeInstructions}\n\n` +
-            `## Code to Evaluate\n\n\`\`\`${language}\n${code}\n\`\`\`` +
-            (context ? `\n\n## Additional Context\n${context}` : "");
-        return {
-            messages: [
-                {
-                    role: "user",
-                    content: {
-                        type: "text",
-                        text: userMessage,
-                    },
-                },
-            ],
-        };
-    });
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@kevinrabun/judges",
-  "version": "3.121.0",
+  "version": "3.123.0",
   "description": "45 specialized judges that evaluate AI-generated code for security, cost, and quality.",
   "mcpName": "io.github.KevinRabun/judges",
   "type": "module",
@@ -145,7 +145,7 @@
     "zod": "^4.3.6"
   },
   "devDependencies": {
-    "@anthropic-ai/sdk": "^0.79.0",
+    "@anthropic-ai/sdk": "^0.80.0",
     "@eslint/js": "^10.0.1",
     "@types/node": "^25.3.0",
     "@typescript-eslint/eslint-plugin": "^8.56.1",

package/server.json CHANGED Viewed

@@ -3,16 +3,25 @@
   "name": "io.github.KevinRabun/judges",
   "title": "Judges Panel",
   "description": "45 judges that evaluate AI-generated code for security, cost, and quality with built-in AST.",
+  "websiteUrl": "https://kevinrabun.github.io/judges/",
   "repository": {
-    "url": "https://github.com/kevinrabun/judges",
-    "source": "github"
+    "url": "https://github.com/KevinRabun/judges",
+    "source": "github",
+    "id": "1161966307"
   },
-  "version": "3.121.0",
+  "icons": [
+    {
+      "src": "https://raw.githubusercontent.com/KevinRabun/judges/main/vscode-extension/icon.png",
+      "sizes": ["128x128"],
+      "mimeType": "image/png"
+    }
+  ],
+  "version": "3.123.0",
   "packages": [
     {
       "registryType": "npm",
       "identifier": "@kevinrabun/judges",
-      "version": "3.121.0",
+      "version": "3.123.0",
       "transport": {
         "type": "stdio"
       }

package/src/skill-loader.ts CHANGED Viewed

@@ -44,7 +44,7 @@ export function parseSkillFrontmatter(raw: string): { meta: SkillMeta; body: str
       i++;
       continue;
     }
-    const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)\s*:\s*(.*)$/);
+    const kv = line.match(/^([a-zA-Z_][a-zA-Z0-9_-]*)[ \t]*:[ \t]*(.*)$/);
     if (!kv) {
       i++;
       continue;
@@ -85,9 +85,10 @@ export function parseSkillFrontmatter(raw: string): { meta: SkillMeta; body: str
     if (typeof value === "string" && ((value.startsWith("[") && value.endsWith("]")) || value.includes(","))) {
       // simple array parsing: split on comma
       const normalized = (value as string)
-        .replace(/^\s*\[/, "")
-        .replace(/\]\s*$/, "")
-        .split(/\s*,\s*/)
+        .replace(/^[ \t]*\[/, "")
+        .replace(/\][ \t]*$/, "")
+        .split(",")
+        .map((s) => s.trim())
         .filter(Boolean);
       value = normalized;
     } else if (
@@ -117,13 +118,15 @@ export function validateSkillFrontmatter(meta: SkillMeta, sourcePath: string): S
     agents: Array.isArray(meta.agents)
       ? (meta.agents as string[])
       : String(meta.agents ?? "")
-          .split(/\s*,\s*/)
+          .split(",")
+          .map((s) => s.trim())
           .filter(Boolean),
     tags: Array.isArray(meta.tags)
       ? (meta.tags as string[])
       : meta.tags
         ? String(meta.tags)
-            .split(/\s*,\s*/)
+            .split(",")
+            .map((s) => s.trim())
             .filter(Boolean)
         : undefined,
     priority: meta.priority ? Number(meta.priority) : 10,