npm - @kevinrabun/judges - Versions diffs - 3.113.0 → 3.115.0 - Mend

@kevinrabun/judges 3.113.0 → 3.115.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

package/README.md +9 -0
package/agents/accessibility.judge.md +37 -0
package/agents/agent-instructions.judge.md +37 -0
package/agents/ai-code-safety.judge.md +48 -0
package/agents/api-contract.judge.md +30 -0
package/agents/api-design.judge.md +39 -0
package/agents/authentication.judge.md +37 -0
package/agents/backwards-compatibility.judge.md +37 -0
package/agents/caching.judge.md +37 -0
package/agents/ci-cd.judge.md +37 -0
package/agents/cloud-readiness.judge.md +37 -0
package/agents/code-structure.judge.md +48 -0
package/agents/compliance.judge.md +40 -0
package/agents/concurrency.judge.md +39 -0
package/agents/configuration-management.judge.md +37 -0
package/agents/cost-effectiveness.judge.md +40 -0
package/agents/cybersecurity.judge.md +36 -0
package/agents/data-security.judge.md +34 -0
package/agents/data-sovereignty.judge.md +58 -0
package/agents/database.judge.md +41 -0
package/agents/dependency-health.judge.md +39 -0
package/agents/documentation.judge.md +39 -0
package/agents/error-handling.judge.md +37 -0
package/agents/ethics-bias.judge.md +39 -0
package/agents/false-positive-review.judge.md +73 -0
package/agents/framework-safety.judge.md +40 -0
package/agents/hallucination-detection.judge.md +33 -0
package/agents/iac-security.judge.md +38 -0
package/agents/intent-alignment.judge.md +31 -0
package/agents/internationalization.judge.md +42 -0
package/agents/logging-privacy.judge.md +37 -0
package/agents/logic-review.judge.md +34 -0
package/agents/maintainability.judge.md +37 -0
package/agents/model-fingerprint.judge.md +31 -0
package/agents/multi-turn-coherence.judge.md +29 -0
package/agents/observability.judge.md +37 -0
package/agents/over-engineering.judge.md +48 -0
package/agents/performance.judge.md +44 -0
package/agents/portability.judge.md +37 -0
package/agents/rate-limiting.judge.md +37 -0
package/agents/reliability.judge.md +39 -0
package/agents/scalability.judge.md +41 -0
package/agents/security.judge.md +31 -0
package/agents/software-practices.judge.md +44 -0
package/agents/testing.judge.md +39 -0
package/agents/ux.judge.md +37 -0
package/dist/api.d.ts +9 -1
package/dist/api.js +9 -1
package/dist/commands/fix.d.ts +10 -0
package/dist/commands/fix.js +52 -0
package/dist/commands/llm-benchmark.d.ts +13 -4
package/dist/commands/llm-benchmark.js +39 -8
package/dist/commands/review.d.ts +51 -1
package/dist/commands/review.js +213 -7
package/dist/evaluators/index.js +61 -35
package/dist/github-app.d.ts +35 -0
package/dist/github-app.js +125 -4
package/dist/judges/index.d.ts +23 -61
package/dist/judges/index.js +49 -63
package/dist/patches/apply.d.ts +15 -0
package/dist/patches/apply.js +37 -0
package/dist/tools/prompts.d.ts +2 -2
package/dist/tools/prompts.js +21 -10
package/docs/skills.md +7 -0
package/package.json +18 -3
package/packages/judges-cli/README.md +24 -0
package/packages/judges-cli/bin/judges.js +8 -0
package/scripts/generate-agents-from-judges.ts +111 -0
package/scripts/generate-skills-docs.ts +26 -0
package/scripts/validate-agents.ts +104 -0
package/server.json +2 -2
package/skills/ai-code-review.skill.md +57 -0
package/skills/release-gate.skill.md +27 -0
package/skills/security-review.skill.md +32 -0
package/src/agent-loader.ts +324 -0
package/src/skill-loader.ts +199 -0

package/agents/testing.judge.md ADDED Viewed

@@ -0,0 +1,39 @@
+---
+id: testing
+name: Judge Testing
+domain: Test Quality & Coverage
+rulePrefix: TEST
+description: Evaluates code for test-to-code ratio, test isolation, mocking strategy, edge case coverage, flaky test patterns, and test pyramid balance (unit/integration/e2e).
+tableDescription: Test coverage, assertions, test isolation, naming
+promptDescription: Deep testing quality review
+script: ../src/evaluators/testing.ts
+priority: 10
+---
+You are Judge Testing — a quality engineering architect with mastery of TDD, BDD, testing pyramids, mutation testing, and test infrastructure at scale.
+YOUR EVALUATION CRITERIA:
+1. **Testability**: Is the code structured for easy testing? Are dependencies injectable? Are side effects isolated? Is business logic separated from I/O?
+2. **Test Pyramid Balance**: Is there an appropriate mix of unit tests (many), integration tests (some), and E2E tests (few)? Are tests at the right level?
+3. **Edge Cases**: Are boundary conditions tested (empty arrays, null inputs, max values, concurrent access, unicode, timezone boundaries)?
+4. **Mocking Strategy**: Are mocks/stubs/spies used appropriately? Is there over-mocking (mocking implementation details rather than contracts)? Are test doubles faithful to real behavior?
+5. **Test Isolation**: Do tests depend on each other's state? Is there shared mutable state between tests? Do tests clean up after themselves?
+6. **Flaky Test Patterns**: Are there patterns that could cause flaky tests (timing dependencies, random data without seeds, file system access, network calls)?
+7. **Assertion Quality**: Are assertions specific and meaningful? Do they test behavior rather than implementation? Are error messages in assertions helpful?
+8. **Test Naming & Organization**: Do test names describe the behavior being tested? Are tests organized by feature/behavior rather than by class?
+9. **Error Path Testing**: Are error conditions and exception paths tested? Are failure modes verified, not just success paths?
+10. **Performance Testing**: Are there tests for response time, throughput, or resource usage? Are performance baselines established?
+11. **Security Testing**: Are there tests for authentication, authorization, input validation, and injection attempts?
+12. **Test Data Management**: Is test data created programmatically? Are fixtures/factories used instead of hardcoded data? Is sensitive data avoided in tests?
+RULES FOR YOUR EVALUATION:
+- Assign rule IDs with prefix "TEST-" (e.g. TEST-001).
+- Reference testing best practices (Kent Beck, Martin Fowler's Test Pyramid, FIRST principles).
+- Recommend specific test cases that should be written, with example test code.
+- Evaluate both the tests AND the testability of the code under test.
+- Score from 0-100 where 100 means a comprehensive, well-structured test suite.
+ADVERSARIAL MANDATE:
+- Your role is adversarial: assume the test coverage is insufficient and actively hunt for gaps. Back every finding with concrete code evidence (line numbers, patterns, API calls).
+- Never praise or compliment the code. Report only problems, risks, and deficiencies.
+- If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
+- Absence of findings does not mean the code is well-tested. It means your analysis reached its limits. State this explicitly.

package/agents/ux.judge.md ADDED Viewed

@@ -0,0 +1,37 @@
+---
+id: ux
+name: Judge UX
+domain: User Experience & Interface Quality
+rulePrefix: UX
+description: Evaluates code for user experience patterns including loading states, error feedback, responsive design, mobile-friendliness, and interaction quality.
+tableDescription: Loading states, error messages, pagination, destructive actions
+promptDescription: Deep user experience review
+script: ../src/evaluators/ux.ts
+priority: 10
+---
+You are Judge UX — a UX engineer and frontend architect who bridges design and engineering, specializing in performance perception, error communication, and inclusive interaction design.
+YOUR EVALUATION CRITERIA:
+1. **Loading States**: Are loading indicators shown during async operations? Is there feedback when the user initiates an action? Are skeleton screens or spinners used?
+2. **Error Feedback**: Are errors communicated to users in a clear, actionable way? Are generic "Something went wrong" messages avoided? Do errors suggest next steps?
+3. **Responsive Design**: Does the UI adapt to different screen sizes? Are media queries or responsive frameworks used? Is content readable on mobile?
+4. **Form UX**: Are forms validated with inline feedback? Are error messages placed near the relevant field? Are required fields marked? Is there auto-save or draft preservation?
+5. **Navigation & Wayfinding**: Is navigation intuitive? Are breadcrumbs provided? Can users always find their way back? Are deep links supported?
+6. **Performance Perception**: Are optimistic updates used? Is there pagination or infinite scroll for large lists? Are perceived loading times minimized?
+7. **Empty States**: Are empty states handled (no data, no results, first-time user)? Do they provide guidance on what to do next?
+8. **Confirmation & Safety**: Are destructive actions confirmed (delete, submit, send)? Can actions be undone? Are users warned about data loss?
+9. **Mobile & Touch**: Are touch targets large enough (48x48px)? Are hover-dependent interactions avoided? Is the interface usable without a mouse?
+10. **Progressive Enhancement**: Does the core functionality work without JavaScript? Are there graceful fallbacks for unsupported features?
+RULES FOR YOUR EVALUATION:
+- Assign rule IDs with prefix "UX-" (e.g. UX-001).
+- Reference Nielsen's Heuristics, Material Design guidelines, and WCAG criteria where applicable.
+- Distinguish between "functional" and "user-friendly."
+- Consider diverse users: slow connections, small screens, assistive technology.
+- Score from 0-100 where 100 means excellent user experience.
+ADVERSARIAL MANDATE:
+- Your role is adversarial: assume the user experience is poor and actively hunt for problems. Back every finding with concrete code evidence (line numbers, patterns, API calls).
+- Never praise or compliment the code. Report only problems, risks, and deficiencies.
+- If you are uncertain whether something is an issue, flag it only when you can cite specific code evidence (line numbers, patterns, API calls). Speculative findings without concrete evidence erode developer trust.
+- Absence of findings does not mean the UX is good. It means your analysis reached its limits. State this explicitly.

package/dist/api.d.ts CHANGED Viewed

@@ -31,6 +31,10 @@ export { verdictToGitHubActions } from "./formatters/github-actions.js";
 export { registerPlugin, unregisterPlugin, getRegisteredPlugins, getCustomRules, getPluginJudges, evaluateCustomRules, runBeforeHooks, runAfterHooks, clearPlugins, } from "./plugins.js";
 export type { CustomRule, JudgesPlugin, PluginRegistration } from "./plugins.js";
 export { JudgeRegistry, defaultRegistry } from "./judge-registry.js";
+export { parseFrontmatter, validateFrontmatter, parseAgentFile, resolveEvaluator, agentToJudgeDefinition, loadAgentDirectory, loadAndRegisterAgents, } from "./agent-loader.js";
+export type { AgentFrontmatter, ParsedAgent } from "./agent-loader.js";
+export { parseSkillFrontmatter, validateSkillFrontmatter, parseSkillFile, loadSkillDirectory, listSkills, runSkill, } from "./skill-loader.js";
+export type { SkillFrontmatter, ParsedSkill } from "./skill-loader.js";
 export { fingerprintCode, fingerprintToFindings } from "./fingerprint.js";
 export type { AiFingerprint, AiSignal } from "./fingerprint.js";
 export { buildCalibrationProfile, calibrateFindings, autoCalibrateFindings } from "./calibration.js";
@@ -49,8 +53,12 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
 export type { ToolProfile, ToolCapability, ComparisonResult } from "./comparison.js";
 export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
 export type { BenchmarkCase, BenchmarkResult, BenchmarkGateOptions, BenchmarkGateResult, L2CoverageAnalysis, L2JudgeCoverage, L2CategoryCoverage, } from "./commands/benchmark.js";
-export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, } from "./commands/llm-benchmark.js";
+export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, } from "./commands/llm-benchmark.js";
 export type { LlmBenchmarkSnapshot, LlmCaseResult } from "./commands/llm-benchmark.js";
+export type { LlmFinding, ValidationResult } from "./probabilistic/llm-response-validator.js";
+export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
+export { buildContextSnippets } from "./context/context-snippets.js";
+export { EmbeddingCache, FallbackEmbeddingProvider, getOrCreateEmbedding } from "./context/embedding-cache.js";
 export { exportTeamConfig, importTeamConfig, pullRemoteConfig, writePolicyLock, readPolicyLock, validatePolicyCompliance, } from "./commands/config-share.js";
 export type { TeamConfig, PolicyLock, PolicyValidationResult } from "./commands/config-share.js";
 export { getLanguagePack, listLanguagePacks, suggestPack, LANGUAGE_PACKS } from "./commands/language-packs.js";

package/dist/api.js CHANGED Viewed

@@ -38,6 +38,10 @@ export { verdictToGitHubActions } from "./formatters/github-actions.js";
 export { registerPlugin, unregisterPlugin, getRegisteredPlugins, getCustomRules, getPluginJudges, evaluateCustomRules, runBeforeHooks, runAfterHooks, clearPlugins, } from "./plugins.js";
 // ─── Judge Registry ──────────────────────────────────────────────────────────
 export { JudgeRegistry, defaultRegistry } from "./judge-registry.js";
+// ─── Agent Markdown Loader ───────────────────────────────────────────────────
+export { parseFrontmatter, validateFrontmatter, parseAgentFile, resolveEvaluator, agentToJudgeDefinition, loadAgentDirectory, loadAndRegisterAgents, } from "./agent-loader.js";
+// ─── Skill Loader ───────────────────────────────────────────────────────────
+export { parseSkillFrontmatter, validateSkillFrontmatter, parseSkillFile, loadSkillDirectory, listSkills, runSkill, } from "./skill-loader.js";
 // ─── AI Code Fingerprinting ─────────────────────────────────────────────────
 export { fingerprintCode, fingerprintToFindings } from "./fingerprint.js";
 // ─── Confidence Calibration ─────────────────────────────────────────────────
@@ -56,7 +60,11 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
 // ─── Benchmark Gate ──────────────────────────────────────────────────────────
 export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
 // ─── LLM Benchmark ──────────────────────────────────────────────────────────
-export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, } from "./commands/llm-benchmark.js";
+export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, } from "./commands/llm-benchmark.js";
+// Review autopilot (GitHub App / scripts)
+export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
+export { buildContextSnippets } from "./context/context-snippets.js";
+export { EmbeddingCache, FallbackEmbeddingProvider, getOrCreateEmbedding } from "./context/embedding-cache.js";
 // ─── Config Sharing & Policy ─────────────────────────────────────────────────
 export { exportTeamConfig, importTeamConfig, pullRemoteConfig, writePolicyLock, readPolicyLock, validatePolicyCompliance, } from "./commands/config-share.js";
 // ─── Language Packs ──────────────────────────────────────────────────────────

package/dist/commands/fix.d.ts CHANGED Viewed

@@ -53,6 +53,14 @@ export declare function applyPatches(code: string, patches: PatchCandidate[]): {
     skipped: number;
     overlapped: number;
 };
+/**
+ * Generate a unified diff for a single patch candidate. Useful for git apply fallback.
+ */
+export declare function toUnifiedDiff(filePath: string, patch: Patch): string;
+/**
+ * Try to apply a patch via `git apply` (with 3-way merge) as a fallback when direct text replacement fails.
+ */
+export declare function tryGitApply(filePath: string, patch: Patch): boolean;
 /** A group of patches scoped to a single file within a cross-file fix set. */
 export interface FilePatchGroup {
     /** Absolute or relative file path. */
@@ -92,6 +100,7 @@ export declare function applyPatchSet(patchSet: PatchSet, options?: {
     apply?: boolean;
     filter?: PatchFilter;
     basePath?: string;
+    gitApplyFallback?: boolean;
 }): PatchSetResult;
 interface FixArgs {
     file: string | undefined;
@@ -101,6 +110,7 @@ interface FixArgs {
     severity: string | undefined;
     lines: string | undefined;
     apply: boolean;
+    gitApplyFallback?: boolean;
 }
 export declare function parseFixArgs(argv: string[]): FixArgs;
 export declare function runFix(argv: string[]): void;

package/dist/commands/fix.js CHANGED Viewed

@@ -134,6 +134,44 @@ export function applyPatches(code, patches) {
     }
     return { result: lines.join("\n"), applied, skipped, overlapped };
 }
+/**
+ * Generate a unified diff for a single patch candidate. Useful for git apply fallback.
+ */
+export function toUnifiedDiff(filePath, patch) {
+    const header = `--- a/${filePath}\n+++ b/${filePath}`;
+    const hunkHeader = `@@ -${patch.startLine},${patch.endLine - patch.startLine + 1} +${patch.startLine},${patch.endLine - patch.startLine + 1} @@`;
+    const oldLines = patch.oldText.split(/\r?\n/).map((l) => `-${l}`);
+    const newLines = patch.newText.split(/\r?\n/).map((l) => `+${l}`);
+    return [header, hunkHeader, ...oldLines, ...newLines].join("\n") + "\n";
+}
+/**
+ * Try to apply a patch via `git apply` (with 3-way merge) as a fallback when direct text replacement fails.
+ */
+export function tryGitApply(filePath, patch) {
+    try {
+        const diff = toUnifiedDiff(filePath, patch);
+        const { execFileSync } = require("node:child_process");
+        const { tmpdir } = require("node:os");
+        const { writeFileSync, unlinkSync } = require("node:fs");
+        const tmp = require("node:path").join(tmpdir(), `.judges-patch-${Date.now()}.diff`);
+        writeFileSync(tmp, diff, "utf-8");
+        try {
+            execFileSync("git", ["apply", "--3way", "--reject", tmp], { stdio: "ignore" });
+            return true;
+        }
+        finally {
+            try {
+                unlinkSync(tmp);
+            }
+            catch {
+                // ignore
+            }
+        }
+    }
+    catch {
+        return false;
+    }
+}
 /**
  * Collect patches from findings across multiple files into a PatchSet.
  * Groups findings by their associated file path.
@@ -198,6 +236,16 @@ export function applyPatchSet(patchSet, options = {}) {
         if (options.apply && applied > 0) {
             writeFileSync(absPath, result, "utf-8");
         }
+        // If we skipped patches due to mismatched context, optionally try git apply fallback
+        if (options.apply && skipped > 0 && options.gitApplyFallback) {
+            for (const p of patches) {
+                const ok = tryGitApply(group.filePath, p.patch);
+                if (ok) {
+                    results.totalApplied++;
+                    results.totalSkipped = Math.max(0, results.totalSkipped - 1);
+                }
+            }
+        }
         results.files.push({ filePath: group.filePath, applied, skipped, overlapped });
         results.totalApplied += applied;
         results.totalSkipped += skipped;
@@ -214,6 +262,7 @@ export function parseFixArgs(argv) {
         severity: undefined,
         lines: undefined,
         apply: false,
+        gitApplyFallback: false,
     };
     for (let i = 3; i < argv.length; i++) {
         // skip node, script, "fix"
@@ -223,6 +272,9 @@ export function parseFixArgs(argv) {
             case "-a":
                 args.apply = true;
                 break;
+            case "--git-apply":
+                args.gitApplyFallback = true;
+                break;
             case "--language":
             case "-l":
                 args.language = argv[++i];

package/dist/commands/llm-benchmark.d.ts CHANGED Viewed

@@ -8,8 +8,10 @@
  * - Prompt construction (mirrors MCP-served prompts exactly)
  * - Scoring logic (same methodology as L1 deterministic benchmark)
  *
- * The actual LLM API calls live in scripts/run-llm-benchmark.ts,
- * keeping the npm package free of LLM API dependencies.
+ * LLM API calling is intentionally kept out of the npm package. Wire this
+ * to your preferred provider in a thin runner script (or use the CLI
+ * command `judges llm-benchmark`). The former helper script
+ * `scripts/run-llm-benchmark.ts` has been removed.
  */
 import type { JudgeDefinition } from "../types.js";
 import type { BenchmarkCase, CategoryResult, JudgeBenchmarkResult, DifficultyResult } from "./benchmark.js";
@@ -77,15 +79,22 @@ export interface LlmCaseResult {
  * Extract unique rule IDs from LLM response text.
  * Matches patterns like CYBER-001, SEC-003, AUTH-001, etc.
  */
+export declare function getValidRulePrefixes(): Set<string>;
 export declare function parseLlmRuleIds(response: string): string[];
+/**
+ * Preferred entrypoint: extract findings from raw LLM text with validation. Falls back to regex rule-id scan.
+ */
+export declare function extractValidatedLlmFindings(response: string, prefixes?: Set<string>): import("../probabilistic/llm-response-validator.js").ValidationResult;
 /**
  * Construct a per-judge prompt — identical to the MCP-served `judge-{id}` prompt.
+ * Uses condensed criteria (adversarial mandate stripped) plus shared mandates,
+ * mirroring the tribunal architecture for consistency and better precision.
  */
-export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string): string;
+export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string, contextSnippets?: string[]): string;
 /**
  * Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
  */
-export declare function constructTribunalPrompt(code: string, language: string): string;
+export declare function constructTribunalPrompt(code: string, language: string, contextSnippets?: string[]): string;
 /**
  * Select a stratified sample of benchmark cases, ensuring representation
  * across categories, difficulties, and both clean/dirty cases.

package/dist/commands/llm-benchmark.js CHANGED Viewed

@@ -8,18 +8,24 @@
  * - Prompt construction (mirrors MCP-served prompts exactly)
  * - Scoring logic (same methodology as L1 deterministic benchmark)
  *
- * The actual LLM API calls live in scripts/run-llm-benchmark.ts,
- * keeping the npm package free of LLM API dependencies.
+ * LLM API calling is intentionally kept out of the npm package. Wire this
+ * to your preferred provider in a thin runner script (or use the CLI
+ * command `judges llm-benchmark`). The former helper script
+ * `scripts/run-llm-benchmark.ts` has been removed.
  */
 import { JUDGES } from "../judges/index.js";
 import { getCondensedCriteria, SHARED_ADVERSARIAL_MANDATE, PRECISION_MANDATE } from "../tools/prompts.js";
+import { extractAndValidateLlmFindings, mergeFindings } from "../probabilistic/llm-response-validator.js";
 // ─── Rule ID Parsing ────────────────────────────────────────────────────────
+/**
+ * Build the set of rule prefixes declared by the judges in `JUDGES`
+ * (one entry per distinct `rulePrefix`).
+ */
+export function getValidRulePrefixes() {
+    return new Set(JUDGES.map((j) => j.rulePrefix));
+}
 /**
  * Extract unique rule IDs from LLM response text.
  * Matches patterns like CYBER-001, SEC-003, AUTH-001, etc.
  */
 export function parseLlmRuleIds(response) {
-    const validPrefixes = new Set(JUDGES.map((j) => j.rulePrefix));
+    const validPrefixes = getValidRulePrefixes();
     const pattern = /\b([A-Z]{2,})-(\d{3})\b/g;
     const found = new Set();
     let match;
@@ -30,33 +36,59 @@ export function parseLlmRuleIds(response) {
     }
     return [...found];
 }
+/**
+ * Preferred entrypoint: extract findings from raw LLM text with validation. Falls back to regex rule-id scan.
+ */
+export function extractValidatedLlmFindings(response, prefixes) {
+    const validPrefixes = prefixes ?? getValidRulePrefixes();
+    const primary = extractAndValidateLlmFindings(response, validPrefixes);
+    // Fallback regex scan (for unstructured responses)
+    const fallbackRuleIds = parseLlmRuleIds(response);
+    return mergeFindings(primary, fallbackRuleIds);
+}
 // ─── Prompt Construction ────────────────────────────────────────────────────
 // These construct the exact same prompts served via MCP, ensuring the
 // benchmark tests the same prompts real users experience.
 // ─────────────────────────────────────────────────────────────────────────────
 /**
  * Construct a per-judge prompt — identical to the MCP-served `judge-{id}` prompt.
+ * Uses condensed criteria (adversarial mandate stripped) plus shared mandates,
+ * mirroring the tribunal architecture for consistency and better precision.
  */
-export function constructPerJudgePrompt(judge, code, language) {
-    return (`${judge.systemPrompt}\n\n${PRECISION_MANDATE}\n\n` +
+export function constructPerJudgePrompt(judge, code, language, contextSnippets = []) {
+    const persona = judge.systemPrompt.substring(0, judge.systemPrompt.indexOf("\n\n"));
+    const criteria = getCondensedCriteria(judge.systemPrompt);
+    const contextSection = contextSnippets.length
+        ? `## Repository Context\n\n${contextSnippets.map((s) => `- ${s.replace(/\n/g, " ")}`).join("\n")}\n\n`
+        : "";
+    return (`${persona}\n\n` +
+        `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
+        `${PRECISION_MANDATE}\n\n` +
+        contextSection +
+        `${criteria}\n\n` +
         `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
-        `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. End with an overall score (0-100) and verdict (pass/warning/fail).`);
+        `\n\nProvide your evaluation as structured findings with rule IDs (prefix: ${judge.rulePrefix}-), severity levels (critical/high/medium/low/info), descriptions, and actionable recommendations. If no issues meet the confidence threshold, report zero findings explicitly. End with an overall score (0-100) and verdict (pass/warning/fail).`);
 }
 /**
  * Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
  */
-export function constructTribunalPrompt(code, language) {
+export function constructTribunalPrompt(code, language, contextSnippets = []) {
     const judgeInstructions = JUDGES.map((j) => `### ${j.name} — ${j.domain}\n**Rule prefix:** \`${j.rulePrefix}-\`\n\n${getCondensedCriteria(j.systemPrompt)}`).join("\n\n---\n\n");
+    const contextSection = contextSnippets.length
+        ? `## Repository Context\n\n${contextSnippets.map((s) => `- ${s.replace(/\n/g, " ")}`).join("\n")}\n\n`
+        : "";
     return (`You are the Judges Panel — a panel of ${JUDGES.length} expert judges who independently evaluate code for quality, security, and operational readiness.\n\n` +
         `## Universal Evaluation Directives\n\n` +
         `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
         `${PRECISION_MANDATE}\n\n` +
+        contextSection +
         `## Evaluation Instructions\n\n` +
         `Evaluate the following ${language} code from the perspective of ALL ${JUDGES.length} judges below. For each judge, provide:\n` +
         `1. Judge name and domain\n` +
         `2. Verdict (PASS / WARNING / FAIL)\n` +
         `3. Score (0-100)\n` +
         `4. Specific findings with rule IDs (using each judge's rule prefix), severity, and recommendations\n\n` +
+        `For judges where no issues meet the confidence threshold, report a PASS verdict with zero findings.\n\n` +
         `Then provide an OVERALL TRIBUNAL VERDICT that synthesizes all judges' input.\n\n` +
         `## The Judges\n\n${judgeInstructions}\n\n` +
         `## Code to Evaluate\n\n\`\`\`${language}\n${code}\n\`\`\``);
@@ -125,7 +157,6 @@ export function selectStratifiedSample(cases, targetSize) {
  * Returns a fully populated LlmCaseResult.
  */
 export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
-    const _expectedPrefixes = new Set(tc.expectedRuleIds.map((r) => r.split("-")[0]));
     const detectedPrefixes = new Set(detectedRuleIds.map((r) => r.split("-")[0]));
     const matchedExpected = tc.expectedRuleIds.filter((expected) => {
         const prefix = expected.split("-")[0];

package/dist/commands/review.d.ts CHANGED Viewed

@@ -13,6 +13,8 @@
  *
  * Requires: GITHUB_TOKEN environment variable (or gh CLI authenticated).
  */
+import { evaluateDiff } from "../evaluators/index.js";
+export declare function __setEvaluateDiffImplForTest(fn: typeof evaluateDiff | undefined): void;
 import type { Finding, Severity } from "../types.js";
 interface ReviewArgs {
     pr: number;
@@ -35,6 +37,21 @@ interface ReviewArgs {
     crossFile: boolean;
     /** Only run these judges (comma-separated IDs). All others are disabled. */
     judges?: string[];
+    /** Enable Layer 2 (LLM) deep review augmentation */
+    llmDeepReview?: boolean;
+    /** OpenAI-compatible model name (e.g., gpt-4o) */
+    llmModel?: string;
+    /** OpenAI-compatible base URL override */
+    llmBaseUrl?: string;
+    /** Max tokens for LLM responses */
+    llmMaxTokens?: number;
+    /** Enable autopilot: fetch diff, post inline comments, and summary automatically */
+    autopilot?: boolean;
+}
+/** One file entry of a pull request; `runLlmDeepReview` takes an array of these. */
+interface PrFile {
+    filename: string;
+    /** Change status of the file; limited to these four literals. */
+    status: "added" | "modified" | "removed" | "renamed";
+    /** Optional patch text — presumably the file's diff; confirm against the code that builds these entries. */
+    patch?: string;
 }
 interface ReviewComment {
     path: string;
@@ -46,12 +63,31 @@ interface ReviewComment {
     /** Side for start line (always RIGHT for new code) */
     start_side?: "RIGHT";
 }
+export declare function dedupeComments(comments: ReviewComment[]): ReviewComment[];
+export declare function filterAlreadyPostedComments(repo: string, pr: number, token: string | undefined, comments: ReviewComment[]): ReviewComment[];
 interface DiffHunk {
     filePath: string;
     newContent: string;
     changedLines: number[];
 }
 export declare function parsePatchToHunk(filePath: string, patch: string): DiffHunk;
+declare function ghApiRequest(method: string, endpoint: string, token: string, body?: unknown): {
+    status: number;
+    data: unknown;
+};
+export declare function __setApiRequestImplForTest(fn: typeof ghApiRequest | undefined): void;
+/** Options accepted by `callOpenAiChat` as its second argument. */
+interface LlmClientOptions {
+    /** Model identifier — presumably the value of `ReviewArgs.llmModel`; confirm at the call site. */
+    model: string;
+    /** Optional base URL — presumably the OpenAI-compatible endpoint override; confirm at the call site. */
+    baseUrl?: string;
+    /** API key; required. */
+    apiKey: string;
+    /** Optional token limit — presumably for the LLM response; confirm at the call site. */
+    maxTokens?: number;
+}
+declare function callOpenAiChat(prompt: string, opts: LlmClientOptions): Promise<string>;
+export declare function __setCallOpenAiChatImplForTest(fn: typeof callOpenAiChat): void;
+export declare function runLlmDeepReview(prFiles: PrFile[], args: ReviewArgs): Promise<{
+    summary?: string;
+    warnings?: string[];
+}>;
 export declare function findingToCommentBody(finding: Finding, fpRate?: number): string;
 interface ReviewResult {
     filesAnalyzed: number;
@@ -64,6 +100,9 @@ interface ReviewResult {
     fpSuppressed: number;
     approved: boolean;
     comments: ReviewComment[];
+    /** Optional LLM deep review summary (non-inline). */
+    llmSummary?: string;
+    llmWarnings?: string[];
 }
 /**
  * Build a rich PR-level review narrative with executive summary, per-file
@@ -101,5 +140,16 @@ export declare function assessReviewCompleteness(prFiles: Array<{
     calibrated?: boolean;
 }): ReviewCompleteness;
 export declare function parseReviewArgs(argv: string[]): ReviewArgs;
-export declare function runReview(argv: string[]): void;
+export declare function runReview(argv: string[]): Promise<void>;
+/**
+ * Programmatic autopilot entrypoint for GitHub App / automations.
+ */
+export declare function runReviewAutopilot(pr: number, repo?: string): Promise<void>;
+export declare const __test: {
+    __setCallOpenAiChatImplForTest: typeof __setCallOpenAiChatImplForTest;
+    __setApiRequestImplForTest: typeof __setApiRequestImplForTest;
+    __setEvaluateDiffImplForTest: typeof __setEvaluateDiffImplForTest;
+    runLlmDeepReview: typeof runLlmDeepReview;
+    __evaluateDiffForTest: typeof evaluateDiff;
+};
 export {};