npm - @kevinrabun/judges - Versions diffs - 3.115.4 → 3.117.0 - Mend

@kevinrabun/judges 3.115.4 → 3.117.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (114) hide show

package/agents/accessibility.judge.md +7 -0
package/agents/agent-instructions.judge.md +7 -0
package/agents/ai-code-safety.judge.md +7 -0
package/agents/api-contract.judge.md +7 -0
package/agents/api-design.judge.md +7 -0
package/agents/authentication.judge.md +7 -0
package/agents/backwards-compatibility.judge.md +7 -0
package/agents/caching.judge.md +7 -0
package/agents/ci-cd.judge.md +7 -0
package/agents/cloud-readiness.judge.md +7 -0
package/agents/concurrency.judge.md +7 -0
package/agents/configuration-management.judge.md +7 -0
package/agents/cybersecurity.judge.md +7 -0
package/agents/data-security.judge.md +7 -0
package/agents/dependency-health.judge.md +7 -0
package/agents/documentation.judge.md +7 -0
package/agents/error-handling.judge.md +7 -0
package/agents/ethics-bias.judge.md +7 -0
package/agents/false-positive-review.judge.md +12 -0
package/agents/framework-safety.judge.md +7 -0
package/agents/hallucination-detection.judge.md +13 -0
package/agents/iac-security.judge.md +7 -0
package/agents/intent-alignment.judge.md +13 -0
package/agents/logging-privacy.judge.md +7 -0
package/agents/maintainability.judge.md +7 -0
package/agents/multi-turn-coherence.judge.md +7 -0
package/agents/observability.judge.md +7 -0
package/agents/portability.judge.md +7 -0
package/agents/rate-limiting.judge.md +7 -0
package/agents/reliability.judge.md +7 -0
package/agents/security.judge.md +13 -0
package/agents/testing.judge.md +7 -0
package/agents/ux.judge.md +7 -0
package/dist/a2a-protocol.d.ts +136 -0
package/dist/a2a-protocol.js +218 -0
package/dist/api.d.ts +21 -3
package/dist/api.js +21 -1
package/dist/audit-trail.d.ts +245 -0
package/dist/audit-trail.js +257 -0
package/dist/commands/benchmark-advanced.js +51 -51
package/dist/commands/benchmark-ai-agents.js +16 -16
package/dist/commands/benchmark-compliance-ethics.js +12 -12
package/dist/commands/benchmark-expanded-2.js +2 -2
package/dist/commands/benchmark-expanded.js +2 -2
package/dist/commands/benchmark-infrastructure.js +12 -12
package/dist/commands/benchmark-languages.js +11 -11
package/dist/commands/benchmark-quality-ops.js +7 -7
package/dist/commands/benchmark-security-deep.js +9 -9
package/dist/commands/benchmark.js +1 -1
package/dist/commands/llm-benchmark-optimizer.d.ts +78 -0
package/dist/commands/llm-benchmark-optimizer.js +241 -0
package/dist/commands/llm-benchmark.d.ts +4 -2
package/dist/commands/llm-benchmark.js +40 -12
package/dist/escalation.d.ts +100 -0
package/dist/escalation.js +292 -0
package/dist/evaluation-session.d.ts +74 -0
package/dist/evaluation-session.js +152 -0
package/dist/evaluators/index.d.ts +23 -1
package/dist/evaluators/index.js +192 -3
package/dist/evaluators/judge-selector.d.ts +19 -0
package/dist/evaluators/judge-selector.js +141 -0
package/dist/evaluators/recall-boost.d.ts +27 -0
package/dist/evaluators/recall-boost.js +409 -0
package/dist/feedback-loop.d.ts +62 -0
package/dist/feedback-loop.js +179 -0
package/dist/index.js +2 -0
package/dist/judges/accessibility.js +7 -0
package/dist/judges/agent-instructions.js +7 -0
package/dist/judges/ai-code-safety.js +7 -0
package/dist/judges/api-contract.js +7 -0
package/dist/judges/api-design.js +7 -0
package/dist/judges/authentication.js +7 -0
package/dist/judges/backwards-compatibility.js +7 -0
package/dist/judges/caching.js +7 -0
package/dist/judges/ci-cd.js +7 -0
package/dist/judges/cloud-readiness.js +7 -0
package/dist/judges/concurrency.js +7 -0
package/dist/judges/configuration-management.js +7 -0
package/dist/judges/cybersecurity.js +7 -0
package/dist/judges/data-security.js +7 -0
package/dist/judges/dependency-health.js +7 -0
package/dist/judges/documentation.js +7 -0
package/dist/judges/error-handling.js +7 -0
package/dist/judges/ethics-bias.js +7 -0
package/dist/judges/false-positive-review.js +13 -1
package/dist/judges/framework-safety.js +7 -0
package/dist/judges/hallucination-detection.js +14 -1
package/dist/judges/iac-security.js +7 -0
package/dist/judges/intent-alignment.js +14 -1
package/dist/judges/logging-privacy.js +7 -0
package/dist/judges/maintainability.js +7 -0
package/dist/judges/multi-turn-coherence.js +7 -0
package/dist/judges/observability.js +7 -0
package/dist/judges/portability.js +7 -0
package/dist/judges/rate-limiting.js +7 -0
package/dist/judges/reliability.js +7 -0
package/dist/judges/security.js +14 -1
package/dist/judges/testing.js +7 -0
package/dist/judges/ux.js +7 -0
package/dist/review-conversation.d.ts +87 -0
package/dist/review-conversation.js +307 -0
package/dist/sast-integration.d.ts +112 -0
package/dist/sast-integration.js +215 -0
package/dist/tools/register-evaluation.js +208 -8
package/dist/tools/register-fix.js +24 -1
package/dist/tools/register-resources.d.ts +6 -0
package/dist/tools/register-resources.js +177 -0
package/dist/tools/register-review.js +26 -1
package/dist/tools/register-workflow.js +384 -11
package/dist/tools/validation.d.ts +13 -0
package/dist/tools/validation.js +77 -0
package/dist/types.d.ts +122 -0
package/package.json +25 -12
package/server.json +2 -2

package/dist/evaluators/index.js CHANGED Viewed

@@ -19,6 +19,10 @@ import { loadFeedbackStore } from "../commands/feedback.js";
 import { CROSS_FILE_SECURITY_CATEGORIES } from "./project.js";
 import { applyTriageFeedback, loadFindingStore } from "../finding-lifecycle.js";
 import { enrichWithSecurityIds } from "../security-ids.js";
+import { selectJudges } from "./judge-selector.js";
+import { getGlobalSession } from "../evaluation-session.js";
+import { evaluateEscalations, enhanceReviewWithEscalations } from "../escalation.js";
+import { applyRecallBoost } from "./recall-boost.js";
 // ── AST-aware post-processing ───────────────────────────────────────────────
 // ── Module-level caches for AST/taint results ───────────────────────────────
 const astStructureCache = new LRUCache(256);
@@ -390,7 +394,19 @@ function resolveJudgeSet(options) {
         const disabled = new Set(options.config.disabledJudges);
         judges = judges.filter((j) => !disabled.has(j.id));
     }
-    return judges;
+    // Adaptive judge selection — skip irrelevant judges based on file context
+    if (options?.adaptiveSelection && options.filePath) {
+        const fileCategory = classifyFile("", options.filePath.split(".").pop() ?? "", options.filePath);
+        const ctx = {
+            language: options.filePath.split(".").pop() ?? "unknown",
+            fileCategory,
+            filePath: options.filePath,
+            projectMode: options.projectMode,
+        };
+        const result = selectJudges(judges, ctx);
+        return { judges: result.selected, skipped: result.skipped };
+    }
+    return { judges };
 }
 /**
  * Check whether an absence-based finding is mitigated by a pre-scanned
@@ -419,6 +435,16 @@ export function evaluateWithJudge(judge, code, language, context, options) {
             : undefined;
         findings.push(...judge.analyze(code, language, analyzeCtx));
     }
+    // ── Recall boost: supplementary patterns for weak-recall categories ──
+    const boostResult = applyRecallBoost(code, language);
+    if (boostResult.findings.length > 0) {
+        // Deduplicate: only add boost findings whose ruleId isn't already present
+        for (const bf of boostResult.findings) {
+            if (!findings.some((f) => f.ruleId === bf.ruleId)) {
+                findings.push(bf);
+            }
+        }
+    }
     // ── Absence gating ──
     // Absence-based findings ("no rate limiting", "no monitoring", etc.) are
     // project-level concerns that cannot be accurately assessed from a single
@@ -649,7 +675,7 @@ export function evaluateWithTribunal(code, language, context, options) {
         ...(astResult ? { _astCache: astResult } : {}),
         ...(taintResult ? { _taintFlows: taintResult } : {}),
     };
-    const judges = resolveJudgeSet(enrichedOptions);
+    const { judges, skipped: skippedJudges } = resolveJudgeSet(enrichedOptions);
     const tribunalStart = performance.now();
     const evaluations = judges.map((judge) => {
         const start = performance.now();
@@ -776,7 +802,30 @@ export function evaluateWithTribunal(code, language, context, options) {
         // No triage data or error loading — continue without adjustment
     }
     const maxFindings = options?.maxFindingsPerFile ?? DEFAULT_MAX_FINDINGS_PER_FILE;
-    const cappedFindings = applyPerFileFindingCap(triageAdjusted, maxFindings);
+    // ── Session feedback calibration ──
+    // Apply confidence penalties from accumulated FP feedback in the
+    // current evaluation session. This is the real-time agentic loop:
+    // user marks findings as FP → session records it → subsequent
+    // evaluations automatically reduce confidence on those rules.
+    let sessionAdjusted = triageAdjusted;
+    try {
+        const session = getGlobalSession();
+        const tally = session.getFeedbackTally();
+        if (tally.size > 0) {
+            sessionAdjusted = triageAdjusted.map((f) => {
+                const penalty = session.getConfidencePenalty(f.ruleId);
+                if (penalty < 1.0) {
+                    const adjusted = clampConfidence((f.confidence ?? 0.5) * penalty);
+                    return { ...f, confidence: adjusted };
+                }
+                return f;
+            });
+        }
+    }
+    catch {
+        // Session feedback calibration failure is non-fatal
+    }
+    const cappedFindings = applyPerFileFindingCap(sessionAdjusted, maxFindings);
     // ── Confidence-based tiering for progressive disclosure ──
     // Tag each finding with a disclosure tier so downstream consumers (CLI,
     // formatters, VS Code extension) can show only high-confidence findings
@@ -852,6 +901,23 @@ export function evaluateWithTribunal(code, language, context, options) {
                 : "AI-generated code patterns detected — review for model-specific biases",
         };
     }
+    // ── Human escalation protocol ──
+    // Evaluate which findings need human review based on escalation policy.
+    // Enhances the review decision with escalation routing information.
+    if (options?.config?.escalationThreshold || options?.filePath) {
+        try {
+            const escalationPolicy = options?.config?.escalationThreshold
+                ? { confidenceThreshold: options.config.escalationThreshold }
+                : undefined;
+            const escalations = evaluateEscalations(result, options?.filePath ?? "<unknown>", escalationPolicy);
+            if (escalations.length > 0 && result.reviewDecision) {
+                result.reviewDecision = enhanceReviewWithEscalations(result.reviewDecision, escalations);
+            }
+        }
+        catch {
+            // Escalation evaluation failure is non-fatal
+        }
+    }
     // ── Disk cache: persist for future runs ──
     if (diskCache) {
         try {
@@ -863,6 +929,129 @@ export function evaluateWithTribunal(code, language, context, options) {
     }
     return result;
 }
+// ─── Streaming Evaluation ────────────────────────────────────────────────────
+/**
+ * Streaming tribunal evaluation — yields per-judge results as each judge
+ * completes, enabling progressive UI updates and early termination.
+ *
+ * Each yielded `StreamingBatch` contains the judge evaluation, execution
+ * trace, and running aggregate statistics.
+ *
+ * Usage:
+ * ```ts
+ * for await (const batch of evaluateWithTribunalStreaming(code, lang)) {
+ *   console.log(`${batch.judgeName}: ${batch.evaluation.findings.length} findings`);
+ *   if (batch.aggregate.criticalSoFar > 10) break; // early termination
+ * }
+ * ```
+ */
+export async function* evaluateWithTribunalStreaming(code, language, context, options) {
+    const includeAst = options?.includeAstFindings ?? true;
+    const hash = contentHash(code, language);
+    let astResult = options?._astCache;
+    if (!astResult && includeAst) {
+        astResult = astStructureCache.get(hash);
+        if (!astResult) {
+            astResult = analyzeStructure(code, language);
+            astStructureCache.set(hash, astResult);
+        }
+    }
+    let taintResult = options?._taintFlows;
+    if (!taintResult) {
+        taintResult = taintFlowCache.get(hash);
+        if (!taintResult) {
+            taintResult = analyzeTaintFlows(code, language);
+            taintFlowCache.set(hash, taintResult);
+        }
+    }
+    const enrichedOptions = {
+        ...options,
+        ...(astResult ? { _astCache: astResult } : {}),
+        ...(taintResult ? { _taintFlows: taintResult } : {}),
+    };
+    const { judges, skipped: skippedJudges } = resolveJudgeSet(enrichedOptions);
+    const totalJudges = judges.length;
+    let completedJudges = 0;
+    let findingsSoFar = 0;
+    let criticalSoFar = 0;
+    let highSoFar = 0;
+    let scoreSum = 0;
+    let hasFailure = false;
+    let hasWarning = false;
+    for (const judge of judges) {
+        const start = performance.now();
+        const evaluation = evaluateWithJudge(judge, code, language, context, enrichedOptions);
+        const durationMs = Math.round(performance.now() - start);
+        evaluation.durationMs = durationMs;
+        completedJudges++;
+        findingsSoFar += evaluation.findings.length;
+        criticalSoFar += evaluation.findings.filter((f) => f.severity === "critical").length;
+        highSoFar += evaluation.findings.filter((f) => f.severity === "high").length;
+        scoreSum += evaluation.score;
+        if (evaluation.verdict === "fail")
+            hasFailure = true;
+        if (evaluation.verdict === "warning")
+            hasWarning = true;
+        const trace = {
+            judgeId: judge.id,
+            judgeName: judge.name,
+            durationMs,
+            rules: buildRuleTraces(evaluation),
+            rawFindingCount: evaluation.findings.length,
+            finalFindingCount: evaluation.findings.length,
+            ...(astResult
+                ? {
+                    astResolution: {
+                        functionsAnalyzed: astResult.functions.length,
+                        maxComplexity: Math.max(0, ...astResult.functions.map((f) => f.cyclomaticComplexity)),
+                        taintFlowsDetected: taintResult?.length ?? 0,
+                    },
+                }
+                : {}),
+        };
+        const currentVerdict = hasFailure ? "fail" : hasWarning ? "warning" : "pass";
+        yield {
+            judgeId: judge.id,
+            judgeName: judge.name,
+            evaluation,
+            trace,
+            aggregate: {
+                completedJudges,
+                totalJudges,
+                findingsSoFar,
+                criticalSoFar,
+                highSoFar,
+                currentScore: Math.round(scoreSum / completedJudges),
+                currentVerdict,
+            },
+            done: completedJudges === totalJudges,
+        };
+        // Yield to the event loop between judges for responsiveness
+        await new Promise((r) => setTimeout(r, 0));
+    }
+}
+/**
+ * Build rule-level traces from a judge evaluation for observability.
+ */
+function buildRuleTraces(evaluation) {
+    const ruleMap = new Map();
+    for (const f of evaluation.findings) {
+        const existing = ruleMap.get(f.ruleId);
+        if (existing) {
+            existing.count++;
+            existing.peakConf = Math.max(existing.peakConf, f.confidence ?? 0.5);
+        }
+        else {
+            ruleMap.set(f.ruleId, { count: 1, peakConf: f.confidence ?? 0.5 });
+        }
+    }
+    return [...ruleMap.entries()].map(([ruleId, { count, peakConf }]) => ({
+        ruleId,
+        matched: true,
+        findingCount: count,
+        peakConfidence: peakConf,
+    }));
+}
 // ─── Project-level Multi-file Analysis (delegated to project.ts) ─────────────
 import { evaluateProject as _evaluateProject } from "./project.js";
 export { scanProjectWideSecurityPatterns } from "./project.js";

package/dist/evaluators/judge-selector.d.ts ADDED Viewed

@@ -0,0 +1,19 @@
+/**
+ * Adaptive judge selection — picks only the judges relevant to a given file
+ * based on the file's language and file category (test, config, or IaC).
+ *
+ * Eliminates wasted work (e.g. running "testing" judge on a Dockerfile,
+ * or "iac-security" on a React component) while keeping the full panel
+ * available for explicit requests.
+ */
+import type { JudgeDefinition, JudgeSelectionContext, JudgeSelectionResult } from "../types.js";
+/**
+ * Select the most relevant judges for a given file context.
+ *
+ * Strategy:
+ * 1. Always include core judges (security, cybersecurity, false-positive-review)
+ * 2. Skip judges with language incompatibility
+ * 3. Skip judges irrelevant to the file category
+ * 4. Return selection with skip reasons for observability
+ */
+export declare function selectJudges(judges: JudgeDefinition[], ctx: JudgeSelectionContext): JudgeSelectionResult;

package/dist/evaluators/judge-selector.js ADDED Viewed

@@ -0,0 +1,141 @@
+/**
+ * Adaptive judge selection — picks only the judges relevant to a given file
+ * based on the file's language and file category (test, config, or IaC).
+ *
+ * Eliminates wasted work (e.g. running "testing" judge on a Dockerfile,
+ * or "iac-security" on a React component) while keeping the full panel
+ * available for explicit requests.
+ */
+// ─── Language → judge relevance ──────────────────────────────────────────────
+/**
+ * Judges that are ONLY relevant for specific language families.
+ * If the file's language is not in a judge's set, that judge is skipped.
+ * Most judges are language-agnostic and not listed here.
+ */
+const LANGUAGE_SPECIFIC = {
+    // IaC judges only apply to infrastructure languages
+    "iac-security": new Set(["terraform", "bicep", "arm", "dockerfile", "yaml"]),
+};
+/**
+ * Judges to SKIP for specific languages — inverse of above.
+ * E.g. testing patterns don't apply to SQL or Dockerfile.
+ */
+const LANGUAGE_SKIP = {
+    testing: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "yaml"]),
+    documentation: new Set(["sql", "dockerfile", "terraform", "bicep", "arm"]),
+    "code-structure": new Set(["sql", "dockerfile", "yaml"]),
+    ux: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "bash", "powershell"]),
+    accessibility: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "bash", "powershell"]),
+    internationalization: new Set(["sql", "dockerfile", "terraform", "bicep", "arm"]),
+    concurrency: new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "yaml"]),
+    "over-engineering": new Set(["sql", "dockerfile", "terraform", "bicep", "arm", "yaml"]),
+};
+// ─── File category → judge relevance ────────────────────────────────────────
+/**
+ * Judges to skip when evaluating test files — noise reduction.
+ */
+const SKIP_FOR_TESTS = new Set([
+    "documentation",
+    "rate-limiting",
+    "scalability",
+    "cloud-readiness",
+    "ci-cd",
+    "configuration-management",
+    "cost-effectiveness",
+    "data-sovereignty",
+    "compliance",
+    "internationalization",
+    "ux",
+    "accessibility",
+    "observability",
+]);
+/**
+ * Judges to skip for config/manifest files.
+ */
+const SKIP_FOR_CONFIG = new Set([
+    "testing",
+    "documentation",
+    "code-structure",
+    "error-handling",
+    "performance",
+    "concurrency",
+    "scalability",
+    "ux",
+    "accessibility",
+    "internationalization",
+    "over-engineering",
+    "backwards-compatibility",
+    "maintainability",
+]);
+/**
+ * Judges to skip for IaC files (Terraform, Bicep, ARM, Dockerfile).
+ */
+const SKIP_FOR_IAC = new Set([
+    "testing",
+    "code-structure",
+    "concurrency",
+    "over-engineering",
+    "ux",
+    "accessibility",
+    "internationalization",
+    "api-design",
+    "api-contract",
+    "backwards-compatibility",
+    "hallucination-detection",
+    "multi-turn-coherence",
+    "model-fingerprint",
+]);
+// ─── Core judges that always run ─────────────────────────────────────────────
+/** These judges run unconditionally — they cover universally applicable concerns. */
+const ALWAYS_RUN = new Set(["security", "cybersecurity", "false-positive-review"]);
+// ─── Selection logic ─────────────────────────────────────────────────────────
+/**
+ * Select the most relevant judges for a given file context.
+ *
+ * Strategy:
+ * 1. Always include core judges (security, cybersecurity, false-positive-review)
+ * 2. Skip judges with language incompatibility
+ * 3. Skip judges irrelevant to the file category
+ * 4. Return selection with skip reasons for observability
+ */
+export function selectJudges(judges, ctx) {
+    const selected = [];
+    const skipped = [];
+    const lang = ctx.language.toLowerCase();
+    const cat = ctx.fileCategory?.toLowerCase() ?? "";
+    for (const judge of judges) {
+        // Core judges always run
+        if (ALWAYS_RUN.has(judge.id)) {
+            selected.push(judge);
+            continue;
+        }
+        // Language-specific judge: skip if language not in its set
+        const langOnly = LANGUAGE_SPECIFIC[judge.id];
+        if (langOnly && !langOnly.has(lang)) {
+            skipped.push({ judgeId: judge.id, reason: `not relevant for language: ${lang}` });
+            continue;
+        }
+        // Language skip: judge not useful for this language
+        const langSkip = LANGUAGE_SKIP[judge.id];
+        if (langSkip && langSkip.has(lang)) {
+            skipped.push({ judgeId: judge.id, reason: `skipped for language: ${lang}` });
+            continue;
+        }
+        // File category gating
+        if (cat === "test" && SKIP_FOR_TESTS.has(judge.id)) {
+            skipped.push({ judgeId: judge.id, reason: "not relevant for test files" });
+            continue;
+        }
+        if (cat === "config" && SKIP_FOR_CONFIG.has(judge.id)) {
+            skipped.push({ judgeId: judge.id, reason: "not relevant for config files" });
+            continue;
+        }
+        if ((cat === "iac" || lang === "terraform" || lang === "bicep" || lang === "arm" || lang === "dockerfile") &&
+            SKIP_FOR_IAC.has(judge.id)) {
+            skipped.push({ judgeId: judge.id, reason: "not relevant for infrastructure code" });
+            continue;
+        }
+        selected.push(judge);
+    }
+    return { selected, skipped };
+}

package/dist/evaluators/recall-boost.d.ts ADDED Viewed

@@ -0,0 +1,27 @@
+/**
+ * Recall Booster — Additional detection patterns for weak-recall categories
+ *
+ * This module provides supplementary pattern detection for judge categories
+ * where the deterministic evaluators have recall below 85%. It acts as
+ * a second-pass augmentation applied after the primary evaluator.
+ *
+ * Categories strengthened (by recall gap analysis):
+ * - hallucination-detection (46.2% → improved)
+ * - ci-cd (41.7% → improved)
+ * - internationalization (42.9% → improved)
+ * - cost-effectiveness (57.1% → improved)
+ * - documentation (63.6% → improved)
+ * - iac-security (66.7% → improved)
+ * - cloud/cloud-readiness (50-73% → improved)
+ */
+import type { Finding } from "../types.js";
+interface BoostResult {
+    findings: Finding[];
+    boostedCategories: string[];
+}
+/**
+ * Apply recall-boosting patterns to detect issues that primary evaluators miss.
+ * Returns additional findings (does not modify existing ones).
+ */
+export declare function applyRecallBoost(code: string, language: string): BoostResult;
+export {};