@kevinrabun/judges 3.115.4 → 3.117.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. package/agents/accessibility.judge.md +7 -0
  2. package/agents/agent-instructions.judge.md +7 -0
  3. package/agents/ai-code-safety.judge.md +7 -0
  4. package/agents/api-contract.judge.md +7 -0
  5. package/agents/api-design.judge.md +7 -0
  6. package/agents/authentication.judge.md +7 -0
  7. package/agents/backwards-compatibility.judge.md +7 -0
  8. package/agents/caching.judge.md +7 -0
  9. package/agents/ci-cd.judge.md +7 -0
  10. package/agents/cloud-readiness.judge.md +7 -0
  11. package/agents/concurrency.judge.md +7 -0
  12. package/agents/configuration-management.judge.md +7 -0
  13. package/agents/cybersecurity.judge.md +7 -0
  14. package/agents/data-security.judge.md +7 -0
  15. package/agents/dependency-health.judge.md +7 -0
  16. package/agents/documentation.judge.md +7 -0
  17. package/agents/error-handling.judge.md +7 -0
  18. package/agents/ethics-bias.judge.md +7 -0
  19. package/agents/false-positive-review.judge.md +12 -0
  20. package/agents/framework-safety.judge.md +7 -0
  21. package/agents/hallucination-detection.judge.md +13 -0
  22. package/agents/iac-security.judge.md +7 -0
  23. package/agents/intent-alignment.judge.md +13 -0
  24. package/agents/logging-privacy.judge.md +7 -0
  25. package/agents/maintainability.judge.md +7 -0
  26. package/agents/multi-turn-coherence.judge.md +7 -0
  27. package/agents/observability.judge.md +7 -0
  28. package/agents/portability.judge.md +7 -0
  29. package/agents/rate-limiting.judge.md +7 -0
  30. package/agents/reliability.judge.md +7 -0
  31. package/agents/security.judge.md +13 -0
  32. package/agents/testing.judge.md +7 -0
  33. package/agents/ux.judge.md +7 -0
  34. package/dist/a2a-protocol.d.ts +136 -0
  35. package/dist/a2a-protocol.js +218 -0
  36. package/dist/api.d.ts +21 -3
  37. package/dist/api.js +21 -1
  38. package/dist/audit-trail.d.ts +245 -0
  39. package/dist/audit-trail.js +257 -0
  40. package/dist/commands/benchmark-advanced.js +51 -51
  41. package/dist/commands/benchmark-ai-agents.js +16 -16
  42. package/dist/commands/benchmark-compliance-ethics.js +12 -12
  43. package/dist/commands/benchmark-expanded-2.js +2 -2
  44. package/dist/commands/benchmark-expanded.js +2 -2
  45. package/dist/commands/benchmark-infrastructure.js +12 -12
  46. package/dist/commands/benchmark-languages.js +11 -11
  47. package/dist/commands/benchmark-quality-ops.js +7 -7
  48. package/dist/commands/benchmark-security-deep.js +9 -9
  49. package/dist/commands/benchmark.js +1 -1
  50. package/dist/commands/llm-benchmark-optimizer.d.ts +78 -0
  51. package/dist/commands/llm-benchmark-optimizer.js +241 -0
  52. package/dist/commands/llm-benchmark.d.ts +4 -2
  53. package/dist/commands/llm-benchmark.js +40 -12
  54. package/dist/escalation.d.ts +100 -0
  55. package/dist/escalation.js +292 -0
  56. package/dist/evaluation-session.d.ts +74 -0
  57. package/dist/evaluation-session.js +152 -0
  58. package/dist/evaluators/index.d.ts +23 -1
  59. package/dist/evaluators/index.js +192 -3
  60. package/dist/evaluators/judge-selector.d.ts +19 -0
  61. package/dist/evaluators/judge-selector.js +141 -0
  62. package/dist/evaluators/recall-boost.d.ts +27 -0
  63. package/dist/evaluators/recall-boost.js +409 -0
  64. package/dist/feedback-loop.d.ts +62 -0
  65. package/dist/feedback-loop.js +179 -0
  66. package/dist/index.js +2 -0
  67. package/dist/judges/accessibility.js +7 -0
  68. package/dist/judges/agent-instructions.js +7 -0
  69. package/dist/judges/ai-code-safety.js +7 -0
  70. package/dist/judges/api-contract.js +7 -0
  71. package/dist/judges/api-design.js +7 -0
  72. package/dist/judges/authentication.js +7 -0
  73. package/dist/judges/backwards-compatibility.js +7 -0
  74. package/dist/judges/caching.js +7 -0
  75. package/dist/judges/ci-cd.js +7 -0
  76. package/dist/judges/cloud-readiness.js +7 -0
  77. package/dist/judges/concurrency.js +7 -0
  78. package/dist/judges/configuration-management.js +7 -0
  79. package/dist/judges/cybersecurity.js +7 -0
  80. package/dist/judges/data-security.js +7 -0
  81. package/dist/judges/dependency-health.js +7 -0
  82. package/dist/judges/documentation.js +7 -0
  83. package/dist/judges/error-handling.js +7 -0
  84. package/dist/judges/ethics-bias.js +7 -0
  85. package/dist/judges/false-positive-review.js +13 -1
  86. package/dist/judges/framework-safety.js +7 -0
  87. package/dist/judges/hallucination-detection.js +14 -1
  88. package/dist/judges/iac-security.js +7 -0
  89. package/dist/judges/intent-alignment.js +14 -1
  90. package/dist/judges/logging-privacy.js +7 -0
  91. package/dist/judges/maintainability.js +7 -0
  92. package/dist/judges/multi-turn-coherence.js +7 -0
  93. package/dist/judges/observability.js +7 -0
  94. package/dist/judges/portability.js +7 -0
  95. package/dist/judges/rate-limiting.js +7 -0
  96. package/dist/judges/reliability.js +7 -0
  97. package/dist/judges/security.js +14 -1
  98. package/dist/judges/testing.js +7 -0
  99. package/dist/judges/ux.js +7 -0
  100. package/dist/review-conversation.d.ts +87 -0
  101. package/dist/review-conversation.js +307 -0
  102. package/dist/sast-integration.d.ts +112 -0
  103. package/dist/sast-integration.js +215 -0
  104. package/dist/tools/register-evaluation.js +208 -8
  105. package/dist/tools/register-fix.js +24 -1
  106. package/dist/tools/register-resources.d.ts +6 -0
  107. package/dist/tools/register-resources.js +177 -0
  108. package/dist/tools/register-review.js +26 -1
  109. package/dist/tools/register-workflow.js +384 -11
  110. package/dist/tools/validation.d.ts +13 -0
  111. package/dist/tools/validation.js +77 -0
  112. package/dist/types.d.ts +122 -0
  113. package/package.json +25 -12
  114. package/server.json +2 -2
@@ -0,0 +1,241 @@
1
+ /**
2
+ * LLM Benchmark Optimizer — Self-Teaching Feedback Loop
3
+ *
4
+ * Analyzes benchmark snapshots to identify systematic weaknesses
5
+ * (high-FP judges, problematic categories, difficulty gaps) and
6
+ * generates targeted prompt amendments that are applied on the
7
+ * next benchmark run to improve precision without sacrificing recall.
8
+ *
9
+ * Closed loop: run → analyze → amend prompts → run → better scores
10
+ */
11
+ import { JUDGES } from "../judges/index.js";
12
+ // ─── Thresholds ─────────────────────────────────────────────────────────────
13
/**
 * Precision cut-off: a judge whose measured precision is strictly below this
 * value is listed as a "worst judge" and becomes a candidate for an amendment.
 */
const AMENDMENT_PRECISION_THRESHOLD = 0.4;
/**
 * Sample-size guard: judges with fewer total findings than this are ignored,
 * so a handful of findings cannot trigger an amendment.
 */
const MIN_FINDINGS_FOR_AMENDMENT = 5;
/** F1 cut-off: categories scoring strictly below this are reported as insights. */
const CATEGORY_F1_THRESHOLD = 0.5;
/** Detection-rate cut-off: difficulty levels strictly below this are reported. */
const DIFFICULTY_DETECTION_THRESHOLD = 0.8;
/**
 * Fraction of a judge's false positives that one amendment is assumed to
 * remove when projecting the next run's F1 (deliberately conservative).
 */
const FP_REDUCTION_ESTIMATE = 0.35;
23
+ // ─── Core Optimizer ─────────────────────────────────────────────────────────
24
/**
 * Analyze a benchmark snapshot and produce optimization results.
 * This is the main self-teaching entry point.
 *
 * Steps:
 *  1. Judges with enough findings and precision below the amendment threshold
 *     are recorded as "worst judges". An amendment is generated for a judge
 *     when it has none yet, or when its precision has regressed by more than
 *     20% relative to the precision recorded on its existing amendment.
 *  2. Categories with at least two cases and an F1 below the category
 *     threshold are reported.
 *  3. Difficulty levels whose detection rate is below the difficulty
 *     threshold are reported.
 *  4. The F1 gain expected from the newly generated amendments is projected.
 *
 * @param snapshot Benchmark snapshot; reads `perJudge`, `perCategory`,
 *   `perDifficulty`, `f1Score` and (via helpers) `cases`, `timestamp`,
 *   `truePositives`, `falsePositives`, `recall`.
 * @param existingAmendments Optional amendments already in force. Each is
 *   expected to carry `judgePrefix` and `fpRate` (1 - precision at the time
 *   it was generated).
 * @returns `{ amendments, insights, projectedF1Improvement, summary }`.
 */
export function optimizeBenchmark(snapshot, existingAmendments) {
    const amendments = [];
    const insights = [];
    // Index existing amendments by prefix. The first entry for a prefix wins,
    // which matches the semantics of Array.prototype.find.
    const existingByPrefix = new Map();
    for (const a of existingAmendments ?? []) {
        if (!existingByPrefix.has(a.judgePrefix)) {
            existingByPrefix.set(a.judgePrefix, a);
        }
    }
    // 1. Identify high-FP judges and generate amendments (worst precision first)
    const judgeEntries = Object.entries(snapshot.perJudge).sort(([, a], [, b]) => a.precision - b.precision);
    const worstJudges = [];
    for (const [prefix, stats] of judgeEntries) {
        if (stats.total < MIN_FINDINGS_FOR_AMENDMENT)
            continue;
        if (stats.precision < AMENDMENT_PRECISION_THRESHOLD) {
            worstJudges.push(prefix);
            const existing = existingByPrefix.get(prefix);
            // `fpRate` is stored as (1 - precision), so convert it back before
            // comparing. Comparing precision with fpRate directly was always
            // true in this branch and regenerated every amendment on every run.
            const hasRegressed = existing !== undefined && stats.precision < (1 - existing.fpRate) * 0.8;
            if (existing === undefined || hasRegressed) {
                amendments.push(generateAmendment(prefix, stats.precision, stats.falsePositives, stats.total, snapshot));
            }
            insights.push({
                category: "high-fp-judge",
                severity: stats.precision < 0.1 ? "critical" : "high",
                target: prefix,
                metric: stats.precision,
                recommendation: `Judge ${prefix} has ${pct(stats.precision)} precision ` +
                    `(${stats.falsePositives} FP / ${stats.total} findings). ` +
                    (existing !== undefined ? "Existing amendment needs strengthening." : "New amendment generated."),
            });
        }
    }
    // 2. Identify problematic categories
    const worstCategories = [];
    for (const [catName, cat] of Object.entries(snapshot.perCategory)) {
        // A single case is too little evidence to flag a category.
        if (cat.total < 2)
            continue;
        if (cat.f1Score < CATEGORY_F1_THRESHOLD) {
            worstCategories.push(catName);
            insights.push({
                category: catName === "clean" ? "clean-case-leak" : "missed-category",
                severity: cat.f1Score === 0 ? "critical" : "high",
                target: catName,
                metric: cat.f1Score,
                recommendation: catName === "clean"
                    ? `All ${cat.total} clean-code cases produced false positives. ` +
                        `The precision mandate needs strengthening for clean code recognition.`
                    : `Category "${catName}" has F1=${pct(cat.f1Score)}. ` +
                        `Review prompts and benchmark cases for this category.`,
            });
        }
    }
    // 3. Check difficulty gaps
    for (const [diff, stats] of Object.entries(snapshot.perDifficulty)) {
        if (stats.detectionRate < DIFFICULTY_DETECTION_THRESHOLD) {
            insights.push({
                category: "difficulty-gap",
                severity: "medium",
                target: diff,
                metric: stats.detectionRate,
                recommendation: `${diff} cases: ${pct(stats.detectionRate)} detection rate. ` +
                    `Consider adding targeted training examples for this difficulty level.`,
            });
        }
    }
    // 4. Project improvement from the newly generated amendments only
    const { projectedF1, projectedImprovement } = projectImprovement(snapshot, amendments);
    return {
        amendments,
        insights,
        projectedF1Improvement: projectedImprovement,
        summary: {
            worstJudges,
            worstCategories,
            amendmentsGenerated: amendments.length,
            currentF1: snapshot.f1Score,
            projectedF1,
        },
    };
}
105
+ // ─── Amendment Generation ───────────────────────────────────────────────────
106
/**
 * Build the prompt amendment for one judge from the false positives recorded
 * in the snapshot.
 *
 * @param prefix Rule prefix of the judge (matched against `prefix + "-"`).
 * @param precision Measured precision of the judge in this snapshot.
 * @param fpCount Number of false positives attributed to the judge.
 * @param total Total number of findings attributed to the judge.
 * @param snapshot Benchmark snapshot; reads `cases[*].caseId`, `.category`,
 *   `.falsePositiveRuleIds` and `timestamp`.
 * @returns An amendment record: `judgePrefix`, `amendment` (prompt text),
 *   `reason`, `fpRate` (1 - precision), `generatedFrom`, `timestamp`.
 */
function generateAmendment(prefix, precision, fpCount, total, snapshot) {
    const judge = JUDGES.find((j) => j.rulePrefix === prefix);
    const judgeName = judge?.name ?? `Judge ${prefix}`;
    const domain = judge?.domain ?? "its domain";
    // Analyze what the FPs look like — which categories get falsely flagged
    const fpCategories = new Map();
    // Collect a bounded sample of FP cases for pattern extraction
    const fpCaseExamples = [];
    // Count clean-code FPs across ALL matching false positives. Deriving the
    // count from the capped example list under-reported it once a judge had
    // more than 10 false positives.
    let cleanFPs = 0;
    for (const c of snapshot.cases) {
        for (const fp of c.falsePositiveRuleIds) {
            if (fp.startsWith(prefix + "-")) {
                fpCategories.set(c.category, (fpCategories.get(c.category) ?? 0) + 1);
                if (c.category === "clean" || c.category.startsWith("ai-negative")) {
                    cleanFPs++;
                }
                if (fpCaseExamples.length < 10) {
                    fpCaseExamples.push({ caseId: c.caseId, category: c.category, ruleId: fp });
                }
            }
        }
    }
    // Five categories with the most false positives, most frequent first
    const topFpCategories = [...fpCategories.entries()]
        .sort((a, b) => b[1] - a[1])
        .slice(0, 5)
        .map(([cat]) => cat);
    // Build specific anti-FP instructions based on observed patterns
    const categoryBlocklist = topFpCategories.length > 0
        ? `\nDo NOT report ${prefix}- findings on code in these categories: ${topFpCategories.join(", ")}. ` +
            `These categories fall outside ${domain} and historically produce false positives.`
        : "";
    // Up to five distinct rule IDs drawn from the sampled examples
    const fpRuleIds = new Set(fpCaseExamples.map((e) => e.ruleId));
    const specificRules = [...fpRuleIds].slice(0, 5).join(", ");
    const ruleWarning = specificRules
        ? `\nSpecific rule IDs with high FP rates: ${specificRules}. Require >=80% confidence with exact line citations before reporting these.`
        : "";
    // Extra warning when the judge flags clean / negative-example code
    const cleanWarning = cleanFPs > 0
        ? `\nThis judge produced ${cleanFPs} false positives on CLEAN code. Well-written code using standard patterns exists. If the code follows established best practices, report ZERO ${prefix}- findings.`
        : "";
    const amendment = `PRECISION OVERRIDE for ${judgeName} (${prefix}-): ` +
        `Empirical precision: ${pct(precision)} (${fpCount} FP in ${total} findings). ` +
        `SCOPE: Only report ${prefix}- findings for code that specifically involves ${domain}. ` +
        `EVIDENCE: Every ${prefix}- finding MUST cite exact line numbers and specific code patterns.` +
        categoryBlocklist +
        ruleWarning +
        cleanWarning +
        ` When confidence is below 80%, OMIT the ${prefix}- finding.`;
    return {
        judgePrefix: prefix,
        amendment,
        reason: `${pct(precision)} precision (${fpCount} FP out of ${total} findings)`,
        fpRate: 1 - precision,
        // First 10 characters of the snapshot timestamp (the date part of an
        // ISO-8601 string, if that is the format used — not visible here).
        generatedFrom: `benchmark-${snapshot.timestamp.slice(0, 10)}`,
        timestamp: new Date().toISOString(),
    };
}
161
+ // ─── Prompt Section Formatting ──────────────────────────────────────────────
162
/**
 * Render amendments as a markdown section for injection into tribunal or
 * per-judge prompts.
 *
 * @param amendments Amendment records; reads `judgePrefix` and `amendment`.
 * @returns The section text ending in a newline, or "" when the list is empty.
 */
export function formatAmendmentSection(amendments) {
    if (amendments.length === 0) {
        return "";
    }
    const heading = "## Precision Overrides — Based on Empirical Benchmark Data";
    const preamble = "The following judges have been identified as having high false positive rates. " +
        "Apply EXTRA scrutiny before reporting findings with these prefixes. " +
        "False positives erode developer trust more than missed findings.";
    const bullets = amendments.map((entry) => `- **${entry.judgePrefix}-**: ${entry.amendment}`);
    // The trailing empty element yields the final newline after the last bullet.
    return [heading, "", preamble, "", ...bullets, ""].join("\n");
}
183
+ // ─── Amendment Store Operations ─────────────────────────────────────────────
184
/**
 * Create a fresh amendment store: schema version 1, with no amendments and
 * no run history. Each call returns new arrays.
 */
export function createEmptyStore() {
    const emptyStore = {
        version: 1,
        amendments: [],
        history: [],
    };
    return emptyStore;
}
187
/**
 * Merge newly generated amendments into an existing store.
 *
 * Amendments are keyed by `judgePrefix`; a new amendment replaces a stored
 * one with the same prefix but keeps that prefix's original position.
 * No amendment is ever dropped here: entries for judges that have since
 * improved stay in the store.
 *
 * @param store Current store; reads `amendments` and `history`.
 * @param result Optimizer result; reads `amendments`.
 * @param snapshotF1 F1 score of the snapshot that produced `result`.
 * @returns A new store object (version 1). Its history is the previous
 *   history trimmed to its 19 most recent entries plus one entry for this
 *   merge, i.e. at most 20 entries.
 */
export function mergeAmendments(store, result, snapshotF1) {
    const byPrefix = new Map();
    // Stored entries go in first so that new ones overwrite them in place.
    for (const entry of [...store.amendments, ...result.amendments]) {
        byPrefix.set(entry.judgePrefix, entry);
    }
    const runRecord = {
        timestamp: new Date().toISOString(),
        snapshotF1,
        amendmentsApplied: store.amendments.length,
        amendmentsGenerated: result.amendments.length,
    };
    const recentHistory = store.history.slice(-19);
    return {
        version: 1,
        amendments: Array.from(byPrefix.values()),
        history: recentHistory.concat([runRecord]),
    };
}
218
+ // ─── Helpers ────────────────────────────────────────────────────────────────
219
/** Format a fraction (e.g. 0.256) as a percentage with one decimal ("25.6%"). */
function pct(n) {
    const scaled = n * 100;
    return scaled.toFixed(1) + "%";
}
222
/**
 * Estimate the F1 score of the next run if the given amendments take effect.
 *
 * Each amendment is assumed to remove FP_REDUCTION_ESTIMATE of its judge's
 * false positives; true positives and recall are assumed unchanged.
 *
 * @param snapshot Reads `f1Score`, `falsePositives`, `truePositives`,
 *   `recall` and `perJudge[prefix].falsePositives`.
 * @param newAmendments Amendments generated in this run; reads `judgePrefix`.
 * @returns `{ projectedF1, projectedImprovement }`. With no amendments this
 *   is the current F1 and an improvement of 0.
 */
function projectImprovement(snapshot, newAmendments) {
    if (newAmendments.length === 0) {
        return { projectedF1: snapshot.f1Score, projectedImprovement: 0 };
    }
    // Sum the expected FP reduction; amendments whose judge has no stats add nothing.
    const removedFP = newAmendments.reduce((sum, entry) => {
        const perJudge = snapshot.perJudge[entry.judgePrefix];
        return perJudge ? sum + perJudge.falsePositives * FP_REDUCTION_ESTIMATE : sum;
    }, 0);
    const remainingFP = Math.max(0, snapshot.falsePositives - removedFP);
    // Precision = TP / (TP + FP); defined as 1 when there are no positives at all.
    const positives = snapshot.truePositives + remainingFP;
    let projectedPrecision = 1;
    if (positives > 0) {
        projectedPrecision = snapshot.truePositives / positives;
    }
    // F1 = harmonic mean of precision and recall; 0 when both are 0.
    const prSum = projectedPrecision + snapshot.recall;
    let projectedF1 = 0;
    if (prSum > 0) {
        projectedF1 = (2 * projectedPrecision * snapshot.recall) / prSum;
    }
    return {
        projectedF1,
        projectedImprovement: projectedF1 - snapshot.f1Score,
    };
}
@@ -15,6 +15,8 @@
15
15
  */
16
16
  import type { JudgeDefinition } from "../types.js";
17
17
  import type { BenchmarkCase, CategoryResult, JudgeBenchmarkResult, DifficultyResult } from "./benchmark.js";
18
+ import type { PromptAmendment } from "./llm-benchmark-optimizer.js";
19
+ export declare const TRIBUNAL_JUDGES: JudgeDefinition[];
18
20
  export interface LlmBenchmarkSnapshot {
19
21
  /** Timestamp of this LLM benchmark run */
20
22
  timestamp: string;
@@ -90,11 +92,11 @@ export declare function extractValidatedLlmFindings(response: string, prefixes?:
90
92
  * Uses condensed criteria (adversarial mandate stripped) plus shared mandates,
91
93
  * mirroring the tribunal architecture for consistency and better precision.
92
94
  */
93
- export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string, contextSnippets?: string[]): string;
95
+ export declare function constructPerJudgePrompt(judge: JudgeDefinition, code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[]): string;
94
96
  /**
95
97
  * Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
96
98
  */
97
- export declare function constructTribunalPrompt(code: string, language: string, contextSnippets?: string[]): string;
99
+ export declare function constructTribunalPrompt(code: string, language: string, contextSnippets?: string[], amendments?: PromptAmendment[]): string;
98
100
  /**
99
101
  * Select a stratified sample of benchmark cases, ensuring representation
100
102
  * across categories, difficulties, and both clean/dirty cases.
@@ -16,6 +16,12 @@
16
16
  import { JUDGES } from "../judges/index.js";
17
17
  import { getCondensedCriteria, SHARED_ADVERSARIAL_MANDATE, PRECISION_MANDATE } from "../tools/prompts.js";
18
18
  import { extractAndValidateLlmFindings, mergeFindings } from "../probabilistic/llm-response-validator.js";
19
+ import { formatAmendmentSection } from "./llm-benchmark-optimizer.js";
20
+ // ─── Tribunal Judge Filtering ───────────────────────────────────────────────
21
+ // Meta-judges that assess analysis quality rather than code quality produce
22
+ // near-100% false positives in single-pass tribunal mode and are excluded.
23
+ const TRIBUNAL_EXCLUDED_PREFIXES = new Set(["INTENT", "COH", "MFPR", "FPR", "OVER"]);
24
+ export const TRIBUNAL_JUDGES = JUDGES.filter((j) => !TRIBUNAL_EXCLUDED_PREFIXES.has(j.rulePrefix));
19
25
  // ─── Rule ID Parsing ────────────────────────────────────────────────────────
20
26
  /**
21
27
  * Extract unique rule IDs from LLM response text.
@@ -55,15 +61,19 @@ export function extractValidatedLlmFindings(response, prefixes) {
55
61
  * Uses condensed criteria (adversarial mandate stripped) plus shared mandates,
56
62
  * mirroring the tribunal architecture for consistency and better precision.
57
63
  */
58
- export function constructPerJudgePrompt(judge, code, language, contextSnippets = []) {
64
+ export function constructPerJudgePrompt(judge, code, language, contextSnippets = [], amendments) {
59
65
  const persona = judge.systemPrompt.substring(0, judge.systemPrompt.indexOf("\n\n"));
60
66
  const criteria = getCondensedCriteria(judge.systemPrompt);
61
67
  const contextSection = contextSnippets.length
62
68
  ? `## Repository Context\n\n${contextSnippets.map((s) => `- ${s.replace(/\n/g, " ")}`).join("\n")}\n\n`
63
69
  : "";
70
+ // Filter amendments to only those relevant to this judge
71
+ const relevantAmendments = (amendments ?? []).filter((a) => a.judgePrefix === judge.rulePrefix);
72
+ const amendmentSection = formatAmendmentSection(relevantAmendments);
64
73
  return (`${persona}\n\n` +
65
74
  `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
66
75
  `${PRECISION_MANDATE}\n\n` +
76
+ (amendmentSection ? `${amendmentSection}\n` : "") +
67
77
  contextSection +
68
78
  `${criteria}\n\n` +
69
79
  `Please evaluate the following ${language} code:\n\n\`\`\`${language}\n${code}\n\`\`\`` +
@@ -72,18 +82,25 @@ export function constructPerJudgePrompt(judge, code, language, contextSnippets =
72
82
  /**
73
83
  * Construct the full-tribunal prompt — identical to the MCP-served `full-tribunal` prompt.
74
84
  */
75
- export function constructTribunalPrompt(code, language, contextSnippets = []) {
76
- const judgeInstructions = JUDGES.map((j) => `### ${j.name} — ${j.domain}\n**Rule prefix:** \`${j.rulePrefix}-\`\n\n${getCondensedCriteria(j.systemPrompt)}`).join("\n\n---\n\n");
85
+ export function constructTribunalPrompt(code, language, contextSnippets = [], amendments) {
86
+ const judgeInstructions = TRIBUNAL_JUDGES.map((j) => `### ${j.name} — ${j.domain}\n**Rule prefix:** \`${j.rulePrefix}-\`\n\n${getCondensedCriteria(j.systemPrompt)}`).join("\n\n---\n\n");
77
87
  const contextSection = contextSnippets.length
78
88
  ? `## Repository Context\n\n${contextSnippets.map((s) => `- ${s.replace(/\n/g, " ")}`).join("\n")}\n\n`
79
89
  : "";
80
- return (`You are the Judges Panel — a panel of ${JUDGES.length} expert judges who independently evaluate code for quality, security, and operational readiness.\n\n` +
90
+ const amendmentSection = formatAmendmentSection(amendments ?? []);
91
+ return (`You are the Judges Panel — a panel of ${TRIBUNAL_JUDGES.length} expert judges who independently evaluate code for quality, security, and operational readiness.\n\n` +
81
92
  `## Universal Evaluation Directives\n\n` +
82
93
  `${SHARED_ADVERSARIAL_MANDATE}\n\n` +
83
94
  `${PRECISION_MANDATE}\n\n` +
95
+ `DOMAIN SCOPE DIRECTIVE (applies to ALL judges):\n` +
96
+ `- Each judge MUST only report findings within their stated domain expertise.\n` +
97
+ `- A CI/CD judge should NOT report authentication findings. An ethics judge should NOT report performance findings.\n` +
98
+ `- If code falls entirely outside your domain (e.g., a YAML CI workflow being evaluated by the Database judge), report ZERO findings for that judge.\n` +
99
+ `- Cross-domain observations should ONLY be reported by the judge whose domain they fall under.\n\n` +
100
+ (amendmentSection ? `${amendmentSection}\n` : "") +
84
101
  contextSection +
85
102
  `## Evaluation Instructions\n\n` +
86
- `Evaluate the following ${language} code from the perspective of ALL ${JUDGES.length} judges below. For each judge, provide:\n` +
103
+ `Evaluate the following ${language} code from the perspective of ALL ${TRIBUNAL_JUDGES.length} judges below. For each judge, provide:\n` +
87
104
  `1. Judge name and domain\n` +
88
105
  `2. Verdict (PASS / WARNING / FAIL)\n` +
89
106
  `3. Score (0-100)\n` +
@@ -166,13 +183,24 @@ export function scoreLlmCase(tc, detectedRuleIds, rawResponse, tokensUsed) {
166
183
  const prefix = expected.split("-")[0];
167
184
  return !detectedPrefixes.has(prefix);
168
185
  });
169
- const falsePositiveIds = tc.unexpectedRuleIds
170
- ? detectedRuleIds.filter((found) => {
171
- const prefix = found.split("-")[0];
172
- return tc.unexpectedRuleIds.some((u) => u.split("-")[0] === prefix);
173
- })
174
- : [];
175
- const casePassed = tc.expectedRuleIds.length === 0 ? falsePositiveIds.length === 0 : matchedExpected.length > 0;
186
+ // For clean cases (no expected findings), ALL detections are false positives.
187
+ // For dirty cases with unexpectedRuleIds, FPs are detections matching those prefixes.
188
+ // For dirty cases WITHOUT unexpectedRuleIds, FPs are detections whose prefix
189
+ // doesn't match any expected prefix (prevents silent over-reporting).
190
+ const isCleanCase = tc.expectedRuleIds.length === 0;
191
+ const expectedPrefixes = new Set(tc.expectedRuleIds.map((r) => r.split("-")[0]));
192
+ const falsePositiveIds = isCleanCase
193
+ ? detectedRuleIds
194
+ : tc.unexpectedRuleIds
195
+ ? detectedRuleIds.filter((found) => {
196
+ const prefix = found.split("-")[0];
197
+ return tc.unexpectedRuleIds.some((u) => u.split("-")[0] === prefix);
198
+ })
199
+ : detectedRuleIds.filter((found) => {
200
+ const prefix = found.split("-")[0];
201
+ return !expectedPrefixes.has(prefix);
202
+ });
203
+ const casePassed = isCleanCase ? falsePositiveIds.length === 0 : matchedExpected.length > 0;
176
204
  return {
177
205
  caseId: tc.id,
178
206
  category: tc.category,
@@ -0,0 +1,100 @@
1
+ /**
2
+ * Human Escalation Protocol
3
+ *
4
+ * Routes low-confidence findings to human reviewers instead of auto-actioning.
5
+ * Provides a structured escalation workflow with reasons, routing suggestions,
6
+ * and a persistent escalation queue.
7
+ *
8
+ * Data stored in .judges-escalations.json
9
+ */
10
+ import type { Finding, Severity, TribunalVerdict, ReviewDecision } from "./types.js";
11
+ import { type DataAdapter } from "./data-adapter.js";
12
+ export type EscalationReason = "low-confidence" | "conflicting-judges" | "novel-pattern" | "cross-file-uncertainty" | "ai-generated-code" | "compliance-sensitive" | "security-critical-low-evidence";
13
+ export type EscalationStatus = "pending" | "acknowledged" | "resolved" | "dismissed";
14
+ export type EscalationRouting = "security-team" | "senior-developer" | "tech-lead" | "compliance-officer" | "any-human";
15
+ export interface EscalatedFinding {
16
+ /** Unique escalation ID */
17
+ escalationId: string;
18
+ /** The finding that triggered escalation */
19
+ finding: Finding;
20
+ /** File where the finding was detected */
21
+ filePath: string;
22
+ /** Why this finding was escalated */
23
+ reasons: EscalationReason[];
24
+ /** Suggested routing — which team/role should review */
25
+ routing: EscalationRouting;
26
+ /** Human-readable explanation of why escalation is needed */
27
+ explanation: string;
28
+ /** Current status */
29
+ status: EscalationStatus;
30
+ /** When the escalation was created */
31
+ createdAt: string;
32
+ /** When the escalation was resolved/dismissed */
33
+ resolvedAt?: string;
34
+ /** Who resolved it */
35
+ resolvedBy?: string;
36
+ /** Resolution notes */
37
+ resolutionNotes?: string;
38
+ }
39
+ export interface EscalationStore {
40
+ version: string;
41
+ escalations: EscalatedFinding[];
42
+ lastUpdated: string;
43
+ }
44
+ export interface EscalationSummary {
45
+ /** Total escalations in queue */
46
+ total: number;
47
+ /** Count by status */
48
+ pending: number;
49
+ acknowledged: number;
50
+ resolved: number;
51
+ dismissed: number;
52
+ /** Count by routing target */
53
+ byRouting: Record<string, number>;
54
+ /** Count by reason */
55
+ byReason: Record<string, number>;
56
+ /** Oldest pending escalation age in hours */
57
+ oldestPendingHours: number;
58
+ }
59
+ export interface EscalationPolicy {
60
+ /** Confidence threshold below which findings are escalated (default: from config) */
61
+ confidenceThreshold?: number;
62
+ /** Severity levels that always escalate when confidence is below threshold */
63
+ alwaysEscalateSeverities?: Severity[];
64
+ /** Rule prefixes that always escalate regardless of confidence */
65
+ alwaysEscalatePrefixes?: string[];
66
+ /** Maximum pending escalations before blocking (0 = no limit) */
67
+ maxPendingBeforeBlock?: number;
68
+ }
69
+ export declare function loadEscalationStore(dir?: string): EscalationStore;
70
+ export declare function saveEscalationStore(store: EscalationStore, dir?: string): void;
71
+ export declare function loadEscalationsViaAdapter(projectDir: string, adapter?: DataAdapter): Promise<EscalationStore>;
72
+ export declare function saveEscalationsViaAdapter(store: EscalationStore, projectDir: string, adapter?: DataAdapter): Promise<void>;
73
+ /**
74
+ * Evaluate which findings in a tribunal verdict need human escalation.
75
+ * Mutates findings to set `needsHumanReview` and returns the escalation records.
76
+ */
77
+ export declare function evaluateEscalations(verdict: TribunalVerdict, filePath: string, policy?: EscalationPolicy): EscalatedFinding[];
78
+ /**
79
+ * Resolve an escalation — mark it as resolved or dismissed.
80
+ */
81
+ export declare function resolveEscalation(store: EscalationStore, escalationId: string, resolution: {
82
+ status: "resolved" | "dismissed";
83
+ resolvedBy?: string;
84
+ notes?: string;
85
+ }): boolean;
86
+ /**
87
+ * Compute summary statistics for the escalation queue.
88
+ */
89
+ export declare function computeEscalationSummary(store: EscalationStore): EscalationSummary;
90
+ /**
91
+ * Check whether the escalation queue should block a merge.
92
+ * Blocks when pending escalations exceed the policy limit.
93
+ */
94
+ export declare function shouldBlockOnEscalations(store: EscalationStore, policy?: EscalationPolicy): boolean;
95
+ /**
96
+ * Enhance a ReviewDecision with escalation information.
97
+ * When escalations exist, the review action may be upgraded to "request-changes"
98
+ * to ensure a human signs off.
99
+ */
100
+ export declare function enhanceReviewWithEscalations(decision: ReviewDecision, escalations: EscalatedFinding[]): ReviewDecision;