@kevinrabun/judges 3.115.4 → 3.117.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/agents/accessibility.judge.md +7 -0
- package/agents/agent-instructions.judge.md +7 -0
- package/agents/ai-code-safety.judge.md +7 -0
- package/agents/api-contract.judge.md +7 -0
- package/agents/api-design.judge.md +7 -0
- package/agents/authentication.judge.md +7 -0
- package/agents/backwards-compatibility.judge.md +7 -0
- package/agents/caching.judge.md +7 -0
- package/agents/ci-cd.judge.md +7 -0
- package/agents/cloud-readiness.judge.md +7 -0
- package/agents/concurrency.judge.md +7 -0
- package/agents/configuration-management.judge.md +7 -0
- package/agents/cybersecurity.judge.md +7 -0
- package/agents/data-security.judge.md +7 -0
- package/agents/dependency-health.judge.md +7 -0
- package/agents/documentation.judge.md +7 -0
- package/agents/error-handling.judge.md +7 -0
- package/agents/ethics-bias.judge.md +7 -0
- package/agents/false-positive-review.judge.md +12 -0
- package/agents/framework-safety.judge.md +7 -0
- package/agents/hallucination-detection.judge.md +13 -0
- package/agents/iac-security.judge.md +7 -0
- package/agents/intent-alignment.judge.md +13 -0
- package/agents/logging-privacy.judge.md +7 -0
- package/agents/maintainability.judge.md +7 -0
- package/agents/multi-turn-coherence.judge.md +7 -0
- package/agents/observability.judge.md +7 -0
- package/agents/portability.judge.md +7 -0
- package/agents/rate-limiting.judge.md +7 -0
- package/agents/reliability.judge.md +7 -0
- package/agents/security.judge.md +13 -0
- package/agents/testing.judge.md +7 -0
- package/agents/ux.judge.md +7 -0
- package/dist/a2a-protocol.d.ts +136 -0
- package/dist/a2a-protocol.js +218 -0
- package/dist/api.d.ts +21 -3
- package/dist/api.js +21 -1
- package/dist/audit-trail.d.ts +245 -0
- package/dist/audit-trail.js +257 -0
- package/dist/commands/benchmark-advanced.js +51 -51
- package/dist/commands/benchmark-ai-agents.js +16 -16
- package/dist/commands/benchmark-compliance-ethics.js +12 -12
- package/dist/commands/benchmark-expanded-2.js +2 -2
- package/dist/commands/benchmark-expanded.js +2 -2
- package/dist/commands/benchmark-infrastructure.js +12 -12
- package/dist/commands/benchmark-languages.js +11 -11
- package/dist/commands/benchmark-quality-ops.js +7 -7
- package/dist/commands/benchmark-security-deep.js +9 -9
- package/dist/commands/benchmark.js +1 -1
- package/dist/commands/llm-benchmark-optimizer.d.ts +78 -0
- package/dist/commands/llm-benchmark-optimizer.js +241 -0
- package/dist/commands/llm-benchmark.d.ts +4 -2
- package/dist/commands/llm-benchmark.js +40 -12
- package/dist/escalation.d.ts +100 -0
- package/dist/escalation.js +292 -0
- package/dist/evaluation-session.d.ts +74 -0
- package/dist/evaluation-session.js +152 -0
- package/dist/evaluators/index.d.ts +23 -1
- package/dist/evaluators/index.js +192 -3
- package/dist/evaluators/judge-selector.d.ts +19 -0
- package/dist/evaluators/judge-selector.js +141 -0
- package/dist/evaluators/recall-boost.d.ts +27 -0
- package/dist/evaluators/recall-boost.js +409 -0
- package/dist/feedback-loop.d.ts +62 -0
- package/dist/feedback-loop.js +179 -0
- package/dist/index.js +2 -0
- package/dist/judges/accessibility.js +7 -0
- package/dist/judges/agent-instructions.js +7 -0
- package/dist/judges/ai-code-safety.js +7 -0
- package/dist/judges/api-contract.js +7 -0
- package/dist/judges/api-design.js +7 -0
- package/dist/judges/authentication.js +7 -0
- package/dist/judges/backwards-compatibility.js +7 -0
- package/dist/judges/caching.js +7 -0
- package/dist/judges/ci-cd.js +7 -0
- package/dist/judges/cloud-readiness.js +7 -0
- package/dist/judges/concurrency.js +7 -0
- package/dist/judges/configuration-management.js +7 -0
- package/dist/judges/cybersecurity.js +7 -0
- package/dist/judges/data-security.js +7 -0
- package/dist/judges/dependency-health.js +7 -0
- package/dist/judges/documentation.js +7 -0
- package/dist/judges/error-handling.js +7 -0
- package/dist/judges/ethics-bias.js +7 -0
- package/dist/judges/false-positive-review.js +13 -1
- package/dist/judges/framework-safety.js +7 -0
- package/dist/judges/hallucination-detection.js +14 -1
- package/dist/judges/iac-security.js +7 -0
- package/dist/judges/intent-alignment.js +14 -1
- package/dist/judges/logging-privacy.js +7 -0
- package/dist/judges/maintainability.js +7 -0
- package/dist/judges/multi-turn-coherence.js +7 -0
- package/dist/judges/observability.js +7 -0
- package/dist/judges/portability.js +7 -0
- package/dist/judges/rate-limiting.js +7 -0
- package/dist/judges/reliability.js +7 -0
- package/dist/judges/security.js +14 -1
- package/dist/judges/testing.js +7 -0
- package/dist/judges/ux.js +7 -0
- package/dist/review-conversation.d.ts +87 -0
- package/dist/review-conversation.js +307 -0
- package/dist/sast-integration.d.ts +112 -0
- package/dist/sast-integration.js +215 -0
- package/dist/tools/register-evaluation.js +208 -8
- package/dist/tools/register-fix.js +24 -1
- package/dist/tools/register-resources.d.ts +6 -0
- package/dist/tools/register-resources.js +177 -0
- package/dist/tools/register-review.js +26 -1
- package/dist/tools/register-workflow.js +384 -11
- package/dist/tools/validation.d.ts +13 -0
- package/dist/tools/validation.js +77 -0
- package/dist/types.d.ts +122 -0
- package/package.json +25 -12
- package/server.json +2 -2
|
@@ -5,10 +5,12 @@ import { z } from "zod";
|
|
|
5
5
|
import { readFileSync, existsSync } from "fs";
|
|
6
6
|
import { extname } from "path";
|
|
7
7
|
import { JUDGES, getJudge, getJudgeSummaries } from "../judges/index.js";
|
|
8
|
-
import { evaluateWithJudge, evaluateWithTribunal, formatVerdictAsMarkdown, formatEvaluationAsMarkdown, } from "../evaluators/index.js";
|
|
8
|
+
import { evaluateWithJudge, evaluateWithTribunal, evaluateWithTribunalStreaming, formatVerdictAsMarkdown, formatEvaluationAsMarkdown, } from "../evaluators/index.js";
|
|
9
9
|
import { evaluateCodeV2, evaluateProjectV2, getSupportedPolicyProfiles } from "../evaluators/v2.js";
|
|
10
10
|
import { detectProjectContext } from "../evaluators/shared.js";
|
|
11
|
+
import { getGlobalSession } from "../evaluation-session.js";
|
|
11
12
|
import { configSchema, toJudgesConfig } from "./schemas.js";
|
|
13
|
+
import { validateCodeSize } from "./validation.js";
|
|
12
14
|
import { buildSingleJudgeDeepReviewSection, buildTribunalDeepReviewSection } from "./deep-review.js";
|
|
13
15
|
/**
|
|
14
16
|
* Register evaluation-focused tools: get_judges, evaluate_code,
|
|
@@ -20,6 +22,7 @@ export function registerEvaluationTools(server) {
|
|
|
20
22
|
registerEvaluateSingleJudge(server);
|
|
21
23
|
registerEvaluateV2(server);
|
|
22
24
|
registerEvaluateFile(server);
|
|
25
|
+
registerEvaluateCodeStreaming(server);
|
|
23
26
|
}
|
|
24
27
|
// ─── get_judges ──────────────────────────────────────────────────────────────
|
|
25
28
|
function registerGetJudges(server) {
|
|
@@ -34,6 +37,15 @@ function registerGetJudges(server) {
|
|
|
34
37
|
type: "text",
|
|
35
38
|
text: `# Judges Panel\n\n${text}`,
|
|
36
39
|
},
|
|
40
|
+
{
|
|
41
|
+
type: "text",
|
|
42
|
+
text: "```json\n" +
|
|
43
|
+
JSON.stringify({
|
|
44
|
+
judgeCount: judges.length,
|
|
45
|
+
judges: judges.map((j) => ({ id: j.id, name: j.name, domain: j.domain })),
|
|
46
|
+
}, null, 2) +
|
|
47
|
+
"\n```",
|
|
48
|
+
},
|
|
37
49
|
],
|
|
38
50
|
};
|
|
39
51
|
});
|
|
@@ -70,20 +82,52 @@ function registerEvaluateCode(server) {
|
|
|
70
82
|
config: configSchema,
|
|
71
83
|
}, async ({ code, language, context, includeAstFindings, minConfidence, relatedFiles, config }) => {
|
|
72
84
|
try {
|
|
85
|
+
const sizeError = validateCodeSize(code);
|
|
86
|
+
if (sizeError) {
|
|
87
|
+
return { content: [{ type: "text", text: `Error: ${sizeError}` }], isError: true };
|
|
88
|
+
}
|
|
89
|
+
const session = getGlobalSession();
|
|
73
90
|
const verdict = evaluateWithTribunal(code, language, context, {
|
|
74
91
|
includeAstFindings,
|
|
75
92
|
minConfidence,
|
|
76
93
|
config: toJudgesConfig(config),
|
|
94
|
+
adaptiveSelection: true,
|
|
95
|
+
filePath: context,
|
|
77
96
|
});
|
|
97
|
+
// Track evaluation in session
|
|
98
|
+
session.recordEvaluation(context ?? `<inline:${language}>`, code, verdict);
|
|
78
99
|
const projectContext = detectProjectContext(code, language);
|
|
79
100
|
const patternResults = formatVerdictAsMarkdown(verdict);
|
|
80
101
|
const deepReview = buildTribunalDeepReviewSection(JUDGES, language, context, relatedFiles, projectContext);
|
|
102
|
+
// Structured JSON content block for programmatic consumption
|
|
103
|
+
const structuredData = {
|
|
104
|
+
score: verdict.overallScore,
|
|
105
|
+
verdict: verdict.overallVerdict,
|
|
106
|
+
findingCount: verdict.findings.length,
|
|
107
|
+
criticalCount: verdict.findings.filter((f) => f.severity === "critical").length,
|
|
108
|
+
highCount: verdict.findings.filter((f) => f.severity === "high").length,
|
|
109
|
+
judgesRun: verdict.evaluations.length,
|
|
110
|
+
findings: verdict.findings.map((f) => ({
|
|
111
|
+
ruleId: f.ruleId,
|
|
112
|
+
severity: f.severity,
|
|
113
|
+
title: f.title,
|
|
114
|
+
lineNumbers: f.lineNumbers,
|
|
115
|
+
confidence: f.confidence,
|
|
116
|
+
})),
|
|
117
|
+
sessionStats: {
|
|
118
|
+
evaluationCount: session.evaluationCount,
|
|
119
|
+
},
|
|
120
|
+
};
|
|
81
121
|
return {
|
|
82
122
|
content: [
|
|
83
123
|
{
|
|
84
124
|
type: "text",
|
|
85
125
|
text: patternResults + deepReview,
|
|
86
126
|
},
|
|
127
|
+
{
|
|
128
|
+
type: "text",
|
|
129
|
+
text: "```json\n" + JSON.stringify(structuredData, null, 2) + "\n```",
|
|
130
|
+
},
|
|
87
131
|
],
|
|
88
132
|
};
|
|
89
133
|
}
|
|
@@ -130,6 +174,10 @@ function registerEvaluateSingleJudge(server) {
|
|
|
130
174
|
config: configSchema,
|
|
131
175
|
}, async ({ code, language, judgeId, context, minConfidence, relatedFiles, config }) => {
|
|
132
176
|
try {
|
|
177
|
+
const sizeError = validateCodeSize(code);
|
|
178
|
+
if (sizeError) {
|
|
179
|
+
return { content: [{ type: "text", text: `Error: ${sizeError}` }], isError: true };
|
|
180
|
+
}
|
|
133
181
|
const judge = getJudge(judgeId);
|
|
134
182
|
if (!judge) {
|
|
135
183
|
return {
|
|
@@ -149,12 +197,25 @@ function registerEvaluateSingleJudge(server) {
|
|
|
149
197
|
const projectContext = detectProjectContext(code, language);
|
|
150
198
|
const patternResults = formatEvaluationAsMarkdown(evaluation);
|
|
151
199
|
const deepReview = buildSingleJudgeDeepReviewSection(judge, language, context, relatedFiles, projectContext);
|
|
200
|
+
const structured = {
|
|
201
|
+
judgeId,
|
|
202
|
+
judgeName: judge.name,
|
|
203
|
+
domain: judge.domain,
|
|
204
|
+
score: evaluation.score,
|
|
205
|
+
verdict: evaluation.verdict,
|
|
206
|
+
findingCount: evaluation.findings.length,
|
|
207
|
+
findings: evaluation.findings.map((f) => ({
|
|
208
|
+
ruleId: f.ruleId,
|
|
209
|
+
severity: f.severity,
|
|
210
|
+
title: f.title,
|
|
211
|
+
lineNumbers: f.lineNumbers,
|
|
212
|
+
confidence: f.confidence,
|
|
213
|
+
})),
|
|
214
|
+
};
|
|
152
215
|
return {
|
|
153
216
|
content: [
|
|
154
|
-
{
|
|
155
|
-
|
|
156
|
-
text: patternResults + deepReview,
|
|
157
|
-
},
|
|
217
|
+
{ type: "text", text: patternResults + deepReview },
|
|
218
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
|
|
158
219
|
],
|
|
159
220
|
};
|
|
160
221
|
}
|
|
@@ -173,7 +234,7 @@ function registerEvaluateSingleJudge(server) {
|
|
|
173
234
|
}
|
|
174
235
|
// ─── evaluate_v2 ─────────────────────────────────────────────────────────────
|
|
175
236
|
function registerEvaluateV2(server) {
|
|
176
|
-
server.tool("
|
|
237
|
+
server.tool("evaluate_policy_aware", "Run policy-aware tribunal evaluation with named policy profiles (startup, regulated, healthcare, fintech, public-sector), evidence calibration from runtime metrics, specialty-per-judge feedback, confidence scoring, and uncertainty reporting. Use this when code must meet specific compliance or vertical requirements.", {
|
|
177
238
|
code: z.string().optional().describe("Source code for single-file mode"),
|
|
178
239
|
language: z.string().optional().describe("Language for single-file mode"),
|
|
179
240
|
files: z
|
|
@@ -263,7 +324,7 @@ function registerEvaluateV2(server) {
|
|
|
263
324
|
evaluationContext,
|
|
264
325
|
evidence,
|
|
265
326
|
});
|
|
266
|
-
let md = `#
|
|
327
|
+
let md = `# Policy-Aware Tribunal Evaluation\n\n`;
|
|
267
328
|
md += `**Policy Profile:** ${result.policyProfile}\n`;
|
|
268
329
|
md += `**Calibrated Verdict:** ${result.calibratedVerdict.toUpperCase()} (${result.calibratedScore}/100)\n`;
|
|
269
330
|
md += `**Base Verdict:** ${result.baseVerdict.overallVerdict.toUpperCase()} (${result.baseVerdict.overallScore}/100)\n`;
|
|
@@ -310,7 +371,28 @@ function registerEvaluateV2(server) {
|
|
|
310
371
|
md += `\n## Supported Policy Profiles\n\n`;
|
|
311
372
|
md += supportedProfiles.map((profile) => `- ${profile}`).join("\n");
|
|
312
373
|
md += "\n";
|
|
313
|
-
|
|
374
|
+
const structured = {
|
|
375
|
+
policyProfile: result.policyProfile,
|
|
376
|
+
calibratedScore: result.calibratedScore,
|
|
377
|
+
calibratedVerdict: result.calibratedVerdict,
|
|
378
|
+
baseScore: result.baseVerdict.overallScore,
|
|
379
|
+
baseVerdict: result.baseVerdict.overallVerdict,
|
|
380
|
+
confidence: result.confidence,
|
|
381
|
+
findingCount: result.findings.length,
|
|
382
|
+
findings: result.findings.map((f) => ({
|
|
383
|
+
ruleId: f.ruleId,
|
|
384
|
+
severity: f.severity,
|
|
385
|
+
title: f.title,
|
|
386
|
+
confidence: f.confidence,
|
|
387
|
+
})),
|
|
388
|
+
uncertainty: result.uncertainty,
|
|
389
|
+
};
|
|
390
|
+
return {
|
|
391
|
+
content: [
|
|
392
|
+
{ type: "text", text: md },
|
|
393
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
|
|
394
|
+
],
|
|
395
|
+
};
|
|
314
396
|
}
|
|
315
397
|
catch (error) {
|
|
316
398
|
return {
|
|
@@ -382,20 +464,60 @@ function registerEvaluateFile(server) {
|
|
|
382
464
|
}
|
|
383
465
|
const code = readFileSync(filePath, "utf-8");
|
|
384
466
|
const detectedLang = language || detectLanguageFromPath(filePath);
|
|
467
|
+
const session = getGlobalSession();
|
|
468
|
+
// Skip re-evaluation if verdict is stable for this file
|
|
469
|
+
if (session.isVerdictStable(filePath)) {
|
|
470
|
+
const history = session.getVerdictHistory(filePath);
|
|
471
|
+
return {
|
|
472
|
+
content: [
|
|
473
|
+
{
|
|
474
|
+
type: "text",
|
|
475
|
+
text: `# Evaluation: ${filePath}\n\n` +
|
|
476
|
+
`> ⚡ **Verdict stable** — score has converged at **${history[0]?.score ?? 0}/100** ` +
|
|
477
|
+
`across last evaluations. Skipping redundant re-evaluation.\n\n` +
|
|
478
|
+
`Use \`evaluate_code\` with the code directly to force a fresh evaluation.`,
|
|
479
|
+
},
|
|
480
|
+
],
|
|
481
|
+
};
|
|
482
|
+
}
|
|
385
483
|
const verdict = evaluateWithTribunal(code, detectedLang, context, {
|
|
386
484
|
includeAstFindings,
|
|
387
485
|
minConfidence,
|
|
388
486
|
config: toJudgesConfig(config),
|
|
487
|
+
adaptiveSelection: true,
|
|
488
|
+
filePath,
|
|
389
489
|
});
|
|
490
|
+
session.recordEvaluation(filePath, code, verdict);
|
|
390
491
|
const projectContext = detectProjectContext(code, detectedLang, filePath);
|
|
391
492
|
const patternResults = formatVerdictAsMarkdown(verdict);
|
|
392
493
|
const deepReview = buildTribunalDeepReviewSection(JUDGES, detectedLang, context, undefined, projectContext);
|
|
494
|
+
const structuredData = {
|
|
495
|
+
filePath,
|
|
496
|
+
language: detectedLang,
|
|
497
|
+
score: verdict.overallScore,
|
|
498
|
+
verdict: verdict.overallVerdict,
|
|
499
|
+
findingCount: verdict.findings.length,
|
|
500
|
+
criticalCount: verdict.findings.filter((f) => f.severity === "critical").length,
|
|
501
|
+
highCount: verdict.findings.filter((f) => f.severity === "high").length,
|
|
502
|
+
judgesRun: verdict.evaluations.length,
|
|
503
|
+
findings: verdict.findings.map((f) => ({
|
|
504
|
+
ruleId: f.ruleId,
|
|
505
|
+
severity: f.severity,
|
|
506
|
+
title: f.title,
|
|
507
|
+
lineNumbers: f.lineNumbers,
|
|
508
|
+
confidence: f.confidence,
|
|
509
|
+
})),
|
|
510
|
+
};
|
|
393
511
|
return {
|
|
394
512
|
content: [
|
|
395
513
|
{
|
|
396
514
|
type: "text",
|
|
397
515
|
text: `# Evaluation: ${filePath}\n\n` + patternResults + deepReview,
|
|
398
516
|
},
|
|
517
|
+
{
|
|
518
|
+
type: "text",
|
|
519
|
+
text: "```json\n" + JSON.stringify(structuredData, null, 2) + "\n```",
|
|
520
|
+
},
|
|
399
521
|
],
|
|
400
522
|
};
|
|
401
523
|
}
|
|
@@ -412,3 +534,81 @@ function registerEvaluateFile(server) {
|
|
|
412
534
|
}
|
|
413
535
|
});
|
|
414
536
|
}
|
|
537
|
+
// ─── evaluate_code_streaming ─────────────────────────────────────────────────
|
|
538
|
+
/**
 * Register the `evaluate_code_streaming` tool on the given server.
 *
 * The handler drains the async iterator returned by
 * `evaluateWithTribunalStreaming`, keeps one summary row per yielded batch,
 * and answers with a single response made of two text blocks: a markdown
 * report (headline numbers plus a per-judge table) and a fenced JSON block
 * carrying the same data for programmatic consumers.
 *
 * Oversized input is rejected through `validateCodeSize` before any judge
 * runs, in line with the other code-accepting tools in this file.
 *
 * @param server Object exposing `tool(name, description, schema, handler)`.
 * @returns {void}
 */
function registerEvaluateCodeStreaming(server) {
    server.tool("evaluate_code_streaming", `Submit code for streaming evaluation — returns per-judge results as each judge completes, with running aggregate scores. Ideal for long evaluations where you want progressive feedback. All ${JUDGES.length} judges run sequentially with per-judge results accumulated into a single structured response.`, {
        code: z.string().describe("The source code to evaluate."),
        language: z.string().describe("The programming language (e.g., 'typescript', 'python', 'javascript')."),
        context: z.string().optional().describe("Optional context about the code."),
        includeAstFindings: z.boolean().optional().describe("Include AST/code-structure findings (default: true)"),
        minConfidence: z
            .number()
            .min(0)
            .max(1)
            .optional()
            .describe("Minimum finding confidence to include (0-1, default: 0)"),
        config: configSchema,
    }, async ({ code, language, context, includeAstFindings, minConfidence, config }) => {
        try {
            // Same guard as evaluate_code / evaluate_single_judge / fix_code:
            // refuse oversized payloads before spending any evaluation time.
            const sizeError = validateCodeSize(code);
            if (sizeError) {
                return { content: [{ type: "text", text: `Error: ${sizeError}` }], isError: true };
            }
            // The session is only read here (evaluationCount); this handler
            // does not call session.recordEvaluation.
            const session = getGlobalSession();
            const batches = [];
            // Aggregate of the most recent batch; stays undefined when the
            // iterator yields nothing, in which case the defaults below apply.
            let aggregate;
            for await (const batch of evaluateWithTribunalStreaming(code, language, context, {
                includeAstFindings,
                minConfidence,
                config: toJudgesConfig(config),
                adaptiveSelection: true,
            })) {
                batches.push({
                    judgeId: batch.judgeId,
                    judgeName: batch.judgeName,
                    findingCount: batch.evaluation.findings.length,
                    durationMs: batch.evaluation.durationMs ?? 0,
                    runningScore: batch.aggregate.currentScore,
                    runningVerdict: batch.aggregate.currentVerdict,
                });
                aggregate = batch.aggregate;
            }
            // Final numbers are whatever the last batch reported.
            const score = aggregate?.currentScore ?? 0;
            const verdict = aggregate?.currentVerdict ?? "pass";
            const judgesRun = aggregate?.completedJudges ?? 0;
            const totalJudges = aggregate?.totalJudges ?? 0;
            const totalFindings = aggregate?.findingsSoFar ?? 0;
            // Human-readable report.
            let md = `# Streaming Evaluation Results\n\n`;
            md += `**Final Score:** ${score}/100\n`;
            md += `**Verdict:** ${verdict.toUpperCase()}\n`;
            md += `**Judges Run:** ${judgesRun}/${totalJudges}\n`;
            md += `**Total Findings:** ${totalFindings}\n\n`;
            md += `## Per-Judge Breakdown\n\n`;
            md += `| Judge | Findings | Time (ms) | Running Score |\n`;
            md += `|-------|----------|-----------|---------------|\n`;
            for (const b of batches) {
                md += `| ${b.judgeName} | ${b.findingCount} | ${b.durationMs} | ${b.runningScore}/100 |\n`;
            }
            // Machine-readable mirror of the report.
            const structuredData = {
                score,
                verdict,
                totalFindings,
                criticalFindings: aggregate?.criticalSoFar ?? 0,
                highFindings: aggregate?.highSoFar ?? 0,
                judgesRun,
                totalJudges,
                perJudge: batches,
                sessionEvaluationCount: session.evaluationCount,
            };
            return {
                content: [
                    { type: "text", text: md },
                    { type: "text", text: "```json\n" + JSON.stringify(structuredData, null, 2) + "\n```" },
                ],
            };
        }
        catch (error) {
            // Report failures as a tool error result instead of rejecting.
            return {
                content: [
                    {
                        type: "text",
                        text: error instanceof Error ? `Error: ${error.message}` : "Error: Streaming evaluation failed",
                    },
                ],
                isError: true,
            };
        }
    });
}
|
|
@@ -7,6 +7,7 @@ import { evaluateWithTribunal, evaluateWithJudge } from "../evaluators/index.js"
|
|
|
7
7
|
import { getJudge, JUDGES } from "../judges/index.js";
|
|
8
8
|
import { applyPatches } from "../commands/fix.js";
|
|
9
9
|
import { configSchema, toJudgesConfig } from "./schemas.js";
|
|
10
|
+
import { validateCodeSize } from "./validation.js";
|
|
10
11
|
/**
|
|
11
12
|
* Register the fix_code tool for one-shot code evaluation + auto-fix.
|
|
12
13
|
*/
|
|
@@ -38,6 +39,10 @@ function registerFixCode(server) {
|
|
|
38
39
|
config: configSchema,
|
|
39
40
|
}, async ({ code, language, judgeId, context, minConfidence, config }) => {
|
|
40
41
|
try {
|
|
42
|
+
const sizeError = validateCodeSize(code);
|
|
43
|
+
if (sizeError) {
|
|
44
|
+
return { content: [{ type: "text", text: `Error: ${sizeError}` }], isError: true };
|
|
45
|
+
}
|
|
41
46
|
const effectiveMinConfidence = minConfidence ?? 0.5;
|
|
42
47
|
// ── Evaluate ────────────────────────────────────────────────
|
|
43
48
|
let allFindings;
|
|
@@ -133,8 +138,26 @@ function registerFixCode(server) {
|
|
|
133
138
|
text += `\n`;
|
|
134
139
|
}
|
|
135
140
|
text += `### Fixed Code\n\n\`\`\`${language}\n${fixedCode}\n\`\`\`\n`;
|
|
141
|
+
const structured = {
|
|
142
|
+
totalFindings: allFindings.length,
|
|
143
|
+
autoFixable: fixable.length,
|
|
144
|
+
applied,
|
|
145
|
+
skipped,
|
|
146
|
+
remaining: remaining.length,
|
|
147
|
+
patches: fixable.map((p) => ({
|
|
148
|
+
ruleId: p.ruleId,
|
|
149
|
+
severity: p.severity,
|
|
150
|
+
title: p.title,
|
|
151
|
+
line: p.patch.startLine,
|
|
152
|
+
oldText: p.patch.oldText,
|
|
153
|
+
newText: p.patch.newText,
|
|
154
|
+
})),
|
|
155
|
+
};
|
|
136
156
|
return {
|
|
137
|
-
content: [
|
|
157
|
+
content: [
|
|
158
|
+
{ type: "text", text },
|
|
159
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
|
|
160
|
+
],
|
|
138
161
|
};
|
|
139
162
|
}
|
|
140
163
|
catch (error) {
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
2
|
+
/**
|
|
3
|
+
* Register MCP resources: judges catalog, presets, session state,
|
|
4
|
+
* and parameterized templates for single-judge / single-preset lookups.
|
|
5
|
+
*/
|
|
6
|
+
export declare function registerResources(server: McpServer): void;
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
// ─── MCP Resource Registration ───────────────────────────────────────────────
|
|
2
|
+
// Expose judges metadata, presets, and session state as MCP resources.
|
|
3
|
+
// Includes both static resources and parameterized resource templates for
|
|
4
|
+
// efficient single-item lookups (judges://judge/{id}, judges://preset/{key}).
|
|
5
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
6
|
+
import { ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
7
|
+
import { getJudge, getJudgeSummaries, JUDGES } from "../judges/index.js";
|
|
8
|
+
import { getPreset, PRESETS } from "../presets.js";
|
|
9
|
+
import { getGlobalSession } from "../evaluation-session.js";
|
|
10
|
+
/**
 * Attach every MCP resource defined in this module to `server`.
 *
 * Registration order: judges catalog, presets list, session state, then the
 * two parameterized templates (single judge, single preset).
 *
 * @param server Server instance handed unchanged to each registrar.
 * @returns {void}
 */
export function registerResources(server) {
    const registrars = [
        registerJudgesCatalog,
        registerPresetsResource,
        registerSessionResource,
        registerJudgeTemplate,
        registerPresetTemplate,
    ];
    for (const register of registrars) {
        register(server);
    }
}
|
|
21
|
+
// ─── judges://catalog ────────────────────────────────────────────────────────
|
|
22
|
+
/**
 * Register the static `judges://catalog` resource ("judges-catalog").
 *
 * Each read calls `getJudgeSummaries()` and returns one pretty-printed JSON
 * document listing `id`, `name`, `domain` and `description` for every entry;
 * any other fields on a summary are left out.
 *
 * @param server Object exposing `resource(name, uri, metadata, readCallback)`.
 */
function registerJudgesCatalog(server) {
    const metadata = {
        description: "Full catalog of all judges on the panel — IDs, names, domains, and descriptions.",
    };
    const readCatalog = async (uri) => {
        const entries = getJudgeSummaries().map(({ id, name, domain, description }) => ({
            id,
            name,
            domain,
            description,
        }));
        const document = {
            uri: uri.href,
            mimeType: "application/json",
            text: JSON.stringify(entries, null, 2),
        };
        return { contents: [document] };
    };
    server.resource("judges-catalog", "judges://catalog", metadata, readCatalog);
}
|
|
42
|
+
// ─── judges://presets ────────────────────────────────────────────────────────
|
|
43
|
+
/**
 * Register the static `judges://presets` resource ("presets").
 *
 * Each read walks the entries of `PRESETS` and returns a pretty-printed JSON
 * array with `key`, `name`, `description` and `config` per preset.
 *
 * @param server Object exposing `resource(name, uri, metadata, readCallback)`.
 */
function registerPresetsResource(server) {
    const metadata = {
        description: "Available evaluation presets with names, descriptions, and configuration overrides.",
    };
    const readPresets = async (uri) => {
        const summaries = [];
        for (const [key, preset] of Object.entries(PRESETS)) {
            const { name, description, config } = preset;
            summaries.push({ key, name, description, config });
        }
        const document = {
            uri: uri.href,
            mimeType: "application/json",
            text: JSON.stringify(summaries, null, 2),
        };
        return { contents: [document] };
    };
    server.resource("presets", "judges://presets", metadata, readPresets);
}
|
|
62
|
+
// ─── judges://session ────────────────────────────────────────────────────────
|
|
63
|
+
/**
 * Register the static `judges://session` resource ("session").
 *
 * Each read takes the global session, pulls its context, and returns a
 * pretty-printed JSON snapshot: `evaluationCount`, `startedAt`, `frameworks`,
 * `capabilities` (copied into an array), and a `filesEvaluated` list with one
 * row per `verdictHistory` entry — number of evaluations, score of the last
 * history item (0 when there is none), and `session.isVerdictStable(file)`.
 *
 * @param server Object exposing `resource(name, uri, metadata, readCallback)`.
 */
function registerSessionResource(server) {
    const metadata = {
        description: "Current evaluation session state — evaluation count, detected frameworks, verdict history, and stability indicators.",
    };
    const readSession = async (uri) => {
        const session = getGlobalSession();
        const ctx = session.getContext();
        // Snapshot the entries first, then summarize each file's history.
        const historyEntries = Array.from(ctx.verdictHistory.entries());
        const filesEvaluated = [];
        for (const [file, history] of historyEntries) {
            const latest = history[history.length - 1];
            filesEvaluated.push({
                file,
                evaluations: history.length,
                latestScore: latest?.score ?? 0,
                stable: session.isVerdictStable(file),
            });
        }
        const snapshot = {
            evaluationCount: ctx.evaluationCount,
            startedAt: ctx.startedAt,
            frameworks: ctx.frameworks,
            capabilities: [...ctx.capabilities],
            filesEvaluated,
        };
        const document = {
            uri: uri.href,
            mimeType: "application/json",
            text: JSON.stringify(snapshot, null, 2),
        };
        return { contents: [document] };
    };
    server.resource("session", "judges://session", metadata, readSession);
}
|
|
93
|
+
// ─── judges://judge/{id} (template) ─────────────────────────────────────────
|
|
94
|
+
/**
 * Register the parameterized `judges://judge/{id}` resource ("judge-detail").
 *
 * The judge ids are read from `JUDGES` once, at registration time, and feed
 * both the template's `list` callback (one `judges://judge/<id>` entry per
 * id) and its `id` completion (ids starting with the typed value).
 *
 * Reading a URI looks the judge up with `getJudge`. A hit yields a
 * pretty-printed JSON object with `id`, `name`, `domain`, `description`,
 * `rulePrefix`, `tableDescription` and `promptDescription`; a miss yields a
 * compact JSON object `{ "error": "Judge '<id>' not found" }`.
 *
 * @param server Object exposing `resource(name, template, metadata, readCallback)`.
 */
function registerJudgeTemplate(server) {
    const knownIds = JUDGES.map((judge) => judge.id);
    // Wrap an already-serialized JSON string in the read-result shape.
    const toJsonResult = (uri, text) => ({
        contents: [{ uri: uri.href, mimeType: "application/json", text }],
    });
    const template = new ResourceTemplate("judges://judge/{id}", {
        list: async () => {
            const resources = knownIds.map((knownId) => ({
                uri: `judges://judge/${knownId}`,
                name: knownId,
            }));
            return { resources };
        },
        complete: {
            id: (value) => knownIds.filter((knownId) => knownId.startsWith(value)),
        },
    });
    const metadata = {
        description: "Detailed info for a single judge — rules, domain, system prompt summary.",
    };
    const readJudge = async (uri, { id }) => {
        // The template variable may be delivered as an array; use its first item.
        const judgeId = Array.isArray(id) ? id[0] : id;
        const judge = getJudge(judgeId);
        if (judge) {
            const { id: foundId, name, domain, description, rulePrefix, tableDescription, promptDescription } = judge;
            const detail = {
                id: foundId,
                name,
                domain,
                description,
                rulePrefix,
                tableDescription,
                promptDescription,
            };
            return toJsonResult(uri, JSON.stringify(detail, null, 2));
        }
        return toJsonResult(uri, JSON.stringify({ error: `Judge '${judgeId}' not found` }));
    };
    server.resource("judge-detail", template, metadata, readJudge);
}
|
|
140
|
+
// ─── judges://preset/{key} (template) ───────────────────────────────────────
|
|
141
|
+
/**
 * Register the parameterized `judges://preset/{key}` resource ("preset-detail").
 *
 * The preset keys are read from `PRESETS` once, at registration time, and
 * feed both the template's `list` callback (one `judges://preset/<key>` entry
 * per key) and its `key` completion (keys starting with the typed value).
 *
 * Reading a URI looks the preset up with `getPreset`. A hit yields a
 * pretty-printed JSON object holding `key` followed by the preset's own
 * properties; a miss yields a compact JSON object
 * `{ "error": "Preset '<key>' not found" }`.
 *
 * @param server Object exposing `resource(name, template, metadata, readCallback)`.
 */
function registerPresetTemplate(server) {
    const knownKeys = Object.keys(PRESETS);
    // Wrap an already-serialized JSON string in the read-result shape.
    const toJsonResult = (uri, text) => ({
        contents: [{ uri: uri.href, mimeType: "application/json", text }],
    });
    const template = new ResourceTemplate("judges://preset/{key}", {
        list: async () => {
            const resources = knownKeys.map((knownKey) => ({
                uri: `judges://preset/${knownKey}`,
                name: knownKey,
            }));
            return { resources };
        },
        complete: {
            key: (value) => knownKeys.filter((knownKey) => knownKey.startsWith(value)),
        },
    });
    const metadata = { description: "Detailed configuration for a single evaluation preset." };
    const readPreset = async (uri, { key }) => {
        // The template variable may be delivered as an array; use its first item.
        const presetKey = Array.isArray(key) ? key[0] : key;
        const preset = getPreset(presetKey);
        if (preset) {
            const detail = Object.assign({ key: presetKey }, preset);
            return toJsonResult(uri, JSON.stringify(detail, null, 2));
        }
        return toJsonResult(uri, JSON.stringify({ error: `Preset '${presetKey}' not found` }));
    };
    server.resource("preset-detail", template, metadata, readPreset);
}
|
|
@@ -115,8 +115,21 @@ function registerExplainFinding(server) {
|
|
|
115
115
|
sections.push(`\n## Remediation\n${remediation}`);
|
|
116
116
|
}
|
|
117
117
|
sections.push(`\n## Next steps\n- Use \`triage_finding\` to accept, defer, or dismiss this finding\n- Use \`fix_code\` to auto-fix if a patch is available\n- Use \`evaluate_code\` to re-evaluate after fixing`);
|
|
118
|
+
const structured = {
|
|
119
|
+
ruleId,
|
|
120
|
+
prefix,
|
|
121
|
+
title: title ?? null,
|
|
122
|
+
severity: severity ?? null,
|
|
123
|
+
owasp: ctx?.owasp ?? null,
|
|
124
|
+
cwe: ctx?.cwe ?? null,
|
|
125
|
+
learn: ctx?.learn ?? null,
|
|
126
|
+
remediation: getRemediationGuidance(prefix) ?? null,
|
|
127
|
+
};
|
|
118
128
|
return {
|
|
119
|
-
content: [
|
|
129
|
+
content: [
|
|
130
|
+
{ type: "text", text: sections.join("\n") },
|
|
131
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
|
|
132
|
+
],
|
|
120
133
|
};
|
|
121
134
|
});
|
|
122
135
|
}
|
|
@@ -173,6 +186,18 @@ function registerTriageFinding(server) {
|
|
|
173
186
|
type: "text",
|
|
174
187
|
text: `✓ Triaged finding \`${result.ruleId}\` in ${result.filePath} as **${status}**${reason ? `\n\nReason: ${reason}` : ""}${triagedBy ? `\nTriaged by: ${triagedBy}` : ""}`,
|
|
175
188
|
},
|
|
189
|
+
{
|
|
190
|
+
type: "text",
|
|
191
|
+
text: "```json\n" +
|
|
192
|
+
JSON.stringify({
|
|
193
|
+
ruleId: result.ruleId,
|
|
194
|
+
filePath: result.filePath,
|
|
195
|
+
status,
|
|
196
|
+
reason: reason ?? null,
|
|
197
|
+
triagedBy: triagedBy ?? null,
|
|
198
|
+
}, null, 2) +
|
|
199
|
+
"\n```",
|
|
200
|
+
},
|
|
176
201
|
],
|
|
177
202
|
};
|
|
178
203
|
}
|