@kevinrabun/judges 3.115.4 → 3.117.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/agents/accessibility.judge.md +7 -0
- package/agents/agent-instructions.judge.md +7 -0
- package/agents/ai-code-safety.judge.md +7 -0
- package/agents/api-contract.judge.md +7 -0
- package/agents/api-design.judge.md +7 -0
- package/agents/authentication.judge.md +7 -0
- package/agents/backwards-compatibility.judge.md +7 -0
- package/agents/caching.judge.md +7 -0
- package/agents/ci-cd.judge.md +7 -0
- package/agents/cloud-readiness.judge.md +7 -0
- package/agents/concurrency.judge.md +7 -0
- package/agents/configuration-management.judge.md +7 -0
- package/agents/cybersecurity.judge.md +7 -0
- package/agents/data-security.judge.md +7 -0
- package/agents/dependency-health.judge.md +7 -0
- package/agents/documentation.judge.md +7 -0
- package/agents/error-handling.judge.md +7 -0
- package/agents/ethics-bias.judge.md +7 -0
- package/agents/false-positive-review.judge.md +12 -0
- package/agents/framework-safety.judge.md +7 -0
- package/agents/hallucination-detection.judge.md +13 -0
- package/agents/iac-security.judge.md +7 -0
- package/agents/intent-alignment.judge.md +13 -0
- package/agents/logging-privacy.judge.md +7 -0
- package/agents/maintainability.judge.md +7 -0
- package/agents/multi-turn-coherence.judge.md +7 -0
- package/agents/observability.judge.md +7 -0
- package/agents/portability.judge.md +7 -0
- package/agents/rate-limiting.judge.md +7 -0
- package/agents/reliability.judge.md +7 -0
- package/agents/security.judge.md +13 -0
- package/agents/testing.judge.md +7 -0
- package/agents/ux.judge.md +7 -0
- package/dist/a2a-protocol.d.ts +136 -0
- package/dist/a2a-protocol.js +218 -0
- package/dist/api.d.ts +21 -3
- package/dist/api.js +21 -1
- package/dist/audit-trail.d.ts +245 -0
- package/dist/audit-trail.js +257 -0
- package/dist/commands/benchmark-advanced.js +51 -51
- package/dist/commands/benchmark-ai-agents.js +16 -16
- package/dist/commands/benchmark-compliance-ethics.js +12 -12
- package/dist/commands/benchmark-expanded-2.js +2 -2
- package/dist/commands/benchmark-expanded.js +2 -2
- package/dist/commands/benchmark-infrastructure.js +12 -12
- package/dist/commands/benchmark-languages.js +11 -11
- package/dist/commands/benchmark-quality-ops.js +7 -7
- package/dist/commands/benchmark-security-deep.js +9 -9
- package/dist/commands/benchmark.js +1 -1
- package/dist/commands/llm-benchmark-optimizer.d.ts +78 -0
- package/dist/commands/llm-benchmark-optimizer.js +241 -0
- package/dist/commands/llm-benchmark.d.ts +4 -2
- package/dist/commands/llm-benchmark.js +40 -12
- package/dist/escalation.d.ts +100 -0
- package/dist/escalation.js +292 -0
- package/dist/evaluation-session.d.ts +74 -0
- package/dist/evaluation-session.js +152 -0
- package/dist/evaluators/index.d.ts +23 -1
- package/dist/evaluators/index.js +192 -3
- package/dist/evaluators/judge-selector.d.ts +19 -0
- package/dist/evaluators/judge-selector.js +141 -0
- package/dist/evaluators/recall-boost.d.ts +27 -0
- package/dist/evaluators/recall-boost.js +409 -0
- package/dist/feedback-loop.d.ts +62 -0
- package/dist/feedback-loop.js +179 -0
- package/dist/index.js +2 -0
- package/dist/judges/accessibility.js +7 -0
- package/dist/judges/agent-instructions.js +7 -0
- package/dist/judges/ai-code-safety.js +7 -0
- package/dist/judges/api-contract.js +7 -0
- package/dist/judges/api-design.js +7 -0
- package/dist/judges/authentication.js +7 -0
- package/dist/judges/backwards-compatibility.js +7 -0
- package/dist/judges/caching.js +7 -0
- package/dist/judges/ci-cd.js +7 -0
- package/dist/judges/cloud-readiness.js +7 -0
- package/dist/judges/concurrency.js +7 -0
- package/dist/judges/configuration-management.js +7 -0
- package/dist/judges/cybersecurity.js +7 -0
- package/dist/judges/data-security.js +7 -0
- package/dist/judges/dependency-health.js +7 -0
- package/dist/judges/documentation.js +7 -0
- package/dist/judges/error-handling.js +7 -0
- package/dist/judges/ethics-bias.js +7 -0
- package/dist/judges/false-positive-review.js +13 -1
- package/dist/judges/framework-safety.js +7 -0
- package/dist/judges/hallucination-detection.js +14 -1
- package/dist/judges/iac-security.js +7 -0
- package/dist/judges/intent-alignment.js +14 -1
- package/dist/judges/logging-privacy.js +7 -0
- package/dist/judges/maintainability.js +7 -0
- package/dist/judges/multi-turn-coherence.js +7 -0
- package/dist/judges/observability.js +7 -0
- package/dist/judges/portability.js +7 -0
- package/dist/judges/rate-limiting.js +7 -0
- package/dist/judges/reliability.js +7 -0
- package/dist/judges/security.js +14 -1
- package/dist/judges/testing.js +7 -0
- package/dist/judges/ux.js +7 -0
- package/dist/review-conversation.d.ts +87 -0
- package/dist/review-conversation.js +307 -0
- package/dist/sast-integration.d.ts +112 -0
- package/dist/sast-integration.js +215 -0
- package/dist/tools/register-evaluation.js +208 -8
- package/dist/tools/register-fix.js +24 -1
- package/dist/tools/register-resources.d.ts +6 -0
- package/dist/tools/register-resources.js +177 -0
- package/dist/tools/register-review.js +26 -1
- package/dist/tools/register-workflow.js +384 -11
- package/dist/tools/validation.d.ts +13 -0
- package/dist/tools/validation.js +77 -0
- package/dist/types.d.ts +122 -0
- package/package.json +25 -12
- package/server.json +2 -2
|
@@ -4,10 +4,12 @@
|
|
|
4
4
|
// ──────────────────────────────────────────────────────────────────────────────
|
|
5
5
|
import { z } from "zod";
|
|
6
6
|
import { JUDGES } from "../judges/index.js";
|
|
7
|
-
import { evaluateProject, evaluateDiff, analyzeDependencies, runAppBuilderWorkflow } from "../evaluators/index.js";
|
|
7
|
+
import { evaluateProject, evaluateDiff, analyzeDependencies, runAppBuilderWorkflow, evaluateWithTribunal, enrichWithPatches, formatVerdictAsMarkdown, } from "../evaluators/index.js";
|
|
8
8
|
import { evaluateFilesBatch } from "../api.js";
|
|
9
|
+
import { getGlobalSession } from "../evaluation-session.js";
|
|
9
10
|
import { generatePublicRepoReport } from "../reports/public-repo-report.js";
|
|
10
11
|
import { configSchema, toJudgesConfig } from "./schemas.js";
|
|
12
|
+
import { validateCodeSize } from "./validation.js";
|
|
11
13
|
import { benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, runBenchmarkSuite, } from "../commands/benchmark.js";
|
|
12
14
|
/**
|
|
13
15
|
* Register workflow-focused tools: evaluate_public_repo_report, evaluate_project,
|
|
@@ -22,6 +24,10 @@ export function registerWorkflowTools(server) {
|
|
|
22
24
|
registerBenchmarkGate(server);
|
|
23
25
|
registerBenchmarkDashboard(server);
|
|
24
26
|
registerEvaluateBatch(server);
|
|
27
|
+
registerEvaluateThenFix(server);
|
|
28
|
+
registerEvaluateFocused(server);
|
|
29
|
+
registerSessionStatus(server);
|
|
30
|
+
registerRecordFeedback(server);
|
|
25
31
|
}
|
|
26
32
|
// ─── evaluate_public_repo_report ─────────────────────────────────────────────
|
|
27
33
|
function registerPublicRepoReport(server) {
|
|
@@ -75,6 +81,10 @@ function registerPublicRepoReport(server) {
|
|
|
75
81
|
keepClone: z.boolean().optional().describe("Keep cloned repository on disk for inspection"),
|
|
76
82
|
}, async ({ repoUrl, branch, outputPath, maxFiles, maxFileBytes, maxFindingsInReport, credentialMode, includeAstFindings, minConfidence, enableMustFixGate, mustFixMinConfidence, mustFixDangerousRulePrefixes, keepClone, }) => {
|
|
77
83
|
try {
|
|
84
|
+
await server.sendLoggingMessage({
|
|
85
|
+
level: "info",
|
|
86
|
+
data: `Cloning repository: ${repoUrl}${branch ? ` (branch: ${branch})` : ""}...`,
|
|
87
|
+
});
|
|
78
88
|
const report = generatePublicRepoReport({
|
|
79
89
|
repoUrl,
|
|
80
90
|
branch,
|
|
@@ -112,12 +122,18 @@ function registerPublicRepoReport(server) {
|
|
|
112
122
|
if (keepClone) {
|
|
113
123
|
summary += `- Clone path: ${report.clonePath}\n`;
|
|
114
124
|
}
|
|
125
|
+
const structured = {
|
|
126
|
+
repoUrl,
|
|
127
|
+
overallVerdict: report.overallVerdict,
|
|
128
|
+
averageScore: report.averageScore,
|
|
129
|
+
analyzedFileCount: report.analyzedFileCount,
|
|
130
|
+
totalFindings: report.totalFindings,
|
|
131
|
+
outputPath: report.outputPath ?? null,
|
|
132
|
+
};
|
|
115
133
|
return {
|
|
116
134
|
content: [
|
|
117
|
-
{
|
|
118
|
-
|
|
119
|
-
text: `${summary}\n---\n\n${report.markdown}`,
|
|
120
|
-
},
|
|
135
|
+
{ type: "text", text: `${summary}\n---\n\n${report.markdown}` },
|
|
136
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
|
|
121
137
|
],
|
|
122
138
|
};
|
|
123
139
|
}
|
|
@@ -217,7 +233,37 @@ function registerAppBuilderFlow(server) {
|
|
|
217
233
|
md += `- **${task.priority} ${task.ruleId}** ${task.task}\n`;
|
|
218
234
|
}
|
|
219
235
|
}
|
|
220
|
-
|
|
236
|
+
const structured = {
|
|
237
|
+
mode: result.mode,
|
|
238
|
+
releaseDecision: result.releaseDecision,
|
|
239
|
+
score: result.score,
|
|
240
|
+
verdict: result.verdict,
|
|
241
|
+
criticalCount: result.criticalCount,
|
|
242
|
+
highCount: result.highCount,
|
|
243
|
+
mediumCount: result.mediumCount,
|
|
244
|
+
taskCount: result.tasks.length,
|
|
245
|
+
aiFixableCount: result.aiFixableNow.length,
|
|
246
|
+
findings: result.plainLanguageFindings.map((f) => ({
|
|
247
|
+
ruleId: f.ruleId,
|
|
248
|
+
severity: f.severity,
|
|
249
|
+
title: f.title,
|
|
250
|
+
whatIsWrong: f.whatIsWrong,
|
|
251
|
+
nextAction: f.nextAction,
|
|
252
|
+
})),
|
|
253
|
+
tasks: result.tasks.map((t) => ({
|
|
254
|
+
priority: t.priority,
|
|
255
|
+
owner: t.owner,
|
|
256
|
+
effort: t.effort,
|
|
257
|
+
ruleId: t.ruleId,
|
|
258
|
+
task: t.task,
|
|
259
|
+
})),
|
|
260
|
+
};
|
|
261
|
+
return {
|
|
262
|
+
content: [
|
|
263
|
+
{ type: "text", text: md },
|
|
264
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
|
|
265
|
+
],
|
|
266
|
+
};
|
|
221
267
|
}
|
|
222
268
|
catch (error) {
|
|
223
269
|
return {
|
|
@@ -253,6 +299,10 @@ function registerEvaluateProject(server) {
|
|
|
253
299
|
config: configSchema,
|
|
254
300
|
}, async ({ files, context, includeAstFindings, minConfidence, config }) => {
|
|
255
301
|
try {
|
|
302
|
+
await server.sendLoggingMessage({
|
|
303
|
+
level: "info",
|
|
304
|
+
data: `Evaluating ${files.length} files with ${JUDGES.length} judges...`,
|
|
305
|
+
});
|
|
256
306
|
const result = evaluateProject(files, context, {
|
|
257
307
|
includeAstFindings,
|
|
258
308
|
minConfidence,
|
|
@@ -282,7 +332,37 @@ function registerEvaluateProject(server) {
|
|
|
282
332
|
md += `- **[${f.severity.toUpperCase()}]** ${f.ruleId}: ${f.title}\n ${f.description}\n`;
|
|
283
333
|
}
|
|
284
334
|
}
|
|
285
|
-
|
|
335
|
+
const structured = {
|
|
336
|
+
overallScore: result.overallScore,
|
|
337
|
+
overallVerdict: result.overallVerdict,
|
|
338
|
+
fileCount: result.fileResults.length,
|
|
339
|
+
criticalCount: result.criticalCount,
|
|
340
|
+
highCount: result.highCount,
|
|
341
|
+
fileResults: result.fileResults.map((fr) => ({
|
|
342
|
+
path: fr.path,
|
|
343
|
+
language: fr.language,
|
|
344
|
+
score: fr.score,
|
|
345
|
+
findingCount: fr.findings.length,
|
|
346
|
+
findings: fr.findings.map((f) => ({
|
|
347
|
+
ruleId: f.ruleId,
|
|
348
|
+
severity: f.severity,
|
|
349
|
+
title: f.title,
|
|
350
|
+
line: f.lineNumbers?.[0],
|
|
351
|
+
})),
|
|
352
|
+
})),
|
|
353
|
+
architecturalFindings: result.architecturalFindings.map((f) => ({
|
|
354
|
+
ruleId: f.ruleId,
|
|
355
|
+
severity: f.severity,
|
|
356
|
+
title: f.title,
|
|
357
|
+
description: f.description,
|
|
358
|
+
})),
|
|
359
|
+
};
|
|
360
|
+
return {
|
|
361
|
+
content: [
|
|
362
|
+
{ type: "text", text: md },
|
|
363
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
|
|
364
|
+
],
|
|
365
|
+
};
|
|
286
366
|
}
|
|
287
367
|
catch (error) {
|
|
288
368
|
return {
|
|
@@ -314,6 +394,10 @@ function registerEvaluateDiff(server) {
|
|
|
314
394
|
config: configSchema,
|
|
315
395
|
}, async ({ code, language, changedLines, context, includeAstFindings, minConfidence, config }) => {
|
|
316
396
|
try {
|
|
397
|
+
const sizeError = validateCodeSize(code);
|
|
398
|
+
if (sizeError) {
|
|
399
|
+
return { content: [{ type: "text", text: `Error: ${sizeError}` }], isError: true };
|
|
400
|
+
}
|
|
317
401
|
const result = evaluateDiff(code, language, changedLines, context, {
|
|
318
402
|
includeAstFindings,
|
|
319
403
|
minConfidence,
|
|
@@ -334,7 +418,24 @@ function registerEvaluateDiff(server) {
|
|
|
334
418
|
md += `**Recommendation:** ${f.recommendation}\n\n`;
|
|
335
419
|
}
|
|
336
420
|
}
|
|
337
|
-
|
|
421
|
+
const structured = {
|
|
422
|
+
score: result.score,
|
|
423
|
+
verdict: result.verdict,
|
|
424
|
+
linesAnalyzed: result.linesAnalyzed,
|
|
425
|
+
findingCount: result.findings.length,
|
|
426
|
+
findings: result.findings.map((f) => ({
|
|
427
|
+
ruleId: f.ruleId,
|
|
428
|
+
severity: f.severity,
|
|
429
|
+
title: f.title,
|
|
430
|
+
lineNumbers: f.lineNumbers,
|
|
431
|
+
})),
|
|
432
|
+
};
|
|
433
|
+
return {
|
|
434
|
+
content: [
|
|
435
|
+
{ type: "text", text: md },
|
|
436
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
|
|
437
|
+
],
|
|
438
|
+
};
|
|
338
439
|
}
|
|
339
440
|
catch (error) {
|
|
340
441
|
return {
|
|
@@ -382,7 +483,24 @@ function registerAnalyzeDependencies(server) {
|
|
|
382
483
|
md += `**Development (${dev.length}):** ${dev.map((d) => `${d.name}@${d.version}`).join(", ")}\n\n`;
|
|
383
484
|
}
|
|
384
485
|
}
|
|
385
|
-
|
|
486
|
+
const structured = {
|
|
487
|
+
manifestType,
|
|
488
|
+
score: result.score,
|
|
489
|
+
verdict: result.verdict,
|
|
490
|
+
totalDependencies: result.totalDependencies,
|
|
491
|
+
findingCount: result.findings.length,
|
|
492
|
+
findings: result.findings.map((f) => ({
|
|
493
|
+
ruleId: f.ruleId,
|
|
494
|
+
severity: f.severity,
|
|
495
|
+
title: f.title,
|
|
496
|
+
})),
|
|
497
|
+
};
|
|
498
|
+
return {
|
|
499
|
+
content: [
|
|
500
|
+
{ type: "text", text: md },
|
|
501
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
|
|
502
|
+
],
|
|
503
|
+
};
|
|
386
504
|
}
|
|
387
505
|
catch (error) {
|
|
388
506
|
return {
|
|
@@ -509,8 +627,13 @@ function registerEvaluateBatch(server) {
|
|
|
509
627
|
}, async (params) => {
|
|
510
628
|
const config = params.config ? toJudgesConfig(params.config) : undefined;
|
|
511
629
|
const options = config ? { config } : undefined;
|
|
630
|
+
await server.sendLoggingMessage({ level: "info", data: `Batch evaluation: ${params.files.length} files...` });
|
|
512
631
|
// Use bounded-concurrency parallel evaluation instead of sequential loop
|
|
513
|
-
const batchResults = await evaluateFilesBatch(params.files, 4, options)
|
|
632
|
+
const batchResults = await evaluateFilesBatch(params.files, 4, options, (completed, total) => {
|
|
633
|
+
server
|
|
634
|
+
.sendLoggingMessage({ level: "info", data: `Progress: ${completed}/${total} files evaluated` })
|
|
635
|
+
.catch(() => { });
|
|
636
|
+
});
|
|
514
637
|
const results = batchResults.map((r) => {
|
|
515
638
|
const criticals = r.verdict.findings.filter((f) => f.severity === "critical").length;
|
|
516
639
|
return {
|
|
@@ -541,8 +664,258 @@ function registerEvaluateBatch(server) {
|
|
|
541
664
|
results.map((r) => `| ${r.path} | ${r.score} | ${r.findingCount} | ${r.criticalCount} |`).join("\n") +
|
|
542
665
|
"\n\n" +
|
|
543
666
|
allFindings.join("\n\n");
|
|
667
|
+
const structured = {
|
|
668
|
+
fileCount: results.length,
|
|
669
|
+
averageScore: avgScore,
|
|
670
|
+
totalFindings,
|
|
671
|
+
totalCriticals,
|
|
672
|
+
files: results,
|
|
673
|
+
};
|
|
674
|
+
return {
|
|
675
|
+
content: [
|
|
676
|
+
{ type: "text", text: summary },
|
|
677
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
|
|
678
|
+
],
|
|
679
|
+
};
|
|
680
|
+
});
|
|
681
|
+
}
|
|
682
|
+
// ─── evaluate_then_fix ───────────────────────────────────────────────────────
|
|
683
|
+
/**
 * Register the `evaluate_then_fix` tool.
 *
 * The handler validates the code size, evaluates the code with
 * `evaluateWithTribunal` (adaptive judge selection enabled), asks
 * `enrichWithPatches` for patches, records the evaluation in the global
 * session, and returns two text items: a Markdown report and a fenced JSON
 * summary. Any thrown error is converted into an `isError` result.
 *
 * @param server MCP server instance exposing `tool(name, description, schema, handler)`.
 */
function registerEvaluateThenFix(server) {
    // The Markdown report shows fewer patches than the JSON payload to keep it readable.
    const MAX_MARKDOWN_PATCHES = 20;
    const MAX_STRUCTURED_PATCHES = 50;
    server.tool("evaluate_then_fix", "Evaluate code and automatically generate fix patches for all findings that have auto-fix support. Returns the evaluation verdict alongside ready-to-apply patches. Use this for a single-step 'review + fix' workflow.", {
        code: z.string().describe("The source code to evaluate and fix."),
        language: z.string().describe("The programming language (e.g., 'typescript', 'python')."),
        context: z.string().optional().describe("Optional context about the code."),
        includeAstFindings: z.boolean().optional().describe("Include AST/code-structure findings (default: true)"),
        minConfidence: z
            .number()
            .min(0)
            .max(1)
            .optional()
            .describe("Minimum finding confidence to include (0-1, default: 0)"),
        config: configSchema,
    }, async ({ code, language, context, includeAstFindings, minConfidence, config }) => {
        try {
            const sizeError = validateCodeSize(code);
            if (sizeError) {
                return { content: [{ type: "text", text: `Error: ${sizeError}` }], isError: true };
            }
            const session = getGlobalSession();
            // Step 1: Evaluate
            const verdict = evaluateWithTribunal(code, language, context, {
                includeAstFindings,
                minConfidence,
                config: toJudgesConfig(config),
                adaptiveSelection: true,
            });
            // Step 2: Generate fix patches for all findings
            const patchedFindings = enrichWithPatches(verdict.findings, code);
            // The session key is the caller's context when given, else a per-language placeholder.
            session.recordEvaluation(context ?? `<inline:${language}>`, code, verdict);
            const patchableFindings = patchedFindings.filter((f) => f.patch);
            const patchCount = patchableFindings.length;
            let md = `# Evaluate & Fix Results\n\n`;
            md += `**Score:** ${verdict.overallScore}/100 | **Verdict:** ${verdict.overallVerdict.toUpperCase()}\n`;
            md += `**Total Findings:** ${verdict.findings.length} | **Auto-fixable:** ${patchCount}\n\n`;
            if (patchCount > 0) {
                md += `## Auto-Fix Patches\n\n`;
                md += `The following findings have auto-fix patches ready to apply:\n\n`;
                for (const f of patchableFindings.slice(0, MAX_MARKDOWN_PATCHES)) {
                    md += `### ${f.ruleId}: ${f.title}\n`;
                    md += `- **Severity:** ${f.severity} | **Lines:** ${f.lineNumbers?.join(", ") ?? "N/A"}\n`;
                    md += `- **Fix:**\n\`\`\`diff\n`;
                    // Mark EVERY line of a multi-line patch; marking only the first
                    // line leaves the remaining lines looking like unchanged context.
                    if (f.patch?.oldText) {
                        md += `${prefixDiffLines(f.patch.oldText, "-")}\n`;
                    }
                    if (f.patch?.newText) {
                        md += `${prefixDiffLines(f.patch.newText, "+")}\n`;
                    }
                    md += `\`\`\`\n\n`;
                }
                if (patchableFindings.length > MAX_MARKDOWN_PATCHES) {
                    md += `> ... and ${patchableFindings.length - MAX_MARKDOWN_PATCHES} more auto-fixable findings\n\n`;
                }
            }
            md += formatVerdictAsMarkdown(verdict);
            const structuredData = {
                score: verdict.overallScore,
                verdict: verdict.overallVerdict,
                totalFindings: verdict.findings.length,
                autoFixable: patchCount,
                patches: patchableFindings.slice(0, MAX_STRUCTURED_PATCHES).map((f) => ({
                    ruleId: f.ruleId,
                    severity: f.severity,
                    title: f.title,
                    lineNumbers: f.lineNumbers,
                    oldText: f.patch?.oldText,
                    newText: f.patch?.newText,
                })),
            };
            return {
                content: [
                    { type: "text", text: md },
                    { type: "text", text: "```json\n" + JSON.stringify(structuredData, null, 2) + "\n```" },
                ],
            };
        }
        catch (error) {
            return {
                content: [
                    {
                        type: "text",
                        text: error instanceof Error ? `Error: ${error.message}` : "Error: evaluate_then_fix failed",
                    },
                ],
                isError: true,
            };
        }
    });
}
/**
 * Prefix each line of `text` with a diff marker followed by a space.
 *
 * @param text Patch fragment, possibly spanning several lines (LF or CRLF).
 * @param marker Diff marker to prepend, e.g. "-" or "+".
 * @returns The fragment with every line marked, joined with "\n".
 */
function prefixDiffLines(text, marker) {
    return text
        .split(/\r?\n/)
        .map((line) => `${marker} ${line}`)
        .join("\n");
}
|
|
770
|
+
// ─── evaluate_focused ────────────────────────────────────────────────────────
|
|
771
|
+
/**
 * Register the `evaluate_focused` tool.
 *
 * The handler runs `evaluateWithTribunal` with every judge disabled except
 * the requested ones. Requested IDs are de-duplicated and checked against
 * `JUDGES`: unknown IDs are reported rather than silently ignored, and a
 * request containing no known judge is rejected, because it would otherwise
 * disable every judge while still reporting a "focused" result.
 *
 * @param server MCP server instance exposing `tool(name, description, schema, handler)`.
 */
function registerEvaluateFocused(server) {
    server.tool("evaluate_focused", "Run a focused evaluation using only the specified judges. Use this after an initial full evaluation to re-check specific areas — for example, re-run only 'cybersecurity' and 'authentication' judges after applying security fixes. Much faster than a full tribunal evaluation.", {
        code: z.string().describe("The source code to evaluate."),
        language: z.string().describe("The programming language (e.g., 'typescript', 'python')."),
        judgeIds: z
            .array(z.string())
            .min(1)
            .describe("Array of judge IDs to run (e.g., ['cybersecurity', 'authentication', 'data-sovereignty'])"),
        context: z.string().optional().describe("Optional context about the code."),
        includeAstFindings: z.boolean().optional().describe("Include AST/code-structure findings (default: true)"),
        minConfidence: z
            .number()
            .min(0)
            .max(1)
            .optional()
            .describe("Minimum finding confidence to include (0-1, default: 0)"),
        config: configSchema,
    }, async ({ code, language, judgeIds, context, includeAstFindings, minConfidence, config }) => {
        try {
            const sizeError = validateCodeSize(code);
            if (sizeError) {
                return { content: [{ type: "text", text: `Error: ${sizeError}` }], isError: true };
            }
            const allJudgeIds = JUDGES.map((j) => j.id);
            const knownJudgeIds = new Set(allJudgeIds);
            // De-duplicate first so the reported judge count matches what actually runs.
            const requestedIds = [...new Set(judgeIds)];
            const focusedIds = requestedIds.filter((id) => knownJudgeIds.has(id));
            const unknownIds = requestedIds.filter((id) => !knownJudgeIds.has(id));
            if (focusedIds.length === 0) {
                return {
                    content: [
                        {
                            type: "text",
                            text: `Error: None of the requested judge IDs are known (${unknownIds.join(", ")}). ` +
                                `Available judges: ${allJudgeIds.join(", ")}`,
                        },
                    ],
                    isError: true,
                };
            }
            const cfgObj = toJudgesConfig(config);
            // Build a config that disables all judges EXCEPT the focused ones
            const focusedSet = new Set(focusedIds);
            const disabledJudges = allJudgeIds.filter((id) => !focusedSet.has(id));
            const mergedConfig = cfgObj
                ? { ...cfgObj, disabledJudges: [...new Set([...(cfgObj.disabledJudges ?? []), ...disabledJudges])] }
                : { disabledJudges };
            const verdict = evaluateWithTribunal(code, language, context, {
                includeAstFindings,
                minConfidence,
                config: mergedConfig,
            });
            let md = `# Focused Evaluation (${focusedIds.length} judges)\n\n`;
            md += `**Judges:** ${focusedIds.join(", ")}\n`;
            if (unknownIds.length > 0) {
                md += `**Ignored unknown judge IDs:** ${unknownIds.join(", ")}\n`;
            }
            md += `**Score:** ${verdict.overallScore}/100 | **Verdict:** ${verdict.overallVerdict.toUpperCase()}\n`;
            md += `**Findings:** ${verdict.findings.length}\n\n`;
            md += formatVerdictAsMarkdown(verdict);
            const structuredData = {
                focusedJudges: focusedIds,
                unknownJudgeIds: unknownIds,
                score: verdict.overallScore,
                verdict: verdict.overallVerdict,
                findingCount: verdict.findings.length,
                findings: verdict.findings.map((f) => ({
                    ruleId: f.ruleId,
                    severity: f.severity,
                    title: f.title,
                    lineNumbers: f.lineNumbers,
                    confidence: f.confidence,
                })),
            };
            return {
                content: [
                    { type: "text", text: md },
                    { type: "text", text: "```json\n" + JSON.stringify(structuredData, null, 2) + "\n```" },
                ],
            };
        }
        catch (error) {
            return {
                content: [
                    {
                        type: "text",
                        text: error instanceof Error ? `Error: ${error.message}` : "Error: Focused evaluation failed",
                    },
                ],
                isError: true,
            };
        }
    });
}
|
|
845
|
+
// ─── session_status ──────────────────────────────────────────────────────────
|
|
846
|
+
/**
 * Register the `session_status` tool.
 *
 * The handler reads the global evaluation session and returns two text
 * items: a Markdown summary (counters, frameworks, capabilities, a per-file
 * table and a feedback tally table) and a fenced JSON block with the same
 * data. Errors thrown while reading the session are returned as an
 * `isError` result.
 *
 * @param server MCP server instance exposing `tool(name, description, schema, handler)`.
 */
function registerSessionStatus(server) {
    // File keys and rule IDs come from callers, so they may contain pipes or
    // line breaks that would otherwise split a Markdown table row.
    const escapeTableCell = (value) => String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
    server.tool("session_status", "Get the current evaluation session status — how many evaluations have been run, detected frameworks, verdict history per file, and stability indicators. Useful for understanding what the tribunal has already reviewed.", {}, async () => {
        try {
            const session = getGlobalSession();
            const ctx = session.getContext();
            const filesEvaluated = [...ctx.verdictHistory.entries()].map(([file, history]) => ({
                file,
                evaluations: history.length,
                // An empty history reports a score of 0.
                latestScore: history[history.length - 1]?.score ?? 0,
                stable: session.isVerdictStable(file),
            }));
            let md = `# Evaluation Session Status\n\n`;
            md += `**Evaluations:** ${ctx.evaluationCount}\n`;
            md += `**Started:** ${ctx.startedAt}\n`;
            md += `**Detected Frameworks:** ${ctx.frameworks.length > 0 ? ctx.frameworks.join(", ") : "None yet"}\n`;
            md += `**Capabilities:** ${ctx.capabilities.size > 0 ? [...ctx.capabilities].join(", ") : "None yet"}\n\n`;
            if (filesEvaluated.length > 0) {
                md += `## Files Evaluated\n\n`;
                md += `| File | Evals | Latest Score | Stable |\n`;
                md += `|------|-------|--------------|--------|\n`;
                for (const f of filesEvaluated) {
                    md += `| ${escapeTableCell(f.file)} | ${f.evaluations} | ${f.latestScore}/100 | ${f.stable ? "Yes" : "No"} |\n`;
                }
            }
            const feedbackTally = [...session.getFeedbackTally().entries()];
            if (feedbackTally.length > 0) {
                md += `\n## Feedback Tally\n\n`;
                md += `| Rule | TP | FP | Won't Fix |\n`;
                md += `|------|----|----|----------|\n`;
                for (const [rule, counts] of feedbackTally) {
                    md += `| ${escapeTableCell(rule)} | ${counts.tp} | ${counts.fp} | ${counts.wontfix} |\n`;
                }
            }
            // The JSON payload keeps the raw (unescaped) keys.
            const structured = {
                evaluationCount: ctx.evaluationCount,
                startedAt: ctx.startedAt,
                frameworks: ctx.frameworks,
                capabilities: [...ctx.capabilities],
                filesEvaluated,
                feedbackTally: Object.fromEntries(feedbackTally),
            };
            return {
                content: [
                    { type: "text", text: md },
                    { type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
                ],
            };
        }
        catch (error) {
            return {
                content: [
                    {
                        type: "text",
                        text: error instanceof Error ? `Error: ${error.message}` : "Error: session_status failed",
                    },
                ],
                isError: true,
            };
        }
    });
}
|
|
898
|
+
// ─── record_feedback ─────────────────────────────────────────────────────────
|
|
899
|
+
/**
 * Register the `record_feedback` tool.
 *
 * The handler stores a tp / fp / wontfix verdict for a rule in the global
 * session, then reports the value returned by `getConfidencePenalty` for
 * that rule as a rounded percentage. Errors are returned as an `isError`
 * result.
 *
 * NOTE(review): the message labels the value a "confidence multiplier"
 * while the accessor is named "penalty" — confirm which reading is intended.
 *
 * @param server MCP server instance exposing `tool(name, description, schema, handler)`.
 */
function registerRecordFeedback(server) {
    server.tool("record_feedback", "Record user feedback on a finding — mark it as a true positive (tp), false positive (fp), or won't fix (wontfix). This feedback calibrates confidence scores in subsequent evaluations during the current session, reducing noise from rules the user considers inaccurate.", {
        ruleId: z.string().describe("The rule ID of the finding (e.g., 'SEC-001', 'AUTH-003')."),
        verdict: z
            .enum(["tp", "fp", "wontfix"])
            .describe("The feedback verdict: tp (true positive), fp (false positive), wontfix (acknowledged but won't fix)."),
    }, async ({ ruleId, verdict }) => {
        try {
            const session = getGlobalSession();
            session.recordFeedback(ruleId, verdict);
            const penalty = session.getConfidencePenalty(ruleId);
            const penaltyPct = Math.round(penalty * 100);
            // Only false-positive feedback gets the extra explanatory sentence.
            const fpNote = verdict === "fp"
                ? `Future findings for this rule will have reduced confidence in this session.`
                : ``;
            return {
                content: [
                    {
                        type: "text",
                        text: `Feedback recorded: **${ruleId}** → **${verdict}**\n\n` +
                            `Current confidence multiplier for ${ruleId}: **${penaltyPct}%**\n` +
                            fpNote,
                    },
                ],
            };
        }
        catch (error) {
            return {
                content: [
                    {
                        type: "text",
                        text: error instanceof Error ? `Error: ${error.message}` : "Error: record_feedback failed",
                    },
                ],
                isError: true,
            };
        }
    });
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Validate that code input is within acceptable size limits.
|
|
3
|
+
* Returns an error message string if validation fails, or `undefined` if valid.
|
|
4
|
+
*/
|
|
5
|
+
export declare function validateCodeSize(code: string, maxBytes?: number): string | undefined;
|
|
6
|
+
/** Recognized programming languages for validation warnings. */
|
|
7
|
+
export declare const KNOWN_LANGUAGES: Set<string>;
|
|
8
|
+
/**
|
|
9
|
+
* Normalize a language string by lowercasing it and trimming surrounding whitespace.
|
|
10
|
+
* Always returns the normalized string; the value is not checked against KNOWN_LANGUAGES.
|
|
11
|
+
* This is advisory only — unrecognized languages are still accepted.
|
|
12
|
+
*/
|
|
13
|
+
export declare function normalizeLanguage(lang: string): string;
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
// ─── Input Validation Helpers ────────────────────────────────────────────────
|
|
2
|
+
// Shared validation for MCP tool inputs at system boundaries.
|
|
3
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
4
|
+
/** Maximum code input size (1 MB). Prevents excessive memory/CPU usage. */
const MAX_CODE_BYTES = 1_048_576;
/**
 * Validate that code input is a non-empty string within the size limit.
 *
 * Size is measured in UTF-8 bytes, not characters, so multi-byte text
 * counts for more than its `length`.
 *
 * @param code Source text supplied by the caller.
 * @param maxBytes Maximum allowed UTF-8 byte length (defaults to 1 MB).
 * @returns An error message string if validation fails, or `undefined` if valid.
 */
export function validateCodeSize(code, maxBytes = MAX_CODE_BYTES) {
    // Report non-strings as a validation failure instead of throwing on `.length`.
    if (typeof code !== "string") {
        return "Code input must be a string.";
    }
    if (code.length === 0) {
        return "Code input is empty.";
    }
    const byteLength = Buffer.byteLength(code, "utf-8");
    if (byteLength > maxBytes) {
        // Round the actual size up and the limit down so the reported size is
        // always strictly larger than the reported limit (plain rounding could
        // print e.g. "1024 KB. Maximum allowed: 1024 KB").
        const actualKb = Math.ceil(byteLength / 1024);
        const limitKb = Math.floor(maxBytes / 1024);
        return `Code input too large (${actualKb} KB). Maximum allowed: ${limitKb} KB.`;
    }
    return undefined;
}
|
|
20
|
+
/**
 * Lowercase language identifiers recognized for validation warnings.
 * Insertion order is kept exactly as originally published.
 */
export const KNOWN_LANGUAGES = new Set([
    // General-purpose languages
    "typescript", "javascript", "python", "java", "csharp", "c", "cpp", "go", "rust", "ruby",
    "php", "swift", "kotlin", "scala", "r",
    // Shells and query languages
    "powershell", "bash", "shell", "sql",
    // Markup and styling
    "html", "css", "scss",
    // Infrastructure and data/config formats
    "bicep", "terraform", "hcl", "yaml", "yml", "json", "xml", "toml",
    // Build files and documents
    "dockerfile", "makefile", "markdown", "plaintext",
    // Additional languages
    "objective-c", "dart", "lua", "perl", "elixir", "erlang", "haskell", "fsharp", "vb",
    "assembly", "zig", "nim",
    "cloudformation",
]);
|
|
70
|
+
/**
 * Normalize a language string by lowercasing it and trimming surrounding
 * whitespace.
 *
 * The result is always a string: the value is not checked against
 * KNOWN_LANGUAGES and `undefined` is never returned, so unrecognized
 * languages pass through in normalized form.
 *
 * @param lang Language name as supplied by the caller.
 * @returns The lowercased, trimmed language name.
 */
export function normalizeLanguage(lang) {
    return lang.toLowerCase().trim();
}
|