@kevinrabun/judges 3.115.3 → 3.116.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +7 -3
- package/dist/api.js +7 -1
- package/dist/evaluation-session.d.ts +74 -0
- package/dist/evaluation-session.js +152 -0
- package/dist/evaluators/index.d.ts +23 -1
- package/dist/evaluators/index.js +163 -3
- package/dist/evaluators/judge-selector.d.ts +19 -0
- package/dist/evaluators/judge-selector.js +141 -0
- package/dist/index.js +2 -0
- package/dist/judges/index.d.ts +54 -9
- package/dist/judges/index.js +72 -14
- package/dist/tools/register-evaluation.js +208 -8
- package/dist/tools/register-fix.js +24 -1
- package/dist/tools/register-resources.d.ts +6 -0
- package/dist/tools/register-resources.js +177 -0
- package/dist/tools/register-review.js +26 -1
- package/dist/tools/register-workflow.js +384 -11
- package/dist/tools/validation.d.ts +13 -0
- package/dist/tools/validation.js +77 -0
- package/dist/types.d.ts +122 -0
- package/package.json +24 -12
- package/server.json +2 -2
package/dist/index.js
CHANGED
|
@@ -6,6 +6,7 @@ import("@modelcontextprotocol/sdk/server/mcp.js")
|
|
|
6
6
|
const { StdioServerTransport } = await import("@modelcontextprotocol/sdk/server/stdio.js");
|
|
7
7
|
const { registerTools } = await import("./tools/register.js");
|
|
8
8
|
const { registerPrompts } = await import("./tools/prompts.js");
|
|
9
|
+
const { registerResources } = await import("./tools/register-resources.js");
|
|
9
10
|
const { readFileSync } = await import("fs");
|
|
10
11
|
const { resolve, dirname } = await import("path");
|
|
11
12
|
const { fileURLToPath } = await import("url");
|
|
@@ -25,6 +26,7 @@ import("@modelcontextprotocol/sdk/server/mcp.js")
|
|
|
25
26
|
});
|
|
26
27
|
registerTools(server);
|
|
27
28
|
registerPrompts(server);
|
|
29
|
+
registerResources(server);
|
|
28
30
|
const transport = new StdioServerTransport();
|
|
29
31
|
await server.connect(transport);
|
|
30
32
|
console.error("Judges Panel MCP server running on stdio");
|
package/dist/judges/index.d.ts
CHANGED
|
@@ -1,17 +1,62 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Judge registry bootstrap
|
|
2
|
+
* Judge registry bootstrap.
|
|
3
3
|
*
|
|
4
|
-
* Judges are
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
4
|
+
* Judges are dual-registered:
|
|
5
|
+
* 1. Static side-effect imports below — each module calls
|
|
6
|
+
* `defaultRegistry.register()` at load time. These are inlined by
|
|
7
|
+
* esbuild and work in both ESM and CJS bundles.
|
|
8
|
+
* 2. Agent-native `.judge.md` files loaded at runtime from the `agents/`
|
|
9
|
+
* directory (when available). This enriches / overrides metadata.
|
|
8
10
|
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
* - `npm run generate:agents` (to sync)
|
|
12
|
-
* - `npm run validate:agents`
|
|
11
|
+
* The static imports guarantee that judges are always available, even in
|
|
12
|
+
* bundled environments (VS Code extension) where `agents/` is absent.
|
|
13
13
|
*/
|
|
14
14
|
import type { JudgeDefinition } from "../types.js";
|
|
15
|
+
import "./accessibility.js";
|
|
16
|
+
import "./agent-instructions.js";
|
|
17
|
+
import "./ai-code-safety.js";
|
|
18
|
+
import "./api-contract.js";
|
|
19
|
+
import "./api-design.js";
|
|
20
|
+
import "./authentication.js";
|
|
21
|
+
import "./backwards-compatibility.js";
|
|
22
|
+
import "./caching.js";
|
|
23
|
+
import "./ci-cd.js";
|
|
24
|
+
import "./cloud-readiness.js";
|
|
25
|
+
import "./code-structure.js";
|
|
26
|
+
import "./compliance.js";
|
|
27
|
+
import "./concurrency.js";
|
|
28
|
+
import "./configuration-management.js";
|
|
29
|
+
import "./cost-effectiveness.js";
|
|
30
|
+
import "./cybersecurity.js";
|
|
31
|
+
import "./data-security.js";
|
|
32
|
+
import "./data-sovereignty.js";
|
|
33
|
+
import "./database.js";
|
|
34
|
+
import "./dependency-health.js";
|
|
35
|
+
import "./documentation.js";
|
|
36
|
+
import "./error-handling.js";
|
|
37
|
+
import "./ethics-bias.js";
|
|
38
|
+
import "./false-positive-review.js";
|
|
39
|
+
import "./framework-safety.js";
|
|
40
|
+
import "./hallucination-detection.js";
|
|
41
|
+
import "./iac-security.js";
|
|
42
|
+
import "./intent-alignment.js";
|
|
43
|
+
import "./internationalization.js";
|
|
44
|
+
import "./logging-privacy.js";
|
|
45
|
+
import "./logic-review.js";
|
|
46
|
+
import "./maintainability.js";
|
|
47
|
+
import "./model-fingerprint.js";
|
|
48
|
+
import "./multi-turn-coherence.js";
|
|
49
|
+
import "./observability.js";
|
|
50
|
+
import "./over-engineering.js";
|
|
51
|
+
import "./performance.js";
|
|
52
|
+
import "./portability.js";
|
|
53
|
+
import "./rate-limiting.js";
|
|
54
|
+
import "./reliability.js";
|
|
55
|
+
import "./scalability.js";
|
|
56
|
+
import "./security.js";
|
|
57
|
+
import "./software-practices.js";
|
|
58
|
+
import "./testing.js";
|
|
59
|
+
import "./ux.js";
|
|
15
60
|
/**
|
|
16
61
|
* Load judges (agent-native). Loads agents from the default `agents/` folder
|
|
17
62
|
* and returns the current registry snapshot.
|
package/dist/judges/index.js
CHANGED
|
@@ -1,30 +1,85 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Judge registry bootstrap
|
|
2
|
+
* Judge registry bootstrap.
|
|
3
3
|
*
|
|
4
|
-
* Judges are
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
4
|
+
* Judges are dual-registered:
|
|
5
|
+
* 1. Static side-effect imports below — each module calls
|
|
6
|
+
* `defaultRegistry.register()` at load time. These are inlined by
|
|
7
|
+
* esbuild and work in both ESM and CJS bundles.
|
|
8
|
+
* 2. Agent-native `.judge.md` files loaded at runtime from the `agents/`
|
|
9
|
+
* directory (when available). This enriches / overrides metadata.
|
|
8
10
|
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
* - `npm run generate:agents` (to sync)
|
|
12
|
-
* - `npm run validate:agents`
|
|
11
|
+
* The static imports guarantee that judges are always available, even in
|
|
12
|
+
* bundled environments (VS Code extension) where `agents/` is absent.
|
|
13
13
|
*/
|
|
14
14
|
import { defaultRegistry } from "../judge-registry.js";
|
|
15
15
|
import { loadAndRegisterAgents } from "../agent-loader.js";
|
|
16
16
|
import { resolve, dirname } from "node:path";
|
|
17
17
|
import { fileURLToPath } from "node:url";
|
|
18
|
+
// ─── Static side-effect imports (self-registering) ──────────────────────────
|
|
19
|
+
import "./accessibility.js";
|
|
20
|
+
import "./agent-instructions.js";
|
|
21
|
+
import "./ai-code-safety.js";
|
|
22
|
+
import "./api-contract.js";
|
|
23
|
+
import "./api-design.js";
|
|
24
|
+
import "./authentication.js";
|
|
25
|
+
import "./backwards-compatibility.js";
|
|
26
|
+
import "./caching.js";
|
|
27
|
+
import "./ci-cd.js";
|
|
28
|
+
import "./cloud-readiness.js";
|
|
29
|
+
import "./code-structure.js";
|
|
30
|
+
import "./compliance.js";
|
|
31
|
+
import "./concurrency.js";
|
|
32
|
+
import "./configuration-management.js";
|
|
33
|
+
import "./cost-effectiveness.js";
|
|
34
|
+
import "./cybersecurity.js";
|
|
35
|
+
import "./data-security.js";
|
|
36
|
+
import "./data-sovereignty.js";
|
|
37
|
+
import "./database.js";
|
|
38
|
+
import "./dependency-health.js";
|
|
39
|
+
import "./documentation.js";
|
|
40
|
+
import "./error-handling.js";
|
|
41
|
+
import "./ethics-bias.js";
|
|
42
|
+
import "./false-positive-review.js";
|
|
43
|
+
import "./framework-safety.js";
|
|
44
|
+
import "./hallucination-detection.js";
|
|
45
|
+
import "./iac-security.js";
|
|
46
|
+
import "./intent-alignment.js";
|
|
47
|
+
import "./internationalization.js";
|
|
48
|
+
import "./logging-privacy.js";
|
|
49
|
+
import "./logic-review.js";
|
|
50
|
+
import "./maintainability.js";
|
|
51
|
+
import "./model-fingerprint.js";
|
|
52
|
+
import "./multi-turn-coherence.js";
|
|
53
|
+
import "./observability.js";
|
|
54
|
+
import "./over-engineering.js";
|
|
55
|
+
import "./performance.js";
|
|
56
|
+
import "./portability.js";
|
|
57
|
+
import "./rate-limiting.js";
|
|
58
|
+
import "./reliability.js";
|
|
59
|
+
import "./scalability.js";
|
|
60
|
+
import "./security.js";
|
|
61
|
+
import "./software-practices.js";
|
|
62
|
+
import "./testing.js";
|
|
63
|
+
import "./ux.js";
|
|
18
64
|
// Support both ESM (import.meta.url) and CJS (esbuild bundle) environments.
|
|
19
65
|
const _importMetaUrl = typeof import.meta?.url === "string" ? import.meta.url : undefined;
|
|
20
66
|
const __filename = _importMetaUrl ? fileURLToPath(_importMetaUrl) : "";
|
|
21
|
-
const __dirname = __filename ? dirname(__filename) :
|
|
67
|
+
const __dirname = __filename ? dirname(__filename) : "";
|
|
22
68
|
let agentsLoaded = false;
|
|
23
69
|
function loadDefaultAgents() {
|
|
24
70
|
if (agentsLoaded)
|
|
25
71
|
return;
|
|
26
|
-
|
|
27
|
-
|
|
72
|
+
// Static side-effect imports above already registered all built-in judges.
|
|
73
|
+
// In ESM mode, also load from agents/ directory for metadata enrichment.
|
|
74
|
+
if (__dirname) {
|
|
75
|
+
try {
|
|
76
|
+
const agentsDir = resolve(__dirname, "..", "..", "agents");
|
|
77
|
+
loadAndRegisterAgents(agentsDir, defaultRegistry);
|
|
78
|
+
}
|
|
79
|
+
catch {
|
|
80
|
+
// agents/ directory may not exist — built-in judges are already loaded
|
|
81
|
+
}
|
|
82
|
+
}
|
|
28
83
|
agentsLoaded = true;
|
|
29
84
|
}
|
|
30
85
|
// ─── Optional Agent Loader Integration ──────────────────────────────────────
|
|
@@ -42,9 +97,12 @@ export async function loadJudges() {
|
|
|
42
97
|
* agents can augment or replace built-in judges. If a judge is already
|
|
43
98
|
* registered, it is skipped.
|
|
44
99
|
*/
|
|
45
|
-
export function loadAgentJudges(dir
|
|
100
|
+
export function loadAgentJudges(dir) {
|
|
101
|
+
const agentsDir = dir ?? (__dirname ? resolve(__dirname, "..", "..", "agents") : "");
|
|
102
|
+
if (!agentsDir)
|
|
103
|
+
return 0; // CJS bundle — no agents directory available
|
|
46
104
|
agentsLoaded = false; // allow re-run to pick up new agents if dir changes
|
|
47
|
-
const count = loadAndRegisterAgents(
|
|
105
|
+
const count = loadAndRegisterAgents(agentsDir, defaultRegistry);
|
|
48
106
|
agentsLoaded = true;
|
|
49
107
|
return count;
|
|
50
108
|
}
|
|
@@ -5,10 +5,12 @@ import { z } from "zod";
|
|
|
5
5
|
import { readFileSync, existsSync } from "fs";
|
|
6
6
|
import { extname } from "path";
|
|
7
7
|
import { JUDGES, getJudge, getJudgeSummaries } from "../judges/index.js";
|
|
8
|
-
import { evaluateWithJudge, evaluateWithTribunal, formatVerdictAsMarkdown, formatEvaluationAsMarkdown, } from "../evaluators/index.js";
|
|
8
|
+
import { evaluateWithJudge, evaluateWithTribunal, evaluateWithTribunalStreaming, formatVerdictAsMarkdown, formatEvaluationAsMarkdown, } from "../evaluators/index.js";
|
|
9
9
|
import { evaluateCodeV2, evaluateProjectV2, getSupportedPolicyProfiles } from "../evaluators/v2.js";
|
|
10
10
|
import { detectProjectContext } from "../evaluators/shared.js";
|
|
11
|
+
import { getGlobalSession } from "../evaluation-session.js";
|
|
11
12
|
import { configSchema, toJudgesConfig } from "./schemas.js";
|
|
13
|
+
import { validateCodeSize } from "./validation.js";
|
|
12
14
|
import { buildSingleJudgeDeepReviewSection, buildTribunalDeepReviewSection } from "./deep-review.js";
|
|
13
15
|
/**
|
|
14
16
|
* Register evaluation-focused tools: get_judges, evaluate_code,
|
|
@@ -20,6 +22,7 @@ export function registerEvaluationTools(server) {
|
|
|
20
22
|
registerEvaluateSingleJudge(server);
|
|
21
23
|
registerEvaluateV2(server);
|
|
22
24
|
registerEvaluateFile(server);
|
|
25
|
+
registerEvaluateCodeStreaming(server);
|
|
23
26
|
}
|
|
24
27
|
// ─── get_judges ──────────────────────────────────────────────────────────────
|
|
25
28
|
function registerGetJudges(server) {
|
|
@@ -34,6 +37,15 @@ function registerGetJudges(server) {
|
|
|
34
37
|
type: "text",
|
|
35
38
|
text: `# Judges Panel\n\n${text}`,
|
|
36
39
|
},
|
|
40
|
+
{
|
|
41
|
+
type: "text",
|
|
42
|
+
text: "```json\n" +
|
|
43
|
+
JSON.stringify({
|
|
44
|
+
judgeCount: judges.length,
|
|
45
|
+
judges: judges.map((j) => ({ id: j.id, name: j.name, domain: j.domain })),
|
|
46
|
+
}, null, 2) +
|
|
47
|
+
"\n```",
|
|
48
|
+
},
|
|
37
49
|
],
|
|
38
50
|
};
|
|
39
51
|
});
|
|
@@ -70,20 +82,52 @@ function registerEvaluateCode(server) {
|
|
|
70
82
|
config: configSchema,
|
|
71
83
|
}, async ({ code, language, context, includeAstFindings, minConfidence, relatedFiles, config }) => {
|
|
72
84
|
try {
|
|
85
|
+
const sizeError = validateCodeSize(code);
|
|
86
|
+
if (sizeError) {
|
|
87
|
+
return { content: [{ type: "text", text: `Error: ${sizeError}` }], isError: true };
|
|
88
|
+
}
|
|
89
|
+
const session = getGlobalSession();
|
|
73
90
|
const verdict = evaluateWithTribunal(code, language, context, {
|
|
74
91
|
includeAstFindings,
|
|
75
92
|
minConfidence,
|
|
76
93
|
config: toJudgesConfig(config),
|
|
94
|
+
adaptiveSelection: true,
|
|
95
|
+
filePath: context,
|
|
77
96
|
});
|
|
97
|
+
// Track evaluation in session
|
|
98
|
+
session.recordEvaluation(context ?? `<inline:${language}>`, code, verdict);
|
|
78
99
|
const projectContext = detectProjectContext(code, language);
|
|
79
100
|
const patternResults = formatVerdictAsMarkdown(verdict);
|
|
80
101
|
const deepReview = buildTribunalDeepReviewSection(JUDGES, language, context, relatedFiles, projectContext);
|
|
102
|
+
// Structured JSON content block for programmatic consumption
|
|
103
|
+
const structuredData = {
|
|
104
|
+
score: verdict.overallScore,
|
|
105
|
+
verdict: verdict.overallVerdict,
|
|
106
|
+
findingCount: verdict.findings.length,
|
|
107
|
+
criticalCount: verdict.findings.filter((f) => f.severity === "critical").length,
|
|
108
|
+
highCount: verdict.findings.filter((f) => f.severity === "high").length,
|
|
109
|
+
judgesRun: verdict.evaluations.length,
|
|
110
|
+
findings: verdict.findings.map((f) => ({
|
|
111
|
+
ruleId: f.ruleId,
|
|
112
|
+
severity: f.severity,
|
|
113
|
+
title: f.title,
|
|
114
|
+
lineNumbers: f.lineNumbers,
|
|
115
|
+
confidence: f.confidence,
|
|
116
|
+
})),
|
|
117
|
+
sessionStats: {
|
|
118
|
+
evaluationCount: session.evaluationCount,
|
|
119
|
+
},
|
|
120
|
+
};
|
|
81
121
|
return {
|
|
82
122
|
content: [
|
|
83
123
|
{
|
|
84
124
|
type: "text",
|
|
85
125
|
text: patternResults + deepReview,
|
|
86
126
|
},
|
|
127
|
+
{
|
|
128
|
+
type: "text",
|
|
129
|
+
text: "```json\n" + JSON.stringify(structuredData, null, 2) + "\n```",
|
|
130
|
+
},
|
|
87
131
|
],
|
|
88
132
|
};
|
|
89
133
|
}
|
|
@@ -130,6 +174,10 @@ function registerEvaluateSingleJudge(server) {
|
|
|
130
174
|
config: configSchema,
|
|
131
175
|
}, async ({ code, language, judgeId, context, minConfidence, relatedFiles, config }) => {
|
|
132
176
|
try {
|
|
177
|
+
const sizeError = validateCodeSize(code);
|
|
178
|
+
if (sizeError) {
|
|
179
|
+
return { content: [{ type: "text", text: `Error: ${sizeError}` }], isError: true };
|
|
180
|
+
}
|
|
133
181
|
const judge = getJudge(judgeId);
|
|
134
182
|
if (!judge) {
|
|
135
183
|
return {
|
|
@@ -149,12 +197,25 @@ function registerEvaluateSingleJudge(server) {
|
|
|
149
197
|
const projectContext = detectProjectContext(code, language);
|
|
150
198
|
const patternResults = formatEvaluationAsMarkdown(evaluation);
|
|
151
199
|
const deepReview = buildSingleJudgeDeepReviewSection(judge, language, context, relatedFiles, projectContext);
|
|
200
|
+
const structured = {
|
|
201
|
+
judgeId,
|
|
202
|
+
judgeName: judge.name,
|
|
203
|
+
domain: judge.domain,
|
|
204
|
+
score: evaluation.score,
|
|
205
|
+
verdict: evaluation.verdict,
|
|
206
|
+
findingCount: evaluation.findings.length,
|
|
207
|
+
findings: evaluation.findings.map((f) => ({
|
|
208
|
+
ruleId: f.ruleId,
|
|
209
|
+
severity: f.severity,
|
|
210
|
+
title: f.title,
|
|
211
|
+
lineNumbers: f.lineNumbers,
|
|
212
|
+
confidence: f.confidence,
|
|
213
|
+
})),
|
|
214
|
+
};
|
|
152
215
|
return {
|
|
153
216
|
content: [
|
|
154
|
-
{
|
|
155
|
-
|
|
156
|
-
text: patternResults + deepReview,
|
|
157
|
-
},
|
|
217
|
+
{ type: "text", text: patternResults + deepReview },
|
|
218
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
|
|
158
219
|
],
|
|
159
220
|
};
|
|
160
221
|
}
|
|
@@ -173,7 +234,7 @@ function registerEvaluateSingleJudge(server) {
|
|
|
173
234
|
}
|
|
174
235
|
// ─── evaluate_v2 ─────────────────────────────────────────────────────────────
|
|
175
236
|
function registerEvaluateV2(server) {
|
|
176
|
-
server.tool("
|
|
237
|
+
server.tool("evaluate_policy_aware", "Run policy-aware tribunal evaluation with named policy profiles (startup, regulated, healthcare, fintech, public-sector), evidence calibration from runtime metrics, specialty-per-judge feedback, confidence scoring, and uncertainty reporting. Use this when code must meet specific compliance or vertical requirements.", {
|
|
177
238
|
code: z.string().optional().describe("Source code for single-file mode"),
|
|
178
239
|
language: z.string().optional().describe("Language for single-file mode"),
|
|
179
240
|
files: z
|
|
@@ -263,7 +324,7 @@ function registerEvaluateV2(server) {
|
|
|
263
324
|
evaluationContext,
|
|
264
325
|
evidence,
|
|
265
326
|
});
|
|
266
|
-
let md = `#
|
|
327
|
+
let md = `# Policy-Aware Tribunal Evaluation\n\n`;
|
|
267
328
|
md += `**Policy Profile:** ${result.policyProfile}\n`;
|
|
268
329
|
md += `**Calibrated Verdict:** ${result.calibratedVerdict.toUpperCase()} (${result.calibratedScore}/100)\n`;
|
|
269
330
|
md += `**Base Verdict:** ${result.baseVerdict.overallVerdict.toUpperCase()} (${result.baseVerdict.overallScore}/100)\n`;
|
|
@@ -310,7 +371,28 @@ function registerEvaluateV2(server) {
|
|
|
310
371
|
md += `\n## Supported Policy Profiles\n\n`;
|
|
311
372
|
md += supportedProfiles.map((profile) => `- ${profile}`).join("\n");
|
|
312
373
|
md += "\n";
|
|
313
|
-
|
|
374
|
+
const structured = {
|
|
375
|
+
policyProfile: result.policyProfile,
|
|
376
|
+
calibratedScore: result.calibratedScore,
|
|
377
|
+
calibratedVerdict: result.calibratedVerdict,
|
|
378
|
+
baseScore: result.baseVerdict.overallScore,
|
|
379
|
+
baseVerdict: result.baseVerdict.overallVerdict,
|
|
380
|
+
confidence: result.confidence,
|
|
381
|
+
findingCount: result.findings.length,
|
|
382
|
+
findings: result.findings.map((f) => ({
|
|
383
|
+
ruleId: f.ruleId,
|
|
384
|
+
severity: f.severity,
|
|
385
|
+
title: f.title,
|
|
386
|
+
confidence: f.confidence,
|
|
387
|
+
})),
|
|
388
|
+
uncertainty: result.uncertainty,
|
|
389
|
+
};
|
|
390
|
+
return {
|
|
391
|
+
content: [
|
|
392
|
+
{ type: "text", text: md },
|
|
393
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
|
|
394
|
+
],
|
|
395
|
+
};
|
|
314
396
|
}
|
|
315
397
|
catch (error) {
|
|
316
398
|
return {
|
|
@@ -382,20 +464,60 @@ function registerEvaluateFile(server) {
|
|
|
382
464
|
}
|
|
383
465
|
const code = readFileSync(filePath, "utf-8");
|
|
384
466
|
const detectedLang = language || detectLanguageFromPath(filePath);
|
|
467
|
+
const session = getGlobalSession();
|
|
468
|
+
// Skip re-evaluation if verdict is stable for this file
|
|
469
|
+
if (session.isVerdictStable(filePath)) {
|
|
470
|
+
const history = session.getVerdictHistory(filePath);
|
|
471
|
+
return {
|
|
472
|
+
content: [
|
|
473
|
+
{
|
|
474
|
+
type: "text",
|
|
475
|
+
text: `# Evaluation: ${filePath}\n\n` +
|
|
476
|
+
`> ⚡ **Verdict stable** — score has converged at **${history[0]?.score ?? 0}/100** ` +
|
|
477
|
+
`across last evaluations. Skipping redundant re-evaluation.\n\n` +
|
|
478
|
+
`Use \`evaluate_code\` with the code directly to force a fresh evaluation.`,
|
|
479
|
+
},
|
|
480
|
+
],
|
|
481
|
+
};
|
|
482
|
+
}
|
|
385
483
|
const verdict = evaluateWithTribunal(code, detectedLang, context, {
|
|
386
484
|
includeAstFindings,
|
|
387
485
|
minConfidence,
|
|
388
486
|
config: toJudgesConfig(config),
|
|
487
|
+
adaptiveSelection: true,
|
|
488
|
+
filePath,
|
|
389
489
|
});
|
|
490
|
+
session.recordEvaluation(filePath, code, verdict);
|
|
390
491
|
const projectContext = detectProjectContext(code, detectedLang, filePath);
|
|
391
492
|
const patternResults = formatVerdictAsMarkdown(verdict);
|
|
392
493
|
const deepReview = buildTribunalDeepReviewSection(JUDGES, detectedLang, context, undefined, projectContext);
|
|
494
|
+
const structuredData = {
|
|
495
|
+
filePath,
|
|
496
|
+
language: detectedLang,
|
|
497
|
+
score: verdict.overallScore,
|
|
498
|
+
verdict: verdict.overallVerdict,
|
|
499
|
+
findingCount: verdict.findings.length,
|
|
500
|
+
criticalCount: verdict.findings.filter((f) => f.severity === "critical").length,
|
|
501
|
+
highCount: verdict.findings.filter((f) => f.severity === "high").length,
|
|
502
|
+
judgesRun: verdict.evaluations.length,
|
|
503
|
+
findings: verdict.findings.map((f) => ({
|
|
504
|
+
ruleId: f.ruleId,
|
|
505
|
+
severity: f.severity,
|
|
506
|
+
title: f.title,
|
|
507
|
+
lineNumbers: f.lineNumbers,
|
|
508
|
+
confidence: f.confidence,
|
|
509
|
+
})),
|
|
510
|
+
};
|
|
393
511
|
return {
|
|
394
512
|
content: [
|
|
395
513
|
{
|
|
396
514
|
type: "text",
|
|
397
515
|
text: `# Evaluation: ${filePath}\n\n` + patternResults + deepReview,
|
|
398
516
|
},
|
|
517
|
+
{
|
|
518
|
+
type: "text",
|
|
519
|
+
text: "```json\n" + JSON.stringify(structuredData, null, 2) + "\n```",
|
|
520
|
+
},
|
|
399
521
|
],
|
|
400
522
|
};
|
|
401
523
|
}
|
|
@@ -412,3 +534,81 @@ function registerEvaluateFile(server) {
|
|
|
412
534
|
}
|
|
413
535
|
});
|
|
414
536
|
}
|
|
537
|
+
// ─── evaluate_code_streaming ─────────────────────────────────────────────────
|
|
538
|
+
function registerEvaluateCodeStreaming(server) {
|
|
539
|
+
server.tool("evaluate_code_streaming", `Submit code for streaming evaluation — returns per-judge results as each judge completes, with running aggregate scores. Ideal for long evaluations where you want progressive feedback. All ${JUDGES.length} judges run sequentially with per-judge results accumulated into a single structured response.`, {
|
|
540
|
+
code: z.string().describe("The source code to evaluate."),
|
|
541
|
+
language: z.string().describe("The programming language (e.g., 'typescript', 'python', 'javascript')."),
|
|
542
|
+
context: z.string().optional().describe("Optional context about the code."),
|
|
543
|
+
includeAstFindings: z.boolean().optional().describe("Include AST/code-structure findings (default: true)"),
|
|
544
|
+
minConfidence: z
|
|
545
|
+
.number()
|
|
546
|
+
.min(0)
|
|
547
|
+
.max(1)
|
|
548
|
+
.optional()
|
|
549
|
+
.describe("Minimum finding confidence to include (0-1, default: 0)"),
|
|
550
|
+
config: configSchema,
|
|
551
|
+
}, async ({ code, language, context, includeAstFindings, minConfidence, config }) => {
|
|
552
|
+
try {
|
|
553
|
+
const session = getGlobalSession();
|
|
554
|
+
const batches = [];
|
|
555
|
+
let finalBatch;
|
|
556
|
+
for await (const batch of evaluateWithTribunalStreaming(code, language, context, {
|
|
557
|
+
includeAstFindings,
|
|
558
|
+
minConfidence,
|
|
559
|
+
config: toJudgesConfig(config),
|
|
560
|
+
adaptiveSelection: true,
|
|
561
|
+
})) {
|
|
562
|
+
batches.push({
|
|
563
|
+
judgeId: batch.judgeId,
|
|
564
|
+
judgeName: batch.judgeName,
|
|
565
|
+
findingCount: batch.evaluation.findings.length,
|
|
566
|
+
durationMs: batch.evaluation.durationMs ?? 0,
|
|
567
|
+
runningScore: batch.aggregate.currentScore,
|
|
568
|
+
runningVerdict: batch.aggregate.currentVerdict,
|
|
569
|
+
});
|
|
570
|
+
finalBatch = batch;
|
|
571
|
+
}
|
|
572
|
+
// Build progressive markdown
|
|
573
|
+
let md = `# Streaming Evaluation Results\n\n`;
|
|
574
|
+
md += `**Final Score:** ${finalBatch?.aggregate.currentScore ?? 0}/100\n`;
|
|
575
|
+
md += `**Verdict:** ${(finalBatch?.aggregate.currentVerdict ?? "pass").toUpperCase()}\n`;
|
|
576
|
+
md += `**Judges Run:** ${finalBatch?.aggregate.completedJudges ?? 0}/${finalBatch?.aggregate.totalJudges ?? 0}\n`;
|
|
577
|
+
md += `**Total Findings:** ${finalBatch?.aggregate.findingsSoFar ?? 0}\n\n`;
|
|
578
|
+
md += `## Per-Judge Breakdown\n\n`;
|
|
579
|
+
md += `| Judge | Findings | Time (ms) | Running Score |\n`;
|
|
580
|
+
md += `|-------|----------|-----------|---------------|\n`;
|
|
581
|
+
for (const b of batches) {
|
|
582
|
+
md += `| ${b.judgeName} | ${b.findingCount} | ${b.durationMs} | ${b.runningScore}/100 |\n`;
|
|
583
|
+
}
|
|
584
|
+
const structuredData = {
|
|
585
|
+
score: finalBatch?.aggregate.currentScore ?? 0,
|
|
586
|
+
verdict: finalBatch?.aggregate.currentVerdict ?? "pass",
|
|
587
|
+
totalFindings: finalBatch?.aggregate.findingsSoFar ?? 0,
|
|
588
|
+
criticalFindings: finalBatch?.aggregate.criticalSoFar ?? 0,
|
|
589
|
+
highFindings: finalBatch?.aggregate.highSoFar ?? 0,
|
|
590
|
+
judgesRun: finalBatch?.aggregate.completedJudges ?? 0,
|
|
591
|
+
totalJudges: finalBatch?.aggregate.totalJudges ?? 0,
|
|
592
|
+
perJudge: batches,
|
|
593
|
+
sessionEvaluationCount: session.evaluationCount,
|
|
594
|
+
};
|
|
595
|
+
return {
|
|
596
|
+
content: [
|
|
597
|
+
{ type: "text", text: md },
|
|
598
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structuredData, null, 2) + "\n```" },
|
|
599
|
+
],
|
|
600
|
+
};
|
|
601
|
+
}
|
|
602
|
+
catch (error) {
|
|
603
|
+
return {
|
|
604
|
+
content: [
|
|
605
|
+
{
|
|
606
|
+
type: "text",
|
|
607
|
+
text: error instanceof Error ? `Error: ${error.message}` : "Error: Streaming evaluation failed",
|
|
608
|
+
},
|
|
609
|
+
],
|
|
610
|
+
isError: true,
|
|
611
|
+
};
|
|
612
|
+
}
|
|
613
|
+
});
|
|
614
|
+
}
|
|
@@ -7,6 +7,7 @@ import { evaluateWithTribunal, evaluateWithJudge } from "../evaluators/index.js"
|
|
|
7
7
|
import { getJudge, JUDGES } from "../judges/index.js";
|
|
8
8
|
import { applyPatches } from "../commands/fix.js";
|
|
9
9
|
import { configSchema, toJudgesConfig } from "./schemas.js";
|
|
10
|
+
import { validateCodeSize } from "./validation.js";
|
|
10
11
|
/**
|
|
11
12
|
* Register the fix_code tool for one-shot code evaluation + auto-fix.
|
|
12
13
|
*/
|
|
@@ -38,6 +39,10 @@ function registerFixCode(server) {
|
|
|
38
39
|
config: configSchema,
|
|
39
40
|
}, async ({ code, language, judgeId, context, minConfidence, config }) => {
|
|
40
41
|
try {
|
|
42
|
+
const sizeError = validateCodeSize(code);
|
|
43
|
+
if (sizeError) {
|
|
44
|
+
return { content: [{ type: "text", text: `Error: ${sizeError}` }], isError: true };
|
|
45
|
+
}
|
|
41
46
|
const effectiveMinConfidence = minConfidence ?? 0.5;
|
|
42
47
|
// ── Evaluate ────────────────────────────────────────────────
|
|
43
48
|
let allFindings;
|
|
@@ -133,8 +138,26 @@ function registerFixCode(server) {
|
|
|
133
138
|
text += `\n`;
|
|
134
139
|
}
|
|
135
140
|
text += `### Fixed Code\n\n\`\`\`${language}\n${fixedCode}\n\`\`\`\n`;
|
|
141
|
+
const structured = {
|
|
142
|
+
totalFindings: allFindings.length,
|
|
143
|
+
autoFixable: fixable.length,
|
|
144
|
+
applied,
|
|
145
|
+
skipped,
|
|
146
|
+
remaining: remaining.length,
|
|
147
|
+
patches: fixable.map((p) => ({
|
|
148
|
+
ruleId: p.ruleId,
|
|
149
|
+
severity: p.severity,
|
|
150
|
+
title: p.title,
|
|
151
|
+
line: p.patch.startLine,
|
|
152
|
+
oldText: p.patch.oldText,
|
|
153
|
+
newText: p.patch.newText,
|
|
154
|
+
})),
|
|
155
|
+
};
|
|
136
156
|
return {
|
|
137
|
-
content: [
|
|
157
|
+
content: [
|
|
158
|
+
{ type: "text", text },
|
|
159
|
+
{ type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
|
|
160
|
+
],
|
|
138
161
|
};
|
|
139
162
|
}
|
|
140
163
|
catch (error) {
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
2
|
+
/**
|
|
3
|
+
* Register MCP resources: judges catalog, presets, session state,
|
|
4
|
+
* and parameterized templates for single-judge / single-preset lookups.
|
|
5
|
+
*/
|
|
6
|
+
export declare function registerResources(server: McpServer): void;
|