@kevinrabun/judges 3.115.4 → 3.117.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. package/agents/accessibility.judge.md +7 -0
  2. package/agents/agent-instructions.judge.md +7 -0
  3. package/agents/ai-code-safety.judge.md +7 -0
  4. package/agents/api-contract.judge.md +7 -0
  5. package/agents/api-design.judge.md +7 -0
  6. package/agents/authentication.judge.md +7 -0
  7. package/agents/backwards-compatibility.judge.md +7 -0
  8. package/agents/caching.judge.md +7 -0
  9. package/agents/ci-cd.judge.md +7 -0
  10. package/agents/cloud-readiness.judge.md +7 -0
  11. package/agents/concurrency.judge.md +7 -0
  12. package/agents/configuration-management.judge.md +7 -0
  13. package/agents/cybersecurity.judge.md +7 -0
  14. package/agents/data-security.judge.md +7 -0
  15. package/agents/dependency-health.judge.md +7 -0
  16. package/agents/documentation.judge.md +7 -0
  17. package/agents/error-handling.judge.md +7 -0
  18. package/agents/ethics-bias.judge.md +7 -0
  19. package/agents/false-positive-review.judge.md +12 -0
  20. package/agents/framework-safety.judge.md +7 -0
  21. package/agents/hallucination-detection.judge.md +13 -0
  22. package/agents/iac-security.judge.md +7 -0
  23. package/agents/intent-alignment.judge.md +13 -0
  24. package/agents/logging-privacy.judge.md +7 -0
  25. package/agents/maintainability.judge.md +7 -0
  26. package/agents/multi-turn-coherence.judge.md +7 -0
  27. package/agents/observability.judge.md +7 -0
  28. package/agents/portability.judge.md +7 -0
  29. package/agents/rate-limiting.judge.md +7 -0
  30. package/agents/reliability.judge.md +7 -0
  31. package/agents/security.judge.md +13 -0
  32. package/agents/testing.judge.md +7 -0
  33. package/agents/ux.judge.md +7 -0
  34. package/dist/a2a-protocol.d.ts +136 -0
  35. package/dist/a2a-protocol.js +218 -0
  36. package/dist/api.d.ts +21 -3
  37. package/dist/api.js +21 -1
  38. package/dist/audit-trail.d.ts +245 -0
  39. package/dist/audit-trail.js +257 -0
  40. package/dist/commands/benchmark-advanced.js +51 -51
  41. package/dist/commands/benchmark-ai-agents.js +16 -16
  42. package/dist/commands/benchmark-compliance-ethics.js +12 -12
  43. package/dist/commands/benchmark-expanded-2.js +2 -2
  44. package/dist/commands/benchmark-expanded.js +2 -2
  45. package/dist/commands/benchmark-infrastructure.js +12 -12
  46. package/dist/commands/benchmark-languages.js +11 -11
  47. package/dist/commands/benchmark-quality-ops.js +7 -7
  48. package/dist/commands/benchmark-security-deep.js +9 -9
  49. package/dist/commands/benchmark.js +1 -1
  50. package/dist/commands/llm-benchmark-optimizer.d.ts +78 -0
  51. package/dist/commands/llm-benchmark-optimizer.js +241 -0
  52. package/dist/commands/llm-benchmark.d.ts +4 -2
  53. package/dist/commands/llm-benchmark.js +40 -12
  54. package/dist/escalation.d.ts +100 -0
  55. package/dist/escalation.js +292 -0
  56. package/dist/evaluation-session.d.ts +74 -0
  57. package/dist/evaluation-session.js +152 -0
  58. package/dist/evaluators/index.d.ts +23 -1
  59. package/dist/evaluators/index.js +192 -3
  60. package/dist/evaluators/judge-selector.d.ts +19 -0
  61. package/dist/evaluators/judge-selector.js +141 -0
  62. package/dist/evaluators/recall-boost.d.ts +27 -0
  63. package/dist/evaluators/recall-boost.js +409 -0
  64. package/dist/feedback-loop.d.ts +62 -0
  65. package/dist/feedback-loop.js +179 -0
  66. package/dist/index.js +2 -0
  67. package/dist/judges/accessibility.js +7 -0
  68. package/dist/judges/agent-instructions.js +7 -0
  69. package/dist/judges/ai-code-safety.js +7 -0
  70. package/dist/judges/api-contract.js +7 -0
  71. package/dist/judges/api-design.js +7 -0
  72. package/dist/judges/authentication.js +7 -0
  73. package/dist/judges/backwards-compatibility.js +7 -0
  74. package/dist/judges/caching.js +7 -0
  75. package/dist/judges/ci-cd.js +7 -0
  76. package/dist/judges/cloud-readiness.js +7 -0
  77. package/dist/judges/concurrency.js +7 -0
  78. package/dist/judges/configuration-management.js +7 -0
  79. package/dist/judges/cybersecurity.js +7 -0
  80. package/dist/judges/data-security.js +7 -0
  81. package/dist/judges/dependency-health.js +7 -0
  82. package/dist/judges/documentation.js +7 -0
  83. package/dist/judges/error-handling.js +7 -0
  84. package/dist/judges/ethics-bias.js +7 -0
  85. package/dist/judges/false-positive-review.js +13 -1
  86. package/dist/judges/framework-safety.js +7 -0
  87. package/dist/judges/hallucination-detection.js +14 -1
  88. package/dist/judges/iac-security.js +7 -0
  89. package/dist/judges/intent-alignment.js +14 -1
  90. package/dist/judges/logging-privacy.js +7 -0
  91. package/dist/judges/maintainability.js +7 -0
  92. package/dist/judges/multi-turn-coherence.js +7 -0
  93. package/dist/judges/observability.js +7 -0
  94. package/dist/judges/portability.js +7 -0
  95. package/dist/judges/rate-limiting.js +7 -0
  96. package/dist/judges/reliability.js +7 -0
  97. package/dist/judges/security.js +14 -1
  98. package/dist/judges/testing.js +7 -0
  99. package/dist/judges/ux.js +7 -0
  100. package/dist/review-conversation.d.ts +87 -0
  101. package/dist/review-conversation.js +307 -0
  102. package/dist/sast-integration.d.ts +112 -0
  103. package/dist/sast-integration.js +215 -0
  104. package/dist/tools/register-evaluation.js +208 -8
  105. package/dist/tools/register-fix.js +24 -1
  106. package/dist/tools/register-resources.d.ts +6 -0
  107. package/dist/tools/register-resources.js +177 -0
  108. package/dist/tools/register-review.js +26 -1
  109. package/dist/tools/register-workflow.js +384 -11
  110. package/dist/tools/validation.d.ts +13 -0
  111. package/dist/tools/validation.js +77 -0
  112. package/dist/types.d.ts +122 -0
  113. package/package.json +25 -12
  114. package/server.json +2 -2
@@ -4,10 +4,12 @@
4
4
  // ──────────────────────────────────────────────────────────────────────────────
5
5
  import { z } from "zod";
6
6
  import { JUDGES } from "../judges/index.js";
7
- import { evaluateProject, evaluateDiff, analyzeDependencies, runAppBuilderWorkflow } from "../evaluators/index.js";
7
+ import { evaluateProject, evaluateDiff, analyzeDependencies, runAppBuilderWorkflow, evaluateWithTribunal, enrichWithPatches, formatVerdictAsMarkdown, } from "../evaluators/index.js";
8
8
  import { evaluateFilesBatch } from "../api.js";
9
+ import { getGlobalSession } from "../evaluation-session.js";
9
10
  import { generatePublicRepoReport } from "../reports/public-repo-report.js";
10
11
  import { configSchema, toJudgesConfig } from "./schemas.js";
12
+ import { validateCodeSize } from "./validation.js";
11
13
  import { benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, runBenchmarkSuite, } from "../commands/benchmark.js";
12
14
  /**
13
15
  * Register workflow-focused tools: evaluate_public_repo_report, evaluate_project,
@@ -22,6 +24,10 @@ export function registerWorkflowTools(server) {
22
24
  registerBenchmarkGate(server);
23
25
  registerBenchmarkDashboard(server);
24
26
  registerEvaluateBatch(server);
27
+ registerEvaluateThenFix(server);
28
+ registerEvaluateFocused(server);
29
+ registerSessionStatus(server);
30
+ registerRecordFeedback(server);
25
31
  }
26
32
  // ─── evaluate_public_repo_report ─────────────────────────────────────────────
27
33
  function registerPublicRepoReport(server) {
@@ -75,6 +81,10 @@ function registerPublicRepoReport(server) {
75
81
  keepClone: z.boolean().optional().describe("Keep cloned repository on disk for inspection"),
76
82
  }, async ({ repoUrl, branch, outputPath, maxFiles, maxFileBytes, maxFindingsInReport, credentialMode, includeAstFindings, minConfidence, enableMustFixGate, mustFixMinConfidence, mustFixDangerousRulePrefixes, keepClone, }) => {
77
83
  try {
84
+ await server.sendLoggingMessage({
85
+ level: "info",
86
+ data: `Cloning repository: ${repoUrl}${branch ? ` (branch: ${branch})` : ""}...`,
87
+ });
78
88
  const report = generatePublicRepoReport({
79
89
  repoUrl,
80
90
  branch,
@@ -112,12 +122,18 @@ function registerPublicRepoReport(server) {
112
122
  if (keepClone) {
113
123
  summary += `- Clone path: ${report.clonePath}\n`;
114
124
  }
125
+ const structured = {
126
+ repoUrl,
127
+ overallVerdict: report.overallVerdict,
128
+ averageScore: report.averageScore,
129
+ analyzedFileCount: report.analyzedFileCount,
130
+ totalFindings: report.totalFindings,
131
+ outputPath: report.outputPath ?? null,
132
+ };
115
133
  return {
116
134
  content: [
117
- {
118
- type: "text",
119
- text: `${summary}\n---\n\n${report.markdown}`,
120
- },
135
+ { type: "text", text: `${summary}\n---\n\n${report.markdown}` },
136
+ { type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
121
137
  ],
122
138
  };
123
139
  }
@@ -217,7 +233,37 @@ function registerAppBuilderFlow(server) {
217
233
  md += `- **${task.priority} ${task.ruleId}** ${task.task}\n`;
218
234
  }
219
235
  }
220
- return { content: [{ type: "text", text: md }] };
236
+ const structured = {
237
+ mode: result.mode,
238
+ releaseDecision: result.releaseDecision,
239
+ score: result.score,
240
+ verdict: result.verdict,
241
+ criticalCount: result.criticalCount,
242
+ highCount: result.highCount,
243
+ mediumCount: result.mediumCount,
244
+ taskCount: result.tasks.length,
245
+ aiFixableCount: result.aiFixableNow.length,
246
+ findings: result.plainLanguageFindings.map((f) => ({
247
+ ruleId: f.ruleId,
248
+ severity: f.severity,
249
+ title: f.title,
250
+ whatIsWrong: f.whatIsWrong,
251
+ nextAction: f.nextAction,
252
+ })),
253
+ tasks: result.tasks.map((t) => ({
254
+ priority: t.priority,
255
+ owner: t.owner,
256
+ effort: t.effort,
257
+ ruleId: t.ruleId,
258
+ task: t.task,
259
+ })),
260
+ };
261
+ return {
262
+ content: [
263
+ { type: "text", text: md },
264
+ { type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
265
+ ],
266
+ };
221
267
  }
222
268
  catch (error) {
223
269
  return {
@@ -253,6 +299,10 @@ function registerEvaluateProject(server) {
253
299
  config: configSchema,
254
300
  }, async ({ files, context, includeAstFindings, minConfidence, config }) => {
255
301
  try {
302
+ await server.sendLoggingMessage({
303
+ level: "info",
304
+ data: `Evaluating ${files.length} files with ${JUDGES.length} judges...`,
305
+ });
256
306
  const result = evaluateProject(files, context, {
257
307
  includeAstFindings,
258
308
  minConfidence,
@@ -282,7 +332,37 @@ function registerEvaluateProject(server) {
282
332
  md += `- **[${f.severity.toUpperCase()}]** ${f.ruleId}: ${f.title}\n ${f.description}\n`;
283
333
  }
284
334
  }
285
- return { content: [{ type: "text", text: md }] };
335
+ const structured = {
336
+ overallScore: result.overallScore,
337
+ overallVerdict: result.overallVerdict,
338
+ fileCount: result.fileResults.length,
339
+ criticalCount: result.criticalCount,
340
+ highCount: result.highCount,
341
+ fileResults: result.fileResults.map((fr) => ({
342
+ path: fr.path,
343
+ language: fr.language,
344
+ score: fr.score,
345
+ findingCount: fr.findings.length,
346
+ findings: fr.findings.map((f) => ({
347
+ ruleId: f.ruleId,
348
+ severity: f.severity,
349
+ title: f.title,
350
+ line: f.lineNumbers?.[0],
351
+ })),
352
+ })),
353
+ architecturalFindings: result.architecturalFindings.map((f) => ({
354
+ ruleId: f.ruleId,
355
+ severity: f.severity,
356
+ title: f.title,
357
+ description: f.description,
358
+ })),
359
+ };
360
+ return {
361
+ content: [
362
+ { type: "text", text: md },
363
+ { type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
364
+ ],
365
+ };
286
366
  }
287
367
  catch (error) {
288
368
  return {
@@ -314,6 +394,10 @@ function registerEvaluateDiff(server) {
314
394
  config: configSchema,
315
395
  }, async ({ code, language, changedLines, context, includeAstFindings, minConfidence, config }) => {
316
396
  try {
397
+ const sizeError = validateCodeSize(code);
398
+ if (sizeError) {
399
+ return { content: [{ type: "text", text: `Error: ${sizeError}` }], isError: true };
400
+ }
317
401
  const result = evaluateDiff(code, language, changedLines, context, {
318
402
  includeAstFindings,
319
403
  minConfidence,
@@ -334,7 +418,24 @@ function registerEvaluateDiff(server) {
334
418
  md += `**Recommendation:** ${f.recommendation}\n\n`;
335
419
  }
336
420
  }
337
- return { content: [{ type: "text", text: md }] };
421
+ const structured = {
422
+ score: result.score,
423
+ verdict: result.verdict,
424
+ linesAnalyzed: result.linesAnalyzed,
425
+ findingCount: result.findings.length,
426
+ findings: result.findings.map((f) => ({
427
+ ruleId: f.ruleId,
428
+ severity: f.severity,
429
+ title: f.title,
430
+ lineNumbers: f.lineNumbers,
431
+ })),
432
+ };
433
+ return {
434
+ content: [
435
+ { type: "text", text: md },
436
+ { type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
437
+ ],
438
+ };
338
439
  }
339
440
  catch (error) {
340
441
  return {
@@ -382,7 +483,24 @@ function registerAnalyzeDependencies(server) {
382
483
  md += `**Development (${dev.length}):** ${dev.map((d) => `${d.name}@${d.version}`).join(", ")}\n\n`;
383
484
  }
384
485
  }
385
- return { content: [{ type: "text", text: md }] };
486
+ const structured = {
487
+ manifestType,
488
+ score: result.score,
489
+ verdict: result.verdict,
490
+ totalDependencies: result.totalDependencies,
491
+ findingCount: result.findings.length,
492
+ findings: result.findings.map((f) => ({
493
+ ruleId: f.ruleId,
494
+ severity: f.severity,
495
+ title: f.title,
496
+ })),
497
+ };
498
+ return {
499
+ content: [
500
+ { type: "text", text: md },
501
+ { type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
502
+ ],
503
+ };
386
504
  }
387
505
  catch (error) {
388
506
  return {
@@ -509,8 +627,13 @@ function registerEvaluateBatch(server) {
509
627
  }, async (params) => {
510
628
  const config = params.config ? toJudgesConfig(params.config) : undefined;
511
629
  const options = config ? { config } : undefined;
630
+ await server.sendLoggingMessage({ level: "info", data: `Batch evaluation: ${params.files.length} files...` });
512
631
  // Use bounded-concurrency parallel evaluation instead of sequential loop
513
- const batchResults = await evaluateFilesBatch(params.files, 4, options);
632
+ const batchResults = await evaluateFilesBatch(params.files, 4, options, (completed, total) => {
633
+ server
634
+ .sendLoggingMessage({ level: "info", data: `Progress: ${completed}/${total} files evaluated` })
635
+ .catch(() => { });
636
+ });
514
637
  const results = batchResults.map((r) => {
515
638
  const criticals = r.verdict.findings.filter((f) => f.severity === "critical").length;
516
639
  return {
@@ -541,8 +664,258 @@ function registerEvaluateBatch(server) {
541
664
  results.map((r) => `| ${r.path} | ${r.score} | ${r.findingCount} | ${r.criticalCount} |`).join("\n") +
542
665
  "\n\n" +
543
666
  allFindings.join("\n\n");
667
+ const structured = {
668
+ fileCount: results.length,
669
+ averageScore: avgScore,
670
+ totalFindings,
671
+ totalCriticals,
672
+ files: results,
673
+ };
674
+ return {
675
+ content: [
676
+ { type: "text", text: summary },
677
+ { type: "text", text: "```json\n" + JSON.stringify(structured, null, 2) + "\n```" },
678
+ ],
679
+ };
680
+ });
681
+ }
682
+ // ─── evaluate_then_fix ───────────────────────────────────────────────────────
683
+ function registerEvaluateThenFix(server) {
684
+ server.tool("evaluate_then_fix", "Evaluate code and automatically generate fix patches for all findings that have auto-fix support. Returns the evaluation verdict alongside ready-to-apply patches. Use this for a single-step 'review + fix' workflow.", {
685
+ code: z.string().describe("The source code to evaluate and fix."),
686
+ language: z.string().describe("The programming language (e.g., 'typescript', 'python')."),
687
+ context: z.string().optional().describe("Optional context about the code."),
688
+ includeAstFindings: z.boolean().optional().describe("Include AST/code-structure findings (default: true)"),
689
+ minConfidence: z
690
+ .number()
691
+ .min(0)
692
+ .max(1)
693
+ .optional()
694
+ .describe("Minimum finding confidence to include (0-1, default: 0)"),
695
+ config: configSchema,
696
+ }, async ({ code, language, context, includeAstFindings, minConfidence, config }) => {
697
+ try {
698
+ const sizeError = validateCodeSize(code);
699
+ if (sizeError) {
700
+ return { content: [{ type: "text", text: `Error: ${sizeError}` }], isError: true };
701
+ }
702
+ const session = getGlobalSession();
703
+ // Step 1: Evaluate
704
+ const verdict = evaluateWithTribunal(code, language, context, {
705
+ includeAstFindings,
706
+ minConfidence,
707
+ config: toJudgesConfig(config),
708
+ adaptiveSelection: true,
709
+ });
710
+ // Step 2: Generate fix patches for all findings
711
+ const patchedFindings = enrichWithPatches(verdict.findings, code);
712
+ session.recordEvaluation(context ?? `<inline:${language}>`, code, verdict);
713
+ const patchableFindings = patchedFindings.filter((f) => f.patch);
714
+ const patchCount = patchableFindings.length;
715
+ let md = `# Evaluate & Fix Results\n\n`;
716
+ md += `**Score:** ${verdict.overallScore}/100 | **Verdict:** ${verdict.overallVerdict.toUpperCase()}\n`;
717
+ md += `**Total Findings:** ${verdict.findings.length} | **Auto-fixable:** ${patchCount}\n\n`;
718
+ if (patchCount > 0) {
719
+ md += `## Auto-Fix Patches\n\n`;
720
+ md += `The following findings have auto-fix patches ready to apply:\n\n`;
721
+ for (const f of patchableFindings.slice(0, 20)) {
722
+ md += `### ${f.ruleId}: ${f.title}\n`;
723
+ md += `- **Severity:** ${f.severity} | **Lines:** ${f.lineNumbers?.join(", ") ?? "N/A"}\n`;
724
+ md += `- **Fix:**\n\`\`\`diff\n`;
725
+ if (f.patch?.oldText)
726
+ md += `- ${f.patch.oldText}\n`;
727
+ if (f.patch?.newText)
728
+ md += `+ ${f.patch.newText}\n`;
729
+ md += `\`\`\`\n\n`;
730
+ }
731
+ if (patchableFindings.length > 20) {
732
+ md += `> ... and ${patchableFindings.length - 20} more auto-fixable findings\n\n`;
733
+ }
734
+ }
735
+ md += formatVerdictAsMarkdown(verdict);
736
+ const structuredData = {
737
+ score: verdict.overallScore,
738
+ verdict: verdict.overallVerdict,
739
+ totalFindings: verdict.findings.length,
740
+ autoFixable: patchCount,
741
+ patches: patchableFindings.slice(0, 50).map((f) => ({
742
+ ruleId: f.ruleId,
743
+ severity: f.severity,
744
+ title: f.title,
745
+ lineNumbers: f.lineNumbers,
746
+ oldText: f.patch?.oldText,
747
+ newText: f.patch?.newText,
748
+ })),
749
+ };
750
+ return {
751
+ content: [
752
+ { type: "text", text: md },
753
+ { type: "text", text: "```json\n" + JSON.stringify(structuredData, null, 2) + "\n```" },
754
+ ],
755
+ };
756
+ }
757
+ catch (error) {
758
+ return {
759
+ content: [
760
+ {
761
+ type: "text",
762
+ text: error instanceof Error ? `Error: ${error.message}` : "Error: evaluate_then_fix failed",
763
+ },
764
+ ],
765
+ isError: true,
766
+ };
767
+ }
768
+ });
769
+ }
770
+ // ─── evaluate_focused ────────────────────────────────────────────────────────
771
+ function registerEvaluateFocused(server) {
772
+ server.tool("evaluate_focused", "Run a focused evaluation using only the specified judges. Use this after an initial full evaluation to re-check specific areas — for example, re-run only 'cybersecurity' and 'authentication' judges after applying security fixes. Much faster than a full tribunal evaluation.", {
773
+ code: z.string().describe("The source code to evaluate."),
774
+ language: z.string().describe("The programming language (e.g., 'typescript', 'python')."),
775
+ judgeIds: z
776
+ .array(z.string())
777
+ .min(1)
778
+ .describe("Array of judge IDs to run (e.g., ['cybersecurity', 'authentication', 'data-sovereignty'])"),
779
+ context: z.string().optional().describe("Optional context about the code."),
780
+ includeAstFindings: z.boolean().optional().describe("Include AST/code-structure findings (default: true)"),
781
+ minConfidence: z
782
+ .number()
783
+ .min(0)
784
+ .max(1)
785
+ .optional()
786
+ .describe("Minimum finding confidence to include (0-1, default: 0)"),
787
+ config: configSchema,
788
+ }, async ({ code, language, judgeIds, context, includeAstFindings, minConfidence, config }) => {
789
+ try {
790
+ const sizeError = validateCodeSize(code);
791
+ if (sizeError) {
792
+ return { content: [{ type: "text", text: `Error: ${sizeError}` }], isError: true };
793
+ }
794
+ const cfgObj = toJudgesConfig(config);
795
+ // Build a config that disables all judges EXCEPT the focused ones
796
+ const allJudgeIds = JUDGES.map((j) => j.id);
797
+ const focusedSet = new Set(judgeIds);
798
+ const disabledJudges = allJudgeIds.filter((id) => !focusedSet.has(id));
799
+ const mergedConfig = cfgObj
800
+ ? { ...cfgObj, disabledJudges: [...(cfgObj.disabledJudges ?? []), ...disabledJudges] }
801
+ : { disabledJudges };
802
+ const verdict = evaluateWithTribunal(code, language, context, {
803
+ includeAstFindings,
804
+ minConfidence,
805
+ config: mergedConfig,
806
+ });
807
+ let md = `# Focused Evaluation (${judgeIds.length} judges)\n\n`;
808
+ md += `**Judges:** ${judgeIds.join(", ")}\n`;
809
+ md += `**Score:** ${verdict.overallScore}/100 | **Verdict:** ${verdict.overallVerdict.toUpperCase()}\n`;
810
+ md += `**Findings:** ${verdict.findings.length}\n\n`;
811
+ md += formatVerdictAsMarkdown(verdict);
812
+ const structuredData = {
813
+ focusedJudges: judgeIds,
814
+ score: verdict.overallScore,
815
+ verdict: verdict.overallVerdict,
816
+ findingCount: verdict.findings.length,
817
+ findings: verdict.findings.map((f) => ({
818
+ ruleId: f.ruleId,
819
+ severity: f.severity,
820
+ title: f.title,
821
+ lineNumbers: f.lineNumbers,
822
+ confidence: f.confidence,
823
+ })),
824
+ };
825
+ return {
826
+ content: [
827
+ { type: "text", text: md },
828
+ { type: "text", text: "```json\n" + JSON.stringify(structuredData, null, 2) + "\n```" },
829
+ ],
830
+ };
831
+ }
832
+ catch (error) {
833
+ return {
834
+ content: [
835
+ {
836
+ type: "text",
837
+ text: error instanceof Error ? `Error: ${error.message}` : "Error: Focused evaluation failed",
838
+ },
839
+ ],
840
+ isError: true,
841
+ };
842
+ }
843
+ });
844
+ }
845
+ // ─── session_status ──────────────────────────────────────────────────────────
846
+ function registerSessionStatus(server) {
847
+ server.tool("session_status", "Get the current evaluation session status — how many evaluations have been run, detected frameworks, verdict history per file, and stability indicators. Useful for understanding what the tribunal has already reviewed.", {}, async () => {
848
+ const session = getGlobalSession();
849
+ const ctx = session.getContext();
850
+ const filesEvaluated = [...ctx.verdictHistory.entries()].map(([file, history]) => ({
851
+ file,
852
+ evaluations: history.length,
853
+ latestScore: history[history.length - 1]?.score ?? 0,
854
+ stable: session.isVerdictStable(file),
855
+ }));
856
+ let md = `# Evaluation Session Status\n\n`;
857
+ md += `**Evaluations:** ${ctx.evaluationCount}\n`;
858
+ md += `**Started:** ${ctx.startedAt}\n`;
859
+ md += `**Detected Frameworks:** ${ctx.frameworks.length > 0 ? ctx.frameworks.join(", ") : "None yet"}\n`;
860
+ md += `**Capabilities:** ${ctx.capabilities.size > 0 ? [...ctx.capabilities].join(", ") : "None yet"}\n\n`;
861
+ if (filesEvaluated.length > 0) {
862
+ md += `## Files Evaluated\n\n`;
863
+ md += `| File | Evals | Latest Score | Stable |\n`;
864
+ md += `|------|-------|--------------|--------|\n`;
865
+ for (const f of filesEvaluated) {
866
+ md += `| ${f.file} | ${f.evaluations} | ${f.latestScore}/100 | ${f.stable ? "Yes" : "No"} |\n`;
867
+ }
868
+ }
869
+ const feedbackTally = [...session.getFeedbackTally().entries()];
870
+ if (feedbackTally.length > 0) {
871
+ md += `\n## Feedback Tally\n\n`;
872
+ md += `| Rule | TP | FP | Won't Fix |\n`;
873
+ md += `|------|----|----|----------|\n`;
874
+ for (const [rule, counts] of feedbackTally) {
875
+ md += `| ${rule} | ${counts.tp} | ${counts.fp} | ${counts.wontfix} |\n`;
876
+ }
877
+ }
544
878
  return {
545
- content: [{ type: "text", text: summary }],
879
+ content: [
880
+ { type: "text", text: md },
881
+ {
882
+ type: "text",
883
+ text: "```json\n" +
884
+ JSON.stringify({
885
+ evaluationCount: ctx.evaluationCount,
886
+ startedAt: ctx.startedAt,
887
+ frameworks: ctx.frameworks,
888
+ capabilities: [...ctx.capabilities],
889
+ filesEvaluated,
890
+ feedbackTally: Object.fromEntries(feedbackTally),
891
+ }, null, 2) +
892
+ "\n```",
893
+ },
894
+ ],
895
+ };
896
+ });
897
+ }
898
+ // ─── record_feedback ─────────────────────────────────────────────────────────
899
+ function registerRecordFeedback(server) {
900
+ server.tool("record_feedback", "Record user feedback on a finding — mark it as a true positive (tp), false positive (fp), or won't fix (wontfix). This feedback calibrates confidence scores in subsequent evaluations during the current session, reducing noise from rules the user considers inaccurate.", {
901
+ ruleId: z.string().describe("The rule ID of the finding (e.g., 'SEC-001', 'AUTH-003')."),
902
+ verdict: z
903
+ .enum(["tp", "fp", "wontfix"])
904
+ .describe("The feedback verdict: tp (true positive), fp (false positive), wontfix (acknowledged but won't fix)."),
905
+ }, async ({ ruleId, verdict }) => {
906
+ const session = getGlobalSession();
907
+ session.recordFeedback(ruleId, verdict);
908
+ const penalty = session.getConfidencePenalty(ruleId);
909
+ const penaltyPct = Math.round(penalty * 100);
910
+ return {
911
+ content: [
912
+ {
913
+ type: "text",
914
+ text: `Feedback recorded: **${ruleId}** → **${verdict}**\n\n` +
915
+ `Current confidence multiplier for ${ruleId}: **${penaltyPct}%**\n` +
916
+ (verdict === "fp" ? `Future findings for this rule will have reduced confidence in this session.` : ``),
917
+ },
918
+ ],
546
919
  };
547
920
  });
548
921
  }
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Validate that code input is within acceptable size limits.
3
+ * Returns an error message string if validation fails, or `undefined` if valid.
4
+ */
5
+ export declare function validateCodeSize(code: string, maxBytes?: number): string | undefined;
6
+ /** Recognized programming languages for validation warnings. */
7
+ export declare const KNOWN_LANGUAGES: Set<string>;
8
+ /**
9
+ * Normalize a language string by lowercasing it and trimming surrounding whitespace.
10
+ * Always returns a string; the value is not checked against KNOWN_LANGUAGES and `undefined` is never returned.
11
+ * This is advisory only — unrecognized languages are still accepted.
12
+ */
13
+ export declare function normalizeLanguage(lang: string): string;
@@ -0,0 +1,77 @@
1
+ // ─── Input Validation Helpers ────────────────────────────────────────────────
2
+ // Shared validation for MCP tool inputs at system boundaries.
3
+ // ──────────────────────────────────────────────────────────────────────────────
4
+ /** Maximum code input size (1 MB). Prevents excessive memory/CPU usage. */
5
+ const MAX_CODE_BYTES = 1_048_576;
6
+ /**
7
+ * Validate that code input is within acceptable size limits.
8
+ * Returns an error message string if validation fails, or `undefined` if valid.
9
+ */
10
+ export function validateCodeSize(code, maxBytes = MAX_CODE_BYTES) {
11
+ if (code.length === 0) {
12
+ return "Code input is empty.";
13
+ }
14
+ const byteLength = Buffer.byteLength(code, "utf-8");
15
+ if (byteLength > maxBytes) {
16
+ return `Code input too large (${(byteLength / 1024).toFixed(0)} KB). Maximum allowed: ${(maxBytes / 1024).toFixed(0)} KB.`;
17
+ }
18
+ return undefined;
19
+ }
20
+ /** Recognized programming languages for validation warnings. */
21
+ export const KNOWN_LANGUAGES = new Set([
22
+ "typescript",
23
+ "javascript",
24
+ "python",
25
+ "java",
26
+ "csharp",
27
+ "c",
28
+ "cpp",
29
+ "go",
30
+ "rust",
31
+ "ruby",
32
+ "php",
33
+ "swift",
34
+ "kotlin",
35
+ "scala",
36
+ "r",
37
+ "powershell",
38
+ "bash",
39
+ "shell",
40
+ "sql",
41
+ "html",
42
+ "css",
43
+ "scss",
44
+ "bicep",
45
+ "terraform",
46
+ "hcl",
47
+ "yaml",
48
+ "yml",
49
+ "json",
50
+ "xml",
51
+ "toml",
52
+ "dockerfile",
53
+ "makefile",
54
+ "markdown",
55
+ "plaintext",
56
+ "objective-c",
57
+ "dart",
58
+ "lua",
59
+ "perl",
60
+ "elixir",
61
+ "erlang",
62
+ "haskell",
63
+ "fsharp",
64
+ "vb",
65
+ "assembly",
66
+ "zig",
67
+ "nim",
68
+ "cloudformation",
69
+ ]);
70
+ /**
71
+ * Normalize a language string by lowercasing it and trimming surrounding whitespace.
72
+ * Always returns a string; the value is not checked against KNOWN_LANGUAGES and `undefined` is never returned.
73
+ * This is advisory only — unrecognized languages are still accepted.
74
+ */
75
+ export function normalizeLanguage(lang) {
76
+ return lang.toLowerCase().trim();
77
+ }