@kevinrabun/judges 3.117.8 → 3.119.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,7 @@ import { evaluateProject, evaluateDiff, analyzeDependencies, runAppBuilderWorkfl
8
8
  import { evaluateFilesBatch } from "../api.js";
9
9
  import { getGlobalSession } from "../evaluation-session.js";
10
10
  import { generatePublicRepoReport } from "../reports/public-repo-report.js";
11
+ import { evaluateGitDiff, evaluateUnifiedDiff } from "../git-diff.js";
11
12
  import { configSchema, toJudgesConfig } from "./schemas.js";
12
13
  import { validateCodeSize } from "./validation.js";
13
14
  import { benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, runBenchmarkSuite, } from "../commands/benchmark.js";
@@ -20,6 +21,7 @@ export function registerWorkflowTools(server) {
20
21
  registerAppBuilderFlow(server);
21
22
  registerEvaluateProject(server);
22
23
  registerEvaluateDiff(server);
24
+ registerEvaluateGitDiff(server);
23
25
  registerAnalyzeDependencies(server);
24
26
  registerBenchmarkGate(server);
25
27
  registerBenchmarkDashboard(server);
@@ -919,3 +921,117 @@ function registerRecordFeedback(server) {
919
921
  };
920
922
  });
921
923
  }
924
+ // ─── evaluate_git_diff ───────────────────────────────────────────────────────
925
/**
 * Registers the `evaluate_git_diff` tool on the given server.
 *
 * The tool accepts either a pre-computed unified diff (`diffText`) or a
 * repository path (`repoPath`) plus an optional `base` ref. When both
 * `diffText` and `repoPath` are supplied, `diffText` wins and `repoPath` is
 * forwarded as the second argument to `evaluateUnifiedDiff`. When neither is
 * supplied, an error response is returned.
 *
 * On success the response carries two text items: a markdown report and a
 * fenced JSON summary of the same data. Any exception thrown while handling
 * the request is converted into an `isError` response.
 *
 * @param server - Server object exposing a `tool(name, description, schema, handler)` method.
 */
function registerEvaluateGitDiff(server) {
    const toolDescription = "Evaluate code changes from a git diff. Parses the unified diff from a git repository, identifies changed files and lines, and runs the full tribunal on each changed file — filtering findings to only those on changed lines. Supports both live git repos (provide repoPath + base ref) and pre-computed diffs (provide diffText).";

    // Input schema; every field is optional at the schema level, the handler
    // enforces that at least one of repoPath / diffText is present.
    const inputSchema = {
        repoPath: z
            .string()
            .optional()
            .describe("Absolute path to the git repository. Required when not providing diffText."),
        base: z
            .string()
            .optional()
            .describe("Git ref to diff against (e.g., 'main', 'HEAD~1', 'origin/main'). Default: 'HEAD~1'"),
        diffText: z
            .string()
            .optional()
            .describe("Pre-computed unified diff text. When provided, repoPath is used only for reading file contents."),
        confidenceFilter: z
            .number()
            .min(0)
            .max(1)
            .optional()
            .describe("Minimum confidence threshold for findings (default: no filter)"),
        autoTune: z
            .boolean()
            .optional()
            .describe("Apply feedback-driven auto-tuning to reduce false positives (default: false)"),
        maxPromptChars: z
            .number()
            .min(0)
            .optional()
            .describe("Maximum character budget for LLM prompts. Controls truncation of deep-review prompts. Set to 0 to disable all truncation. Default: 100000."),
        config: configSchema,
    };

    // Builds the single-text-item error response shape used by both failure paths.
    const errorResponse = (text) => ({
        content: [
            {
                type: "text",
                text,
            },
        ],
        isError: true,
    });

    server.tool("evaluate_git_diff", toolDescription, inputSchema, async ({ repoPath, base, diffText, confidenceFilter, autoTune, maxPromptChars, config }) => {
        try {
            // Options are built before the input check, so a failure in
            // toJudgesConfig is reported even when no diff source was given.
            const evalOptions = {
                confidenceFilter,
                autoTune,
                maxPromptChars,
                config: toJudgesConfig(config),
            };

            if (!diffText && !repoPath) {
                return errorResponse("Error: Provide either `repoPath` (for live git diff) or `diffText` (for pre-computed diff).");
            }

            // A non-empty diffText takes precedence over a live git diff.
            const result = diffText
                ? evaluateUnifiedDiff(diffText, repoPath ?? ".", evalOptions)
                : evaluateGitDiff(repoPath, base ?? "HEAD~1", evalOptions);

            // Markdown report, assembled as fragments and joined once at the end.
            const parts = [
                `# Git Diff Analysis\n\n`,
                `**Files changed:** ${result.files.length}\n`,
                `**Total findings:** ${result.totalFindings}\n\n`,
            ];
            for (const { filePath, verdict } of result.files) {
                parts.push(`## ${filePath}\n`);
                parts.push(`**Verdict:** ${verdict.verdict} · **Score:** ${verdict.score}/100 · `);
                parts.push(`**Changed lines:** ${verdict.linesAnalyzed} · **Findings:** ${verdict.findings.length}\n\n`);
                if (!(verdict.findings.length > 0)) {
                    continue;
                }
                for (const finding of verdict.findings) {
                    // Confidence is shown as a whole percentage only when present.
                    const confidenceTag = finding.confidence == null
                        ? ""
                        : ` (${Math.round(finding.confidence * 100)}%)`;
                    let bullet = `- **${finding.ruleId}** ${finding.severity}${confidenceTag}: ${finding.title}`;
                    if (finding.lineNumbers && finding.lineNumbers.length > 0) {
                        bullet += ` (L${finding.lineNumbers.join(", L")})`;
                    }
                    parts.push(bullet + `\n`);
                }
                // Blank line after each non-empty findings list.
                parts.push(`\n`);
            }
            const markdown = parts.join("");

            // Machine-readable summary of the same data as the markdown report.
            const summary = {
                filesAnalyzed: result.files.length,
                totalFindings: result.totalFindings,
                fileVerdicts: result.files.map((entry) => {
                    const { verdict } = entry;
                    return {
                        filePath: entry.filePath,
                        verdict: verdict.verdict,
                        score: verdict.score,
                        changedLineCount: verdict.linesAnalyzed,
                        findingCount: verdict.findings.length,
                        findings: verdict.findings.map((finding) => ({
                            ruleId: finding.ruleId,
                            severity: finding.severity,
                            confidence: finding.confidence,
                            title: finding.title,
                            lineNumbers: finding.lineNumbers,
                        })),
                    };
                }),
            };

            return {
                content: [
                    { type: "text", text: markdown },
                    { type: "text", text: "```json\n" + JSON.stringify(summary, null, 2) + "\n```" },
                ],
            };
        }
        catch (err) {
            // Non-Error throwables get a generic message.
            return errorResponse(err instanceof Error ? `Error: ${err.message}` : "Error: Failed to evaluate git diff");
        }
    });
}
package/dist/types.d.ts CHANGED
@@ -670,6 +670,28 @@ export interface TribunalVerdict {
670
670
  /** Recommended action */
671
671
  recommendation: string;
672
672
  };
673
+ /**
674
+ * LLM deep-review prompt section. Present when `deepReview: true` is set
675
+ * in evaluation options. Contains a structured prompt that downstream LLM
676
+ * consumers can use for a second-pass analysis of the findings.
677
+ */
678
+ deepReviewPrompt?: string;
679
+ /**
680
+ * Auto-tune metadata. Present when `autoTune: true` is set and feedback
681
+ * data was applied. Records how many findings were suppressed or downgraded.
682
+ */
683
+ autoTuneApplied?: {
684
+ suppressed: number;
685
+ downgraded: number;
686
+ };
687
+ /**
688
+ * Confidence filter metadata. Present when `confidenceFilter` is set.
689
+ * Records how many findings were filtered out due to low confidence.
690
+ */
691
+ confidenceFilterApplied?: {
692
+ threshold: number;
693
+ filteredOut: number;
694
+ };
673
695
  }
674
696
  /**
675
697
  * Must-fix gate configuration for high-risk findings.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@kevinrabun/judges",
3
- "version": "3.117.8",
3
+ "version": "3.119.0",
4
4
  "description": "45 specialized judges that evaluate AI-generated code for security, cost, and quality.",
5
5
  "mcpName": "io.github.KevinRabun/judges",
6
6
  "type": "module",
package/server.json CHANGED
@@ -7,12 +7,12 @@
7
7
  "url": "https://github.com/kevinrabun/judges",
8
8
  "source": "github"
9
9
  },
10
- "version": "3.117.8",
10
+ "version": "3.119.0",
11
11
  "packages": [
12
12
  {
13
13
  "registryType": "npm",
14
14
  "identifier": "@kevinrabun/judges",
15
- "version": "3.117.8",
15
+ "version": "3.119.0",
16
16
  "transport": {
17
17
  "type": "stdio"
18
18
  }