@kevinrabun/judges 3.119.0 → 3.122.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/api.d.ts +2 -1
- package/dist/api.js +3 -1
- package/dist/cli-dispatch.d.ts +7 -0
- package/dist/cli-dispatch.js +654 -0
- package/dist/cli-formatters.d.ts +6 -0
- package/dist/cli-formatters.js +186 -0
- package/dist/cli.js +69 -4159
- package/dist/commands/baseline.js +2 -42
- package/dist/commands/coverage.js +3 -39
- package/dist/commands/diff.js +2 -38
- package/dist/commands/fix-pr.js +2 -23
- package/dist/commands/fix.js +3 -27
- package/dist/commands/llm-benchmark.d.ts +7 -0
- package/dist/commands/llm-benchmark.js +27 -1
- package/dist/commands/quality-gate.js +1 -12
- package/dist/commands/review-parallel.js +1 -19
- package/dist/commands/review.js +2 -33
- package/dist/commands/rule-test.js +1 -15
- package/dist/commands/tune.js +2 -29
- package/dist/commands/watch.js +3 -42
- package/dist/config.js +1 -1
- package/dist/evaluators/hallucination-detection.js +343 -0
- package/dist/evaluators/index.d.ts +2 -11
- package/dist/evaluators/index.js +3 -181
- package/dist/evaluators/security.js +226 -2
- package/dist/evaluators/suppressions.d.ts +49 -0
- package/dist/evaluators/suppressions.js +185 -0
- package/dist/ext-to-lang.d.ts +16 -0
- package/dist/ext-to-lang.js +60 -0
- package/dist/github-app.d.ts +1 -3
- package/dist/github-app.js +2 -34
- package/dist/parallel.js +2 -14
- package/dist/probabilistic/llm-response-validator.js +1 -1
- package/dist/reports/public-repo-report.js +9 -1
- package/dist/skill-loader.js +9 -6
- package/dist/tools/register-evaluation.js +2 -29
- package/package.json +1 -1
- package/server.json +2 -2
- package/src/skill-loader.ts +9 -6
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { TribunalVerdict, JudgeEvaluation } from "./types.js";
|
|
2
|
+
/**
 * Identifier of an output format that can be requested from the formatters.
 *
 * In the accompanying implementation of `formatTribunalOutput`, only
 * "json", "sarif", "markdown" and "github-actions" have a dedicated branch;
 * every other value (including "html") yields the plain-text report.
 */
export type OutputFormat = "text" | "json" | "sarif" | "markdown" | "html" | "pdf" | "junit" | "codeclimate" | "github-actions";
|
|
3
|
+
/**
 * Serialize a tribunal verdict into a string in the requested format.
 *
 * @param verdict - The verdict to render.
 * @param format - The desired output format.
 * @param filePath - Optional path; the implementation forwards it only to the
 *   SARIF and GitHub Actions formatters.
 * @returns The rendered output.
 */
export declare function formatTribunalOutput(verdict: TribunalVerdict, format: OutputFormat, filePath?: string): string;
|
|
4
|
+
/**
 * Write `contents` to `outputPath` as UTF-8, or do nothing when no path is given.
 *
 * The implementation first attempts to create the parent directory
 * (recursively) and ignores any error from that attempt; the write itself is
 * not guarded.
 *
 * @param outputPath - Destination file; a falsy value makes the call a no-op.
 * @param contents - Text to write.
 */
export declare function writeOutputIfSpecified(outputPath: string | undefined, contents: string): void;
|
|
5
|
+
/**
 * Render a tribunal verdict as a multi-line plain-text report.
 *
 * The implementation emits a summary header, a per-judge breakdown, optional
 * timing and suppression sections, up to 20 critical/high findings, and a
 * closing pass/warning/fail line.
 *
 * @param verdict - The verdict to render.
 * @returns The report, with lines joined by "\n".
 */
export declare function formatTextOutput(verdict: TribunalVerdict): string;
|
|
6
|
+
/**
 * Render one judge's evaluation as a multi-line plain-text report.
 *
 * The implementation lists every finding of the evaluation (no cap), each with
 * its severity, rule id, title and any optional details that are present.
 *
 * @param evaluation - The single-judge evaluation to render.
 * @returns The report, with lines joined by "\n".
 */
export declare function formatSingleJudgeTextOutput(evaluation: JudgeEvaluation): string;
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import { mkdirSync, writeFileSync } from "fs";
|
|
2
|
+
import { dirname } from "path";
|
|
3
|
+
import { formatVerdictAsMarkdown } from "./evaluators/index.js";
|
|
4
|
+
import { verdictToSarif } from "./formatters/sarif.js";
|
|
5
|
+
import { verdictToGitHubActions } from "./formatters/github-actions.js";
|
|
6
|
+
/**
 * Serialize a tribunal verdict in the requested output format.
 *
 * "json", "sarif", "markdown" and "github-actions" each have a dedicated
 * renderer. Every other value, including "text" and "html", produces the
 * plain-text report.
 *
 * @param verdict  The verdict to render.
 * @param format   Requested output format identifier.
 * @param filePath Optional path, forwarded to the SARIF and GitHub Actions formatters.
 * @returns The rendered output as a string.
 */
export function formatTribunalOutput(verdict, format, filePath) {
    if (format === "json") {
        return JSON.stringify(verdict, null, 2);
    }
    if (format === "sarif") {
        return JSON.stringify(verdictToSarif(verdict, filePath), null, 2);
    }
    if (format === "markdown") {
        return formatVerdictAsMarkdown(verdict);
    }
    if (format === "github-actions") {
        return verdictToGitHubActions(verdict, filePath);
    }
    // "text" and any unrecognised format end up here. So does "html":
    // HTML is handled separately in runCli (needs async import).
    return formatTextOutput(verdict);
}
|
|
26
|
+
/**
 * Write `contents` to `outputPath` as UTF-8; a falsy path makes this a no-op.
 *
 * @param outputPath Destination file, or a falsy value to skip writing.
 * @param contents   Text to write.
 */
export function writeOutputIfSpecified(outputPath, contents) {
    if (outputPath) {
        const parentDir = dirname(outputPath);
        try {
            mkdirSync(parentDir, { recursive: true });
        }
        catch {
            // Best effort only: a failure to create the parent directory is
            // ignored here, and the unguarded write below is still attempted.
        }
        writeFileSync(outputPath, contents, "utf-8");
    }
}
|
|
38
|
+
/**
 * Render a tribunal verdict as the plain-text report.
 *
 * Sections, in order: boxed title, summary counters, per-judge breakdown,
 * optional timing summary (five slowest judges), optional suppression
 * metrics (five most-suppressed rules), up to 20 critical/high findings with
 * their optional details, and a closing pass/warning/fail line.
 *
 * @param verdict The tribunal verdict to render.
 * @returns The report, with lines joined by "\n".
 */
export function formatTextOutput(verdict) {
    // Flatten once; the totals and the critical/high list all derive from this.
    const everyFinding = verdict.evaluations.flatMap((evaluation) => evaluation.findings);
    const findingTotal = everyFinding.length;
    const autoFixable = everyFinding.filter((finding) => finding.patch).length;
    const fixableNote = autoFixable > 0 ? ` (${autoFixable} auto-fixable)` : "";
    const divider = " " + "─".repeat(60);
    const out = [
        "╔══════════════════════════════════════════════════════════════╗",
        "║ Judges Panel — Evaluation Result ║",
        "╚══════════════════════════════════════════════════════════════╝",
        "",
        ` Verdict : ${verdict.overallVerdict.toUpperCase()}`,
        ` Score : ${verdict.overallScore}/100`,
        ` Critical : ${verdict.criticalCount}`,
        ` High : ${verdict.highCount}`,
        ` Findings : ${findingTotal}${fixableNote}`,
        ` Judges : ${verdict.evaluations.length}`,
        "",
        " Per-Judge Breakdown:",
        divider,
    ];
    // One row per judge: status icon, padded name, score, finding count, optional duration.
    for (const judge of verdict.evaluations) {
        let icon;
        if (judge.verdict === "pass") {
            icon = "✅";
        }
        else if (judge.verdict === "warning") {
            icon = "⚠️ ";
        }
        else {
            icon = "❌";
        }
        const paddedName = judge.judgeName.padEnd(28);
        const paddedScore = String(judge.score).padStart(3);
        const paddedCount = String(judge.findings.length).padStart(2);
        const elapsed = judge.durationMs === undefined ? "" : ` ${judge.durationMs}ms`;
        out.push(` ${icon} ${paddedName} ${paddedScore}/100 ${paddedCount} finding(s)${elapsed}`);
    }
    out.push("");
    // Timing summary: total plus the five slowest judges, sorted on a copy.
    const timing = verdict.timing;
    if (timing) {
        out.push(` Total evaluation time: ${timing.totalMs}ms`);
        const slowestFive = Array.from(timing.perJudge)
            .sort((x, y) => y.durationMs - x.durationMs)
            .slice(0, 5);
        if (slowestFive.length > 0) {
            out.push(" Slowest judges:");
            slowestFive.forEach((entry) => {
                out.push(` ${entry.judgeName.padEnd(28)} ${entry.durationMs}ms`);
            });
        }
        out.push("");
    }
    // Suppression metrics: counts per suppression kind and the most-suppressed rules.
    const suppressed = verdict.suppressions;
    if (suppressed && suppressed.length > 0) {
        // Pre-seeded keys fix the display order of the known kinds.
        const kindCounts = { line: 0, "next-line": 0, block: 0, file: 0 };
        const ruleCounts = new Map();
        suppressed.forEach((entry) => {
            kindCounts[entry.kind] = (kindCounts[entry.kind] || 0) + 1;
            ruleCounts.set(entry.ruleId, (ruleCounts.get(entry.ruleId) ?? 0) + 1);
        });
        out.push(` Suppressed Findings: ${suppressed.length}`);
        const kindSummary = [];
        for (const [kind, count] of Object.entries(kindCounts)) {
            if (count > 0) {
                kindSummary.push(`${kind}: ${count}`);
            }
        }
        out.push(` By type: ${kindSummary.join(", ")}`);
        const leadingRules = Array.from(ruleCounts.entries())
            .sort((x, y) => y[1] - x[1])
            .slice(0, 5);
        if (leadingRules.length > 0) {
            const ruleSummary = leadingRules.map(([rule, count]) => `${rule} (${count})`).join(", ");
            out.push(` Top suppressed rules: ${ruleSummary}`);
        }
        out.push("");
    }
    // Critical and high findings, capped at 20 detailed entries.
    const severe = everyFinding.filter((finding) => ["critical", "high"].includes(finding.severity));
    if (severe.length > 0) {
        out.push(" Critical & High Findings:", divider);
        severe.slice(0, 20).forEach((finding) => {
            const wrench = finding.patch ? " 🔧" : "";
            const confidence = finding.confidence === undefined
                ? ""
                : ` (${Math.round(finding.confidence * 100)}% confidence)`;
            out.push(` [${finding.severity.toUpperCase().padEnd(8)}] ${finding.ruleId}: ${finding.title}${wrench}${confidence}`);
            if (finding.lineNumbers && finding.lineNumbers.length > 0) {
                out.push(` Line ${finding.lineNumbers[0]}: ${finding.description.slice(0, 100)}`);
            }
            if (finding.provenance) {
                out.push(` Evidence: ${finding.provenance}`);
            }
            if (finding.evidenceBasis) {
                out.push(` Basis: ${finding.evidenceBasis}`);
            }
            const chain = finding.evidenceChain;
            if (chain && chain.steps.length > 0) {
                out.push(` Impact: ${chain.impactStatement}`);
                // Only the first three evidence steps are shown.
                for (const step of chain.steps.slice(0, 3)) {
                    const where = step.line ? ` (L${step.line})` : "";
                    out.push(` → [${step.source}]${where} ${step.observation}`);
                }
            }
            if (finding.cweIds && finding.cweIds.length > 0) {
                out.push(` CWE: ${finding.cweIds.join(", ")}`);
            }
            if (finding.owaspLlmTop10) {
                out.push(` OWASP LLM: ${finding.owaspLlmTop10}`);
            }
            if (finding.learnMoreUrl) {
                out.push(` 📖 Learn more: ${finding.learnMoreUrl}`);
            }
        });
        const hidden = severe.length - 20;
        if (hidden > 0) {
            out.push(` ... and ${hidden} more critical/high findings`);
        }
        out.push("");
    }
    // Exit guidance: anything other than "fail" or "warning" is reported as a pass.
    switch (verdict.overallVerdict) {
        case "fail":
            out.push(" ⛔ FAIL — This code has issues that should be addressed before shipping.");
            break;
        case "warning":
            out.push(" ⚠️ WARNING — Review findings above before proceeding.");
            break;
        default:
            out.push(" ✅ PASS — No critical issues detected.");
    }
    if (autoFixable > 0) {
        out.push(` 🔧 ${autoFixable} finding(s) can be auto-fixed. Run: judges eval <file> --fix`);
    }
    out.push("");
    return out.join("\n");
}
|
|
155
|
+
/**
 * Render one judge's evaluation as a plain-text report.
 *
 * Emits a boxed title with the judge name, the verdict/score/finding-count
 * summary, and then every finding with whichever optional details it carries.
 *
 * @param evaluation The single-judge evaluation to render.
 * @returns The report, with lines joined by "\n".
 */
export function formatSingleJudgeTextOutput(evaluation) {
    const out = [
        "╔══════════════════════════════════════════════════════════════╗",
        `║ Judge: ${evaluation.judgeName.padEnd(49)}║`,
        "╚══════════════════════════════════════════════════════════════╝",
        "",
        ` Verdict : ${evaluation.verdict.toUpperCase()}`,
        ` Score : ${evaluation.score}/100`,
        ` Findings : ${evaluation.findings.length}`,
        "",
    ];
    evaluation.findings.forEach((finding) => {
        const confidence = finding.confidence === undefined
            ? ""
            : ` (${Math.round(finding.confidence * 100)}%)`;
        out.push(` [${finding.severity.toUpperCase().padEnd(8)}] ${finding.ruleId}: ${finding.title}${confidence}`);
        // Only the first reported line number is shown; the description is cut to 120 characters.
        if (finding.lineNumbers && finding.lineNumbers.length > 0) {
            out.push(` Line ${finding.lineNumbers[0]}: ${finding.description.slice(0, 120)}`);
        }
        if (finding.provenance) {
            out.push(` Evidence: ${finding.provenance}`);
        }
        const chain = finding.evidenceChain;
        if (chain && chain.steps.length > 0) {
            out.push(` Impact: ${chain.impactStatement}`);
        }
        if (finding.suggestedFix) {
            out.push(` Fix: ${finding.suggestedFix.slice(0, 120)}`);
        }
        if (finding.learnMoreUrl) {
            out.push(` 📖 ${finding.learnMoreUrl}`);
        }
    });
    out.push("");
    return out.join("\n");
}
|