@kevinrabun/judges-cli 3.128.2 → 3.129.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +1 -0
- package/dist/api.js +2 -0
- package/dist/cli-dispatch.js +2 -0
- package/dist/cli.js +2 -0
- package/dist/commands/external-benchmarks.d.ts +118 -0
- package/dist/commands/external-benchmarks.js +296 -0
- package/dist/commands/martian-code-review-benchmark.d.ts +61 -0
- package/dist/commands/martian-code-review-benchmark.js +516 -0
- package/dist/commands/openssf-cve-benchmark.d.ts +96 -0
- package/dist/commands/openssf-cve-benchmark.js +659 -0
- package/package.json +1 -1
package/dist/api.d.ts
CHANGED
|
@@ -74,6 +74,7 @@ export type { BenchmarkCase, BenchmarkResult, BenchmarkGateOptions, BenchmarkGat
|
|
|
74
74
|
export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
|
|
75
75
|
export type { LlmBenchmarkSnapshot, LlmCaseResult } from "./commands/llm-benchmark.js";
|
|
76
76
|
export type { LlmFinding, ValidationResult } from "./probabilistic/llm-response-validator.js";
|
|
77
|
+
export { convertAllToBenchmarkCases as convertMartianToBenchmarkCases } from "./commands/martian-code-review-benchmark.js";
|
|
77
78
|
export type { PromptAmendment, OptimizerInsight, OptimizationResult, AmendmentStore, } from "./commands/llm-benchmark-optimizer.js";
|
|
78
79
|
export { optimizeBenchmark, mergeAmendments, createEmptyStore, formatAmendmentSection, } from "./commands/llm-benchmark-optimizer.js";
|
|
79
80
|
export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
|
package/dist/api.js
CHANGED
|
@@ -83,6 +83,8 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
|
|
|
83
83
|
export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
|
|
84
84
|
// ─── LLM Benchmark ──────────────────────────────────────────────────────────
|
|
85
85
|
export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
|
|
86
|
+
// ─── External Benchmarks ────────────────────────────────────────────────────
|
|
87
|
+
export { convertAllToBenchmarkCases as convertMartianToBenchmarkCases } from "./commands/martian-code-review-benchmark.js";
|
|
86
88
|
export { optimizeBenchmark, mergeAmendments, createEmptyStore, formatAmendmentSection, } from "./commands/llm-benchmark-optimizer.js";
|
|
87
89
|
// Review autopilot (GitHub App / scripts)
|
|
88
90
|
export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
|
package/dist/cli-dispatch.js
CHANGED
|
@@ -90,6 +90,7 @@ export const COMMAND_TABLE = {
|
|
|
90
90
|
"event-leak": ["./commands/event-leak.js", "runEventLeak"],
|
|
91
91
|
"evidence-chain": ["./commands/evidence-chain.js", "runEvidenceChain"],
|
|
92
92
|
"example-leak": ["./commands/example-leak.js", "runExampleLeak"],
|
|
93
|
+
"external-benchmark": ["./commands/external-benchmarks.js", "runExternalBenchmark"],
|
|
93
94
|
"exception-consistency": ["./commands/exception-consistency.js", "runExceptionConsistency"],
|
|
94
95
|
"exec-report": ["./commands/exec-report.js", "runExecReport"],
|
|
95
96
|
"explain-finding": ["./commands/explain-finding.js", "runExplainFinding"],
|
|
@@ -311,6 +312,7 @@ export const COMMAND_TABLE = {
|
|
|
311
312
|
"null-safety-audit": ["./commands/null-safety-audit.js", "runNullSafetyAudit"],
|
|
312
313
|
"observability-gap": ["./commands/observability-gap.js", "runObservabilityGap"],
|
|
313
314
|
onboard: ["./commands/onboard.js", "runOnboard"],
|
|
315
|
+
"openssf-cve": ["./commands/openssf-cve-benchmark.js", "runOpenSSFCveBenchmark"],
|
|
314
316
|
"org-metrics": ["./commands/org-metrics.js", "runOrgMetrics"],
|
|
315
317
|
"org-policy": ["./commands/org-policy.js", "runOrgPolicy"],
|
|
316
318
|
"over-abstraction": ["./commands/over-abstraction.js", "runOverAbstraction"],
|
package/dist/cli.js
CHANGED
|
@@ -251,6 +251,8 @@ function printHelp() {
|
|
|
251
251
|
["judges feedback", "Track finding feedback (false positives)"],
|
|
252
252
|
["judges override", "Manage per-path rule overrides"],
|
|
253
253
|
["judges benchmark", "Run detection accuracy benchmarks"],
|
|
254
|
+
["judges openssf-cve", "Run OpenSSF CVE Benchmark (real-world CVEs)"],
|
|
255
|
+
["judges external-benchmark", "Run external benchmarks (OpenSSF, Martian, etc.)"],
|
|
254
256
|
["judges config", "Export/import shared team configs"],
|
|
255
257
|
["judges review", "Post inline review comments on a GitHub PR"],
|
|
256
258
|
["judges app serve", "Start GitHub App webhook server"],
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* External Benchmark Registry
|
|
3
|
+
*
|
|
4
|
+
* Provides a unified framework for running third-party benchmarks against
|
|
5
|
+
* Judges and producing comparable, per-suite scoring reports.
|
|
6
|
+
*
|
|
7
|
+
* Each benchmark registers as a named suite with its own adapter that knows
|
|
8
|
+
* how to load data, run evaluations, and produce a standardised result.
|
|
9
|
+
* Results are stored per-suite so they can be compared individually or
|
|
10
|
+
* aggregated into a composite scorecard.
|
|
11
|
+
*
|
|
12
|
+
* Usage:
|
|
13
|
+
* judges external-benchmark run # Run all registered suites
|
|
14
|
+
* judges external-benchmark run --suite openssf-cve # Run one suite
|
|
15
|
+
* judges external-benchmark list # List available suites
|
|
16
|
+
* judges external-benchmark report # Composite report from saved results
|
|
17
|
+
*/
|
|
18
|
+
/**
 * Standardised result format that every external benchmark adapter must produce.
 * This makes results comparable across benchmarks and enables composite reports.
 *
 * Results are serialised to JSON when saved, so every field should be
 * JSON-representable.
 */
export interface ExternalBenchmarkResult {
    /** Unique suite identifier (e.g. "openssf-cve", "martian-code-review"); also used as the prefix of the saved result file name. */
    suiteId: string;
    /** Human-readable name */
    suiteName: string;
    /** URL to the benchmark's public repo / site */
    suiteUrl: string;
    /** ISO-8601 timestamp of this run; `:` and `.` are replaced with `-` when it is embedded in the saved file name. */
    timestamp: string;
    /** Judges version used */
    judgesVersion: string;
    /** Total items evaluated (CVEs, PRs, test cases, etc.) */
    totalItems: number;
    /** Items successfully evaluated (excludes skipped/errored); used as this suite's weight in the composite report. */
    evaluatedItems: number;
    /** Items that could not be evaluated */
    skippedItems: number;
    /** Precision (0–1) */
    precision: number;
    /** Recall (0–1) */
    recall: number;
    /** F1 score (0–1) */
    f1Score: number;
    /** Detection / match rate (0–1) */
    detectionRate: number;
    /** True positives count */
    truePositives?: number;
    /** False positives count */
    falsePositives?: number;
    /** False negatives count */
    falseNegatives?: number;
    /**
     * Per-category breakdown (CWE, severity, language, etc.).
     * `rate` is a 0–1 fraction; the markdown report lists categories by `total`, largest first.
     */
    perCategory?: Record<string, {
        total: number;
        detected: number;
        rate: number;
    }>;
    /** Suite-specific raw data (varies per benchmark) */
    rawData?: unknown;
}
|
|
62
|
+
/**
 * Configuration for a benchmark run, handed to `ExternalBenchmarkAdapter.run`.
 */
export interface BenchmarkRunConfig {
    /** Path to the benchmark repo / data directory (the CLI passes an absolute, resolved path). */
    repoPath: string;
    /** Restrict to a single item (CVE ID, PR URL, etc.); populated from the CLI `--item` option. */
    singleItem?: string;
    /** Output format; populated from the CLI `--format` option. */
    format?: "text" | "json" | "markdown";
    /** Output file path; populated from the CLI `--output` / `-o` option. */
    outputPath?: string;
}
|
|
75
|
+
/**
 * Adapter interface that every external benchmark must implement.
 * Adapters become visible to the CLI by being passed to `registerBenchmarkAdapter`.
 */
export interface ExternalBenchmarkAdapter {
    /** Unique suite identifier; this is the key the adapter is registered and looked up under. */
    readonly suiteId: string;
    /** Human-readable name */
    readonly suiteName: string;
    /** URL to the benchmark's public repo / site */
    readonly suiteUrl: string;
    /** Default path to look for the benchmark data; used when the CLI is run without `--repo`. */
    readonly defaultRepoPath: string;
    /** Short description shown in `list` command */
    readonly description: string;
    /**
     * Validate that the benchmark repo/data exists at the given path.
     * Return an error message if not, or undefined if OK.
     * The CLI prints a returned message and skips the suite instead of running it.
     */
    validate(repoPath: string): string | undefined;
    /**
     * Run the benchmark and return a standardised result.
     * The CLI calls this synchronously and uses the return value directly.
     */
    run(config: BenchmarkRunConfig): ExternalBenchmarkResult;
}
|
|
99
|
+
/**
 * Add an adapter to the in-process registry, keyed by `adapter.suiteId`.
 * Registering another adapter with the same `suiteId` replaces the earlier one.
 */
export declare function registerBenchmarkAdapter(adapter: ExternalBenchmarkAdapter): void;
|
|
100
|
+
/** Look up a registered adapter by suite ID; returns `undefined` when no adapter is registered under that ID. */
export declare function getAdapter(suiteId: string): ExternalBenchmarkAdapter | undefined;
|
|
101
|
+
/** Return a new array containing every currently registered adapter. */
export declare function listAdapters(): ExternalBenchmarkAdapter[];
|
|
102
|
+
/**
 * Scorecard combining the results of several benchmark suites.
 */
export interface CompositeReport {
    /** ISO-8601 timestamp of when the composite was computed (not of the individual runs). */
    timestamp: string;
    /** The per-suite results the aggregate was computed from. */
    suites: ExternalBenchmarkResult[];
    /** Totals and averages across all suites. */
    aggregate: {
        /** Sum of `totalItems` over all suites. */
        totalItems: number;
        /** Sum of `evaluatedItems` over all suites. */
        evaluatedItems: number;
        /** Mean suite precision, weighted by each suite's `evaluatedItems` (0 when nothing was evaluated). */
        weightedPrecision: number;
        /** Mean suite recall, weighted by each suite's `evaluatedItems` (0 when nothing was evaluated). */
        weightedRecall: number;
        /** Harmonic mean of `weightedPrecision` and `weightedRecall` (0 when both are 0). */
        weightedF1: number;
    };
}
|
|
113
|
+
/**
 * Combine per-suite results into a composite report. Precision and recall are
 * averaged with each suite weighted by its `evaluatedItems`; F1 is derived from
 * those two weighted averages.
 */
export declare function computeCompositeReport(results: ExternalBenchmarkResult[]): CompositeReport;
|
|
114
|
+
/**
 * Render a composite report as a Markdown document: an aggregate table, a
 * per-suite summary table, and one detail section per suite.
 */
export declare function formatCompositeReport(report: CompositeReport): string;
|
|
115
|
+
/**
 * Write a suite result as JSON under `benchmarks/external` (resolved against the
 * current working directory): once to a timestamped file and once to
 * `<suiteId>-latest.json`. Returns the path of the timestamped file.
 */
export declare function saveResult(result: ExternalBenchmarkResult): string;
|
|
116
|
+
/**
 * Read `<suiteId>-latest.json` from `benchmarks/external` (resolved against the
 * current working directory). Returns `undefined` when that file does not exist.
 * The parsed JSON is returned as-is, without schema validation.
 */
export declare function loadLatestResult(suiteId: string): ExternalBenchmarkResult | undefined;
|
|
117
|
+
/**
 * Load the latest saved result for every currently registered adapter, skipping
 * suites that have no saved result. Only suites whose adapter is registered are
 * considered.
 */
export declare function loadAllLatestResults(): ExternalBenchmarkResult[];
|
|
118
|
+
/**
 * CLI entry point for `judges external-benchmark`.
 *
 * @param argv Full process-style argument vector; the subcommand (`run`, `list`,
 *   `report`) is read from `argv[3]` and options from the entries after it.
 *   May terminate the process via `process.exit`.
 */
export declare function runExternalBenchmark(argv: string[]): Promise<void>;
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* External Benchmark Registry
|
|
3
|
+
*
|
|
4
|
+
* Provides a unified framework for running third-party benchmarks against
|
|
5
|
+
* Judges and producing comparable, per-suite scoring reports.
|
|
6
|
+
*
|
|
7
|
+
* Each benchmark registers as a named suite with its own adapter that knows
|
|
8
|
+
* how to load data, run evaluations, and produce a standardised result.
|
|
9
|
+
* Results are stored per-suite so they can be compared individually or
|
|
10
|
+
* aggregated into a composite scorecard.
|
|
11
|
+
*
|
|
12
|
+
* Usage:
|
|
13
|
+
* judges external-benchmark run # Run all registered suites
|
|
14
|
+
* judges external-benchmark run --suite openssf-cve # Run one suite
|
|
15
|
+
* judges external-benchmark list # List available suites
|
|
16
|
+
* judges external-benchmark report # Composite report from saved results
|
|
17
|
+
*/
|
|
18
|
+
import { existsSync, readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
19
|
+
import { resolve, join } from "path";
|
|
20
|
+
// ─── Registry ───────────────────────────────────────────────────────────────
|
|
21
|
+
/** Registered adapters keyed by suite ID; iteration follows Map insertion order. */
const _adapters = new Map();
/**
 * Store an adapter in the registry under its own `suiteId`.
 * A later registration with the same ID replaces the earlier adapter.
 */
export function registerBenchmarkAdapter(adapter) {
    const key = adapter.suiteId;
    _adapters.set(key, adapter);
}
/** Fetch the adapter registered under `suiteId`, or `undefined` when there is none. */
export function getAdapter(suiteId) {
    const registered = _adapters.get(suiteId);
    return registered;
}
/** Produce a fresh array holding every registered adapter. */
export function listAdapters() {
    return Array.from(_adapters.values());
}
|
|
31
|
+
/**
 * Aggregate per-suite results into a composite scorecard.
 *
 * Precision and recall are averaged across suites, each suite weighted by its
 * `evaluatedItems`. The composite F1 is the harmonic mean of those two weighted
 * averages (not an average of per-suite F1 scores).
 *
 * Missing, NaN or infinite numeric inputs are treated as 0. Without this, a
 * suite that evaluated nothing and reports `NaN` metrics (0/0) would turn every
 * aggregate into `NaN`, because `NaN * 0` is still `NaN`.
 *
 * @param results Per-suite results; the same array is referenced by the returned report.
 * @returns The composite report, timestamped with the current time.
 */
export function computeCompositeReport(results) {
    const finiteOrZero = (value) => (Number.isFinite(value) ? value : 0);
    let totalItems = 0;
    let evaluatedItems = 0;
    let weightedPrecSum = 0;
    let weightedRecSum = 0;
    for (const r of results) {
        const weight = finiteOrZero(r.evaluatedItems);
        totalItems += finiteOrZero(r.totalItems);
        evaluatedItems += weight;
        weightedPrecSum += finiteOrZero(r.precision) * weight;
        weightedRecSum += finiteOrZero(r.recall) * weight;
    }
    // With no evaluated items there is nothing to average; report 0 rather than dividing by zero.
    const weightedPrecision = evaluatedItems > 0 ? weightedPrecSum / evaluatedItems : 0;
    const weightedRecall = evaluatedItems > 0 ? weightedRecSum / evaluatedItems : 0;
    const weightedF1 = weightedPrecision + weightedRecall > 0
        ? (2 * weightedPrecision * weightedRecall) / (weightedPrecision + weightedRecall)
        : 0;
    return {
        timestamp: new Date().toISOString(),
        suites: results,
        aggregate: { totalItems, evaluatedItems, weightedPrecision, weightedRecall, weightedF1 },
    };
}
|
|
53
|
+
/**
 * Render a composite report as Markdown.
 *
 * Layout: title and date, an aggregate metrics table, a one-row-per-suite
 * summary table, then a detail section per suite. A suite's detail section
 * includes a category table only when `perCategory` has at least one entry,
 * with categories ordered by `total`, largest first.
 *
 * @param report Composite report to render.
 * @returns The Markdown text, lines joined with "\n".
 */
export function formatCompositeReport(report) {
    // Ratios are stored as 0–1 fractions; show them as percentages.
    const asPercent = (ratio, digits = 1) => `${(ratio * 100).toFixed(digits)}%`;
    const { aggregate, suites } = report;
    const out = [
        "# External Benchmark Scorecard",
        "",
        `**Date:** ${report.timestamp}`,
        "",
        // Aggregate summary
        "## Aggregate",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        `| Total Items | ${aggregate.totalItems} |`,
        `| Evaluated | ${aggregate.evaluatedItems} |`,
        `| Weighted Precision | ${asPercent(aggregate.weightedPrecision)} |`,
        `| Weighted Recall | ${asPercent(aggregate.weightedRecall)} |`,
        `| Weighted F1 | ${asPercent(aggregate.weightedF1)} |`,
        "",
        // Per-suite table
        "## Per-Suite Results",
        "",
        "| Suite | Items | Detection Rate | Precision | Recall | F1 |",
        "|-------|-------|---------------|-----------|--------|-----|",
    ];
    for (const suite of suites) {
        const cells = [
            `[${suite.suiteName}](${suite.suiteUrl})`,
            `${suite.evaluatedItems}/${suite.totalItems}`,
            asPercent(suite.detectionRate),
            asPercent(suite.precision),
            asPercent(suite.recall),
            asPercent(suite.f1Score),
        ];
        out.push(`| ${cells.join(" | ")} |`);
    }
    out.push("");
    // Per-suite detail sections
    for (const suite of suites) {
        out.push(
            `## ${suite.suiteName}`,
            "",
            `**Source:** ${suite.suiteUrl}`,
            `**Items:** ${suite.evaluatedItems} evaluated, ${suite.skippedItems} skipped`,
            "",
        );
        const categories = suite.perCategory;
        if (!categories || Object.keys(categories).length === 0) {
            continue;
        }
        out.push("| Category | Total | Detected | Rate |");
        out.push("|----------|-------|----------|------|");
        const ranked = Object.entries(categories).sort(([, left], [, right]) => right.total - left.total);
        for (const [label, stats] of ranked) {
            out.push(`| ${label} | ${stats.total} | ${stats.detected} | ${asPercent(stats.rate, 0)} |`);
        }
        out.push("");
    }
    return out.join("\n");
}
|
|
102
|
+
// ─── Result Persistence ─────────────────────────────────────────────────────
|
|
103
|
+
/** Directory where per-suite results are persisted, relative to the current working directory. */
const RESULTS_DIR = "benchmarks/external";
/**
 * Make sure the results directory exists and return its absolute path.
 *
 * `mkdirSync` with `recursive: true` creates missing parents and does nothing
 * when the directory is already there, so no `existsSync` pre-check is needed
 * (the pre-check was also racy). If something that is not a directory occupies
 * the path, this now fails here instead of at the later write.
 *
 * @returns Absolute path of the results directory.
 */
function ensureResultsDir() {
    const dir = resolve(RESULTS_DIR);
    mkdirSync(dir, { recursive: true });
    return dir;
}
|
|
111
|
+
/**
 * Persist a suite result as pretty-printed JSON.
 *
 * Two files with identical content are written into the results directory:
 * a timestamped one (`:` and `.` in the timestamp become `-` so the name is
 * file-system friendly) and `<suiteId>-latest.json`, which is overwritten on
 * every save and acts as a pointer to the most recent run.
 *
 * @param result Suite result to store.
 * @returns Path of the timestamped file.
 */
export function saveResult(result) {
    const targetDir = ensureResultsDir();
    const stamp = result.timestamp.replace(/[:.]/g, "-");
    const datedPath = join(targetDir, `${result.suiteId}-${stamp}.json`);
    const payload = JSON.stringify(result, null, 2);
    writeFileSync(datedPath, payload, "utf-8");
    // Also write a "latest" symlink-style file
    writeFileSync(join(targetDir, `${result.suiteId}-latest.json`), payload, "utf-8");
    return datedPath;
}
|
|
121
|
+
/**
 * Load the most recent saved result for a suite.
 *
 * Reads `<suiteId>-latest.json` from the results directory. A file that exists
 * but does not contain valid JSON (for example a truncated write) is reported
 * on stderr and treated as "no saved result", so one damaged file does not
 * abort a composite report. Other read errors still propagate.
 *
 * @param suiteId Suite whose latest result should be loaded.
 * @returns The parsed result (not schema-validated), or `undefined` when no
 *   usable saved result exists.
 */
export function loadLatestResult(suiteId) {
    const latestPath = resolve(RESULTS_DIR, `${suiteId}-latest.json`);
    if (!existsSync(latestPath))
        return undefined;
    try {
        return JSON.parse(readFileSync(latestPath, "utf-8"));
    }
    catch (err) {
        // Only malformed JSON is tolerated; I/O failures are real errors.
        if (err instanceof SyntaxError) {
            console.error(`Warning: ignoring saved result ${latestPath}: invalid JSON (${err.message})`);
            return undefined;
        }
        throw err;
    }
}
|
|
127
|
+
/**
 * Collect the latest saved result of every registered adapter.
 * Suites without a saved result are left out, so the array may be shorter
 * than the adapter list.
 */
export function loadAllLatestResults() {
    return listAdapters()
        .map((adapter) => loadLatestResult(adapter.suiteId))
        .filter((loaded) => Boolean(loaded));
}
|
|
136
|
+
// ─── CLI Entry Point ────────────────────────────────────────────────────────
|
|
137
|
+
// Ensure adapters are registered when the CLI entry point is called.
// Each adapter file calls registerBenchmarkAdapter() at module scope.
/** Shared promise for the one-time adapter load; `undefined` until the first request. */
let _adaptersLoading;
/**
 * Import one adapter module, treating it as optional.
 *
 * A module that cannot be found stays silent, as before. Any other failure
 * (for example an error thrown while the adapter module is evaluated) is
 * reported on stderr so it is not mistaken for "no adapters registered".
 * Nothing is rethrown, so loading stays best-effort.
 *
 * @param specifier Module specifier, used only in the warning message.
 * @param importModule Thunk performing the dynamic import (kept as a literal
 *   `import("...")` at the call site).
 */
async function loadOptionalAdapter(specifier, importModule) {
    try {
        await importModule();
    }
    catch (err) {
        const code = typeof err === "object" && err !== null && "code" in err ? err.code : undefined;
        if (code === "ERR_MODULE_NOT_FOUND" || code === "MODULE_NOT_FOUND") {
            /* adapter unavailable */
            return;
        }
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`Warning: benchmark adapter ${specifier} failed to load: ${reason}`);
    }
}
/**
 * Load the built-in adapter modules exactly once.
 *
 * The load is memoised as a promise rather than a boolean flag, so a caller
 * arriving while the imports are still in flight waits for them instead of
 * returning early with an empty registry.
 *
 * @returns A promise that settles once both imports have been attempted; it never rejects.
 */
function ensureAdaptersLoaded() {
    if (_adaptersLoading === undefined) {
        _adaptersLoading = (async () => {
            // Sequential on purpose: registration order determines listing order.
            await loadOptionalAdapter("./openssf-cve-benchmark.js", () => import("./openssf-cve-benchmark.js"));
            await loadOptionalAdapter("./martian-code-review-benchmark.js", () => import("./martian-code-review-benchmark.js"));
        })();
    }
    return _adaptersLoading;
}
|
|
157
|
+
/** Print usage, options and the registered suites for `judges external-benchmark`. */
function printExternalBenchmarkHelp() {
    console.log(`
Judges Panel — External Benchmark Runner

Run third-party benchmarks to demonstrate Judges' capabilities and produce
comparable, per-suite scoring reports.

USAGE:
judges external-benchmark run [options] Run benchmark suite(s)
judges external-benchmark list List available suites
judges external-benchmark report [options] Composite report from saved results

OPTIONS:
--suite, -s <id> Run a specific suite (default: all)
--repo, -r <path> Override the benchmark repo path
--item <id> Evaluate a single item (CVE ID, PR URL, etc.)
--output, -o <path> Save results to file
--format <fmt> Output: text, json, markdown (default: text)

AVAILABLE SUITES:`);
    for (const a of listAdapters()) {
        console.log(` ${a.suiteId.padEnd(24)} ${a.description}`);
    }
    console.log("");
}
/**
 * CLI entry point for `judges external-benchmark`.
 *
 * Subcommands:
 *  - `run` (default): validate and run the selected suite(s), save each result,
 *    print a per-suite summary, and a composite summary when more than one ran.
 *  - `list`: print the registered suites.
 *  - `report`: build a composite report from the latest saved results.
 *
 * The subcommand is optional. When `argv[3]` is missing or is itself an option
 * (starts with `-`), the subcommand defaults to `run` and option parsing starts
 * at `argv[3]`, so `judges external-benchmark --suite x` honours `--suite`.
 * `--help` / `-h` is recognised anywhere among the options, and an unknown
 * subcommand is rejected instead of silently running every suite.
 *
 * @param argv Full process-style argument vector (subcommand expected at index 3).
 *   Exits the process with status 0 after help and status 1 on usage errors
 *   or when there is nothing to report or run.
 */
export async function runExternalBenchmark(argv) {
    await ensureAdaptersLoaded();
    const firstArg = argv[3];
    const hasSubcommand = typeof firstArg === "string" && firstArg !== "" && !firstArg.startsWith("-");
    const subcommand = hasSubcommand ? firstArg : "run";
    const optionArgs = argv.slice(hasSubcommand ? 4 : 3);
    if (optionArgs.includes("--help") || optionArgs.includes("-h")) {
        printExternalBenchmarkHelp();
        process.exit(0);
    }
    if (subcommand === "list") {
        console.log("\nAvailable external benchmark suites:\n");
        for (const a of listAdapters()) {
            console.log(` ${a.suiteId.padEnd(24)} ${a.suiteName}`);
            console.log(` ${"".padEnd(24)} ${a.description}`);
            console.log(` ${"".padEnd(24)} ${a.suiteUrl}`);
            console.log(` ${"".padEnd(24)} Default path: ${a.defaultRepoPath}`);
            console.log("");
        }
        return;
    }
    if (subcommand !== "run" && subcommand !== "report") {
        console.error(`Unknown subcommand: ${subcommand}. Use 'judges external-benchmark --help' for usage.`);
        process.exit(1);
    }
    // Options are shared by "run" and "report"; "report" only uses --format and --output.
    let suiteId;
    let repoPath;
    let singleItem;
    let format;
    let outputPath;
    for (let i = 0; i < optionArgs.length; i++) {
        const arg = optionArgs[i];
        if (arg === "--suite" || arg === "-s")
            suiteId = optionArgs[++i];
        else if (arg === "--repo" || arg === "-r")
            repoPath = optionArgs[++i];
        else if (arg === "--item")
            singleItem = optionArgs[++i];
        else if (arg === "--output" || arg === "-o")
            outputPath = optionArgs[++i];
        else if (arg === "--format")
            format = optionArgs[++i];
    }
    if (subcommand === "report") {
        const results = loadAllLatestResults();
        if (results.length === 0) {
            console.error("No saved results found. Run benchmarks first with: judges external-benchmark run");
            process.exit(1);
        }
        const report = computeCompositeReport(results);
        // Reports default to markdown; anything other than "json" renders as markdown.
        const reportFormat = format === undefined ? "markdown" : format;
        const output = reportFormat === "json" ? JSON.stringify(report, null, 2) : formatCompositeReport(report);
        if (outputPath) {
            writeFileSync(outputPath, output, "utf-8");
            console.log(`Report saved to ${outputPath}`);
        }
        else {
            console.log(output);
        }
        return;
    }
    // ── "run" subcommand ──
    const runFormat = format === undefined ? "text" : format;
    const adapters = suiteId ? [getAdapter(suiteId)].filter(Boolean) : listAdapters();
    if (adapters.length === 0) {
        if (suiteId) {
            console.error(`Unknown suite: ${suiteId}. Use 'judges external-benchmark list' to see available suites.`);
        }
        else {
            console.error("No benchmark adapters registered.");
        }
        process.exit(1);
    }
    const allResults = [];
    for (const adapter of adapters) {
        const effectiveRepo = repoPath ? resolve(repoPath) : resolve(adapter.defaultRepoPath);
        console.log(`\n━━━ ${adapter.suiteName} ━━━`);
        console.log(`Suite: ${adapter.suiteId}`);
        console.log(`Repo: ${effectiveRepo}`);
        // A suite whose data is missing is skipped, not treated as a fatal error.
        const validationError = adapter.validate(effectiveRepo);
        if (validationError) {
            console.error(` ⚠️ ${validationError}`);
            console.error(` Skipping ${adapter.suiteId}.\n`);
            continue;
        }
        const result = adapter.run({
            repoPath: effectiveRepo,
            singleItem,
            format: runFormat,
            outputPath,
        });
        allResults.push(result);
        // Save per-suite result
        const savedPath = saveResult(result);
        console.log(`\n Results saved to ${savedPath}`);
        // Print per-suite summary
        console.log(`\n Detection Rate: ${(result.detectionRate * 100).toFixed(1)}%`);
        console.log(` Precision: ${(result.precision * 100).toFixed(1)}%`);
        console.log(` Recall: ${(result.recall * 100).toFixed(1)}%`);
        console.log(` F1 Score: ${(result.f1Score * 100).toFixed(1)}%`);
    }
    // Composite summary if multiple suites ran
    if (allResults.length > 1) {
        const report = computeCompositeReport(allResults);
        console.log("\n━━━ Composite Scorecard ━━━");
        console.log(` Weighted Precision: ${(report.aggregate.weightedPrecision * 100).toFixed(1)}%`);
        console.log(` Weighted Recall: ${(report.aggregate.weightedRecall * 100).toFixed(1)}%`);
        console.log(` Weighted F1: ${(report.aggregate.weightedF1 * 100).toFixed(1)}%`);
    }
    if (outputPath && allResults.length > 0) {
        // JSON output is the bare result for a single suite, the composite otherwise.
        const finalOutput = runFormat === "json"
            ? JSON.stringify(allResults.length === 1 ? allResults[0] : computeCompositeReport(allResults), null, 2)
            : runFormat === "markdown"
                ? formatCompositeReport(computeCompositeReport(allResults))
                : allResults.map((r) => `${r.suiteName}: F1=${(r.f1Score * 100).toFixed(1)}%`).join("\n");
        writeFileSync(outputPath, finalOutput, "utf-8");
        console.log(`\nResults saved to ${outputPath}`);
    }
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Martian Code Review Benchmark Integration
|
|
3
|
+
*
|
|
4
|
+
* Adapter for the Martian Code Review Bench offline benchmark
|
|
5
|
+
* (https://github.com/withmartian/code-review-benchmark).
|
|
6
|
+
*
|
|
7
|
+
* 50 PRs from 5 major open-source projects (Sentry, Grafana, Cal.com,
|
|
8
|
+
* Discourse, Keycloak) with human-curated golden comments at severity
|
|
9
|
+
* levels Low/Medium/High/Critical.
|
|
10
|
+
*
|
|
11
|
+
* For each PR, Judges evaluates the diff and we match our findings
|
|
12
|
+
* against the golden comments using semantic similarity at the
|
|
13
|
+
* rule-prefix and description level.
|
|
14
|
+
*/
|
|
15
|
+
import type { Finding } from "../types.js";
|
|
16
|
+
import type { BenchmarkCase } from "./benchmark.js";
|
|
17
|
+
/**
 * One human-curated ("golden") review comment from the Martian benchmark data.
 */
export interface MartianGoldenComment {
    /** Text of the golden review comment. */
    comment: string;
    /** Severity level assigned to the comment in the benchmark data. */
    severity: "Low" | "Medium" | "High" | "Critical";
}
|
|
21
|
+
/**
 * A pull request entry from the Martian benchmark data together with its
 * golden comments. Field names use snake_case, presumably mirroring the
 * benchmark's own data files — confirm against the loader.
 */
export interface MartianPr {
    /** Title of the pull request. */
    pr_title: string;
    /** URL of the pull request. */
    url: string;
    /** NOTE(review): presumably the upstream PR URL when `url` points at a mirror or fork — confirm. */
    original_url?: string;
    /** NOTE(review): meaning not evident from this declaration — confirm against the benchmark data. */
    az_comment?: string;
    /** Golden comments recorded for this pull request. */
    comments: MartianGoldenComment[];
}
|
|
28
|
+
/**
 * Outcome of evaluating a single Martian benchmark PR.
 * NOTE(review): field semantics below are read off the names and the file
 * header; the producing code is not part of this declaration file — confirm.
 */
export interface MartianPrResult {
    /** Title of the evaluated pull request. */
    prTitle: string;
    /** URL of the evaluated pull request. */
    prUrl: string;
    /** Name of the project/repository the PR comes from. */
    sourceRepo: string;
    /** Language associated with the PR. */
    language: string;
    /** Number of golden comments for this PR. */
    goldenComments: number;
    /** Number of golden comments matched by a finding. */
    matchedComments: number;
    /** Number of golden comments not matched by any finding. */
    unmatchedComments: number;
    /** Presumably findings that matched no golden comment — confirm. */
    falsePositives: number;
    /** Precision for this PR. */
    precision: number;
    /** Recall for this PR. */
    recall: number;
    /** Findings produced for this PR. */
    findings: Finding[];
    /** Golden-comment / finding pairs that were matched, with the severity recorded for the pair. */
    matches: Array<{
        golden: string;
        finding: string;
        severity: string;
    }>;
    /** Presumably the text of the golden comments that were not matched — confirm. */
    missed: string[];
}
|
|
47
|
+
/**
 * Load the benchmark's golden comments from a local checkout at `repoPath`.
 * NOTE(review): the map key is presumably the source project/repository name —
 * confirm against the implementation.
 */
export declare function loadGoldenComments(repoPath: string): Map<string, MartianPr[]>;
|
|
48
|
+
/**
 * Convert a Martian PR with golden comments into BenchmarkCase format
 * for use in the LLM benchmark pipeline.
 *
 * Each golden comment becomes an expected finding. The PR diff provides
 * the actual code to evaluate. The LLM judge determines if its review
 * catches the same issues the human reviewer identified.
 *
 * @param pr PR entry including its golden comments.
 * @param repoName Name of the project the PR belongs to.
 * @param diff Optional PR diff text to evaluate.
 * @returns The benchmark case, or `undefined` when the PR cannot be converted
 *   (conditions not visible from this declaration — confirm).
 */
export declare function convertPrToBenchmarkCase(pr: MartianPr, repoName: string, diff?: string): BenchmarkCase | undefined;
|
|
57
|
+
/**
 * Convert all Martian golden comments into BenchmarkCase[] for LLM evaluation.
 * Fetches actual PR diffs from GitHub when possible.
 *
 * NOTE(review): the signature is synchronous, so any diff fetching has to
 * happen synchronously inside the call — confirm how this behaves offline.
 *
 * @param repoPath Path to a local checkout of the Martian benchmark data.
 * @returns One benchmark case per convertible PR.
 */
export declare function convertAllToBenchmarkCases(repoPath: string): BenchmarkCase[];
|