@kevinrabun/judges-cli 3.128.3 → 3.129.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/api.d.ts CHANGED
@@ -74,6 +74,7 @@ export type { BenchmarkCase, BenchmarkResult, BenchmarkGateOptions, BenchmarkGat
74
74
  export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
75
75
  export type { LlmBenchmarkSnapshot, LlmCaseResult } from "./commands/llm-benchmark.js";
76
76
  export type { LlmFinding, ValidationResult } from "./probabilistic/llm-response-validator.js";
77
+ export { convertAllToBenchmarkCases as convertMartianToBenchmarkCases } from "./commands/martian-code-review-benchmark.js";
77
78
  export type { PromptAmendment, OptimizerInsight, OptimizationResult, AmendmentStore, } from "./commands/llm-benchmark-optimizer.js";
78
79
  export { optimizeBenchmark, mergeAmendments, createEmptyStore, formatAmendmentSection, } from "./commands/llm-benchmark-optimizer.js";
79
80
  export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
package/dist/api.js CHANGED
@@ -83,6 +83,8 @@ export { compareCapabilities, formatComparisonReport, formatFullComparisonMatrix
83
83
  export { runBenchmarkSuite, benchmarkGate, formatBenchmarkReport, formatBenchmarkMarkdown, analyzeL2Coverage, formatL2CoverageReport, ingestFindingsAsBenchmarkCases, deduplicateIngestCases, BENCHMARK_CASES, } from "./commands/benchmark.js";
84
84
  // ─── LLM Benchmark ──────────────────────────────────────────────────────────
85
85
  export { parseLlmRuleIds, constructPerJudgePrompt, constructTribunalPrompt, selectStratifiedSample, selectJudgesForCategory, scoreLlmCase, computeLlmMetrics, formatLlmSnapshotMarkdown, formatLayerComparisonMarkdown, extractValidatedLlmFindings, getValidRulePrefixes, getTribunalValidPrefixes, } from "./commands/llm-benchmark.js";
86
+ // ─── External Benchmarks ────────────────────────────────────────────────────
87
+ export { convertAllToBenchmarkCases as convertMartianToBenchmarkCases } from "./commands/martian-code-review-benchmark.js";
86
88
  export { optimizeBenchmark, mergeAmendments, createEmptyStore, formatAmendmentSection, } from "./commands/llm-benchmark-optimizer.js";
87
89
  // Review autopilot (GitHub App / scripts)
88
90
  export { runReviewAutopilot, dedupeComments, filterAlreadyPostedComments } from "./commands/review.js";
@@ -90,6 +90,7 @@ export const COMMAND_TABLE = {
90
90
  "event-leak": ["./commands/event-leak.js", "runEventLeak"],
91
91
  "evidence-chain": ["./commands/evidence-chain.js", "runEvidenceChain"],
92
92
  "example-leak": ["./commands/example-leak.js", "runExampleLeak"],
93
+ "external-benchmark": ["./commands/external-benchmarks.js", "runExternalBenchmark"],
93
94
  "exception-consistency": ["./commands/exception-consistency.js", "runExceptionConsistency"],
94
95
  "exec-report": ["./commands/exec-report.js", "runExecReport"],
95
96
  "explain-finding": ["./commands/explain-finding.js", "runExplainFinding"],
@@ -311,6 +312,7 @@ export const COMMAND_TABLE = {
311
312
  "null-safety-audit": ["./commands/null-safety-audit.js", "runNullSafetyAudit"],
312
313
  "observability-gap": ["./commands/observability-gap.js", "runObservabilityGap"],
313
314
  onboard: ["./commands/onboard.js", "runOnboard"],
315
+ "openssf-cve": ["./commands/openssf-cve-benchmark.js", "runOpenSSFCveBenchmark"],
314
316
  "org-metrics": ["./commands/org-metrics.js", "runOrgMetrics"],
315
317
  "org-policy": ["./commands/org-policy.js", "runOrgPolicy"],
316
318
  "over-abstraction": ["./commands/over-abstraction.js", "runOverAbstraction"],
package/dist/cli.js CHANGED
@@ -251,6 +251,8 @@ function printHelp() {
251
251
  ["judges feedback", "Track finding feedback (false positives)"],
252
252
  ["judges override", "Manage per-path rule overrides"],
253
253
  ["judges benchmark", "Run detection accuracy benchmarks"],
254
+ ["judges openssf-cve", "Run OpenSSF CVE Benchmark (real-world CVEs)"],
255
+ ["judges external-benchmark", "Run external benchmarks (OpenSSF, Martian, etc.)"],
254
256
  ["judges config", "Export/import shared team configs"],
255
257
  ["judges review", "Post inline review comments on a GitHub PR"],
256
258
  ["judges app serve", "Start GitHub App webhook server"],
@@ -47,17 +47,37 @@ function loadAmendments(filePath) {
47
47
  const store = JSON.parse(readFileSync(resolve(filePath), "utf8"));
48
48
  return store.amendments;
49
49
  }
50
- // Try VS Code global storage
51
- const appdata = process.env.APPDATA || process.env.HOME;
52
- if (!appdata)
53
- throw new Error("Cannot determine global storage path. Use --file to specify.");
54
- const globalPath = join(appdata, "Code", "User", "globalStorage", "kevinrabun.judges-panel", "llm-benchmark-amendments.json");
50
+ const globalPath = getAmendmentStorePath();
55
51
  if (!existsSync(globalPath)) {
56
52
  throw new Error(`No amendments found at ${globalPath}. Run an LLM benchmark first, or use --file.`);
57
53
  }
58
54
  const store = JSON.parse(readFileSync(globalPath, "utf8"));
59
55
  return store.amendments;
60
56
  }
57
/**
 * Resolve the path of the amendment store file.
 *
 * An explicit `filePath` always wins and is resolved against the current
 * working directory. Otherwise the file inside VS Code's global storage for
 * the `kevinrabun.judges-panel` extension is used:
 *   - `<APPDATA>/Code/...` when APPDATA is set (Windows),
 *   - `~/Library/Application Support/Code/...` on macOS,
 *   - `$XDG_CONFIG_HOME/Code/...` (default `~/.config/Code/...`) elsewhere.
 * The older `$HOME/Code/...` location is still returned when it is the only
 * one that exists, so stores found by earlier versions keep working.
 *
 * @param filePath Optional explicit path to the amendment store.
 * @returns Path of the amendment store file (it may not exist yet).
 * @throws Error when no `filePath` is given and neither APPDATA nor HOME is set.
 */
function getAmendmentStorePath(filePath) {
    if (filePath)
        return resolve(filePath);
    const storeTail = ["Code", "User", "globalStorage", "kevinrabun.judges-panel", "llm-benchmark-amendments.json"];
    const { APPDATA: appData, HOME: home, XDG_CONFIG_HOME: xdgConfigHome } = process.env;
    if (appData)
        return join(appData, ...storeTail);
    if (!home)
        throw new Error("Cannot determine global storage path. Use --file to specify.");
    const legacyPath = join(home, ...storeTail);
    // Without APPDATA on Windows there is no better guess than the old one.
    if (process.platform === "win32")
        return legacyPath;
    const configRoot = process.platform === "darwin"
        ? join(home, "Library", "Application Support")
        : xdgConfigHome || join(home, ".config");
    const platformPath = join(configRoot, ...storeTail);
    // Prefer the real VS Code location; fall back to the legacy path only
    // when it is the sole store present on disk.
    return !existsSync(platformPath) && existsSync(legacyPath) ? legacyPath : platformPath;
}
68
/**
 * Reset the amendment store once its amendments have been codified.
 *
 * Codified amendments already live in the .judge.md files, so leaving them in
 * the runtime store would inject them a second time into LLM benchmark
 * prompts. Nothing happens when the store file does not exist.
 *
 * NOTE(review): the file is overwritten with a completely empty store (no
 * amendments, no history) — confirm callers only invoke this once every
 * pending amendment has been codified.
 *
 * @param filePath Optional explicit store path, resolved by getAmendmentStorePath.
 */
function clearAmendmentStore(filePath) {
    const target = getAmendmentStorePath(filePath);
    if (!existsSync(target))
        return;
    const blankStore = { amendments: [], version: 1, history: [] };
    const serialized = JSON.stringify(blankStore, null, 2);
    writeFileSync(target, serialized, "utf8");
    console.log(` 🧹 Cleared amendment store at ${target}`);
}
61
81
  /**
62
82
  * Codify a single amendment into a judge's .judge.md file by appending
63
83
  * to the FALSE POSITIVE AVOIDANCE section (or creating one if missing).
@@ -152,6 +172,9 @@ export function runCodifyAmendments(argv) {
152
172
  console.log("");
153
173
  console.log(` ${dryRun ? "Would codify" : "Codified"} ${codified}/${amendments.length} amendment(s) into agent files.`);
154
174
  if (!dryRun && codified > 0) {
175
+ // Clear the amendment store so codified amendments aren't double-applied
176
+ // at runtime during the next LLM benchmark run
177
+ clearAmendmentStore(filePath);
155
178
  console.log(" Next steps:");
156
179
  console.log(" 1. npm run generate:agents:force — sync .ts files from .judge.md");
157
180
  console.log(" 2. npm run build — rebuild");
@@ -0,0 +1,118 @@
1
+ /**
2
+ * External Benchmark Registry
3
+ *
4
+ * Provides a unified framework for running third-party benchmarks against
5
+ * Judges and producing comparable, per-suite scoring reports.
6
+ *
7
+ * Each benchmark registers as a named suite with its own adapter that knows
8
+ * how to load data, run evaluations, and produce a standardised result.
9
+ * Results are stored per-suite so they can be compared individually or
10
+ * aggregated into a composite scorecard.
11
+ *
12
+ * Usage:
13
+ * judges external-benchmark run # Run all registered suites
14
+ * judges external-benchmark run --suite openssf-cve # Run one suite
15
+ * judges external-benchmark list # List available suites
16
+ * judges external-benchmark report # Composite report from saved results
17
+ */
18
/**
 * Common result shape every external benchmark adapter returns, so suites can
 * be compared with each other and folded into a composite report.
 */
export interface ExternalBenchmarkResult {
    /** Stable identifier of the suite, e.g. "openssf-cve" or "martian-code-review". */
    suiteId: string;
    /** Display name of the suite. */
    suiteName: string;
    /** Link to the benchmark's public repository or site. */
    suiteUrl: string;
    /** When the run happened, as an ISO-8601 string. */
    timestamp: string;
    /** Version of Judges that produced the result. */
    judgesVersion: string;
    /** Number of items in the suite (CVEs, PRs, test cases, ...). */
    totalItems: number;
    /** Number of items actually evaluated; skipped/errored items are not counted. */
    evaluatedItems: number;
    /** Number of items that could not be evaluated. */
    skippedItems: number;
    /** Precision in the range 0–1. */
    precision: number;
    /** Recall in the range 0–1. */
    recall: number;
    /** F1 score in the range 0–1. */
    f1Score: number;
    /** Detection / match rate in the range 0–1. */
    detectionRate: number;
    /** Count of true positives, when the suite tracks it. */
    truePositives?: number;
    /** Count of false positives, when the suite tracks it. */
    falsePositives?: number;
    /** Count of false negatives, when the suite tracks it. */
    falseNegatives?: number;
    /** Optional breakdown keyed by category (CWE, severity, language, ...). */
    perCategory?: Record<string, { total: number; detected: number; rate: number }>;
    /** Raw, suite-specific payload; its shape differs per benchmark. */
    rawData?: unknown;
}
62
/**
 * Options handed to an adapter for a single benchmark run.
 */
export interface BenchmarkRunConfig {
    /** Location of the benchmark repository or data directory. */
    repoPath: string;
    /** When set, limit the run to this one item (CVE ID, PR URL, ...). */
    singleItem?: string;
    /** Requested output format. */
    format?: "text" | "json" | "markdown";
    /** Where to write the output, if anywhere. */
    outputPath?: string;
}
75
/**
 * Contract implemented by each external benchmark integration.
 */
export interface ExternalBenchmarkAdapter {
    /** Stable identifier of the suite; used as the registry key. */
    readonly suiteId: string;
    /** Display name of the suite. */
    readonly suiteName: string;
    /** Link to the benchmark's public repository or site. */
    readonly suiteUrl: string;
    /** Path used for the benchmark data when the caller does not override it. */
    readonly defaultRepoPath: string;
    /** One-line description printed by the `list` command. */
    readonly description: string;
    /**
     * Check that the benchmark repo/data is present at `repoPath`.
     *
     * @returns An error message when the data is unusable, `undefined` when it is fine.
     */
    validate(repoPath: string): string | undefined;
    /**
     * Execute the benchmark.
     *
     * @returns The outcome in the standardised result format.
     */
    run(config: BenchmarkRunConfig): ExternalBenchmarkResult;
}
99
/** Add an adapter to the registry under its `suiteId`; a later registration with the same id replaces the earlier one. */
export declare function registerBenchmarkAdapter(adapter: ExternalBenchmarkAdapter): void;
/** Look up a registered adapter by suite id; `undefined` when no such suite is registered. */
export declare function getAdapter(suiteId: string): ExternalBenchmarkAdapter | undefined;
/** Return the registered adapters as a new array. */
export declare function listAdapters(): ExternalBenchmarkAdapter[];
102
/**
 * Scorecard combining the results of several suites.
 */
export interface CompositeReport {
    /** ISO-8601 time at which the report was computed. */
    timestamp: string;
    /** The per-suite results the report was built from. */
    suites: ExternalBenchmarkResult[];
    /** Totals and averages across all suites, weighted by each suite's evaluated item count. */
    aggregate: {
        totalItems: number;
        evaluatedItems: number;
        weightedPrecision: number;
        weightedRecall: number;
        weightedF1: number;
    };
}
113
/** Combine per-suite results into a composite report; precision and recall are averaged weighted by evaluated items. */
export declare function computeCompositeReport(results: ExternalBenchmarkResult[]): CompositeReport;
/** Render a composite report as Markdown text. */
export declare function formatCompositeReport(report: CompositeReport): string;
/** Write a result to a timestamped JSON file and to the suite's "latest" file; returns the timestamped file's path. */
export declare function saveResult(result: ExternalBenchmarkResult): string;
/** Read the suite's "latest" result file; `undefined` when that file does not exist. */
export declare function loadLatestResult(suiteId: string): ExternalBenchmarkResult | undefined;
/** Collect the "latest" result of every registered adapter that has one. */
export declare function loadAllLatestResults(): ExternalBenchmarkResult[];
/** CLI entry point of `judges external-benchmark`; the subcommand is read from `argv[3]`. */
export declare function runExternalBenchmark(argv: string[]): Promise<void>;
@@ -0,0 +1,296 @@
1
+ /**
2
+ * External Benchmark Registry
3
+ *
4
+ * Provides a unified framework for running third-party benchmarks against
5
+ * Judges and producing comparable, per-suite scoring reports.
6
+ *
7
+ * Each benchmark registers as a named suite with its own adapter that knows
8
+ * how to load data, run evaluations, and produce a standardised result.
9
+ * Results are stored per-suite so they can be compared individually or
10
+ * aggregated into a composite scorecard.
11
+ *
12
+ * Usage:
13
+ * judges external-benchmark run # Run all registered suites
14
+ * judges external-benchmark run --suite openssf-cve # Run one suite
15
+ * judges external-benchmark list # List available suites
16
+ * judges external-benchmark report # Composite report from saved results
17
+ */
18
+ import { existsSync, readFileSync, writeFileSync, mkdirSync } from "fs";
19
+ import { resolve, join } from "path";
20
+ // ─── Registry ───────────────────────────────────────────────────────────────
21
/** Registered adapters, keyed by suite id. */
const _adapters = new Map();
/**
 * Store an adapter under its suite id. Registering the same id again
 * replaces the previous adapter.
 */
export function registerBenchmarkAdapter(adapter) {
    const { suiteId } = adapter;
    _adapters.set(suiteId, adapter);
}
/**
 * Fetch the adapter registered for `suiteId`, or `undefined` if there is none.
 */
export function getAdapter(suiteId) {
    const registered = _adapters.get(suiteId);
    return registered;
}
/**
 * Snapshot of all registered adapters in registration order.
 */
export function listAdapters() {
    return Array.from(_adapters.values());
}
31
/**
 * Fold several suite results into one composite report.
 *
 * Precision and recall are averaged with each suite weighted by its number
 * of evaluated items; the composite F1 is the harmonic mean of those two
 * weighted averages. All averages are 0 when nothing was evaluated.
 *
 * @param results Per-suite results to combine.
 * @returns Report with the current time, the given results and the aggregate.
 */
export function computeCompositeReport(results) {
    const sums = { total: 0, evaluated: 0, precision: 0, recall: 0 };
    for (const suite of results) {
        const weight = suite.evaluatedItems;
        sums.total += suite.totalItems;
        sums.evaluated += weight;
        sums.precision += suite.precision * weight;
        sums.recall += suite.recall * weight;
    }
    const weightedAverage = (sum) => (sums.evaluated > 0 ? sum / sums.evaluated : 0);
    const weightedPrecision = weightedAverage(sums.precision);
    const weightedRecall = weightedAverage(sums.recall);
    const denominator = weightedPrecision + weightedRecall;
    let weightedF1 = 0;
    if (denominator > 0) {
        weightedF1 = (2 * weightedPrecision * weightedRecall) / denominator;
    }
    return {
        timestamp: new Date().toISOString(),
        suites: results,
        aggregate: {
            totalItems: sums.total,
            evaluatedItems: sums.evaluated,
            weightedPrecision,
            weightedRecall,
            weightedF1,
        },
    };
}
53
/**
 * Render a composite report as a Markdown document.
 *
 * The document has an aggregate table, a one-row-per-suite table, and a
 * detail section per suite. A suite's per-category table is only emitted when
 * it has at least one category, with rows ordered by descending total.
 *
 * @param report Composite report to render.
 * @returns Markdown text with lines joined by "\n".
 */
export function formatCompositeReport(report) {
    const pct = (value, digits = 1) => `${(value * 100).toFixed(digits)}%`;
    const { aggregate, suites, timestamp } = report;
    const out = [
        "# External Benchmark Scorecard",
        "",
        `**Date:** ${timestamp}`,
        "",
        // Aggregate summary
        "## Aggregate",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        `| Total Items | ${aggregate.totalItems} |`,
        `| Evaluated | ${aggregate.evaluatedItems} |`,
        `| Weighted Precision | ${pct(aggregate.weightedPrecision)} |`,
        `| Weighted Recall | ${pct(aggregate.weightedRecall)} |`,
        `| Weighted F1 | ${pct(aggregate.weightedF1)} |`,
        "",
        // Per-suite table
        "## Per-Suite Results",
        "",
        "| Suite | Items | Detection Rate | Precision | Recall | F1 |",
        "|-------|-------|---------------|-----------|--------|-----|",
    ];
    for (const suite of suites) {
        const cells = [
            `[${suite.suiteName}](${suite.suiteUrl})`,
            `${suite.evaluatedItems}/${suite.totalItems}`,
            pct(suite.detectionRate),
            pct(suite.precision),
            pct(suite.recall),
            pct(suite.f1Score),
        ];
        out.push(`| ${cells.join(" | ")} |`);
    }
    out.push("");
    // Per-suite detail sections
    for (const suite of suites) {
        out.push(`## ${suite.suiteName}`, "", `**Source:** ${suite.suiteUrl}`, `**Items:** ${suite.evaluatedItems} evaluated, ${suite.skippedItems} skipped`, "");
        const categories = suite.perCategory ? Object.entries(suite.perCategory) : [];
        if (categories.length === 0)
            continue;
        out.push("| Category | Total | Detected | Rate |", "|----------|-------|----------|------|");
        categories.sort((left, right) => right[1].total - left[1].total);
        for (const [name, stats] of categories) {
            out.push(`| ${name} | ${stats.total} | ${stats.detected} | ${pct(stats.rate, 0)} |`);
        }
        out.push("");
    }
    return out.join("\n");
}
102
+ // ─── Result Persistence ─────────────────────────────────────────────────────
103
/** Directory (relative to the working directory) where results are stored. */
const RESULTS_DIR = "benchmarks/external";
/**
 * Make sure the results directory exists, creating it (and any missing
 * parents) when needed.
 *
 * @returns Absolute path of the results directory.
 */
function ensureResultsDir() {
    const absoluteDir = resolve(RESULTS_DIR);
    const missing = !existsSync(absoluteDir);
    if (missing) {
        mkdirSync(absoluteDir, { recursive: true });
    }
    return absoluteDir;
}
111
/**
 * Persist a suite result as JSON.
 *
 * Two files are written into the results directory: one named after the
 * suite id and the run timestamp (with ":" and "." replaced by "-"), and a
 * `<suiteId>-latest.json` copy that always holds the most recent result.
 *
 * @param result Result to store.
 * @returns Path of the timestamped file.
 */
export function saveResult(result) {
    const targetDir = ensureResultsDir();
    const stamp = result.timestamp.replace(/[:.]/g, "-");
    const timestampedPath = join(targetDir, `${result.suiteId}-${stamp}.json`);
    const payload = JSON.stringify(result, null, 2);
    writeFileSync(timestampedPath, payload, "utf-8");
    // Plain copy acting as a "latest" pointer
    writeFileSync(join(targetDir, `${result.suiteId}-latest.json`), payload, "utf-8");
    return timestampedPath;
}
121
/**
 * Read the most recent saved result of a suite.
 *
 * @param suiteId Suite whose `<suiteId>-latest.json` file should be read.
 * @returns The parsed file content, or `undefined` when the file does not exist.
 */
export function loadLatestResult(suiteId) {
    const latestFile = resolve(RESULTS_DIR, `${suiteId}-latest.json`);
    if (existsSync(latestFile)) {
        const raw = readFileSync(latestFile, "utf-8");
        return JSON.parse(raw);
    }
    return undefined;
}
127
/**
 * Gather the latest saved result of every registered adapter, skipping
 * adapters that have no saved result.
 *
 * @returns Saved results in adapter registration order.
 */
export function loadAllLatestResults() {
    return listAdapters()
        .map((adapter) => loadLatestResult(adapter.suiteId))
        .filter((saved) => Boolean(saved));
}
136
+ // ─── CLI Entry Point ────────────────────────────────────────────────────────
137
+ // Ensure adapters are registered when the CLI entry point is called.
138
+ // Each adapter file calls registerBenchmarkAdapter() at module scope.
139
/** Set once the adapter modules have been requested. */
let _adaptersLoaded = false;
/**
 * Import the bundled adapter modules, at most once per process.
 *
 * The flag is set before the first import so repeated calls return
 * immediately. A module that fails to import is ignored, so one unavailable
 * adapter does not stop the other from loading.
 */
async function ensureAdaptersLoaded() {
    if (!_adaptersLoaded) {
        _adaptersLoaded = true;
        await import("./openssf-cve-benchmark.js").catch(() => {
            /* adapter unavailable */
        });
        await import("./martian-code-review-benchmark.js").catch(() => {
            /* adapter unavailable */
        });
    }
}
157
/**
 * CLI entry point for `judges external-benchmark`.
 *
 * The subcommand is read from `argv[3]`:
 *   - `run` (default): run one or all registered suites, save each result,
 *     print a per-suite summary and, with `--output`, write a combined file.
 *   - `list`: print every registered suite.
 *   - `report`: build a composite report from the saved "latest" results.
 *   - `--help` / `-h`: print usage and exit with status 0.
 *
 * When the subcommand is left out and `argv[3]` is already an option (for
 * example `judges external-benchmark --suite openssf-cve`), options are parsed
 * starting at `argv[3]` so that first option is not dropped. An unrecognised
 * subcommand is reported as an error (exit status 1) instead of starting a
 * full benchmark run.
 *
 * @param argv Argument vector; the subcommand is expected at index 3.
 * @returns Promise that resolves when the subcommand has finished.
 */
export async function runExternalBenchmark(argv) {
    await ensureAdaptersLoaded();
    const firstArg = argv[3];
    if (firstArg === "--help" || firstArg === "-h") {
        console.log(`
Judges Panel — External Benchmark Runner

Run third-party benchmarks to demonstrate Judges' capabilities and produce
comparable, per-suite scoring reports.

USAGE:
judges external-benchmark run [options] Run benchmark suite(s)
judges external-benchmark list List available suites
judges external-benchmark report [options] Composite report from saved results

OPTIONS:
--suite, -s <id> Run a specific suite (default: all)
--repo, -r <path> Override the benchmark repo path
--item <id> Evaluate a single item (CVE ID, PR URL, etc.)
--output, -o <path> Save results to file
--format <fmt> Output: text, json, markdown (default: text)

AVAILABLE SUITES:`);
        for (const a of listAdapters()) {
            console.log(` ${a.suiteId.padEnd(24)} ${a.description}`);
        }
        console.log("");
        process.exit(0);
    }
    // A missing subcommand defaults to "run". If argv[3] is itself an option,
    // parsing has to begin there, otherwise that option would be skipped.
    const leadingOption = Boolean(firstArg) && firstArg.startsWith("-");
    const subcommand = !firstArg || leadingOption ? "run" : firstArg;
    const optionsStart = leadingOption ? 3 : 4;
    if (subcommand === "list") {
        console.log("\nAvailable external benchmark suites:\n");
        for (const a of listAdapters()) {
            console.log(` ${a.suiteId.padEnd(24)} ${a.suiteName}`);
            console.log(` ${"".padEnd(24)} ${a.description}`);
            console.log(` ${"".padEnd(24)} ${a.suiteUrl}`);
            console.log(` ${"".padEnd(24)} Default path: ${a.defaultRepoPath}`);
            console.log("");
        }
        return;
    }
    if (subcommand === "report") {
        const results = loadAllLatestResults();
        if (results.length === 0) {
            console.error("No saved results found. Run benchmarks first with: judges external-benchmark run");
            process.exit(1);
        }
        const report = computeCompositeReport(results);
        let reportFormat = "markdown";
        let outputPath;
        for (let i = optionsStart; i < argv.length; i++) {
            if (argv[i] === "--format")
                reportFormat = argv[++i];
            else if (argv[i] === "--output" || argv[i] === "-o")
                outputPath = argv[++i];
        }
        const output = reportFormat === "json" ? JSON.stringify(report, null, 2) : formatCompositeReport(report);
        if (outputPath) {
            writeFileSync(outputPath, output, "utf-8");
            console.log(`Report saved to ${outputPath}`);
        }
        else {
            console.log(output);
        }
        return;
    }
    if (subcommand !== "run") {
        // Previously any unknown word silently ran every suite.
        console.error(`Unknown subcommand: ${subcommand}. Use 'judges external-benchmark --help' for usage.`);
        process.exit(1);
    }
    // ── "run" subcommand ──
    let suiteId;
    let repoPath;
    let singleItem;
    let format = "text";
    let outputPath;
    for (let i = optionsStart; i < argv.length; i++) {
        const arg = argv[i];
        if (arg === "--suite" || arg === "-s")
            suiteId = argv[++i];
        else if (arg === "--repo" || arg === "-r")
            repoPath = argv[++i];
        else if (arg === "--item")
            singleItem = argv[++i];
        else if (arg === "--output" || arg === "-o")
            outputPath = argv[++i];
        else if (arg === "--format")
            format = argv[++i];
    }
    const adapters = suiteId ? [getAdapter(suiteId)].filter(Boolean) : listAdapters();
    if (adapters.length === 0) {
        if (suiteId) {
            console.error(`Unknown suite: ${suiteId}. Use 'judges external-benchmark list' to see available suites.`);
        }
        else {
            console.error("No benchmark adapters registered.");
        }
        process.exit(1);
    }
    const allResults = [];
    for (const adapter of adapters) {
        const effectiveRepo = repoPath ? resolve(repoPath) : resolve(adapter.defaultRepoPath);
        console.log(`\n━━━ ${adapter.suiteName} ━━━`);
        console.log(`Suite: ${adapter.suiteId}`);
        console.log(`Repo: ${effectiveRepo}`);
        // A suite whose data is missing is skipped rather than failing the run.
        const validationError = adapter.validate(effectiveRepo);
        if (validationError) {
            console.error(` ⚠️ ${validationError}`);
            console.error(` Skipping ${adapter.suiteId}.\n`);
            continue;
        }
        const result = adapter.run({
            repoPath: effectiveRepo,
            singleItem,
            format: format,
            outputPath,
        });
        allResults.push(result);
        // Save per-suite result
        const savedPath = saveResult(result);
        console.log(`\n Results saved to ${savedPath}`);
        // Print per-suite summary
        console.log(`\n Detection Rate: ${(result.detectionRate * 100).toFixed(1)}%`);
        console.log(` Precision: ${(result.precision * 100).toFixed(1)}%`);
        console.log(` Recall: ${(result.recall * 100).toFixed(1)}%`);
        console.log(` F1 Score: ${(result.f1Score * 100).toFixed(1)}%`);
    }
    // Composite summary if multiple suites ran
    if (allResults.length > 1) {
        const report = computeCompositeReport(allResults);
        console.log("\n━━━ Composite Scorecard ━━━");
        console.log(` Weighted Precision: ${(report.aggregate.weightedPrecision * 100).toFixed(1)}%`);
        console.log(` Weighted Recall: ${(report.aggregate.weightedRecall * 100).toFixed(1)}%`);
        console.log(` Weighted F1: ${(report.aggregate.weightedF1 * 100).toFixed(1)}%`);
    }
    if (outputPath && allResults.length > 0) {
        const finalOutput = format === "json"
            ? JSON.stringify(allResults.length === 1 ? allResults[0] : computeCompositeReport(allResults), null, 2)
            : format === "markdown"
                ? formatCompositeReport(computeCompositeReport(allResults))
                : allResults.map((r) => `${r.suiteName}: F1=${(r.f1Score * 100).toFixed(1)}%`).join("\n");
        writeFileSync(outputPath, finalOutput, "utf-8");
        console.log(`\nResults saved to ${outputPath}`);
    }
}
@@ -0,0 +1,61 @@
1
+ /**
2
+ * Martian Code Review Benchmark Integration
3
+ *
4
+ * Adapter for the Martian Code Review Bench offline benchmark
5
+ * (https://github.com/withmartian/code-review-benchmark).
6
+ *
7
+ * 50 PRs from 5 major open-source projects (Sentry, Grafana, Cal.com,
8
+ * Discourse, Keycloak) with human-curated golden comments at severity
9
+ * levels Low/Medium/High/Critical.
10
+ *
11
+ * For each PR, Judges evaluates the diff and we match our findings
12
+ * against the golden comments using semantic similarity at the
13
+ * rule-prefix and description level.
14
+ */
15
+ import type { Finding } from "../types.js";
16
+ import type { BenchmarkCase } from "./benchmark.js";
17
/** One human-curated golden review comment together with its severity level. */
export interface MartianGoldenComment {
    /** Text of the golden comment. */
    comment: string;
    /** Severity level assigned to the comment. */
    severity: "Low" | "Medium" | "High" | "Critical";
}
21
/** A benchmark pull request and the golden comments attached to it. */
export interface MartianPr {
    pr_title: string;
    url: string;
    original_url?: string;
    // NOTE(review): purpose of `az_comment` is not visible from this declaration — confirm against the benchmark data.
    az_comment?: string;
    /** Golden comments recorded for this PR. */
    comments: MartianGoldenComment[];
}
28
/**
 * Outcome of evaluating a single benchmark PR.
 *
 * NOTE(review): the exact counting rules behind the numeric fields are not
 * visible from this declaration — confirm against the implementation.
 */
export interface MartianPrResult {
    prTitle: string;
    prUrl: string;
    sourceRepo: string;
    language: string;
    goldenComments: number;
    matchedComments: number;
    unmatchedComments: number;
    falsePositives: number;
    precision: number;
    recall: number;
    findings: Finding[];
    matches: { golden: string; finding: string; severity: string }[];
    missed: string[];
}
47
/**
 * Load the golden comments of the benchmark checked out at `repoPath`.
 *
 * NOTE(review): the map key is presumably the source repository name —
 * confirm against the implementation.
 */
export declare function loadGoldenComments(repoPath: string): Map<string, MartianPr[]>;
/**
 * Turn one Martian PR and its golden comments into a BenchmarkCase for the
 * LLM benchmark pipeline.
 *
 * Every golden comment is treated as an expected finding, and the PR diff is
 * the code under review: the LLM judge is checked on whether its review
 * surfaces the same issues the human reviewer raised.
 */
export declare function convertPrToBenchmarkCase(pr: MartianPr, repoName: string, diff?: string): BenchmarkCase | undefined;
/**
 * Turn every Martian golden comment set into BenchmarkCase[] for LLM
 * evaluation. Actual PR diffs are fetched from GitHub when possible.
 */
export declare function convertAllToBenchmarkCases(repoPath: string): BenchmarkCase[];